diff --git a/.github/actions/setup-pytorch/action.yml b/.github/actions/setup-pytorch/action.yml index b6cc894722..88f482f78a 100644 --- a/.github/actions/setup-pytorch/action.yml +++ b/.github/actions/setup-pytorch/action.yml @@ -83,7 +83,7 @@ runs: uses: ./.github/actions/load env: # Increase this value to reset cache - CACHE_NUMBER: 12 + CACHE_NUMBER: 14 with: path: pytorch key: pytorch-$PYTORCH_CACHE_KEY-$CACHE_NUMBER @@ -120,7 +120,7 @@ runs: cd pytorch pip install wheel pip install -r requirements.txt - python setup.py bdist_wheel + USE_STATIC_MKL=1 python setup.py bdist_wheel - name: Install PyTorch (built from source) if: ${{ inputs.mode == 'source' }} diff --git a/.github/workflows/integration-tests.yml b/.github/workflows/integration-tests.yml index cfba6d7225..2922da501e 100644 --- a/.github/workflows/integration-tests.yml +++ b/.github/workflows/integration-tests.yml @@ -21,10 +21,12 @@ concurrency: cancel-in-progress: ${{ github.ref != 'refs/heads/main' }} permissions: read-all env: + TRITON_BUILD_WITH_CCACHE: "true" TRITON_BUILD_WITH_CLANG_LLD: "TRUE" TRITON_USE_ASSERT_ENABLED_LLVM: "TRUE" TRITON_DISABLE_LINE_INFO: 1 PROTON_SKIP_PC_SAMPLING_TEST: 1 + CCACHE_COMPRESS: "true" jobs: Runner-Preparation: runs-on: ubuntu-latest @@ -39,6 +41,11 @@ jobs: if: github.event_name == 'pull_request' run: | echo "enable_integration=true" >> $GITHUB_ENV + - name: Decide manual trigger integration test enablement + # Always enable integration tests when manually triggered + if: github.event_name == 'workflow_dispatch' + run: | + echo "enable_integration=true" >> $GITHUB_ENV - name: Checkout post-submit commits if: github.event_name == 'push' uses: actions/checkout@v4 @@ -154,6 +161,8 @@ jobs: strategy: matrix: runner: ${{fromJson(needs.Runner-Preparation.outputs.matrix-CUDA)}} + env: + RUNNER_TYPE: ${{ matrix.runner[0] }} steps: - name: Checkout uses: actions/checkout@v4 @@ -199,22 +208,28 @@ jobs: # "restore" step. 
This is to prevent the caches from accumulating stale # files over time. name: Restore cache of ccache and Triton compilation artifacts - if: github.event_name != 'push' + id: restore-build-cache + if: github.ref != 'refs/heads/main' uses: actions/cache/restore@v4 with: path: | ~/.triton/cache - ~/.cache/ccache + ~/.ccache # Restore the most recent cache entry. - restore-keys: triton-artifacts-${{ runner.os }}-${{ runner.arch }}-${{ runner.name }}-llvm-${{ steps.cache-key.outputs.llvm }}- + restore-keys: | + triton-artifacts-${{ runner.os }}-${{ runner.arch }}-${{ env.RUNNER_TYPE }}-llvm-${{ steps.cache-key.outputs.llvm }}- + triton-artifacts-${{ runner.os }}-${{ runner.arch }}-${{ env.RUNNER_TYPE }}- # We expect this cache key never to hit and for us to fall back # unconditionally to the restore-key, so it doesn't actually matter # what we put here (so long as it doesn't hit an existing key). - key: triton-artifacts-${{ runner.os }}-${{ runner.arch }}-${{ runner.name }}-llvm-${{ steps.cache-key.outputs.llvm }}-${{ steps.cache-key.outputs.datetime }} - - name: Inspect cache directory + key: triton-artifacts-${{ runner.os }}-${{ runner.arch }}-${{ env.RUNNER_TYPE }}-llvm-${{ steps.cache-key.outputs.llvm }}-${{ steps.cache-key.outputs.datetime }} + - name: Inspect cache directories run: | mkdir -p ~/.triton - ls -alh ~/.triton + du -h -d 1 ~/.triton + + mkdir -p ~/.ccache + du -h -d 1 ~/.ccache - name: Update PATH run: | echo "$HOME/.local/bin" >> $GITHUB_PATH @@ -224,12 +239,14 @@ jobs: python3 -m pip install cython setuptools wheel cmake==3.24 ninja pytest-forked pytest-xdist lit - name: Install Triton env: - TRITON_BUILD_WITH_CCACHE: "true" CUDA_HOME: "/usr/local/cuda" run: | echo "PATH is '$PATH'" cd python - python3 -m pip install '.[tests]' + ccache --zero-stats + python3 -m pip install -v '.[tests]' + - name: CCache Stats + run: ccache --print-stats - name: Run lit tests run: | cd python @@ -278,6 +295,13 @@ jobs: cd third_party/proton/test python3 -m pytest 
-s . cd .. + - name: Inspect cache directories + run: | + mkdir -p ~/.triton + du -h -d 1 ~/.triton + + mkdir -p ~/.ccache + du -h -d 1 ~/.ccache - # If we're on branch `main`, save the ccache Triton compilation artifacts # to the cache so they can be used by other (non-main) CI runs. # @@ -287,22 +311,17 @@ jobs: if: github.ref == 'refs/heads/main' uses: actions/cache/save@v4 with: - path: ~/.triton/cache ~/.cache/ccache - key: triton-artifacts-${{ runner.os }}-${{ runner.arch }}-${{ runner.name }}-llvm-${{ steps.cache-key.outputs.llvm }}-${{ steps.cache-key.outputs.datetime }} - - name: Inspect cache directories - run: | - mkdir -p ~/.triton - ls -alh ~/.triton - du -sh ~/.triton/** - - mkdir -p ~/.cache/ccache - ls -alh ~/.cache/ccache - du -sh ~/.cache/ccache + path: | + ~/.triton/cache + ~/.ccache + key: triton-artifacts-${{ runner.os }}-${{ runner.arch }}-${{ env.RUNNER_TYPE }}-llvm-${{ steps.cache-key.outputs.llvm }}-${{ steps.cache-key.outputs.datetime }} Integration-Tests-AMD: needs: Runner-Preparation if: needs.Runner-Preparation.outputs.matrix-HIP != '' runs-on: ${{ matrix.runner }} timeout-minutes: 30 + env: + RUNNER_TYPE: ${{ matrix.runner[1] }} strategy: matrix: runner: ${{fromJson(needs.Runner-Preparation.outputs.matrix-HIP)}} @@ -355,22 +374,28 @@ jobs: # "restore" step. This is to prevent the caches from accumulating stale # files over time. name: Restore cache of ccache and Triton compilation artifacts - if: github.event_name != 'push' + id: restore-build-cache + if: github.ref != 'refs/heads/main' uses: actions/cache/restore@v4 with: path: | ~/.triton/cache - ~/.cache/ccache + ~/.ccache # Restore the most recent cache entry. 
- restore-keys: triton-artifacts-${{ runner.os }}-${{ runner.arch }}-${{ runner.name }}-llvm-${{ steps.cache-key.outputs.llvm }}- + restore-keys: | + triton-artifacts-${{ runner.os }}-${{ runner.arch }}-${{ env.RUNNER_TYPE }}-llvm-${{ steps.cache-key.outputs.llvm }}- + triton-artifacts-${{ runner.os }}-${{ runner.arch }}-${{ env.RUNNER_TYPE }}- # We expect this cache key never to hit and for us to fall back # unconditionally to the restore-key, so it doesn't actually matter # what we put here (so long as it doesn't hit an existing key). - key: triton-artifacts-${{ runner.os }}-${{ runner.arch }}-${{ runner.name }}-llvm-${{ steps.cache-key.outputs.llvm }}-${{ steps.cache-key.outputs.datetime }} - - name: Inspect cache directory + key: triton-artifacts-${{ runner.os }}-${{ runner.arch }}-${{ env.RUNNER_TYPE }}-llvm-${{ steps.cache-key.outputs.llvm }}-${{ steps.cache-key.outputs.datetime }} + - name: Inspect cache directories run: | mkdir -p ~/.triton - ls -alh ~/.triton + du -h -d 1 ~/.triton + + mkdir -p ~/.ccache + du -h -d 1 ~/.ccache - name: Update PATH run: | echo "/opt/rocm/llvm/bin" >> $GITHUB_PATH @@ -378,17 +403,24 @@ jobs: run: | python3 -m pip install --upgrade pip python3 -m pip install lit + - name: Install apt dependencies + run: | + apt update + apt install ccache - name: Install Triton id: amd-install-triton run: | echo "PATH is '$PATH'" pip uninstall -y triton cd python + ccache --zero-stats pip install -v -e '.[tests]' - name: Clean up after an unsuccessful build if: ${{ !success() && steps.amd-install-triton.outcome != 'success' }} run: | rm -rf ~/.triton + - name: CCache Stats + run: ccache --print-stats - name: Run lit tests run: | cd python @@ -431,6 +463,13 @@ jobs: cd python cd "build/$(ls build | grep -i cmake)" ctest -j32 + - name: Inspect cache directories + run: | + mkdir -p ~/.triton + du -h -d 1 ~/.triton + + mkdir -p ~/.ccache + du -h -d 1 ~/.ccache - # If we're on branch `main`, save the ccache Triton compilation artifacts # to the 
cache so they can be used by other (non-main) CI runs. # @@ -440,17 +479,10 @@ jobs: if: github.ref == 'refs/heads/main' uses: actions/cache/save@v4 with: - path: ~/.triton/cache ~/.cache/ccache - key: triton-artifacts-${{ runner.os }}-${{ runner.arch }}-${{ runner.name }}-llvm-${{ steps.cache-key.outputs.llvm }}-${{ steps.cache-key.outputs.datetime }} - - name: Inspect cache directories - run: | - mkdir -p ~/.triton - ls -alh ~/.triton - du -sh ~/.triton/** - - mkdir -p ~/.cache/ccache - ls -alh ~/.cache/ccache - du -sh ~/.cache/ccache + path: | + ~/.triton/cache + ~/.ccache + key: triton-artifacts-${{ runner.os }}-${{ runner.arch }}-${{ env.RUNNER_TYPE }}-llvm-${{ steps.cache-key.outputs.llvm }}-${{ steps.cache-key.outputs.datetime }} - name: Clean up caches run: | rm -rf ~/.triton/cache @@ -458,10 +490,12 @@ jobs: needs: Runner-Preparation if: needs.Runner-Preparation.outputs.matrix-MACOS != '' runs-on: ${{ matrix.runner }} - timeout-minutes: 30 + timeout-minutes: 40 strategy: matrix: runner: ${{fromJson(needs.Runner-Preparation.outputs.matrix-MACOS)}} + env: + RUNNER_TYPE: ${{ matrix.runner[0] }} steps: - name: Checkout uses: actions/checkout@v4 @@ -470,7 +504,7 @@ jobs: - name: Install brew dependencies run: | brew update - brew install ccache llvm@19 lld + brew install ccache llvm@19 lld coreutils - name: Compute cache keys id: cache-key run: | @@ -511,22 +545,28 @@ jobs: # "restore" step. This is to prevent the caches from accumulating stale # files over time. name: Restore cache of ccache and Triton compilation artifacts - if: github.event_name != 'push' + id: restore-build-cache + if: github.ref != 'refs/heads/main' uses: actions/cache/restore@v4 with: path: | ~/.triton/cache - ~/.cache/ccache + ~/.ccache # Restore the most recent cache entry. 
- restore-keys: triton-artifacts-${{ runner.os }}-${{ runner.arch }}-${{ runner.name }}-llvm-${{ steps.cache-key.outputs.llvm }}- + restore-keys: | + triton-artifacts-${{ runner.os }}-${{ runner.arch }}-${{ env.RUNNER_TYPE }}-llvm-${{ steps.cache-key.outputs.llvm }}- + triton-artifacts-${{ runner.os }}-${{ runner.arch }}-${{ env.RUNNER_TYPE }}- # We expect this cache key never to hit and for us to fall back # unconditionally to the restore-key, so it doesn't actually matter # what we put here (so long as it doesn't hit an existing key). - key: triton-artifacts-${{ runner.os }}-${{ runner.arch }}-${{ runner.name }}-llvm-${{ steps.cache-key.outputs.llvm }}-${{ steps.cache-key.outputs.datetime }} - - name: Inspect cache directory + key: triton-artifacts-${{ runner.os }}-${{ runner.arch }}-${{ env.RUNNER_TYPE }}-llvm-${{ steps.cache-key.outputs.llvm }}-${{ steps.cache-key.outputs.datetime }} + - name: Inspect cache directories run: | mkdir -p ~/.triton - ls -alh ~/.triton + du -h -d 1 ~/.triton + + mkdir -p ~/.ccache + du -h -d 1 ~/.ccache - name: Update PATH run: | echo "$HOME/.local/bin" >> $GITHUB_PATH @@ -539,7 +579,6 @@ jobs: python3 -m pip install cython setuptools wheel cmake==3.24 ninja pytest-xdist lit pybind11 - name: Install Triton env: - TRITON_BUILD_WITH_CCACHE: "true" TRITON_BUILD_WITH_O1: "true" # macos-latest has 3 vcpus and 7GB DRAM, to save memory we limit the number of jobs to 3 # https://docs.github.com/en/actions/using-github-hosted-runners/about-github-hosted-runners/about-github-hosted-runners#standard-github-hosted-runners-for-public-repositories @@ -548,7 +587,17 @@ jobs: source ~/.venv/bin/activate echo "PATH is '$PATH'" cd python - python3 -m pip install --no-build-isolation . + ccache --zero-stats + python3 -m pip install -v --no-build-isolation . 
+ - name: CCache Stats + run: ccache --print-stats + - name: Inspect cache directories + run: | + mkdir -p ~/.triton + du -h -d 1 ~/.triton + + mkdir -p ~/.ccache + du -h -d 1 ~/.ccache - # If we're on branch `main`, save the ccache Triton compilation artifacts # to the cache so they can be used by other (non-main) CI runs. # @@ -558,14 +607,7 @@ jobs: if: github.ref == 'refs/heads/main' uses: actions/cache/save@v4 with: - path: ~/.triton/cache ~/.cache/ccache - key: triton-artifacts-${{ runner.os }}-${{ runner.arch }}-${{ runner.name }}-llvm-${{ steps.cache-key.outputs.llvm }}-${{ steps.cache-key.outputs.datetime }} - - name: Inspect cache directories - run: | - mkdir -p ~/.triton - ls -alh ~/.triton - du -sh ~/.triton/** - - mkdir -p ~/.cache/ccache - ls -alh ~/.cache/ccache - du -sh ~/.cache/ccache + path: | + ~/.triton/cache + ~/.ccache + key: triton-artifacts-${{ runner.os }}-${{ runner.arch }}-${{ env.RUNNER_TYPE }}-llvm-${{ steps.cache-key.outputs.llvm }}-${{ steps.cache-key.outputs.datetime }} diff --git a/.github/workflows/integration-tests.yml.in b/.github/workflows/integration-tests.yml.in index 7da4aa0793..7de7264272 100644 --- a/.github/workflows/integration-tests.yml.in +++ b/.github/workflows/integration-tests.yml.in @@ -23,10 +23,12 @@ concurrency: permissions: read-all env: + TRITON_BUILD_WITH_CCACHE: "true" TRITON_BUILD_WITH_CLANG_LLD: "TRUE" TRITON_USE_ASSERT_ENABLED_LLVM: "TRUE" TRITON_DISABLE_LINE_INFO: 1 PROTON_SKIP_PC_SAMPLING_TEST: 1 + CCACHE_COMPRESS: "true" jobs: Runner-Preparation: @@ -43,6 +45,12 @@ jobs: run: | echo "enable_integration=true" >> $GITHUB_ENV + - name: Decide manual trigger integration test enablement + # Always enable integration tests when manually triggered + if: github.event_name == 'workflow_dispatch' + run: | + echo "enable_integration=true" >> $GITHUB_ENV + - name: Checkout post-submit commits if: github.event_name == 'push' uses: actions/checkout@v4 @@ -174,6 +182,9 @@ jobs: matrix: runner: 
${{fromJson(needs.Runner-Preparation.outputs.matrix-CUDA)}} + env: + RUNNER_TYPE: ${{ matrix.runner[0] }} + steps: - name: Checkout uses: actions/checkout@v4 @@ -225,24 +236,30 @@ jobs: # files over time. - &restore-build-artifacts-step name: Restore cache of ccache and Triton compilation artifacts - if: github.event_name != 'push' + id: restore-build-cache + if: github.ref != 'refs/heads/main' uses: actions/cache/restore@v4 with: path: | ~/.triton/cache - ~/.cache/ccache + ~/.ccache # Restore the most recent cache entry. - restore-keys: triton-artifacts-${{ runner.os }}-${{ runner.arch }}-${{ runner.name }}-llvm-${{ steps.cache-key.outputs.llvm }}- + restore-keys: | + triton-artifacts-${{ runner.os }}-${{ runner.arch }}-${{ env.RUNNER_TYPE }}-llvm-${{ steps.cache-key.outputs.llvm }}- + triton-artifacts-${{ runner.os }}-${{ runner.arch }}-${{ env.RUNNER_TYPE }}- # We expect this cache key never to hit and for us to fall back # unconditionally to the restore-key, so it doesn't actually matter # what we put here (so long as it doesn't hit an existing key). 
- key: triton-artifacts-${{ runner.os }}-${{ runner.arch }}-${{ runner.name }}-llvm-${{ steps.cache-key.outputs.llvm }}-${{ steps.cache-key.outputs.datetime }} + key: triton-artifacts-${{ runner.os }}-${{ runner.arch }}-${{ env.RUNNER_TYPE }}-llvm-${{ steps.cache-key.outputs.llvm }}-${{ steps.cache-key.outputs.datetime }} - - &inspect-cache-directory-step - name: Inspect cache directory + - &inspect-cache-directories-step + name: Inspect cache directories run: | mkdir -p ~/.triton - ls -alh ~/.triton + du -h -d 1 ~/.triton + + mkdir -p ~/.ccache + du -h -d 1 ~/.ccache - name: Update PATH run: | @@ -255,12 +272,16 @@ jobs: - name: Install Triton env: - TRITON_BUILD_WITH_CCACHE: "true" CUDA_HOME: "/usr/local/cuda" run: | echo "PATH is '$PATH'" cd python - python3 -m pip install '.[tests]' + ccache --zero-stats + python3 -m pip install -v '.[tests]' + + - &print-ccache-stats + name: CCache Stats + run: ccache --print-stats - &run-lit-tests-step name: Run lit tests @@ -319,6 +340,8 @@ jobs: python3 -m pytest -s . cd .. + - *inspect-cache-directories-step + # If we're on branch `main`, save the ccache Triton compilation artifacts # to the cache so they can be used by other (non-main) CI runs. 
# @@ -329,19 +352,10 @@ jobs: if: github.ref == 'refs/heads/main' uses: actions/cache/save@v4 with: - path: ~/.triton/cache ~/.cache/ccache - key: triton-artifacts-${{ runner.os }}-${{ runner.arch }}-${{ runner.name }}-llvm-${{ steps.cache-key.outputs.llvm }}-${{ steps.cache-key.outputs.datetime }} - - - &inspect-cache-directories-step - name: Inspect cache directories - run: | - mkdir -p ~/.triton - ls -alh ~/.triton - du -sh ~/.triton/** - - mkdir -p ~/.cache/ccache - ls -alh ~/.cache/ccache - du -sh ~/.cache/ccache + path: | + ~/.triton/cache + ~/.ccache + key: triton-artifacts-${{ runner.os }}-${{ runner.arch }}-${{ env.RUNNER_TYPE }}-llvm-${{ steps.cache-key.outputs.llvm }}-${{ steps.cache-key.outputs.datetime }} Integration-Tests-AMD: needs: Runner-Preparation @@ -350,6 +364,9 @@ jobs: runs-on: ${{ matrix.runner }} timeout-minutes: 30 + env: + RUNNER_TYPE: ${{ matrix.runner[1] }} + strategy: matrix: runner: ${{fromJson(needs.Runner-Preparation.outputs.matrix-HIP)}} @@ -369,7 +386,7 @@ jobs: - *compute-cache-keys-step - *cache-build-dependencies-step - *restore-build-artifacts-step - - *inspect-cache-directory-step + - *inspect-cache-directories-step - name: Update PATH run: | @@ -380,12 +397,18 @@ jobs: python3 -m pip install --upgrade pip python3 -m pip install lit + - name: Install apt dependencies + run: | + apt update + apt install ccache + - name: Install Triton id: amd-install-triton run: | echo "PATH is '$PATH'" pip uninstall -y triton cd python + ccache --zero-stats pip install -v -e '.[tests]' - name: Clean up after an unsuccessful build @@ -393,6 +416,7 @@ jobs: run: | rm -rf ~/.triton + - *print-ccache-stats - *run-lit-tests-step - name: Run python tests on HIP @@ -423,8 +447,8 @@ jobs: - *run-proton-tests-step - *run-cpp-unittests-step - - *save-build-artifacts-step - *inspect-cache-directories-step + - *save-build-artifacts-step - name: Clean up caches run: | @@ -434,10 +458,14 @@ jobs: needs: Runner-Preparation if: 
needs.Runner-Preparation.outputs.matrix-MACOS != '' runs-on: ${{ matrix.runner }} - timeout-minutes: 30 + timeout-minutes: 40 strategy: matrix: runner: ${{fromJson(needs.Runner-Preparation.outputs.matrix-MACOS)}} + + env: + RUNNER_TYPE: ${{ matrix.runner[0] }} + steps: - name: Checkout uses: actions/checkout@v4 @@ -446,12 +474,12 @@ jobs: - name: Install brew dependencies run: | brew update - brew install ccache llvm@19 lld + brew install ccache llvm@19 lld coreutils - *compute-cache-keys-step - *cache-build-dependencies-step - *restore-build-artifacts-step - - *inspect-cache-directory-step + - *inspect-cache-directories-step - name: Update PATH run: | @@ -465,7 +493,6 @@ jobs: python3 -m pip install cython setuptools wheel cmake==3.24 ninja pytest-xdist lit pybind11 - name: Install Triton env: - TRITON_BUILD_WITH_CCACHE: "true" TRITON_BUILD_WITH_O1: "true" # macos-latest has 3 vcpus and 7GB DRAM, to save memory we limit the number of jobs to 3 # https://docs.github.com/en/actions/using-github-hosted-runners/about-github-hosted-runners/about-github-hosted-runners#standard-github-hosted-runners-for-public-repositories @@ -474,7 +501,9 @@ jobs: source ~/.venv/bin/activate echo "PATH is '$PATH'" cd python - python3 -m pip install --no-build-isolation . + ccache --zero-stats + python3 -m pip install -v --no-build-isolation . 
- - *save-build-artifacts-step + - *print-ccache-stats - *inspect-cache-directories-step + - *save-build-artifacts-step diff --git a/CMakeLists.txt b/CMakeLists.txt index aa9bd605c9..e4d16d4f9d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -45,6 +45,13 @@ if(TRITON_BUILD_WITH_CCACHE) endif() endif() +set(TRITON_PARALLEL_LINK_JOBS "" CACHE STRING + "Define the maximum number of concurrent link jobs (Ninja only).") +if (TRITON_PARALLEL_LINK_JOBS) + set_property(GLOBAL APPEND PROPERTY JOB_POOLS link_job_pool=${TRITON_PARALLEL_LINK_JOBS}) + set(CMAKE_JOB_POOL_LINK link_job_pool) +endif() + # Ensure Python3 vars are set correctly # used conditionally in this file and by lit tests @@ -226,6 +233,9 @@ if(TRITON_BUILD_PYTHON_MODULE) if (TRITON_BUILD_PROTON) add_subdirectory(third_party/proton) endif() + # We always build proton dialect + list(APPEND TRITON_PLUGIN_NAMES "proton") + add_subdirectory(third_party/proton/dialect) get_property(triton_libs GLOBAL PROPERTY TRITON_LIBS) get_property(triton_plugins GLOBAL PROPERTY TRITON_PLUGINS) @@ -334,6 +344,7 @@ if(NOT TRITON_BUILD_PYTHON_MODULE) foreach(CODEGEN_BACKEND ${TRITON_CODEGEN_BACKENDS}) add_subdirectory(third_party/${CODEGEN_BACKEND}) endforeach() + add_subdirectory(third_party/proton/dialect) endif() if(WIN32) option(CMAKE_USE_WIN32_THREADS_INIT "using WIN32 threads" ON) diff --git a/bin/CMakeLists.txt b/bin/CMakeLists.txt index b32e533b64..aa1293bd49 100644 --- a/bin/CMakeLists.txt +++ b/bin/CMakeLists.txt @@ -102,6 +102,7 @@ add_llvm_executable(triton-tensor-layout triton-tensor-layout.cpp PARTIAL_SOURCE target_link_libraries(triton-tensor-layout PRIVATE TritonGPUIR TritonNvidiaGPUIR + TritonIntelGPUIR ${triton_libs} ${conversion_libs} ${dialect_libs} diff --git a/bin/RegisterTritonDialects.h b/bin/RegisterTritonDialects.h index b103adeaba..a59956af5c 100644 --- a/bin/RegisterTritonDialects.h +++ b/bin/RegisterTritonDialects.h @@ -12,6 +12,7 @@ #include "amd/include/Dialect/TritonAMDGPU/IR/Dialect.h" #include 
"amd/include/TritonAMDGPUTransforms/Passes.h" #include "third_party/nvidia/include/Dialect/NVGPU/IR/Dialect.h" +#include "third_party/proton/dialect/include/Dialect/Proton/IR/Dialect.h" #include "triton/Dialect/Triton/IR/Dialect.h" #include "triton/Dialect/TritonGPU/IR/Dialect.h" #include "triton/Dialect/TritonNvidiaGPU/IR/Dialect.h" @@ -93,14 +94,15 @@ inline void registerTritonDialects(mlir::DialectRegistry &registry) { mlir::triton::registerTritonAMDGPULowerInstructionSchedHints(); // TODO: register Triton & TritonGPU passes - registry.insert(); + registry + .insert(); } diff --git a/bin/triton-tensor-layout.cpp b/bin/triton-tensor-layout.cpp index 4087ac1350..b330cfb5aa 100644 --- a/bin/triton-tensor-layout.cpp +++ b/bin/triton-tensor-layout.cpp @@ -80,17 +80,9 @@ static cl::opt TensorStr( //===--------------------------------------------------------------------===// LogicalResult layoutPrint(RankedTensorType tensorType, raw_ostream &os) { - StringRef dialectName = tensorType.getEncoding().getDialect().getNamespace(); - // Dispatch to the corresponding dialect helper function to print the layout. - if (dialectName == "triton_gpu") { - os << triton::gpu::getLayoutStr(tensorType, UseHWPointOfView); - return success(); - } - - llvm::errs() << "Unsupported tensor layout attribute: " - << tensorType.getEncoding() << "\n"; - return failure(); + os << triton::gpu::getLayoutStr(tensorType, UseHWPointOfView); + return success(); } LogicalResult printLayoutFromFile(MLIRContext *context, StringRef filename, diff --git a/docs/update_sycl_libdevice.md b/docs/update_sycl_libdevice.md new file mode 100644 index 0000000000..307b0c887a --- /dev/null +++ b/docs/update_sycl_libdevice.md @@ -0,0 +1,90 @@ +# Guide to Update SYCL Device Library + +This guide will walk you through the steps to update the SYCL device library using the Intel DPC++ compiler. + +## Step 1: Display Commands used during Compilation Process +1. Open a terminal. +2. 
Run the following command to compile a C++ file: +```sh +dpcpp -save-temps -#x t.cpp +``` +Replace t.cpp with any C++ file of your choice. This command will display the commands used during the compilation process. + +## Step 2: Locate the llvm-link Command +From the output of the previous command, find the llvm-link command line. It should look similar to the following example: +```sh +"/opt/intel/oneapi/compiler/2025.0/bin/compiler/llvm-link" \ + -only-needed \ + t-sycl-spir64-unknown-unknown-b331ea.bc \ + /opt/intel/oneapi/compiler/2025.0/bin/compiler/../../lib/libsycl-crt.bc \ + /opt/intel/oneapi/compiler/2025.0/bin/compiler/../../lib/libsycl-complex.bc \ + /opt/intel/oneapi/compiler/2025.0/bin/compiler/../../lib/libsycl-complex-fp64.bc \ + /opt/intel/oneapi/compiler/2025.0/bin/compiler/../../lib/libsycl-cmath.bc \ + /opt/intel/oneapi/compiler/2025.0/bin/compiler/../../lib/libsycl-cmath-fp64.bc \ + /opt/intel/oneapi/compiler/2025.0/bin/compiler/../../lib/libsycl-imf.bc \ + /opt/intel/oneapi/compiler/2025.0/bin/compiler/../../lib/libsycl-imf-fp64.bc \ + /opt/intel/oneapi/compiler/2025.0/bin/compiler/../../lib/libsycl-imf-bf16.bc \ + /opt/intel/oneapi/compiler/2025.0/bin/compiler/../../lib/libsycl-fallback-cassert.bc \ + /opt/intel/oneapi/compiler/2025.0/bin/compiler/../../lib/libsycl-fallback-cstring.bc \ + /opt/intel/oneapi/compiler/2025.0/bin/compiler/../../lib/libsycl-fallback-complex.bc \ + /opt/intel/oneapi/compiler/2025.0/bin/compiler/../../lib/libsycl-fallback-complex-fp64.bc \ + /opt/intel/oneapi/compiler/2025.0/bin/compiler/../../lib/libsycl-fallback-cmath.bc \ + /opt/intel/oneapi/compiler/2025.0/bin/compiler/../../lib/libsycl-fallback-cmath-fp64.bc \ + /opt/intel/oneapi/compiler/2025.0/bin/compiler/../../lib/libsycl-fallback-imf.bc \ + /opt/intel/oneapi/compiler/2025.0/bin/compiler/../../lib/libsycl-fallback-imf-fp64.bc \ + /opt/intel/oneapi/compiler/2025.0/bin/compiler/../../lib/libsycl-fallback-imf-bf16.bc \ + 
/opt/intel/oneapi/compiler/2025.0/bin/compiler/../../lib/libsycl-itt-user-wrappers.bc \ + /opt/intel/oneapi/compiler/2025.0/bin/compiler/../../lib/libsycl-itt-compiler-wrappers.bc \ + /opt/intel/oneapi/compiler/2025.0/bin/compiler/../../lib/libsycl-itt-stubs.bc \ + -o \ + t-sycl-spir64-unknown-unknown-d81f68.bc \ + --suppress-warnings +``` + +## Step 3: Modify the llvm-link Command +Remove the `-only-needed` option and the intermediate file `t-sycl-spir64-unknown-unknown-b331ea.bc` from the command line. +Then change the output file name to `libsycl-spir64-unknown-unknown.bc`. +The modified command should look like this: +```sh +"/opt/intel/oneapi/compiler/2025.0/bin/compiler/llvm-link" \ + /opt/intel/oneapi/compiler/2025.0/bin/compiler/../../lib/libsycl-crt.bc \ + /opt/intel/oneapi/compiler/2025.0/bin/compiler/../../lib/libsycl-complex.bc \ + /opt/intel/oneapi/compiler/2025.0/bin/compiler/../../lib/libsycl-complex-fp64.bc \ + /opt/intel/oneapi/compiler/2025.0/bin/compiler/../../lib/libsycl-cmath.bc \ + /opt/intel/oneapi/compiler/2025.0/bin/compiler/../../lib/libsycl-cmath-fp64.bc \ + /opt/intel/oneapi/compiler/2025.0/bin/compiler/../../lib/libsycl-imf.bc \ + /opt/intel/oneapi/compiler/2025.0/bin/compiler/../../lib/libsycl-imf-fp64.bc \ + /opt/intel/oneapi/compiler/2025.0/bin/compiler/../../lib/libsycl-imf-bf16.bc \ + /opt/intel/oneapi/compiler/2025.0/bin/compiler/../../lib/libsycl-fallback-cassert.bc \ + /opt/intel/oneapi/compiler/2025.0/bin/compiler/../../lib/libsycl-fallback-cstring.bc \ + /opt/intel/oneapi/compiler/2025.0/bin/compiler/../../lib/libsycl-fallback-complex.bc \ + /opt/intel/oneapi/compiler/2025.0/bin/compiler/../../lib/libsycl-fallback-complex-fp64.bc \ + /opt/intel/oneapi/compiler/2025.0/bin/compiler/../../lib/libsycl-fallback-cmath.bc \ + /opt/intel/oneapi/compiler/2025.0/bin/compiler/../../lib/libsycl-fallback-cmath-fp64.bc \ + /opt/intel/oneapi/compiler/2025.0/bin/compiler/../../lib/libsycl-fallback-imf.bc \ + 
/opt/intel/oneapi/compiler/2025.0/bin/compiler/../../lib/libsycl-fallback-imf-fp64.bc \ + /opt/intel/oneapi/compiler/2025.0/bin/compiler/../../lib/libsycl-fallback-imf-bf16.bc \ + /opt/intel/oneapi/compiler/2025.0/bin/compiler/../../lib/libsycl-itt-user-wrappers.bc \ + /opt/intel/oneapi/compiler/2025.0/bin/compiler/../../lib/libsycl-itt-compiler-wrappers.bc \ + /opt/intel/oneapi/compiler/2025.0/bin/compiler/../../lib/libsycl-itt-stubs.bc \ + -o \ + libsycl-spir64-unknown-unknown.bc \ + --suppress-warnings +``` + +## Step 4: Execute the Modified Command +Copy the modified llvm-link command. +Paste and run it in the terminal. + +## Step 5: Check for Manual Changes +Check the log of the existing device library to see what manual changes need to be made: +```sh +git log third_party/intel/backend/lib/libsycl-spir64-unknown-unknown.bc +``` +Look for any specific changes mentioned in the commit messages. For example, from commit 0dd37fc92c46f35c6ced34801e51058b6b89ea47, you need to change one of the module metadata values from 4 to 3. + +## Step 6: Apply Manual Changes +Use `llvm-dis` to disassemble the bitcode library; then, based on the information from the git log, apply the necessary manual changes to the updated device library. +Reassemble the modified LLVM IR device library using `llvm-as`. + +By following these steps, you will have successfully updated the SYCL device library and applied any necessary manual changes. diff --git a/lib/Conversion/TritonGPUToLLVM/ConvertLayoutOpToLLVM.cpp b/lib/Conversion/TritonGPUToLLVM/ConvertLayoutOpToLLVM.cpp index 8e8b089549..2d06980809 100644 --- a/lib/Conversion/TritonGPUToLLVM/ConvertLayoutOpToLLVM.cpp +++ b/lib/Conversion/TritonGPUToLLVM/ConvertLayoutOpToLLVM.cpp @@ -374,24 +374,24 @@ struct ConvertLayoutOpUsingLinearLayoutsConversion // TODO (Keren): Currently, we handle general mma/blocked/slice/dot(ampere) // -> mma/blocked/slice/dot(ampere) conversions. 
The following tasks must be // completed before we can remove the layoutIsOK check: - // 1. Support for AMD's MFMA and WMMA + // 1. Support for AMD's WMMA std::function layoutIsOK = [&](Attribute layout) { - if (auto nvidiaMma = dyn_cast(layout)) { - if (useLegacyMMAConversion) { - return false; - } - return true; + if (isa(layout)) { + return !useLegacyMMAConversion; } if (auto dotOperand = dyn_cast(layout)) { - if (auto nvidiaMma = - dyn_cast(dotOperand.getParent())) { - if (useLegacyMMAConversion) { - return false; - } + auto parent = dotOperand.getParent(); + if (isa(parent) && useLegacyMMAConversion) { + return false; + } + if (auto nvidiaMma = dyn_cast(parent)) { if (nvidiaMma.isAmpere()) { return true; } } + if (isa(parent)) { + return true; + } return false; } if (isa(layout)) { diff --git a/python/setup.py b/python/setup.py index 65388d8664..1e6dee4cf6 100644 --- a/python/setup.py +++ b/python/setup.py @@ -523,6 +523,7 @@ def build_extension(self, ext): "TRITON_BUILD_PROTON", "TRITON_BUILD_TUTORIALS", "TRITON_BUILD_WITH_CCACHE", + "TRITON_PARALLEL_LINK_JOBS", ] cmake_args += [f"-D{option}={os.getenv(option)}" for option in passthrough_args if option in os.environ] diff --git a/scripts/compile-pytorch-ipex.sh b/scripts/compile-pytorch-ipex.sh index 7c5a41f6c8..d753abe113 100755 --- a/scripts/compile-pytorch-ipex.sh +++ b/scripts/compile-pytorch-ipex.sh @@ -117,7 +117,7 @@ if [[ $BUILD_PYTORCH = true ]]; then echo "****** Building $PYTORCH_PROJ ******" pip install -r requirements.txt pip install cmake ninja "numpy<2.0" - python setup.py bdist_wheel + USE_STATIC_MKL=1 python setup.py bdist_wheel echo "****** Installing PyTorch ******" pip install dist/*.whl diff --git a/scripts/install-pytorch.sh b/scripts/install-pytorch.sh index a8a0f2b83a..74b3ac5158 100755 --- a/scripts/install-pytorch.sh +++ b/scripts/install-pytorch.sh @@ -155,7 +155,7 @@ $SCRIPTS_DIR/patch-pytorch.sh echo "****** Building $PYTORCH_PROJ ******" pip install -r requirements.txt pip 
install cmake ninja -python setup.py bdist_wheel +USE_STATIC_MKL=1 python setup.py bdist_wheel echo "****** Installing PyTorch ******" pip install dist/*.whl diff --git a/scripts/skiplist/a770/language.txt b/scripts/skiplist/a770/language.txt index e833b924bd..7e3e8d62fc 100644 --- a/scripts/skiplist/a770/language.txt +++ b/scripts/skiplist/a770/language.txt @@ -1,7 +1,5 @@ # https://github.com/intel/intel-xpu-backend-for-triton/issues/1434 test/unit/language/test_core.py::test_precise_math[1-tl.math.sqrt_rn(x)-tl.math.sqrt(x.to(tl.float64)).to(tl.float32)] -# https://github.com/intel/intel-xpu-backend-for-triton/issues/2662 -test/unit/language/test_core.py::test_scan_layouts[True-1-src_layout10-64-32] # https://github.com/intel/intel-xpu-backend-for-triton/issues/2703 test/unit/language/test_core.py::test_chained_reductions[in_shape0-perm0-red_dims0] test/unit/language/test_core.py::test_dot3d[1-1-32-32-32-32-32-float16-float16] diff --git a/scripts/skiplist/conda/language.txt b/scripts/skiplist/conda/language.txt index 41035163ff..1f2dcf0d10 100644 --- a/scripts/skiplist/conda/language.txt +++ b/scripts/skiplist/conda/language.txt @@ -115,8 +115,6 @@ test/unit/language/test_core.py::test_dot_max_num_imprecise_acc[64-float8e4b15-1 test/unit/language/test_core.py::test_dot_max_num_imprecise_acc[128-float8e5-128-256-128-128-256-256] # https://github.com/intel/intel-xpu-backend-for-triton/issues/1434 test/unit/language/test_core.py::test_precise_math[1-tl.math.sqrt_rn(x)-tl.math.sqrt(x.to(tl.float64)).to(tl.float32)] -# https://github.com/intel/intel-xpu-backend-for-triton/issues/2662 -test/unit/language/test_core.py::test_scan_layouts[True-1-src_layout10-64-32] # https://github.com/intel/intel-xpu-backend-for-triton/issues/2703 test/unit/language/test_core.py::test_chained_reductions[in_shape0-perm0-red_dims0] test/unit/language/test_core.py::test_dot3d[1-1-32-32-32-32-32-float16-float16] diff --git a/scripts/skiplist/default/language.txt 
b/scripts/skiplist/default/language.txt index fb018c5e0f..a891b802b5 100644 --- a/scripts/skiplist/default/language.txt +++ b/scripts/skiplist/default/language.txt @@ -1,6 +1,4 @@ # https://github.com/intel/intel-xpu-backend-for-triton/issues/1434 test/unit/language/test_core.py::test_precise_math[1-tl.math.sqrt_rn(x)-tl.math.sqrt(x.to(tl.float64)).to(tl.float32)] -# https://github.com/intel/intel-xpu-backend-for-triton/issues/2662 -test/unit/language/test_core.py::test_scan_layouts[True-1-src_layout10-64-32] # https://github.com/intel/intel-xpu-backend-for-triton/issues/2703 test/unit/language/test_core.py::test_chained_reductions[in_shape0-perm0-red_dims0] diff --git a/scripts/skiplist/mtl/language.txt b/scripts/skiplist/mtl/language.txt index df2e44aae4..69530824f3 100644 --- a/scripts/skiplist/mtl/language.txt +++ b/scripts/skiplist/mtl/language.txt @@ -1,7 +1,5 @@ # https://github.com/intel/intel-xpu-backend-for-triton/issues/1434 test/unit/language/test_core.py::test_precise_math[1-tl.math.sqrt_rn(x)-tl.math.sqrt(x.to(tl.float64)).to(tl.float32)] -# https://github.com/intel/intel-xpu-backend-for-triton/issues/2662 -test/unit/language/test_core.py::test_scan_layouts[True-1-src_layout10-64-32] # https://github.com/intel/intel-xpu-backend-for-triton/issues/2703 test/unit/language/test_core.py::test_chained_reductions[in_shape0-perm0-red_dims0] test/unit/language/test_core.py::test_dot3d[1-1-32-32-32-32-32-float16-float16] diff --git a/scripts/skiplist/xe2/language.txt b/scripts/skiplist/xe2/language.txt index fb018c5e0f..a891b802b5 100644 --- a/scripts/skiplist/xe2/language.txt +++ b/scripts/skiplist/xe2/language.txt @@ -1,6 +1,4 @@ # https://github.com/intel/intel-xpu-backend-for-triton/issues/1434 test/unit/language/test_core.py::test_precise_math[1-tl.math.sqrt_rn(x)-tl.math.sqrt(x.to(tl.float64)).to(tl.float32)] -# https://github.com/intel/intel-xpu-backend-for-triton/issues/2662 -test/unit/language/test_core.py::test_scan_layouts[True-1-src_layout10-64-32] 
# https://github.com/intel/intel-xpu-backend-for-triton/issues/2703 test/unit/language/test_core.py::test_chained_reductions[in_shape0-perm0-red_dims0] diff --git a/test/Conversion/amd/mfma-shortcut.mlir b/test/Conversion/amd/mfma-shortcut.mlir index 83c9e535d8..a2c8f48718 100644 --- a/test/Conversion/amd/mfma-shortcut.mlir +++ b/test/Conversion/amd/mfma-shortcut.mlir @@ -7,6 +7,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : tt.func public @shortcut_mfma16(%arg0: tensor<16x16xf16, #mfma>) { // CHECK-NOT: store // CHECK-NOT: load + // CHECK: llvm.return %0 = triton_gpu.convert_layout %arg0 : tensor<16x16xf16, #mfma> -> tensor<16x16xf16, #dotop> tt.return } @@ -21,6 +22,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : tt.func public @no_shortcut_mfma16(%arg0: tensor<16x16xf16, #mfma>) { // CHECK: store // CHECK: load + // CHECK: llvm.return %0 = triton_gpu.convert_layout %arg0 : tensor<16x16xf16, #mfma> -> tensor<16x16xf16, #dotop> tt.return } diff --git a/test/Conversion/intel/dot_layout_offset.mlir b/test/Conversion/intel/dot_layout_offset.mlir index 92129848d0..09615f4252 100644 --- a/test/Conversion/intel/dot_layout_offset.mlir +++ b/test/Conversion/intel/dot_layout_offset.mlir @@ -12,317 +12,307 @@ module attributes {"triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-war // CHECK: %[[THREAD_ID_I64:.*]] = llvm.call spir_funccc @_Z12get_local_idj // CHECK: %[[THREAD_ID_I32:.*]] = llvm.trunc %[[THREAD_ID_I64]] : i64 to i32 // CHECK: %[[VAL_145:.*]] = llvm.mlir.constant(16 : i32) : i32 + // CHECK: %[[LANE_ID:.*]] = llvm.urem %[[THREAD_ID_I32]], %[[VAL_145]] : i32 // CHECK: %[[WARP_ID:.*]] = llvm.udiv %[[THREAD_ID_I32]], %[[VAL_145]] : i32 - // CHECK: %[[VAL_147:.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK: %[[LANE_ID:.*]] = llvm.urem %[[THREAD_ID_I32]], %[[VAL_147]] : i32 + // CHECK-COUNT-3: %[[CST_0:.*]] = llvm.mlir.constant(0 : i32) : i32 // CHECK: %[[VAL_149:.*]] = 
llvm.mlir.constant(1 : i32) : i32 - // CHECK: %[[WARP_ID_N:.*]] = llvm.urem %[[WARP_ID]], %[[VAL_149]] : i32 - // CHECK: %[[VAL_151:.*]] = llvm.udiv %[[WARP_ID]], %[[VAL_149]] : i32 + // CHECK: %[[VAL_150:.*]] = llvm.and %[[LANE_ID]], %[[VAL_149]] : i32 + // CHECK: %[[VAL_151:.*]] = llvm.icmp "eq" %[[VAL_150]], %[[CST_0]] : i32 // CHECK: %[[VAL_152:.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK: %[[WARP_ID_M:.*]] = llvm.urem %[[VAL_151]], %[[VAL_152]] : i32 - // CHECK: %[[VAL_154:.*]] = llvm.udiv %[[VAL_151]], %[[VAL_152]] : i32 + // CHECK: %[[VAL_153:.*]] = llvm.select %[[VAL_151]], %[[CST_0]], %[[VAL_152]] : i1, i32 + // CHECK: %[[VAL_154:.*]] = llvm.xor %[[CST_0]], %[[VAL_153]] : i32 // CHECK: %[[VAL_155:.*]] = llvm.mlir.constant(2 : i32) : i32 - // CHECK: %[[ROUNDED_WARP_ID_M:.*]] = llvm.urem %[[WARP_ID_M]], %[[VAL_155]] : i32 - // CHECK: %[[warpShape_M:.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK: %[[warpOffset:.*]] = llvm.mul %[[ROUNDED_WARP_ID_M]], %[[warpShape_M]] : i32 - // CHECK: %[[VAL_159:.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK: %[[laneRowIndex:.*]] = llvm.udiv %[[LANE_ID]], %[[VAL_159]] : i32 - // CHECK: %[[VAL_161:.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK: %[[VAL_162:.*]] = llvm.urem %[[LANE_ID]], %[[VAL_161]] : i32 - // CHECK: %[[VAL_163:.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK: %[[multiDimBase_N:.*]] = llvm.mul %[[VAL_162]], %[[VAL_163]] : i32 - // CHECK: %[[multiDimBase_M:.*]] = llvm.add %[[laneRowIndex]], %[[warpOffset]] : i32 - // CHECK: %[[VAL_166:.*]] = llvm.mlir.constant(0 : i32) : i32 - // CHECK: %[[VAL_167:.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK: %[[VAL_168:.*]] = llvm.urem %[[VAL_166]], %[[VAL_167]] : i32 - // CHECK: %[[VAL_169:.*]] = llvm.udiv %[[VAL_166]], %[[VAL_167]] : i32 - // CHECK: %[[VAL_170:.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK: %[[VAL_171:.*]] = llvm.urem %[[VAL_169]], %[[VAL_170]] : i32 - // CHECK: %[[VAL_172:.*]] = llvm.udiv %[[VAL_169]], %[[VAL_170]] : 
i32 - // CHECK: %[[VAL_173:.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK: %[[VAL_174:.*]] = llvm.urem %[[VAL_171]], %[[VAL_173]] : i32 - // CHECK: %[[VAL_175:.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK: %[[VAL_176:.*]] = llvm.urem %[[VAL_168]], %[[VAL_175]] : i32 - // CHECK: %[[VAL_177:.*]] = llvm.mlir.constant(32 : i32) : i32 - // CHECK: %[[CTAOffset_M:.*]] = llvm.mul %[[VAL_174]], %[[VAL_177]] : i32 - // CHECK: %[[VAL_179:.*]] = llvm.mlir.constant(32 : i32) : i32 - // CHECK: %[[CTAOffset_N:.*]] = llvm.mul %[[VAL_176]], %[[VAL_179]] : i32 - // CHECK: %[[VAL_181:.*]] = llvm.add %[[multiDimBase_M]], %[[CTAOffset_M]] : i32 - // CHECK: %[[VAL_182:.*]] = llvm.add %[[multiDimBase_N]], %[[CTAOffset_N]] : i32 + // CHECK: %[[VAL_156:.*]] = llvm.and %[[LANE_ID]], %[[VAL_155]] : i32 + // CHECK: %[[VAL_157:.*]] = llvm.icmp "eq" %[[VAL_156]], %[[CST_0]] : i32 + // CHECK: %[[VAL_158:.*]] = llvm.mlir.constant(2 : i32) : i32 + // CHECK: %[[VAL_159:.*]] = llvm.select %[[VAL_157]], %[[CST_0]], %[[VAL_158]] : i1, i32 + // CHECK: %[[VAL_160:.*]] = llvm.xor %[[VAL_154]], %[[VAL_159]] : i32 + // CHECK: %[[VAL_161:.*]] = llvm.mlir.constant(4 : i32) : i32 + // CHECK: %[[VAL_162:.*]] = llvm.and %[[LANE_ID]], %[[VAL_161]] : i32 + // CHECK: %[[VAL_163:.*]] = llvm.icmp "eq" %[[VAL_162]], %[[CST_0]] : i32 + // CHECK: %[[VAL_164:.*]] = llvm.mlir.constant(4 : i32) : i32 + // CHECK: %[[VAL_165:.*]] = llvm.select %[[VAL_163]], %[[CST_0]], %[[VAL_164]] : i1, i32 + // CHECK: %[[VAL_182:.*]] = llvm.xor %[[VAL_160]], %[[VAL_165]] : i32 + // CHECK: %[[VAL_167:.*]] = llvm.mlir.constant(8 : i32) : i32 + // CHECK: %[[VAL_168:.*]] = llvm.and %[[LANE_ID]], %[[VAL_167]] : i32 + // CHECK: %[[VAL_169:.*]] = llvm.icmp "eq" %[[VAL_168]], %[[CST_0]] : i32 + // CHECK: %[[VAL_170:.*]] = llvm.mlir.constant(8 : i32) : i32 + // CHECK: %[[VAL_171:.*]] = llvm.select %[[VAL_169]], %[[CST_0]], %[[VAL_170]] : i1, i32 + // CHECK: %[[VAL_181:.*]] = llvm.xor %[[VAL_182]], %[[VAL_171]] : i32 // COM: There are 
total [4, 2] repetitions of tensor shape [32, 32] per warp. // COM: The repetitions are clustered as [2, 1] for A operand. The repetitions orders are [0, 0], [1, 0], [0, 1], [1, 1], [2, 0], [3, 0], [2, 1], [3, 1] // COM: Offsets of rep [0, 0]. // CHECK: %[[VAL_183:.*]] = llvm.mlir.constant(0 : i32) : i32 - // CHECK: %[[VAL_184:.*]] = llvm.add %[[VAL_181]], %[[VAL_183]] : i32 + // CHECK: %[[VAL_184:.*]] = llvm.xor %[[CST_0]], %[[VAL_183]] : i32 // CHECK: %[[VAL_185:.*]] = llvm.mlir.constant(0 : i32) : i32 - // CHECK: %[[VAL_186:.*]] = llvm.add %[[VAL_182]], %[[VAL_185]] : i32 + // CHECK: %[[VAL_186:.*]] = llvm.xor %[[VAL_181]], %[[VAL_185]] : i32 // CHECK: %[[VAL_187:.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK: %[[VAL_188:.*]] = llvm.add %[[VAL_181]], %[[VAL_187]] : i32 + // CHECK: %[[VAL_188:.*]] = llvm.xor %[[CST_0]], %[[VAL_187]] : i32 // CHECK: %[[VAL_189:.*]] = llvm.mlir.constant(0 : i32) : i32 - // CHECK: %[[VAL_190:.*]] = llvm.add %[[VAL_182]], %[[VAL_189]] : i32 + // CHECK: %[[VAL_190:.*]] = llvm.xor %[[VAL_181]], %[[VAL_189]] : i32 // CHECK: %[[VAL_191:.*]] = llvm.mlir.constant(2 : i32) : i32 - // CHECK: %[[VAL_192:.*]] = llvm.add %[[VAL_181]], %[[VAL_191]] : i32 + // CHECK: %[[VAL_192:.*]] = llvm.xor %[[CST_0]], %[[VAL_191]] : i32 // CHECK: %[[VAL_193:.*]] = llvm.mlir.constant(0 : i32) : i32 - // CHECK: %[[VAL_194:.*]] = llvm.add %[[VAL_182]], %[[VAL_193]] : i32 + // CHECK: %[[VAL_194:.*]] = llvm.xor %[[VAL_181]], %[[VAL_193]] : i32 // CHECK: %[[VAL_195:.*]] = llvm.mlir.constant(3 : i32) : i32 - // CHECK: %[[VAL_196:.*]] = llvm.add %[[VAL_181]], %[[VAL_195]] : i32 + // CHECK: %[[VAL_196:.*]] = llvm.xor %[[CST_0]], %[[VAL_195]] : i32 // CHECK: %[[VAL_197:.*]] = llvm.mlir.constant(0 : i32) : i32 - // CHECK: %[[VAL_198:.*]] = llvm.add %[[VAL_182]], %[[VAL_197]] : i32 + // CHECK: %[[VAL_198:.*]] = llvm.xor %[[VAL_181]], %[[VAL_197]] : i32 // CHECK: %[[VAL_199:.*]] = llvm.mlir.constant(4 : i32) : i32 - // CHECK: %[[VAL_200:.*]] = llvm.add %[[VAL_181]], 
%[[VAL_199]] : i32 + // CHECK: %[[VAL_200:.*]] = llvm.xor %[[CST_0]], %[[VAL_199]] : i32 // CHECK: %[[VAL_201:.*]] = llvm.mlir.constant(0 : i32) : i32 - // CHECK: %[[VAL_202:.*]] = llvm.add %[[VAL_182]], %[[VAL_201]] : i32 + // CHECK: %[[VAL_202:.*]] = llvm.xor %[[VAL_181]], %[[VAL_201]] : i32 // CHECK: %[[VAL_203:.*]] = llvm.mlir.constant(5 : i32) : i32 - // CHECK: %[[VAL_204:.*]] = llvm.add %[[VAL_181]], %[[VAL_203]] : i32 + // CHECK: %[[VAL_204:.*]] = llvm.xor %[[CST_0]], %[[VAL_203]] : i32 // CHECK: %[[VAL_205:.*]] = llvm.mlir.constant(0 : i32) : i32 - // CHECK: %[[VAL_206:.*]] = llvm.add %[[VAL_182]], %[[VAL_205]] : i32 + // CHECK: %[[VAL_206:.*]] = llvm.xor %[[VAL_181]], %[[VAL_205]] : i32 // CHECK: %[[VAL_207:.*]] = llvm.mlir.constant(6 : i32) : i32 - // CHECK: %[[VAL_208:.*]] = llvm.add %[[VAL_181]], %[[VAL_207]] : i32 + // CHECK: %[[VAL_208:.*]] = llvm.xor %[[CST_0]], %[[VAL_207]] : i32 // CHECK: %[[VAL_209:.*]] = llvm.mlir.constant(0 : i32) : i32 - // CHECK: %[[VAL_210:.*]] = llvm.add %[[VAL_182]], %[[VAL_209]] : i32 + // CHECK: %[[VAL_210:.*]] = llvm.xor %[[VAL_181]], %[[VAL_209]] : i32 // CHECK: %[[VAL_211:.*]] = llvm.mlir.constant(7 : i32) : i32 - // CHECK: %[[VAL_212:.*]] = llvm.add %[[VAL_181]], %[[VAL_211]] : i32 + // CHECK: %[[VAL_212:.*]] = llvm.xor %[[CST_0]], %[[VAL_211]] : i32 // CHECK: %[[VAL_213:.*]] = llvm.mlir.constant(0 : i32) : i32 - // CHECK: %[[VAL_214:.*]] = llvm.add %[[VAL_182]], %[[VAL_213]] : i32 + // CHECK: %[[VAL_214:.*]] = llvm.xor %[[VAL_181]], %[[VAL_213]] : i32 // COM: Offsets of rep [1, 0]. 
// CHECK: %[[VAL_215:.*]] = llvm.mlir.constant(8 : i32) : i32 - // CHECK: %[[VAL_216:.*]] = llvm.add %[[VAL_181]], %[[VAL_215]] : i32 + // CHECK: %[[VAL_216:.*]] = llvm.xor %[[CST_0]], %[[VAL_215]] : i32 // CHECK: %[[VAL_217:.*]] = llvm.mlir.constant(0 : i32) : i32 - // CHECK: %[[VAL_218:.*]] = llvm.add %[[VAL_182]], %[[VAL_217]] : i32 + // CHECK: %[[VAL_218:.*]] = llvm.xor %[[VAL_181]], %[[VAL_217]] : i32 // CHECK: %[[VAL_219:.*]] = llvm.mlir.constant(9 : i32) : i32 - // CHECK: %[[VAL_220:.*]] = llvm.add %[[VAL_181]], %[[VAL_219]] : i32 + // CHECK: %[[VAL_220:.*]] = llvm.xor %[[CST_0]], %[[VAL_219]] : i32 // CHECK: %[[VAL_221:.*]] = llvm.mlir.constant(0 : i32) : i32 - // CHECK: %[[VAL_222:.*]] = llvm.add %[[VAL_182]], %[[VAL_221]] : i32 + // CHECK: %[[VAL_222:.*]] = llvm.xor %[[VAL_181]], %[[VAL_221]] : i32 // CHECK: %[[VAL_223:.*]] = llvm.mlir.constant(10 : i32) : i32 - // CHECK: %[[VAL_224:.*]] = llvm.add %[[VAL_181]], %[[VAL_223]] : i32 + // CHECK: %[[VAL_224:.*]] = llvm.xor %[[CST_0]], %[[VAL_223]] : i32 // CHECK: %[[VAL_225:.*]] = llvm.mlir.constant(0 : i32) : i32 - // CHECK: %[[VAL_226:.*]] = llvm.add %[[VAL_182]], %[[VAL_225]] : i32 + // CHECK: %[[VAL_226:.*]] = llvm.xor %[[VAL_181]], %[[VAL_225]] : i32 // CHECK: %[[VAL_227:.*]] = llvm.mlir.constant(11 : i32) : i32 - // CHECK: %[[VAL_228:.*]] = llvm.add %[[VAL_181]], %[[VAL_227]] : i32 + // CHECK: %[[VAL_228:.*]] = llvm.xor %[[CST_0]], %[[VAL_227]] : i32 // CHECK: %[[VAL_229:.*]] = llvm.mlir.constant(0 : i32) : i32 - // CHECK: %[[VAL_230:.*]] = llvm.add %[[VAL_182]], %[[VAL_229]] : i32 + // CHECK: %[[VAL_230:.*]] = llvm.xor %[[VAL_181]], %[[VAL_229]] : i32 // CHECK: %[[VAL_231:.*]] = llvm.mlir.constant(12 : i32) : i32 - // CHECK: %[[VAL_232:.*]] = llvm.add %[[VAL_181]], %[[VAL_231]] : i32 + // CHECK: %[[VAL_232:.*]] = llvm.xor %[[CST_0]], %[[VAL_231]] : i32 // CHECK: %[[VAL_233:.*]] = llvm.mlir.constant(0 : i32) : i32 - // CHECK: %[[VAL_234:.*]] = llvm.add %[[VAL_182]], %[[VAL_233]] : i32 + // CHECK: 
%[[VAL_234:.*]] = llvm.xor %[[VAL_181]], %[[VAL_233]] : i32 // CHECK: %[[VAL_235:.*]] = llvm.mlir.constant(13 : i32) : i32 - // CHECK: %[[VAL_236:.*]] = llvm.add %[[VAL_181]], %[[VAL_235]] : i32 + // CHECK: %[[VAL_236:.*]] = llvm.xor %[[CST_0]], %[[VAL_235]] : i32 // CHECK: %[[VAL_237:.*]] = llvm.mlir.constant(0 : i32) : i32 - // CHECK: %[[VAL_238:.*]] = llvm.add %[[VAL_182]], %[[VAL_237]] : i32 + // CHECK: %[[VAL_238:.*]] = llvm.xor %[[VAL_181]], %[[VAL_237]] : i32 // CHECK: %[[VAL_239:.*]] = llvm.mlir.constant(14 : i32) : i32 - // CHECK: %[[VAL_240:.*]] = llvm.add %[[VAL_181]], %[[VAL_239]] : i32 + // CHECK: %[[VAL_240:.*]] = llvm.xor %[[CST_0]], %[[VAL_239]] : i32 // CHECK: %[[VAL_241:.*]] = llvm.mlir.constant(0 : i32) : i32 - // CHECK: %[[VAL_242:.*]] = llvm.add %[[VAL_182]], %[[VAL_241]] : i32 + // CHECK: %[[VAL_242:.*]] = llvm.xor %[[VAL_181]], %[[VAL_241]] : i32 // CHECK: %[[VAL_243:.*]] = llvm.mlir.constant(15 : i32) : i32 - // CHECK: %[[VAL_244:.*]] = llvm.add %[[VAL_181]], %[[VAL_243]] : i32 + // CHECK: %[[VAL_244:.*]] = llvm.xor %[[CST_0]], %[[VAL_243]] : i32 // CHECK: %[[VAL_245:.*]] = llvm.mlir.constant(0 : i32) : i32 - // CHECK: %[[VAL_246:.*]] = llvm.add %[[VAL_182]], %[[VAL_245]] : i32 + // CHECK: %[[VAL_246:.*]] = llvm.xor %[[VAL_181]], %[[VAL_245]] : i32 // COM: Offsets of rep [0, 1]. 
// CHECK: %[[VAL_247:.*]] = llvm.mlir.constant(0 : i32) : i32 - // CHECK: %[[VAL_248:.*]] = llvm.add %[[VAL_181]], %[[VAL_247]] : i32 + // CHECK: %[[VAL_248:.*]] = llvm.xor %[[CST_0]], %[[VAL_247]] : i32 // CHECK: %[[VAL_249:.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK: %[[VAL_250:.*]] = llvm.add %[[VAL_182]], %[[VAL_249]] : i32 + // CHECK: %[[VAL_250:.*]] = llvm.xor %[[VAL_181]], %[[VAL_249]] : i32 // CHECK: %[[VAL_251:.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK: %[[VAL_252:.*]] = llvm.add %[[VAL_181]], %[[VAL_251]] : i32 + // CHECK: %[[VAL_252:.*]] = llvm.xor %[[CST_0]], %[[VAL_251]] : i32 // CHECK: %[[VAL_253:.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK: %[[VAL_254:.*]] = llvm.add %[[VAL_182]], %[[VAL_253]] : i32 + // CHECK: %[[VAL_254:.*]] = llvm.xor %[[VAL_181]], %[[VAL_253]] : i32 // CHECK: %[[VAL_255:.*]] = llvm.mlir.constant(2 : i32) : i32 - // CHECK: %[[VAL_256:.*]] = llvm.add %[[VAL_181]], %[[VAL_255]] : i32 + // CHECK: %[[VAL_256:.*]] = llvm.xor %[[CST_0]], %[[VAL_255]] : i32 // CHECK: %[[VAL_257:.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK: %[[VAL_258:.*]] = llvm.add %[[VAL_182]], %[[VAL_257]] : i32 + // CHECK: %[[VAL_258:.*]] = llvm.xor %[[VAL_181]], %[[VAL_257]] : i32 // CHECK: %[[VAL_259:.*]] = llvm.mlir.constant(3 : i32) : i32 - // CHECK: %[[VAL_260:.*]] = llvm.add %[[VAL_181]], %[[VAL_259]] : i32 + // CHECK: %[[VAL_260:.*]] = llvm.xor %[[CST_0]], %[[VAL_259]] : i32 // CHECK: %[[VAL_261:.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK: %[[VAL_262:.*]] = llvm.add %[[VAL_182]], %[[VAL_261]] : i32 + // CHECK: %[[VAL_262:.*]] = llvm.xor %[[VAL_181]], %[[VAL_261]] : i32 // CHECK: %[[VAL_263:.*]] = llvm.mlir.constant(4 : i32) : i32 - // CHECK: %[[VAL_264:.*]] = llvm.add %[[VAL_181]], %[[VAL_263]] : i32 + // CHECK: %[[VAL_264:.*]] = llvm.xor %[[CST_0]], %[[VAL_263]] : i32 // CHECK: %[[VAL_265:.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK: %[[VAL_266:.*]] = llvm.add %[[VAL_182]], %[[VAL_265]] : i32 + // CHECK: 
%[[VAL_266:.*]] = llvm.xor %[[VAL_181]], %[[VAL_265]] : i32 // CHECK: %[[VAL_267:.*]] = llvm.mlir.constant(5 : i32) : i32 - // CHECK: %[[VAL_268:.*]] = llvm.add %[[VAL_181]], %[[VAL_267]] : i32 + // CHECK: %[[VAL_268:.*]] = llvm.xor %[[CST_0]], %[[VAL_267]] : i32 // CHECK: %[[VAL_269:.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK: %[[VAL_270:.*]] = llvm.add %[[VAL_182]], %[[VAL_269]] : i32 + // CHECK: %[[VAL_270:.*]] = llvm.xor %[[VAL_181]], %[[VAL_269]] : i32 // CHECK: %[[VAL_271:.*]] = llvm.mlir.constant(6 : i32) : i32 - // CHECK: %[[VAL_272:.*]] = llvm.add %[[VAL_181]], %[[VAL_271]] : i32 + // CHECK: %[[VAL_272:.*]] = llvm.xor %[[CST_0]], %[[VAL_271]] : i32 // CHECK: %[[VAL_273:.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK: %[[VAL_274:.*]] = llvm.add %[[VAL_182]], %[[VAL_273]] : i32 + // CHECK: %[[VAL_274:.*]] = llvm.xor %[[VAL_181]], %[[VAL_273]] : i32 // CHECK: %[[VAL_275:.*]] = llvm.mlir.constant(7 : i32) : i32 - // CHECK: %[[VAL_276:.*]] = llvm.add %[[VAL_181]], %[[VAL_275]] : i32 + // CHECK: %[[VAL_276:.*]] = llvm.xor %[[CST_0]], %[[VAL_275]] : i32 // CHECK: %[[VAL_277:.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK: %[[VAL_278:.*]] = llvm.add %[[VAL_182]], %[[VAL_277]] : i32 + // CHECK: %[[VAL_278:.*]] = llvm.xor %[[VAL_181]], %[[VAL_277]] : i32 // COM: Offsets of rep [1, 1]. 
// CHECK: %[[VAL_279:.*]] = llvm.mlir.constant(8 : i32) : i32 - // CHECK: %[[VAL_280:.*]] = llvm.add %[[VAL_181]], %[[VAL_279]] : i32 + // CHECK: %[[VAL_280:.*]] = llvm.xor %[[CST_0]], %[[VAL_279]] : i32 // CHECK: %[[VAL_281:.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK: %[[VAL_282:.*]] = llvm.add %[[VAL_182]], %[[VAL_281]] : i32 + // CHECK: %[[VAL_282:.*]] = llvm.xor %[[VAL_181]], %[[VAL_281]] : i32 // CHECK: %[[VAL_283:.*]] = llvm.mlir.constant(9 : i32) : i32 - // CHECK: %[[VAL_284:.*]] = llvm.add %[[VAL_181]], %[[VAL_283]] : i32 + // CHECK: %[[VAL_284:.*]] = llvm.xor %[[CST_0]], %[[VAL_283]] : i32 // CHECK: %[[VAL_285:.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK: %[[VAL_286:.*]] = llvm.add %[[VAL_182]], %[[VAL_285]] : i32 + // CHECK: %[[VAL_286:.*]] = llvm.xor %[[VAL_181]], %[[VAL_285]] : i32 // CHECK: %[[VAL_287:.*]] = llvm.mlir.constant(10 : i32) : i32 - // CHECK: %[[VAL_288:.*]] = llvm.add %[[VAL_181]], %[[VAL_287]] : i32 + // CHECK: %[[VAL_288:.*]] = llvm.xor %[[CST_0]], %[[VAL_287]] : i32 // CHECK: %[[VAL_289:.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK: %[[VAL_290:.*]] = llvm.add %[[VAL_182]], %[[VAL_289]] : i32 + // CHECK: %[[VAL_290:.*]] = llvm.xor %[[VAL_181]], %[[VAL_289]] : i32 // CHECK: %[[VAL_291:.*]] = llvm.mlir.constant(11 : i32) : i32 - // CHECK: %[[VAL_292:.*]] = llvm.add %[[VAL_181]], %[[VAL_291]] : i32 + // CHECK: %[[VAL_292:.*]] = llvm.xor %[[CST_0]], %[[VAL_291]] : i32 // CHECK: %[[VAL_293:.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK: %[[VAL_294:.*]] = llvm.add %[[VAL_182]], %[[VAL_293]] : i32 + // CHECK: %[[VAL_294:.*]] = llvm.xor %[[VAL_181]], %[[VAL_293]] : i32 // CHECK: %[[VAL_295:.*]] = llvm.mlir.constant(12 : i32) : i32 - // CHECK: %[[VAL_296:.*]] = llvm.add %[[VAL_181]], %[[VAL_295]] : i32 + // CHECK: %[[VAL_296:.*]] = llvm.xor %[[CST_0]], %[[VAL_295]] : i32 // CHECK: %[[VAL_297:.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK: %[[VAL_298:.*]] = llvm.add %[[VAL_182]], %[[VAL_297]] : i32 + // CHECK: 
%[[VAL_298:.*]] = llvm.xor %[[VAL_181]], %[[VAL_297]] : i32 // CHECK: %[[VAL_299:.*]] = llvm.mlir.constant(13 : i32) : i32 - // CHECK: %[[VAL_300:.*]] = llvm.add %[[VAL_181]], %[[VAL_299]] : i32 + // CHECK: %[[VAL_300:.*]] = llvm.xor %[[CST_0]], %[[VAL_299]] : i32 // CHECK: %[[VAL_301:.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK: %[[VAL_302:.*]] = llvm.add %[[VAL_182]], %[[VAL_301]] : i32 + // CHECK: %[[VAL_302:.*]] = llvm.xor %[[VAL_181]], %[[VAL_301]] : i32 // CHECK: %[[VAL_303:.*]] = llvm.mlir.constant(14 : i32) : i32 - // CHECK: %[[VAL_304:.*]] = llvm.add %[[VAL_181]], %[[VAL_303]] : i32 + // CHECK: %[[VAL_304:.*]] = llvm.xor %[[CST_0]], %[[VAL_303]] : i32 // CHECK: %[[VAL_305:.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK: %[[VAL_306:.*]] = llvm.add %[[VAL_182]], %[[VAL_305]] : i32 + // CHECK: %[[VAL_306:.*]] = llvm.xor %[[VAL_181]], %[[VAL_305]] : i32 // CHECK: %[[VAL_307:.*]] = llvm.mlir.constant(15 : i32) : i32 - // CHECK: %[[VAL_308:.*]] = llvm.add %[[VAL_181]], %[[VAL_307]] : i32 + // CHECK: %[[VAL_308:.*]] = llvm.xor %[[CST_0]], %[[VAL_307]] : i32 // CHECK: %[[VAL_309:.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK: %[[VAL_310:.*]] = llvm.add %[[VAL_182]], %[[VAL_309]] : i32 + // CHECK: %[[VAL_310:.*]] = llvm.xor %[[VAL_181]], %[[VAL_309]] : i32 // COM: Offsets of rep [2, 0]. 
// CHECK: %[[VAL_311:.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK: %[[VAL_312:.*]] = llvm.add %[[VAL_181]], %[[VAL_311]] : i32 + // CHECK: %[[VAL_312:.*]] = llvm.xor %[[CST_0]], %[[VAL_311]] : i32 // CHECK: %[[VAL_313:.*]] = llvm.mlir.constant(0 : i32) : i32 - // CHECK: %[[VAL_314:.*]] = llvm.add %[[VAL_182]], %[[VAL_313]] : i32 + // CHECK: %[[VAL_314:.*]] = llvm.xor %[[VAL_181]], %[[VAL_313]] : i32 // CHECK: %[[VAL_315:.*]] = llvm.mlir.constant(17 : i32) : i32 - // CHECK: %[[VAL_316:.*]] = llvm.add %[[VAL_181]], %[[VAL_315]] : i32 + // CHECK: %[[VAL_316:.*]] = llvm.xor %[[CST_0]], %[[VAL_315]] : i32 // CHECK: %[[VAL_317:.*]] = llvm.mlir.constant(0 : i32) : i32 - // CHECK: %[[VAL_318:.*]] = llvm.add %[[VAL_182]], %[[VAL_317]] : i32 + // CHECK: %[[VAL_318:.*]] = llvm.xor %[[VAL_181]], %[[VAL_317]] : i32 // CHECK: %[[VAL_319:.*]] = llvm.mlir.constant(18 : i32) : i32 - // CHECK: %[[VAL_320:.*]] = llvm.add %[[VAL_181]], %[[VAL_319]] : i32 + // CHECK: %[[VAL_320:.*]] = llvm.xor %[[CST_0]], %[[VAL_319]] : i32 // CHECK: %[[VAL_321:.*]] = llvm.mlir.constant(0 : i32) : i32 - // CHECK: %[[VAL_322:.*]] = llvm.add %[[VAL_182]], %[[VAL_321]] : i32 + // CHECK: %[[VAL_322:.*]] = llvm.xor %[[VAL_181]], %[[VAL_321]] : i32 // CHECK: %[[VAL_323:.*]] = llvm.mlir.constant(19 : i32) : i32 - // CHECK: %[[VAL_324:.*]] = llvm.add %[[VAL_181]], %[[VAL_323]] : i32 + // CHECK: %[[VAL_324:.*]] = llvm.xor %[[CST_0]], %[[VAL_323]] : i32 // CHECK: %[[VAL_325:.*]] = llvm.mlir.constant(0 : i32) : i32 - // CHECK: %[[VAL_326:.*]] = llvm.add %[[VAL_182]], %[[VAL_325]] : i32 + // CHECK: %[[VAL_326:.*]] = llvm.xor %[[VAL_181]], %[[VAL_325]] : i32 // CHECK: %[[VAL_327:.*]] = llvm.mlir.constant(20 : i32) : i32 - // CHECK: %[[VAL_328:.*]] = llvm.add %[[VAL_181]], %[[VAL_327]] : i32 + // CHECK: %[[VAL_328:.*]] = llvm.xor %[[CST_0]], %[[VAL_327]] : i32 // CHECK: %[[VAL_329:.*]] = llvm.mlir.constant(0 : i32) : i32 - // CHECK: %[[VAL_330:.*]] = llvm.add %[[VAL_182]], %[[VAL_329]] : i32 + // CHECK: 
%[[VAL_330:.*]] = llvm.xor %[[VAL_181]], %[[VAL_329]] : i32 // CHECK: %[[VAL_331:.*]] = llvm.mlir.constant(21 : i32) : i32 - // CHECK: %[[VAL_332:.*]] = llvm.add %[[VAL_181]], %[[VAL_331]] : i32 + // CHECK: %[[VAL_332:.*]] = llvm.xor %[[CST_0]], %[[VAL_331]] : i32 // CHECK: %[[VAL_333:.*]] = llvm.mlir.constant(0 : i32) : i32 - // CHECK: %[[VAL_334:.*]] = llvm.add %[[VAL_182]], %[[VAL_333]] : i32 + // CHECK: %[[VAL_334:.*]] = llvm.xor %[[VAL_181]], %[[VAL_333]] : i32 // CHECK: %[[VAL_335:.*]] = llvm.mlir.constant(22 : i32) : i32 - // CHECK: %[[VAL_336:.*]] = llvm.add %[[VAL_181]], %[[VAL_335]] : i32 + // CHECK: %[[VAL_336:.*]] = llvm.xor %[[CST_0]], %[[VAL_335]] : i32 // CHECK: %[[VAL_337:.*]] = llvm.mlir.constant(0 : i32) : i32 - // CHECK: %[[VAL_338:.*]] = llvm.add %[[VAL_182]], %[[VAL_337]] : i32 + // CHECK: %[[VAL_338:.*]] = llvm.xor %[[VAL_181]], %[[VAL_337]] : i32 // CHECK: %[[VAL_339:.*]] = llvm.mlir.constant(23 : i32) : i32 - // CHECK: %[[VAL_340:.*]] = llvm.add %[[VAL_181]], %[[VAL_339]] : i32 + // CHECK: %[[VAL_340:.*]] = llvm.xor %[[CST_0]], %[[VAL_339]] : i32 // CHECK: %[[VAL_341:.*]] = llvm.mlir.constant(0 : i32) : i32 - // CHECK: %[[VAL_342:.*]] = llvm.add %[[VAL_182]], %[[VAL_341]] : i32 + // CHECK: %[[VAL_342:.*]] = llvm.xor %[[VAL_181]], %[[VAL_341]] : i32 // COM: Offsets of rep [3, 0]. 
// CHECK: %[[VAL_343:.*]] = llvm.mlir.constant(24 : i32) : i32 - // CHECK: %[[VAL_344:.*]] = llvm.add %[[VAL_181]], %[[VAL_343]] : i32 + // CHECK: %[[VAL_344:.*]] = llvm.xor %[[CST_0]], %[[VAL_343]] : i32 // CHECK: %[[VAL_345:.*]] = llvm.mlir.constant(0 : i32) : i32 - // CHECK: %[[VAL_346:.*]] = llvm.add %[[VAL_182]], %[[VAL_345]] : i32 + // CHECK: %[[VAL_346:.*]] = llvm.xor %[[VAL_181]], %[[VAL_345]] : i32 // CHECK: %[[VAL_347:.*]] = llvm.mlir.constant(25 : i32) : i32 - // CHECK: %[[VAL_348:.*]] = llvm.add %[[VAL_181]], %[[VAL_347]] : i32 + // CHECK: %[[VAL_348:.*]] = llvm.xor %[[CST_0]], %[[VAL_347]] : i32 // CHECK: %[[VAL_349:.*]] = llvm.mlir.constant(0 : i32) : i32 - // CHECK: %[[VAL_350:.*]] = llvm.add %[[VAL_182]], %[[VAL_349]] : i32 + // CHECK: %[[VAL_350:.*]] = llvm.xor %[[VAL_181]], %[[VAL_349]] : i32 // CHECK: %[[VAL_351:.*]] = llvm.mlir.constant(26 : i32) : i32 - // CHECK: %[[VAL_352:.*]] = llvm.add %[[VAL_181]], %[[VAL_351]] : i32 + // CHECK: %[[VAL_352:.*]] = llvm.xor %[[CST_0]], %[[VAL_351]] : i32 // CHECK: %[[VAL_353:.*]] = llvm.mlir.constant(0 : i32) : i32 - // CHECK: %[[VAL_354:.*]] = llvm.add %[[VAL_182]], %[[VAL_353]] : i32 + // CHECK: %[[VAL_354:.*]] = llvm.xor %[[VAL_181]], %[[VAL_353]] : i32 // CHECK: %[[VAL_355:.*]] = llvm.mlir.constant(27 : i32) : i32 - // CHECK: %[[VAL_356:.*]] = llvm.add %[[VAL_181]], %[[VAL_355]] : i32 + // CHECK: %[[VAL_356:.*]] = llvm.xor %[[CST_0]], %[[VAL_355]] : i32 // CHECK: %[[VAL_357:.*]] = llvm.mlir.constant(0 : i32) : i32 - // CHECK: %[[VAL_358:.*]] = llvm.add %[[VAL_182]], %[[VAL_357]] : i32 + // CHECK: %[[VAL_358:.*]] = llvm.xor %[[VAL_181]], %[[VAL_357]] : i32 // CHECK: %[[VAL_359:.*]] = llvm.mlir.constant(28 : i32) : i32 - // CHECK: %[[VAL_360:.*]] = llvm.add %[[VAL_181]], %[[VAL_359]] : i32 + // CHECK: %[[VAL_360:.*]] = llvm.xor %[[CST_0]], %[[VAL_359]] : i32 // CHECK: %[[VAL_361:.*]] = llvm.mlir.constant(0 : i32) : i32 - // CHECK: %[[VAL_362:.*]] = llvm.add %[[VAL_182]], %[[VAL_361]] : i32 + // CHECK: 
%[[VAL_362:.*]] = llvm.xor %[[VAL_181]], %[[VAL_361]] : i32 // CHECK: %[[VAL_363:.*]] = llvm.mlir.constant(29 : i32) : i32 - // CHECK: %[[VAL_364:.*]] = llvm.add %[[VAL_181]], %[[VAL_363]] : i32 + // CHECK: %[[VAL_364:.*]] = llvm.xor %[[CST_0]], %[[VAL_363]] : i32 // CHECK: %[[VAL_365:.*]] = llvm.mlir.constant(0 : i32) : i32 - // CHECK: %[[VAL_366:.*]] = llvm.add %[[VAL_182]], %[[VAL_365]] : i32 + // CHECK: %[[VAL_366:.*]] = llvm.xor %[[VAL_181]], %[[VAL_365]] : i32 // CHECK: %[[VAL_367:.*]] = llvm.mlir.constant(30 : i32) : i32 - // CHECK: %[[VAL_368:.*]] = llvm.add %[[VAL_181]], %[[VAL_367]] : i32 + // CHECK: %[[VAL_368:.*]] = llvm.xor %[[CST_0]], %[[VAL_367]] : i32 // CHECK: %[[VAL_369:.*]] = llvm.mlir.constant(0 : i32) : i32 - // CHECK: %[[VAL_370:.*]] = llvm.add %[[VAL_182]], %[[VAL_369]] : i32 + // CHECK: %[[VAL_370:.*]] = llvm.xor %[[VAL_181]], %[[VAL_369]] : i32 // CHECK: %[[VAL_371:.*]] = llvm.mlir.constant(31 : i32) : i32 - // CHECK: %[[VAL_372:.*]] = llvm.add %[[VAL_181]], %[[VAL_371]] : i32 + // CHECK: %[[VAL_372:.*]] = llvm.xor %[[CST_0]], %[[VAL_371]] : i32 // CHECK: %[[VAL_373:.*]] = llvm.mlir.constant(0 : i32) : i32 - // CHECK: %[[VAL_374:.*]] = llvm.add %[[VAL_182]], %[[VAL_373]] : i32 + // CHECK: %[[VAL_374:.*]] = llvm.xor %[[VAL_181]], %[[VAL_373]] : i32 // COM: Offsets of rep [2, 1]. 
// CHECK: %[[VAL_375:.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK: %[[VAL_376:.*]] = llvm.add %[[VAL_181]], %[[VAL_375]] : i32 + // CHECK: %[[VAL_376:.*]] = llvm.xor %[[CST_0]], %[[VAL_375]] : i32 // CHECK: %[[VAL_377:.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK: %[[VAL_378:.*]] = llvm.add %[[VAL_182]], %[[VAL_377]] : i32 + // CHECK: %[[VAL_378:.*]] = llvm.xor %[[VAL_181]], %[[VAL_377]] : i32 // CHECK: %[[VAL_379:.*]] = llvm.mlir.constant(17 : i32) : i32 - // CHECK: %[[VAL_380:.*]] = llvm.add %[[VAL_181]], %[[VAL_379]] : i32 + // CHECK: %[[VAL_380:.*]] = llvm.xor %[[CST_0]], %[[VAL_379]] : i32 // CHECK: %[[VAL_381:.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK: %[[VAL_382:.*]] = llvm.add %[[VAL_182]], %[[VAL_381]] : i32 + // CHECK: %[[VAL_382:.*]] = llvm.xor %[[VAL_181]], %[[VAL_381]] : i32 // CHECK: %[[VAL_383:.*]] = llvm.mlir.constant(18 : i32) : i32 - // CHECK: %[[VAL_384:.*]] = llvm.add %[[VAL_181]], %[[VAL_383]] : i32 + // CHECK: %[[VAL_384:.*]] = llvm.xor %[[CST_0]], %[[VAL_383]] : i32 // CHECK: %[[VAL_385:.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK: %[[VAL_386:.*]] = llvm.add %[[VAL_182]], %[[VAL_385]] : i32 + // CHECK: %[[VAL_386:.*]] = llvm.xor %[[VAL_181]], %[[VAL_385]] : i32 // CHECK: %[[VAL_387:.*]] = llvm.mlir.constant(19 : i32) : i32 - // CHECK: %[[VAL_388:.*]] = llvm.add %[[VAL_181]], %[[VAL_387]] : i32 + // CHECK: %[[VAL_388:.*]] = llvm.xor %[[CST_0]], %[[VAL_387]] : i32 // CHECK: %[[VAL_389:.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK: %[[VAL_390:.*]] = llvm.add %[[VAL_182]], %[[VAL_389]] : i32 + // CHECK: %[[VAL_390:.*]] = llvm.xor %[[VAL_181]], %[[VAL_389]] : i32 // CHECK: %[[VAL_391:.*]] = llvm.mlir.constant(20 : i32) : i32 - // CHECK: %[[VAL_392:.*]] = llvm.add %[[VAL_181]], %[[VAL_391]] : i32 + // CHECK: %[[VAL_392:.*]] = llvm.xor %[[CST_0]], %[[VAL_391]] : i32 // CHECK: %[[VAL_393:.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK: %[[VAL_394:.*]] = llvm.add %[[VAL_182]], %[[VAL_393]] : i32 + // CHECK: 
%[[VAL_394:.*]] = llvm.xor %[[VAL_181]], %[[VAL_393]] : i32 // CHECK: %[[VAL_395:.*]] = llvm.mlir.constant(21 : i32) : i32 - // CHECK: %[[VAL_396:.*]] = llvm.add %[[VAL_181]], %[[VAL_395]] : i32 + // CHECK: %[[VAL_396:.*]] = llvm.xor %[[CST_0]], %[[VAL_395]] : i32 // CHECK: %[[VAL_397:.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK: %[[VAL_398:.*]] = llvm.add %[[VAL_182]], %[[VAL_397]] : i32 + // CHECK: %[[VAL_398:.*]] = llvm.xor %[[VAL_181]], %[[VAL_397]] : i32 // CHECK: %[[VAL_399:.*]] = llvm.mlir.constant(22 : i32) : i32 - // CHECK: %[[VAL_400:.*]] = llvm.add %[[VAL_181]], %[[VAL_399]] : i32 + // CHECK: %[[VAL_400:.*]] = llvm.xor %[[CST_0]], %[[VAL_399]] : i32 // CHECK: %[[VAL_401:.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK: %[[VAL_402:.*]] = llvm.add %[[VAL_182]], %[[VAL_401]] : i32 + // CHECK: %[[VAL_402:.*]] = llvm.xor %[[VAL_181]], %[[VAL_401]] : i32 // CHECK: %[[VAL_403:.*]] = llvm.mlir.constant(23 : i32) : i32 - // CHECK: %[[VAL_404:.*]] = llvm.add %[[VAL_181]], %[[VAL_403]] : i32 + // CHECK: %[[VAL_404:.*]] = llvm.xor %[[CST_0]], %[[VAL_403]] : i32 // CHECK: %[[VAL_405:.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK: %[[VAL_406:.*]] = llvm.add %[[VAL_182]], %[[VAL_405]] : i32 + // CHECK: %[[VAL_406:.*]] = llvm.xor %[[VAL_181]], %[[VAL_405]] : i32 // COM: Offsets of rep [3, 1]. 
// CHECK: %[[VAL_407:.*]] = llvm.mlir.constant(24 : i32) : i32 - // CHECK: %[[VAL_408:.*]] = llvm.add %[[VAL_181]], %[[VAL_407]] : i32 + // CHECK: %[[VAL_408:.*]] = llvm.xor %[[CST_0]], %[[VAL_407]] : i32 // CHECK: %[[VAL_409:.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK: %[[VAL_410:.*]] = llvm.add %[[VAL_182]], %[[VAL_409]] : i32 + // CHECK: %[[VAL_410:.*]] = llvm.xor %[[VAL_181]], %[[VAL_409]] : i32 // CHECK: %[[VAL_411:.*]] = llvm.mlir.constant(25 : i32) : i32 - // CHECK: %[[VAL_412:.*]] = llvm.add %[[VAL_181]], %[[VAL_411]] : i32 + // CHECK: %[[VAL_412:.*]] = llvm.xor %[[CST_0]], %[[VAL_411]] : i32 // CHECK: %[[VAL_413:.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK: %[[VAL_414:.*]] = llvm.add %[[VAL_182]], %[[VAL_413]] : i32 + // CHECK: %[[VAL_414:.*]] = llvm.xor %[[VAL_181]], %[[VAL_413]] : i32 // CHECK: %[[VAL_415:.*]] = llvm.mlir.constant(26 : i32) : i32 - // CHECK: %[[VAL_416:.*]] = llvm.add %[[VAL_181]], %[[VAL_415]] : i32 + // CHECK: %[[VAL_416:.*]] = llvm.xor %[[CST_0]], %[[VAL_415]] : i32 // CHECK: %[[VAL_417:.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK: %[[VAL_418:.*]] = llvm.add %[[VAL_182]], %[[VAL_417]] : i32 + // CHECK: %[[VAL_418:.*]] = llvm.xor %[[VAL_181]], %[[VAL_417]] : i32 // CHECK: %[[VAL_419:.*]] = llvm.mlir.constant(27 : i32) : i32 - // CHECK: %[[VAL_420:.*]] = llvm.add %[[VAL_181]], %[[VAL_419]] : i32 + // CHECK: %[[VAL_420:.*]] = llvm.xor %[[CST_0]], %[[VAL_419]] : i32 // CHECK: %[[VAL_421:.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK: %[[VAL_422:.*]] = llvm.add %[[VAL_182]], %[[VAL_421]] : i32 + // CHECK: %[[VAL_422:.*]] = llvm.xor %[[VAL_181]], %[[VAL_421]] : i32 // CHECK: %[[VAL_423:.*]] = llvm.mlir.constant(28 : i32) : i32 - // CHECK: %[[VAL_424:.*]] = llvm.add %[[VAL_181]], %[[VAL_423]] : i32 + // CHECK: %[[VAL_424:.*]] = llvm.xor %[[CST_0]], %[[VAL_423]] : i32 // CHECK: %[[VAL_425:.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK: %[[VAL_426:.*]] = llvm.add %[[VAL_182]], %[[VAL_425]] : i32 + // CHECK: 
%[[VAL_426:.*]] = llvm.xor %[[VAL_181]], %[[VAL_425]] : i32 // CHECK: %[[VAL_427:.*]] = llvm.mlir.constant(29 : i32) : i32 - // CHECK: %[[VAL_428:.*]] = llvm.add %[[VAL_181]], %[[VAL_427]] : i32 + // CHECK: %[[VAL_428:.*]] = llvm.xor %[[CST_0]], %[[VAL_427]] : i32 // CHECK: %[[VAL_429:.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK: %[[VAL_430:.*]] = llvm.add %[[VAL_182]], %[[VAL_429]] : i32 + // CHECK: %[[VAL_430:.*]] = llvm.xor %[[VAL_181]], %[[VAL_429]] : i32 // CHECK: %[[VAL_431:.*]] = llvm.mlir.constant(30 : i32) : i32 - // CHECK: %[[VAL_432:.*]] = llvm.add %[[VAL_181]], %[[VAL_431]] : i32 + // CHECK: %[[VAL_432:.*]] = llvm.xor %[[CST_0]], %[[VAL_431]] : i32 // CHECK: %[[VAL_433:.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK: %[[VAL_434:.*]] = llvm.add %[[VAL_182]], %[[VAL_433]] : i32 + // CHECK: %[[VAL_434:.*]] = llvm.xor %[[VAL_181]], %[[VAL_433]] : i32 // CHECK: %[[VAL_435:.*]] = llvm.mlir.constant(31 : i32) : i32 - // CHECK: %[[VAL_436:.*]] = llvm.add %[[VAL_181]], %[[VAL_435]] : i32 + // CHECK: %[[VAL_436:.*]] = llvm.xor %[[CST_0]], %[[VAL_435]] : i32 // CHECK: %[[VAL_437:.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK: %[[VAL_438:.*]] = llvm.add %[[VAL_182]], %[[VAL_437]] : i32 + // CHECK: %[[VAL_438:.*]] = llvm.xor %[[VAL_181]], %[[VAL_437]] : i32 tt.print " x: " {hex = false, isSigned = array} : %cst : tensor<32x32xf16, #dot_operand_a> tt.return } diff --git a/test/Proton/ops.mlir b/test/Proton/ops.mlir new file mode 100644 index 0000000000..22a17e3f0f --- /dev/null +++ b/test/Proton/ops.mlir @@ -0,0 +1,15 @@ +// RUN: triton-opt --split-input-file %s -cse -canonicalize | FileCheck %s + +module { + // CHECK-LABEL: proton_record + tt.func @proton_record() { + // CHECK: proton.record() {isStart = true, regionId = 1 : i32} + // CHECK-NEXT: proton.record() {isStart = false, regionId = 1 : i32} + // CHECK-NEXT: tt.return + proton.record() {isStart = true, regionId = 1 : i32} + proton.record() {isStart = false, regionId = 1 : i32} + tt.return 
+ } +} // end module + +// ----- diff --git a/test/TritonIntelGPU/tritonintelgpu-convert-layout-shortcut.mlir b/test/TritonIntelGPU/tritonintelgpu-convert-layout-shortcut.mlir index 7bfff4fc36..48c9850418 100644 --- a/test/TritonIntelGPU/tritonintelgpu-convert-layout-shortcut.mlir +++ b/test/TritonIntelGPU/tritonintelgpu-convert-layout-shortcut.mlir @@ -6,46 +6,8 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 32 // CHECK-SAME: %[[VAL_0:.*]]: !llvm.struct<({{.*}})>) attributes {triton_gen.intel_reqd_sub_group_size = [16 : i32], triton_gen.max_work_group_size = [512 : i32, 1 : i32, 1 : i32]} { tt.func public @convert_dpas_to_dot_rep_cluster_1_2(%arg: tensor<1024x32xf16, #dpas>) { // COM: The repetitions order of dot layout and dpas layout are same when the GEMM tiling is clustered as repCluster [1, 2]. - // CHECK: %[[VAL_81:.*]] = llvm.mlir.constant(7 : i32) : i32 - // CHECK: %[[REP_0:.*]] = llvm.insertelement {{.*}}, {{.*}}{{\[}}%[[VAL_81]] : i32] : vector<8xf16> - // CHECK: %[[VAL_98:.*]] = llvm.mlir.constant(7 : i32) : i32 - // CHECK: %[[REP_1:.*]] = llvm.insertelement {{.*}}, {{.*}}{{\[}}%[[VAL_98]] : i32] : vector<8xf16> - // CHECK: %[[VAL_115:.*]] = llvm.mlir.constant(7 : i32) : i32 - // CHECK: %[[REP_2:.*]] = llvm.insertelement {{.*}}, {{.*}}{{\[}}%[[VAL_115]] : i32] : vector<8xf16> - // CHECK: %[[VAL_132:.*]] = llvm.mlir.constant(7 : i32) : i32 - // CHECK: %[[REP_3:.*]] = llvm.insertelement {{.*}}, {{.*}}{{\[}}%[[VAL_132]] : i32] : vector<8xf16> - // CHECK: %[[VAL_149:.*]] = llvm.mlir.constant(7 : i32) : i32 - // CHECK: %[[REP_4:.*]] = llvm.insertelement {{.*}}, {{.*}}{{\[}}%[[VAL_149]] : i32] : vector<8xf16> - // CHECK: %[[VAL_166:.*]] = llvm.mlir.constant(7 : i32) : i32 - // CHECK: %[[REP_5:.*]] = llvm.insertelement {{.*}}, {{.*}}{{\[}}%[[VAL_166]] : i32] : vector<8xf16> - // CHECK: %[[VAL_183:.*]] = llvm.mlir.constant(7 : i32) : i32 - // CHECK: %[[REP_6:.*]] = llvm.insertelement {{.*}}, {{.*}}{{\[}}%[[VAL_183]] : i32] : 
vector<8xf16> - // CHECK: %[[VAL_200:.*]] = llvm.mlir.constant(7 : i32) : i32 - // CHECK: %[[REP_7:.*]] = llvm.insertelement {{.*}}, {{.*}}{{\[}}%[[VAL_200]] : i32] : vector<8xf16> - // CHECK: %[[VAL_216:.*]] = llvm.mlir.constant(7 : i32) : i32 - // CHECK: %[[VAL_217:.*]] = llvm.extractelement %[[REP_0]]{{\[}}%[[VAL_216]] : i32] : vector<8xf16> - // CHECK: %[[VAL_232:.*]] = llvm.mlir.constant(7 : i32) : i32 - // CHECK: %[[VAL_233:.*]] = llvm.extractelement %[[REP_1]]{{\[}}%[[VAL_232]] : i32] : vector<8xf16> - // CHECK: %[[VAL_248:.*]] = llvm.mlir.constant(7 : i32) : i32 - // CHECK: %[[VAL_249:.*]] = llvm.extractelement %[[REP_2]]{{\[}}%[[VAL_248]] : i32] : vector<8xf16> - // CHECK: %[[VAL_264:.*]] = llvm.mlir.constant(7 : i32) : i32 - // CHECK: %[[VAL_265:.*]] = llvm.extractelement %[[REP_3]]{{\[}}%[[VAL_264]] : i32] : vector<8xf16> - // CHECK: %[[VAL_280:.*]] = llvm.mlir.constant(7 : i32) : i32 - // CHECK: %[[VAL_281:.*]] = llvm.extractelement %[[REP_4]]{{\[}}%[[VAL_280]] : i32] : vector<8xf16> - // CHECK: %[[VAL_296:.*]] = llvm.mlir.constant(7 : i32) : i32 - // CHECK: %[[VAL_297:.*]] = llvm.extractelement %[[REP_5]]{{\[}}%[[VAL_296]] : i32] : vector<8xf16> - // CHECK: %[[VAL_312:.*]] = llvm.mlir.constant(7 : i32) : i32 - // CHECK: %[[VAL_313:.*]] = llvm.extractelement %[[REP_6]]{{\[}}%[[VAL_312]] : i32] : vector<8xf16> - // CHECK: %[[VAL_328:.*]] = llvm.mlir.constant(7 : i32) : i32 - // CHECK: %[[VAL_329:.*]] = llvm.extractelement %[[REP_7]]{{\[}}%[[VAL_328]] : i32] : vector<8xf16> - // CHECK: %[[VAL_338:.*]] = llvm.insertvalue %[[VAL_217]], {{.*}}[7] - // CHECK: %[[VAL_346:.*]] = llvm.insertvalue %[[VAL_233]], {{.*}}[15] - // CHECK: %[[VAL_354:.*]] = llvm.insertvalue %[[VAL_249]], {{.*}}[23] - // CHECK: %[[VAL_362:.*]] = llvm.insertvalue %[[VAL_265]], {{.*}}[31] - // CHECK: %[[VAL_370:.*]] = llvm.insertvalue %[[VAL_281]], {{.*}}[39] - // CHECK: %[[VAL_378:.*]] = llvm.insertvalue %[[VAL_297]], {{.*}}[47] - // CHECK: %[[VAL_386:.*]] = llvm.insertvalue 
%[[VAL_313]], {{.*}}[55] - // CHECK: %[[VAL_394:.*]] = llvm.insertvalue %[[VAL_329]], {{.*}}[63] + // CHECK-NOT: llvm.insertvalue + // CHECK-NOT: llvm.extractvalue %108 = triton_gpu.convert_layout %arg : tensor<1024x32xf16, #dpas> -> tensor<1024x32xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #dpas, kWidth = 2}>> tt.return } @@ -62,46 +24,135 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 32 // COM: - 0, 1, 2, 3, 4, 5, 6, 7. // COM: The repetitions order of dot layout when the GEMM tiling is clustered as repCluster [2, 2]: // COM: - 0, 2, 1, 3, 4, 6, 5, 7. - // CHECK: %[[VAL_81:.*]] = llvm.mlir.constant(7 : i32) : i32 - // CHECK: %[[REP_0:.*]] = llvm.insertelement {{.*}}, {{.*}}{{\[}}%[[VAL_81]] : i32] : vector<8xf16> - // CHECK: %[[VAL_98:.*]] = llvm.mlir.constant(7 : i32) : i32 - // CHECK: %[[REP_1:.*]] = llvm.insertelement {{.*}}, {{.*}}{{\[}}%[[VAL_98]] : i32] : vector<8xf16> - // CHECK: %[[VAL_115:.*]] = llvm.mlir.constant(7 : i32) : i32 - // CHECK: %[[REP_2:.*]] = llvm.insertelement {{.*}}, {{.*}}{{\[}}%[[VAL_115]] : i32] : vector<8xf16> - // CHECK: %[[VAL_132:.*]] = llvm.mlir.constant(7 : i32) : i32 - // CHECK: %[[REP_3:.*]] = llvm.insertelement {{.*}}, {{.*}}{{\[}}%[[VAL_132]] : i32] : vector<8xf16> - // CHECK: %[[VAL_149:.*]] = llvm.mlir.constant(7 : i32) : i32 - // CHECK: %[[REP_4:.*]] = llvm.insertelement {{.*}}, {{.*}}{{\[}}%[[VAL_149]] : i32] : vector<8xf16> - // CHECK: %[[VAL_166:.*]] = llvm.mlir.constant(7 : i32) : i32 - // CHECK: %[[REP_5:.*]] = llvm.insertelement {{.*}}, {{.*}}{{\[}}%[[VAL_166]] : i32] : vector<8xf16> - // CHECK: %[[VAL_183:.*]] = llvm.mlir.constant(7 : i32) : i32 - // CHECK: %[[REP_6:.*]] = llvm.insertelement {{.*}}, {{.*}}{{\[}}%[[VAL_183]] : i32] : vector<8xf16> - // CHECK: %[[VAL_200:.*]] = llvm.mlir.constant(7 : i32) : i32 - // CHECK: %[[REP_7:.*]] = llvm.insertelement {{.*}}, {{.*}}{{\[}}%[[VAL_200]] : i32] : vector<8xf16> - // CHECK: %[[VAL_216:.*]] = llvm.mlir.constant(7 : i32) : i32 - //
CHECK: %[[VAL_217:.*]] = llvm.extractelement %[[REP_0]]{{\[}}%[[VAL_216]] : i32] : vector<8xf16> - // CHECK: %[[VAL_232:.*]] = llvm.mlir.constant(7 : i32) : i32 - // CHECK: %[[VAL_233:.*]] = llvm.extractelement %[[REP_2]]{{\[}}%[[VAL_232]] : i32] : vector<8xf16> - // CHECK: %[[VAL_248:.*]] = llvm.mlir.constant(7 : i32) : i32 - // CHECK: %[[VAL_249:.*]] = llvm.extractelement %[[REP_1]]{{\[}}%[[VAL_248]] : i32] : vector<8xf16> - // CHECK: %[[VAL_264:.*]] = llvm.mlir.constant(7 : i32) : i32 - // CHECK: %[[VAL_265:.*]] = llvm.extractelement %[[REP_3]]{{\[}}%[[VAL_264]] : i32] : vector<8xf16> - // CHECK: %[[VAL_280:.*]] = llvm.mlir.constant(7 : i32) : i32 - // CHECK: %[[VAL_281:.*]] = llvm.extractelement %[[REP_4]]{{\[}}%[[VAL_280]] : i32] : vector<8xf16> - // CHECK: %[[VAL_296:.*]] = llvm.mlir.constant(7 : i32) : i32 - // CHECK: %[[VAL_297:.*]] = llvm.extractelement %[[REP_6]]{{\[}}%[[VAL_296]] : i32] : vector<8xf16> - // CHECK: %[[VAL_312:.*]] = llvm.mlir.constant(7 : i32) : i32 - // CHECK: %[[VAL_313:.*]] = llvm.extractelement %[[REP_5]]{{\[}}%[[VAL_312]] : i32] : vector<8xf16> - // CHECK: %[[VAL_328:.*]] = llvm.mlir.constant(7 : i32) : i32 - // CHECK: %[[VAL_329:.*]] = llvm.extractelement %[[REP_7]]{{\[}}%[[VAL_328]] : i32] : vector<8xf16> - // CHECK: %[[VAL_338:.*]] = llvm.insertvalue %[[VAL_217]], {{.*}}[7] - // CHECK: %[[VAL_346:.*]] = llvm.insertvalue %[[VAL_233]], {{.*}}[15] - // CHECK: %[[VAL_354:.*]] = llvm.insertvalue %[[VAL_249]], {{.*}}[23] - // CHECK: %[[VAL_362:.*]] = llvm.insertvalue %[[VAL_265]], {{.*}}[31] - // CHECK: %[[VAL_370:.*]] = llvm.insertvalue %[[VAL_281]], {{.*}}[39] - // CHECK: %[[VAL_378:.*]] = llvm.insertvalue %[[VAL_297]], {{.*}}[47] - // CHECK: %[[VAL_386:.*]] = llvm.insertvalue %[[VAL_313]], {{.*}}[55] - // CHECK: %[[VAL_394:.*]] = llvm.insertvalue %[[VAL_329]], {{.*}}[63] + // CHECK: %[[VAL_1:.*]] = llvm.extractvalue %[[VAL_0]][0] + // CHECK: %[[VAL_2:.*]] = llvm.extractvalue %[[VAL_0]][1] + // CHECK: %[[VAL_3:.*]] = llvm.extractvalue 
%[[VAL_0]][2] + // CHECK: %[[VAL_4:.*]] = llvm.extractvalue %[[VAL_0]][3] + // CHECK: %[[VAL_5:.*]] = llvm.extractvalue %[[VAL_0]][4] + // CHECK: %[[VAL_6:.*]] = llvm.extractvalue %[[VAL_0]][5] + // CHECK: %[[VAL_7:.*]] = llvm.extractvalue %[[VAL_0]][6] + // CHECK: %[[VAL_8:.*]] = llvm.extractvalue %[[VAL_0]][7] + // CHECK: %[[VAL_9:.*]] = llvm.extractvalue %[[VAL_0]][8] + // CHECK: %[[VAL_10:.*]] = llvm.extractvalue %[[VAL_0]][9] + // CHECK: %[[VAL_11:.*]] = llvm.extractvalue %[[VAL_0]][10] + // CHECK: %[[VAL_12:.*]] = llvm.extractvalue %[[VAL_0]][11] + // CHECK: %[[VAL_13:.*]] = llvm.extractvalue %[[VAL_0]][12] + // CHECK: %[[VAL_14:.*]] = llvm.extractvalue %[[VAL_0]][13] + // CHECK: %[[VAL_15:.*]] = llvm.extractvalue %[[VAL_0]][14] + // CHECK: %[[VAL_16:.*]] = llvm.extractvalue %[[VAL_0]][15] + // CHECK: %[[VAL_17:.*]] = llvm.extractvalue %[[VAL_0]][16] + // CHECK: %[[VAL_18:.*]] = llvm.extractvalue %[[VAL_0]][17] + // CHECK: %[[VAL_19:.*]] = llvm.extractvalue %[[VAL_0]][18] + // CHECK: %[[VAL_20:.*]] = llvm.extractvalue %[[VAL_0]][19] + // CHECK: %[[VAL_21:.*]] = llvm.extractvalue %[[VAL_0]][20] + // CHECK: %[[VAL_22:.*]] = llvm.extractvalue %[[VAL_0]][21] + // CHECK: %[[VAL_23:.*]] = llvm.extractvalue %[[VAL_0]][22] + // CHECK: %[[VAL_24:.*]] = llvm.extractvalue %[[VAL_0]][23] + // CHECK: %[[VAL_25:.*]] = llvm.extractvalue %[[VAL_0]][24] + // CHECK: %[[VAL_26:.*]] = llvm.extractvalue %[[VAL_0]][25] + // CHECK: %[[VAL_27:.*]] = llvm.extractvalue %[[VAL_0]][26] + // CHECK: %[[VAL_28:.*]] = llvm.extractvalue %[[VAL_0]][27] + // CHECK: %[[VAL_29:.*]] = llvm.extractvalue %[[VAL_0]][28] + // CHECK: %[[VAL_30:.*]] = llvm.extractvalue %[[VAL_0]][29] + // CHECK: %[[VAL_31:.*]] = llvm.extractvalue %[[VAL_0]][30] + // CHECK: %[[VAL_32:.*]] = llvm.extractvalue %[[VAL_0]][31] + // CHECK: %[[VAL_33:.*]] = llvm.extractvalue %[[VAL_0]][32] + // CHECK: %[[VAL_34:.*]] = llvm.extractvalue %[[VAL_0]][33] + // CHECK: %[[VAL_35:.*]] = llvm.extractvalue %[[VAL_0]][34] + // CHECK: 
%[[VAL_36:.*]] = llvm.extractvalue %[[VAL_0]][35] + // CHECK: %[[VAL_37:.*]] = llvm.extractvalue %[[VAL_0]][36] + // CHECK: %[[VAL_38:.*]] = llvm.extractvalue %[[VAL_0]][37] + // CHECK: %[[VAL_39:.*]] = llvm.extractvalue %[[VAL_0]][38] + // CHECK: %[[VAL_40:.*]] = llvm.extractvalue %[[VAL_0]][39] + // CHECK: %[[VAL_41:.*]] = llvm.extractvalue %[[VAL_0]][40] + // CHECK: %[[VAL_42:.*]] = llvm.extractvalue %[[VAL_0]][41] + // CHECK: %[[VAL_43:.*]] = llvm.extractvalue %[[VAL_0]][42] + // CHECK: %[[VAL_44:.*]] = llvm.extractvalue %[[VAL_0]][43] + // CHECK: %[[VAL_45:.*]] = llvm.extractvalue %[[VAL_0]][44] + // CHECK: %[[VAL_46:.*]] = llvm.extractvalue %[[VAL_0]][45] + // CHECK: %[[VAL_47:.*]] = llvm.extractvalue %[[VAL_0]][46] + // CHECK: %[[VAL_48:.*]] = llvm.extractvalue %[[VAL_0]][47] + // CHECK: %[[VAL_49:.*]] = llvm.extractvalue %[[VAL_0]][48] + // CHECK: %[[VAL_50:.*]] = llvm.extractvalue %[[VAL_0]][49] + // CHECK: %[[VAL_51:.*]] = llvm.extractvalue %[[VAL_0]][50] + // CHECK: %[[VAL_52:.*]] = llvm.extractvalue %[[VAL_0]][51] + // CHECK: %[[VAL_53:.*]] = llvm.extractvalue %[[VAL_0]][52] + // CHECK: %[[VAL_54:.*]] = llvm.extractvalue %[[VAL_0]][53] + // CHECK: %[[VAL_55:.*]] = llvm.extractvalue %[[VAL_0]][54] + // CHECK: %[[VAL_56:.*]] = llvm.extractvalue %[[VAL_0]][55] + // CHECK: %[[VAL_57:.*]] = llvm.extractvalue %[[VAL_0]][56] + // CHECK: %[[VAL_58:.*]] = llvm.extractvalue %[[VAL_0]][57] + // CHECK: %[[VAL_59:.*]] = llvm.extractvalue %[[VAL_0]][58] + // CHECK: %[[VAL_60:.*]] = llvm.extractvalue %[[VAL_0]][59] + // CHECK: %[[VAL_61:.*]] = llvm.extractvalue %[[VAL_0]][60] + // CHECK: %[[VAL_62:.*]] = llvm.extractvalue %[[VAL_0]][61] + // CHECK: %[[VAL_63:.*]] = llvm.extractvalue %[[VAL_0]][62] + // CHECK: %[[VAL_64:.*]] = llvm.extractvalue %[[VAL_0]][63] + // CHECK: %[[VAL_65:.*]] = llvm.mlir.undef + // CHECK: %[[VAL_66:.*]] = llvm.insertvalue %[[VAL_1]], %[[VAL_65]][0] + // CHECK: %[[VAL_67:.*]] = llvm.insertvalue %[[VAL_2]], %[[VAL_66]][1] + // CHECK: 
%[[VAL_68:.*]] = llvm.insertvalue %[[VAL_3]], %[[VAL_67]][2] + // CHECK: %[[VAL_69:.*]] = llvm.insertvalue %[[VAL_4]], %[[VAL_68]][3] + // CHECK: %[[VAL_70:.*]] = llvm.insertvalue %[[VAL_5]], %[[VAL_69]][4] + // CHECK: %[[VAL_71:.*]] = llvm.insertvalue %[[VAL_6]], %[[VAL_70]][5] + // CHECK: %[[VAL_72:.*]] = llvm.insertvalue %[[VAL_7]], %[[VAL_71]][6] + // CHECK: %[[VAL_73:.*]] = llvm.insertvalue %[[VAL_8]], %[[VAL_72]][7] + // CHECK: %[[VAL_74:.*]] = llvm.insertvalue %[[VAL_17]], %[[VAL_73]][8] + // CHECK: %[[VAL_75:.*]] = llvm.insertvalue %[[VAL_18]], %[[VAL_74]][9] + // CHECK: %[[VAL_76:.*]] = llvm.insertvalue %[[VAL_19]], %[[VAL_75]][10] + // CHECK: %[[VAL_77:.*]] = llvm.insertvalue %[[VAL_20]], %[[VAL_76]][11] + // CHECK: %[[VAL_78:.*]] = llvm.insertvalue %[[VAL_21]], %[[VAL_77]][12] + // CHECK: %[[VAL_79:.*]] = llvm.insertvalue %[[VAL_22]], %[[VAL_78]][13] + // CHECK: %[[VAL_80:.*]] = llvm.insertvalue %[[VAL_23]], %[[VAL_79]][14] + // CHECK: %[[VAL_81:.*]] = llvm.insertvalue %[[VAL_24]], %[[VAL_80]][15] + // CHECK: %[[VAL_82:.*]] = llvm.insertvalue %[[VAL_9]], %[[VAL_81]][16] + // CHECK: %[[VAL_83:.*]] = llvm.insertvalue %[[VAL_10]], %[[VAL_82]][17] + // CHECK: %[[VAL_84:.*]] = llvm.insertvalue %[[VAL_11]], %[[VAL_83]][18] + // CHECK: %[[VAL_85:.*]] = llvm.insertvalue %[[VAL_12]], %[[VAL_84]][19] + // CHECK: %[[VAL_86:.*]] = llvm.insertvalue %[[VAL_13]], %[[VAL_85]][20] + // CHECK: %[[VAL_87:.*]] = llvm.insertvalue %[[VAL_14]], %[[VAL_86]][21] + // CHECK: %[[VAL_88:.*]] = llvm.insertvalue %[[VAL_15]], %[[VAL_87]][22] + // CHECK: %[[VAL_89:.*]] = llvm.insertvalue %[[VAL_16]], %[[VAL_88]][23] + // CHECK: %[[VAL_90:.*]] = llvm.insertvalue %[[VAL_25]], %[[VAL_89]][24] + // CHECK: %[[VAL_91:.*]] = llvm.insertvalue %[[VAL_26]], %[[VAL_90]][25] + // CHECK: %[[VAL_92:.*]] = llvm.insertvalue %[[VAL_27]], %[[VAL_91]][26] + // CHECK: %[[VAL_93:.*]] = llvm.insertvalue %[[VAL_28]], %[[VAL_92]][27] + // CHECK: %[[VAL_94:.*]] = llvm.insertvalue %[[VAL_29]], %[[VAL_93]][28] + 
// CHECK: %[[VAL_95:.*]] = llvm.insertvalue %[[VAL_30]], %[[VAL_94]][29] + // CHECK: %[[VAL_96:.*]] = llvm.insertvalue %[[VAL_31]], %[[VAL_95]][30] + // CHECK: %[[VAL_97:.*]] = llvm.insertvalue %[[VAL_32]], %[[VAL_96]][31] + // CHECK: %[[VAL_98:.*]] = llvm.insertvalue %[[VAL_33]], %[[VAL_97]][32] + // CHECK: %[[VAL_99:.*]] = llvm.insertvalue %[[VAL_34]], %[[VAL_98]][33] + // CHECK: %[[VAL_100:.*]] = llvm.insertvalue %[[VAL_35]], %[[VAL_99]][34] + // CHECK: %[[VAL_101:.*]] = llvm.insertvalue %[[VAL_36]], %[[VAL_100]][35] + // CHECK: %[[VAL_102:.*]] = llvm.insertvalue %[[VAL_37]], %[[VAL_101]][36] + // CHECK: %[[VAL_103:.*]] = llvm.insertvalue %[[VAL_38]], %[[VAL_102]][37] + // CHECK: %[[VAL_104:.*]] = llvm.insertvalue %[[VAL_39]], %[[VAL_103]][38] + // CHECK: %[[VAL_105:.*]] = llvm.insertvalue %[[VAL_40]], %[[VAL_104]][39] + // CHECK: %[[VAL_106:.*]] = llvm.insertvalue %[[VAL_49]], %[[VAL_105]][40] + // CHECK: %[[VAL_107:.*]] = llvm.insertvalue %[[VAL_50]], %[[VAL_106]][41] + // CHECK: %[[VAL_108:.*]] = llvm.insertvalue %[[VAL_51]], %[[VAL_107]][42] + // CHECK: %[[VAL_109:.*]] = llvm.insertvalue %[[VAL_52]], %[[VAL_108]][43] + // CHECK: %[[VAL_110:.*]] = llvm.insertvalue %[[VAL_53]], %[[VAL_109]][44] + // CHECK: %[[VAL_111:.*]] = llvm.insertvalue %[[VAL_54]], %[[VAL_110]][45] + // CHECK: %[[VAL_112:.*]] = llvm.insertvalue %[[VAL_55]], %[[VAL_111]][46] + // CHECK: %[[VAL_113:.*]] = llvm.insertvalue %[[VAL_56]], %[[VAL_112]][47] + // CHECK: %[[VAL_114:.*]] = llvm.insertvalue %[[VAL_41]], %[[VAL_113]][48] + // CHECK: %[[VAL_115:.*]] = llvm.insertvalue %[[VAL_42]], %[[VAL_114]][49] + // CHECK: %[[VAL_116:.*]] = llvm.insertvalue %[[VAL_43]], %[[VAL_115]][50] + // CHECK: %[[VAL_117:.*]] = llvm.insertvalue %[[VAL_44]], %[[VAL_116]][51] + // CHECK: %[[VAL_118:.*]] = llvm.insertvalue %[[VAL_45]], %[[VAL_117]][52] + // CHECK: %[[VAL_119:.*]] = llvm.insertvalue %[[VAL_46]], %[[VAL_118]][53] + // CHECK: %[[VAL_120:.*]] = llvm.insertvalue %[[VAL_47]], %[[VAL_119]][54] + // 
CHECK: %[[VAL_121:.*]] = llvm.insertvalue %[[VAL_48]], %[[VAL_120]][55] + // CHECK: %[[VAL_122:.*]] = llvm.insertvalue %[[VAL_57]], %[[VAL_121]][56] + // CHECK: %[[VAL_123:.*]] = llvm.insertvalue %[[VAL_58]], %[[VAL_122]][57] + // CHECK: %[[VAL_124:.*]] = llvm.insertvalue %[[VAL_59]], %[[VAL_123]][58] + // CHECK: %[[VAL_125:.*]] = llvm.insertvalue %[[VAL_60]], %[[VAL_124]][59] + // CHECK: %[[VAL_126:.*]] = llvm.insertvalue %[[VAL_61]], %[[VAL_125]][60] + // CHECK: %[[VAL_127:.*]] = llvm.insertvalue %[[VAL_62]], %[[VAL_126]][61] + // CHECK: %[[VAL_128:.*]] = llvm.insertvalue %[[VAL_63]], %[[VAL_127]][62] + // CHECK: %[[VAL_129:.*]] = llvm.insertvalue %[[VAL_64]], %[[VAL_128]][63] %108 = triton_gpu.convert_layout %arg : tensor<1024x32xf16, #dpas> -> tensor<1024x32xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #dpas, kWidth = 2}>> tt.return } @@ -118,46 +169,135 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 32 // COM: - 0, 1, 2, 3, 4, 5, 6, 7. // COM: The repetitions order of dot layout when the GEMM tiling is clustered as repCluster [4, 2]: // COM: - 0, 2, 4, 6, 1, 3, 5, 7. 
- // CHECK: %[[VAL_81:.*]] = llvm.mlir.constant(7 : i32) : i32 - // CHECK: %[[REP_0:.*]] = llvm.insertelement {{.*}}, {{.*}}{{\[}}%[[VAL_81]] : i32] : vector<8xf16> - // CHECK: %[[VAL_98:.*]] = llvm.mlir.constant(7 : i32) : i32 - // CHECK: %[[REP_1:.*]] = llvm.insertelement {{.*}}, {{.*}}{{\[}}%[[VAL_98]] : i32] : vector<8xf16> - // CHECK: %[[VAL_115:.*]] = llvm.mlir.constant(7 : i32) : i32 - // CHECK: %[[REP_2:.*]] = llvm.insertelement {{.*}}, {{.*}}{{\[}}%[[VAL_115]] : i32] : vector<8xf16> - // CHECK: %[[VAL_132:.*]] = llvm.mlir.constant(7 : i32) : i32 - // CHECK: %[[REP_3:.*]] = llvm.insertelement {{.*}}, {{.*}}{{\[}}%[[VAL_132]] : i32] : vector<8xf16> - // CHECK: %[[VAL_149:.*]] = llvm.mlir.constant(7 : i32) : i32 - // CHECK: %[[REP_4:.*]] = llvm.insertelement {{.*}}, {{.*}}{{\[}}%[[VAL_149]] : i32] : vector<8xf16> - // CHECK: %[[VAL_166:.*]] = llvm.mlir.constant(7 : i32) : i32 - // CHECK: %[[REP_5:.*]] = llvm.insertelement {{.*}}, {{.*}}{{\[}}%[[VAL_166]] : i32] : vector<8xf16> - // CHECK: %[[VAL_183:.*]] = llvm.mlir.constant(7 : i32) : i32 - // CHECK: %[[REP_6:.*]] = llvm.insertelement {{.*}}, {{.*}}{{\[}}%[[VAL_183]] : i32] : vector<8xf16> - // CHECK: %[[VAL_200:.*]] = llvm.mlir.constant(7 : i32) : i32 - // CHECK: %[[REP_7:.*]] = llvm.insertelement {{.*}}, {{.*}}{{\[}}%[[VAL_200]] : i32] : vector<8xf16> - // CHECK: %[[VAL_216:.*]] = llvm.mlir.constant(7 : i32) : i32 - // CHECK: %[[VAL_217:.*]] = llvm.extractelement %[[REP_0]]{{\[}}%[[VAL_216]] : i32] : vector<8xf16> - // CHECK: %[[VAL_232:.*]] = llvm.mlir.constant(7 : i32) : i32 - // CHECK: %[[VAL_233:.*]] = llvm.extractelement %[[REP_2]]{{\[}}%[[VAL_232]] : i32] : vector<8xf16> - // CHECK: %[[VAL_248:.*]] = llvm.mlir.constant(7 : i32) : i32 - // CHECK: %[[VAL_249:.*]] = llvm.extractelement %[[REP_4]]{{\[}}%[[VAL_248]] : i32] : vector<8xf16> - // CHECK: %[[VAL_264:.*]] = llvm.mlir.constant(7 : i32) : i32 - // CHECK: %[[VAL_265:.*]] = llvm.extractelement %[[REP_6]]{{\[}}%[[VAL_264]] : i32] : vector<8xf16> - 
// CHECK: %[[VAL_280:.*]] = llvm.mlir.constant(7 : i32) : i32 - // CHECK: %[[VAL_281:.*]] = llvm.extractelement %[[REP_1]]{{\[}}%[[VAL_280]] : i32] : vector<8xf16> - // CHECK: %[[VAL_296:.*]] = llvm.mlir.constant(7 : i32) : i32 - // CHECK: %[[VAL_297:.*]] = llvm.extractelement %[[REP_3]]{{\[}}%[[VAL_296]] : i32] : vector<8xf16> - // CHECK: %[[VAL_312:.*]] = llvm.mlir.constant(7 : i32) : i32 - // CHECK: %[[VAL_313:.*]] = llvm.extractelement %[[REP_5]]{{\[}}%[[VAL_312]] : i32] : vector<8xf16> - // CHECK: %[[VAL_328:.*]] = llvm.mlir.constant(7 : i32) : i32 - // CHECK: %[[VAL_329:.*]] = llvm.extractelement %[[REP_7]]{{\[}}%[[VAL_328]] : i32] : vector<8xf16> - // CHECK: %[[VAL_338:.*]] = llvm.insertvalue %[[VAL_217]], {{.*}}[7] - // CHECK: %[[VAL_346:.*]] = llvm.insertvalue %[[VAL_233]], {{.*}}[15] - // CHECK: %[[VAL_354:.*]] = llvm.insertvalue %[[VAL_249]], {{.*}}[23] - // CHECK: %[[VAL_362:.*]] = llvm.insertvalue %[[VAL_265]], {{.*}}[31] - // CHECK: %[[VAL_370:.*]] = llvm.insertvalue %[[VAL_281]], {{.*}}[39] - // CHECK: %[[VAL_378:.*]] = llvm.insertvalue %[[VAL_297]], {{.*}}[47] - // CHECK: %[[VAL_386:.*]] = llvm.insertvalue %[[VAL_313]], {{.*}}[55] - // CHECK: %[[VAL_394:.*]] = llvm.insertvalue %[[VAL_329]], {{.*}}[63] + // CHECK: %[[VAL_1:.*]] = llvm.extractvalue %[[VAL_0]][0] + // CHECK: %[[VAL_2:.*]] = llvm.extractvalue %[[VAL_0]][1] + // CHECK: %[[VAL_3:.*]] = llvm.extractvalue %[[VAL_0]][2] + // CHECK: %[[VAL_4:.*]] = llvm.extractvalue %[[VAL_0]][3] + // CHECK: %[[VAL_5:.*]] = llvm.extractvalue %[[VAL_0]][4] + // CHECK: %[[VAL_6:.*]] = llvm.extractvalue %[[VAL_0]][5] + // CHECK: %[[VAL_7:.*]] = llvm.extractvalue %[[VAL_0]][6] + // CHECK: %[[VAL_8:.*]] = llvm.extractvalue %[[VAL_0]][7] + // CHECK: %[[VAL_9:.*]] = llvm.extractvalue %[[VAL_0]][8] + // CHECK: %[[VAL_10:.*]] = llvm.extractvalue %[[VAL_0]][9] + // CHECK: %[[VAL_11:.*]] = llvm.extractvalue %[[VAL_0]][10] + // CHECK: %[[VAL_12:.*]] = llvm.extractvalue %[[VAL_0]][11] + // CHECK: %[[VAL_13:.*]] = 
llvm.extractvalue %[[VAL_0]][12] + // CHECK: %[[VAL_14:.*]] = llvm.extractvalue %[[VAL_0]][13] + // CHECK: %[[VAL_15:.*]] = llvm.extractvalue %[[VAL_0]][14] + // CHECK: %[[VAL_16:.*]] = llvm.extractvalue %[[VAL_0]][15] + // CHECK: %[[VAL_17:.*]] = llvm.extractvalue %[[VAL_0]][16] + // CHECK: %[[VAL_18:.*]] = llvm.extractvalue %[[VAL_0]][17] + // CHECK: %[[VAL_19:.*]] = llvm.extractvalue %[[VAL_0]][18] + // CHECK: %[[VAL_20:.*]] = llvm.extractvalue %[[VAL_0]][19] + // CHECK: %[[VAL_21:.*]] = llvm.extractvalue %[[VAL_0]][20] + // CHECK: %[[VAL_22:.*]] = llvm.extractvalue %[[VAL_0]][21] + // CHECK: %[[VAL_23:.*]] = llvm.extractvalue %[[VAL_0]][22] + // CHECK: %[[VAL_24:.*]] = llvm.extractvalue %[[VAL_0]][23] + // CHECK: %[[VAL_25:.*]] = llvm.extractvalue %[[VAL_0]][24] + // CHECK: %[[VAL_26:.*]] = llvm.extractvalue %[[VAL_0]][25] + // CHECK: %[[VAL_27:.*]] = llvm.extractvalue %[[VAL_0]][26] + // CHECK: %[[VAL_28:.*]] = llvm.extractvalue %[[VAL_0]][27] + // CHECK: %[[VAL_29:.*]] = llvm.extractvalue %[[VAL_0]][28] + // CHECK: %[[VAL_30:.*]] = llvm.extractvalue %[[VAL_0]][29] + // CHECK: %[[VAL_31:.*]] = llvm.extractvalue %[[VAL_0]][30] + // CHECK: %[[VAL_32:.*]] = llvm.extractvalue %[[VAL_0]][31] + // CHECK: %[[VAL_33:.*]] = llvm.extractvalue %[[VAL_0]][32] + // CHECK: %[[VAL_34:.*]] = llvm.extractvalue %[[VAL_0]][33] + // CHECK: %[[VAL_35:.*]] = llvm.extractvalue %[[VAL_0]][34] + // CHECK: %[[VAL_36:.*]] = llvm.extractvalue %[[VAL_0]][35] + // CHECK: %[[VAL_37:.*]] = llvm.extractvalue %[[VAL_0]][36] + // CHECK: %[[VAL_38:.*]] = llvm.extractvalue %[[VAL_0]][37] + // CHECK: %[[VAL_39:.*]] = llvm.extractvalue %[[VAL_0]][38] + // CHECK: %[[VAL_40:.*]] = llvm.extractvalue %[[VAL_0]][39] + // CHECK: %[[VAL_41:.*]] = llvm.extractvalue %[[VAL_0]][40] + // CHECK: %[[VAL_42:.*]] = llvm.extractvalue %[[VAL_0]][41] + // CHECK: %[[VAL_43:.*]] = llvm.extractvalue %[[VAL_0]][42] + // CHECK: %[[VAL_44:.*]] = llvm.extractvalue %[[VAL_0]][43] + // CHECK: %[[VAL_45:.*]] = 
llvm.extractvalue %[[VAL_0]][44] + // CHECK: %[[VAL_46:.*]] = llvm.extractvalue %[[VAL_0]][45] + // CHECK: %[[VAL_47:.*]] = llvm.extractvalue %[[VAL_0]][46] + // CHECK: %[[VAL_48:.*]] = llvm.extractvalue %[[VAL_0]][47] + // CHECK: %[[VAL_49:.*]] = llvm.extractvalue %[[VAL_0]][48] + // CHECK: %[[VAL_50:.*]] = llvm.extractvalue %[[VAL_0]][49] + // CHECK: %[[VAL_51:.*]] = llvm.extractvalue %[[VAL_0]][50] + // CHECK: %[[VAL_52:.*]] = llvm.extractvalue %[[VAL_0]][51] + // CHECK: %[[VAL_53:.*]] = llvm.extractvalue %[[VAL_0]][52] + // CHECK: %[[VAL_54:.*]] = llvm.extractvalue %[[VAL_0]][53] + // CHECK: %[[VAL_55:.*]] = llvm.extractvalue %[[VAL_0]][54] + // CHECK: %[[VAL_56:.*]] = llvm.extractvalue %[[VAL_0]][55] + // CHECK: %[[VAL_57:.*]] = llvm.extractvalue %[[VAL_0]][56] + // CHECK: %[[VAL_58:.*]] = llvm.extractvalue %[[VAL_0]][57] + // CHECK: %[[VAL_59:.*]] = llvm.extractvalue %[[VAL_0]][58] + // CHECK: %[[VAL_60:.*]] = llvm.extractvalue %[[VAL_0]][59] + // CHECK: %[[VAL_61:.*]] = llvm.extractvalue %[[VAL_0]][60] + // CHECK: %[[VAL_62:.*]] = llvm.extractvalue %[[VAL_0]][61] + // CHECK: %[[VAL_63:.*]] = llvm.extractvalue %[[VAL_0]][62] + // CHECK: %[[VAL_64:.*]] = llvm.extractvalue %[[VAL_0]][63] + // CHECK: %[[VAL_65:.*]] = llvm.mlir.undef + // CHECK: %[[VAL_66:.*]] = llvm.insertvalue %[[VAL_1]], %[[VAL_65]][0] + // CHECK: %[[VAL_67:.*]] = llvm.insertvalue %[[VAL_2]], %[[VAL_66]][1] + // CHECK: %[[VAL_68:.*]] = llvm.insertvalue %[[VAL_3]], %[[VAL_67]][2] + // CHECK: %[[VAL_69:.*]] = llvm.insertvalue %[[VAL_4]], %[[VAL_68]][3] + // CHECK: %[[VAL_70:.*]] = llvm.insertvalue %[[VAL_5]], %[[VAL_69]][4] + // CHECK: %[[VAL_71:.*]] = llvm.insertvalue %[[VAL_6]], %[[VAL_70]][5] + // CHECK: %[[VAL_72:.*]] = llvm.insertvalue %[[VAL_7]], %[[VAL_71]][6] + // CHECK: %[[VAL_73:.*]] = llvm.insertvalue %[[VAL_8]], %[[VAL_72]][7] + // CHECK: %[[VAL_74:.*]] = llvm.insertvalue %[[VAL_17]], %[[VAL_73]][8] + // CHECK: %[[VAL_75:.*]] = llvm.insertvalue %[[VAL_18]], %[[VAL_74]][9] + // CHECK: 
%[[VAL_76:.*]] = llvm.insertvalue %[[VAL_19]], %[[VAL_75]][10] + // CHECK: %[[VAL_77:.*]] = llvm.insertvalue %[[VAL_20]], %[[VAL_76]][11] + // CHECK: %[[VAL_78:.*]] = llvm.insertvalue %[[VAL_21]], %[[VAL_77]][12] + // CHECK: %[[VAL_79:.*]] = llvm.insertvalue %[[VAL_22]], %[[VAL_78]][13] + // CHECK: %[[VAL_80:.*]] = llvm.insertvalue %[[VAL_23]], %[[VAL_79]][14] + // CHECK: %[[VAL_81:.*]] = llvm.insertvalue %[[VAL_24]], %[[VAL_80]][15] + // CHECK: %[[VAL_82:.*]] = llvm.insertvalue %[[VAL_33]], %[[VAL_81]][16] + // CHECK: %[[VAL_83:.*]] = llvm.insertvalue %[[VAL_34]], %[[VAL_82]][17] + // CHECK: %[[VAL_84:.*]] = llvm.insertvalue %[[VAL_35]], %[[VAL_83]][18] + // CHECK: %[[VAL_85:.*]] = llvm.insertvalue %[[VAL_36]], %[[VAL_84]][19] + // CHECK: %[[VAL_86:.*]] = llvm.insertvalue %[[VAL_37]], %[[VAL_85]][20] + // CHECK: %[[VAL_87:.*]] = llvm.insertvalue %[[VAL_38]], %[[VAL_86]][21] + // CHECK: %[[VAL_88:.*]] = llvm.insertvalue %[[VAL_39]], %[[VAL_87]][22] + // CHECK: %[[VAL_89:.*]] = llvm.insertvalue %[[VAL_40]], %[[VAL_88]][23] + // CHECK: %[[VAL_90:.*]] = llvm.insertvalue %[[VAL_49]], %[[VAL_89]][24] + // CHECK: %[[VAL_91:.*]] = llvm.insertvalue %[[VAL_50]], %[[VAL_90]][25] + // CHECK: %[[VAL_92:.*]] = llvm.insertvalue %[[VAL_51]], %[[VAL_91]][26] + // CHECK: %[[VAL_93:.*]] = llvm.insertvalue %[[VAL_52]], %[[VAL_92]][27] + // CHECK: %[[VAL_94:.*]] = llvm.insertvalue %[[VAL_53]], %[[VAL_93]][28] + // CHECK: %[[VAL_95:.*]] = llvm.insertvalue %[[VAL_54]], %[[VAL_94]][29] + // CHECK: %[[VAL_96:.*]] = llvm.insertvalue %[[VAL_55]], %[[VAL_95]][30] + // CHECK: %[[VAL_97:.*]] = llvm.insertvalue %[[VAL_56]], %[[VAL_96]][31] + // CHECK: %[[VAL_98:.*]] = llvm.insertvalue %[[VAL_9]], %[[VAL_97]][32] + // CHECK: %[[VAL_99:.*]] = llvm.insertvalue %[[VAL_10]], %[[VAL_98]][33] + // CHECK: %[[VAL_100:.*]] = llvm.insertvalue %[[VAL_11]], %[[VAL_99]][34] + // CHECK: %[[VAL_101:.*]] = llvm.insertvalue %[[VAL_12]], %[[VAL_100]][35] + // CHECK: %[[VAL_102:.*]] = llvm.insertvalue %[[VAL_13]], 
%[[VAL_101]][36] + // CHECK: %[[VAL_103:.*]] = llvm.insertvalue %[[VAL_14]], %[[VAL_102]][37] + // CHECK: %[[VAL_104:.*]] = llvm.insertvalue %[[VAL_15]], %[[VAL_103]][38] + // CHECK: %[[VAL_105:.*]] = llvm.insertvalue %[[VAL_16]], %[[VAL_104]][39] + // CHECK: %[[VAL_106:.*]] = llvm.insertvalue %[[VAL_25]], %[[VAL_105]][40] + // CHECK: %[[VAL_107:.*]] = llvm.insertvalue %[[VAL_26]], %[[VAL_106]][41] + // CHECK: %[[VAL_108:.*]] = llvm.insertvalue %[[VAL_27]], %[[VAL_107]][42] + // CHECK: %[[VAL_109:.*]] = llvm.insertvalue %[[VAL_28]], %[[VAL_108]][43] + // CHECK: %[[VAL_110:.*]] = llvm.insertvalue %[[VAL_29]], %[[VAL_109]][44] + // CHECK: %[[VAL_111:.*]] = llvm.insertvalue %[[VAL_30]], %[[VAL_110]][45] + // CHECK: %[[VAL_112:.*]] = llvm.insertvalue %[[VAL_31]], %[[VAL_111]][46] + // CHECK: %[[VAL_113:.*]] = llvm.insertvalue %[[VAL_32]], %[[VAL_112]][47] + // CHECK: %[[VAL_114:.*]] = llvm.insertvalue %[[VAL_41]], %[[VAL_113]][48] + // CHECK: %[[VAL_115:.*]] = llvm.insertvalue %[[VAL_42]], %[[VAL_114]][49] + // CHECK: %[[VAL_116:.*]] = llvm.insertvalue %[[VAL_43]], %[[VAL_115]][50] + // CHECK: %[[VAL_117:.*]] = llvm.insertvalue %[[VAL_44]], %[[VAL_116]][51] + // CHECK: %[[VAL_118:.*]] = llvm.insertvalue %[[VAL_45]], %[[VAL_117]][52] + // CHECK: %[[VAL_119:.*]] = llvm.insertvalue %[[VAL_46]], %[[VAL_118]][53] + // CHECK: %[[VAL_120:.*]] = llvm.insertvalue %[[VAL_47]], %[[VAL_119]][54] + // CHECK: %[[VAL_121:.*]] = llvm.insertvalue %[[VAL_48]], %[[VAL_120]][55] + // CHECK: %[[VAL_122:.*]] = llvm.insertvalue %[[VAL_57]], %[[VAL_121]][56] + // CHECK: %[[VAL_123:.*]] = llvm.insertvalue %[[VAL_58]], %[[VAL_122]][57] + // CHECK: %[[VAL_124:.*]] = llvm.insertvalue %[[VAL_59]], %[[VAL_123]][58] + // CHECK: %[[VAL_125:.*]] = llvm.insertvalue %[[VAL_60]], %[[VAL_124]][59] + // CHECK: %[[VAL_126:.*]] = llvm.insertvalue %[[VAL_61]], %[[VAL_125]][60] + // CHECK: %[[VAL_127:.*]] = llvm.insertvalue %[[VAL_62]], %[[VAL_126]][61] + // CHECK: %[[VAL_128:.*]] = llvm.insertvalue 
%[[VAL_63]], %[[VAL_127]][62] + // CHECK: %[[VAL_129:.*]] = llvm.insertvalue %[[VAL_64]], %[[VAL_128]][63] %108 = triton_gpu.convert_layout %arg : tensor<1024x32xf16, #dpas> -> tensor<1024x32xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #dpas, kWidth = 2}>> tt.return } diff --git a/test/TritonIntelGPU/tritonintlgpu-nested-layout.mlir b/test/TritonIntelGPU/tritonintlgpu-nested-layout.mlir index 2bb504d76f..1ecb0a5a2c 100644 --- a/test/TritonIntelGPU/tritonintlgpu-nested-layout.mlir +++ b/test/TritonIntelGPU/tritonintlgpu-nested-layout.mlir @@ -69,14 +69,13 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // CHECK-DAG: %[[CST_0:.*]] = llvm.mlir.constant(0 : i32) : i32 // CHECK-DAG: %[[CST_1:.*]] = llvm.mlir.constant(1 : i32) : i32 // CHECK-DAG: %[[CST_2:.*]] = llvm.mlir.constant(2 : i32) : i32 - // CHECK-DAG: %[[CST_4:.*]] = llvm.mlir.constant(4 : i32) : i32 - // CHECK-DAG: %[[CST_8:.*]] = llvm.mlir.constant(8 : i32) : i32 - // CHECK-DAG: %[[CST_16:.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK-DAG: %[[CST_32:.*]] = llvm.mlir.constant(32 : i32) : i32 // CHECK-DAG: %[[CST_3:.*]] = llvm.mlir.constant(3 : i32) : i32 + // CHECK-DAG: %[[CST_4:.*]] = llvm.mlir.constant(4 : i32) : i32 // CHECK-DAG: %[[CST_5:.*]] = llvm.mlir.constant(5 : i32) : i32 // CHECK-DAG: %[[CST_6:.*]] = llvm.mlir.constant(6 : i32) : i32 // CHECK-DAG: %[[CST_7:.*]] = llvm.mlir.constant(7 : i32) : i32 + // CHECK-DAG: %[[CST_8:.*]] = llvm.mlir.constant(8 : i32) : i32 + // CHECK-DAG: %[[CST_16:.*]] = llvm.mlir.constant(16 : i32) : i32 // CHECK-DAG: %[[CST_17:.*]] = llvm.mlir.constant(17 : i32) : i32 // CHECK-DAG: %[[CST_18:.*]] = llvm.mlir.constant(18 : i32) : i32 // CHECK-DAG: %[[CST_19:.*]] = llvm.mlir.constant(19 : i32) : i32 @@ -86,43 +85,46 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // CHECK-DAG: %[[CST_23:.*]] = llvm.mlir.constant(23 : i32) : i32 // CHECK: %[[THREAD_ID:.*]] = llvm.call spir_funccc 
@_Z12get_local_idj(%[[CST_0]]) // CHECK: %[[THREAD_ID_32:.*]] = llvm.trunc %[[THREAD_ID]] : i64 to i32 - // CHECK: %[[WARP_ID:.*]] = llvm.udiv %[[THREAD_ID_32]], %[[CST_16]] : i32 // CHECK: %[[LANE_ID:.*]] = llvm.urem %[[THREAD_ID_32]], %[[CST_16]] : i32 - // CHECK: %[[VAL_29:.*]] = llvm.udiv %[[WARP_ID]], %[[CST_2]] : i32 - // CHECK: %[[WARP_ID_X:.*]] = llvm.urem %[[VAL_29]], %[[CST_2]] : i32 - // CHECK: %[[ROUNDED_WARP_ID_X:.*]] = llvm.urem %[[WARP_ID_X]], %[[CST_4]] : i32 - // CHECK: %[[WARP_OFFSET:.*]] = llvm.mul %[[ROUNDED_WARP_ID_X]], %[[CST_8]] : i32 - // CHECK: %[[LANE_ID_X:.*]] = llvm.udiv %[[LANE_ID]], %[[CST_16]] : i32 - // CHECK: %[[LANE_ID_Y:.*]] = llvm.urem %[[LANE_ID]], %[[CST_16]] : i32 - // CHECK: %[[OFFSET_Y:.*]] = llvm.mul %[[LANE_ID_Y]], %[[CST_2]] : i32 - // CHECK: %[[OFFSET_x:.*]] = llvm.add %[[LANE_ID_X]], %[[WARP_OFFSET]] : i32 - // CHECK: %[[VAL_37:.*]] = llvm.urem %[[CST_0]], %[[CST_1]] : i32 - // CHECK: %[[VAL_38:.*]] = llvm.udiv %[[CST_0]], %[[CST_1]] : i32 - // CHECK: %[[VAL_39:.*]] = llvm.urem %[[VAL_38]], %[[CST_1]] : i32 - // CHECK: %[[VAL_40:.*]] = llvm.urem %[[VAL_39]], %[[CST_1]] : i32 - // CHECK: %[[VAL_41:.*]] = llvm.urem %[[VAL_37]], %[[CST_1]] : i32 - // CHECK: %[[CTA_OFFSET_X:.*]] = llvm.mul %[[VAL_40]], %[[CST_32]] : i32 - // CHECK: %[[CTA_OFFSET_Y:.*]] = llvm.mul %[[VAL_41]], %[[CST_32]] : i32 - // CHECK: %[[VAL_44:.*]] = llvm.add %[[OFFSET_x]], %[[CTA_OFFSET_X]] : i32 - // CHECK: %[[VAL_45:.*]] = llvm.add %[[OFFSET_Y]], %[[CTA_OFFSET_Y]] : i32 - // CHECK: %[[OFFSET_X_0:.*]] = llvm.add %[[VAL_44]], %[[CST_0]] : i32 - // CHECK: %[[OFFSET_Y_0:.*]] = llvm.add %[[VAL_45]], %[[CST_0]] : i32 - // CHECK: %[[OFFSET_Y_1:.*]] = llvm.add %[[VAL_45]], %[[CST_1]] : i32 - // CHECK: %[[OFFSET_X_1:.*]] = llvm.add %[[VAL_44]], %[[CST_1]] : i32 - // CHECK: %[[OFFSET_X_2:.*]] = llvm.add %[[VAL_44]], %[[CST_2]] : i32 - // CHECK: %[[OFFSET_X_3:.*]] = llvm.add %[[VAL_44]], %[[CST_3]] : i32 - // CHECK: %[[OFFSET_X_4:.*]] = llvm.add %[[VAL_44]], 
%[[CST_4]] : i32 - // CHECK: %[[OFFSET_X_5:.*]] = llvm.add %[[VAL_44]], %[[CST_5]] : i32 - // CHECK: %[[OFFSET_X_6:.*]] = llvm.add %[[VAL_44]], %[[CST_6]] : i32 - // CHECK: %[[OFFSET_X_7:.*]] = llvm.add %[[VAL_44]], %[[CST_7]] : i32 - // CHECK: %[[OFFSET_X_8:.*]] = llvm.add %[[VAL_44]], %[[CST_16]] : i32 - // CHECK: %[[OFFSET_X_9:.*]] = llvm.add %[[VAL_44]], %[[CST_17]] : i32 - // CHECK: %[[OFFSET_X_10:.*]] = llvm.add %[[VAL_44]], %[[CST_18]] : i32 - // CHECK: %[[OFFSET_X_11:.*]] = llvm.add %[[VAL_44]], %[[CST_19]] : i32 - // CHECK: %[[OFFSET_X_12:.*]] = llvm.add %[[VAL_44]], %[[CST_20]] : i32 - // CHECK: %[[OFFSET_X_13:.*]] = llvm.add %[[VAL_44]], %[[CST_21]] : i32 - // CHECK: %[[OFFSET_X_14:.*]] = llvm.add %[[VAL_44]], %[[CST_22]] : i32 - // CHECK: %[[OFFSET_X_15:.*]] = llvm.add %[[VAL_44]], %[[CST_23]] : i32 + // CHECK: %[[WARP_ID:.*]] = llvm.udiv %[[THREAD_ID_32]], %[[CST_16]] : i32 + // CHECK: %[[VAL_27:.*]] = llvm.and %[[LANE_ID]], %[[CST_1]] : i32 + // CHECK: %[[VAL_28:.*]] = llvm.icmp "eq" %[[VAL_27]], %[[CST_0]] : i32 + // CHECK: %[[VAL_29:.*]] = llvm.select %[[VAL_28]], %[[CST_0]], %[[CST_2]] : i1, i32 + // CHECK: %[[VAL_30:.*]] = llvm.xor %[[CST_0]], %[[VAL_29]] : i32 + // CHECK: %[[VAL_31:.*]] = llvm.and %[[LANE_ID]], %[[CST_2]] : i32 + // CHECK: %[[VAL_32:.*]] = llvm.icmp "eq" %[[VAL_31]], %[[CST_0]] : i32 + // CHECK: %[[VAL_33:.*]] = llvm.select %[[VAL_32]], %[[CST_0]], %[[CST_4]] : i1, i32 + // CHECK: %[[VAL_34:.*]] = llvm.xor %[[VAL_30]], %[[VAL_33]] : i32 + // CHECK: %[[VAL_35:.*]] = llvm.and %[[LANE_ID]], %[[CST_4]] : i32 + // CHECK: %[[VAL_36:.*]] = llvm.icmp "eq" %[[VAL_35]], %[[CST_0]] : i32 + // CHECK: %[[VAL_37:.*]] = llvm.select %[[VAL_36]], %[[CST_0]], %[[CST_8]] : i1, i32 + // CHECK: %[[VAL_38:.*]] = llvm.xor %[[VAL_34]], %[[VAL_37]] : i32 + // CHECK: %[[VAL_39:.*]] = llvm.and %[[LANE_ID]], %[[CST_8]] : i32 + // CHECK: %[[VAL_40:.*]] = llvm.icmp "eq" %[[VAL_39]], %[[CST_0]] : i32 + // CHECK: %[[VAL_41:.*]] = llvm.select %[[VAL_40]], 
%[[CST_0]], %[[CST_16]] : i1, i32 + // CHECK: %[[VAL_42:.*]] = llvm.xor %[[VAL_38]], %[[VAL_41]] : i32 + // CHECK: %[[VAL_43:.*]] = llvm.and %[[WARP_ID]], %[[CST_2]] : i32 + // CHECK: %[[VAL_44:.*]] = llvm.icmp "eq" %[[VAL_43]], %[[CST_0]] : i32 + // CHECK: %[[VAL_45:.*]] = llvm.select %[[VAL_44]], %[[CST_0]], %[[CST_8]] : i1, i32 + // CHECK: %[[VAL_46:.*]] = llvm.xor %[[CST_0]], %[[VAL_45]] : i32 + // CHECK: %[[OFFSET_X_0:.*]] = llvm.xor %[[VAL_46]], %[[CST_0]] : i32 + // CHECK: %[[OFFSET_Y_0:.*]] = llvm.xor %[[VAL_42]], %[[CST_0]] : i32 + // CHECK: %[[OFFSET_Y_1:.*]] = llvm.xor %[[VAL_42]], %[[CST_1]] : i32 + // CHECK: %[[OFFSET_X_1:.*]] = llvm.xor %[[VAL_46]], %[[CST_1]] : i32 + // CHECK: %[[OFFSET_X_2:.*]] = llvm.xor %[[VAL_46]], %[[CST_2]] : i32 + // CHECK: %[[OFFSET_X_3:.*]] = llvm.xor %[[VAL_46]], %[[CST_3]] : i32 + // CHECK: %[[OFFSET_X_4:.*]] = llvm.xor %[[VAL_46]], %[[CST_4]] : i32 + // CHECK: %[[OFFSET_X_5:.*]] = llvm.xor %[[VAL_46]], %[[CST_5]] : i32 + // CHECK: %[[OFFSET_X_6:.*]] = llvm.xor %[[VAL_46]], %[[CST_6]] : i32 + // CHECK: %[[OFFSET_X_7:.*]] = llvm.xor %[[VAL_46]], %[[CST_7]] : i32 + // CHECK: %[[OFFSET_X_8:.*]] = llvm.xor %[[VAL_46]], %[[CST_16]] : i32 + // CHECK: %[[OFFSET_X_9:.*]] = llvm.xor %[[VAL_46]], %[[CST_17]] : i32 + // CHECK: %[[OFFSET_X_10:.*]] = llvm.xor %[[VAL_46]], %[[CST_18]] : i32 + // CHECK: %[[OFFSET_X_11:.*]] = llvm.xor %[[VAL_46]], %[[CST_19]] : i32 + // CHECK: %[[OFFSET_X_12:.*]] = llvm.xor %[[VAL_46]], %[[CST_20]] : i32 + // CHECK: %[[OFFSET_X_13:.*]] = llvm.xor %[[VAL_46]], %[[CST_21]] : i32 + // CHECK: %[[OFFSET_X_14:.*]] = llvm.xor %[[VAL_46]], %[[CST_22]] : i32 + // CHECK: %[[OFFSET_X_15:.*]] = llvm.xor %[[VAL_46]], %[[CST_23]] : i32 // CHECK: llvm.call @_Z18__spirv_ocl_printf({{.*}}, {{.*}}, {{.*}}, {{.*}}, %[[OFFSET_X_0]], %[[OFFSET_Y_0]], {{.*}}, {{.*}}) // CHECK: llvm.call @_Z18__spirv_ocl_printf({{.*}}, {{.*}}, {{.*}}, {{.*}}, %[[OFFSET_X_0]], %[[OFFSET_Y_1]], {{.*}}, {{.*}}) // CHECK: llvm.call 
@_Z18__spirv_ocl_printf({{.*}}, {{.*}}, {{.*}}, {{.*}}, %[[OFFSET_X_1]], %[[OFFSET_Y_0]], {{.*}}, {{.*}}) @@ -172,14 +174,13 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // CHECK-DAG: %[[CST_0:.*]] = llvm.mlir.constant(0 : i32) : i32 // CHECK-DAG: %[[CST_1:.*]] = llvm.mlir.constant(1 : i32) : i32 // CHECK-DAG: %[[CST_2:.*]] = llvm.mlir.constant(2 : i32) : i32 - // CHECK-DAG: %[[CST_4:.*]] = llvm.mlir.constant(4 : i32) : i32 - // CHECK-DAG: %[[CST_8:.*]] = llvm.mlir.constant(8 : i32) : i32 - // CHECK-DAG: %[[CST_16:.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK-DAG: %[[CST_32:.*]] = llvm.mlir.constant(32 : i32) : i32 // CHECK-DAG: %[[CST_3:.*]] = llvm.mlir.constant(3 : i32) : i32 + // CHECK-DAG: %[[CST_4:.*]] = llvm.mlir.constant(4 : i32) : i32 // CHECK-DAG: %[[CST_5:.*]] = llvm.mlir.constant(5 : i32) : i32 // CHECK-DAG: %[[CST_6:.*]] = llvm.mlir.constant(6 : i32) : i32 // CHECK-DAG: %[[CST_7:.*]] = llvm.mlir.constant(7 : i32) : i32 + // CHECK-DAG: %[[CST_8:.*]] = llvm.mlir.constant(8 : i32) : i32 + // CHECK-DAG: %[[CST_16:.*]] = llvm.mlir.constant(16 : i32) : i32 // CHECK-DAG: %[[CST_17:.*]] = llvm.mlir.constant(17 : i32) : i32 // CHECK-DAG: %[[CST_18:.*]] = llvm.mlir.constant(18 : i32) : i32 // CHECK-DAG: %[[CST_19:.*]] = llvm.mlir.constant(19 : i32) : i32 @@ -190,34 +191,26 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // CHECK: %[[THREADS_ID:.*]] = llvm.call spir_funccc @_Z12get_local_idj(%[[CST_0]]) // CHECK: %[[THREADS_ID_32:.*]] = llvm.trunc %[[THREADS_ID]] : i64 to i32 // CHECK: %[[WARP_ID:.*]] = llvm.udiv %[[THREADS_ID_32]], %[[CST_16]] : i32 - // CHECK: %[[LANE_ID:.*]] = llvm.urem %[[THREADS_ID_32]], %[[CST_16]] : i32 - // CHECK: %[[VAL_29:.*]] = llvm.udiv %[[WARP_ID]], %[[CST_2]] : i32 - // CHECK: %[[WARP_ID_X:.*]] = llvm.urem %[[VAL_29]], %[[CST_2]] : i32 - // CHECK: %[[ROUNDED_WARP_ID_X:.*]] = llvm.urem %[[WARP_ID_X]], %[[CST_4]] : i32 - // CHECK: %[[WARP_OFFSET_X:.*]] = 
llvm.mul %[[ROUNDED_WARP_ID_X]], %[[CST_8]] : i32 - // CHECK: %[[LANE_OFFSET_X:.*]] = llvm.udiv %[[LANE_ID]], %[[CST_16]] : i32 - // CHECK: %[[OFFSET_X:.*]] = llvm.add %[[LANE_OFFSET_X]], %[[WARP_OFFSET_X]] : i32 - // CHECK: %[[VAL_35:.*]] = llvm.udiv %[[CST_0]], %[[CST_1]] : i32 - // CHECK: %[[VAL_36:.*]] = llvm.urem %[[VAL_35]], %[[CST_1]] : i32 - // CHECK: %[[VAL_37:.*]] = llvm.urem %[[VAL_36]], %[[CST_1]] : i32 - // CHECK: %[[CTA_OFFSET_X:.*]] = llvm.mul %[[VAL_37]], %[[CST_32]] : i32 - // CHECK: %[[VAL_39:.*]] = llvm.add %[[OFFSET_X]], %[[CTA_OFFSET_X]] : i32 - // CHECK: %[[OFFSET_X_0:.*]] = llvm.add %[[VAL_39]], %[[CST_0]] : i32 - // CHECK: %[[OFFSET_X_1:.*]] = llvm.add %[[VAL_39]], %[[CST_1]] : i32 - // CHECK: %[[OFFSET_X_2:.*]] = llvm.add %[[VAL_39]], %[[CST_2]] : i32 - // CHECK: %[[OFFSET_X_3:.*]] = llvm.add %[[VAL_39]], %[[CST_3]] : i32 - // CHECK: %[[OFFSET_X_4:.*]] = llvm.add %[[VAL_39]], %[[CST_4]] : i32 - // CHECK: %[[OFFSET_X_5:.*]] = llvm.add %[[VAL_39]], %[[CST_5]] : i32 - // CHECK: %[[OFFSET_X_6:.*]] = llvm.add %[[VAL_39]], %[[CST_6]] : i32 - // CHECK: %[[OFFSET_X_7:.*]] = llvm.add %[[VAL_39]], %[[CST_7]] : i32 - // CHECK: %[[OFFSET_X_8:.*]] = llvm.add %[[VAL_39]], %[[CST_16]] : i32 - // CHECK: %[[OFFSET_X_9:.*]] = llvm.add %[[VAL_39]], %[[CST_17]] : i32 - // CHECK: %[[OFFSET_X_10:.*]] = llvm.add %[[VAL_39]], %[[CST_18]] : i32 - // CHECK: %[[OFFSET_X_11:.*]] = llvm.add %[[VAL_39]], %[[CST_19]] : i32 - // CHECK: %[[OFFSET_X_12:.*]] = llvm.add %[[VAL_39]], %[[CST_20]] : i32 - // CHECK: %[[OFFSET_X_13:.*]] = llvm.add %[[VAL_39]], %[[CST_21]] : i32 - // CHECK: %[[OFFSET_X_14:.*]] = llvm.add %[[VAL_39]], %[[CST_22]] : i32 - // CHECK: %[[OFFSET_X_15:.*]] = llvm.add %[[VAL_39]], %[[CST_23]] : i32 + // CHECK: %[[VAL_26:.*]] = llvm.and %[[WARP_ID]], %[[CST_2]] : i32 + // CHECK: %[[VAL_27:.*]] = llvm.icmp "eq" %[[VAL_26]], %[[CST_0]] : i32 + // CHECK: %[[VAL_28:.*]] = llvm.select %[[VAL_27]], %[[CST_0]], %[[CST_8]] : i1, i32 + // CHECK: %[[VAL_29:.*]] = 
llvm.xor %[[CST_0]], %[[VAL_28]] : i32 + // CHECK: %[[OFFSET_X_0:.*]] = llvm.xor %[[VAL_29]], %[[CST_0]] : i32 + // CHECK: %[[OFFSET_X_1:.*]] = llvm.xor %[[VAL_29]], %[[CST_1]] : i32 + // CHECK: %[[OFFSET_X_2:.*]] = llvm.xor %[[VAL_29]], %[[CST_2]] : i32 + // CHECK: %[[OFFSET_X_3:.*]] = llvm.xor %[[VAL_29]], %[[CST_3]] : i32 + // CHECK: %[[OFFSET_X_4:.*]] = llvm.xor %[[VAL_29]], %[[CST_4]] : i32 + // CHECK: %[[OFFSET_X_5:.*]] = llvm.xor %[[VAL_29]], %[[CST_5]] : i32 + // CHECK: %[[OFFSET_X_6:.*]] = llvm.xor %[[VAL_29]], %[[CST_6]] : i32 + // CHECK: %[[OFFSET_X_7:.*]] = llvm.xor %[[VAL_29]], %[[CST_7]] : i32 + // CHECK: %[[OFFSET_X_8:.*]] = llvm.xor %[[VAL_29]], %[[CST_16]] : i32 + // CHECK: %[[OFFSET_X_9:.*]] = llvm.xor %[[VAL_29]], %[[CST_17]] : i32 + // CHECK: %[[OFFSET_X_10:.*]] = llvm.xor %[[VAL_29]], %[[CST_18]] : i32 + // CHECK: %[[OFFSET_X_11:.*]] = llvm.xor %[[VAL_29]], %[[CST_19]] : i32 + // CHECK: %[[OFFSET_X_12:.*]] = llvm.xor %[[VAL_29]], %[[CST_20]] : i32 + // CHECK: %[[OFFSET_X_13:.*]] = llvm.xor %[[VAL_29]], %[[CST_21]] : i32 + // CHECK: %[[OFFSET_X_14:.*]] = llvm.xor %[[VAL_29]], %[[CST_22]] : i32 + // CHECK: %[[OFFSET_X_15:.*]] = llvm.xor %[[VAL_29]], %[[CST_23]] : i32 // CHECK: %[[VAL_56:.*]] = llvm.call @_Z18__spirv_ocl_printf({{.*}}, {{.*}}, {{.*}}, {{.*}}, %[[OFFSET_X_0]], {{.*}}, {{.*}}) // CHECK: %[[VAL_57:.*]] = llvm.call @_Z18__spirv_ocl_printf({{.*}}, {{.*}}, {{.*}}, {{.*}}, %[[OFFSET_X_1]], {{.*}}, {{.*}}) // CHECK: %[[VAL_58:.*]] = llvm.call @_Z18__spirv_ocl_printf({{.*}}, {{.*}}, {{.*}}, {{.*}}, %[[OFFSET_X_2]], {{.*}}, {{.*}}) diff --git a/third_party/intel/lib/Dialect/TritonIntelGPU/IR/LinearLayoutConversions.cpp b/third_party/intel/lib/Dialect/TritonIntelGPU/IR/LinearLayoutConversions.cpp index 4ee77e934d..6b902003fb 100644 --- a/third_party/intel/lib/Dialect/TritonIntelGPU/IR/LinearLayoutConversions.cpp +++ b/third_party/intel/lib/Dialect/TritonIntelGPU/IR/LinearLayoutConversions.cpp @@ -341,38 +341,44 @@ LinearLayout 
combineCtaCgaWithShape(LinearLayout ctaLayout, } // anonymous namespace +// clang-format off // The layout example repeat_count=8, systolic_depth=8, // execution_size=16 and operands_per_chan=2 for warp size 32. // For A operand: -// systolic depth = 8 -//<-----------------------------------------------------> -// opsPerChan=2 -//<---------> -// t0 ... t0 t1 ... t1 ~ t6 ... t6 t7 ... t7 ^ -// t8 ... t8 t9 ... t9 ~ t14 ... t14 t15 ... t15 | -// t16 ... t16 t17 ... t17 ~ t22 ... t22 t23 ... t23 | -// t24 ... t24 t25 ... t25 ~ t30 ... t30 t31 ... t31 | repeat count <= 8 -// t0 ... t0 t1 ... t1 ~ t6 ... t6 t7 ... t7 | -// t8 ... t8 t9 ... t9 ~ t14 ... t14 t15 ... t15 | -// t16 ... t16 t17 ... t17 ~ t22 ... t22 t23 ... t23 | -// t24 ... t24 t25 ... t25 ~ t30 ... t30 t31 ... t31 v +// K = 16 (K = systolic depth * opsPerChan) +// <----------------------------------------------------------------------------> +// t0 t1 t2 t3 t4 t5 t6 t7 t8 t9 t10 t11 t12 t13 t14 t15 ^ +// t16 t17 t18 t19 t20 t21 t22 t23 t24 t25 t26 t27 t28 t29 t30 t31 | +// t0 t1 t2 t3 t4 t5 t6 t7 t8 t9 t10 t11 t12 t13 t14 t15 | +// t16 t17 t18 t19 t20 t21 t22 t23 t24 t25 t26 t27 t28 t29 t30 t31 | +// t0 t1 t2 t3 t4 t5 t6 t7 t8 t9 t10 t11 t12 t13 t14 t15 | M = 8 (repeat count) +// t16 t17 t18 t19 t20 t21 t22 t23 t24 t25 t26 t27 t28 t29 t30 t31 | +// t0 t1 t2 t3 t4 t5 t6 t7 t8 t9 t10 t11 t12 t13 t14 t15 | +// t16 t17 t18 t19 t20 t21 t22 t23 t24 t25 t26 t27 t28 t29 t30 t31 v // In this case, the LinearLayout bases are: -// Register: {{0,1}, {4,0}} -// Lane: {{0,2}, {0,4}, {0,8}, {1,0}, {2,0}} +// Register: {{2,0}, {4,0}} +// Lane: {{0,1}, {0,2}, {0,4}, {0,8}, {1,0}} +// clang-format on std::vector> DPASRegBasesA(int opsPerChannel, int repeatCount, int threadsPerWarp, int systolicDepth) { - int rowPerWarp = threadsPerWarp / systolicDepth; - int warpRepeats = repeatCount / rowPerWarp; std::vector> regBases; - for (int opc = 1; opc < opsPerChannel; opc *= 2) { + // pack the value to i16 for scalar bit width 
<=16. + assert((opsPerChannel == 4 || opsPerChannel == 2 || opsPerChannel == 1) && + "invalid opsPerChannel number."); + int packedOpsPerLane = opsPerChannel == 4 ? 2 : 1; + int packedColNum = (systolicDepth * opsPerChannel) / packedOpsPerLane; + int rowsPerWarp = mlir::ceil(threadsPerWarp, packedColNum); + int warpRepeats = repeatCount / rowsPerWarp; + + for (int opc = 1; opc < packedOpsPerLane; opc *= 2) { regBases.push_back({0, opc}); } for (int warp = 1; warp < warpRepeats; warp *= 2) { - regBases.push_back({warp * rowPerWarp, 0}); + regBases.push_back({warp * rowsPerWarp, 0}); } return regBases; @@ -382,11 +388,17 @@ std::vector> DPASLaneBasesA(int opsPerChannel, int threadsPerWarp, int systolicDepth) { std::vector> laneBases; - for (int tid = 1; tid < systolicDepth; tid *= 2) { - laneBases.push_back({0, opsPerChannel * tid}); + // pack the value to i16 for scalar bit width <=16. + assert((opsPerChannel == 4 || opsPerChannel == 2 || opsPerChannel == 1) && + "invalid opsPerChannel number."); + int packedOpsPerLane = opsPerChannel == 4 ? 
2 : 1; + int packedColNum = (systolicDepth * opsPerChannel) / packedOpsPerLane; + + for (int tid = 1; tid < packedColNum; tid *= 2) { + laneBases.push_back({0, packedOpsPerLane * tid}); } - for (int tid = systolicDepth; tid < threadsPerWarp; tid *= 2) { - laneBases.push_back({tid / systolicDepth, 0}); + for (int tid = packedColNum; tid < threadsPerWarp; tid *= 2) { + laneBases.push_back({tid / packedColNum, 0}); } return laneBases; @@ -602,8 +614,7 @@ std::optional dotOperandDpasToLinearLayout(DotOperandEncodingAttr dotDpasLayout, ArrayRef shape) { auto dpasLayout = cast(dotDpasLayout.getParent()); - if (dotDpasLayout.getOpIdx() == 0) - return std::nullopt; + return DPAStoLinearLayout(shape, dpasLayout, dotDpasLayout.getOpIdx()); } diff --git a/third_party/intel/unittest/Dialect/TritonGPU/DPAStoLinearLayoutTest.cpp b/third_party/intel/unittest/Dialect/TritonGPU/DPAStoLinearLayoutTest.cpp index 6d42c9948a..d4f6d0b821 100644 --- a/third_party/intel/unittest/Dialect/TritonGPU/DPAStoLinearLayoutTest.cpp +++ b/third_party/intel/unittest/Dialect/TritonGPU/DPAStoLinearLayoutTest.cpp @@ -59,17 +59,47 @@ TEST_F(DPAStoLinearLayoutTest, DPAS_perInst) { }, {S("dim0"), S("dim1")})); // Test Operand A (opIdx=0) + EXPECT_EQ( + DPAStoLinearLayout({8, 32}, dpas({1, 1}, 8, 8, 16, 4, {1, 1}, 32), 0), + LinearLayout( + { + {S("register"), {{0, 1}, {2, 0}, {4, 0}}}, + {S("lane"), {{0, 2}, {0, 4}, {0, 8}, {0, 16}, {1, 0}}}, + {S("warp"), {}}, + {S("block"), {}}, + }, + {S("dim0"), S("dim1")})); EXPECT_EQ( DPAStoLinearLayout({8, 16}, dpas({1, 1}, 8, 8, 16, 2, {1, 1}, 32), 0), LinearLayout( { - {S("register"), {{0, 1}, {4, 0}}}, - {S("lane"), {{0, 2}, {0, 4}, {0, 8}, {1, 0}, {2, 0}}}, + {S("register"), {{2, 0}, {4, 0}}}, + {S("lane"), {{0, 1}, {0, 2}, {0, 4}, {0, 8}, {1, 0}}}, + {S("warp"), {}}, + {S("block"), {}}, + }, + {S("dim0"), S("dim1")})); + EXPECT_EQ( + DPAStoLinearLayout({8, 8}, dpas({1, 1}, 8, 8, 16, 1, {1, 1}, 32), 0), + LinearLayout( + { + {S("register"), {{4, 0}}}, + 
{S("lane"), {{0, 1}, {0, 2}, {0, 4}, {1, 0}, {2, 0}}}, {S("warp"), {}}, {S("block"), {}}, }, {S("dim0"), S("dim1")})); // Test Operand B (opIdx=1) + EXPECT_EQ( + DPAStoLinearLayout({32, 16}, dpas({1, 1}, 8, 8, 16, 4, {1, 1}, 32), 1), + LinearLayout( + { + {S("register"), {{1, 0}, {2, 0}, {8, 0}, {16, 0}}}, + {S("lane"), {{0, 1}, {0, 2}, {0, 4}, {0, 8}, {4, 0}}}, + {S("warp"), {}}, + {S("block"), {}}, + }, + {S("dim0"), S("dim1")})); EXPECT_EQ( DPAStoLinearLayout({16, 16}, dpas({1, 1}, 8, 8, 16, 2, {1, 1}, 32), 1), LinearLayout( @@ -80,6 +110,16 @@ TEST_F(DPAStoLinearLayoutTest, DPAS_perInst) { {S("block"), {}}, }, {S("dim0"), S("dim1")})); + EXPECT_EQ( + DPAStoLinearLayout({8, 16}, dpas({1, 1}, 8, 8, 16, 1, {1, 1}, 32), 1), + LinearLayout( + { + {S("register"), {{2, 0}, {4, 0}}}, + {S("lane"), {{0, 1}, {0, 2}, {0, 4}, {0, 8}, {1, 0}}}, + {S("warp"), {}}, + {S("block"), {}}, + }, + {S("dim0"), S("dim1")})); } TEST_F(DPAStoLinearLayoutTest, DPAS_withRepCluster) { @@ -98,8 +138,8 @@ TEST_F(DPAStoLinearLayoutTest, DPAS_withRepCluster) { DPAStoLinearLayout({32, 16}, dpas({1, 1}, 8, 8, 16, 2, {4, 2}, 32), 0), LinearLayout( { - {S("register"), {{0, 1}, {4, 0}, {8, 0}, {16, 0}}}, - {S("lane"), {{0, 2}, {0, 4}, {0, 8}, {1, 0}, {2, 0}}}, + {S("register"), {{2, 0}, {4, 0}, {8, 0}, {16, 0}}}, + {S("lane"), {{0, 1}, {0, 2}, {0, 4}, {0, 8}, {1, 0}}}, {S("warp"), {}}, {S("block"), {}}, }, @@ -154,8 +194,8 @@ TEST_F(DPAStoLinearLayoutTest, DPAS_withWarpOperandA) { LinearLayout( { {S("register"), - {{0, 1}, {4, 0}, {8, 0}, {16, 0}, {0, 16}, {0, 32}}}, - {S("lane"), {{0, 2}, {0, 4}, {0, 8}, {1, 0}, {2, 0}}}, + {{2, 0}, {4, 0}, {8, 0}, {16, 0}, {0, 16}, {0, 32}}}, + {S("lane"), {{0, 1}, {0, 2}, {0, 4}, {0, 8}, {1, 0}}}, {S("warp"), {{0, 0}, {32, 0}}}, {S("block"), {}}, }, diff --git a/third_party/proton/dialect/CMakeLists.txt b/third_party/proton/dialect/CMakeLists.txt new file mode 100644 index 0000000000..c7b5413a0e --- /dev/null +++ b/third_party/proton/dialect/CMakeLists.txt @@ 
-0,0 +1,7 @@ +include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include) +include_directories(${CMAKE_CURRENT_BINARY_DIR}/include) +add_subdirectory(include) +add_subdirectory(lib) +if(TRITON_BUILD_PYTHON_MODULE) + add_triton_plugin(TritonProton ${CMAKE_CURRENT_SOURCE_DIR}/triton_proton.cc LINK_LIBS ProtonIR) +endif() diff --git a/third_party/proton/dialect/include/CMakeLists.txt b/third_party/proton/dialect/include/CMakeLists.txt new file mode 100644 index 0000000000..0ca0f41c5a --- /dev/null +++ b/third_party/proton/dialect/include/CMakeLists.txt @@ -0,0 +1 @@ +add_subdirectory(Dialect) diff --git a/third_party/proton/dialect/include/Dialect/CMakeLists.txt b/third_party/proton/dialect/include/Dialect/CMakeLists.txt new file mode 100644 index 0000000000..f18c30ba1a --- /dev/null +++ b/third_party/proton/dialect/include/Dialect/CMakeLists.txt @@ -0,0 +1 @@ +add_subdirectory(Proton) diff --git a/third_party/proton/dialect/include/Dialect/Proton/CMakeLists.txt b/third_party/proton/dialect/include/Dialect/Proton/CMakeLists.txt new file mode 100644 index 0000000000..f33061b2d8 --- /dev/null +++ b/third_party/proton/dialect/include/Dialect/Proton/CMakeLists.txt @@ -0,0 +1 @@ +add_subdirectory(IR) diff --git a/third_party/proton/dialect/include/Dialect/Proton/IR/CMakeLists.txt b/third_party/proton/dialect/include/Dialect/Proton/IR/CMakeLists.txt new file mode 100644 index 0000000000..4645b0ebcd --- /dev/null +++ b/third_party/proton/dialect/include/Dialect/Proton/IR/CMakeLists.txt @@ -0,0 +1,18 @@ +set(MLIR_BINARY_DIR ${CMAKE_BINARY_DIR}) + +set(LLVM_TARGET_DEFINITIONS ProtonOps.td) +mlir_tablegen(Dialect.h.inc -gen-dialect-decls -dialect=proton) +mlir_tablegen(Dialect.cpp.inc -gen-dialect-defs -dialect=proton) +mlir_tablegen(OpsConversions.inc -gen-llvmir-conversions) +mlir_tablegen(Ops.h.inc -gen-op-decls) +mlir_tablegen(Ops.cpp.inc -gen-op-defs) +mlir_tablegen(OpsEnums.h.inc -gen-enum-decls) +mlir_tablegen(OpsEnums.cpp.inc -gen-enum-defs) +add_mlir_doc(ProtonDialect 
ProtonDialect dialects/ -gen-dialect-doc) +add_mlir_doc(ProtonOps ProtonOps dialects/ -gen-op-doc) +add_public_tablegen_target(ProtonTableGen) + +set(LLVM_TARGET_DEFINITIONS ProtonAttrDefs.td) +mlir_tablegen(ProtonAttrDefs.h.inc -gen-attrdef-decls) +mlir_tablegen(ProtonAttrDefs.cpp.inc -gen-attrdef-defs) +add_public_tablegen_target(ProtonAttrDefsIncGen) diff --git a/third_party/proton/dialect/include/Dialect/Proton/IR/Dialect.h b/third_party/proton/dialect/include/Dialect/Proton/IR/Dialect.h new file mode 100644 index 0000000000..680a205f08 --- /dev/null +++ b/third_party/proton/dialect/include/Dialect/Proton/IR/Dialect.h @@ -0,0 +1,23 @@ +#ifndef TRITON_DIALECT_PROTON_IR_DIALECT_H_ +#define TRITON_DIALECT_PROTON_IR_DIALECT_H_ + +#include "mlir/Dialect/LLVMIR/LLVMDialect.h" +#include "mlir/IR/BuiltinOps.h" +#include "mlir/IR/Dialect.h" +#include "mlir/IR/PatternMatch.h" +#include "proton/dialect/include/Dialect/Proton/IR/Dialect.h.inc" +#include "proton/dialect/include/Dialect/Proton/IR/OpsEnums.h.inc" + +#define GET_ATTRDEF_CLASSES +#include "proton/dialect/include/Dialect/Proton/IR/ProtonAttrDefs.h.inc" + +#define GET_OP_CLASSES +#include "proton/dialect/include/Dialect/Proton/IR/Ops.h.inc" + +namespace mlir { +namespace triton { +namespace proton {} // namespace proton +} // namespace triton +} // namespace mlir + +#endif // TRITON_DIALECT_PROTON_IR_DIALECT_H_ diff --git a/third_party/proton/dialect/include/Dialect/Proton/IR/ProtonAttrDefs.td b/third_party/proton/dialect/include/Dialect/Proton/IR/ProtonAttrDefs.td new file mode 100644 index 0000000000..d469fbb35f --- /dev/null +++ b/third_party/proton/dialect/include/Dialect/Proton/IR/ProtonAttrDefs.td @@ -0,0 +1,12 @@ +#ifndef PROTON_ATTRDEFS +#define PROTON_ATTRDEFS + +include "mlir/IR/AttrTypeBase.td" +include "ProtonDialect.td" + +class Proton_Attr traits = [], + string baseCppClass = "::mlir::Attribute"> + : AttrDef { +} + +#endif // PROTON_ATTRDEFS diff --git 
a/third_party/proton/dialect/include/Dialect/Proton/IR/ProtonDialect.td b/third_party/proton/dialect/include/Dialect/Proton/IR/ProtonDialect.td new file mode 100644 index 0000000000..245f2e09a2 --- /dev/null +++ b/third_party/proton/dialect/include/Dialect/Proton/IR/ProtonDialect.td @@ -0,0 +1,18 @@ +#ifndef PROTON_DIALECT +#define PROTON_DIALECT + +include "mlir/IR/OpBase.td" + +def Proton_Dialect : Dialect { + let name = "proton"; + let cppNamespace = "::mlir::triton::proton"; + + let description = [{ + Proton Dialect provides core ops for building third-party compiler-based + performance profiling and analysis tools. + }]; + + let dependentDialects = []; +} + +#endif diff --git a/third_party/proton/dialect/include/Dialect/Proton/IR/ProtonOps.td b/third_party/proton/dialect/include/Dialect/Proton/IR/ProtonOps.td new file mode 100644 index 0000000000..d18a48d5d1 --- /dev/null +++ b/third_party/proton/dialect/include/Dialect/Proton/IR/ProtonOps.td @@ -0,0 +1,65 @@ +#ifndef PROTON_OPS +#define PROTON_OPS + +include "mlir/IR/OpBase.td" +include "mlir/IR/EnumAttr.td" +include "triton/Dialect/Triton/IR/TritonTypes.td" +include "mlir/Dialect/LLVMIR/LLVMOpBase.td" +include "mlir/Interfaces/InferTypeOpInterface.td" +include "mlir/Interfaces/SideEffectInterfaces.td" +include "triton/Dialect/Triton/IR/TritonInterfaces.td" +include "ProtonDialect.td" +include "ProtonAttrDefs.td" + +class TT_Proton_Op traits = []> : + Op { +} + +// Proton profiling metric. +def MetricAttr : I32EnumAttr< + "Metric", "", + [ + I32EnumAttrCase<"CYCLE", 0, "cycle">, + ]> { + let cppNamespace = "::mlir::triton::proton"; +} + +// Proton profiling granularity. 
+def GranularityAttr : I32EnumAttr< + "Granularity", "", + [ + I32EnumAttrCase<"WARPGROUP", 0, "warpgroup">, + I32EnumAttrCase<"WARP", 1, "warp">, + ]> { + let cppNamespace = "::mlir::triton::proton"; +} + +def TT_RecordOp : TT_Proton_Op<"record", [DeclareOpInterfaceMethods]> { + let summary = "Record a GPU hardware event"; + + let description = [{ + The operator records GPU events from performance counters. + Currently only cycle counter is supported. + + Example: + + ```mlir + proton.record() {isStart = true, regionId = 4 : i32} + ... + proton.record() {isStart = false, regionId = 4 : i32} + ... + proton.record() {isStart = true, regionId = 1 : i32, granularity = 1 : i32} + ... + proton.record() {isStart = false, regionId = 1 : i32, granularity = 1 : i32} + ``` + }]; + let arguments = ( + ins BoolAttr: $isStart, + ConfinedAttr:$regionId, + DefaultValuedAttr:$metric, + DefaultValuedAttr:$granularity + ); + let assemblyFormat = " `(` operands `)` attr-dict"; +} + +#endif // PROTON_OPS diff --git a/third_party/proton/dialect/lib/CMakeLists.txt b/third_party/proton/dialect/lib/CMakeLists.txt new file mode 100644 index 0000000000..0ca0f41c5a --- /dev/null +++ b/third_party/proton/dialect/lib/CMakeLists.txt @@ -0,0 +1 @@ +add_subdirectory(Dialect) diff --git a/third_party/proton/dialect/lib/Dialect/CMakeLists.txt b/third_party/proton/dialect/lib/Dialect/CMakeLists.txt new file mode 100644 index 0000000000..f18c30ba1a --- /dev/null +++ b/third_party/proton/dialect/lib/Dialect/CMakeLists.txt @@ -0,0 +1 @@ +add_subdirectory(Proton) diff --git a/third_party/proton/dialect/lib/Dialect/Proton/CMakeLists.txt b/third_party/proton/dialect/lib/Dialect/Proton/CMakeLists.txt new file mode 100644 index 0000000000..f33061b2d8 --- /dev/null +++ b/third_party/proton/dialect/lib/Dialect/Proton/CMakeLists.txt @@ -0,0 +1 @@ +add_subdirectory(IR) diff --git a/third_party/proton/dialect/lib/Dialect/Proton/IR/CMakeLists.txt b/third_party/proton/dialect/lib/Dialect/Proton/IR/CMakeLists.txt 
new file mode 100644 index 0000000000..5eea5cb3cf --- /dev/null +++ b/third_party/proton/dialect/lib/Dialect/Proton/IR/CMakeLists.txt @@ -0,0 +1,13 @@ +add_triton_library(ProtonIR + Dialect.cpp + Ops.cpp + + DEPENDS + ProtonTableGen + ProtonAttrDefsIncGen + + LINK_LIBS PUBLIC + MLIRLLVMDialect + TritonIR + TritonGPUIR +) diff --git a/third_party/proton/dialect/lib/Dialect/Proton/IR/Dialect.cpp b/third_party/proton/dialect/lib/Dialect/Proton/IR/Dialect.cpp new file mode 100644 index 0000000000..60c2852654 --- /dev/null +++ b/third_party/proton/dialect/lib/Dialect/Proton/IR/Dialect.cpp @@ -0,0 +1,25 @@ +#include "mlir/IR/DialectImplementation.h" +#include "mlir/IR/OpImplementation.h" + +// clang-format off +#include "Dialect/Proton/IR/Dialect.h" +#include "Dialect/Proton/IR/Dialect.cpp.inc" +// clang-format on + +using namespace mlir; +using namespace mlir::triton::proton; + +void mlir::triton::proton::ProtonDialect::initialize() { + addAttributes< +#define GET_ATTRDEF_LIST +#include "Dialect/Proton/IR/ProtonAttrDefs.cpp.inc" + >(); + + addOperations< +#define GET_OP_LIST +#include "Dialect/Proton/IR/Ops.cpp.inc" + >(); +} + +#define GET_ATTRDEF_CLASSES +#include "Dialect/Proton/IR/ProtonAttrDefs.cpp.inc" diff --git a/third_party/proton/dialect/lib/Dialect/Proton/IR/Ops.cpp b/third_party/proton/dialect/lib/Dialect/Proton/IR/Ops.cpp new file mode 100644 index 0000000000..1a0799aea1 --- /dev/null +++ b/third_party/proton/dialect/lib/Dialect/Proton/IR/Ops.cpp @@ -0,0 +1,33 @@ +#include "Dialect/Proton/IR/Dialect.h" +#include "mlir/IR/Builders.h" +#include "mlir/IR/BuiltinAttributes.h" +#include "mlir/IR/BuiltinTypes.h" +#include "mlir/IR/OperationSupport.h" +#include "mlir/Interfaces/FunctionImplementation.h" +#include "mlir/Interfaces/FunctionInterfaces.h" +#include "mlir/Support/LLVM.h" +#include "triton/Dialect/Triton/IR/Dialect.h" +#include "triton/Dialect/Triton/IR/Types.h" +#include "triton/Dialect/Triton/IR/Utility.h" + +#define GET_OP_CLASSES +#include 
"Dialect/Proton/IR/Ops.cpp.inc" +#include "Dialect/Proton/IR/OpsEnums.cpp.inc" + +namespace mlir { +namespace triton { +namespace proton { + +// -- RecordOp -- +void RecordOp::getEffects( + SmallVectorImpl> + &effects) { + effects.emplace_back(MemoryEffects::Write::get(), + SideEffects::DefaultResource::get()); + effects.emplace_back(MemoryEffects::Read::get(), + SideEffects::DefaultResource::get()); +} + +} // namespace proton +} // namespace triton +} // namespace mlir diff --git a/third_party/proton/dialect/triton_proton.cc b/third_party/proton/dialect/triton_proton.cc new file mode 100644 index 0000000000..8046539794 --- /dev/null +++ b/third_party/proton/dialect/triton_proton.cc @@ -0,0 +1,20 @@ +#include "Dialect/Proton/IR/Dialect.h" +#include "mlir/Pass/PassManager.h" +#include "passes.h" +#include +#include +#include + +namespace py = pybind11; + +void init_triton_proton(py::module &&m) { + auto passes = m.def_submodule("passes"); + + // load dialects + m.def("load_dialects", [](mlir::MLIRContext &context) { + mlir::DialectRegistry registry; + registry.insert(); + context.appendDialectRegistry(registry); + context.loadAllAvailableDialects(); + }); +}