diff --git a/.github/actions/setup-pytorch/action.yml b/.github/actions/setup-pytorch/action.yml
index b6cc894722..88f482f78a 100644
--- a/.github/actions/setup-pytorch/action.yml
+++ b/.github/actions/setup-pytorch/action.yml
@@ -83,7 +83,7 @@ runs:
       uses: ./.github/actions/load
       env:
         # Increase this value to reset cache
-        CACHE_NUMBER: 12
+        CACHE_NUMBER: 14
       with:
         path: pytorch
         key: pytorch-$PYTORCH_CACHE_KEY-$CACHE_NUMBER
@@ -120,7 +120,7 @@ runs:
         cd pytorch
         pip install wheel
         pip install -r requirements.txt
-        python setup.py bdist_wheel
+        USE_STATIC_MKL=1 python setup.py bdist_wheel
 
     - name: Install PyTorch (built from source)
       if: ${{ inputs.mode == 'source' }}
diff --git a/.github/workflows/integration-tests.yml b/.github/workflows/integration-tests.yml
index cfba6d7225..2922da501e 100644
--- a/.github/workflows/integration-tests.yml
+++ b/.github/workflows/integration-tests.yml
@@ -21,10 +21,12 @@ concurrency:
   cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
 permissions: read-all
 env:
+  TRITON_BUILD_WITH_CCACHE: "true"
   TRITON_BUILD_WITH_CLANG_LLD: "TRUE"
   TRITON_USE_ASSERT_ENABLED_LLVM: "TRUE"
   TRITON_DISABLE_LINE_INFO: 1
   PROTON_SKIP_PC_SAMPLING_TEST: 1
+  CCACHE_COMPRESS: "true"
 jobs:
   Runner-Preparation:
     runs-on: ubuntu-latest
@@ -39,6 +41,11 @@ jobs:
         if: github.event_name == 'pull_request'
         run: |
           echo "enable_integration=true" >> $GITHUB_ENV
+      - name: Decide manual trigger integration test enablement
+        # Always enable integration tests when manually triggered
+        if: github.event_name == 'workflow_dispatch'
+        run: |
+          echo "enable_integration=true" >> $GITHUB_ENV
       - name: Checkout post-submit commits
         if: github.event_name == 'push'
         uses: actions/checkout@v4
@@ -154,6 +161,8 @@ jobs:
     strategy:
       matrix:
         runner: ${{fromJson(needs.Runner-Preparation.outputs.matrix-CUDA)}}
+    env:
+      RUNNER_TYPE: ${{ matrix.runner[0] }}
     steps:
       - name: Checkout
         uses: actions/checkout@v4
@@ -199,22 +208,28 @@ jobs:
         # "restore" step.  This is to prevent the caches from accumulating stale
         # files over time.
         name: Restore cache of ccache and Triton compilation artifacts
-        if: github.event_name != 'push'
+        id: restore-build-cache
+        if: github.ref != 'refs/heads/main'
         uses: actions/cache/restore@v4
         with:
           path: |
             ~/.triton/cache
-            ~/.cache/ccache
+            ~/.ccache
           # Restore the most recent cache entry.
-          restore-keys: triton-artifacts-${{ runner.os }}-${{ runner.arch }}-${{ runner.name }}-llvm-${{ steps.cache-key.outputs.llvm }}-
+          restore-keys: |
+            triton-artifacts-${{ runner.os }}-${{ runner.arch }}-${{ env.RUNNER_TYPE }}-llvm-${{ steps.cache-key.outputs.llvm }}-
+            triton-artifacts-${{ runner.os }}-${{ runner.arch }}-${{ env.RUNNER_TYPE }}-
           # We expect this cache key never to hit and for us to fall back
           # unconditionally to the restore-key, so it doesn't actually matter
           # what we put here (so long as it doesn't hit an existing key).
-          key: triton-artifacts-${{ runner.os }}-${{ runner.arch }}-${{ runner.name }}-llvm-${{ steps.cache-key.outputs.llvm }}-${{ steps.cache-key.outputs.datetime }}
-      - name: Inspect cache directory
+          key: triton-artifacts-${{ runner.os }}-${{ runner.arch }}-${{ env.RUNNER_TYPE }}-llvm-${{ steps.cache-key.outputs.llvm }}-${{ steps.cache-key.outputs.datetime }}
+      - name: Inspect cache directories
         run: |
           mkdir -p ~/.triton
-          ls -alh ~/.triton
+          du -h -d 1 ~/.triton
+
+          mkdir -p ~/.ccache
+          du -h -d 1 ~/.ccache
       - name: Update PATH
         run: |
           echo "$HOME/.local/bin" >> $GITHUB_PATH
@@ -224,12 +239,14 @@ jobs:
           python3 -m pip install cython setuptools wheel cmake==3.24 ninja pytest-forked pytest-xdist lit
       - name: Install Triton
         env:
-          TRITON_BUILD_WITH_CCACHE: "true"
           CUDA_HOME: "/usr/local/cuda"
         run: |
           echo "PATH is '$PATH'"
           cd python
-          python3 -m pip install '.[tests]'
+          ccache --zero-stats
+          python3 -m pip install -v '.[tests]'
+      - name: CCache Stats
+        run: ccache --print-stats
       - name: Run lit tests
         run: |
           cd python
@@ -278,6 +295,13 @@ jobs:
           cd third_party/proton/test
           python3 -m pytest -s .
           cd ..
+      - name: Inspect cache directories
+        run: |
+          mkdir -p ~/.triton
+          du -h -d 1 ~/.triton
+
+          mkdir -p ~/.ccache
+          du -h -d 1 ~/.ccache
       - # If we're on branch `main`, save the ccache Triton compilation artifacts
         # to the cache so they can be used by other (non-main) CI runs.
         #
@@ -287,22 +311,17 @@ jobs:
         if: github.ref == 'refs/heads/main'
         uses: actions/cache/save@v4
         with:
-          path: ~/.triton/cache ~/.cache/ccache
-          key: triton-artifacts-${{ runner.os }}-${{ runner.arch }}-${{ runner.name }}-llvm-${{ steps.cache-key.outputs.llvm }}-${{ steps.cache-key.outputs.datetime }}
-      - name: Inspect cache directories
-        run: |
-          mkdir -p ~/.triton
-          ls -alh ~/.triton
-          du -sh ~/.triton/**
-
-          mkdir -p ~/.cache/ccache
-          ls -alh ~/.cache/ccache
-          du -sh ~/.cache/ccache
+          path: |
+            ~/.triton/cache
+            ~/.ccache
+          key: triton-artifacts-${{ runner.os }}-${{ runner.arch }}-${{ env.RUNNER_TYPE }}-llvm-${{ steps.cache-key.outputs.llvm }}-${{ steps.cache-key.outputs.datetime }}
   Integration-Tests-AMD:
     needs: Runner-Preparation
     if: needs.Runner-Preparation.outputs.matrix-HIP != ''
     runs-on: ${{ matrix.runner }}
     timeout-minutes: 30
+    env:
+      RUNNER_TYPE: ${{ matrix.runner[1] }}
     strategy:
       matrix:
         runner: ${{fromJson(needs.Runner-Preparation.outputs.matrix-HIP)}}
@@ -355,22 +374,28 @@ jobs:
         # "restore" step.  This is to prevent the caches from accumulating stale
         # files over time.
         name: Restore cache of ccache and Triton compilation artifacts
-        if: github.event_name != 'push'
+        id: restore-build-cache
+        if: github.ref != 'refs/heads/main'
         uses: actions/cache/restore@v4
         with:
           path: |
             ~/.triton/cache
-            ~/.cache/ccache
+            ~/.ccache
           # Restore the most recent cache entry.
-          restore-keys: triton-artifacts-${{ runner.os }}-${{ runner.arch }}-${{ runner.name }}-llvm-${{ steps.cache-key.outputs.llvm }}-
+          restore-keys: |
+            triton-artifacts-${{ runner.os }}-${{ runner.arch }}-${{ env.RUNNER_TYPE }}-llvm-${{ steps.cache-key.outputs.llvm }}-
+            triton-artifacts-${{ runner.os }}-${{ runner.arch }}-${{ env.RUNNER_TYPE }}-
           # We expect this cache key never to hit and for us to fall back
           # unconditionally to the restore-key, so it doesn't actually matter
           # what we put here (so long as it doesn't hit an existing key).
-          key: triton-artifacts-${{ runner.os }}-${{ runner.arch }}-${{ runner.name }}-llvm-${{ steps.cache-key.outputs.llvm }}-${{ steps.cache-key.outputs.datetime }}
-      - name: Inspect cache directory
+          key: triton-artifacts-${{ runner.os }}-${{ runner.arch }}-${{ env.RUNNER_TYPE }}-llvm-${{ steps.cache-key.outputs.llvm }}-${{ steps.cache-key.outputs.datetime }}
+      - name: Inspect cache directories
         run: |
           mkdir -p ~/.triton
-          ls -alh ~/.triton
+          du -h -d 1 ~/.triton
+
+          mkdir -p ~/.ccache
+          du -h -d 1 ~/.ccache
       - name: Update PATH
         run: |
           echo "/opt/rocm/llvm/bin" >> $GITHUB_PATH
@@ -378,17 +403,24 @@ jobs:
         run: |
           python3 -m pip install --upgrade pip
           python3 -m pip install lit
+      - name: Install apt dependencies
+        run: |
+          apt update
+          apt install ccache
       - name: Install Triton
         id: amd-install-triton
         run: |
           echo "PATH is '$PATH'"
           pip uninstall -y triton
           cd python
+          ccache --zero-stats
           pip install -v -e '.[tests]'
       - name: Clean up after an unsuccessful build
         if: ${{ !success() && steps.amd-install-triton.outcome != 'success' }}
         run: |
           rm -rf ~/.triton
+      - name: CCache Stats
+        run: ccache --print-stats
       - name: Run lit tests
         run: |
           cd python
@@ -431,6 +463,13 @@ jobs:
           cd python
           cd "build/$(ls build | grep -i cmake)"
           ctest -j32
+      - name: Inspect cache directories
+        run: |
+          mkdir -p ~/.triton
+          du -h -d 1 ~/.triton
+
+          mkdir -p ~/.ccache
+          du -h -d 1 ~/.ccache
       - # If we're on branch `main`, save the ccache Triton compilation artifacts
         # to the cache so they can be used by other (non-main) CI runs.
         #
@@ -440,17 +479,10 @@ jobs:
         if: github.ref == 'refs/heads/main'
         uses: actions/cache/save@v4
         with:
-          path: ~/.triton/cache ~/.cache/ccache
-          key: triton-artifacts-${{ runner.os }}-${{ runner.arch }}-${{ runner.name }}-llvm-${{ steps.cache-key.outputs.llvm }}-${{ steps.cache-key.outputs.datetime }}
-      - name: Inspect cache directories
-        run: |
-          mkdir -p ~/.triton
-          ls -alh ~/.triton
-          du -sh ~/.triton/**
-
-          mkdir -p ~/.cache/ccache
-          ls -alh ~/.cache/ccache
-          du -sh ~/.cache/ccache
+          path: |
+            ~/.triton/cache
+            ~/.ccache
+          key: triton-artifacts-${{ runner.os }}-${{ runner.arch }}-${{ env.RUNNER_TYPE }}-llvm-${{ steps.cache-key.outputs.llvm }}-${{ steps.cache-key.outputs.datetime }}
       - name: Clean up caches
         run: |
           rm -rf ~/.triton/cache
@@ -458,10 +490,12 @@ jobs:
     needs: Runner-Preparation
     if: needs.Runner-Preparation.outputs.matrix-MACOS != ''
     runs-on: ${{ matrix.runner }}
-    timeout-minutes: 30
+    timeout-minutes: 40
     strategy:
       matrix:
         runner: ${{fromJson(needs.Runner-Preparation.outputs.matrix-MACOS)}}
+    env:
+      RUNNER_TYPE: ${{ matrix.runner[0] }}
     steps:
       - name: Checkout
         uses: actions/checkout@v4
@@ -470,7 +504,7 @@ jobs:
       - name: Install brew dependencies
         run: |
           brew update
-          brew install ccache llvm@19 lld
+          brew install ccache llvm@19 lld coreutils
       - name: Compute cache keys
         id: cache-key
         run: |
@@ -511,22 +545,28 @@ jobs:
         # "restore" step.  This is to prevent the caches from accumulating stale
         # files over time.
         name: Restore cache of ccache and Triton compilation artifacts
-        if: github.event_name != 'push'
+        id: restore-build-cache
+        if: github.ref != 'refs/heads/main'
         uses: actions/cache/restore@v4
         with:
           path: |
             ~/.triton/cache
-            ~/.cache/ccache
+            ~/.ccache
           # Restore the most recent cache entry.
-          restore-keys: triton-artifacts-${{ runner.os }}-${{ runner.arch }}-${{ runner.name }}-llvm-${{ steps.cache-key.outputs.llvm }}-
+          restore-keys: |
+            triton-artifacts-${{ runner.os }}-${{ runner.arch }}-${{ env.RUNNER_TYPE }}-llvm-${{ steps.cache-key.outputs.llvm }}-
+            triton-artifacts-${{ runner.os }}-${{ runner.arch }}-${{ env.RUNNER_TYPE }}-
           # We expect this cache key never to hit and for us to fall back
           # unconditionally to the restore-key, so it doesn't actually matter
           # what we put here (so long as it doesn't hit an existing key).
-          key: triton-artifacts-${{ runner.os }}-${{ runner.arch }}-${{ runner.name }}-llvm-${{ steps.cache-key.outputs.llvm }}-${{ steps.cache-key.outputs.datetime }}
-      - name: Inspect cache directory
+          key: triton-artifacts-${{ runner.os }}-${{ runner.arch }}-${{ env.RUNNER_TYPE }}-llvm-${{ steps.cache-key.outputs.llvm }}-${{ steps.cache-key.outputs.datetime }}
+      - name: Inspect cache directories
         run: |
           mkdir -p ~/.triton
-          ls -alh ~/.triton
+          du -h -d 1 ~/.triton
+
+          mkdir -p ~/.ccache
+          du -h -d 1 ~/.ccache
       - name: Update PATH
         run: |
           echo "$HOME/.local/bin" >> $GITHUB_PATH
@@ -539,7 +579,6 @@ jobs:
           python3 -m pip install cython setuptools wheel cmake==3.24 ninja pytest-xdist lit pybind11
       - name: Install Triton
         env:
-          TRITON_BUILD_WITH_CCACHE: "true"
           TRITON_BUILD_WITH_O1: "true"
           # macos-latest has 3 vcpus and 7GB DRAM, to save memory we limit the number of jobs to 3
           # https://docs.github.com/en/actions/using-github-hosted-runners/about-github-hosted-runners/about-github-hosted-runners#standard-github-hosted-runners-for-public-repositories
@@ -548,7 +587,17 @@ jobs:
           source ~/.venv/bin/activate
           echo "PATH is '$PATH'"
           cd python
-          python3 -m pip install --no-build-isolation .
+          ccache --zero-stats
+          python3 -m pip install -v --no-build-isolation .
+      - name: CCache Stats
+        run: ccache --print-stats
+      - name: Inspect cache directories
+        run: |
+          mkdir -p ~/.triton
+          du -h -d 1 ~/.triton
+
+          mkdir -p ~/.ccache
+          du -h -d 1 ~/.ccache
       - # If we're on branch `main`, save the ccache Triton compilation artifacts
         # to the cache so they can be used by other (non-main) CI runs.
         #
@@ -558,14 +607,7 @@ jobs:
         if: github.ref == 'refs/heads/main'
         uses: actions/cache/save@v4
         with:
-          path: ~/.triton/cache ~/.cache/ccache
-          key: triton-artifacts-${{ runner.os }}-${{ runner.arch }}-${{ runner.name }}-llvm-${{ steps.cache-key.outputs.llvm }}-${{ steps.cache-key.outputs.datetime }}
-      - name: Inspect cache directories
-        run: |
-          mkdir -p ~/.triton
-          ls -alh ~/.triton
-          du -sh ~/.triton/**
-
-          mkdir -p ~/.cache/ccache
-          ls -alh ~/.cache/ccache
-          du -sh ~/.cache/ccache
+          path: |
+            ~/.triton/cache
+            ~/.ccache
+          key: triton-artifacts-${{ runner.os }}-${{ runner.arch }}-${{ env.RUNNER_TYPE }}-llvm-${{ steps.cache-key.outputs.llvm }}-${{ steps.cache-key.outputs.datetime }}
diff --git a/.github/workflows/integration-tests.yml.in b/.github/workflows/integration-tests.yml.in
index 7da4aa0793..7de7264272 100644
--- a/.github/workflows/integration-tests.yml.in
+++ b/.github/workflows/integration-tests.yml.in
@@ -23,10 +23,12 @@ concurrency:
 permissions: read-all
 
 env:
+  TRITON_BUILD_WITH_CCACHE: "true"
   TRITON_BUILD_WITH_CLANG_LLD: "TRUE"
   TRITON_USE_ASSERT_ENABLED_LLVM: "TRUE"
   TRITON_DISABLE_LINE_INFO: 1
   PROTON_SKIP_PC_SAMPLING_TEST: 1
+  CCACHE_COMPRESS: "true"
 
 jobs:
   Runner-Preparation:
@@ -43,6 +45,12 @@ jobs:
         run: |
           echo "enable_integration=true" >> $GITHUB_ENV
 
+      - name: Decide manual trigger integration test enablement
+        # Always enable integration tests when manually triggered
+        if: github.event_name == 'workflow_dispatch'
+        run: |
+          echo "enable_integration=true" >> $GITHUB_ENV
+
       - name: Checkout post-submit commits
         if: github.event_name == 'push'
         uses: actions/checkout@v4
@@ -174,6 +182,9 @@ jobs:
       matrix:
         runner: ${{fromJson(needs.Runner-Preparation.outputs.matrix-CUDA)}}
 
+    env:
+        RUNNER_TYPE: ${{ matrix.runner[0] }}
+
     steps:
       - name: Checkout
         uses: actions/checkout@v4
@@ -225,24 +236,30 @@ jobs:
       # files over time.
       - &restore-build-artifacts-step
         name: Restore cache of ccache and Triton compilation artifacts
-        if: github.event_name != 'push'
+        id: restore-build-cache
+        if: github.ref != 'refs/heads/main'
         uses: actions/cache/restore@v4
         with:
           path: |
             ~/.triton/cache
-            ~/.cache/ccache
+            ~/.ccache
           # Restore the most recent cache entry.
-          restore-keys: triton-artifacts-${{ runner.os }}-${{ runner.arch }}-${{ runner.name }}-llvm-${{ steps.cache-key.outputs.llvm }}-
+          restore-keys: |
+            triton-artifacts-${{ runner.os }}-${{ runner.arch }}-${{ env.RUNNER_TYPE }}-llvm-${{ steps.cache-key.outputs.llvm }}-
+            triton-artifacts-${{ runner.os }}-${{ runner.arch }}-${{ env.RUNNER_TYPE }}-
           # We expect this cache key never to hit and for us to fall back
           # unconditionally to the restore-key, so it doesn't actually matter
           # what we put here (so long as it doesn't hit an existing key).
-          key: triton-artifacts-${{ runner.os }}-${{ runner.arch }}-${{ runner.name }}-llvm-${{ steps.cache-key.outputs.llvm }}-${{ steps.cache-key.outputs.datetime }}
+          key: triton-artifacts-${{ runner.os }}-${{ runner.arch }}-${{ env.RUNNER_TYPE }}-llvm-${{ steps.cache-key.outputs.llvm }}-${{ steps.cache-key.outputs.datetime }}
 
-      - &inspect-cache-directory-step
-        name: Inspect cache directory
+      - &inspect-cache-directories-step
+        name: Inspect cache directories
         run: |
           mkdir -p ~/.triton
-          ls -alh ~/.triton
+          du -h -d 1 ~/.triton
+
+          mkdir -p ~/.ccache
+          du -h -d 1 ~/.ccache
 
       - name: Update PATH
         run: |
@@ -255,12 +272,16 @@ jobs:
 
       - name: Install Triton
         env:
-          TRITON_BUILD_WITH_CCACHE: "true"
           CUDA_HOME: "/usr/local/cuda"
         run: |
           echo "PATH is '$PATH'"
           cd python
-          python3 -m pip install '.[tests]'
+          ccache --zero-stats
+          python3 -m pip install -v '.[tests]'
+
+      - &print-ccache-stats
+        name: CCache Stats
+        run: ccache --print-stats
 
       - &run-lit-tests-step
         name: Run lit tests
@@ -319,6 +340,8 @@ jobs:
           python3 -m pytest -s .
           cd ..
 
+      - *inspect-cache-directories-step
+
       # If we're on branch `main`, save the ccache Triton compilation artifacts
       # to the cache so they can be used by other (non-main) CI runs.
       #
@@ -329,19 +352,10 @@ jobs:
         if: github.ref == 'refs/heads/main'
         uses: actions/cache/save@v4
         with:
-          path: ~/.triton/cache ~/.cache/ccache
-          key: triton-artifacts-${{ runner.os }}-${{ runner.arch }}-${{ runner.name }}-llvm-${{ steps.cache-key.outputs.llvm }}-${{ steps.cache-key.outputs.datetime }}
-
-      - &inspect-cache-directories-step
-        name: Inspect cache directories
-        run: |
-          mkdir -p ~/.triton
-          ls -alh ~/.triton
-          du -sh ~/.triton/**
-
-          mkdir -p ~/.cache/ccache
-          ls -alh ~/.cache/ccache
-          du -sh ~/.cache/ccache
+          path: |
+            ~/.triton/cache
+            ~/.ccache
+          key: triton-artifacts-${{ runner.os }}-${{ runner.arch }}-${{ env.RUNNER_TYPE }}-llvm-${{ steps.cache-key.outputs.llvm }}-${{ steps.cache-key.outputs.datetime }}
 
   Integration-Tests-AMD:
     needs: Runner-Preparation
@@ -350,6 +364,9 @@ jobs:
     runs-on: ${{ matrix.runner }}
     timeout-minutes: 30
 
+    env:
+        RUNNER_TYPE: ${{ matrix.runner[1] }}
+
     strategy:
       matrix:
         runner: ${{fromJson(needs.Runner-Preparation.outputs.matrix-HIP)}}
@@ -369,7 +386,7 @@ jobs:
       - *compute-cache-keys-step
       - *cache-build-dependencies-step
       - *restore-build-artifacts-step
-      - *inspect-cache-directory-step
+      - *inspect-cache-directories-step
 
       - name: Update PATH
         run: |
@@ -380,12 +397,18 @@ jobs:
           python3 -m pip install --upgrade pip
           python3 -m pip install lit
 
+      - name: Install apt dependencies
+        run: |
+          apt update
+          apt install ccache
+
       - name: Install Triton
         id: amd-install-triton
         run: |
           echo "PATH is '$PATH'"
           pip uninstall -y triton
           cd python
+          ccache --zero-stats
           pip install -v -e '.[tests]'
 
       - name: Clean up after an unsuccessful build
@@ -393,6 +416,7 @@ jobs:
         run: |
           rm -rf ~/.triton
 
+      - *print-ccache-stats
       - *run-lit-tests-step
 
       - name: Run python tests on HIP
@@ -423,8 +447,8 @@ jobs:
 
       - *run-proton-tests-step
       - *run-cpp-unittests-step
-      - *save-build-artifacts-step
       - *inspect-cache-directories-step
+      - *save-build-artifacts-step
 
       - name: Clean up caches
         run: |
@@ -434,10 +458,14 @@ jobs:
     needs: Runner-Preparation
     if: needs.Runner-Preparation.outputs.matrix-MACOS != ''
     runs-on: ${{ matrix.runner }}
-    timeout-minutes: 30
+    timeout-minutes: 40
     strategy:
       matrix:
         runner: ${{fromJson(needs.Runner-Preparation.outputs.matrix-MACOS)}}
+
+    env:
+        RUNNER_TYPE: ${{ matrix.runner[0] }}
+
     steps:
       - name: Checkout
         uses: actions/checkout@v4
@@ -446,12 +474,12 @@ jobs:
       - name: Install brew dependencies
         run: |
           brew update
-          brew install ccache llvm@19 lld
+          brew install ccache llvm@19 lld coreutils
 
       - *compute-cache-keys-step
       - *cache-build-dependencies-step
       - *restore-build-artifacts-step
-      - *inspect-cache-directory-step
+      - *inspect-cache-directories-step
 
       - name: Update PATH
         run: |
@@ -465,7 +493,6 @@ jobs:
           python3 -m pip install cython setuptools wheel cmake==3.24 ninja pytest-xdist lit pybind11
       - name: Install Triton
         env:
-          TRITON_BUILD_WITH_CCACHE: "true"
           TRITON_BUILD_WITH_O1: "true"
           # macos-latest has 3 vcpus and 7GB DRAM, to save memory we limit the number of jobs to 3
           # https://docs.github.com/en/actions/using-github-hosted-runners/about-github-hosted-runners/about-github-hosted-runners#standard-github-hosted-runners-for-public-repositories
@@ -474,7 +501,9 @@ jobs:
           source ~/.venv/bin/activate
           echo "PATH is '$PATH'"
           cd python
-          python3 -m pip install --no-build-isolation .
+          ccache --zero-stats
+          python3 -m pip install -v --no-build-isolation .
 
-      - *save-build-artifacts-step
+      - *print-ccache-stats
       - *inspect-cache-directories-step
+      - *save-build-artifacts-step
diff --git a/CMakeLists.txt b/CMakeLists.txt
index aa9bd605c9..e4d16d4f9d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -45,6 +45,13 @@ if(TRITON_BUILD_WITH_CCACHE)
   endif()
 endif()
 
+set(TRITON_PARALLEL_LINK_JOBS "" CACHE STRING
+  "Define the maximum number of concurrent link jobs (Ninja only).")
+if (TRITON_PARALLEL_LINK_JOBS)
+    set_property(GLOBAL APPEND PROPERTY JOB_POOLS link_job_pool=${TRITON_PARALLEL_LINK_JOBS})
+    set(CMAKE_JOB_POOL_LINK link_job_pool)
+endif()
+
 
 # Ensure Python3 vars are set correctly
 # used conditionally in this file and by lit tests
@@ -226,6 +233,9 @@ if(TRITON_BUILD_PYTHON_MODULE)
   if (TRITON_BUILD_PROTON)
     add_subdirectory(third_party/proton)
   endif()
+  # The proton dialect is always built, even when TRITON_BUILD_PROTON is off.
+  list(APPEND TRITON_PLUGIN_NAMES "proton")
+  add_subdirectory(third_party/proton/dialect)
 
   get_property(triton_libs GLOBAL PROPERTY TRITON_LIBS)
   get_property(triton_plugins GLOBAL PROPERTY TRITON_PLUGINS)
@@ -334,6 +344,7 @@ if(NOT TRITON_BUILD_PYTHON_MODULE)
   foreach(CODEGEN_BACKEND ${TRITON_CODEGEN_BACKENDS})
     add_subdirectory(third_party/${CODEGEN_BACKEND})
   endforeach()
+  add_subdirectory(third_party/proton/dialect)
 endif()
 if(WIN32)
   option(CMAKE_USE_WIN32_THREADS_INIT "using WIN32 threads" ON)
diff --git a/bin/CMakeLists.txt b/bin/CMakeLists.txt
index b32e533b64..aa1293bd49 100644
--- a/bin/CMakeLists.txt
+++ b/bin/CMakeLists.txt
@@ -102,6 +102,7 @@ add_llvm_executable(triton-tensor-layout triton-tensor-layout.cpp PARTIAL_SOURCE
 target_link_libraries(triton-tensor-layout PRIVATE
   TritonGPUIR
   TritonNvidiaGPUIR
+  TritonIntelGPUIR
   ${triton_libs}
   ${conversion_libs}
   ${dialect_libs}
diff --git a/bin/RegisterTritonDialects.h b/bin/RegisterTritonDialects.h
index b103adeaba..a59956af5c 100644
--- a/bin/RegisterTritonDialects.h
+++ b/bin/RegisterTritonDialects.h
@@ -12,6 +12,7 @@
 #include "amd/include/Dialect/TritonAMDGPU/IR/Dialect.h"
 #include "amd/include/TritonAMDGPUTransforms/Passes.h"
 #include "third_party/nvidia/include/Dialect/NVGPU/IR/Dialect.h"
+#include "third_party/proton/dialect/include/Dialect/Proton/IR/Dialect.h"
 #include "triton/Dialect/Triton/IR/Dialect.h"
 #include "triton/Dialect/TritonGPU/IR/Dialect.h"
 #include "triton/Dialect/TritonNvidiaGPU/IR/Dialect.h"
@@ -93,14 +94,15 @@ inline void registerTritonDialects(mlir::DialectRegistry &registry) {
   mlir::triton::registerTritonAMDGPULowerInstructionSchedHints();
 
   // TODO: register Triton & TritonGPU passes
-  registry.insert<mlir::triton::TritonDialect, mlir::cf::ControlFlowDialect,
-                  mlir::triton::nvidia_gpu::TritonNvidiaGPUDialect,
-                  mlir::triton::gpu::TritonGPUDialect, mlir::math::MathDialect,
-                  mlir::arith::ArithDialect, mlir::scf::SCFDialect,
-                  mlir::gpu::GPUDialect, mlir::LLVM::LLVMDialect,
-                  mlir::NVVM::NVVMDialect, mlir::triton::nvgpu::NVGPUDialect,
-                  mlir::triton::amdgpu::TritonAMDGPUDialect,
-                  mlir::ROCDL::ROCDLDialect,
-                  mlir::triton::gpu::intel::TritonIntelGPUDialect,
-                  mlir::triton::TritonGEN::TritonGENDialect>();
+  registry
+      .insert<mlir::triton::TritonDialect, mlir::cf::ControlFlowDialect,
+              mlir::triton::nvidia_gpu::TritonNvidiaGPUDialect,
+              mlir::triton::gpu::TritonGPUDialect, mlir::math::MathDialect,
+              mlir::arith::ArithDialect, mlir::scf::SCFDialect,
+              mlir::gpu::GPUDialect, mlir::LLVM::LLVMDialect,
+              mlir::NVVM::NVVMDialect, mlir::triton::nvgpu::NVGPUDialect,
+              mlir::triton::amdgpu::TritonAMDGPUDialect,
+              mlir::triton::proton::ProtonDialect, mlir::ROCDL::ROCDLDialect,
+              mlir::triton::gpu::intel::TritonIntelGPUDialect,
+              mlir::triton::TritonGEN::TritonGENDialect>();
 }
diff --git a/bin/triton-tensor-layout.cpp b/bin/triton-tensor-layout.cpp
index 4087ac1350..b330cfb5aa 100644
--- a/bin/triton-tensor-layout.cpp
+++ b/bin/triton-tensor-layout.cpp
@@ -80,17 +80,9 @@ static cl::opt<std::string> TensorStr(
 //===--------------------------------------------------------------------===//
 
 LogicalResult layoutPrint(RankedTensorType tensorType, raw_ostream &os) {
-  StringRef dialectName = tensorType.getEncoding().getDialect().getNamespace();
-
   // Dispatch to the corresponding dialect helper function to print the layout.
-  if (dialectName == "triton_gpu") {
-    os << triton::gpu::getLayoutStr(tensorType, UseHWPointOfView);
-    return success();
-  }
-
-  llvm::errs() << "Unsupported tensor layout attribute: "
-               << tensorType.getEncoding() << "\n";
-  return failure();
+  os << triton::gpu::getLayoutStr(tensorType, UseHWPointOfView);
+  return success();
 }
 
 LogicalResult printLayoutFromFile(MLIRContext *context, StringRef filename,
diff --git a/docs/update_sycl_libdevice.md b/docs/update_sycl_libdevice.md
new file mode 100644
index 0000000000..307b0c887a
--- /dev/null
+++ b/docs/update_sycl_libdevice.md
@@ -0,0 +1,90 @@
+# Guide to Update SYCL Device Library
+
+This guide will walk you through the steps to update the SYCL device library using the Intel DPC++ compiler.
+
+## Step 1: Display the Commands Used During Compilation
+1. Open a terminal.
+2. Run the following command to compile a C++ file:
+```sh
+dpcpp -save-temps -#x t.cpp
+```
+Replace `t.cpp` with any C++ file of your choice. This command will display the commands used during the compilation process.
+
+## Step 2: Locate the llvm-link Command
+From the output of the previous command, find the llvm-link command line. It should look similar to the following example:
+```sh
+"/opt/intel/oneapi/compiler/2025.0/bin/compiler/llvm-link" \
+    -only-needed \
+    t-sycl-spir64-unknown-unknown-b331ea.bc \
+    /opt/intel/oneapi/compiler/2025.0/bin/compiler/../../lib/libsycl-crt.bc \
+    /opt/intel/oneapi/compiler/2025.0/bin/compiler/../../lib/libsycl-complex.bc \
+    /opt/intel/oneapi/compiler/2025.0/bin/compiler/../../lib/libsycl-complex-fp64.bc \
+    /opt/intel/oneapi/compiler/2025.0/bin/compiler/../../lib/libsycl-cmath.bc \
+    /opt/intel/oneapi/compiler/2025.0/bin/compiler/../../lib/libsycl-cmath-fp64.bc \
+    /opt/intel/oneapi/compiler/2025.0/bin/compiler/../../lib/libsycl-imf.bc \
+    /opt/intel/oneapi/compiler/2025.0/bin/compiler/../../lib/libsycl-imf-fp64.bc \
+    /opt/intel/oneapi/compiler/2025.0/bin/compiler/../../lib/libsycl-imf-bf16.bc \
+    /opt/intel/oneapi/compiler/2025.0/bin/compiler/../../lib/libsycl-fallback-cassert.bc \
+    /opt/intel/oneapi/compiler/2025.0/bin/compiler/../../lib/libsycl-fallback-cstring.bc \
+    /opt/intel/oneapi/compiler/2025.0/bin/compiler/../../lib/libsycl-fallback-complex.bc \
+    /opt/intel/oneapi/compiler/2025.0/bin/compiler/../../lib/libsycl-fallback-complex-fp64.bc \
+    /opt/intel/oneapi/compiler/2025.0/bin/compiler/../../lib/libsycl-fallback-cmath.bc \
+    /opt/intel/oneapi/compiler/2025.0/bin/compiler/../../lib/libsycl-fallback-cmath-fp64.bc \
+    /opt/intel/oneapi/compiler/2025.0/bin/compiler/../../lib/libsycl-fallback-imf.bc \
+    /opt/intel/oneapi/compiler/2025.0/bin/compiler/../../lib/libsycl-fallback-imf-fp64.bc \
+    /opt/intel/oneapi/compiler/2025.0/bin/compiler/../../lib/libsycl-fallback-imf-bf16.bc \
+    /opt/intel/oneapi/compiler/2025.0/bin/compiler/../../lib/libsycl-itt-user-wrappers.bc \
+    /opt/intel/oneapi/compiler/2025.0/bin/compiler/../../lib/libsycl-itt-compiler-wrappers.bc \
+    /opt/intel/oneapi/compiler/2025.0/bin/compiler/../../lib/libsycl-itt-stubs.bc \
+    -o \
+    t-sycl-spir64-unknown-unknown-d81f68.bc \
+    --suppress-warnings
+```
+
+## Step 3: Modify the llvm-link Command
+Remove the `-only-needed` option and the intermediate file `t-sycl-spir64-unknown-unknown-b331ea.bc` from the command line.
+Then change the output file name to `libsycl-spir64-unknown-unknown.bc`.
+The modified command should look like this:
+```sh
+"/opt/intel/oneapi/compiler/2025.0/bin/compiler/llvm-link" \
+    /opt/intel/oneapi/compiler/2025.0/bin/compiler/../../lib/libsycl-crt.bc \
+    /opt/intel/oneapi/compiler/2025.0/bin/compiler/../../lib/libsycl-complex.bc \
+    /opt/intel/oneapi/compiler/2025.0/bin/compiler/../../lib/libsycl-complex-fp64.bc \
+    /opt/intel/oneapi/compiler/2025.0/bin/compiler/../../lib/libsycl-cmath.bc \
+    /opt/intel/oneapi/compiler/2025.0/bin/compiler/../../lib/libsycl-cmath-fp64.bc \
+    /opt/intel/oneapi/compiler/2025.0/bin/compiler/../../lib/libsycl-imf.bc \
+    /opt/intel/oneapi/compiler/2025.0/bin/compiler/../../lib/libsycl-imf-fp64.bc \
+    /opt/intel/oneapi/compiler/2025.0/bin/compiler/../../lib/libsycl-imf-bf16.bc \
+    /opt/intel/oneapi/compiler/2025.0/bin/compiler/../../lib/libsycl-fallback-cassert.bc \
+    /opt/intel/oneapi/compiler/2025.0/bin/compiler/../../lib/libsycl-fallback-cstring.bc \
+    /opt/intel/oneapi/compiler/2025.0/bin/compiler/../../lib/libsycl-fallback-complex.bc \
+    /opt/intel/oneapi/compiler/2025.0/bin/compiler/../../lib/libsycl-fallback-complex-fp64.bc \
+    /opt/intel/oneapi/compiler/2025.0/bin/compiler/../../lib/libsycl-fallback-cmath.bc \
+    /opt/intel/oneapi/compiler/2025.0/bin/compiler/../../lib/libsycl-fallback-cmath-fp64.bc \
+    /opt/intel/oneapi/compiler/2025.0/bin/compiler/../../lib/libsycl-fallback-imf.bc \
+    /opt/intel/oneapi/compiler/2025.0/bin/compiler/../../lib/libsycl-fallback-imf-fp64.bc \
+    /opt/intel/oneapi/compiler/2025.0/bin/compiler/../../lib/libsycl-fallback-imf-bf16.bc \
+    /opt/intel/oneapi/compiler/2025.0/bin/compiler/../../lib/libsycl-itt-user-wrappers.bc \
+    /opt/intel/oneapi/compiler/2025.0/bin/compiler/../../lib/libsycl-itt-compiler-wrappers.bc \
+    /opt/intel/oneapi/compiler/2025.0/bin/compiler/../../lib/libsycl-itt-stubs.bc \
+    -o \
+    libsycl-spir64-unknown-unknown.bc \
+    --suppress-warnings
+```
+
+## Step 4: Execute the Modified Command
+Copy the modified llvm-link command.
+Paste and run it in the terminal.
+
+## Step 5: Check for Manual Changes
+Check the log of the existing device library to see what manual changes need to be made:
+```sh
+git log third_party/intel/backend/lib/libsycl-spir64-unknown-unknown.bc
+```
+Look for any specific changes mentioned in the commit messages. For example, according to commit 0dd37fc92c46f35c6ced34801e51058b6b89ea47, you need to change one of the module metadata values from 4 to 3.
+
+## Step 6: Apply Manual Changes
+Use `llvm-dis` to disassemble the bitcode library, then, based on the information from the git log, apply the necessary manual changes to the updated device library.
+Reassemble the modified LLVM IR device library using `llvm-as`.
+
+By following these steps, you will have successfully updated the SYCL device library and applied any necessary manual changes.
diff --git a/lib/Conversion/TritonGPUToLLVM/ConvertLayoutOpToLLVM.cpp b/lib/Conversion/TritonGPUToLLVM/ConvertLayoutOpToLLVM.cpp
index 8e8b089549..2d06980809 100644
--- a/lib/Conversion/TritonGPUToLLVM/ConvertLayoutOpToLLVM.cpp
+++ b/lib/Conversion/TritonGPUToLLVM/ConvertLayoutOpToLLVM.cpp
@@ -374,24 +374,24 @@ struct ConvertLayoutOpUsingLinearLayoutsConversion
     // TODO (Keren): Currently, we handle general mma/blocked/slice/dot(ampere)
     // -> mma/blocked/slice/dot(ampere) conversions. The following tasks must be
     // completed before we can remove the layoutIsOK check:
-    // 1. Support for AMD's MFMA and WMMA
+    // 1. Support for AMD's WMMA
     std::function<bool(Attribute)> layoutIsOK = [&](Attribute layout) {
-      if (auto nvidiaMma = dyn_cast<NvidiaMmaEncodingAttr>(layout)) {
-        if (useLegacyMMAConversion) {
-          return false;
-        }
-        return true;
+      if (isa<NvidiaMmaEncodingAttr, AMDMfmaEncodingAttr>(layout)) {
+        return !useLegacyMMAConversion;
       }
       if (auto dotOperand = dyn_cast<DotOperandEncodingAttr>(layout)) {
-        if (auto nvidiaMma =
-                dyn_cast<NvidiaMmaEncodingAttr>(dotOperand.getParent())) {
-          if (useLegacyMMAConversion) {
-            return false;
-          }
+        auto parent = dotOperand.getParent();
+        if (isa<MmaEncodingTrait>(parent) && useLegacyMMAConversion) {
+          return false;
+        }
+        if (auto nvidiaMma = dyn_cast<NvidiaMmaEncodingAttr>(parent)) {
           if (nvidiaMma.isAmpere()) {
             return true;
           }
         }
+        if (isa<AMDMfmaEncodingAttr>(parent)) {
+          return true;
+        }
         return false;
       }
       if (isa<BlockedEncodingAttr>(layout)) {
diff --git a/python/setup.py b/python/setup.py
index 65388d8664..1e6dee4cf6 100644
--- a/python/setup.py
+++ b/python/setup.py
@@ -523,6 +523,7 @@ def build_extension(self, ext):
             "TRITON_BUILD_PROTON",
             "TRITON_BUILD_TUTORIALS",
             "TRITON_BUILD_WITH_CCACHE",
+            "TRITON_PARALLEL_LINK_JOBS",
         ]
         cmake_args += [f"-D{option}={os.getenv(option)}" for option in passthrough_args if option in os.environ]
 
diff --git a/scripts/compile-pytorch-ipex.sh b/scripts/compile-pytorch-ipex.sh
index 7c5a41f6c8..d753abe113 100755
--- a/scripts/compile-pytorch-ipex.sh
+++ b/scripts/compile-pytorch-ipex.sh
@@ -117,7 +117,7 @@ if [[ $BUILD_PYTORCH = true ]]; then
   echo "****** Building $PYTORCH_PROJ ******"
   pip install -r requirements.txt
   pip install cmake ninja "numpy<2.0"
-  python setup.py bdist_wheel
+  USE_STATIC_MKL=1 python setup.py bdist_wheel
 
   echo "****** Installing PyTorch ******"
   pip install dist/*.whl
diff --git a/scripts/install-pytorch.sh b/scripts/install-pytorch.sh
index a8a0f2b83a..74b3ac5158 100755
--- a/scripts/install-pytorch.sh
+++ b/scripts/install-pytorch.sh
@@ -155,7 +155,7 @@ $SCRIPTS_DIR/patch-pytorch.sh
 echo "****** Building $PYTORCH_PROJ ******"
 pip install -r requirements.txt
 pip install cmake ninja
-python setup.py bdist_wheel
+USE_STATIC_MKL=1 python setup.py bdist_wheel
 
 echo "****** Installing PyTorch ******"
 pip install dist/*.whl
diff --git a/scripts/skiplist/a770/language.txt b/scripts/skiplist/a770/language.txt
index e833b924bd..7e3e8d62fc 100644
--- a/scripts/skiplist/a770/language.txt
+++ b/scripts/skiplist/a770/language.txt
@@ -1,7 +1,5 @@
 # https://github.com/intel/intel-xpu-backend-for-triton/issues/1434
 test/unit/language/test_core.py::test_precise_math[1-tl.math.sqrt_rn(x)-tl.math.sqrt(x.to(tl.float64)).to(tl.float32)]
-# https://github.com/intel/intel-xpu-backend-for-triton/issues/2662
-test/unit/language/test_core.py::test_scan_layouts[True-1-src_layout10-64-32]
 # https://github.com/intel/intel-xpu-backend-for-triton/issues/2703
 test/unit/language/test_core.py::test_chained_reductions[in_shape0-perm0-red_dims0]
 test/unit/language/test_core.py::test_dot3d[1-1-32-32-32-32-32-float16-float16]
diff --git a/scripts/skiplist/conda/language.txt b/scripts/skiplist/conda/language.txt
index 41035163ff..1f2dcf0d10 100644
--- a/scripts/skiplist/conda/language.txt
+++ b/scripts/skiplist/conda/language.txt
@@ -115,8 +115,6 @@ test/unit/language/test_core.py::test_dot_max_num_imprecise_acc[64-float8e4b15-1
 test/unit/language/test_core.py::test_dot_max_num_imprecise_acc[128-float8e5-128-256-128-128-256-256]
 # https://github.com/intel/intel-xpu-backend-for-triton/issues/1434
 test/unit/language/test_core.py::test_precise_math[1-tl.math.sqrt_rn(x)-tl.math.sqrt(x.to(tl.float64)).to(tl.float32)]
-# https://github.com/intel/intel-xpu-backend-for-triton/issues/2662
-test/unit/language/test_core.py::test_scan_layouts[True-1-src_layout10-64-32]
 # https://github.com/intel/intel-xpu-backend-for-triton/issues/2703
 test/unit/language/test_core.py::test_chained_reductions[in_shape0-perm0-red_dims0]
 test/unit/language/test_core.py::test_dot3d[1-1-32-32-32-32-32-float16-float16]
diff --git a/scripts/skiplist/default/language.txt b/scripts/skiplist/default/language.txt
index fb018c5e0f..a891b802b5 100644
--- a/scripts/skiplist/default/language.txt
+++ b/scripts/skiplist/default/language.txt
@@ -1,6 +1,4 @@
 # https://github.com/intel/intel-xpu-backend-for-triton/issues/1434
 test/unit/language/test_core.py::test_precise_math[1-tl.math.sqrt_rn(x)-tl.math.sqrt(x.to(tl.float64)).to(tl.float32)]
-# https://github.com/intel/intel-xpu-backend-for-triton/issues/2662
-test/unit/language/test_core.py::test_scan_layouts[True-1-src_layout10-64-32]
 # https://github.com/intel/intel-xpu-backend-for-triton/issues/2703
 test/unit/language/test_core.py::test_chained_reductions[in_shape0-perm0-red_dims0]
diff --git a/scripts/skiplist/mtl/language.txt b/scripts/skiplist/mtl/language.txt
index df2e44aae4..69530824f3 100644
--- a/scripts/skiplist/mtl/language.txt
+++ b/scripts/skiplist/mtl/language.txt
@@ -1,7 +1,5 @@
 # https://github.com/intel/intel-xpu-backend-for-triton/issues/1434
 test/unit/language/test_core.py::test_precise_math[1-tl.math.sqrt_rn(x)-tl.math.sqrt(x.to(tl.float64)).to(tl.float32)]
-# https://github.com/intel/intel-xpu-backend-for-triton/issues/2662
-test/unit/language/test_core.py::test_scan_layouts[True-1-src_layout10-64-32]
 # https://github.com/intel/intel-xpu-backend-for-triton/issues/2703
 test/unit/language/test_core.py::test_chained_reductions[in_shape0-perm0-red_dims0]
 test/unit/language/test_core.py::test_dot3d[1-1-32-32-32-32-32-float16-float16]
diff --git a/scripts/skiplist/xe2/language.txt b/scripts/skiplist/xe2/language.txt
index fb018c5e0f..a891b802b5 100644
--- a/scripts/skiplist/xe2/language.txt
+++ b/scripts/skiplist/xe2/language.txt
@@ -1,6 +1,4 @@
 # https://github.com/intel/intel-xpu-backend-for-triton/issues/1434
 test/unit/language/test_core.py::test_precise_math[1-tl.math.sqrt_rn(x)-tl.math.sqrt(x.to(tl.float64)).to(tl.float32)]
-# https://github.com/intel/intel-xpu-backend-for-triton/issues/2662
-test/unit/language/test_core.py::test_scan_layouts[True-1-src_layout10-64-32]
 # https://github.com/intel/intel-xpu-backend-for-triton/issues/2703
 test/unit/language/test_core.py::test_chained_reductions[in_shape0-perm0-red_dims0]
diff --git a/test/Conversion/amd/mfma-shortcut.mlir b/test/Conversion/amd/mfma-shortcut.mlir
index 83c9e535d8..a2c8f48718 100644
--- a/test/Conversion/amd/mfma-shortcut.mlir
+++ b/test/Conversion/amd/mfma-shortcut.mlir
@@ -7,6 +7,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 :
   tt.func public @shortcut_mfma16(%arg0: tensor<16x16xf16, #mfma>) {
     // CHECK-NOT: store
     // CHECK-NOT: load
+    // CHECK: llvm.return
     %0 = triton_gpu.convert_layout %arg0 : tensor<16x16xf16, #mfma> -> tensor<16x16xf16, #dotop>
     tt.return
   }
@@ -21,6 +22,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 :
   tt.func public @no_shortcut_mfma16(%arg0: tensor<16x16xf16, #mfma>) {
     // CHECK: store
     // CHECK: load
+    // CHECK: llvm.return
     %0 = triton_gpu.convert_layout %arg0 : tensor<16x16xf16, #mfma> -> tensor<16x16xf16, #dotop>
     tt.return
   }
diff --git a/test/Conversion/intel/dot_layout_offset.mlir b/test/Conversion/intel/dot_layout_offset.mlir
index 92129848d0..09615f4252 100644
--- a/test/Conversion/intel/dot_layout_offset.mlir
+++ b/test/Conversion/intel/dot_layout_offset.mlir
@@ -12,317 +12,307 @@ module attributes {"triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-war
     // CHECK:           %[[THREAD_ID_I64:.*]] = llvm.call spir_funccc @_Z12get_local_idj
     // CHECK:           %[[THREAD_ID_I32:.*]] = llvm.trunc %[[THREAD_ID_I64]] : i64 to i32
     // CHECK:           %[[VAL_145:.*]] = llvm.mlir.constant(16 : i32) : i32
+    // CHECK:           %[[LANE_ID:.*]] = llvm.urem %[[THREAD_ID_I32]], %[[VAL_145]]  : i32
     // CHECK:           %[[WARP_ID:.*]] = llvm.udiv %[[THREAD_ID_I32]], %[[VAL_145]]  : i32
-    // CHECK:           %[[VAL_147:.*]] = llvm.mlir.constant(16 : i32) : i32
-    // CHECK:           %[[LANE_ID:.*]] = llvm.urem %[[THREAD_ID_I32]], %[[VAL_147]]  : i32
+    // CHECK-COUNT-3:   %[[CST_0:.*]] = llvm.mlir.constant(0 : i32) : i32
     // CHECK:           %[[VAL_149:.*]] = llvm.mlir.constant(1 : i32) : i32
-    // CHECK:           %[[WARP_ID_N:.*]] = llvm.urem %[[WARP_ID]], %[[VAL_149]]  : i32
-    // CHECK:           %[[VAL_151:.*]] = llvm.udiv %[[WARP_ID]], %[[VAL_149]]  : i32
+    // CHECK:           %[[VAL_150:.*]] = llvm.and %[[LANE_ID]], %[[VAL_149]]  : i32
+    // CHECK:           %[[VAL_151:.*]] = llvm.icmp "eq" %[[VAL_150]], %[[CST_0]] : i32
     // CHECK:           %[[VAL_152:.*]] = llvm.mlir.constant(1 : i32) : i32
-    // CHECK:           %[[WARP_ID_M:.*]] = llvm.urem %[[VAL_151]], %[[VAL_152]]  : i32
-    // CHECK:           %[[VAL_154:.*]] = llvm.udiv %[[VAL_151]], %[[VAL_152]]  : i32
+    // CHECK:           %[[VAL_153:.*]] = llvm.select %[[VAL_151]], %[[CST_0]], %[[VAL_152]] : i1, i32
+    // CHECK:           %[[VAL_154:.*]] = llvm.xor %[[CST_0]], %[[VAL_153]]  : i32
     // CHECK:           %[[VAL_155:.*]] = llvm.mlir.constant(2 : i32) : i32
-    // CHECK:           %[[ROUNDED_WARP_ID_M:.*]] = llvm.urem %[[WARP_ID_M]], %[[VAL_155]]  : i32
-    // CHECK:           %[[warpShape_M:.*]] = llvm.mlir.constant(16 : i32) : i32
-    // CHECK:           %[[warpOffset:.*]] = llvm.mul %[[ROUNDED_WARP_ID_M]], %[[warpShape_M]] : i32
-    // CHECK:           %[[VAL_159:.*]] = llvm.mlir.constant(16 : i32) : i32
-    // CHECK:           %[[laneRowIndex:.*]] = llvm.udiv %[[LANE_ID]], %[[VAL_159]]  : i32
-    // CHECK:           %[[VAL_161:.*]] = llvm.mlir.constant(16 : i32) : i32
-    // CHECK:           %[[VAL_162:.*]] = llvm.urem %[[LANE_ID]], %[[VAL_161]]  : i32
-    // CHECK:           %[[VAL_163:.*]] = llvm.mlir.constant(1 : i32) : i32
-    // CHECK:           %[[multiDimBase_N:.*]] = llvm.mul %[[VAL_162]], %[[VAL_163]] : i32
-    // CHECK:           %[[multiDimBase_M:.*]] = llvm.add %[[laneRowIndex]], %[[warpOffset]] : i32
-    // CHECK:           %[[VAL_166:.*]] = llvm.mlir.constant(0 : i32) : i32
-    // CHECK:           %[[VAL_167:.*]] = llvm.mlir.constant(1 : i32) : i32
-    // CHECK:           %[[VAL_168:.*]] = llvm.urem %[[VAL_166]], %[[VAL_167]]  : i32
-    // CHECK:           %[[VAL_169:.*]] = llvm.udiv %[[VAL_166]], %[[VAL_167]]  : i32
-    // CHECK:           %[[VAL_170:.*]] = llvm.mlir.constant(1 : i32) : i32
-    // CHECK:           %[[VAL_171:.*]] = llvm.urem %[[VAL_169]], %[[VAL_170]]  : i32
-    // CHECK:           %[[VAL_172:.*]] = llvm.udiv %[[VAL_169]], %[[VAL_170]]  : i32
-    // CHECK:           %[[VAL_173:.*]] = llvm.mlir.constant(1 : i32) : i32
-    // CHECK:           %[[VAL_174:.*]] = llvm.urem %[[VAL_171]], %[[VAL_173]]  : i32
-    // CHECK:           %[[VAL_175:.*]] = llvm.mlir.constant(1 : i32) : i32
-    // CHECK:           %[[VAL_176:.*]] = llvm.urem %[[VAL_168]], %[[VAL_175]]  : i32
-    // CHECK:           %[[VAL_177:.*]] = llvm.mlir.constant(32 : i32) : i32
-    // CHECK:           %[[CTAOffset_M:.*]] = llvm.mul %[[VAL_174]], %[[VAL_177]] : i32
-    // CHECK:           %[[VAL_179:.*]] = llvm.mlir.constant(32 : i32) : i32
-    // CHECK:           %[[CTAOffset_N:.*]] = llvm.mul %[[VAL_176]], %[[VAL_179]] : i32
-    // CHECK:           %[[VAL_181:.*]] = llvm.add %[[multiDimBase_M]], %[[CTAOffset_M]] : i32
-    // CHECK:           %[[VAL_182:.*]] = llvm.add %[[multiDimBase_N]], %[[CTAOffset_N]] : i32
+    // CHECK:           %[[VAL_156:.*]] = llvm.and %[[LANE_ID]], %[[VAL_155]]  : i32
+    // CHECK:           %[[VAL_157:.*]] = llvm.icmp "eq" %[[VAL_156]], %[[CST_0]] : i32
+    // CHECK:           %[[VAL_158:.*]] = llvm.mlir.constant(2 : i32) : i32
+    // CHECK:           %[[VAL_159:.*]] = llvm.select %[[VAL_157]], %[[CST_0]], %[[VAL_158]] : i1, i32
+    // CHECK:           %[[VAL_160:.*]] = llvm.xor %[[VAL_154]], %[[VAL_159]]  : i32
+    // CHECK:           %[[VAL_161:.*]] = llvm.mlir.constant(4 : i32) : i32
+    // CHECK:           %[[VAL_162:.*]] = llvm.and %[[LANE_ID]], %[[VAL_161]]  : i32
+    // CHECK:           %[[VAL_163:.*]] = llvm.icmp "eq" %[[VAL_162]], %[[CST_0]] : i32
+    // CHECK:           %[[VAL_164:.*]] = llvm.mlir.constant(4 : i32) : i32
+    // CHECK:           %[[VAL_165:.*]] = llvm.select %[[VAL_163]], %[[CST_0]], %[[VAL_164]] : i1, i32
+    // CHECK:           %[[VAL_182:.*]] = llvm.xor %[[VAL_160]], %[[VAL_165]]  : i32
+    // CHECK:           %[[VAL_167:.*]] = llvm.mlir.constant(8 : i32) : i32
+    // CHECK:           %[[VAL_168:.*]] = llvm.and %[[LANE_ID]], %[[VAL_167]]  : i32
+    // CHECK:           %[[VAL_169:.*]] = llvm.icmp "eq" %[[VAL_168]], %[[CST_0]] : i32
+    // CHECK:           %[[VAL_170:.*]] = llvm.mlir.constant(8 : i32) : i32
+    // CHECK:           %[[VAL_171:.*]] = llvm.select %[[VAL_169]], %[[CST_0]], %[[VAL_170]] : i1, i32
+    // CHECK:           %[[VAL_181:.*]] = llvm.xor %[[VAL_182]], %[[VAL_171]]  : i32
 
     // COM: There are total [4, 2] repetitions of tensor shape [32, 32] per warp.
     // COM: The repetitions are clustered as [2, 1] for A operand. The repetitions orders are [0, 0], [1, 0], [0, 1], [1, 1], [2, 0], [3, 0], [2, 1], [3, 1]
     // COM: Offsets of rep [0, 0].
     // CHECK:           %[[VAL_183:.*]] = llvm.mlir.constant(0 : i32) : i32
-    // CHECK:           %[[VAL_184:.*]] = llvm.add %[[VAL_181]], %[[VAL_183]] : i32
+    // CHECK:           %[[VAL_184:.*]] = llvm.xor %[[CST_0]], %[[VAL_183]] : i32
     // CHECK:           %[[VAL_185:.*]] = llvm.mlir.constant(0 : i32) : i32
-    // CHECK:           %[[VAL_186:.*]] = llvm.add %[[VAL_182]], %[[VAL_185]] : i32
+    // CHECK:           %[[VAL_186:.*]] = llvm.xor %[[VAL_181]], %[[VAL_185]] : i32
     // CHECK:           %[[VAL_187:.*]] = llvm.mlir.constant(1 : i32) : i32
-    // CHECK:           %[[VAL_188:.*]] = llvm.add %[[VAL_181]], %[[VAL_187]] : i32
+    // CHECK:           %[[VAL_188:.*]] = llvm.xor %[[CST_0]], %[[VAL_187]] : i32
     // CHECK:           %[[VAL_189:.*]] = llvm.mlir.constant(0 : i32) : i32
-    // CHECK:           %[[VAL_190:.*]] = llvm.add %[[VAL_182]], %[[VAL_189]] : i32
+    // CHECK:           %[[VAL_190:.*]] = llvm.xor %[[VAL_181]], %[[VAL_189]] : i32
     // CHECK:           %[[VAL_191:.*]] = llvm.mlir.constant(2 : i32) : i32
-    // CHECK:           %[[VAL_192:.*]] = llvm.add %[[VAL_181]], %[[VAL_191]] : i32
+    // CHECK:           %[[VAL_192:.*]] = llvm.xor %[[CST_0]], %[[VAL_191]] : i32
     // CHECK:           %[[VAL_193:.*]] = llvm.mlir.constant(0 : i32) : i32
-    // CHECK:           %[[VAL_194:.*]] = llvm.add %[[VAL_182]], %[[VAL_193]] : i32
+    // CHECK:           %[[VAL_194:.*]] = llvm.xor %[[VAL_181]], %[[VAL_193]] : i32
     // CHECK:           %[[VAL_195:.*]] = llvm.mlir.constant(3 : i32) : i32
-    // CHECK:           %[[VAL_196:.*]] = llvm.add %[[VAL_181]], %[[VAL_195]] : i32
+    // CHECK:           %[[VAL_196:.*]] = llvm.xor %[[CST_0]], %[[VAL_195]] : i32
     // CHECK:           %[[VAL_197:.*]] = llvm.mlir.constant(0 : i32) : i32
-    // CHECK:           %[[VAL_198:.*]] = llvm.add %[[VAL_182]], %[[VAL_197]] : i32
+    // CHECK:           %[[VAL_198:.*]] = llvm.xor %[[VAL_181]], %[[VAL_197]] : i32
     // CHECK:           %[[VAL_199:.*]] = llvm.mlir.constant(4 : i32) : i32
-    // CHECK:           %[[VAL_200:.*]] = llvm.add %[[VAL_181]], %[[VAL_199]] : i32
+    // CHECK:           %[[VAL_200:.*]] = llvm.xor %[[CST_0]], %[[VAL_199]] : i32
     // CHECK:           %[[VAL_201:.*]] = llvm.mlir.constant(0 : i32) : i32
-    // CHECK:           %[[VAL_202:.*]] = llvm.add %[[VAL_182]], %[[VAL_201]] : i32
+    // CHECK:           %[[VAL_202:.*]] = llvm.xor %[[VAL_181]], %[[VAL_201]] : i32
     // CHECK:           %[[VAL_203:.*]] = llvm.mlir.constant(5 : i32) : i32
-    // CHECK:           %[[VAL_204:.*]] = llvm.add %[[VAL_181]], %[[VAL_203]] : i32
+    // CHECK:           %[[VAL_204:.*]] = llvm.xor %[[CST_0]], %[[VAL_203]] : i32
     // CHECK:           %[[VAL_205:.*]] = llvm.mlir.constant(0 : i32) : i32
-    // CHECK:           %[[VAL_206:.*]] = llvm.add %[[VAL_182]], %[[VAL_205]] : i32
+    // CHECK:           %[[VAL_206:.*]] = llvm.xor %[[VAL_181]], %[[VAL_205]] : i32
     // CHECK:           %[[VAL_207:.*]] = llvm.mlir.constant(6 : i32) : i32
-    // CHECK:           %[[VAL_208:.*]] = llvm.add %[[VAL_181]], %[[VAL_207]] : i32
+    // CHECK:           %[[VAL_208:.*]] = llvm.xor %[[CST_0]], %[[VAL_207]] : i32
     // CHECK:           %[[VAL_209:.*]] = llvm.mlir.constant(0 : i32) : i32
-    // CHECK:           %[[VAL_210:.*]] = llvm.add %[[VAL_182]], %[[VAL_209]] : i32
+    // CHECK:           %[[VAL_210:.*]] = llvm.xor %[[VAL_181]], %[[VAL_209]] : i32
     // CHECK:           %[[VAL_211:.*]] = llvm.mlir.constant(7 : i32) : i32
-    // CHECK:           %[[VAL_212:.*]] = llvm.add %[[VAL_181]], %[[VAL_211]] : i32
+    // CHECK:           %[[VAL_212:.*]] = llvm.xor %[[CST_0]], %[[VAL_211]] : i32
     // CHECK:           %[[VAL_213:.*]] = llvm.mlir.constant(0 : i32) : i32
-    // CHECK:           %[[VAL_214:.*]] = llvm.add %[[VAL_182]], %[[VAL_213]] : i32
+    // CHECK:           %[[VAL_214:.*]] = llvm.xor %[[VAL_181]], %[[VAL_213]] : i32
 
     // COM: Offsets of rep [1, 0].
     // CHECK:           %[[VAL_215:.*]] = llvm.mlir.constant(8 : i32) : i32
-    // CHECK:           %[[VAL_216:.*]] = llvm.add %[[VAL_181]], %[[VAL_215]] : i32
+    // CHECK:           %[[VAL_216:.*]] = llvm.xor %[[CST_0]], %[[VAL_215]] : i32
     // CHECK:           %[[VAL_217:.*]] = llvm.mlir.constant(0 : i32) : i32
-    // CHECK:           %[[VAL_218:.*]] = llvm.add %[[VAL_182]], %[[VAL_217]] : i32
+    // CHECK:           %[[VAL_218:.*]] = llvm.xor %[[VAL_181]], %[[VAL_217]] : i32
     // CHECK:           %[[VAL_219:.*]] = llvm.mlir.constant(9 : i32) : i32
-    // CHECK:           %[[VAL_220:.*]] = llvm.add %[[VAL_181]], %[[VAL_219]] : i32
+    // CHECK:           %[[VAL_220:.*]] = llvm.xor %[[CST_0]], %[[VAL_219]] : i32
     // CHECK:           %[[VAL_221:.*]] = llvm.mlir.constant(0 : i32) : i32
-    // CHECK:           %[[VAL_222:.*]] = llvm.add %[[VAL_182]], %[[VAL_221]] : i32
+    // CHECK:           %[[VAL_222:.*]] = llvm.xor %[[VAL_181]], %[[VAL_221]] : i32
     // CHECK:           %[[VAL_223:.*]] = llvm.mlir.constant(10 : i32) : i32
-    // CHECK:           %[[VAL_224:.*]] = llvm.add %[[VAL_181]], %[[VAL_223]] : i32
+    // CHECK:           %[[VAL_224:.*]] = llvm.xor %[[CST_0]], %[[VAL_223]] : i32
     // CHECK:           %[[VAL_225:.*]] = llvm.mlir.constant(0 : i32) : i32
-    // CHECK:           %[[VAL_226:.*]] = llvm.add %[[VAL_182]], %[[VAL_225]] : i32
+    // CHECK:           %[[VAL_226:.*]] = llvm.xor %[[VAL_181]], %[[VAL_225]] : i32
     // CHECK:           %[[VAL_227:.*]] = llvm.mlir.constant(11 : i32) : i32
-    // CHECK:           %[[VAL_228:.*]] = llvm.add %[[VAL_181]], %[[VAL_227]] : i32
+    // CHECK:           %[[VAL_228:.*]] = llvm.xor %[[CST_0]], %[[VAL_227]] : i32
     // CHECK:           %[[VAL_229:.*]] = llvm.mlir.constant(0 : i32) : i32
-    // CHECK:           %[[VAL_230:.*]] = llvm.add %[[VAL_182]], %[[VAL_229]] : i32
+    // CHECK:           %[[VAL_230:.*]] = llvm.xor %[[VAL_181]], %[[VAL_229]] : i32
     // CHECK:           %[[VAL_231:.*]] = llvm.mlir.constant(12 : i32) : i32
-    // CHECK:           %[[VAL_232:.*]] = llvm.add %[[VAL_181]], %[[VAL_231]] : i32
+    // CHECK:           %[[VAL_232:.*]] = llvm.xor %[[CST_0]], %[[VAL_231]] : i32
     // CHECK:           %[[VAL_233:.*]] = llvm.mlir.constant(0 : i32) : i32
-    // CHECK:           %[[VAL_234:.*]] = llvm.add %[[VAL_182]], %[[VAL_233]] : i32
+    // CHECK:           %[[VAL_234:.*]] = llvm.xor %[[VAL_181]], %[[VAL_233]] : i32
     // CHECK:           %[[VAL_235:.*]] = llvm.mlir.constant(13 : i32) : i32
-    // CHECK:           %[[VAL_236:.*]] = llvm.add %[[VAL_181]], %[[VAL_235]] : i32
+    // CHECK:           %[[VAL_236:.*]] = llvm.xor %[[CST_0]], %[[VAL_235]] : i32
     // CHECK:           %[[VAL_237:.*]] = llvm.mlir.constant(0 : i32) : i32
-    // CHECK:           %[[VAL_238:.*]] = llvm.add %[[VAL_182]], %[[VAL_237]] : i32
+    // CHECK:           %[[VAL_238:.*]] = llvm.xor %[[VAL_181]], %[[VAL_237]] : i32
     // CHECK:           %[[VAL_239:.*]] = llvm.mlir.constant(14 : i32) : i32
-    // CHECK:           %[[VAL_240:.*]] = llvm.add %[[VAL_181]], %[[VAL_239]] : i32
+    // CHECK:           %[[VAL_240:.*]] = llvm.xor %[[CST_0]], %[[VAL_239]] : i32
     // CHECK:           %[[VAL_241:.*]] = llvm.mlir.constant(0 : i32) : i32
-    // CHECK:           %[[VAL_242:.*]] = llvm.add %[[VAL_182]], %[[VAL_241]] : i32
+    // CHECK:           %[[VAL_242:.*]] = llvm.xor %[[VAL_181]], %[[VAL_241]] : i32
     // CHECK:           %[[VAL_243:.*]] = llvm.mlir.constant(15 : i32) : i32
-    // CHECK:           %[[VAL_244:.*]] = llvm.add %[[VAL_181]], %[[VAL_243]] : i32
+    // CHECK:           %[[VAL_244:.*]] = llvm.xor %[[CST_0]], %[[VAL_243]] : i32
     // CHECK:           %[[VAL_245:.*]] = llvm.mlir.constant(0 : i32) : i32
-    // CHECK:           %[[VAL_246:.*]] = llvm.add %[[VAL_182]], %[[VAL_245]] : i32
+    // CHECK:           %[[VAL_246:.*]] = llvm.xor %[[VAL_181]], %[[VAL_245]] : i32
 
     // COM: Offsets of rep [0, 1].
     // CHECK:           %[[VAL_247:.*]] = llvm.mlir.constant(0 : i32) : i32
-    // CHECK:           %[[VAL_248:.*]] = llvm.add %[[VAL_181]], %[[VAL_247]] : i32
+    // CHECK:           %[[VAL_248:.*]] = llvm.xor %[[CST_0]], %[[VAL_247]] : i32
     // CHECK:           %[[VAL_249:.*]] = llvm.mlir.constant(16 : i32) : i32
-    // CHECK:           %[[VAL_250:.*]] = llvm.add %[[VAL_182]], %[[VAL_249]] : i32
+    // CHECK:           %[[VAL_250:.*]] = llvm.xor %[[VAL_181]], %[[VAL_249]] : i32
     // CHECK:           %[[VAL_251:.*]] = llvm.mlir.constant(1 : i32) : i32
-    // CHECK:           %[[VAL_252:.*]] = llvm.add %[[VAL_181]], %[[VAL_251]] : i32
+    // CHECK:           %[[VAL_252:.*]] = llvm.xor %[[CST_0]], %[[VAL_251]] : i32
     // CHECK:           %[[VAL_253:.*]] = llvm.mlir.constant(16 : i32) : i32
-    // CHECK:           %[[VAL_254:.*]] = llvm.add %[[VAL_182]], %[[VAL_253]] : i32
+    // CHECK:           %[[VAL_254:.*]] = llvm.xor %[[VAL_181]], %[[VAL_253]] : i32
     // CHECK:           %[[VAL_255:.*]] = llvm.mlir.constant(2 : i32) : i32
-    // CHECK:           %[[VAL_256:.*]] = llvm.add %[[VAL_181]], %[[VAL_255]] : i32
+    // CHECK:           %[[VAL_256:.*]] = llvm.xor %[[CST_0]], %[[VAL_255]] : i32
     // CHECK:           %[[VAL_257:.*]] = llvm.mlir.constant(16 : i32) : i32
-    // CHECK:           %[[VAL_258:.*]] = llvm.add %[[VAL_182]], %[[VAL_257]] : i32
+    // CHECK:           %[[VAL_258:.*]] = llvm.xor %[[VAL_181]], %[[VAL_257]] : i32
     // CHECK:           %[[VAL_259:.*]] = llvm.mlir.constant(3 : i32) : i32
-    // CHECK:           %[[VAL_260:.*]] = llvm.add %[[VAL_181]], %[[VAL_259]] : i32
+    // CHECK:           %[[VAL_260:.*]] = llvm.xor %[[CST_0]], %[[VAL_259]] : i32
     // CHECK:           %[[VAL_261:.*]] = llvm.mlir.constant(16 : i32) : i32
-    // CHECK:           %[[VAL_262:.*]] = llvm.add %[[VAL_182]], %[[VAL_261]] : i32
+    // CHECK:           %[[VAL_262:.*]] = llvm.xor %[[VAL_181]], %[[VAL_261]] : i32
     // CHECK:           %[[VAL_263:.*]] = llvm.mlir.constant(4 : i32) : i32
-    // CHECK:           %[[VAL_264:.*]] = llvm.add %[[VAL_181]], %[[VAL_263]] : i32
+    // CHECK:           %[[VAL_264:.*]] = llvm.xor %[[CST_0]], %[[VAL_263]] : i32
     // CHECK:           %[[VAL_265:.*]] = llvm.mlir.constant(16 : i32) : i32
-    // CHECK:           %[[VAL_266:.*]] = llvm.add %[[VAL_182]], %[[VAL_265]] : i32
+    // CHECK:           %[[VAL_266:.*]] = llvm.xor %[[VAL_181]], %[[VAL_265]] : i32
     // CHECK:           %[[VAL_267:.*]] = llvm.mlir.constant(5 : i32) : i32
-    // CHECK:           %[[VAL_268:.*]] = llvm.add %[[VAL_181]], %[[VAL_267]] : i32
+    // CHECK:           %[[VAL_268:.*]] = llvm.xor %[[CST_0]], %[[VAL_267]] : i32
     // CHECK:           %[[VAL_269:.*]] = llvm.mlir.constant(16 : i32) : i32
-    // CHECK:           %[[VAL_270:.*]] = llvm.add %[[VAL_182]], %[[VAL_269]] : i32
+    // CHECK:           %[[VAL_270:.*]] = llvm.xor %[[VAL_181]], %[[VAL_269]] : i32
     // CHECK:           %[[VAL_271:.*]] = llvm.mlir.constant(6 : i32) : i32
-    // CHECK:           %[[VAL_272:.*]] = llvm.add %[[VAL_181]], %[[VAL_271]] : i32
+    // CHECK:           %[[VAL_272:.*]] = llvm.xor %[[CST_0]], %[[VAL_271]] : i32
     // CHECK:           %[[VAL_273:.*]] = llvm.mlir.constant(16 : i32) : i32
-    // CHECK:           %[[VAL_274:.*]] = llvm.add %[[VAL_182]], %[[VAL_273]] : i32
+    // CHECK:           %[[VAL_274:.*]] = llvm.xor %[[VAL_181]], %[[VAL_273]] : i32
     // CHECK:           %[[VAL_275:.*]] = llvm.mlir.constant(7 : i32) : i32
-    // CHECK:           %[[VAL_276:.*]] = llvm.add %[[VAL_181]], %[[VAL_275]] : i32
+    // CHECK:           %[[VAL_276:.*]] = llvm.xor %[[CST_0]], %[[VAL_275]] : i32
     // CHECK:           %[[VAL_277:.*]] = llvm.mlir.constant(16 : i32) : i32
-    // CHECK:           %[[VAL_278:.*]] = llvm.add %[[VAL_182]], %[[VAL_277]] : i32
+    // CHECK:           %[[VAL_278:.*]] = llvm.xor %[[VAL_181]], %[[VAL_277]] : i32
 
     // COM: Offsets of rep [1, 1].
     // CHECK:           %[[VAL_279:.*]] = llvm.mlir.constant(8 : i32) : i32
-    // CHECK:           %[[VAL_280:.*]] = llvm.add %[[VAL_181]], %[[VAL_279]] : i32
+    // CHECK:           %[[VAL_280:.*]] = llvm.xor %[[CST_0]], %[[VAL_279]] : i32
     // CHECK:           %[[VAL_281:.*]] = llvm.mlir.constant(16 : i32) : i32
-    // CHECK:           %[[VAL_282:.*]] = llvm.add %[[VAL_182]], %[[VAL_281]] : i32
+    // CHECK:           %[[VAL_282:.*]] = llvm.xor %[[VAL_181]], %[[VAL_281]] : i32
     // CHECK:           %[[VAL_283:.*]] = llvm.mlir.constant(9 : i32) : i32
-    // CHECK:           %[[VAL_284:.*]] = llvm.add %[[VAL_181]], %[[VAL_283]] : i32
+    // CHECK:           %[[VAL_284:.*]] = llvm.xor %[[CST_0]], %[[VAL_283]] : i32
     // CHECK:           %[[VAL_285:.*]] = llvm.mlir.constant(16 : i32) : i32
-    // CHECK:           %[[VAL_286:.*]] = llvm.add %[[VAL_182]], %[[VAL_285]] : i32
+    // CHECK:           %[[VAL_286:.*]] = llvm.xor %[[VAL_181]], %[[VAL_285]] : i32
     // CHECK:           %[[VAL_287:.*]] = llvm.mlir.constant(10 : i32) : i32
-    // CHECK:           %[[VAL_288:.*]] = llvm.add %[[VAL_181]], %[[VAL_287]] : i32
+    // CHECK:           %[[VAL_288:.*]] = llvm.xor %[[CST_0]], %[[VAL_287]] : i32
     // CHECK:           %[[VAL_289:.*]] = llvm.mlir.constant(16 : i32) : i32
-    // CHECK:           %[[VAL_290:.*]] = llvm.add %[[VAL_182]], %[[VAL_289]] : i32
+    // CHECK:           %[[VAL_290:.*]] = llvm.xor %[[VAL_181]], %[[VAL_289]] : i32
     // CHECK:           %[[VAL_291:.*]] = llvm.mlir.constant(11 : i32) : i32
-    // CHECK:           %[[VAL_292:.*]] = llvm.add %[[VAL_181]], %[[VAL_291]] : i32
+    // CHECK:           %[[VAL_292:.*]] = llvm.xor %[[CST_0]], %[[VAL_291]] : i32
     // CHECK:           %[[VAL_293:.*]] = llvm.mlir.constant(16 : i32) : i32
-    // CHECK:           %[[VAL_294:.*]] = llvm.add %[[VAL_182]], %[[VAL_293]] : i32
+    // CHECK:           %[[VAL_294:.*]] = llvm.xor %[[VAL_181]], %[[VAL_293]] : i32
     // CHECK:           %[[VAL_295:.*]] = llvm.mlir.constant(12 : i32) : i32
-    // CHECK:           %[[VAL_296:.*]] = llvm.add %[[VAL_181]], %[[VAL_295]] : i32
+    // CHECK:           %[[VAL_296:.*]] = llvm.xor %[[CST_0]], %[[VAL_295]] : i32
     // CHECK:           %[[VAL_297:.*]] = llvm.mlir.constant(16 : i32) : i32
-    // CHECK:           %[[VAL_298:.*]] = llvm.add %[[VAL_182]], %[[VAL_297]] : i32
+    // CHECK:           %[[VAL_298:.*]] = llvm.xor %[[VAL_181]], %[[VAL_297]] : i32
     // CHECK:           %[[VAL_299:.*]] = llvm.mlir.constant(13 : i32) : i32
-    // CHECK:           %[[VAL_300:.*]] = llvm.add %[[VAL_181]], %[[VAL_299]] : i32
+    // CHECK:           %[[VAL_300:.*]] = llvm.xor %[[CST_0]], %[[VAL_299]] : i32
     // CHECK:           %[[VAL_301:.*]] = llvm.mlir.constant(16 : i32) : i32
-    // CHECK:           %[[VAL_302:.*]] = llvm.add %[[VAL_182]], %[[VAL_301]] : i32
+    // CHECK:           %[[VAL_302:.*]] = llvm.xor %[[VAL_181]], %[[VAL_301]] : i32
     // CHECK:           %[[VAL_303:.*]] = llvm.mlir.constant(14 : i32) : i32
-    // CHECK:           %[[VAL_304:.*]] = llvm.add %[[VAL_181]], %[[VAL_303]] : i32
+    // CHECK:           %[[VAL_304:.*]] = llvm.xor %[[CST_0]], %[[VAL_303]] : i32
     // CHECK:           %[[VAL_305:.*]] = llvm.mlir.constant(16 : i32) : i32
-    // CHECK:           %[[VAL_306:.*]] = llvm.add %[[VAL_182]], %[[VAL_305]] : i32
+    // CHECK:           %[[VAL_306:.*]] = llvm.xor %[[VAL_181]], %[[VAL_305]] : i32
     // CHECK:           %[[VAL_307:.*]] = llvm.mlir.constant(15 : i32) : i32
-    // CHECK:           %[[VAL_308:.*]] = llvm.add %[[VAL_181]], %[[VAL_307]] : i32
+    // CHECK:           %[[VAL_308:.*]] = llvm.xor %[[CST_0]], %[[VAL_307]] : i32
     // CHECK:           %[[VAL_309:.*]] = llvm.mlir.constant(16 : i32) : i32
-    // CHECK:           %[[VAL_310:.*]] = llvm.add %[[VAL_182]], %[[VAL_309]] : i32
+    // CHECK:           %[[VAL_310:.*]] = llvm.xor %[[VAL_181]], %[[VAL_309]] : i32
 
     // COM: Offsets of rep [2, 0].
     // CHECK:           %[[VAL_311:.*]] = llvm.mlir.constant(16 : i32) : i32
-    // CHECK:           %[[VAL_312:.*]] = llvm.add %[[VAL_181]], %[[VAL_311]] : i32
+    // CHECK:           %[[VAL_312:.*]] = llvm.xor %[[CST_0]], %[[VAL_311]] : i32
     // CHECK:           %[[VAL_313:.*]] = llvm.mlir.constant(0 : i32) : i32
-    // CHECK:           %[[VAL_314:.*]] = llvm.add %[[VAL_182]], %[[VAL_313]] : i32
+    // CHECK:           %[[VAL_314:.*]] = llvm.xor %[[VAL_181]], %[[VAL_313]] : i32
     // CHECK:           %[[VAL_315:.*]] = llvm.mlir.constant(17 : i32) : i32
-    // CHECK:           %[[VAL_316:.*]] = llvm.add %[[VAL_181]], %[[VAL_315]] : i32
+    // CHECK:           %[[VAL_316:.*]] = llvm.xor %[[CST_0]], %[[VAL_315]] : i32
     // CHECK:           %[[VAL_317:.*]] = llvm.mlir.constant(0 : i32) : i32
-    // CHECK:           %[[VAL_318:.*]] = llvm.add %[[VAL_182]], %[[VAL_317]] : i32
+    // CHECK:           %[[VAL_318:.*]] = llvm.xor %[[VAL_181]], %[[VAL_317]] : i32
     // CHECK:           %[[VAL_319:.*]] = llvm.mlir.constant(18 : i32) : i32
-    // CHECK:           %[[VAL_320:.*]] = llvm.add %[[VAL_181]], %[[VAL_319]] : i32
+    // CHECK:           %[[VAL_320:.*]] = llvm.xor %[[CST_0]], %[[VAL_319]] : i32
     // CHECK:           %[[VAL_321:.*]] = llvm.mlir.constant(0 : i32) : i32
-    // CHECK:           %[[VAL_322:.*]] = llvm.add %[[VAL_182]], %[[VAL_321]] : i32
+    // CHECK:           %[[VAL_322:.*]] = llvm.xor %[[VAL_181]], %[[VAL_321]] : i32
     // CHECK:           %[[VAL_323:.*]] = llvm.mlir.constant(19 : i32) : i32
-    // CHECK:           %[[VAL_324:.*]] = llvm.add %[[VAL_181]], %[[VAL_323]] : i32
+    // CHECK:           %[[VAL_324:.*]] = llvm.xor %[[CST_0]], %[[VAL_323]] : i32
     // CHECK:           %[[VAL_325:.*]] = llvm.mlir.constant(0 : i32) : i32
-    // CHECK:           %[[VAL_326:.*]] = llvm.add %[[VAL_182]], %[[VAL_325]] : i32
+    // CHECK:           %[[VAL_326:.*]] = llvm.xor %[[VAL_181]], %[[VAL_325]] : i32
     // CHECK:           %[[VAL_327:.*]] = llvm.mlir.constant(20 : i32) : i32
-    // CHECK:           %[[VAL_328:.*]] = llvm.add %[[VAL_181]], %[[VAL_327]] : i32
+    // CHECK:           %[[VAL_328:.*]] = llvm.xor %[[CST_0]], %[[VAL_327]] : i32
     // CHECK:           %[[VAL_329:.*]] = llvm.mlir.constant(0 : i32) : i32
-    // CHECK:           %[[VAL_330:.*]] = llvm.add %[[VAL_182]], %[[VAL_329]] : i32
+    // CHECK:           %[[VAL_330:.*]] = llvm.xor %[[VAL_181]], %[[VAL_329]] : i32
     // CHECK:           %[[VAL_331:.*]] = llvm.mlir.constant(21 : i32) : i32
-    // CHECK:           %[[VAL_332:.*]] = llvm.add %[[VAL_181]], %[[VAL_331]] : i32
+    // CHECK:           %[[VAL_332:.*]] = llvm.xor %[[CST_0]], %[[VAL_331]] : i32
     // CHECK:           %[[VAL_333:.*]] = llvm.mlir.constant(0 : i32) : i32
-    // CHECK:           %[[VAL_334:.*]] = llvm.add %[[VAL_182]], %[[VAL_333]] : i32
+    // CHECK:           %[[VAL_334:.*]] = llvm.xor %[[VAL_181]], %[[VAL_333]] : i32
     // CHECK:           %[[VAL_335:.*]] = llvm.mlir.constant(22 : i32) : i32
-    // CHECK:           %[[VAL_336:.*]] = llvm.add %[[VAL_181]], %[[VAL_335]] : i32
+    // CHECK:           %[[VAL_336:.*]] = llvm.xor %[[CST_0]], %[[VAL_335]] : i32
     // CHECK:           %[[VAL_337:.*]] = llvm.mlir.constant(0 : i32) : i32
-    // CHECK:           %[[VAL_338:.*]] = llvm.add %[[VAL_182]], %[[VAL_337]] : i32
+    // CHECK:           %[[VAL_338:.*]] = llvm.xor %[[VAL_181]], %[[VAL_337]] : i32
     // CHECK:           %[[VAL_339:.*]] = llvm.mlir.constant(23 : i32) : i32
-    // CHECK:           %[[VAL_340:.*]] = llvm.add %[[VAL_181]], %[[VAL_339]] : i32
+    // CHECK:           %[[VAL_340:.*]] = llvm.xor %[[CST_0]], %[[VAL_339]] : i32
     // CHECK:           %[[VAL_341:.*]] = llvm.mlir.constant(0 : i32) : i32
-    // CHECK:           %[[VAL_342:.*]] = llvm.add %[[VAL_182]], %[[VAL_341]] : i32
+    // CHECK:           %[[VAL_342:.*]] = llvm.xor %[[VAL_181]], %[[VAL_341]] : i32
 
     // COM: Offsets of rep [3, 0].
     // CHECK:           %[[VAL_343:.*]] = llvm.mlir.constant(24 : i32) : i32
-    // CHECK:           %[[VAL_344:.*]] = llvm.add %[[VAL_181]], %[[VAL_343]] : i32
+    // CHECK:           %[[VAL_344:.*]] = llvm.xor %[[CST_0]], %[[VAL_343]] : i32
     // CHECK:           %[[VAL_345:.*]] = llvm.mlir.constant(0 : i32) : i32
-    // CHECK:           %[[VAL_346:.*]] = llvm.add %[[VAL_182]], %[[VAL_345]] : i32
+    // CHECK:           %[[VAL_346:.*]] = llvm.xor %[[VAL_181]], %[[VAL_345]] : i32
     // CHECK:           %[[VAL_347:.*]] = llvm.mlir.constant(25 : i32) : i32
-    // CHECK:           %[[VAL_348:.*]] = llvm.add %[[VAL_181]], %[[VAL_347]] : i32
+    // CHECK:           %[[VAL_348:.*]] = llvm.xor %[[CST_0]], %[[VAL_347]] : i32
     // CHECK:           %[[VAL_349:.*]] = llvm.mlir.constant(0 : i32) : i32
-    // CHECK:           %[[VAL_350:.*]] = llvm.add %[[VAL_182]], %[[VAL_349]] : i32
+    // CHECK:           %[[VAL_350:.*]] = llvm.xor %[[VAL_181]], %[[VAL_349]] : i32
     // CHECK:           %[[VAL_351:.*]] = llvm.mlir.constant(26 : i32) : i32
-    // CHECK:           %[[VAL_352:.*]] = llvm.add %[[VAL_181]], %[[VAL_351]] : i32
+    // CHECK:           %[[VAL_352:.*]] = llvm.xor %[[CST_0]], %[[VAL_351]] : i32
     // CHECK:           %[[VAL_353:.*]] = llvm.mlir.constant(0 : i32) : i32
-    // CHECK:           %[[VAL_354:.*]] = llvm.add %[[VAL_182]], %[[VAL_353]] : i32
+    // CHECK:           %[[VAL_354:.*]] = llvm.xor %[[VAL_181]], %[[VAL_353]] : i32
     // CHECK:           %[[VAL_355:.*]] = llvm.mlir.constant(27 : i32) : i32
-    // CHECK:           %[[VAL_356:.*]] = llvm.add %[[VAL_181]], %[[VAL_355]] : i32
+    // CHECK:           %[[VAL_356:.*]] = llvm.xor %[[CST_0]], %[[VAL_355]] : i32
     // CHECK:           %[[VAL_357:.*]] = llvm.mlir.constant(0 : i32) : i32
-    // CHECK:           %[[VAL_358:.*]] = llvm.add %[[VAL_182]], %[[VAL_357]] : i32
+    // CHECK:           %[[VAL_358:.*]] = llvm.xor %[[VAL_181]], %[[VAL_357]] : i32
     // CHECK:           %[[VAL_359:.*]] = llvm.mlir.constant(28 : i32) : i32
-    // CHECK:           %[[VAL_360:.*]] = llvm.add %[[VAL_181]], %[[VAL_359]] : i32
+    // CHECK:           %[[VAL_360:.*]] = llvm.xor %[[CST_0]], %[[VAL_359]] : i32
     // CHECK:           %[[VAL_361:.*]] = llvm.mlir.constant(0 : i32) : i32
-    // CHECK:           %[[VAL_362:.*]] = llvm.add %[[VAL_182]], %[[VAL_361]] : i32
+    // CHECK:           %[[VAL_362:.*]] = llvm.xor %[[VAL_181]], %[[VAL_361]] : i32
     // CHECK:           %[[VAL_363:.*]] = llvm.mlir.constant(29 : i32) : i32
-    // CHECK:           %[[VAL_364:.*]] = llvm.add %[[VAL_181]], %[[VAL_363]] : i32
+    // CHECK:           %[[VAL_364:.*]] = llvm.xor %[[CST_0]], %[[VAL_363]] : i32
     // CHECK:           %[[VAL_365:.*]] = llvm.mlir.constant(0 : i32) : i32
-    // CHECK:           %[[VAL_366:.*]] = llvm.add %[[VAL_182]], %[[VAL_365]] : i32
+    // CHECK:           %[[VAL_366:.*]] = llvm.xor %[[VAL_181]], %[[VAL_365]] : i32
     // CHECK:           %[[VAL_367:.*]] = llvm.mlir.constant(30 : i32) : i32
-    // CHECK:           %[[VAL_368:.*]] = llvm.add %[[VAL_181]], %[[VAL_367]] : i32
+    // CHECK:           %[[VAL_368:.*]] = llvm.xor %[[CST_0]], %[[VAL_367]] : i32
     // CHECK:           %[[VAL_369:.*]] = llvm.mlir.constant(0 : i32) : i32
-    // CHECK:           %[[VAL_370:.*]] = llvm.add %[[VAL_182]], %[[VAL_369]] : i32
+    // CHECK:           %[[VAL_370:.*]] = llvm.xor %[[VAL_181]], %[[VAL_369]] : i32
     // CHECK:           %[[VAL_371:.*]] = llvm.mlir.constant(31 : i32) : i32
-    // CHECK:           %[[VAL_372:.*]] = llvm.add %[[VAL_181]], %[[VAL_371]] : i32
+    // CHECK:           %[[VAL_372:.*]] = llvm.xor %[[CST_0]], %[[VAL_371]] : i32
     // CHECK:           %[[VAL_373:.*]] = llvm.mlir.constant(0 : i32) : i32
-    // CHECK:           %[[VAL_374:.*]] = llvm.add %[[VAL_182]], %[[VAL_373]] : i32
+    // CHECK:           %[[VAL_374:.*]] = llvm.xor %[[VAL_181]], %[[VAL_373]] : i32
 
     // COM: Offsets of rep [2, 1].
     // CHECK:           %[[VAL_375:.*]] = llvm.mlir.constant(16 : i32) : i32
-    // CHECK:           %[[VAL_376:.*]] = llvm.add %[[VAL_181]], %[[VAL_375]] : i32
+    // CHECK:           %[[VAL_376:.*]] = llvm.xor %[[CST_0]], %[[VAL_375]] : i32
     // CHECK:           %[[VAL_377:.*]] = llvm.mlir.constant(16 : i32) : i32
-    // CHECK:           %[[VAL_378:.*]] = llvm.add %[[VAL_182]], %[[VAL_377]] : i32
+    // CHECK:           %[[VAL_378:.*]] = llvm.xor %[[VAL_181]], %[[VAL_377]] : i32
     // CHECK:           %[[VAL_379:.*]] = llvm.mlir.constant(17 : i32) : i32
-    // CHECK:           %[[VAL_380:.*]] = llvm.add %[[VAL_181]], %[[VAL_379]] : i32
+    // CHECK:           %[[VAL_380:.*]] = llvm.xor %[[CST_0]], %[[VAL_379]] : i32
     // CHECK:           %[[VAL_381:.*]] = llvm.mlir.constant(16 : i32) : i32
-    // CHECK:           %[[VAL_382:.*]] = llvm.add %[[VAL_182]], %[[VAL_381]] : i32
+    // CHECK:           %[[VAL_382:.*]] = llvm.xor %[[VAL_181]], %[[VAL_381]] : i32
     // CHECK:           %[[VAL_383:.*]] = llvm.mlir.constant(18 : i32) : i32
-    // CHECK:           %[[VAL_384:.*]] = llvm.add %[[VAL_181]], %[[VAL_383]] : i32
+    // CHECK:           %[[VAL_384:.*]] = llvm.xor %[[CST_0]], %[[VAL_383]] : i32
     // CHECK:           %[[VAL_385:.*]] = llvm.mlir.constant(16 : i32) : i32
-    // CHECK:           %[[VAL_386:.*]] = llvm.add %[[VAL_182]], %[[VAL_385]] : i32
+    // CHECK:           %[[VAL_386:.*]] = llvm.xor %[[VAL_181]], %[[VAL_385]] : i32
     // CHECK:           %[[VAL_387:.*]] = llvm.mlir.constant(19 : i32) : i32
-    // CHECK:           %[[VAL_388:.*]] = llvm.add %[[VAL_181]], %[[VAL_387]] : i32
+    // CHECK:           %[[VAL_388:.*]] = llvm.xor %[[CST_0]], %[[VAL_387]] : i32
     // CHECK:           %[[VAL_389:.*]] = llvm.mlir.constant(16 : i32) : i32
-    // CHECK:           %[[VAL_390:.*]] = llvm.add %[[VAL_182]], %[[VAL_389]] : i32
+    // CHECK:           %[[VAL_390:.*]] = llvm.xor %[[VAL_181]], %[[VAL_389]] : i32
     // CHECK:           %[[VAL_391:.*]] = llvm.mlir.constant(20 : i32) : i32
-    // CHECK:           %[[VAL_392:.*]] = llvm.add %[[VAL_181]], %[[VAL_391]] : i32
+    // CHECK:           %[[VAL_392:.*]] = llvm.xor %[[CST_0]], %[[VAL_391]] : i32
     // CHECK:           %[[VAL_393:.*]] = llvm.mlir.constant(16 : i32) : i32
-    // CHECK:           %[[VAL_394:.*]] = llvm.add %[[VAL_182]], %[[VAL_393]] : i32
+    // CHECK:           %[[VAL_394:.*]] = llvm.xor %[[VAL_181]], %[[VAL_393]] : i32
     // CHECK:           %[[VAL_395:.*]] = llvm.mlir.constant(21 : i32) : i32
-    // CHECK:           %[[VAL_396:.*]] = llvm.add %[[VAL_181]], %[[VAL_395]] : i32
+    // CHECK:           %[[VAL_396:.*]] = llvm.xor %[[CST_0]], %[[VAL_395]] : i32
     // CHECK:           %[[VAL_397:.*]] = llvm.mlir.constant(16 : i32) : i32
-    // CHECK:           %[[VAL_398:.*]] = llvm.add %[[VAL_182]], %[[VAL_397]] : i32
+    // CHECK:           %[[VAL_398:.*]] = llvm.xor %[[VAL_181]], %[[VAL_397]] : i32
     // CHECK:           %[[VAL_399:.*]] = llvm.mlir.constant(22 : i32) : i32
-    // CHECK:           %[[VAL_400:.*]] = llvm.add %[[VAL_181]], %[[VAL_399]] : i32
+    // CHECK:           %[[VAL_400:.*]] = llvm.xor %[[CST_0]], %[[VAL_399]] : i32
     // CHECK:           %[[VAL_401:.*]] = llvm.mlir.constant(16 : i32) : i32
-    // CHECK:           %[[VAL_402:.*]] = llvm.add %[[VAL_182]], %[[VAL_401]] : i32
+    // CHECK:           %[[VAL_402:.*]] = llvm.xor %[[VAL_181]], %[[VAL_401]] : i32
     // CHECK:           %[[VAL_403:.*]] = llvm.mlir.constant(23 : i32) : i32
-    // CHECK:           %[[VAL_404:.*]] = llvm.add %[[VAL_181]], %[[VAL_403]] : i32
+    // CHECK:           %[[VAL_404:.*]] = llvm.xor %[[CST_0]], %[[VAL_403]] : i32
     // CHECK:           %[[VAL_405:.*]] = llvm.mlir.constant(16 : i32) : i32
-    // CHECK:           %[[VAL_406:.*]] = llvm.add %[[VAL_182]], %[[VAL_405]] : i32
+    // CHECK:           %[[VAL_406:.*]] = llvm.xor %[[VAL_181]], %[[VAL_405]] : i32
 
     // COM: Offsets of rep [2, 2].
     // CHECK:           %[[VAL_407:.*]] = llvm.mlir.constant(24 : i32) : i32
-    // CHECK:           %[[VAL_408:.*]] = llvm.add %[[VAL_181]], %[[VAL_407]] : i32
+    // CHECK:           %[[VAL_408:.*]] = llvm.xor %[[CST_0]], %[[VAL_407]] : i32
     // CHECK:           %[[VAL_409:.*]] = llvm.mlir.constant(16 : i32) : i32
-    // CHECK:           %[[VAL_410:.*]] = llvm.add %[[VAL_182]], %[[VAL_409]] : i32
+    // CHECK:           %[[VAL_410:.*]] = llvm.xor %[[VAL_181]], %[[VAL_409]] : i32
     // CHECK:           %[[VAL_411:.*]] = llvm.mlir.constant(25 : i32) : i32
-    // CHECK:           %[[VAL_412:.*]] = llvm.add %[[VAL_181]], %[[VAL_411]] : i32
+    // CHECK:           %[[VAL_412:.*]] = llvm.xor %[[CST_0]], %[[VAL_411]] : i32
     // CHECK:           %[[VAL_413:.*]] = llvm.mlir.constant(16 : i32) : i32
-    // CHECK:           %[[VAL_414:.*]] = llvm.add %[[VAL_182]], %[[VAL_413]] : i32
+    // CHECK:           %[[VAL_414:.*]] = llvm.xor %[[VAL_181]], %[[VAL_413]] : i32
     // CHECK:           %[[VAL_415:.*]] = llvm.mlir.constant(26 : i32) : i32
-    // CHECK:           %[[VAL_416:.*]] = llvm.add %[[VAL_181]], %[[VAL_415]] : i32
+    // CHECK:           %[[VAL_416:.*]] = llvm.xor %[[CST_0]], %[[VAL_415]] : i32
     // CHECK:           %[[VAL_417:.*]] = llvm.mlir.constant(16 : i32) : i32
-    // CHECK:           %[[VAL_418:.*]] = llvm.add %[[VAL_182]], %[[VAL_417]] : i32
+    // CHECK:           %[[VAL_418:.*]] = llvm.xor %[[VAL_181]], %[[VAL_417]] : i32
     // CHECK:           %[[VAL_419:.*]] = llvm.mlir.constant(27 : i32) : i32
-    // CHECK:           %[[VAL_420:.*]] = llvm.add %[[VAL_181]], %[[VAL_419]] : i32
+    // CHECK:           %[[VAL_420:.*]] = llvm.xor %[[CST_0]], %[[VAL_419]] : i32
     // CHECK:           %[[VAL_421:.*]] = llvm.mlir.constant(16 : i32) : i32
-    // CHECK:           %[[VAL_422:.*]] = llvm.add %[[VAL_182]], %[[VAL_421]] : i32
+    // CHECK:           %[[VAL_422:.*]] = llvm.xor %[[VAL_181]], %[[VAL_421]] : i32
     // CHECK:           %[[VAL_423:.*]] = llvm.mlir.constant(28 : i32) : i32
-    // CHECK:           %[[VAL_424:.*]] = llvm.add %[[VAL_181]], %[[VAL_423]] : i32
+    // CHECK:           %[[VAL_424:.*]] = llvm.xor %[[CST_0]], %[[VAL_423]] : i32
     // CHECK:           %[[VAL_425:.*]] = llvm.mlir.constant(16 : i32) : i32
-    // CHECK:           %[[VAL_426:.*]] = llvm.add %[[VAL_182]], %[[VAL_425]] : i32
+    // CHECK:           %[[VAL_426:.*]] = llvm.xor %[[VAL_181]], %[[VAL_425]] : i32
     // CHECK:           %[[VAL_427:.*]] = llvm.mlir.constant(29 : i32) : i32
-    // CHECK:           %[[VAL_428:.*]] = llvm.add %[[VAL_181]], %[[VAL_427]] : i32
+    // CHECK:           %[[VAL_428:.*]] = llvm.xor %[[CST_0]], %[[VAL_427]] : i32
     // CHECK:           %[[VAL_429:.*]] = llvm.mlir.constant(16 : i32) : i32
-    // CHECK:           %[[VAL_430:.*]] = llvm.add %[[VAL_182]], %[[VAL_429]] : i32
+    // CHECK:           %[[VAL_430:.*]] = llvm.xor %[[VAL_181]], %[[VAL_429]] : i32
     // CHECK:           %[[VAL_431:.*]] = llvm.mlir.constant(30 : i32) : i32
-    // CHECK:           %[[VAL_432:.*]] = llvm.add %[[VAL_181]], %[[VAL_431]] : i32
+    // CHECK:           %[[VAL_432:.*]] = llvm.xor %[[CST_0]], %[[VAL_431]] : i32
     // CHECK:           %[[VAL_433:.*]] = llvm.mlir.constant(16 : i32) : i32
-    // CHECK:           %[[VAL_434:.*]] = llvm.add %[[VAL_182]], %[[VAL_433]] : i32
+    // CHECK:           %[[VAL_434:.*]] = llvm.xor %[[VAL_181]], %[[VAL_433]] : i32
     // CHECK:           %[[VAL_435:.*]] = llvm.mlir.constant(31 : i32) : i32
-    // CHECK:           %[[VAL_436:.*]] = llvm.add %[[VAL_181]], %[[VAL_435]] : i32
+    // CHECK:           %[[VAL_436:.*]] = llvm.xor %[[CST_0]], %[[VAL_435]] : i32
     // CHECK:           %[[VAL_437:.*]] = llvm.mlir.constant(16 : i32) : i32
-    // CHECK:           %[[VAL_438:.*]] = llvm.add %[[VAL_182]], %[[VAL_437]] : i32
+    // CHECK:           %[[VAL_438:.*]] = llvm.xor %[[VAL_181]], %[[VAL_437]] : i32
     tt.print " x: " {hex = false, isSigned = array<i32: 0>} : %cst : tensor<32x32xf16, #dot_operand_a>
     tt.return
   }
diff --git a/test/Proton/ops.mlir b/test/Proton/ops.mlir
new file mode 100644
index 0000000000..22a17e3f0f
--- /dev/null
+++ b/test/Proton/ops.mlir
@@ -0,0 +1,15 @@
+// RUN: triton-opt --split-input-file %s -cse -canonicalize | FileCheck %s
+
+module {
+  // CHECK-LABEL: proton_record
+  tt.func @proton_record() {
+    // CHECK: proton.record() {isStart = true, regionId = 1 : i32}
+    // CHECK-NEXT: proton.record() {isStart = false, regionId = 1 : i32}
+    // CHECK-NEXT: tt.return
+    proton.record() {isStart = true, regionId = 1 : i32}
+    proton.record() {isStart = false, regionId = 1 : i32}
+    tt.return
+  }
+} // end module
+
+// -----
diff --git a/test/TritonIntelGPU/tritonintelgpu-convert-layout-shortcut.mlir b/test/TritonIntelGPU/tritonintelgpu-convert-layout-shortcut.mlir
index 7bfff4fc36..48c9850418 100644
--- a/test/TritonIntelGPU/tritonintelgpu-convert-layout-shortcut.mlir
+++ b/test/TritonIntelGPU/tritonintelgpu-convert-layout-shortcut.mlir
@@ -6,46 +6,8 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 32
   // CHECK-SAME:  %[[VAL_0:.*]]: !llvm.struct<({{.*}})>) attributes {triton_gen.intel_reqd_sub_group_size = [16 : i32], triton_gen.max_work_group_size = [512 : i32, 1 : i32, 1 : i32]} {
   tt.func public @convert_dpas_to_dot_rep_cluster_1_2(%arg: tensor<1024x32xf16, #dpas>) {
     // COM: The repetitions order of dot layout and dpas layout are same when the GEMM tiling is clustered as repCluster [1, 2].
-    // CHECK:           %[[VAL_81:.*]] = llvm.mlir.constant(7 : i32) : i32
-    // CHECK:           %[[REP_0:.*]] = llvm.insertelement {{.*}}, {{.*}}{{\[}}%[[VAL_81]] : i32] : vector<8xf16>
-    // CHECK:           %[[VAL_98:.*]] = llvm.mlir.constant(7 : i32) : i32
-    // CHECK:           %[[REP_1:.*]] = llvm.insertelement {{.*}}, {{.*}}{{\[}}%[[VAL_98]] : i32] : vector<8xf16>
-    // CHECK:           %[[VAL_115:.*]] = llvm.mlir.constant(7 : i32) : i32
-    // CHECK:           %[[REP_2:.*]] = llvm.insertelement {{.*}}, {{.*}}{{\[}}%[[VAL_115]] : i32] : vector<8xf16>
-    // CHECK:           %[[VAL_132:.*]] = llvm.mlir.constant(7 : i32) : i32
-    // CHECK:           %[[REP_3:.*]] = llvm.insertelement {{.*}}, {{.*}}{{\[}}%[[VAL_132]] : i32] : vector<8xf16>
-    // CHECK:           %[[VAL_149:.*]] = llvm.mlir.constant(7 : i32) : i32
-    // CHECK:           %[[REP_4:.*]] = llvm.insertelement {{.*}}, {{.*}}{{\[}}%[[VAL_149]] : i32] : vector<8xf16>
-    // CHECK:           %[[VAL_166:.*]] = llvm.mlir.constant(7 : i32) : i32
-    // CHECK:           %[[REP_5:.*]] = llvm.insertelement {{.*}}, {{.*}}{{\[}}%[[VAL_166]] : i32] : vector<8xf16>
-    // CHECK:           %[[VAL_183:.*]] = llvm.mlir.constant(7 : i32) : i32
-    // CHECK:           %[[REP_6:.*]] = llvm.insertelement {{.*}}, {{.*}}{{\[}}%[[VAL_183]] : i32] : vector<8xf16>
-    // CHECK:           %[[VAL_200:.*]] = llvm.mlir.constant(7 : i32) : i32
-    // CHECK:           %[[REP_7:.*]] = llvm.insertelement {{.*}}, {{.*}}{{\[}}%[[VAL_200]] : i32] : vector<8xf16>
-    // CHECK:           %[[VAL_216:.*]] = llvm.mlir.constant(7 : i32) : i32
-    // CHECK:           %[[VAL_217:.*]] = llvm.extractelement %[[REP_0]]{{\[}}%[[VAL_216]] : i32] : vector<8xf16>
-    // CHECK:           %[[VAL_232:.*]] = llvm.mlir.constant(7 : i32) : i32
-    // CHECK:           %[[VAL_233:.*]] = llvm.extractelement %[[REP_1]]{{\[}}%[[VAL_232]] : i32] : vector<8xf16>
-    // CHECK:           %[[VAL_248:.*]] = llvm.mlir.constant(7 : i32) : i32
-    // CHECK:           %[[VAL_249:.*]] = llvm.extractelement %[[REP_2]]{{\[}}%[[VAL_248]] : i32] : vector<8xf16>
-    // CHECK:           %[[VAL_264:.*]] = llvm.mlir.constant(7 : i32) : i32
-    // CHECK:           %[[VAL_265:.*]] = llvm.extractelement %[[REP_3]]{{\[}}%[[VAL_264]] : i32] : vector<8xf16>
-    // CHECK:           %[[VAL_280:.*]] = llvm.mlir.constant(7 : i32) : i32
-    // CHECK:           %[[VAL_281:.*]] = llvm.extractelement %[[REP_4]]{{\[}}%[[VAL_280]] : i32] : vector<8xf16>
-    // CHECK:           %[[VAL_296:.*]] = llvm.mlir.constant(7 : i32) : i32
-    // CHECK:           %[[VAL_297:.*]] = llvm.extractelement %[[REP_5]]{{\[}}%[[VAL_296]] : i32] : vector<8xf16>
-    // CHECK:           %[[VAL_312:.*]] = llvm.mlir.constant(7 : i32) : i32
-    // CHECK:           %[[VAL_313:.*]] = llvm.extractelement %[[REP_6]]{{\[}}%[[VAL_312]] : i32] : vector<8xf16>
-    // CHECK:           %[[VAL_328:.*]] = llvm.mlir.constant(7 : i32) : i32
-    // CHECK:           %[[VAL_329:.*]] = llvm.extractelement %[[REP_7]]{{\[}}%[[VAL_328]] : i32] : vector<8xf16>
-    // CHECK:           %[[VAL_338:.*]] = llvm.insertvalue %[[VAL_217]], {{.*}}[7]
-    // CHECK:           %[[VAL_346:.*]] = llvm.insertvalue %[[VAL_233]], {{.*}}[15]
-    // CHECK:           %[[VAL_354:.*]] = llvm.insertvalue %[[VAL_249]], {{.*}}[23]
-    // CHECK:           %[[VAL_362:.*]] = llvm.insertvalue %[[VAL_265]], {{.*}}[31]
-    // CHECK:           %[[VAL_370:.*]] = llvm.insertvalue %[[VAL_281]], {{.*}}[39]
-    // CHECK:           %[[VAL_378:.*]] = llvm.insertvalue %[[VAL_297]], {{.*}}[47]
-    // CHECK:           %[[VAL_386:.*]] = llvm.insertvalue %[[VAL_313]], {{.*}}[55]
-    // CHECK:           %[[VAL_394:.*]] = llvm.insertvalue %[[VAL_329]], {{.*}}[63]
+    // CHECK-NOT: llvm.insertvalue
+    // CHECK-NOT: llvm.extractvalue
     %108 = triton_gpu.convert_layout %arg : tensor<1024x32xf16, #dpas> -> tensor<1024x32xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #dpas, kWidth = 2}>>
     tt.return
   }
@@ -62,46 +24,135 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 32
     // COM:   - 0, 1, 2, 3, 4, 5, 6, 7.
     // COM: The repetitions order of dot layout when the GEMM tiling is clustered as repCluster [2, 2]:
     // COM:   - 0, 2, 1, 3, 4, 6, 5, 7.
-    // CHECK:           %[[VAL_81:.*]] = llvm.mlir.constant(7 : i32) : i32
-    // CHECK:           %[[REP_0:.*]] = llvm.insertelement {{.*}}, {{.*}}{{\[}}%[[VAL_81]] : i32] : vector<8xf16>
-    // CHECK:           %[[VAL_98:.*]] = llvm.mlir.constant(7 : i32) : i32
-    // CHECK:           %[[REP_1:.*]] = llvm.insertelement {{.*}}, {{.*}}{{\[}}%[[VAL_98]] : i32] : vector<8xf16>
-    // CHECK:           %[[VAL_115:.*]] = llvm.mlir.constant(7 : i32) : i32
-    // CHECK:           %[[REP_2:.*]] = llvm.insertelement {{.*}}, {{.*}}{{\[}}%[[VAL_115]] : i32] : vector<8xf16>
-    // CHECK:           %[[VAL_132:.*]] = llvm.mlir.constant(7 : i32) : i32
-    // CHECK:           %[[REP_3:.*]] = llvm.insertelement {{.*}}, {{.*}}{{\[}}%[[VAL_132]] : i32] : vector<8xf16>
-    // CHECK:           %[[VAL_149:.*]] = llvm.mlir.constant(7 : i32) : i32
-    // CHECK:           %[[REP_4:.*]] = llvm.insertelement {{.*}}, {{.*}}{{\[}}%[[VAL_149]] : i32] : vector<8xf16>
-    // CHECK:           %[[VAL_166:.*]] = llvm.mlir.constant(7 : i32) : i32
-    // CHECK:           %[[REP_5:.*]] = llvm.insertelement {{.*}}, {{.*}}{{\[}}%[[VAL_166]] : i32] : vector<8xf16>
-    // CHECK:           %[[VAL_183:.*]] = llvm.mlir.constant(7 : i32) : i32
-    // CHECK:           %[[REP_6:.*]] = llvm.insertelement {{.*}}, {{.*}}{{\[}}%[[VAL_183]] : i32] : vector<8xf16>
-    // CHECK:           %[[VAL_200:.*]] = llvm.mlir.constant(7 : i32) : i32
-    // CHECK:           %[[REP_7:.*]] = llvm.insertelement {{.*}}, {{.*}}{{\[}}%[[VAL_200]] : i32] : vector<8xf16>
-    // CHECK:           %[[VAL_216:.*]] = llvm.mlir.constant(7 : i32) : i32
-    // CHECK:           %[[VAL_217:.*]] = llvm.extractelement %[[REP_0]]{{\[}}%[[VAL_216]] : i32] : vector<8xf16>
-    // CHECK:           %[[VAL_232:.*]] = llvm.mlir.constant(7 : i32) : i32
-    // CHECK:           %[[VAL_233:.*]] = llvm.extractelement %[[REP_2]]{{\[}}%[[VAL_232]] : i32] : vector<8xf16>
-    // CHECK:           %[[VAL_248:.*]] = llvm.mlir.constant(7 : i32) : i32
-    // CHECK:           %[[VAL_249:.*]] = llvm.extractelement %[[REP_1]]{{\[}}%[[VAL_248]] : i32] : vector<8xf16>
-    // CHECK:           %[[VAL_264:.*]] = llvm.mlir.constant(7 : i32) : i32
-    // CHECK:           %[[VAL_265:.*]] = llvm.extractelement %[[REP_3]]{{\[}}%[[VAL_264]] : i32] : vector<8xf16>
-    // CHECK:           %[[VAL_280:.*]] = llvm.mlir.constant(7 : i32) : i32
-    // CHECK:           %[[VAL_281:.*]] = llvm.extractelement %[[REP_4]]{{\[}}%[[VAL_280]] : i32] : vector<8xf16>
-    // CHECK:           %[[VAL_296:.*]] = llvm.mlir.constant(7 : i32) : i32
-    // CHECK:           %[[VAL_297:.*]] = llvm.extractelement %[[REP_6]]{{\[}}%[[VAL_296]] : i32] : vector<8xf16>
-    // CHECK:           %[[VAL_312:.*]] = llvm.mlir.constant(7 : i32) : i32
-    // CHECK:           %[[VAL_313:.*]] = llvm.extractelement %[[REP_5]]{{\[}}%[[VAL_312]] : i32] : vector<8xf16>
-    // CHECK:           %[[VAL_328:.*]] = llvm.mlir.constant(7 : i32) : i32
-    // CHECK:           %[[VAL_329:.*]] = llvm.extractelement %[[REP_7]]{{\[}}%[[VAL_328]] : i32] : vector<8xf16>
-    // CHECK:           %[[VAL_338:.*]] = llvm.insertvalue %[[VAL_217]], {{.*}}[7]
-    // CHECK:           %[[VAL_346:.*]] = llvm.insertvalue %[[VAL_233]], {{.*}}[15]
-    // CHECK:           %[[VAL_354:.*]] = llvm.insertvalue %[[VAL_249]], {{.*}}[23]
-    // CHECK:           %[[VAL_362:.*]] = llvm.insertvalue %[[VAL_265]], {{.*}}[31]
-    // CHECK:           %[[VAL_370:.*]] = llvm.insertvalue %[[VAL_281]], {{.*}}[39]
-    // CHECK:           %[[VAL_378:.*]] = llvm.insertvalue %[[VAL_297]], {{.*}}[47]
-    // CHECK:           %[[VAL_386:.*]] = llvm.insertvalue %[[VAL_313]], {{.*}}[55]
-    // CHECK:           %[[VAL_394:.*]] = llvm.insertvalue %[[VAL_329]], {{.*}}[63]
+    // CHECK:           %[[VAL_1:.*]] = llvm.extractvalue %[[VAL_0]][0]
+    // CHECK:           %[[VAL_2:.*]] = llvm.extractvalue %[[VAL_0]][1]
+    // CHECK:           %[[VAL_3:.*]] = llvm.extractvalue %[[VAL_0]][2]
+    // CHECK:           %[[VAL_4:.*]] = llvm.extractvalue %[[VAL_0]][3]
+    // CHECK:           %[[VAL_5:.*]] = llvm.extractvalue %[[VAL_0]][4]
+    // CHECK:           %[[VAL_6:.*]] = llvm.extractvalue %[[VAL_0]][5]
+    // CHECK:           %[[VAL_7:.*]] = llvm.extractvalue %[[VAL_0]][6]
+    // CHECK:           %[[VAL_8:.*]] = llvm.extractvalue %[[VAL_0]][7]
+    // CHECK:           %[[VAL_9:.*]] = llvm.extractvalue %[[VAL_0]][8]
+    // CHECK:           %[[VAL_10:.*]] = llvm.extractvalue %[[VAL_0]][9]
+    // CHECK:           %[[VAL_11:.*]] = llvm.extractvalue %[[VAL_0]][10]
+    // CHECK:           %[[VAL_12:.*]] = llvm.extractvalue %[[VAL_0]][11]
+    // CHECK:           %[[VAL_13:.*]] = llvm.extractvalue %[[VAL_0]][12]
+    // CHECK:           %[[VAL_14:.*]] = llvm.extractvalue %[[VAL_0]][13]
+    // CHECK:           %[[VAL_15:.*]] = llvm.extractvalue %[[VAL_0]][14]
+    // CHECK:           %[[VAL_16:.*]] = llvm.extractvalue %[[VAL_0]][15]
+    // CHECK:           %[[VAL_17:.*]] = llvm.extractvalue %[[VAL_0]][16]
+    // CHECK:           %[[VAL_18:.*]] = llvm.extractvalue %[[VAL_0]][17]
+    // CHECK:           %[[VAL_19:.*]] = llvm.extractvalue %[[VAL_0]][18]
+    // CHECK:           %[[VAL_20:.*]] = llvm.extractvalue %[[VAL_0]][19]
+    // CHECK:           %[[VAL_21:.*]] = llvm.extractvalue %[[VAL_0]][20]
+    // CHECK:           %[[VAL_22:.*]] = llvm.extractvalue %[[VAL_0]][21]
+    // CHECK:           %[[VAL_23:.*]] = llvm.extractvalue %[[VAL_0]][22]
+    // CHECK:           %[[VAL_24:.*]] = llvm.extractvalue %[[VAL_0]][23]
+    // CHECK:           %[[VAL_25:.*]] = llvm.extractvalue %[[VAL_0]][24]
+    // CHECK:           %[[VAL_26:.*]] = llvm.extractvalue %[[VAL_0]][25]
+    // CHECK:           %[[VAL_27:.*]] = llvm.extractvalue %[[VAL_0]][26]
+    // CHECK:           %[[VAL_28:.*]] = llvm.extractvalue %[[VAL_0]][27]
+    // CHECK:           %[[VAL_29:.*]] = llvm.extractvalue %[[VAL_0]][28]
+    // CHECK:           %[[VAL_30:.*]] = llvm.extractvalue %[[VAL_0]][29]
+    // CHECK:           %[[VAL_31:.*]] = llvm.extractvalue %[[VAL_0]][30]
+    // CHECK:           %[[VAL_32:.*]] = llvm.extractvalue %[[VAL_0]][31]
+    // CHECK:           %[[VAL_33:.*]] = llvm.extractvalue %[[VAL_0]][32]
+    // CHECK:           %[[VAL_34:.*]] = llvm.extractvalue %[[VAL_0]][33]
+    // CHECK:           %[[VAL_35:.*]] = llvm.extractvalue %[[VAL_0]][34]
+    // CHECK:           %[[VAL_36:.*]] = llvm.extractvalue %[[VAL_0]][35]
+    // CHECK:           %[[VAL_37:.*]] = llvm.extractvalue %[[VAL_0]][36]
+    // CHECK:           %[[VAL_38:.*]] = llvm.extractvalue %[[VAL_0]][37]
+    // CHECK:           %[[VAL_39:.*]] = llvm.extractvalue %[[VAL_0]][38]
+    // CHECK:           %[[VAL_40:.*]] = llvm.extractvalue %[[VAL_0]][39]
+    // CHECK:           %[[VAL_41:.*]] = llvm.extractvalue %[[VAL_0]][40]
+    // CHECK:           %[[VAL_42:.*]] = llvm.extractvalue %[[VAL_0]][41]
+    // CHECK:           %[[VAL_43:.*]] = llvm.extractvalue %[[VAL_0]][42]
+    // CHECK:           %[[VAL_44:.*]] = llvm.extractvalue %[[VAL_0]][43]
+    // CHECK:           %[[VAL_45:.*]] = llvm.extractvalue %[[VAL_0]][44]
+    // CHECK:           %[[VAL_46:.*]] = llvm.extractvalue %[[VAL_0]][45]
+    // CHECK:           %[[VAL_47:.*]] = llvm.extractvalue %[[VAL_0]][46]
+    // CHECK:           %[[VAL_48:.*]] = llvm.extractvalue %[[VAL_0]][47]
+    // CHECK:           %[[VAL_49:.*]] = llvm.extractvalue %[[VAL_0]][48]
+    // CHECK:           %[[VAL_50:.*]] = llvm.extractvalue %[[VAL_0]][49]
+    // CHECK:           %[[VAL_51:.*]] = llvm.extractvalue %[[VAL_0]][50]
+    // CHECK:           %[[VAL_52:.*]] = llvm.extractvalue %[[VAL_0]][51]
+    // CHECK:           %[[VAL_53:.*]] = llvm.extractvalue %[[VAL_0]][52]
+    // CHECK:           %[[VAL_54:.*]] = llvm.extractvalue %[[VAL_0]][53]
+    // CHECK:           %[[VAL_55:.*]] = llvm.extractvalue %[[VAL_0]][54]
+    // CHECK:           %[[VAL_56:.*]] = llvm.extractvalue %[[VAL_0]][55]
+    // CHECK:           %[[VAL_57:.*]] = llvm.extractvalue %[[VAL_0]][56]
+    // CHECK:           %[[VAL_58:.*]] = llvm.extractvalue %[[VAL_0]][57]
+    // CHECK:           %[[VAL_59:.*]] = llvm.extractvalue %[[VAL_0]][58]
+    // CHECK:           %[[VAL_60:.*]] = llvm.extractvalue %[[VAL_0]][59]
+    // CHECK:           %[[VAL_61:.*]] = llvm.extractvalue %[[VAL_0]][60]
+    // CHECK:           %[[VAL_62:.*]] = llvm.extractvalue %[[VAL_0]][61]
+    // CHECK:           %[[VAL_63:.*]] = llvm.extractvalue %[[VAL_0]][62]
+    // CHECK:           %[[VAL_64:.*]] = llvm.extractvalue %[[VAL_0]][63]
+    // CHECK:           %[[VAL_65:.*]] = llvm.mlir.undef
+    // CHECK:           %[[VAL_66:.*]] = llvm.insertvalue %[[VAL_1]], %[[VAL_65]][0]
+    // CHECK:           %[[VAL_67:.*]] = llvm.insertvalue %[[VAL_2]], %[[VAL_66]][1]
+    // CHECK:           %[[VAL_68:.*]] = llvm.insertvalue %[[VAL_3]], %[[VAL_67]][2]
+    // CHECK:           %[[VAL_69:.*]] = llvm.insertvalue %[[VAL_4]], %[[VAL_68]][3]
+    // CHECK:           %[[VAL_70:.*]] = llvm.insertvalue %[[VAL_5]], %[[VAL_69]][4]
+    // CHECK:           %[[VAL_71:.*]] = llvm.insertvalue %[[VAL_6]], %[[VAL_70]][5]
+    // CHECK:           %[[VAL_72:.*]] = llvm.insertvalue %[[VAL_7]], %[[VAL_71]][6]
+    // CHECK:           %[[VAL_73:.*]] = llvm.insertvalue %[[VAL_8]], %[[VAL_72]][7]
+    // CHECK:           %[[VAL_74:.*]] = llvm.insertvalue %[[VAL_17]], %[[VAL_73]][8]
+    // CHECK:           %[[VAL_75:.*]] = llvm.insertvalue %[[VAL_18]], %[[VAL_74]][9]
+    // CHECK:           %[[VAL_76:.*]] = llvm.insertvalue %[[VAL_19]], %[[VAL_75]][10]
+    // CHECK:           %[[VAL_77:.*]] = llvm.insertvalue %[[VAL_20]], %[[VAL_76]][11]
+    // CHECK:           %[[VAL_78:.*]] = llvm.insertvalue %[[VAL_21]], %[[VAL_77]][12]
+    // CHECK:           %[[VAL_79:.*]] = llvm.insertvalue %[[VAL_22]], %[[VAL_78]][13]
+    // CHECK:           %[[VAL_80:.*]] = llvm.insertvalue %[[VAL_23]], %[[VAL_79]][14]
+    // CHECK:           %[[VAL_81:.*]] = llvm.insertvalue %[[VAL_24]], %[[VAL_80]][15]
+    // CHECK:           %[[VAL_82:.*]] = llvm.insertvalue %[[VAL_9]], %[[VAL_81]][16]
+    // CHECK:           %[[VAL_83:.*]] = llvm.insertvalue %[[VAL_10]], %[[VAL_82]][17]
+    // CHECK:           %[[VAL_84:.*]] = llvm.insertvalue %[[VAL_11]], %[[VAL_83]][18]
+    // CHECK:           %[[VAL_85:.*]] = llvm.insertvalue %[[VAL_12]], %[[VAL_84]][19]
+    // CHECK:           %[[VAL_86:.*]] = llvm.insertvalue %[[VAL_13]], %[[VAL_85]][20]
+    // CHECK:           %[[VAL_87:.*]] = llvm.insertvalue %[[VAL_14]], %[[VAL_86]][21]
+    // CHECK:           %[[VAL_88:.*]] = llvm.insertvalue %[[VAL_15]], %[[VAL_87]][22]
+    // CHECK:           %[[VAL_89:.*]] = llvm.insertvalue %[[VAL_16]], %[[VAL_88]][23]
+    // CHECK:           %[[VAL_90:.*]] = llvm.insertvalue %[[VAL_25]], %[[VAL_89]][24]
+    // CHECK:           %[[VAL_91:.*]] = llvm.insertvalue %[[VAL_26]], %[[VAL_90]][25]
+    // CHECK:           %[[VAL_92:.*]] = llvm.insertvalue %[[VAL_27]], %[[VAL_91]][26]
+    // CHECK:           %[[VAL_93:.*]] = llvm.insertvalue %[[VAL_28]], %[[VAL_92]][27]
+    // CHECK:           %[[VAL_94:.*]] = llvm.insertvalue %[[VAL_29]], %[[VAL_93]][28]
+    // CHECK:           %[[VAL_95:.*]] = llvm.insertvalue %[[VAL_30]], %[[VAL_94]][29]
+    // CHECK:           %[[VAL_96:.*]] = llvm.insertvalue %[[VAL_31]], %[[VAL_95]][30]
+    // CHECK:           %[[VAL_97:.*]] = llvm.insertvalue %[[VAL_32]], %[[VAL_96]][31]
+    // CHECK:           %[[VAL_98:.*]] = llvm.insertvalue %[[VAL_33]], %[[VAL_97]][32]
+    // CHECK:           %[[VAL_99:.*]] = llvm.insertvalue %[[VAL_34]], %[[VAL_98]][33]
+    // CHECK:           %[[VAL_100:.*]] = llvm.insertvalue %[[VAL_35]], %[[VAL_99]][34]
+    // CHECK:           %[[VAL_101:.*]] = llvm.insertvalue %[[VAL_36]], %[[VAL_100]][35]
+    // CHECK:           %[[VAL_102:.*]] = llvm.insertvalue %[[VAL_37]], %[[VAL_101]][36]
+    // CHECK:           %[[VAL_103:.*]] = llvm.insertvalue %[[VAL_38]], %[[VAL_102]][37]
+    // CHECK:           %[[VAL_104:.*]] = llvm.insertvalue %[[VAL_39]], %[[VAL_103]][38]
+    // CHECK:           %[[VAL_105:.*]] = llvm.insertvalue %[[VAL_40]], %[[VAL_104]][39]
+    // CHECK:           %[[VAL_106:.*]] = llvm.insertvalue %[[VAL_49]], %[[VAL_105]][40]
+    // CHECK:           %[[VAL_107:.*]] = llvm.insertvalue %[[VAL_50]], %[[VAL_106]][41]
+    // CHECK:           %[[VAL_108:.*]] = llvm.insertvalue %[[VAL_51]], %[[VAL_107]][42]
+    // CHECK:           %[[VAL_109:.*]] = llvm.insertvalue %[[VAL_52]], %[[VAL_108]][43]
+    // CHECK:           %[[VAL_110:.*]] = llvm.insertvalue %[[VAL_53]], %[[VAL_109]][44]
+    // CHECK:           %[[VAL_111:.*]] = llvm.insertvalue %[[VAL_54]], %[[VAL_110]][45]
+    // CHECK:           %[[VAL_112:.*]] = llvm.insertvalue %[[VAL_55]], %[[VAL_111]][46]
+    // CHECK:           %[[VAL_113:.*]] = llvm.insertvalue %[[VAL_56]], %[[VAL_112]][47]
+    // CHECK:           %[[VAL_114:.*]] = llvm.insertvalue %[[VAL_41]], %[[VAL_113]][48]
+    // CHECK:           %[[VAL_115:.*]] = llvm.insertvalue %[[VAL_42]], %[[VAL_114]][49]
+    // CHECK:           %[[VAL_116:.*]] = llvm.insertvalue %[[VAL_43]], %[[VAL_115]][50]
+    // CHECK:           %[[VAL_117:.*]] = llvm.insertvalue %[[VAL_44]], %[[VAL_116]][51]
+    // CHECK:           %[[VAL_118:.*]] = llvm.insertvalue %[[VAL_45]], %[[VAL_117]][52]
+    // CHECK:           %[[VAL_119:.*]] = llvm.insertvalue %[[VAL_46]], %[[VAL_118]][53]
+    // CHECK:           %[[VAL_120:.*]] = llvm.insertvalue %[[VAL_47]], %[[VAL_119]][54]
+    // CHECK:           %[[VAL_121:.*]] = llvm.insertvalue %[[VAL_48]], %[[VAL_120]][55]
+    // CHECK:           %[[VAL_122:.*]] = llvm.insertvalue %[[VAL_57]], %[[VAL_121]][56]
+    // CHECK:           %[[VAL_123:.*]] = llvm.insertvalue %[[VAL_58]], %[[VAL_122]][57]
+    // CHECK:           %[[VAL_124:.*]] = llvm.insertvalue %[[VAL_59]], %[[VAL_123]][58]
+    // CHECK:           %[[VAL_125:.*]] = llvm.insertvalue %[[VAL_60]], %[[VAL_124]][59]
+    // CHECK:           %[[VAL_126:.*]] = llvm.insertvalue %[[VAL_61]], %[[VAL_125]][60]
+    // CHECK:           %[[VAL_127:.*]] = llvm.insertvalue %[[VAL_62]], %[[VAL_126]][61]
+    // CHECK:           %[[VAL_128:.*]] = llvm.insertvalue %[[VAL_63]], %[[VAL_127]][62]
+    // CHECK:           %[[VAL_129:.*]] = llvm.insertvalue %[[VAL_64]], %[[VAL_128]][63]
     %108 = triton_gpu.convert_layout %arg : tensor<1024x32xf16, #dpas> -> tensor<1024x32xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #dpas, kWidth = 2}>>
     tt.return
   }
@@ -118,46 +169,135 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 32
     // COM:   - 0, 1, 2, 3, 4, 5, 6, 7.
     // COM: The repetitions order of dot layout when the GEMM tiling is clustered as repCluster [4, 2]:
     // COM:   - 0, 2, 4, 6, 1, 3, 5, 7.
-    // CHECK:           %[[VAL_81:.*]] = llvm.mlir.constant(7 : i32) : i32
-    // CHECK:           %[[REP_0:.*]] = llvm.insertelement {{.*}}, {{.*}}{{\[}}%[[VAL_81]] : i32] : vector<8xf16>
-    // CHECK:           %[[VAL_98:.*]] = llvm.mlir.constant(7 : i32) : i32
-    // CHECK:           %[[REP_1:.*]] = llvm.insertelement {{.*}}, {{.*}}{{\[}}%[[VAL_98]] : i32] : vector<8xf16>
-    // CHECK:           %[[VAL_115:.*]] = llvm.mlir.constant(7 : i32) : i32
-    // CHECK:           %[[REP_2:.*]] = llvm.insertelement {{.*}}, {{.*}}{{\[}}%[[VAL_115]] : i32] : vector<8xf16>
-    // CHECK:           %[[VAL_132:.*]] = llvm.mlir.constant(7 : i32) : i32
-    // CHECK:           %[[REP_3:.*]] = llvm.insertelement {{.*}}, {{.*}}{{\[}}%[[VAL_132]] : i32] : vector<8xf16>
-    // CHECK:           %[[VAL_149:.*]] = llvm.mlir.constant(7 : i32) : i32
-    // CHECK:           %[[REP_4:.*]] = llvm.insertelement {{.*}}, {{.*}}{{\[}}%[[VAL_149]] : i32] : vector<8xf16>
-    // CHECK:           %[[VAL_166:.*]] = llvm.mlir.constant(7 : i32) : i32
-    // CHECK:           %[[REP_5:.*]] = llvm.insertelement {{.*}}, {{.*}}{{\[}}%[[VAL_166]] : i32] : vector<8xf16>
-    // CHECK:           %[[VAL_183:.*]] = llvm.mlir.constant(7 : i32) : i32
-    // CHECK:           %[[REP_6:.*]] = llvm.insertelement {{.*}}, {{.*}}{{\[}}%[[VAL_183]] : i32] : vector<8xf16>
-    // CHECK:           %[[VAL_200:.*]] = llvm.mlir.constant(7 : i32) : i32
-    // CHECK:           %[[REP_7:.*]] = llvm.insertelement {{.*}}, {{.*}}{{\[}}%[[VAL_200]] : i32] : vector<8xf16>
-    // CHECK:           %[[VAL_216:.*]] = llvm.mlir.constant(7 : i32) : i32
-    // CHECK:           %[[VAL_217:.*]] = llvm.extractelement %[[REP_0]]{{\[}}%[[VAL_216]] : i32] : vector<8xf16>
-    // CHECK:           %[[VAL_232:.*]] = llvm.mlir.constant(7 : i32) : i32
-    // CHECK:           %[[VAL_233:.*]] = llvm.extractelement %[[REP_2]]{{\[}}%[[VAL_232]] : i32] : vector<8xf16>
-    // CHECK:           %[[VAL_248:.*]] = llvm.mlir.constant(7 : i32) : i32
-    // CHECK:           %[[VAL_249:.*]] = llvm.extractelement %[[REP_4]]{{\[}}%[[VAL_248]] : i32] : vector<8xf16>
-    // CHECK:           %[[VAL_264:.*]] = llvm.mlir.constant(7 : i32) : i32
-    // CHECK:           %[[VAL_265:.*]] = llvm.extractelement %[[REP_6]]{{\[}}%[[VAL_264]] : i32] : vector<8xf16>
-    // CHECK:           %[[VAL_280:.*]] = llvm.mlir.constant(7 : i32) : i32
-    // CHECK:           %[[VAL_281:.*]] = llvm.extractelement %[[REP_1]]{{\[}}%[[VAL_280]] : i32] : vector<8xf16>
-    // CHECK:           %[[VAL_296:.*]] = llvm.mlir.constant(7 : i32) : i32
-    // CHECK:           %[[VAL_297:.*]] = llvm.extractelement %[[REP_3]]{{\[}}%[[VAL_296]] : i32] : vector<8xf16>
-    // CHECK:           %[[VAL_312:.*]] = llvm.mlir.constant(7 : i32) : i32
-    // CHECK:           %[[VAL_313:.*]] = llvm.extractelement %[[REP_5]]{{\[}}%[[VAL_312]] : i32] : vector<8xf16>
-    // CHECK:           %[[VAL_328:.*]] = llvm.mlir.constant(7 : i32) : i32
-    // CHECK:           %[[VAL_329:.*]] = llvm.extractelement %[[REP_7]]{{\[}}%[[VAL_328]] : i32] : vector<8xf16>
-    // CHECK:           %[[VAL_338:.*]] = llvm.insertvalue %[[VAL_217]], {{.*}}[7]
-    // CHECK:           %[[VAL_346:.*]] = llvm.insertvalue %[[VAL_233]], {{.*}}[15]
-    // CHECK:           %[[VAL_354:.*]] = llvm.insertvalue %[[VAL_249]], {{.*}}[23]
-    // CHECK:           %[[VAL_362:.*]] = llvm.insertvalue %[[VAL_265]], {{.*}}[31]
-    // CHECK:           %[[VAL_370:.*]] = llvm.insertvalue %[[VAL_281]], {{.*}}[39]
-    // CHECK:           %[[VAL_378:.*]] = llvm.insertvalue %[[VAL_297]], {{.*}}[47]
-    // CHECK:           %[[VAL_386:.*]] = llvm.insertvalue %[[VAL_313]], {{.*}}[55]
-    // CHECK:           %[[VAL_394:.*]] = llvm.insertvalue %[[VAL_329]], {{.*}}[63]
+    // CHECK:           %[[VAL_1:.*]] = llvm.extractvalue %[[VAL_0]][0]
+    // CHECK:           %[[VAL_2:.*]] = llvm.extractvalue %[[VAL_0]][1]
+    // CHECK:           %[[VAL_3:.*]] = llvm.extractvalue %[[VAL_0]][2]
+    // CHECK:           %[[VAL_4:.*]] = llvm.extractvalue %[[VAL_0]][3]
+    // CHECK:           %[[VAL_5:.*]] = llvm.extractvalue %[[VAL_0]][4]
+    // CHECK:           %[[VAL_6:.*]] = llvm.extractvalue %[[VAL_0]][5]
+    // CHECK:           %[[VAL_7:.*]] = llvm.extractvalue %[[VAL_0]][6]
+    // CHECK:           %[[VAL_8:.*]] = llvm.extractvalue %[[VAL_0]][7]
+    // CHECK:           %[[VAL_9:.*]] = llvm.extractvalue %[[VAL_0]][8]
+    // CHECK:           %[[VAL_10:.*]] = llvm.extractvalue %[[VAL_0]][9]
+    // CHECK:           %[[VAL_11:.*]] = llvm.extractvalue %[[VAL_0]][10]
+    // CHECK:           %[[VAL_12:.*]] = llvm.extractvalue %[[VAL_0]][11]
+    // CHECK:           %[[VAL_13:.*]] = llvm.extractvalue %[[VAL_0]][12]
+    // CHECK:           %[[VAL_14:.*]] = llvm.extractvalue %[[VAL_0]][13]
+    // CHECK:           %[[VAL_15:.*]] = llvm.extractvalue %[[VAL_0]][14]
+    // CHECK:           %[[VAL_16:.*]] = llvm.extractvalue %[[VAL_0]][15]
+    // CHECK:           %[[VAL_17:.*]] = llvm.extractvalue %[[VAL_0]][16]
+    // CHECK:           %[[VAL_18:.*]] = llvm.extractvalue %[[VAL_0]][17]
+    // CHECK:           %[[VAL_19:.*]] = llvm.extractvalue %[[VAL_0]][18]
+    // CHECK:           %[[VAL_20:.*]] = llvm.extractvalue %[[VAL_0]][19]
+    // CHECK:           %[[VAL_21:.*]] = llvm.extractvalue %[[VAL_0]][20]
+    // CHECK:           %[[VAL_22:.*]] = llvm.extractvalue %[[VAL_0]][21]
+    // CHECK:           %[[VAL_23:.*]] = llvm.extractvalue %[[VAL_0]][22]
+    // CHECK:           %[[VAL_24:.*]] = llvm.extractvalue %[[VAL_0]][23]
+    // CHECK:           %[[VAL_25:.*]] = llvm.extractvalue %[[VAL_0]][24]
+    // CHECK:           %[[VAL_26:.*]] = llvm.extractvalue %[[VAL_0]][25]
+    // CHECK:           %[[VAL_27:.*]] = llvm.extractvalue %[[VAL_0]][26]
+    // CHECK:           %[[VAL_28:.*]] = llvm.extractvalue %[[VAL_0]][27]
+    // CHECK:           %[[VAL_29:.*]] = llvm.extractvalue %[[VAL_0]][28]
+    // CHECK:           %[[VAL_30:.*]] = llvm.extractvalue %[[VAL_0]][29]
+    // CHECK:           %[[VAL_31:.*]] = llvm.extractvalue %[[VAL_0]][30]
+    // CHECK:           %[[VAL_32:.*]] = llvm.extractvalue %[[VAL_0]][31]
+    // CHECK:           %[[VAL_33:.*]] = llvm.extractvalue %[[VAL_0]][32]
+    // CHECK:           %[[VAL_34:.*]] = llvm.extractvalue %[[VAL_0]][33]
+    // CHECK:           %[[VAL_35:.*]] = llvm.extractvalue %[[VAL_0]][34]
+    // CHECK:           %[[VAL_36:.*]] = llvm.extractvalue %[[VAL_0]][35]
+    // CHECK:           %[[VAL_37:.*]] = llvm.extractvalue %[[VAL_0]][36]
+    // CHECK:           %[[VAL_38:.*]] = llvm.extractvalue %[[VAL_0]][37]
+    // CHECK:           %[[VAL_39:.*]] = llvm.extractvalue %[[VAL_0]][38]
+    // CHECK:           %[[VAL_40:.*]] = llvm.extractvalue %[[VAL_0]][39]
+    // CHECK:           %[[VAL_41:.*]] = llvm.extractvalue %[[VAL_0]][40]
+    // CHECK:           %[[VAL_42:.*]] = llvm.extractvalue %[[VAL_0]][41]
+    // CHECK:           %[[VAL_43:.*]] = llvm.extractvalue %[[VAL_0]][42]
+    // CHECK:           %[[VAL_44:.*]] = llvm.extractvalue %[[VAL_0]][43]
+    // CHECK:           %[[VAL_45:.*]] = llvm.extractvalue %[[VAL_0]][44]
+    // CHECK:           %[[VAL_46:.*]] = llvm.extractvalue %[[VAL_0]][45]
+    // CHECK:           %[[VAL_47:.*]] = llvm.extractvalue %[[VAL_0]][46]
+    // CHECK:           %[[VAL_48:.*]] = llvm.extractvalue %[[VAL_0]][47]
+    // CHECK:           %[[VAL_49:.*]] = llvm.extractvalue %[[VAL_0]][48]
+    // CHECK:           %[[VAL_50:.*]] = llvm.extractvalue %[[VAL_0]][49]
+    // CHECK:           %[[VAL_51:.*]] = llvm.extractvalue %[[VAL_0]][50]
+    // CHECK:           %[[VAL_52:.*]] = llvm.extractvalue %[[VAL_0]][51]
+    // CHECK:           %[[VAL_53:.*]] = llvm.extractvalue %[[VAL_0]][52]
+    // CHECK:           %[[VAL_54:.*]] = llvm.extractvalue %[[VAL_0]][53]
+    // CHECK:           %[[VAL_55:.*]] = llvm.extractvalue %[[VAL_0]][54]
+    // CHECK:           %[[VAL_56:.*]] = llvm.extractvalue %[[VAL_0]][55]
+    // CHECK:           %[[VAL_57:.*]] = llvm.extractvalue %[[VAL_0]][56]
+    // CHECK:           %[[VAL_58:.*]] = llvm.extractvalue %[[VAL_0]][57]
+    // CHECK:           %[[VAL_59:.*]] = llvm.extractvalue %[[VAL_0]][58]
+    // CHECK:           %[[VAL_60:.*]] = llvm.extractvalue %[[VAL_0]][59]
+    // CHECK:           %[[VAL_61:.*]] = llvm.extractvalue %[[VAL_0]][60]
+    // CHECK:           %[[VAL_62:.*]] = llvm.extractvalue %[[VAL_0]][61]
+    // CHECK:           %[[VAL_63:.*]] = llvm.extractvalue %[[VAL_0]][62]
+    // CHECK:           %[[VAL_64:.*]] = llvm.extractvalue %[[VAL_0]][63]
+    // CHECK:           %[[VAL_65:.*]] = llvm.mlir.undef
+    // CHECK:           %[[VAL_66:.*]] = llvm.insertvalue %[[VAL_1]], %[[VAL_65]][0]
+    // CHECK:           %[[VAL_67:.*]] = llvm.insertvalue %[[VAL_2]], %[[VAL_66]][1]
+    // CHECK:           %[[VAL_68:.*]] = llvm.insertvalue %[[VAL_3]], %[[VAL_67]][2]
+    // CHECK:           %[[VAL_69:.*]] = llvm.insertvalue %[[VAL_4]], %[[VAL_68]][3]
+    // CHECK:           %[[VAL_70:.*]] = llvm.insertvalue %[[VAL_5]], %[[VAL_69]][4]
+    // CHECK:           %[[VAL_71:.*]] = llvm.insertvalue %[[VAL_6]], %[[VAL_70]][5]
+    // CHECK:           %[[VAL_72:.*]] = llvm.insertvalue %[[VAL_7]], %[[VAL_71]][6]
+    // CHECK:           %[[VAL_73:.*]] = llvm.insertvalue %[[VAL_8]], %[[VAL_72]][7]
+    // CHECK:           %[[VAL_74:.*]] = llvm.insertvalue %[[VAL_17]], %[[VAL_73]][8]
+    // CHECK:           %[[VAL_75:.*]] = llvm.insertvalue %[[VAL_18]], %[[VAL_74]][9]
+    // CHECK:           %[[VAL_76:.*]] = llvm.insertvalue %[[VAL_19]], %[[VAL_75]][10]
+    // CHECK:           %[[VAL_77:.*]] = llvm.insertvalue %[[VAL_20]], %[[VAL_76]][11]
+    // CHECK:           %[[VAL_78:.*]] = llvm.insertvalue %[[VAL_21]], %[[VAL_77]][12]
+    // CHECK:           %[[VAL_79:.*]] = llvm.insertvalue %[[VAL_22]], %[[VAL_78]][13]
+    // CHECK:           %[[VAL_80:.*]] = llvm.insertvalue %[[VAL_23]], %[[VAL_79]][14]
+    // CHECK:           %[[VAL_81:.*]] = llvm.insertvalue %[[VAL_24]], %[[VAL_80]][15]
+    // CHECK:           %[[VAL_82:.*]] = llvm.insertvalue %[[VAL_33]], %[[VAL_81]][16]
+    // CHECK:           %[[VAL_83:.*]] = llvm.insertvalue %[[VAL_34]], %[[VAL_82]][17]
+    // CHECK:           %[[VAL_84:.*]] = llvm.insertvalue %[[VAL_35]], %[[VAL_83]][18]
+    // CHECK:           %[[VAL_85:.*]] = llvm.insertvalue %[[VAL_36]], %[[VAL_84]][19]
+    // CHECK:           %[[VAL_86:.*]] = llvm.insertvalue %[[VAL_37]], %[[VAL_85]][20]
+    // CHECK:           %[[VAL_87:.*]] = llvm.insertvalue %[[VAL_38]], %[[VAL_86]][21]
+    // CHECK:           %[[VAL_88:.*]] = llvm.insertvalue %[[VAL_39]], %[[VAL_87]][22]
+    // CHECK:           %[[VAL_89:.*]] = llvm.insertvalue %[[VAL_40]], %[[VAL_88]][23]
+    // CHECK:           %[[VAL_90:.*]] = llvm.insertvalue %[[VAL_49]], %[[VAL_89]][24]
+    // CHECK:           %[[VAL_91:.*]] = llvm.insertvalue %[[VAL_50]], %[[VAL_90]][25]
+    // CHECK:           %[[VAL_92:.*]] = llvm.insertvalue %[[VAL_51]], %[[VAL_91]][26]
+    // CHECK:           %[[VAL_93:.*]] = llvm.insertvalue %[[VAL_52]], %[[VAL_92]][27]
+    // CHECK:           %[[VAL_94:.*]] = llvm.insertvalue %[[VAL_53]], %[[VAL_93]][28]
+    // CHECK:           %[[VAL_95:.*]] = llvm.insertvalue %[[VAL_54]], %[[VAL_94]][29]
+    // CHECK:           %[[VAL_96:.*]] = llvm.insertvalue %[[VAL_55]], %[[VAL_95]][30]
+    // CHECK:           %[[VAL_97:.*]] = llvm.insertvalue %[[VAL_56]], %[[VAL_96]][31]
+    // CHECK:           %[[VAL_98:.*]] = llvm.insertvalue %[[VAL_9]], %[[VAL_97]][32]
+    // CHECK:           %[[VAL_99:.*]] = llvm.insertvalue %[[VAL_10]], %[[VAL_98]][33]
+    // CHECK:           %[[VAL_100:.*]] = llvm.insertvalue %[[VAL_11]], %[[VAL_99]][34]
+    // CHECK:           %[[VAL_101:.*]] = llvm.insertvalue %[[VAL_12]], %[[VAL_100]][35]
+    // CHECK:           %[[VAL_102:.*]] = llvm.insertvalue %[[VAL_13]], %[[VAL_101]][36]
+    // CHECK:           %[[VAL_103:.*]] = llvm.insertvalue %[[VAL_14]], %[[VAL_102]][37]
+    // CHECK:           %[[VAL_104:.*]] = llvm.insertvalue %[[VAL_15]], %[[VAL_103]][38]
+    // CHECK:           %[[VAL_105:.*]] = llvm.insertvalue %[[VAL_16]], %[[VAL_104]][39]
+    // CHECK:           %[[VAL_106:.*]] = llvm.insertvalue %[[VAL_25]], %[[VAL_105]][40]
+    // CHECK:           %[[VAL_107:.*]] = llvm.insertvalue %[[VAL_26]], %[[VAL_106]][41]
+    // CHECK:           %[[VAL_108:.*]] = llvm.insertvalue %[[VAL_27]], %[[VAL_107]][42]
+    // CHECK:           %[[VAL_109:.*]] = llvm.insertvalue %[[VAL_28]], %[[VAL_108]][43]
+    // CHECK:           %[[VAL_110:.*]] = llvm.insertvalue %[[VAL_29]], %[[VAL_109]][44]
+    // CHECK:           %[[VAL_111:.*]] = llvm.insertvalue %[[VAL_30]], %[[VAL_110]][45]
+    // CHECK:           %[[VAL_112:.*]] = llvm.insertvalue %[[VAL_31]], %[[VAL_111]][46]
+    // CHECK:           %[[VAL_113:.*]] = llvm.insertvalue %[[VAL_32]], %[[VAL_112]][47]
+    // CHECK:           %[[VAL_114:.*]] = llvm.insertvalue %[[VAL_41]], %[[VAL_113]][48]
+    // CHECK:           %[[VAL_115:.*]] = llvm.insertvalue %[[VAL_42]], %[[VAL_114]][49]
+    // CHECK:           %[[VAL_116:.*]] = llvm.insertvalue %[[VAL_43]], %[[VAL_115]][50]
+    // CHECK:           %[[VAL_117:.*]] = llvm.insertvalue %[[VAL_44]], %[[VAL_116]][51]
+    // CHECK:           %[[VAL_118:.*]] = llvm.insertvalue %[[VAL_45]], %[[VAL_117]][52]
+    // CHECK:           %[[VAL_119:.*]] = llvm.insertvalue %[[VAL_46]], %[[VAL_118]][53]
+    // CHECK:           %[[VAL_120:.*]] = llvm.insertvalue %[[VAL_47]], %[[VAL_119]][54]
+    // CHECK:           %[[VAL_121:.*]] = llvm.insertvalue %[[VAL_48]], %[[VAL_120]][55]
+    // CHECK:           %[[VAL_122:.*]] = llvm.insertvalue %[[VAL_57]], %[[VAL_121]][56]
+    // CHECK:           %[[VAL_123:.*]] = llvm.insertvalue %[[VAL_58]], %[[VAL_122]][57]
+    // CHECK:           %[[VAL_124:.*]] = llvm.insertvalue %[[VAL_59]], %[[VAL_123]][58]
+    // CHECK:           %[[VAL_125:.*]] = llvm.insertvalue %[[VAL_60]], %[[VAL_124]][59]
+    // CHECK:           %[[VAL_126:.*]] = llvm.insertvalue %[[VAL_61]], %[[VAL_125]][60]
+    // CHECK:           %[[VAL_127:.*]] = llvm.insertvalue %[[VAL_62]], %[[VAL_126]][61]
+    // CHECK:           %[[VAL_128:.*]] = llvm.insertvalue %[[VAL_63]], %[[VAL_127]][62]
+    // CHECK:           %[[VAL_129:.*]] = llvm.insertvalue %[[VAL_64]], %[[VAL_128]][63]
     %108 = triton_gpu.convert_layout %arg : tensor<1024x32xf16, #dpas> -> tensor<1024x32xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #dpas, kWidth = 2}>>
     tt.return
   }
diff --git a/test/TritonIntelGPU/tritonintlgpu-nested-layout.mlir b/test/TritonIntelGPU/tritonintlgpu-nested-layout.mlir
index 2bb504d76f..1ecb0a5a2c 100644
--- a/test/TritonIntelGPU/tritonintlgpu-nested-layout.mlir
+++ b/test/TritonIntelGPU/tritonintlgpu-nested-layout.mlir
@@ -69,14 +69,13 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 :
     // CHECK-DAG:           %[[CST_0:.*]] = llvm.mlir.constant(0 : i32) : i32
     // CHECK-DAG:           %[[CST_1:.*]] = llvm.mlir.constant(1 : i32) : i32
     // CHECK-DAG:           %[[CST_2:.*]] = llvm.mlir.constant(2 : i32) : i32
-    // CHECK-DAG:           %[[CST_4:.*]] = llvm.mlir.constant(4 : i32) : i32
-    // CHECK-DAG:           %[[CST_8:.*]] = llvm.mlir.constant(8 : i32) : i32
-    // CHECK-DAG:           %[[CST_16:.*]] = llvm.mlir.constant(16 : i32) : i32
-    // CHECK-DAG:           %[[CST_32:.*]] = llvm.mlir.constant(32 : i32) : i32
     // CHECK-DAG:           %[[CST_3:.*]] = llvm.mlir.constant(3 : i32) : i32
+    // CHECK-DAG:           %[[CST_4:.*]] = llvm.mlir.constant(4 : i32) : i32
     // CHECK-DAG:           %[[CST_5:.*]] = llvm.mlir.constant(5 : i32) : i32
     // CHECK-DAG:           %[[CST_6:.*]] = llvm.mlir.constant(6 : i32) : i32
     // CHECK-DAG:           %[[CST_7:.*]] = llvm.mlir.constant(7 : i32) : i32
+    // CHECK-DAG:           %[[CST_8:.*]] = llvm.mlir.constant(8 : i32) : i32
+    // CHECK-DAG:           %[[CST_16:.*]] = llvm.mlir.constant(16 : i32) : i32
     // CHECK-DAG:           %[[CST_17:.*]] = llvm.mlir.constant(17 : i32) : i32
     // CHECK-DAG:           %[[CST_18:.*]] = llvm.mlir.constant(18 : i32) : i32
     // CHECK-DAG:           %[[CST_19:.*]] = llvm.mlir.constant(19 : i32) : i32
@@ -86,43 +85,46 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 :
     // CHECK-DAG:           %[[CST_23:.*]] = llvm.mlir.constant(23 : i32) : i32
     // CHECK:           %[[THREAD_ID:.*]] = llvm.call spir_funccc @_Z12get_local_idj(%[[CST_0]])
     // CHECK:           %[[THREAD_ID_32:.*]] = llvm.trunc %[[THREAD_ID]] : i64 to i32
-    // CHECK:           %[[WARP_ID:.*]] = llvm.udiv %[[THREAD_ID_32]], %[[CST_16]]  : i32
     // CHECK:           %[[LANE_ID:.*]] = llvm.urem %[[THREAD_ID_32]], %[[CST_16]]  : i32
-    // CHECK:           %[[VAL_29:.*]] = llvm.udiv %[[WARP_ID]], %[[CST_2]]  : i32
-    // CHECK:           %[[WARP_ID_X:.*]] = llvm.urem %[[VAL_29]], %[[CST_2]]  : i32
-    // CHECK:           %[[ROUNDED_WARP_ID_X:.*]] = llvm.urem %[[WARP_ID_X]], %[[CST_4]]  : i32
-    // CHECK:           %[[WARP_OFFSET:.*]] = llvm.mul %[[ROUNDED_WARP_ID_X]], %[[CST_8]] : i32
-    // CHECK:           %[[LANE_ID_X:.*]] = llvm.udiv %[[LANE_ID]], %[[CST_16]]  : i32
-    // CHECK:           %[[LANE_ID_Y:.*]] = llvm.urem %[[LANE_ID]], %[[CST_16]]  : i32
-    // CHECK:           %[[OFFSET_Y:.*]] = llvm.mul %[[LANE_ID_Y]], %[[CST_2]] : i32
-    // CHECK:           %[[OFFSET_x:.*]] = llvm.add %[[LANE_ID_X]], %[[WARP_OFFSET]] : i32
-    // CHECK:           %[[VAL_37:.*]] = llvm.urem %[[CST_0]], %[[CST_1]]  : i32
-    // CHECK:           %[[VAL_38:.*]] = llvm.udiv %[[CST_0]], %[[CST_1]]  : i32
-    // CHECK:           %[[VAL_39:.*]] = llvm.urem %[[VAL_38]], %[[CST_1]]  : i32
-    // CHECK:           %[[VAL_40:.*]] = llvm.urem %[[VAL_39]], %[[CST_1]]  : i32
-    // CHECK:           %[[VAL_41:.*]] = llvm.urem %[[VAL_37]], %[[CST_1]]  : i32
-    // CHECK:           %[[CTA_OFFSET_X:.*]] = llvm.mul %[[VAL_40]], %[[CST_32]] : i32
-    // CHECK:           %[[CTA_OFFSET_Y:.*]] = llvm.mul %[[VAL_41]], %[[CST_32]] : i32
-    // CHECK:           %[[VAL_44:.*]] = llvm.add %[[OFFSET_x]], %[[CTA_OFFSET_X]] : i32
-    // CHECK:           %[[VAL_45:.*]] = llvm.add %[[OFFSET_Y]], %[[CTA_OFFSET_Y]] : i32
-    // CHECK:           %[[OFFSET_X_0:.*]] = llvm.add %[[VAL_44]], %[[CST_0]] : i32
-    // CHECK:           %[[OFFSET_Y_0:.*]] = llvm.add %[[VAL_45]], %[[CST_0]] : i32
-    // CHECK:           %[[OFFSET_Y_1:.*]] = llvm.add %[[VAL_45]], %[[CST_1]] : i32
-    // CHECK:           %[[OFFSET_X_1:.*]] = llvm.add %[[VAL_44]], %[[CST_1]] : i32
-    // CHECK:           %[[OFFSET_X_2:.*]] = llvm.add %[[VAL_44]], %[[CST_2]] : i32
-    // CHECK:           %[[OFFSET_X_3:.*]] = llvm.add %[[VAL_44]], %[[CST_3]] : i32
-    // CHECK:           %[[OFFSET_X_4:.*]] = llvm.add %[[VAL_44]], %[[CST_4]] : i32
-    // CHECK:           %[[OFFSET_X_5:.*]] = llvm.add %[[VAL_44]], %[[CST_5]] : i32
-    // CHECK:           %[[OFFSET_X_6:.*]] = llvm.add %[[VAL_44]], %[[CST_6]] : i32
-    // CHECK:           %[[OFFSET_X_7:.*]] = llvm.add %[[VAL_44]], %[[CST_7]] : i32
-    // CHECK:           %[[OFFSET_X_8:.*]] = llvm.add %[[VAL_44]], %[[CST_16]] : i32
-    // CHECK:           %[[OFFSET_X_9:.*]] = llvm.add %[[VAL_44]], %[[CST_17]] : i32
-    // CHECK:           %[[OFFSET_X_10:.*]] = llvm.add %[[VAL_44]], %[[CST_18]] : i32
-    // CHECK:           %[[OFFSET_X_11:.*]] = llvm.add %[[VAL_44]], %[[CST_19]] : i32
-    // CHECK:           %[[OFFSET_X_12:.*]] = llvm.add %[[VAL_44]], %[[CST_20]] : i32
-    // CHECK:           %[[OFFSET_X_13:.*]] = llvm.add %[[VAL_44]], %[[CST_21]] : i32
-    // CHECK:           %[[OFFSET_X_14:.*]] = llvm.add %[[VAL_44]], %[[CST_22]] : i32
-    // CHECK:           %[[OFFSET_X_15:.*]] = llvm.add %[[VAL_44]], %[[CST_23]] : i32
+    // CHECK:           %[[WARP_ID:.*]] = llvm.udiv %[[THREAD_ID_32]], %[[CST_16]]  : i32
+    // CHECK:           %[[VAL_27:.*]] = llvm.and %[[LANE_ID]], %[[CST_1]] : i32
+    // CHECK:           %[[VAL_28:.*]] = llvm.icmp "eq" %[[VAL_27]], %[[CST_0]] : i32
+    // CHECK:           %[[VAL_29:.*]] = llvm.select %[[VAL_28]], %[[CST_0]], %[[CST_2]] : i1, i32
+    // CHECK:           %[[VAL_30:.*]] = llvm.xor %[[CST_0]], %[[VAL_29]] : i32
+    // CHECK:           %[[VAL_31:.*]] = llvm.and %[[LANE_ID]], %[[CST_2]] : i32
+    // CHECK:           %[[VAL_32:.*]] = llvm.icmp "eq" %[[VAL_31]], %[[CST_0]] : i32
+    // CHECK:           %[[VAL_33:.*]] = llvm.select %[[VAL_32]], %[[CST_0]], %[[CST_4]] : i1, i32
+    // CHECK:           %[[VAL_34:.*]] = llvm.xor %[[VAL_30]], %[[VAL_33]] : i32
+    // CHECK:           %[[VAL_35:.*]] = llvm.and %[[LANE_ID]], %[[CST_4]] : i32
+    // CHECK:           %[[VAL_36:.*]] = llvm.icmp "eq" %[[VAL_35]], %[[CST_0]] : i32
+    // CHECK:           %[[VAL_37:.*]] = llvm.select %[[VAL_36]], %[[CST_0]], %[[CST_8]] : i1, i32
+    // CHECK:           %[[VAL_38:.*]] = llvm.xor %[[VAL_34]], %[[VAL_37]] : i32
+    // CHECK:           %[[VAL_39:.*]] = llvm.and %[[LANE_ID]], %[[CST_8]] : i32
+    // CHECK:           %[[VAL_40:.*]] = llvm.icmp "eq" %[[VAL_39]], %[[CST_0]] : i32
+    // CHECK:           %[[VAL_41:.*]] = llvm.select %[[VAL_40]], %[[CST_0]], %[[CST_16]] : i1, i32
+    // CHECK:           %[[VAL_42:.*]] = llvm.xor %[[VAL_38]], %[[VAL_41]] : i32
+    // CHECK:           %[[VAL_43:.*]] = llvm.and %[[WARP_ID]], %[[CST_2]] : i32
+    // CHECK:           %[[VAL_44:.*]] = llvm.icmp "eq" %[[VAL_43]], %[[CST_0]] : i32
+    // CHECK:           %[[VAL_45:.*]] = llvm.select %[[VAL_44]], %[[CST_0]], %[[CST_8]] : i1, i32
+    // CHECK:           %[[VAL_46:.*]] = llvm.xor %[[CST_0]], %[[VAL_45]] : i32
+    // CHECK:           %[[OFFSET_X_0:.*]] = llvm.xor %[[VAL_46]], %[[CST_0]] : i32
+    // CHECK:           %[[OFFSET_Y_0:.*]] = llvm.xor %[[VAL_42]], %[[CST_0]] : i32
+    // CHECK:           %[[OFFSET_Y_1:.*]] = llvm.xor %[[VAL_42]], %[[CST_1]] : i32
+    // CHECK:           %[[OFFSET_X_1:.*]] = llvm.xor %[[VAL_46]], %[[CST_1]] : i32
+    // CHECK:           %[[OFFSET_X_2:.*]] = llvm.xor %[[VAL_46]], %[[CST_2]] : i32
+    // CHECK:           %[[OFFSET_X_3:.*]] = llvm.xor %[[VAL_46]], %[[CST_3]] : i32
+    // CHECK:           %[[OFFSET_X_4:.*]] = llvm.xor %[[VAL_46]], %[[CST_4]] : i32
+    // CHECK:           %[[OFFSET_X_5:.*]] = llvm.xor %[[VAL_46]], %[[CST_5]] : i32
+    // CHECK:           %[[OFFSET_X_6:.*]] = llvm.xor %[[VAL_46]], %[[CST_6]] : i32
+    // CHECK:           %[[OFFSET_X_7:.*]] = llvm.xor %[[VAL_46]], %[[CST_7]] : i32
+    // CHECK:           %[[OFFSET_X_8:.*]] = llvm.xor %[[VAL_46]], %[[CST_16]] : i32
+    // CHECK:           %[[OFFSET_X_9:.*]] = llvm.xor %[[VAL_46]], %[[CST_17]] : i32
+    // CHECK:           %[[OFFSET_X_10:.*]] = llvm.xor %[[VAL_46]], %[[CST_18]] : i32
+    // CHECK:           %[[OFFSET_X_11:.*]] = llvm.xor %[[VAL_46]], %[[CST_19]] : i32
+    // CHECK:           %[[OFFSET_X_12:.*]] = llvm.xor %[[VAL_46]], %[[CST_20]] : i32
+    // CHECK:           %[[OFFSET_X_13:.*]] = llvm.xor %[[VAL_46]], %[[CST_21]] : i32
+    // CHECK:           %[[OFFSET_X_14:.*]] = llvm.xor %[[VAL_46]], %[[CST_22]] : i32
+    // CHECK:           %[[OFFSET_X_15:.*]] = llvm.xor %[[VAL_46]], %[[CST_23]] : i32
     // CHECK:           llvm.call @_Z18__spirv_ocl_printf({{.*}}, {{.*}}, {{.*}}, {{.*}}, %[[OFFSET_X_0]], %[[OFFSET_Y_0]], {{.*}}, {{.*}})
     // CHECK:           llvm.call @_Z18__spirv_ocl_printf({{.*}}, {{.*}}, {{.*}}, {{.*}}, %[[OFFSET_X_0]], %[[OFFSET_Y_1]], {{.*}}, {{.*}})
     // CHECK:           llvm.call @_Z18__spirv_ocl_printf({{.*}}, {{.*}}, {{.*}}, {{.*}}, %[[OFFSET_X_1]], %[[OFFSET_Y_0]], {{.*}}, {{.*}})
@@ -172,14 +174,13 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 :
     // CHECK-DAG:           %[[CST_0:.*]] = llvm.mlir.constant(0 : i32) : i32
     // CHECK-DAG:           %[[CST_1:.*]] = llvm.mlir.constant(1 : i32) : i32
     // CHECK-DAG:           %[[CST_2:.*]] = llvm.mlir.constant(2 : i32) : i32
-    // CHECK-DAG:           %[[CST_4:.*]] = llvm.mlir.constant(4 : i32) : i32
-    // CHECK-DAG:           %[[CST_8:.*]] = llvm.mlir.constant(8 : i32) : i32
-    // CHECK-DAG:           %[[CST_16:.*]] = llvm.mlir.constant(16 : i32) : i32
-    // CHECK-DAG:           %[[CST_32:.*]] = llvm.mlir.constant(32 : i32) : i32
     // CHECK-DAG:           %[[CST_3:.*]] = llvm.mlir.constant(3 : i32) : i32
+    // CHECK-DAG:           %[[CST_4:.*]] = llvm.mlir.constant(4 : i32) : i32
     // CHECK-DAG:           %[[CST_5:.*]] = llvm.mlir.constant(5 : i32) : i32
     // CHECK-DAG:           %[[CST_6:.*]] = llvm.mlir.constant(6 : i32) : i32
     // CHECK-DAG:           %[[CST_7:.*]] = llvm.mlir.constant(7 : i32) : i32
+    // CHECK-DAG:           %[[CST_8:.*]] = llvm.mlir.constant(8 : i32) : i32
+    // CHECK-DAG:           %[[CST_16:.*]] = llvm.mlir.constant(16 : i32) : i32
     // CHECK-DAG:           %[[CST_17:.*]] = llvm.mlir.constant(17 : i32) : i32
     // CHECK-DAG:           %[[CST_18:.*]] = llvm.mlir.constant(18 : i32) : i32
     // CHECK-DAG:           %[[CST_19:.*]] = llvm.mlir.constant(19 : i32) : i32
@@ -190,34 +191,26 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 :
     // CHECK:           %[[THREADS_ID:.*]] = llvm.call spir_funccc @_Z12get_local_idj(%[[CST_0]])
     // CHECK:           %[[THREADS_ID_32:.*]] = llvm.trunc %[[THREADS_ID]] : i64 to i32
     // CHECK:           %[[WARP_ID:.*]] = llvm.udiv %[[THREADS_ID_32]], %[[CST_16]]  : i32
-    // CHECK:           %[[LANE_ID:.*]] = llvm.urem %[[THREADS_ID_32]], %[[CST_16]]  : i32
-    // CHECK:           %[[VAL_29:.*]] = llvm.udiv %[[WARP_ID]], %[[CST_2]]  : i32
-    // CHECK:           %[[WARP_ID_X:.*]] = llvm.urem %[[VAL_29]], %[[CST_2]]  : i32
-    // CHECK:           %[[ROUNDED_WARP_ID_X:.*]] = llvm.urem %[[WARP_ID_X]], %[[CST_4]]  : i32
-    // CHECK:           %[[WARP_OFFSET_X:.*]] = llvm.mul %[[ROUNDED_WARP_ID_X]], %[[CST_8]] : i32
-    // CHECK:           %[[LANE_OFFSET_X:.*]] = llvm.udiv %[[LANE_ID]], %[[CST_16]]  : i32
-    // CHECK:           %[[OFFSET_X:.*]] = llvm.add %[[LANE_OFFSET_X]], %[[WARP_OFFSET_X]] : i32
-    // CHECK:           %[[VAL_35:.*]] = llvm.udiv %[[CST_0]], %[[CST_1]]  : i32
-    // CHECK:           %[[VAL_36:.*]] = llvm.urem %[[VAL_35]], %[[CST_1]]  : i32
-    // CHECK:           %[[VAL_37:.*]] = llvm.urem %[[VAL_36]], %[[CST_1]]  : i32
-    // CHECK:           %[[CTA_OFFSET_X:.*]] = llvm.mul %[[VAL_37]], %[[CST_32]] : i32
-    // CHECK:           %[[VAL_39:.*]] = llvm.add %[[OFFSET_X]], %[[CTA_OFFSET_X]] : i32
-    // CHECK:           %[[OFFSET_X_0:.*]] = llvm.add %[[VAL_39]], %[[CST_0]] : i32
-    // CHECK:           %[[OFFSET_X_1:.*]] = llvm.add %[[VAL_39]], %[[CST_1]] : i32
-    // CHECK:           %[[OFFSET_X_2:.*]] = llvm.add %[[VAL_39]], %[[CST_2]] : i32
-    // CHECK:           %[[OFFSET_X_3:.*]] = llvm.add %[[VAL_39]], %[[CST_3]] : i32
-    // CHECK:           %[[OFFSET_X_4:.*]] = llvm.add %[[VAL_39]], %[[CST_4]] : i32
-    // CHECK:           %[[OFFSET_X_5:.*]] = llvm.add %[[VAL_39]], %[[CST_5]] : i32
-    // CHECK:           %[[OFFSET_X_6:.*]] = llvm.add %[[VAL_39]], %[[CST_6]] : i32
-    // CHECK:           %[[OFFSET_X_7:.*]] = llvm.add %[[VAL_39]], %[[CST_7]] : i32
-    // CHECK:           %[[OFFSET_X_8:.*]] = llvm.add %[[VAL_39]], %[[CST_16]] : i32
-    // CHECK:           %[[OFFSET_X_9:.*]] = llvm.add %[[VAL_39]], %[[CST_17]] : i32
-    // CHECK:           %[[OFFSET_X_10:.*]] = llvm.add %[[VAL_39]], %[[CST_18]] : i32
-    // CHECK:           %[[OFFSET_X_11:.*]] = llvm.add %[[VAL_39]], %[[CST_19]] : i32
-    // CHECK:           %[[OFFSET_X_12:.*]] = llvm.add %[[VAL_39]], %[[CST_20]] : i32
-    // CHECK:           %[[OFFSET_X_13:.*]] = llvm.add %[[VAL_39]], %[[CST_21]] : i32
-    // CHECK:           %[[OFFSET_X_14:.*]] = llvm.add %[[VAL_39]], %[[CST_22]] : i32
-    // CHECK:           %[[OFFSET_X_15:.*]] = llvm.add %[[VAL_39]], %[[CST_23]] : i32
+    // CHECK:           %[[VAL_26:.*]] = llvm.and %[[WARP_ID]], %[[CST_2]] : i32
+    // CHECK:           %[[VAL_27:.*]] = llvm.icmp "eq" %[[VAL_26]], %[[CST_0]] : i32
+    // CHECK:           %[[VAL_28:.*]] = llvm.select %[[VAL_27]], %[[CST_0]], %[[CST_8]] : i1, i32
+    // CHECK:           %[[VAL_29:.*]] = llvm.xor %[[CST_0]], %[[VAL_28]] : i32
+    // CHECK:           %[[OFFSET_X_0:.*]] = llvm.xor %[[VAL_29]], %[[CST_0]] : i32
+    // CHECK:           %[[OFFSET_X_1:.*]] = llvm.xor %[[VAL_29]], %[[CST_1]] : i32
+    // CHECK:           %[[OFFSET_X_2:.*]] = llvm.xor %[[VAL_29]], %[[CST_2]] : i32
+    // CHECK:           %[[OFFSET_X_3:.*]] = llvm.xor %[[VAL_29]], %[[CST_3]] : i32
+    // CHECK:           %[[OFFSET_X_4:.*]] = llvm.xor %[[VAL_29]], %[[CST_4]] : i32
+    // CHECK:           %[[OFFSET_X_5:.*]] = llvm.xor %[[VAL_29]], %[[CST_5]] : i32
+    // CHECK:           %[[OFFSET_X_6:.*]] = llvm.xor %[[VAL_29]], %[[CST_6]] : i32
+    // CHECK:           %[[OFFSET_X_7:.*]] = llvm.xor %[[VAL_29]], %[[CST_7]] : i32
+    // CHECK:           %[[OFFSET_X_8:.*]] = llvm.xor %[[VAL_29]], %[[CST_16]] : i32
+    // CHECK:           %[[OFFSET_X_9:.*]] = llvm.xor %[[VAL_29]], %[[CST_17]] : i32
+    // CHECK:           %[[OFFSET_X_10:.*]] = llvm.xor %[[VAL_29]], %[[CST_18]] : i32
+    // CHECK:           %[[OFFSET_X_11:.*]] = llvm.xor %[[VAL_29]], %[[CST_19]] : i32
+    // CHECK:           %[[OFFSET_X_12:.*]] = llvm.xor %[[VAL_29]], %[[CST_20]] : i32
+    // CHECK:           %[[OFFSET_X_13:.*]] = llvm.xor %[[VAL_29]], %[[CST_21]] : i32
+    // CHECK:           %[[OFFSET_X_14:.*]] = llvm.xor %[[VAL_29]], %[[CST_22]] : i32
+    // CHECK:           %[[OFFSET_X_15:.*]] = llvm.xor %[[VAL_29]], %[[CST_23]] : i32
     // CHECK:           %[[VAL_56:.*]] = llvm.call @_Z18__spirv_ocl_printf({{.*}}, {{.*}}, {{.*}}, {{.*}}, %[[OFFSET_X_0]], {{.*}}, {{.*}})
     // CHECK:           %[[VAL_57:.*]] = llvm.call @_Z18__spirv_ocl_printf({{.*}}, {{.*}}, {{.*}}, {{.*}}, %[[OFFSET_X_1]], {{.*}}, {{.*}})
     // CHECK:           %[[VAL_58:.*]] = llvm.call @_Z18__spirv_ocl_printf({{.*}}, {{.*}}, {{.*}}, {{.*}}, %[[OFFSET_X_2]], {{.*}}, {{.*}})
diff --git a/third_party/intel/lib/Dialect/TritonIntelGPU/IR/LinearLayoutConversions.cpp b/third_party/intel/lib/Dialect/TritonIntelGPU/IR/LinearLayoutConversions.cpp
index 4ee77e934d..6b902003fb 100644
--- a/third_party/intel/lib/Dialect/TritonIntelGPU/IR/LinearLayoutConversions.cpp
+++ b/third_party/intel/lib/Dialect/TritonIntelGPU/IR/LinearLayoutConversions.cpp
@@ -341,38 +341,44 @@ LinearLayout combineCtaCgaWithShape(LinearLayout ctaLayout,
 
 } // anonymous namespace
 
+// clang-format off
 // The layout example repeat_count=8, systolic_depth=8,
 // execution_size=16 and operands_per_chan=2 for warp size 32.
 // For A operand:
-//                   systolic depth = 8
-//<----------------------------------------------------->
-// opsPerChan=2
-//<--------->
-// t0  ...  t0   t1  ... t1  ~  t6  ... t6  t7  ... t7   ^
-// t8  ...  t8   t9  ... t9  ~ t14 ... t14 t15 ... t15   |
-// t16 ...  t16  t17 ... t17 ~ t22 ... t22 t23 ... t23   |
-// t24 ...  t24  t25 ... t25 ~ t30 ... t30 t31 ... t31   | repeat count <= 8
-// t0  ...  t0   t1  ... t1  ~ t6  ... t6  t7  ... t7    |
-// t8  ...  t8   t9  ... t9  ~ t14 ... t14 t15 ... t15   |
-// t16 ...  t16  t17 ... t17 ~ t22 ... t22 t23 ... t23   |
-// t24 ...  t24  t25 ... t25 ~ t30 ... t30 t31 ... t31   v
+//                       K = 16 (K = systolic depth * opsPerChan)
+// <---------------------------------------------------------------------------->
+// t0   t1   t2   t3   t4   t5   t6   t7   t8   t9   t10  t11  t12  t13  t14  t15   ^
+// t16  t17  t18  t19  t20  t21  t22  t23  t24  t25  t26  t27  t28  t29  t30  t31   |
+// t0   t1   t2   t3   t4   t5   t6   t7   t8   t9   t10  t11  t12  t13  t14  t15   |
+// t16  t17  t18  t19  t20  t21  t22  t23  t24  t25  t26  t27  t28  t29  t30  t31   |
+// t0   t1   t2   t3   t4   t5   t6   t7   t8   t9   t10  t11  t12  t13  t14  t15   | M = 8 (repeat count)
+// t16  t17  t18  t19  t20  t21  t22  t23  t24  t25  t26  t27  t28  t29  t30  t31   |
+// t0   t1   t2   t3   t4   t5   t6   t7   t8   t9   t10  t11  t12  t13  t14  t15   |
+// t16  t17  t18  t19  t20  t21  t22  t23  t24  t25  t26  t27  t28  t29  t30  t31   v
 // In this case, the LinearLayout bases are:
-// Register:  {{0,1}, {4,0}}
-// Lane:      {{0,2}, {0,4}, {0,8}, {1,0}, {2,0}}
+// Register:  {{2,0}, {4,0}}
+// Lane:      {{0,1}, {0,2}, {0,4}, {0,8}, {1,0}}
+// clang-format on
 std::vector<std::vector<int32_t>> DPASRegBasesA(int opsPerChannel,
                                                 int repeatCount,
                                                 int threadsPerWarp,
                                                 int systolicDepth) {
-  int rowPerWarp = threadsPerWarp / systolicDepth;
-  int warpRepeats = repeatCount / rowPerWarp;
   std::vector<std::vector<int32_t>> regBases;
 
-  for (int opc = 1; opc < opsPerChannel; opc *= 2) {
+  // Pack values into i16 when the scalar bit width is <= 16.
+  assert((opsPerChannel == 4 || opsPerChannel == 2 || opsPerChannel == 1) &&
+         "invalid opsPerChannel number.");
+  int packedOpsPerLane = opsPerChannel == 4 ? 2 : 1;
+  int packedColNum = (systolicDepth * opsPerChannel) / packedOpsPerLane;
+  int rowsPerWarp = mlir::ceil<int>(threadsPerWarp, packedColNum);
+  int warpRepeats = repeatCount / rowsPerWarp;
+
+  for (int opc = 1; opc < packedOpsPerLane; opc *= 2) {
     regBases.push_back({0, opc});
   }
 
   for (int warp = 1; warp < warpRepeats; warp *= 2) {
-    regBases.push_back({warp * rowPerWarp, 0});
+    regBases.push_back({warp * rowsPerWarp, 0});
   }
 
   return regBases;
@@ -382,11 +388,17 @@ std::vector<std::vector<int32_t>>
 DPASLaneBasesA(int opsPerChannel, int threadsPerWarp, int systolicDepth) {
   std::vector<std::vector<int32_t>> laneBases;
 
-  for (int tid = 1; tid < systolicDepth; tid *= 2) {
-    laneBases.push_back({0, opsPerChannel * tid});
+  // Pack values into i16 when the scalar bit width is <= 16.
+  assert((opsPerChannel == 4 || opsPerChannel == 2 || opsPerChannel == 1) &&
+         "invalid opsPerChannel number.");
+  int packedOpsPerLane = opsPerChannel == 4 ? 2 : 1;
+  int packedColNum = (systolicDepth * opsPerChannel) / packedOpsPerLane;
+
+  for (int tid = 1; tid < packedColNum; tid *= 2) {
+    laneBases.push_back({0, packedOpsPerLane * tid});
   }
-  for (int tid = systolicDepth; tid < threadsPerWarp; tid *= 2) {
-    laneBases.push_back({tid / systolicDepth, 0});
+  for (int tid = packedColNum; tid < threadsPerWarp; tid *= 2) {
+    laneBases.push_back({tid / packedColNum, 0});
   }
 
   return laneBases;
@@ -602,8 +614,7 @@ std::optional<LinearLayout>
 dotOperandDpasToLinearLayout(DotOperandEncodingAttr dotDpasLayout,
                              ArrayRef<int64_t> shape) {
   auto dpasLayout = cast<intel::DpasEncodingAttr>(dotDpasLayout.getParent());
-  if (dotDpasLayout.getOpIdx() == 0)
-    return std::nullopt;
+
   return DPAStoLinearLayout(shape, dpasLayout, dotDpasLayout.getOpIdx());
 }
 
diff --git a/third_party/intel/unittest/Dialect/TritonGPU/DPAStoLinearLayoutTest.cpp b/third_party/intel/unittest/Dialect/TritonGPU/DPAStoLinearLayoutTest.cpp
index 6d42c9948a..d4f6d0b821 100644
--- a/third_party/intel/unittest/Dialect/TritonGPU/DPAStoLinearLayoutTest.cpp
+++ b/third_party/intel/unittest/Dialect/TritonGPU/DPAStoLinearLayoutTest.cpp
@@ -59,17 +59,47 @@ TEST_F(DPAStoLinearLayoutTest, DPAS_perInst) {
                 },
                 {S("dim0"), S("dim1")}));
   // Test Operand A (opIdx=0)
+  EXPECT_EQ(
+      DPAStoLinearLayout({8, 32}, dpas({1, 1}, 8, 8, 16, 4, {1, 1}, 32), 0),
+      LinearLayout(
+          {
+              {S("register"), {{0, 1}, {2, 0}, {4, 0}}},
+              {S("lane"), {{0, 2}, {0, 4}, {0, 8}, {0, 16}, {1, 0}}},
+              {S("warp"), {}},
+              {S("block"), {}},
+          },
+          {S("dim0"), S("dim1")}));
   EXPECT_EQ(
       DPAStoLinearLayout({8, 16}, dpas({1, 1}, 8, 8, 16, 2, {1, 1}, 32), 0),
       LinearLayout(
           {
-              {S("register"), {{0, 1}, {4, 0}}},
-              {S("lane"), {{0, 2}, {0, 4}, {0, 8}, {1, 0}, {2, 0}}},
+              {S("register"), {{2, 0}, {4, 0}}},
+              {S("lane"), {{0, 1}, {0, 2}, {0, 4}, {0, 8}, {1, 0}}},
+              {S("warp"), {}},
+              {S("block"), {}},
+          },
+          {S("dim0"), S("dim1")}));
+  EXPECT_EQ(
+      DPAStoLinearLayout({8, 8}, dpas({1, 1}, 8, 8, 16, 1, {1, 1}, 32), 0),
+      LinearLayout(
+          {
+              {S("register"), {{4, 0}}},
+              {S("lane"), {{0, 1}, {0, 2}, {0, 4}, {1, 0}, {2, 0}}},
               {S("warp"), {}},
               {S("block"), {}},
           },
           {S("dim0"), S("dim1")}));
   // Test Operand B (opIdx=1)
+  EXPECT_EQ(
+      DPAStoLinearLayout({32, 16}, dpas({1, 1}, 8, 8, 16, 4, {1, 1}, 32), 1),
+      LinearLayout(
+          {
+              {S("register"), {{1, 0}, {2, 0}, {8, 0}, {16, 0}}},
+              {S("lane"), {{0, 1}, {0, 2}, {0, 4}, {0, 8}, {4, 0}}},
+              {S("warp"), {}},
+              {S("block"), {}},
+          },
+          {S("dim0"), S("dim1")}));
   EXPECT_EQ(
       DPAStoLinearLayout({16, 16}, dpas({1, 1}, 8, 8, 16, 2, {1, 1}, 32), 1),
       LinearLayout(
@@ -80,6 +110,16 @@ TEST_F(DPAStoLinearLayoutTest, DPAS_perInst) {
               {S("block"), {}},
           },
           {S("dim0"), S("dim1")}));
+  EXPECT_EQ(
+      DPAStoLinearLayout({8, 16}, dpas({1, 1}, 8, 8, 16, 1, {1, 1}, 32), 1),
+      LinearLayout(
+          {
+              {S("register"), {{2, 0}, {4, 0}}},
+              {S("lane"), {{0, 1}, {0, 2}, {0, 4}, {0, 8}, {1, 0}}},
+              {S("warp"), {}},
+              {S("block"), {}},
+          },
+          {S("dim0"), S("dim1")}));
 }
 
 TEST_F(DPAStoLinearLayoutTest, DPAS_withRepCluster) {
@@ -98,8 +138,8 @@ TEST_F(DPAStoLinearLayoutTest, DPAS_withRepCluster) {
       DPAStoLinearLayout({32, 16}, dpas({1, 1}, 8, 8, 16, 2, {4, 2}, 32), 0),
       LinearLayout(
           {
-              {S("register"), {{0, 1}, {4, 0}, {8, 0}, {16, 0}}},
-              {S("lane"), {{0, 2}, {0, 4}, {0, 8}, {1, 0}, {2, 0}}},
+              {S("register"), {{2, 0}, {4, 0}, {8, 0}, {16, 0}}},
+              {S("lane"), {{0, 1}, {0, 2}, {0, 4}, {0, 8}, {1, 0}}},
               {S("warp"), {}},
               {S("block"), {}},
           },
@@ -154,8 +194,8 @@ TEST_F(DPAStoLinearLayoutTest, DPAS_withWarpOperandA) {
       LinearLayout(
           {
               {S("register"),
-               {{0, 1}, {4, 0}, {8, 0}, {16, 0}, {0, 16}, {0, 32}}},
-              {S("lane"), {{0, 2}, {0, 4}, {0, 8}, {1, 0}, {2, 0}}},
+               {{2, 0}, {4, 0}, {8, 0}, {16, 0}, {0, 16}, {0, 32}}},
+              {S("lane"), {{0, 1}, {0, 2}, {0, 4}, {0, 8}, {1, 0}}},
               {S("warp"), {{0, 0}, {32, 0}}},
               {S("block"), {}},
           },
diff --git a/third_party/proton/dialect/CMakeLists.txt b/third_party/proton/dialect/CMakeLists.txt
new file mode 100644
index 0000000000..c7b5413a0e
--- /dev/null
+++ b/third_party/proton/dialect/CMakeLists.txt
@@ -0,0 +1,7 @@
+include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include)
+include_directories(${CMAKE_CURRENT_BINARY_DIR}/include)
+add_subdirectory(include)
+add_subdirectory(lib)
+if(TRITON_BUILD_PYTHON_MODULE)
+  add_triton_plugin(TritonProton ${CMAKE_CURRENT_SOURCE_DIR}/triton_proton.cc LINK_LIBS ProtonIR)
+endif()
diff --git a/third_party/proton/dialect/include/CMakeLists.txt b/third_party/proton/dialect/include/CMakeLists.txt
new file mode 100644
index 0000000000..0ca0f41c5a
--- /dev/null
+++ b/third_party/proton/dialect/include/CMakeLists.txt
@@ -0,0 +1 @@
+add_subdirectory(Dialect)
diff --git a/third_party/proton/dialect/include/Dialect/CMakeLists.txt b/third_party/proton/dialect/include/Dialect/CMakeLists.txt
new file mode 100644
index 0000000000..f18c30ba1a
--- /dev/null
+++ b/third_party/proton/dialect/include/Dialect/CMakeLists.txt
@@ -0,0 +1 @@
+add_subdirectory(Proton)
diff --git a/third_party/proton/dialect/include/Dialect/Proton/CMakeLists.txt b/third_party/proton/dialect/include/Dialect/Proton/CMakeLists.txt
new file mode 100644
index 0000000000..f33061b2d8
--- /dev/null
+++ b/third_party/proton/dialect/include/Dialect/Proton/CMakeLists.txt
@@ -0,0 +1 @@
+add_subdirectory(IR)
diff --git a/third_party/proton/dialect/include/Dialect/Proton/IR/CMakeLists.txt b/third_party/proton/dialect/include/Dialect/Proton/IR/CMakeLists.txt
new file mode 100644
index 0000000000..4645b0ebcd
--- /dev/null
+++ b/third_party/proton/dialect/include/Dialect/Proton/IR/CMakeLists.txt
@@ -0,0 +1,18 @@
+set(MLIR_BINARY_DIR ${CMAKE_BINARY_DIR})
+
+set(LLVM_TARGET_DEFINITIONS ProtonOps.td)
+mlir_tablegen(Dialect.h.inc -gen-dialect-decls -dialect=proton)
+mlir_tablegen(Dialect.cpp.inc -gen-dialect-defs -dialect=proton)
+mlir_tablegen(OpsConversions.inc -gen-llvmir-conversions)
+mlir_tablegen(Ops.h.inc -gen-op-decls)
+mlir_tablegen(Ops.cpp.inc -gen-op-defs)
+mlir_tablegen(OpsEnums.h.inc -gen-enum-decls)
+mlir_tablegen(OpsEnums.cpp.inc -gen-enum-defs)
+add_mlir_doc(ProtonDialect ProtonDialect dialects/ -gen-dialect-doc)
+add_mlir_doc(ProtonOps ProtonOps dialects/ -gen-op-doc)
+add_public_tablegen_target(ProtonTableGen)
+
+set(LLVM_TARGET_DEFINITIONS ProtonAttrDefs.td)
+mlir_tablegen(ProtonAttrDefs.h.inc -gen-attrdef-decls)
+mlir_tablegen(ProtonAttrDefs.cpp.inc -gen-attrdef-defs)
+add_public_tablegen_target(ProtonAttrDefsIncGen)
diff --git a/third_party/proton/dialect/include/Dialect/Proton/IR/Dialect.h b/third_party/proton/dialect/include/Dialect/Proton/IR/Dialect.h
new file mode 100644
index 0000000000..680a205f08
--- /dev/null
+++ b/third_party/proton/dialect/include/Dialect/Proton/IR/Dialect.h
@@ -0,0 +1,23 @@
+#ifndef TRITON_DIALECT_PROTON_IR_DIALECT_H_
+#define TRITON_DIALECT_PROTON_IR_DIALECT_H_
+
+#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "mlir/IR/BuiltinOps.h"
+#include "mlir/IR/Dialect.h"
+#include "mlir/IR/PatternMatch.h"
+#include "proton/dialect/include/Dialect/Proton/IR/Dialect.h.inc"
+#include "proton/dialect/include/Dialect/Proton/IR/OpsEnums.h.inc"
+
+#define GET_ATTRDEF_CLASSES
+#include "proton/dialect/include/Dialect/Proton/IR/ProtonAttrDefs.h.inc"
+
+#define GET_OP_CLASSES
+#include "proton/dialect/include/Dialect/Proton/IR/Ops.h.inc"
+
+namespace mlir {
+namespace triton {
+namespace proton {} // namespace proton
+} // namespace triton
+} // namespace mlir
+
+#endif // TRITON_DIALECT_PROTON_IR_DIALECT_H_
diff --git a/third_party/proton/dialect/include/Dialect/Proton/IR/ProtonAttrDefs.td b/third_party/proton/dialect/include/Dialect/Proton/IR/ProtonAttrDefs.td
new file mode 100644
index 0000000000..d469fbb35f
--- /dev/null
+++ b/third_party/proton/dialect/include/Dialect/Proton/IR/ProtonAttrDefs.td
@@ -0,0 +1,12 @@
+#ifndef PROTON_ATTRDEFS
+#define PROTON_ATTRDEFS
+
+include "mlir/IR/AttrTypeBase.td"
+include "ProtonDialect.td"
+
+class Proton_Attr<string name, list<Trait> traits = [],
+                     string baseCppClass = "::mlir::Attribute">
+  : AttrDef<Proton_Dialect, name, traits, baseCppClass> {
+}
+
+#endif // PROTON_ATTRDEFS
diff --git a/third_party/proton/dialect/include/Dialect/Proton/IR/ProtonDialect.td b/third_party/proton/dialect/include/Dialect/Proton/IR/ProtonDialect.td
new file mode 100644
index 0000000000..245f2e09a2
--- /dev/null
+++ b/third_party/proton/dialect/include/Dialect/Proton/IR/ProtonDialect.td
@@ -0,0 +1,18 @@
+#ifndef PROTON_DIALECT
+#define PROTON_DIALECT
+
+include "mlir/IR/OpBase.td"
+
+def Proton_Dialect : Dialect {
+  let name = "proton";
+  let cppNamespace = "::mlir::triton::proton";
+
+  let description = [{
+    Proton Dialect provides core ops for building third-party compiler-based
+    performance profiling and analysis tools.
+  }];
+
+  let dependentDialects = [];
+}
+
+#endif // PROTON_DIALECT
diff --git a/third_party/proton/dialect/include/Dialect/Proton/IR/ProtonOps.td b/third_party/proton/dialect/include/Dialect/Proton/IR/ProtonOps.td
new file mode 100644
index 0000000000..d18a48d5d1
--- /dev/null
+++ b/third_party/proton/dialect/include/Dialect/Proton/IR/ProtonOps.td
@@ -0,0 +1,65 @@
+#ifndef PROTON_OPS
+#define PROTON_OPS
+
+include "mlir/IR/OpBase.td"
+include "mlir/IR/EnumAttr.td"
+include "triton/Dialect/Triton/IR/TritonTypes.td"
+include "mlir/Dialect/LLVMIR/LLVMOpBase.td"
+include "mlir/Interfaces/InferTypeOpInterface.td"
+include "mlir/Interfaces/SideEffectInterfaces.td"
+include "triton/Dialect/Triton/IR/TritonInterfaces.td"
+include "ProtonDialect.td"
+include "ProtonAttrDefs.td"
+
+class TT_Proton_Op<string mnemonic, list<Trait> traits = []> :
+    Op<Proton_Dialect, mnemonic, !listconcat(traits, [])> {
+}
+
+// Proton profiling metric.
+def MetricAttr : I32EnumAttr<
+    "Metric", "",
+    [
+        I32EnumAttrCase<"CYCLE", 0, "cycle">,
+    ]> {
+    let cppNamespace = "::mlir::triton::proton";
+}
+
+// Proton profiling granularity.
+def GranularityAttr : I32EnumAttr<
+    "Granularity", "",
+    [
+        I32EnumAttrCase<"WARPGROUP", 0, "warpgroup">,
+        I32EnumAttrCase<"WARP", 1, "warp">,
+    ]> {
+    let cppNamespace = "::mlir::triton::proton";
+}
+
+def TT_RecordOp : TT_Proton_Op<"record", [DeclareOpInterfaceMethods<MemoryEffectsOpInterface>]> {
+  let summary = "Record a GPU hardware event";
+
+  let description = [{
+    The operator records GPU events from performance counters.
+    Currently only cycle counter is supported.
+
+    Example:
+
+    ```mlir
+    proton.record() {isStart = true, regionId = 4 : i32}
+    ...
+    proton.record() {isStart = false, regionId = 4 : i32}
+    ...
+    proton.record() {isStart = true, regionId = 1 : i32, granularity = 1 : i32}
+    ...
+    proton.record() {isStart = false, regionId = 1 : i32, granularity = 1 : i32}
+    ```
+  }];
+  let arguments = (
+    ins BoolAttr: $isStart,
+    ConfinedAttr<I32Attr, [IntNonNegative]>:$regionId,
+    DefaultValuedAttr<MetricAttr, "Metric::CYCLE">:$metric,
+    DefaultValuedAttr<GranularityAttr, "Granularity::WARPGROUP">:$granularity
+  );
+  let assemblyFormat = " `(` operands `)` attr-dict";
+}
+
+#endif // PROTON_OPS
diff --git a/third_party/proton/dialect/lib/CMakeLists.txt b/third_party/proton/dialect/lib/CMakeLists.txt
new file mode 100644
index 0000000000..0ca0f41c5a
--- /dev/null
+++ b/third_party/proton/dialect/lib/CMakeLists.txt
@@ -0,0 +1 @@
+add_subdirectory(Dialect)
diff --git a/third_party/proton/dialect/lib/Dialect/CMakeLists.txt b/third_party/proton/dialect/lib/Dialect/CMakeLists.txt
new file mode 100644
index 0000000000..f18c30ba1a
--- /dev/null
+++ b/third_party/proton/dialect/lib/Dialect/CMakeLists.txt
@@ -0,0 +1 @@
+add_subdirectory(Proton)
diff --git a/third_party/proton/dialect/lib/Dialect/Proton/CMakeLists.txt b/third_party/proton/dialect/lib/Dialect/Proton/CMakeLists.txt
new file mode 100644
index 0000000000..f33061b2d8
--- /dev/null
+++ b/third_party/proton/dialect/lib/Dialect/Proton/CMakeLists.txt
@@ -0,0 +1 @@
+add_subdirectory(IR)
diff --git a/third_party/proton/dialect/lib/Dialect/Proton/IR/CMakeLists.txt b/third_party/proton/dialect/lib/Dialect/Proton/IR/CMakeLists.txt
new file mode 100644
index 0000000000..5eea5cb3cf
--- /dev/null
+++ b/third_party/proton/dialect/lib/Dialect/Proton/IR/CMakeLists.txt
@@ -0,0 +1,13 @@
+add_triton_library(ProtonIR
+  Dialect.cpp
+  Ops.cpp
+
+  DEPENDS
+  ProtonTableGen
+  ProtonAttrDefsIncGen
+
+  LINK_LIBS PUBLIC
+  MLIRLLVMDialect
+  TritonIR
+  TritonGPUIR
+)
diff --git a/third_party/proton/dialect/lib/Dialect/Proton/IR/Dialect.cpp b/third_party/proton/dialect/lib/Dialect/Proton/IR/Dialect.cpp
new file mode 100644
index 0000000000..60c2852654
--- /dev/null
+++ b/third_party/proton/dialect/lib/Dialect/Proton/IR/Dialect.cpp
@@ -0,0 +1,25 @@
+#include "mlir/IR/DialectImplementation.h"
+#include "mlir/IR/OpImplementation.h"
+
+// clang-format off
+#include "Dialect/Proton/IR/Dialect.h"
+#include "Dialect/Proton/IR/Dialect.cpp.inc"
+// clang-format on
+
+using namespace mlir;
+using namespace mlir::triton::proton;
+
+void mlir::triton::proton::ProtonDialect::initialize() {
+  addAttributes<
+#define GET_ATTRDEF_LIST
+#include "Dialect/Proton/IR/ProtonAttrDefs.cpp.inc"
+      >();
+
+  addOperations<
+#define GET_OP_LIST
+#include "Dialect/Proton/IR/Ops.cpp.inc"
+      >();
+}
+
+#define GET_ATTRDEF_CLASSES
+#include "Dialect/Proton/IR/ProtonAttrDefs.cpp.inc"
diff --git a/third_party/proton/dialect/lib/Dialect/Proton/IR/Ops.cpp b/third_party/proton/dialect/lib/Dialect/Proton/IR/Ops.cpp
new file mode 100644
index 0000000000..1a0799aea1
--- /dev/null
+++ b/third_party/proton/dialect/lib/Dialect/Proton/IR/Ops.cpp
@@ -0,0 +1,33 @@
+#include "Dialect/Proton/IR/Dialect.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/BuiltinAttributes.h"
+#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/OperationSupport.h"
+#include "mlir/Interfaces/FunctionImplementation.h"
+#include "mlir/Interfaces/FunctionInterfaces.h"
+#include "mlir/Support/LLVM.h"
+#include "triton/Dialect/Triton/IR/Dialect.h"
+#include "triton/Dialect/Triton/IR/Types.h"
+#include "triton/Dialect/Triton/IR/Utility.h"
+
+#define GET_OP_CLASSES
+#include "Dialect/Proton/IR/Ops.cpp.inc"
+#include "Dialect/Proton/IR/OpsEnums.cpp.inc"
+
+namespace mlir {
+namespace triton {
+namespace proton {
+
+// -- RecordOp: declares Write and Read effects on the default resource --
+void RecordOp::getEffects(
+    SmallVectorImpl<SideEffects::EffectInstance<MemoryEffects::Effect>>
+        &effects) {
+  effects.emplace_back(MemoryEffects::Write::get(),
+                       SideEffects::DefaultResource::get());
+  effects.emplace_back(MemoryEffects::Read::get(),
+                       SideEffects::DefaultResource::get());
+}
+
+} // namespace proton
+} // namespace triton
+} // namespace mlir
diff --git a/third_party/proton/dialect/triton_proton.cc b/third_party/proton/dialect/triton_proton.cc
new file mode 100644
index 0000000000..8046539794
--- /dev/null
+++ b/third_party/proton/dialect/triton_proton.cc
@@ -0,0 +1,20 @@
+#include "Dialect/Proton/IR/Dialect.h"
+#include "mlir/Pass/PassManager.h"
+#include "passes.h"
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <pybind11/stl_bind.h>
+
+namespace py = pybind11;
+
+void init_triton_proton(py::module &&m) {
+  auto passes = m.def_submodule("passes");
+
+  // Register the Proton dialect, then load all available dialects.
+  m.def("load_dialects", [](mlir::MLIRContext &context) {
+    mlir::DialectRegistry registry;
+    registry.insert<mlir::triton::proton::ProtonDialect>();
+    context.appendDialectRegistry(registry);
+    context.loadAllAvailableDialects();
+  });
+}