Merge commit '100e2aaca903ed99564242f933198a6c221d3b50'

intel · Jun 2, 2024 · 0cdcffa · 0cdcffa
2 parents c37ca9c + 100e2aa
commit 0cdcffa
Show file tree

Hide file tree

Showing 32 changed files with 653 additions and 261 deletions.
diff --git a/.github/workflows/integration-tests.yml b/.github/workflows/integration-tests.yml
@@ -32,6 +32,7 @@ jobs:
     outputs:
       matrix-CUDA: ${{ steps.set-matrix.outputs.matrix-CUDA }}
       matrix-HIP: ${{ steps.set-matrix.outputs.matrix-HIP }}
+      matrix-MACOS: ${{ steps.set-matrix.outputs.matrix-MACOS }}
     steps:
       - name: Decide pre-submit integration test enablement
         # Always enable integration tests for pre-submit pull requests.
@@ -106,9 +107,11 @@ jobs:
           if [ x"${{ github.repository }}" == x"triton-lang/triton" ]; then
             echo '::set-output name=matrix-CUDA::[["self-hosted", "A100"], ["self-hosted", "H100"]]'
             echo '::set-output name=matrix-HIP::[["self-hosted", "gfx90a"]]'
+            echo '::set-output name=matrix-MACOS::[["macos-latest"]]'
           else
             echo '::set-output name=matrix-CUDA::["ubuntu-latest"]'
             echo '::set-output name=matrix-HIP::["ubuntu-latest"]'
+            echo '::set-output name=matrix-MACOS::[["macos-latest"]]'
           fi
   pre-commit:
     name: pre-commit (code formatting)
@@ -165,6 +168,7 @@ jobs:
           echo "llvm=$(cat cmake/llvm-hash.txt | cut -c 1-8)" >> $GITHUB_OUTPUT
           echo "pybind11=$(cat cmake/pybind11-version.txt)" >> $GITHUB_OUTPUT
           echo "nvidia=$(cat cmake/nvidia-toolchain-version.txt)" >> $GITHUB_OUTPUT
+          echo "json=$(cat cmake/json-version.txt)" >> $GITHUB_OUTPUT
           echo "datetime=$(date -u -Iseconds)" >> $GITHUB_OUTPUT
         shell: bash
       - name: Cache build dependencies
@@ -176,7 +180,8 @@ jobs:
             ~/.triton/llvm
             ~/.triton/nvidia
             ~/.triton/pybind11
-          key: ${{ runner.os }}-${{ runner.arch }}-llvm-${{ steps.cache-key.outputs.llvm }}-nvidia-${{ steps.cache-key.outputs.nvidia }}-pybind11-${{ steps.cache-key.outputs.pybind11 }}
+            ~/.triton/json
+          key: ${{ runner.os }}-${{ runner.arch }}-llvm-${{ steps.cache-key.outputs.llvm }}-nvidia-${{ steps.cache-key.outputs.nvidia }}-pybind11-${{ steps.cache-key.outputs.pybind11 }}-json-${{ steps.cache-key.outputs.json }}
       - # Cache ~/.triton/cache because the vast majority of unit test time is
         # spent compiling.  Triton won't (well, should not) use these cached files
         # if something internal to Triton changes, because Triton's internal
@@ -301,6 +306,7 @@ jobs:
           echo "llvm=$(cat cmake/llvm-hash.txt | cut -c 1-8)" >> $GITHUB_OUTPUT
           echo "pybind11=$(cat cmake/pybind11-version.txt)" >> $GITHUB_OUTPUT
           echo "nvidia=$(cat cmake/nvidia-toolchain-version.txt)" >> $GITHUB_OUTPUT
+          echo "json=$(cat cmake/json-version.txt)" >> $GITHUB_OUTPUT
           echo "datetime=$(date -u -Iseconds)" >> $GITHUB_OUTPUT
         shell: bash
       - name: Cache build dependencies
@@ -312,7 +318,8 @@ jobs:
             ~/.triton/llvm
             ~/.triton/nvidia
             ~/.triton/pybind11
-          key: ${{ runner.os }}-${{ runner.arch }}-llvm-${{ steps.cache-key.outputs.llvm }}-nvidia-${{ steps.cache-key.outputs.nvidia }}-pybind11-${{ steps.cache-key.outputs.pybind11 }}
+            ~/.triton/json
+          key: ${{ runner.os }}-${{ runner.arch }}-llvm-${{ steps.cache-key.outputs.llvm }}-nvidia-${{ steps.cache-key.outputs.nvidia }}-pybind11-${{ steps.cache-key.outputs.pybind11 }}-json-${{ steps.cache-key.outputs.json }}
       - # Cache ~/.triton/cache because the vast majority of unit test time is
         # spent compiling.  Triton won't (well, should not) use these cached files
         # if something internal to Triton changes, because Triton's internal
@@ -398,6 +405,112 @@ jobs:
           ls -alh ~/.triton
           du -sh ~/.triton/**
 
+          mkdir -p ~/.cache/ccache
+          ls -alh ~/.cache/ccache
+          du -sh ~/.cache/ccache
+  Build-Tests:
+    needs: Runner-Preparation
+    if: needs.Runner-Preparation.outputs.matrix-MACOS != ''
+    runs-on: ${{ matrix.runner }}
+    timeout-minutes: 30
+    strategy:
+      matrix:
+        runner: ${{fromJson(needs.Runner-Preparation.outputs.matrix-MACOS)}}
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          submodules: "true"
+      - name: Install brew dependencies
+        run: |
+          brew update
+          brew install ccache llvm
+      - name: Compute cache keys
+        id: cache-key
+        run: |
+          echo "llvm=$(cat cmake/llvm-hash.txt | cut -c 1-8)" >> $GITHUB_OUTPUT
+          echo "pybind11=$(cat cmake/pybind11-version.txt)" >> $GITHUB_OUTPUT
+          echo "nvidia=$(cat cmake/nvidia-toolchain-version.txt)" >> $GITHUB_OUTPUT
+          echo "json=$(cat cmake/json-version.txt)" >> $GITHUB_OUTPUT
+          echo "datetime=$(date -u -Iseconds)" >> $GITHUB_OUTPUT
+        shell: bash
+      - name: Cache build dependencies
+        uses: actions/cache@v4
+        with:
+          # Note that we cannot use environment variables here given there is
+          # no shell to interpret them in the paths.
+          path: |
+            ~/.triton/llvm
+            ~/.triton/nvidia
+            ~/.triton/pybind11
+            ~/.triton/json
+          key: ${{ runner.os }}-${{ runner.arch }}-llvm-${{ steps.cache-key.outputs.llvm }}-nvidia-${{ steps.cache-key.outputs.nvidia }}-pybind11-${{ steps.cache-key.outputs.pybind11 }}-json-${{ steps.cache-key.outputs.json }}
+      - # Cache ~/.triton/cache because the vast majority of unit test time is
+        # spent compiling.  Triton won't (well, should not) use these cached files
+        # if something internal to Triton changes, because Triton's internal
+        # source code is part of the cache key.
+        #
+        # Similarly, cache ~/.cache/ccache to speed up compilation.
+        #
+        # On branch `main` we always start from an empty cache, i.e. we skip the
+        # "restore" step.  This is to prevent the caches from accumulating stale
+        # files over time.
+        name: Restore cache of ccache and Triton compilation artifacts
+        if: github.event_name != 'push'
+        uses: actions/cache/restore@v4
+        with:
+          path: |
+            ~/.triton/cache
+            ~/.cache/ccache
+          # Restore the most recent cache entry.
+          restore-keys: triton-artifacts-${{ runner.os }}-${{ runner.arch }}-${{ runner.name }}-llvm-${{ steps.cache-key.outputs.llvm }}-
+          # We expect this cache key never to hit and for us to fall back
+          # unconditionally to the restore-key, so it doesn't actually matter
+          # what we put here (so long as it doesn't hit an existing key).
+          key: triton-artifacts-${{ runner.os }}-${{ runner.arch }}-${{ runner.name }}-llvm-${{ steps.cache-key.outputs.llvm }}-${{ steps.cache-key.outputs.datetime }}
+      - name: Inspect cache directory
+        run: |
+          mkdir -p ~/.triton
+          ls -alh ~/.triton
+      - name: Update PATH
+        run: |
+          echo "$HOME/.local/bin" >> $GITHUB_PATH
+          echo "/opt/homebrew/opt/llvm/bin" >> $GITHUB_PATH
+      - name: Install pip dependencies
+        run: |
+          python3 -m venv ~/.venv
+          source ~/.venv/bin/activate
+          python3 -m pip install --upgrade pip
+          python3 -m pip install cython setuptools wheel cmake==3.24 ninja pytest-xdist lit
+      - name: Install Triton
+        env:
+          TRITON_BUILD_WITH_CCACHE: "true"
+          TRITON_BUILD_WITH_O1: "true"
+          # macos-latest has 3 vCPUs and 7 GB of DRAM; to save memory, we limit the number of jobs to 3.
+          # https://docs.github.com/en/actions/using-github-hosted-runners/about-github-hosted-runners/about-github-hosted-runners#standard-github-hosted-runners-for-public-repositories
+          MAX_JOBS: 3
+        run: |
+          source ~/.venv/bin/activate
+          echo "PATH is '$PATH'"
+          cd python
+          python3 -m pip install --no-build-isolation .
+      - # If we're on branch `main`, save the ccache and Triton compilation artifacts
+        # to the cache so they can be used by other (non-main) CI runs.
+        #
+        # (It wouldn't be a problem to save the cache on every run, because github
+        # evicts cache entries LRU, but maybe this saves a bit of time in CI.)
+        name: Save ccache and Triton compilation artifacts to cache
+        if: github.ref == 'refs/heads/main'
+        uses: actions/cache/save@v4
+        with:
+          path: ~/.triton/cache ~/.cache/ccache
+          key: triton-artifacts-${{ runner.os }}-${{ runner.arch }}-${{ runner.name }}-llvm-${{ steps.cache-key.outputs.llvm }}-${{ steps.cache-key.outputs.datetime }}
+      - name: Inspect cache directories
+        run: |
+          mkdir -p ~/.triton
+          ls -alh ~/.triton
+          du -sh ~/.triton/**
+
           mkdir -p ~/.cache/ccache
           ls -alh ~/.cache/ccache
           du -sh ~/.cache/ccache
diff --git a/.github/workflows/integration-tests.yml.in b/.github/workflows/integration-tests.yml.in
@@ -35,6 +35,7 @@ jobs:
     outputs:
       matrix-CUDA: ${{ steps.set-matrix.outputs.matrix-CUDA }}
       matrix-HIP: ${{ steps.set-matrix.outputs.matrix-HIP }}
+      matrix-MACOS: ${{ steps.set-matrix.outputs.matrix-MACOS }}
     steps:
       - name: Decide pre-submit integration test enablement
         # Always enable integration tests for pre-submit pull requests.
@@ -114,9 +115,11 @@ jobs:
           if [ x"${{ github.repository }}" == x"triton-lang/triton" ]; then
             echo '::set-output name=matrix-CUDA::[["self-hosted", "A100"], ["self-hosted", "H100"]]'
             echo '::set-output name=matrix-HIP::[["self-hosted", "gfx90a"]]'
+            echo '::set-output name=matrix-MACOS::[["macos-latest"]]'
           else
             echo '::set-output name=matrix-CUDA::["ubuntu-latest"]'
             echo '::set-output name=matrix-HIP::["ubuntu-latest"]'
+            echo '::set-output name=matrix-MACOS::[["macos-latest"]]'
           fi
 
   pre-commit:
@@ -162,6 +165,7 @@ jobs:
         run: |
           git diff
 
+
   Integration-Tests:
     needs: Runner-Preparation
     if: needs.Runner-Preparation.outputs.matrix-CUDA != ''
@@ -186,6 +190,7 @@ jobs:
           echo "llvm=$(cat cmake/llvm-hash.txt | cut -c 1-8)" >> $GITHUB_OUTPUT
           echo "pybind11=$(cat cmake/pybind11-version.txt)" >> $GITHUB_OUTPUT
           echo "nvidia=$(cat cmake/nvidia-toolchain-version.txt)" >> $GITHUB_OUTPUT
+          echo "json=$(cat cmake/json-version.txt)" >> $GITHUB_OUTPUT
           echo "datetime=$(date -u -Iseconds)" >> $GITHUB_OUTPUT
         shell: bash
 
@@ -199,7 +204,8 @@ jobs:
             ~/.triton/llvm
             ~/.triton/nvidia
             ~/.triton/pybind11
-          key: ${{ runner.os }}-${{ runner.arch }}-llvm-${{ steps.cache-key.outputs.llvm }}-nvidia-${{ steps.cache-key.outputs.nvidia }}-pybind11-${{ steps.cache-key.outputs.pybind11 }}
+            ~/.triton/json
+          key: ${{ runner.os }}-${{ runner.arch }}-llvm-${{ steps.cache-key.outputs.llvm }}-nvidia-${{ steps.cache-key.outputs.nvidia }}-pybind11-${{ steps.cache-key.outputs.pybind11 }}-json-${{ steps.cache-key.outputs.json }}
 
       # Cache ~/.triton/cache because the vast majority of unit test time is
       # spent compiling.  Triton won't (well, should not) use these cached files
@@ -384,3 +390,52 @@ jobs:
       - *run-cpp-unittests-step
       - *save-build-artifacts-step
       - *inspect-cache-directories-step
+
+  Build-Tests:
+    needs: Runner-Preparation
+    if: needs.Runner-Preparation.outputs.matrix-MACOS != ''
+    runs-on: ${{ matrix.runner }}
+    timeout-minutes: 30
+    strategy:
+      matrix:
+        runner: ${{fromJson(needs.Runner-Preparation.outputs.matrix-MACOS)}}
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          submodules: "true"
+      - name: Install brew dependencies
+        run: |
+          brew update
+          brew install ccache llvm
+
+      - *compute-cache-keys-step
+      - *cache-build-dependencies-step
+      - *restore-build-artifacts-step
+      - *inspect-cache-directory-step
+
+      - name: Update PATH
+        run: |
+          echo "$HOME/.local/bin" >> $GITHUB_PATH
+          echo "/opt/homebrew/opt/llvm/bin" >> $GITHUB_PATH
+      - name: Install pip dependencies
+        run: |
+          python3 -m venv ~/.venv
+          source ~/.venv/bin/activate
+          python3 -m pip install --upgrade pip
+          python3 -m pip install cython setuptools wheel cmake==3.24 ninja pytest-xdist lit
+      - name: Install Triton
+        env:
+          TRITON_BUILD_WITH_CCACHE: "true"
+          TRITON_BUILD_WITH_O1: "true"
+          # macos-latest has 3 vCPUs and 7 GB of DRAM; to save memory, we limit the number of jobs to 3.
+          # https://docs.github.com/en/actions/using-github-hosted-runners/about-github-hosted-runners/about-github-hosted-runners#standard-github-hosted-runners-for-public-repositories
+          MAX_JOBS: 3
+        run: |
+          source ~/.venv/bin/activate
+          echo "PATH is '$PATH'"
+          cd python
+          python3 -m pip install --no-build-isolation .
+
+      - *save-build-artifacts-step
+      - *inspect-cache-directories-step
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -34,6 +34,8 @@ set(TRITON_CODEGEN_BACKENDS "" CACHE STRING "Enable different codegen backends")
 # Customized release build type with assertions: TritonRelBuildWithAsserts
 set(CMAKE_C_FLAGS_TRITONRELBUILDWITHASSERTS "-O2 -g")
 set(CMAKE_CXX_FLAGS_TRITONRELBUILDWITHASSERTS "-O2 -g")
+set(CMAKE_C_FLAGS_TRITONBUILDWITHO1 "-O1")
+set(CMAKE_CXX_FLAGS_TRITONBUILDWITHO1 "-O1")
 
 # Default build type
 if(NOT CMAKE_BUILD_TYPE)
@@ -265,7 +267,7 @@ if(TRITON_BUILD_PYTHON_MODULE AND NOT WIN32)
 
   # Check if the platform is MacOS
   if(APPLE)
-    set(PYTHON_LDFLAGS "-undefined dynamic_lookup -flto")
+    set(PYTHON_LDFLAGS "-undefined dynamic_lookup")
   endif()
 
   target_link_libraries(triton PRIVATE ${PYTHON_LDFLAGS})

diff --git a/cmake/json-version.txt b/cmake/json-version.txt
@@ -0,0 +1 @@
+v3.11.3