From 10b4b4a6b63ce58440133216ee1b75ad73ff07a2 Mon Sep 17 00:00:00 2001 From: Xu Zhao Date: Wed, 18 Dec 2024 23:54:12 -0500 Subject: [PATCH 01/24] Save memory to build --- .ci/tritonbench/install.sh | 3 +++ install.py | 5 ++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/.ci/tritonbench/install.sh b/.ci/tritonbench/install.sh index c30ab81c..2e4a156f 100644 --- a/.ci/tritonbench/install.sh +++ b/.ci/tritonbench/install.sh @@ -10,5 +10,8 @@ fi tritonbench_dir=$(dirname "$(readlink -f "$0")")/../.. cd ${tritonbench_dir} +# probe memory available +free -h + # Install Tritonbench and all its customized packages python install.py --all diff --git a/install.py b/install.py index e0e227a4..ab6ded70 100644 --- a/install.py +++ b/install.py @@ -67,8 +67,11 @@ def install_fa2(compile=False): def install_fa3(): FA3_PATH = REPO_PATH.joinpath("submodules", "flash-attention", "hopper") + env = os.environ.copy() + # nvcc will now spawn cicc and will cost ~1G memory + env["MAX_JOBS"] = "4" cmd = [sys.executable, "setup.py", "install"] - subprocess.check_call(cmd, cwd=str(FA3_PATH.resolve())) + subprocess.check_call(cmd, cwd=str(FA3_PATH.resolve()), env=env) def install_liger(): From 11bd39d2306e1bca2bea8a8827cf083ab44cd1db Mon Sep 17 00:00:00 2001 From: Xu Zhao Date: Wed, 18 Dec 2024 23:54:51 -0500 Subject: [PATCH 02/24] Only install fa3 --- .ci/tritonbench/install.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.ci/tritonbench/install.sh b/.ci/tritonbench/install.sh index 2e4a156f..a848238b 100644 --- a/.ci/tritonbench/install.sh +++ b/.ci/tritonbench/install.sh @@ -14,4 +14,5 @@ cd ${tritonbench_dir} free -h # Install Tritonbench and all its customized packages -python install.py --all +# Test: only install fa3 +python install.py --fa3 From 9a9a342eee199e95c1bc4c340dd85a8dec65edca Mon Sep 17 00:00:00 2001 From: Xu Zhao Date: Thu, 19 Dec 2024 00:26:53 -0500 Subject: [PATCH 03/24] Test --- .ci/tritonbench/install.sh | 6 +----- install.py | 2 +- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/.ci/tritonbench/install.sh b/.ci/tritonbench/install.sh index a848238b..c30ab81c 100644 --- a/.ci/tritonbench/install.sh +++ b/.ci/tritonbench/install.sh @@ -10,9 +10,5 @@ fi tritonbench_dir=$(dirname "$(readlink -f "$0")")/../.. cd ${tritonbench_dir} -# probe memory available -free -h - # Install Tritonbench and all its customized packages -# Test: only install fa3 -python install.py --fa3 +python install.py --all diff --git a/install.py b/install.py index ab6ded70..2fa7a9e4 100644 --- a/install.py +++ b/install.py @@ -69,7 +69,7 @@ def install_fa3(): FA3_PATH = REPO_PATH.joinpath("submodules", "flash-attention", "hopper") env = os.environ.copy() # nvcc will now spawn cicc and will cost ~1G memory - env["MAX_JOBS"] = "4" + env["MAX_JOBS"] = "8" cmd = [sys.executable, "setup.py", "install"] subprocess.check_call(cmd, cwd=str(FA3_PATH.resolve()), env=env) From bdea110ac96f315d0dcd8fb0b31d81b55a8bd4e5 Mon Sep 17 00:00:00 2001 From: Xu Zhao Date: Thu, 19 Dec 2024 08:55:05 -0500 Subject: [PATCH 04/24] Disable colfax cutlass --- install.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/install.py b/install.py index 2fa7a9e4..9658a14a 100644 --- a/install.py +++ b/install.py @@ -131,7 +131,7 @@ def setup_hip(args: argparse.Namespace): if args.fa3 or args.all: logger.info("[tritonbench] installing fa3...") install_fa3() - if args.colfax or args.all: + if args.colfax: logger.info("[tritonbench] installing colfax cutlass-kernels...") from tools.cutlass_kernels.install import install_colfax_cutlass From 481a5acf1f33a23cf918f4bca14a63fcfdd1ddf8 Mon Sep 17 00:00:00 2001 From: Xu Zhao Date: Thu, 19 Dec 2024 11:25:20 -0500 Subject: [PATCH 05/24] Upgrade tk --- install.py | 2 +- submodules/ThunderKittens | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/install.py b/install.py index 9658a14a..92279a1f 100644 --- a/install.py +++ b/install.py @@ -139,7 +139,7 @@ def setup_hip(args: argparse.Namespace): if args.jax or args.all: logger.info("[tritonbench] installing jax...") install_jax() - if args.tk or args.all: + if args.tk: logger.info("[tritonbench] installing thunderkittens...") from tools.tk.install import install_tk diff --git a/submodules/ThunderKittens b/submodules/ThunderKittens index 5d7107a1..cdfce886 160000 --- a/submodules/ThunderKittens +++ b/submodules/ThunderKittens @@ -1 +1 @@ -Subproject commit 5d7107a13b016811da38e781d7fc4f74140b2d03 +Subproject commit cdfce886b2660a0345a20bc5c7d52efd0db95fc0 From f5ef4d6e84e366d33695ba34e187b7cc6b8b759e Mon Sep 17 00:00:00 2001 From: Xu Zhao Date: Thu, 19 Dec 2024 13:57:20 -0500 Subject: [PATCH 06/24] Reduce NVCC threads --- install.py | 11 ++------- tools/flash_attn/hopper.patch | 14 +++++++++++ tools/flash_attn/install.py | 44 +++++++++++++++++++++++++++++++++++ 3 files changed, 60 insertions(+), 9 deletions(-) create mode 100644 tools/flash_attn/hopper.patch diff --git a/install.py b/install.py index 92279a1f..cb8d426c 100644 --- a/install.py +++ b/install.py @@ -65,15 +65,6 @@ def install_fa2(compile=False): subprocess.check_call(cmd) -def install_fa3(): - FA3_PATH = REPO_PATH.joinpath("submodules", "flash-attention", "hopper") - env = os.environ.copy() - # nvcc will now spawn cicc and will cost ~1G memory - env["MAX_JOBS"] = "8" - cmd = [sys.executable, "setup.py", "install"] - subprocess.check_call(cmd, cwd=str(FA3_PATH.resolve()), env=env) - - def install_liger(): # Liger-kernel has a conflict dependency `triton` with pytorch, # so we need to install it without dependencies @@ -130,6 +121,8 @@ def setup_hip(args: argparse.Namespace): install_fa2(compile=True) if args.fa3 or args.all: logger.info("[tritonbench] installing fa3...") + from tools.flash_attn.install import install_fa3 + install_fa3() if args.colfax: logger.info("[tritonbench] installing colfax cutlass-kernels...") diff --git a/tools/flash_attn/hopper.patch b/tools/flash_attn/hopper.patch new file mode 100644 index 00000000..584e8ffc --- /dev/null +++ b/tools/flash_attn/hopper.patch @@ -0,0 +1,14 @@ +diff --git a/hopper/setup.py b/hopper/setup.py +index f9f3cfd..132ce07 100644 +--- a/hopper/setup.py ++++ b/hopper/setup.py +@@ -78,7 +78,8 @@ def check_if_cuda_home_none(global_option: str) -> None: + + + def append_nvcc_threads(nvcc_extra_args): +- return nvcc_extra_args + ["--threads", "4"] ++ nvcc_threads = os.getenv("NVCC_THREADS") or "4" ++ return nvcc_extra_args + ["--threads", NVCC_THREADS] + + + cmdclass = {} diff --git a/tools/flash_attn/install.py b/tools/flash_attn/install.py index e69de29b..65b71632 100644 --- a/tools/flash_attn/install.py +++ b/tools/flash_attn/install.py @@ -0,0 +1,44 @@ +import os +import subprocess +import sys + +from pathlib import Path + +REPO_PATH = Path(os.path.abspath(__file__)).parent.parent.parent +CUR_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__))) + +def patch_fa3(): + patches = ["hopper.patch"] + for patch_file in patches: + patch_file_path = os.path.join(CUR_DIR, patch_file) + submodule_path = str(REPO_PATH.joinpath("submodules", "flash-attention").absolute()) + try: + subprocess.check_output( + [ + "patch", + "-p1", + "--forward", + "-i", + patch_file_path, + "-r", + "/tmp/rej", + ], + cwd=submodule_path, + ) + except subprocess.SubprocessError as e: + output_str = str(e.output) + if "previously applied" in output_str: + return + else: + print(str(output_str)) + sys.exit(1) + +def install_fa3(): + patch_fa3() + FA3_PATH = REPO_PATH.joinpath("submodules", "flash-attention", "hopper") + env = os.environ.copy() + # nvcc will spawn cicc process and will cost ~1G memory + env["MAX_JOBS"] = "8" + env["NVCC_THREADS"] = "2" + cmd = [sys.executable, "setup.py", "install"] + subprocess.check_call(cmd, cwd=str(FA3_PATH.resolve()), env=env) From b92d6b935731b4a521e57a29c6de3ac9f5779014 Mon Sep 17 00:00:00 2001 From: Xu Zhao Date: Thu, 19 Dec 2024 14:19:54 -0500 Subject: [PATCH 07/24] Take nvcc_threads --- tools/flash_attn/hopper.patch | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/flash_attn/hopper.patch b/tools/flash_attn/hopper.patch index 584e8ffc..b34a4025 100644 --- a/tools/flash_attn/hopper.patch +++ b/tools/flash_attn/hopper.patch @@ -8,7 +8,7 @@ index f9f3cfd..132ce07 100644 def append_nvcc_threads(nvcc_extra_args): - return nvcc_extra_args + ["--threads", "4"] + nvcc_threads = os.getenv("NVCC_THREADS") or "4" -+ return nvcc_extra_args + ["--threads", NVCC_THREADS] ++ return nvcc_extra_args + ["--threads", nvcc_threads] cmdclass = {} From 9d08747976180021b7143cd451af7ce1e448c520 Mon Sep 17 00:00:00 2001 From: Xu Zhao Date: Thu, 19 Dec 2024 15:39:41 -0500 Subject: [PATCH 08/24] Try with single thread --- tools/flash_attn/install.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/flash_attn/install.py b/tools/flash_attn/install.py index 65b71632..f06d4381 100644 --- a/tools/flash_attn/install.py +++ b/tools/flash_attn/install.py @@ -39,6 +39,6 @@ def install_fa3(): env = os.environ.copy() # nvcc will spawn cicc process and will cost ~1G memory env["MAX_JOBS"] = "8" - env["NVCC_THREADS"] = "2" + env["NVCC_THREADS"] = "1" cmd = [sys.executable, "setup.py", "install"] subprocess.check_call(cmd, cwd=str(FA3_PATH.resolve()), env=env) From 86c1c25962a7e0a5abc1038bfb94414b963eda70 Mon Sep 17 00:00:00 2001 From: Xu Zhao Date: Thu, 19 Dec 2024 17:32:42 -0500 Subject: [PATCH 09/24] Limit memory size to 100g --- .github/workflows/docker.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docker.yaml b/.github/workflows/docker.yaml index 25b6bc5f..a6c930ad 100644 --- a/.github/workflows/docker.yaml +++ b/.github/workflows/docker.yaml @@ -42,7 +42,7 @@ jobs: # branch name is github.head_ref when triggered by pull_request # and it is github.ref_name when triggered by workflow_dispatch branch_name=${{ github.head_ref || github.ref_name }} - docker build . --build-arg TRITONBENCH_BRANCH="${branch_name}" --build-arg FORCE_DATE="${NIGHTLY_DATE}" \ + docker build . --memory 100g --build-arg TRITONBENCH_BRANCH="${branch_name}" --build-arg FORCE_DATE="${NIGHTLY_DATE}" \ -f tritonbench-nightly.dockerfile -t ghcr.io/pytorch-labs/tritonbench:latest # Extract pytorch version from the docker PYTORCH_VERSION=$(docker run -e SETUP_SCRIPT="${SETUP_SCRIPT}" ghcr.io/pytorch-labs/tritonbench:latest bash -c '. "${SETUP_SCRIPT}"; python -c "import torch; print(torch.__version__)"') From 0cc512bfb20b4c81dc58ea3a8db5d5fd78deb77e Mon Sep 17 00:00:00 2001 From: Xu Zhao Date: Thu, 19 Dec 2024 17:34:47 -0500 Subject: [PATCH 10/24] Set memory limit in docker build --- .github/workflows/docker.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docker.yaml b/.github/workflows/docker.yaml index a6c930ad..fa7b1f47 100644 --- a/.github/workflows/docker.yaml +++ b/.github/workflows/docker.yaml @@ -42,7 +42,7 @@ jobs: # branch name is github.head_ref when triggered by pull_request # and it is github.ref_name when triggered by workflow_dispatch branch_name=${{ github.head_ref || github.ref_name }} - docker build . --memory 100g --build-arg TRITONBENCH_BRANCH="${branch_name}" --build-arg FORCE_DATE="${NIGHTLY_DATE}" \ + docker build . --memory 80g --shm-size 4g --build-arg TRITONBENCH_BRANCH="${branch_name}" --build-arg FORCE_DATE="${NIGHTLY_DATE}" \ -f tritonbench-nightly.dockerfile -t ghcr.io/pytorch-labs/tritonbench:latest # Extract pytorch version from the docker PYTORCH_VERSION=$(docker run -e SETUP_SCRIPT="${SETUP_SCRIPT}" ghcr.io/pytorch-labs/tritonbench:latest bash -c '. "${SETUP_SCRIPT}"; python -c "import torch; print(torch.__version__)"') From ba9e0d255e3ae52b4d4fd5acd0e5d71f7ee8fb54 Mon Sep 17 00:00:00 2001 From: Xu Zhao Date: Thu, 19 Dec 2024 17:38:27 -0500 Subject: [PATCH 11/24] Show disk space --- .github/workflows/docker.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/docker.yaml b/.github/workflows/docker.yaml index fa7b1f47..16b561f7 100644 --- a/.github/workflows/docker.yaml +++ b/.github/workflows/docker.yaml @@ -42,6 +42,8 @@ jobs: # branch name is github.head_ref when triggered by pull_request # and it is github.ref_name when triggered by workflow_dispatch branch_name=${{ github.head_ref || github.ref_name }} + # show disk space + df -h docker build . --memory 80g --shm-size 4g --build-arg TRITONBENCH_BRANCH="${branch_name}" --build-arg FORCE_DATE="${NIGHTLY_DATE}" \ -f tritonbench-nightly.dockerfile -t ghcr.io/pytorch-labs/tritonbench:latest # Extract pytorch version from the docker From f850878bf7b87c94c146f11402139f347b1b4a90 Mon Sep 17 00:00:00 2001 From: Xu Zhao Date: Thu, 19 Dec 2024 18:58:34 -0500 Subject: [PATCH 12/24] Test cpu --- .github/workflows/docker.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/docker.yaml b/.github/workflows/docker.yaml index 16b561f7..23816c79 100644 --- a/.github/workflows/docker.yaml +++ b/.github/workflows/docker.yaml @@ -44,6 +44,8 @@ jobs: branch_name=${{ github.head_ref || github.ref_name }} # show disk space df -h + # show cpu + lscpu docker build . --memory 80g --shm-size 4g --build-arg TRITONBENCH_BRANCH="${branch_name}" --build-arg FORCE_DATE="${NIGHTLY_DATE}" \ -f tritonbench-nightly.dockerfile -t ghcr.io/pytorch-labs/tritonbench:latest # Extract pytorch version from the docker From b2478dff7ff09a7648c7ba6da0e289f40d4e6874 Mon Sep 17 00:00:00 2001 From: Xu Zhao Date: Thu, 19 Dec 2024 19:05:54 -0500 Subject: [PATCH 13/24] Save cpu cores to docker build --- .github/workflows/docker.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/docker.yaml b/.github/workflows/docker.yaml index 23816c79..689a0372 100644 --- a/.github/workflows/docker.yaml +++ b/.github/workflows/docker.yaml @@ -46,8 +46,8 @@ jobs: df -h # show cpu lscpu - docker build . --memory 80g --shm-size 4g --build-arg TRITONBENCH_BRANCH="${branch_name}" --build-arg FORCE_DATE="${NIGHTLY_DATE}" \ - -f tritonbench-nightly.dockerfile -t ghcr.io/pytorch-labs/tritonbench:latest + taskset -c 4-31 docker build . --memory 80g --shm-size 4g --build-arg TRITONBENCH_BRANCH="${branch_name}" --build-arg FORCE_DATE="${NIGHTLY_DATE}" \ + -f tritonbench-nightly.dockerfile -t ghcr.io/pytorch-labs/tritonbench:latest # Extract pytorch version from the docker PYTORCH_VERSION=$(docker run -e SETUP_SCRIPT="${SETUP_SCRIPT}" ghcr.io/pytorch-labs/tritonbench:latest bash -c '. "${SETUP_SCRIPT}"; python -c "import torch; print(torch.__version__)"') export DOCKER_TAG=$(awk '{match($0, /dev[0-9]+/, arr); print arr[0]}' <<< "${PYTORCH_VERSION}") From e3d7dda8542d2bb0468e99a9e14b3429148a0092 Mon Sep 17 00:00:00 2001 From: Xu Zhao Date: Thu, 19 Dec 2024 21:20:51 -0500 Subject: [PATCH 14/24] Reduce the cpu --- .github/workflows/docker.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docker.yaml b/.github/workflows/docker.yaml index 689a0372..01dd3f66 100644 --- a/.github/workflows/docker.yaml +++ b/.github/workflows/docker.yaml @@ -46,7 +46,7 @@ jobs: df -h # show cpu lscpu - taskset -c 4-31 docker build . --memory 80g --shm-size 4g --build-arg TRITONBENCH_BRANCH="${branch_name}" --build-arg FORCE_DATE="${NIGHTLY_DATE}" \ + taskset -c 16-31 docker build . --memory 80g --shm-size 4g --build-arg TRITONBENCH_BRANCH="${branch_name}" --build-arg FORCE_DATE="${NIGHTLY_DATE}" \ -f tritonbench-nightly.dockerfile -t ghcr.io/pytorch-labs/tritonbench:latest # Extract pytorch version from the docker PYTORCH_VERSION=$(docker run -e SETUP_SCRIPT="${SETUP_SCRIPT}" ghcr.io/pytorch-labs/tritonbench:latest bash -c '. "${SETUP_SCRIPT}"; python -c "import torch; print(torch.__version__)"') From 3a1abf75933a11355fcdf9a67fbac200dcd58013 Mon Sep 17 00:00:00 2001 From: Xu Zhao Date: Thu, 19 Dec 2024 21:28:15 -0500 Subject: [PATCH 15/24] Compile fa3 first --- install.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/install.py b/install.py index cb8d426c..425c21e0 100644 --- a/install.py +++ b/install.py @@ -113,17 +113,16 @@ def setup_hip(args: argparse.Namespace): # checkout submodules checkout_submodules(REPO_PATH) # install submodules + if args.fa3 or args.all: + logger.info("[tritonbench] installing fa3...") + from tools.flash_attn.install import install_fa3 + install_fa3() if args.fbgemm or args.all: logger.info("[tritonbench] installing FBGEMM...") install_fbgemm() if args.fa2 or args.all: logger.info("[tritonbench] installing fa2 from source...") install_fa2(compile=True) - if args.fa3 or args.all: - logger.info("[tritonbench] installing fa3...") - from tools.flash_attn.install import install_fa3 - - install_fa3() if args.colfax: logger.info("[tritonbench] installing colfax cutlass-kernels...") from tools.cutlass_kernels.install import install_colfax_cutlass From 57150e9161943cfe10ccdf08a5a4852e36ef066d Mon Sep 17 00:00:00 2001 From: Xu Zhao Date: Thu, 19 Dec 2024 22:25:52 -0500 Subject: [PATCH 16/24] Preinstall ninja --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 1cc03e35..4dc9f4fd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,4 +2,5 @@ packaging pynvml psutil tabulate +ninja transformers==4.46.1 From c1f8bb0023f2749a8f0bd09712381e19cdb6cac5 Mon Sep 17 00:00:00 2001 From: Xu Zhao Date: Thu, 19 Dec 2024 22:27:57 -0500 Subject: [PATCH 17/24] Build fa3 --- tools/flash_attn/install.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/flash_attn/install.py b/tools/flash_attn/install.py index f06d4381..f3b23033 100644 --- a/tools/flash_attn/install.py +++ b/tools/flash_attn/install.py @@ -38,7 +38,7 @@ def install_fa3(): FA3_PATH = REPO_PATH.joinpath("submodules", "flash-attention", "hopper") env = os.environ.copy() # nvcc will spawn cicc process and will cost ~1G memory - env["MAX_JOBS"] = "8" - env["NVCC_THREADS"] = "1" + # env["MAX_JOBS"] = "8" + # env["NVCC_THREADS"] = "1" cmd = [sys.executable, "setup.py", "install"] subprocess.check_call(cmd, cwd=str(FA3_PATH.resolve()), env=env) From 728d62d180366c237eafb8767e4c4be7db483471 Mon Sep 17 00:00:00 2001 From: Xu Zhao Date: Thu, 19 Dec 2024 22:44:14 -0500 Subject: [PATCH 18/24] Build installer --- install.py | 2 +- tools/flash_attn/install.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/install.py b/install.py index 425c21e0..15408936 100644 --- a/install.py +++ b/install.py @@ -57,7 +57,7 @@ def install_fa2(compile=False): if compile: # compile from source (slow) FA2_PATH = REPO_PATH.joinpath("submodules", "flash-attention") - cmd = [sys.executable, "setup.py", "install"] + cmd = ["pip", "install", "-e", "."] subprocess.check_call(cmd, cwd=str(FA2_PATH.resolve())) else: # Install the pre-built binary diff --git a/tools/flash_attn/install.py b/tools/flash_attn/install.py index f3b23033..80133521 100644 --- a/tools/flash_attn/install.py +++ b/tools/flash_attn/install.py @@ -40,5 +40,5 @@ def install_fa3(): # nvcc will spawn cicc process and will cost ~1G memory # env["MAX_JOBS"] = "8" # env["NVCC_THREADS"] = "1" - cmd = [sys.executable, "setup.py", "install"] + cmd = ["pip", "install", "-e", "."] subprocess.check_call(cmd, cwd=str(FA3_PATH.resolve()), env=env) From b413944383e82dbfb200fcd6ccfaf0275d03090f Mon Sep 17 00:00:00 2001 From: Xu Zhao Date: Thu, 19 Dec 2024 23:20:43 -0500 Subject: [PATCH 19/24] Still limit compile threads --- .github/workflows/docker.yaml | 9 +++++---- install.py | 1 + tools/flash_attn/install.py | 6 +++--- 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/.github/workflows/docker.yaml b/.github/workflows/docker.yaml index 01dd3f66..08eb10cd 100644 --- a/.github/workflows/docker.yaml +++ b/.github/workflows/docker.yaml @@ -20,6 +20,10 @@ env: jobs: build-push-docker: if: ${{ github.repository_owner == 'pytorch-labs' }} + # spec: + # 120G RAM + # 32 logical CPU cores + # 1T disk runs-on: 32-core-ubuntu environment: docker-s3-upload steps: @@ -42,10 +46,7 @@ jobs: # branch name is github.head_ref when triggered by pull_request # and it is github.ref_name when triggered by workflow_dispatch branch_name=${{ github.head_ref || github.ref_name }} - # show disk space - df -h - # show cpu - lscpu + # limit CPU core and memory usage to keep runner daemon alive on CI machine taskset -c 16-31 docker build . --memory 80g --shm-size 4g --build-arg TRITONBENCH_BRANCH="${branch_name}" --build-arg FORCE_DATE="${NIGHTLY_DATE}" \ -f tritonbench-nightly.dockerfile -t ghcr.io/pytorch-labs/tritonbench:latest # Extract pytorch version from the docker diff --git a/install.py b/install.py index 15408936..16868320 100644 --- a/install.py +++ b/install.py @@ -114,6 +114,7 @@ def setup_hip(args: argparse.Namespace): checkout_submodules(REPO_PATH) # install submodules if args.fa3 or args.all: + # we need to install fa3 above all other dependencies logger.info("[tritonbench] installing fa3...") from tools.flash_attn.install import install_fa3 install_fa3() diff --git a/tools/flash_attn/install.py b/tools/flash_attn/install.py index 80133521..b764c0d2 100644 --- a/tools/flash_attn/install.py +++ b/tools/flash_attn/install.py @@ -37,8 +37,8 @@ def install_fa3(): patch_fa3() FA3_PATH = REPO_PATH.joinpath("submodules", "flash-attention", "hopper") env = os.environ.copy() - # nvcc will spawn cicc process and will cost ~1G memory - # env["MAX_JOBS"] = "8" - # env["NVCC_THREADS"] = "1" + # limit nvcc memory usage on the CI machine + env["MAX_JOBS"] = "8" + env["NVCC_THREADS"] = "1" cmd = ["pip", "install", "-e", "."] subprocess.check_call(cmd, cwd=str(FA3_PATH.resolve()), env=env) From 1e03e04ed947bb8b659a40995c4783501839f87a Mon Sep 17 00:00:00 2001 From: Xu Zhao Date: Fri, 20 Dec 2024 06:16:15 -0500 Subject: [PATCH 20/24] Skip fa2 --- install.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/install.py b/install.py index 16868320..fd59f587 100644 --- a/install.py +++ b/install.py @@ -121,7 +121,7 @@ def setup_hip(args: argparse.Namespace): if args.fbgemm or args.all: logger.info("[tritonbench] installing FBGEMM...") install_fbgemm() - if args.fa2 or args.all: + if args.fa2: logger.info("[tritonbench] installing fa2 from source...") install_fa2(compile=True) if args.colfax: From ed90ab1d9866a86b3d545c501bbca550021f0e1c Mon Sep 17 00:00:00 2001 From: Xu Zhao Date: Fri, 20 Dec 2024 06:55:35 -0500 Subject: [PATCH 21/24] Compile fa2 but not xformers --- install.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/install.py b/install.py index fd59f587..e99aa1c0 100644 --- a/install.py +++ b/install.py @@ -121,7 +121,7 @@ def setup_hip(args: argparse.Namespace): if args.fbgemm or args.all: logger.info("[tritonbench] installing FBGEMM...") install_fbgemm() - if args.fa2: + if args.fa2 or args.all: logger.info("[tritonbench] installing fa2 from source...") install_fa2(compile=True) if args.colfax: @@ -140,7 +140,7 @@ def setup_hip(args: argparse.Namespace): if args.liger or args.all: logger.info("[tritonbench] installing liger-kernels...") install_liger() - if args.xformers or args.all: + if args.xformers: logger.info("[tritonbench] installing xformers...") from tools.xformers.install import install_xformers From 5744fcd5cc8b257125471c20d4162450eebe3b21 Mon Sep 17 00:00:00 2001 From: Xu Zhao Date: Fri, 20 Dec 2024 10:21:28 -0500 Subject: [PATCH 22/24] Fix the build --- .github/workflows/docker.yaml | 3 +-- docker/tritonbench-nightly.dockerfile | 3 +++ requirements.txt | 1 - 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/workflows/docker.yaml b/.github/workflows/docker.yaml index 08eb10cd..d449e0da 100644 --- a/.github/workflows/docker.yaml +++ b/.github/workflows/docker.yaml @@ -46,8 +46,7 @@ jobs: # branch name is github.head_ref when triggered by pull_request # and it is github.ref_name when triggered by workflow_dispatch branch_name=${{ github.head_ref || github.ref_name }} - # limit CPU core and memory usage to keep runner daemon alive on CI machine - taskset -c 16-31 docker build . --memory 80g --shm-size 4g --build-arg TRITONBENCH_BRANCH="${branch_name}" --build-arg FORCE_DATE="${NIGHTLY_DATE}" \ + docker build . --build-arg TRITONBENCH_BRANCH="${branch_name}" --build-arg FORCE_DATE="${NIGHTLY_DATE}" \ -f tritonbench-nightly.dockerfile -t ghcr.io/pytorch-labs/tritonbench:latest # Extract pytorch version from the docker PYTORCH_VERSION=$(docker run -e SETUP_SCRIPT="${SETUP_SCRIPT}" ghcr.io/pytorch-labs/tritonbench:latest bash -c '. "${SETUP_SCRIPT}"; python -c "import torch; print(torch.__version__)"') diff --git a/docker/tritonbench-nightly.dockerfile b/docker/tritonbench-nightly.dockerfile index 71d2c5d9..d69311f0 100644 --- a/docker/tritonbench-nightly.dockerfile +++ b/docker/tritonbench-nightly.dockerfile @@ -49,6 +49,9 @@ RUN cd /workspace/tritonbench && \ # which is from NVIDIA driver RUN sudo apt update && sudo apt-get install -y libnvidia-compute-550 patchelf patch +# Workaround: installing Ninja from setup.py hits "Failed to decode METADATA with UTF-8" error +RUN pip install ninja + # Install Tritonbench RUN cd /workspace/tritonbench && \ bash .ci/tritonbench/install.sh diff --git a/requirements.txt b/requirements.txt index 4dc9f4fd..1cc03e35 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,5 +2,4 @@ packaging pynvml psutil tabulate -ninja transformers==4.46.1 From 1e1b17347aa710dab6938fb697c0607190e85fbb Mon Sep 17 00:00:00 2001 From: Xu Zhao Date: Fri, 20 Dec 2024 10:28:33 -0500 Subject: [PATCH 23/24] Fix ninja install --- docker/tritonbench-nightly.dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/tritonbench-nightly.dockerfile b/docker/tritonbench-nightly.dockerfile index d69311f0..f5713c72 100644 --- a/docker/tritonbench-nightly.dockerfile +++ b/docker/tritonbench-nightly.dockerfile @@ -50,7 +50,7 @@ RUN cd /workspace/tritonbench && \ RUN sudo apt update && sudo apt-get install -y libnvidia-compute-550 patchelf patch # Workaround: installing Ninja from setup.py hits "Failed to decode METADATA with UTF-8" error -RUN pip install ninja +RUN . ${SETUP_SCRIPT} && pip install ninja # Install Tritonbench RUN cd /workspace/tritonbench && \ From 4f9f74f18373a41a89d23dd405b040c1433b603b Mon Sep 17 00:00:00 2001 From: Xu Zhao Date: Fri, 20 Dec 2024 10:31:55 -0500 Subject: [PATCH 24/24] Fix indent --- .github/workflows/docker.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docker.yaml b/.github/workflows/docker.yaml index d449e0da..718634f5 100644 --- a/.github/workflows/docker.yaml +++ b/.github/workflows/docker.yaml @@ -47,7 +47,7 @@ jobs: # and it is github.ref_name when triggered by workflow_dispatch branch_name=${{ github.head_ref || github.ref_name }} docker build . --build-arg TRITONBENCH_BRANCH="${branch_name}" --build-arg FORCE_DATE="${NIGHTLY_DATE}" \ - -f tritonbench-nightly.dockerfile -t ghcr.io/pytorch-labs/tritonbench:latest + -f tritonbench-nightly.dockerfile -t ghcr.io/pytorch-labs/tritonbench:latest # Extract pytorch version from the docker PYTORCH_VERSION=$(docker run -e SETUP_SCRIPT="${SETUP_SCRIPT}" ghcr.io/pytorch-labs/tritonbench:latest bash -c '. "${SETUP_SCRIPT}"; python -c "import torch; print(torch.__version__)"') export DOCKER_TAG=$(awk '{match($0, /dev[0-9]+/, arr); print arr[0]}' <<< "${PYTORCH_VERSION}")