From 3ba88ce598aaab269169f0a5db5981c9a9ac8603 Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Thu, 31 Oct 2024 22:38:36 +0000 Subject: [PATCH 01/12] Add pass to handle AMDGCN pseudo-intrinsics (abstract placeholders for target specific info), and add handling for `llvm.amdgcn.wavefrontsize`. --- clang/test/CodeGenOpenCL/builtins-amdgcn.cl | 5 +- llvm/lib/Target/AMDGPU/AMDGPU.h | 9 ++ .../AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp | 49 +++++++++ llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def | 2 + .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 3 +- llvm/lib/Target/AMDGPU/CMakeLists.txt | 1 + .../AMDGPU/llvm.amdgcn.wavefrontsize.ll | 99 ++++++++++++++----- 7 files changed, 139 insertions(+), 29 deletions(-) create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl index bf5f2971cf118c..de6a06dad6a08d 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl @@ -1,6 +1,6 @@ // REQUIRES: amdgpu-registered-target // RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu tahiti -emit-llvm -o - %s | FileCheck -enable-var-scope --check-prefixes=CHECK-AMDGCN,CHECK %s -// RUN: %clang_cc1 -cl-std=CL2.0 -triple spirv64-amd-amdhsa -emit-llvm -o - %s | FileCheck -enable-var-scope --check-prefix=CHECK %s +// RUN: %clang_cc1 -cl-std=CL2.0 -triple spirv64-amd-amdhsa -emit-llvm -o - %s | FileCheck -enable-var-scope --check-prefixes=CHECK,CHECK-SPIRV %s #pragma OPENCL EXTENSION cl_khr_fp64 : enable @@ -866,7 +866,8 @@ void test_atomic_inc_dec(__attribute__((address_space(3))) uint *lptr, __attribu // CHECK-LABEL test_wavefrontsize( unsigned test_wavefrontsize() { - // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wavefrontsize() + // CHECK-AMDGCN: ret i32 {{[0-9]+}} + // CHECK-SPIRV: {{.*}}call{{.*}} i32 @llvm.amdgcn.wavefrontsize() return __builtin_amdgcn_wavefrontsize(); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index 95d0ad0f9dc96a..17d3e6ab7c65ab 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -345,6 +345,15 @@ extern char &AMDGPUPrintfRuntimeBindingID; void initializeAMDGPUResourceUsageAnalysisPass(PassRegistry &); extern char &AMDGPUResourceUsageAnalysisID; +struct AMDGPUExpandPseudoIntrinsicsPass + : PassInfoMixin { + const AMDGPUTargetMachine &TM; + AMDGPUExpandPseudoIntrinsicsPass(const AMDGPUTargetMachine &ATM) : TM(ATM) {} + PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); + + static bool isRequired() { return true; } +}; + struct AMDGPUPrintfRuntimeBindingPass : PassInfoMixin { PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp b/llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp new file mode 100644 index 00000000000000..faa23bb8550dbc --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp @@ -0,0 +1,49 @@ +//===- AMDGPUExpandPseudoIntrinsics.cpp - Pseudo Intrinsic Expander Pass --===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// This file implements a pass that deals with expanding AMDGCN generic pseudo- +// intrinsics into target specific quantities / sequences. In this context, a +// pseudo-intrinsic is an AMDGCN intrinsic that does not directly map to a +// specific instruction, but rather is intended as a mechanism for abstractly +// conveying target specific info to a HLL / the FE, without concretely +// impacting the AST. An example of such an intrinsic is amdgcn.wavefrontsize. +// This pass should run as early as possible / immediately after Clang CodeGen, +// so that the optimisation pipeline and the BE operate with concrete target +// data. +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUTargetMachine.h" +#include "GCNSubtarget.h" + +#include "llvm/IR/Constants.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" + +using namespace llvm; + +static inline PreservedAnalyses expandWaveSizeIntrinsic(const GCNSubtarget &ST, + Function *WaveSize) { + if (WaveSize->hasZeroLiveUses()) + return PreservedAnalyses::all(); + + for (auto &&U : WaveSize->users()) + U->replaceAllUsesWith(ConstantInt::get(WaveSize->getReturnType(), + ST.getWavefrontSize())); + + return PreservedAnalyses::none(); +} + +PreservedAnalyses + AMDGPUExpandPseudoIntrinsicsPass::run(Module &M, ModuleAnalysisManager &) { + + if (auto WS = M.getFunction("llvm.amdgcn.wavefrontsize")) + return expandWaveSizeIntrinsic(TM.getSubtarget(*WS), WS); + + return PreservedAnalyses::all(); +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def index 174a90f0aa419d..323c195c329168 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def +++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def @@ -27,6 +27,8 @@ MODULE_PASS("amdgpu-perf-hint", *static_cast(this))) MODULE_PASS("amdgpu-printf-runtime-binding", AMDGPUPrintfRuntimeBindingPass()) MODULE_PASS("amdgpu-unify-metadata", AMDGPUUnifyMetadataPass()) +MODULE_PASS("amdgpu-expand-pseudo-intrinsics", + AMDGPUExpandPseudoIntrinsicsPass(*this)) #undef MODULE_PASS #ifndef MODULE_PASS_WITH_PARAMS diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index d93ec34a703d3d..2bf8df6588c59c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -739,7 +739,8 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { #include "llvm/Passes/TargetPassRegistry.inc" PB.registerPipelineStartEPCallback( - [](ModulePassManager &PM, OptimizationLevel Level) { + [this](ModulePassManager &PM, OptimizationLevel Level) { + PM.addPass(AMDGPUExpandPseudoIntrinsicsPass(*this)); FunctionPassManager FPM; PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); if (EnableHipStdPar) diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt index fed29c3e14aae2..c9d4452b4a035c 100644 --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -54,6 +54,7 @@ add_llvm_target(AMDGPUCodeGen AMDGPUCodeGenPrepare.cpp AMDGPUCombinerHelper.cpp AMDGPUCtorDtorLowering.cpp + AMDGPUExpandPseudoIntrinsics.cpp AMDGPUExportClustering.cpp AMDGPUFrameLowering.cpp AMDGPUGlobalISelDivergenceLowering.cpp diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll index 824d3708c027db..efa53def5ee686 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,W64 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,W32 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,W64 %s @@ -5,28 +6,43 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,W64 %s ; RUN: opt -O3 -S < %s | FileCheck -check-prefix=OPT %s -; RUN: opt -mtriple=amdgcn-- -O3 -S < %s | FileCheck -check-prefix=OPT %s -; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT %s -; RUN: opt -mtriple=amdgcn-- -passes='default' -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT %s -; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT %s -; RUN: opt -mtriple=amdgcn-- -mcpu=tonga -O3 -S < %s | FileCheck -check-prefix=OPT %s -; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT %s -; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT %s -; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT %s -; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT %s +; RUN: opt -mtriple=amdgcn-- -O3 -S < %s | FileCheck -check-prefix=OPT-W64 %s +; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT-W32 %s +; RUN: opt -mtriple=amdgcn-- -passes='default' -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT-W32 %s +; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT-W64 %s +; RUN: opt -mtriple=amdgcn-- -mcpu=tonga -O3 -S < %s | FileCheck -check-prefix=OPT-W64 %s +; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT-W32 %s +; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT-W64 %s +; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT-W32 %s +; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT-W64 %s ; GCN-LABEL: {{^}}fold_wavefrontsize: -; OPT-LABEL: define amdgpu_kernel void @fold_wavefrontsize( ; W32: v_mov_b32_e32 [[V:v[0-9]+]], 32 ; W64: v_mov_b32_e32 [[V:v[0-9]+]], 64 ; GCN: store_{{dword|b32}} v{{.+}}, [[V]] -; OPT: %tmp = tail call i32 @llvm.amdgcn.wavefrontsize() -; OPT: store i32 %tmp, ptr addrspace(1) %arg, align 4 -; OPT-NEXT: ret void define amdgpu_kernel void @fold_wavefrontsize(ptr addrspace(1) nocapture %arg) { +; OPT-LABEL: define amdgpu_kernel void @fold_wavefrontsize( +; OPT-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; OPT-NEXT: [[BB:.*:]] +; OPT-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.wavefrontsize() #[[ATTR2:[0-9]+]] +; OPT-NEXT: store i32 [[TMP]], ptr addrspace(1) [[ARG]], align 4 +; OPT-NEXT: ret void +; +; OPT-W64-LABEL: define amdgpu_kernel void @fold_wavefrontsize( +; OPT-W64-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; OPT-W64-NEXT: [[BB:.*:]] +; OPT-W64-NEXT: store i32 64, ptr addrspace(1) [[ARG]], align 4 +; OPT-W64-NEXT: ret void +; +; OPT-W32-LABEL: define amdgpu_kernel void @fold_wavefrontsize( +; OPT-W32-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; OPT-W32-NEXT: [[BB:.*:]] +; OPT-W32-NEXT: store i32 32, ptr addrspace(1) [[ARG]], align 4 +; OPT-W32-NEXT: ret void +; bb: %tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0 store i32 %tmp, ptr addrspace(1) %arg, align 4 @@ -34,20 +50,35 @@ bb: } ; GCN-LABEL: {{^}}fold_and_optimize_wavefrontsize: -; OPT-LABEL: define amdgpu_kernel void @fold_and_optimize_wavefrontsize( ; W32: v_mov_b32_e32 [[V:v[0-9]+]], 1{{$}} ; W64: v_mov_b32_e32 [[V:v[0-9]+]], 2{{$}} ; GCN-NOT: cndmask ; GCN: store_{{dword|b32}} v{{.+}}, [[V]] -; OPT: %tmp = tail call i32 @llvm.amdgcn.wavefrontsize() -; OPT: %tmp1 = icmp ugt i32 %tmp, 32 -; OPT: %tmp2 = select i1 %tmp1, i32 2, i32 1 -; OPT: store i32 %tmp2, ptr addrspace(1) %arg -; OPT-NEXT: ret void define amdgpu_kernel void @fold_and_optimize_wavefrontsize(ptr addrspace(1) nocapture %arg) { +; OPT-LABEL: define amdgpu_kernel void @fold_and_optimize_wavefrontsize( +; OPT-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0]] { +; OPT-NEXT: [[BB:.*:]] +; OPT-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.wavefrontsize() #[[ATTR2]] +; OPT-NEXT: [[TMP1:%.*]] = icmp ugt i32 [[TMP]], 32 +; OPT-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 2, i32 1 +; OPT-NEXT: store i32 [[TMP2]], ptr addrspace(1) [[ARG]], align 4 +; OPT-NEXT: ret void +; +; OPT-W64-LABEL: define amdgpu_kernel void @fold_and_optimize_wavefrontsize( +; OPT-W64-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0]] { +; OPT-W64-NEXT: [[BB:.*:]] +; OPT-W64-NEXT: store i32 2, ptr addrspace(1) [[ARG]], align 4 +; OPT-W64-NEXT: ret void +; +; OPT-W32-LABEL: define amdgpu_kernel void @fold_and_optimize_wavefrontsize( +; OPT-W32-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0]] { +; OPT-W32-NEXT: [[BB:.*:]] +; OPT-W32-NEXT: store i32 1, ptr addrspace(1) [[ARG]], align 4 +; OPT-W32-NEXT: ret void +; bb: %tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0 %tmp1 = icmp ugt i32 %tmp, 32 @@ -57,15 +88,31 @@ bb: } ; GCN-LABEL: {{^}}fold_and_optimize_if_wavefrontsize: -; OPT-LABEL: define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize( - -; OPT: bb: -; OPT: %tmp = tail call i32 @llvm.amdgcn.wavefrontsize() -; OPT: %tmp1 = icmp ugt i32 %tmp, 32 -; OPT: bb3: -; OPT-NEXT: ret void define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(ptr addrspace(1) nocapture %arg) { +; OPT-LABEL: define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize( +; OPT-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0]] { +; OPT-NEXT: [[BB:.*:]] +; OPT-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.wavefrontsize() #[[ATTR2]] +; OPT-NEXT: [[TMP1:%.*]] = icmp ugt i32 [[TMP]], 32 +; OPT-NEXT: br i1 [[TMP1]], label %[[BB2:.*]], label %[[BB3:.*]] +; OPT: [[BB2]]: +; OPT-NEXT: store i32 1, ptr addrspace(1) [[ARG]], align 4 +; OPT-NEXT: br label %[[BB3]] +; OPT: [[BB3]]: +; OPT-NEXT: ret void +; +; OPT-W64-LABEL: define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize( +; OPT-W64-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0]] { +; OPT-W64-NEXT: [[BB:.*:]] +; OPT-W64-NEXT: store i32 1, ptr addrspace(1) [[ARG]], align 4 +; OPT-W64-NEXT: ret void +; +; OPT-W32-LABEL: define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize( +; OPT-W32-SAME: ptr addrspace(1) nocapture readnone [[ARG:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] { +; OPT-W32-NEXT: [[BB:.*:]] +; OPT-W32-NEXT: ret void +; bb: %tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0 %tmp1 = icmp ugt i32 %tmp, 32 From 826c291f59f05cb7065dceb6052f3d8b7bf33f57 Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Fri, 1 Nov 2024 01:01:19 +0000 Subject: [PATCH 02/12] Implement review feedback. --- .../AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp b/llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp index faa23bb8550dbc..b46097bbd33e99 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp @@ -22,6 +22,7 @@ #include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" +#include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/IR/Module.h" #include "llvm/Pass.h" @@ -33,17 +34,22 @@ static inline PreservedAnalyses expandWaveSizeIntrinsic(const GCNSubtarget &ST, return PreservedAnalyses::all(); for (auto &&U : WaveSize->users()) - U->replaceAllUsesWith(ConstantInt::get(WaveSize->getReturnType(), - ST.getWavefrontSize())); + U->replaceAllUsesWith( + ConstantInt::get(WaveSize->getReturnType(), ST.getWavefrontSize())); return PreservedAnalyses::none(); } PreservedAnalyses - AMDGPUExpandPseudoIntrinsicsPass::run(Module &M, ModuleAnalysisManager &) { +AMDGPUExpandPseudoIntrinsicsPass::run(Module &M, ModuleAnalysisManager &) { + if (M.empty()) + return PreservedAnalyses::all(); + + const auto &ST = TM.getSubtarget(*M.begin()); - if (auto WS = M.getFunction("llvm.amdgcn.wavefrontsize")) - return expandWaveSizeIntrinsic(TM.getSubtarget(*WS), WS); + if (auto WS = + Intrinsic::getDeclarationIfExists(&M, Intrinsic::amdgcn_wavefrontsize)) + return expandWaveSizeIntrinsic(ST, WS); return PreservedAnalyses::all(); } From ab6f5a22a2442468f2ef0a7f18239f858b6320b7 Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Fri, 1 Nov 2024 02:02:08 +0000 Subject: [PATCH 03/12] Do not fold early for `generic` mcpu. --- llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp | 4 ++++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll | 8 ++++---- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp b/llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp index b46097bbd33e99..fb2ef7b7ed2d71 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp @@ -47,6 +47,10 @@ AMDGPUExpandPseudoIntrinsicsPass::run(Module &M, ModuleAnalysisManager &) { const auto &ST = TM.getSubtarget(*M.begin()); + // This is not a concrete target, we should not fold early. + if (ST.getCPU().empty() || ST.getCPU() == "generic") + return PreservedAnalyses::all(); + if (auto WS = Intrinsic::getDeclarationIfExists(&M, Intrinsic::amdgcn_wavefrontsize)) return expandWaveSizeIntrinsic(ST, WS); diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll index efa53def5ee686..2d060fd4305077 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll @@ -6,10 +6,10 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,W64 %s ; RUN: opt -O3 -S < %s | FileCheck -check-prefix=OPT %s -; RUN: opt -mtriple=amdgcn-- -O3 -S < %s | FileCheck -check-prefix=OPT-W64 %s -; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT-W32 %s -; RUN: opt -mtriple=amdgcn-- -passes='default' -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT-W32 %s -; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT-W64 %s +; RUN: opt -mtriple=amdgcn-- -O3 -S < %s | FileCheck -check-prefix=OPT %s +; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT %s +; RUN: opt -mtriple=amdgcn-- -passes='default' -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT %s +; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT %s ; RUN: opt -mtriple=amdgcn-- -mcpu=tonga -O3 -S < %s | FileCheck -check-prefix=OPT-W64 %s ; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT-W32 %s ; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT-W64 %s From f8705fbe9f9c78148ca0a0360caf2650ab546185 Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Fri, 1 Nov 2024 02:07:03 +0000 Subject: [PATCH 04/12] Fix formatting (again). --- llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp b/llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp index fb2ef7b7ed2d71..bf0ec39ab6c6e7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp @@ -51,8 +51,8 @@ AMDGPUExpandPseudoIntrinsicsPass::run(Module &M, ModuleAnalysisManager &) { if (ST.getCPU().empty() || ST.getCPU() == "generic") return PreservedAnalyses::all(); - if (auto WS = - Intrinsic::getDeclarationIfExists(&M, Intrinsic::amdgcn_wavefrontsize)) + if (auto WS = Intrinsic::getDeclarationIfExists( + &M, Intrinsic::amdgcn_wavefrontsize)) return expandWaveSizeIntrinsic(ST, WS); return PreservedAnalyses::all(); From 026ed0092adf5c8a8b08b1772338c08ed501b54a Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Mon, 4 Nov 2024 19:27:45 +0200 Subject: [PATCH 05/12] Remove pass, fold in InstCombine. --- llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp | 9 +++++++++ llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 5 +---- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll | 6 +++--- 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp index 8beb9defee66a0..d952103aa81fdb 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp @@ -1024,6 +1024,15 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { } break; } + case Intrinsic::amdgcn_wavefrontsize: { + // TODO: this is a workaround for the pseudo-generic target one gets with no + // specified mcpu, which spoofs its wave size to 64; it should be removed. + if ((ST->getCPU().empty() || ST->getCPU() == "generic") && + !ST->getFeatureString().contains("+wavefrontsize")) + break; + return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), + ST->getWavefrontSize())); + } case Intrinsic::amdgcn_wqm_vote: { // wqm_vote is identity when the argument is constant. if (!isa(II.getArgOperand(0))) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 851e0b25ad1625..86d8dbe4d803cd 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -744,10 +744,7 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { #include "llvm/Passes/TargetPassRegistry.inc" PB.registerPipelineStartEPCallback( - [this](ModulePassManager &PM, OptimizationLevel Level) { - PM.addPass(AMDGPUExpandPseudoIntrinsicsPass(*this)); - FunctionPassManager FPM; - PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); + [](ModulePassManager &PM, OptimizationLevel Level) { if (EnableHipStdPar) PM.addPass(HipStdParAcceleratorCodeSelectionPass()); }); diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll index 2d060fd4305077..f1aed3dc00c100 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll @@ -7,9 +7,9 @@ ; RUN: opt -O3 -S < %s | FileCheck -check-prefix=OPT %s ; RUN: opt -mtriple=amdgcn-- -O3 -S < %s | FileCheck -check-prefix=OPT %s -; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT %s -; RUN: opt -mtriple=amdgcn-- -passes='default' -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT %s -; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT %s +; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT-W32 %s +; RUN: opt -mtriple=amdgcn-- -passes='default' -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT-W32 %s +; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT-W64 %s ; RUN: opt -mtriple=amdgcn-- -mcpu=tonga -O3 -S < %s | FileCheck -check-prefix=OPT-W64 %s ; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT-W32 %s ; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT-W64 %s From 195decc90bbdc1996d04bdf0ef4fe18f0d1953c2 Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Mon, 4 Nov 2024 19:53:46 +0200 Subject: [PATCH 06/12] Remove leftovers. --- llvm/lib/Target/AMDGPU/AMDGPU.h | 9 --------- llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def | 2 -- llvm/lib/Target/AMDGPU/CMakeLists.txt | 1 - 3 files changed, 12 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index 17d3e6ab7c65ab..95d0ad0f9dc96a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -345,15 +345,6 @@ extern char &AMDGPUPrintfRuntimeBindingID; void initializeAMDGPUResourceUsageAnalysisPass(PassRegistry &); extern char &AMDGPUResourceUsageAnalysisID; -struct AMDGPUExpandPseudoIntrinsicsPass - : PassInfoMixin { - const AMDGPUTargetMachine &TM; - AMDGPUExpandPseudoIntrinsicsPass(const AMDGPUTargetMachine &ATM) : TM(ATM) {} - PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); - - static bool isRequired() { return true; } -}; - struct AMDGPUPrintfRuntimeBindingPass : PassInfoMixin { PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def index 323c195c329168..174a90f0aa419d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def +++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def @@ -27,8 +27,6 @@ MODULE_PASS("amdgpu-perf-hint", *static_cast(this))) MODULE_PASS("amdgpu-printf-runtime-binding", AMDGPUPrintfRuntimeBindingPass()) MODULE_PASS("amdgpu-unify-metadata", AMDGPUUnifyMetadataPass()) -MODULE_PASS("amdgpu-expand-pseudo-intrinsics", - AMDGPUExpandPseudoIntrinsicsPass(*this)) #undef MODULE_PASS #ifndef MODULE_PASS_WITH_PARAMS diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt index c9d4452b4a035c..fed29c3e14aae2 100644 --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -54,7 +54,6 @@ add_llvm_target(AMDGPUCodeGen AMDGPUCodeGenPrepare.cpp AMDGPUCombinerHelper.cpp AMDGPUCtorDtorLowering.cpp - AMDGPUExpandPseudoIntrinsics.cpp AMDGPUExportClustering.cpp AMDGPUFrameLowering.cpp AMDGPUGlobalISelDivergenceLowering.cpp From 1a7abaffc499ff8d54bc7b1fd76ca2fdf78b92a0 Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Mon, 4 Nov 2024 19:54:21 +0200 Subject: [PATCH 07/12] Remove pass. --- .../AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp | 59 ------------------- 1 file changed, 59 deletions(-) delete mode 100644 llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp diff --git a/llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp b/llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp deleted file mode 100644 index bf0ec39ab6c6e7..00000000000000 --- a/llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp +++ /dev/null @@ -1,59 +0,0 @@ -//===- AMDGPUExpandPseudoIntrinsics.cpp - Pseudo Intrinsic Expander Pass --===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// This file implements a pass that deals with expanding AMDGCN generic pseudo- -// intrinsics into target specific quantities / sequences. In this context, a -// pseudo-intrinsic is an AMDGCN intrinsic that does not directly map to a -// specific instruction, but rather is intended as a mechanism for abstractly -// conveying target specific info to a HLL / the FE, without concretely -// impacting the AST. An example of such an intrinsic is amdgcn.wavefrontsize. -// This pass should run as early as possible / immediately after Clang CodeGen, -// so that the optimisation pipeline and the BE operate with concrete target -// data. -//===----------------------------------------------------------------------===// - -#include "AMDGPU.h" -#include "AMDGPUTargetMachine.h" -#include "GCNSubtarget.h" - -#include "llvm/IR/Constants.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/IntrinsicsAMDGPU.h" -#include "llvm/IR/Module.h" -#include "llvm/Pass.h" - -using namespace llvm; - -static inline PreservedAnalyses expandWaveSizeIntrinsic(const GCNSubtarget &ST, - Function *WaveSize) { - if (WaveSize->hasZeroLiveUses()) - return PreservedAnalyses::all(); - - for (auto &&U : WaveSize->users()) - U->replaceAllUsesWith( - ConstantInt::get(WaveSize->getReturnType(), ST.getWavefrontSize())); - - return PreservedAnalyses::none(); -} - -PreservedAnalyses -AMDGPUExpandPseudoIntrinsicsPass::run(Module &M, ModuleAnalysisManager &) { - if (M.empty()) - return PreservedAnalyses::all(); - - const auto &ST = TM.getSubtarget(*M.begin()); - - // This is not a concrete target, we should not fold early. - if (ST.getCPU().empty() || ST.getCPU() == "generic") - return PreservedAnalyses::all(); - - if (auto WS = Intrinsic::getDeclarationIfExists( - &M, Intrinsic::amdgcn_wavefrontsize)) - return expandWaveSizeIntrinsic(ST, WS); - - return PreservedAnalyses::all(); -} From 9aed76ceb02fd2a1b1edf68e65f9bdac6de0509e Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Mon, 4 Nov 2024 19:59:28 +0200 Subject: [PATCH 08/12] Fix formatting. --- llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp index d952103aa81fdb..ae5b1292921d1e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp @@ -1030,7 +1030,8 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { if ((ST->getCPU().empty() || ST->getCPU() == "generic") && !ST->getFeatureString().contains("+wavefrontsize")) break; - return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), + return IC.replaceInstUsesWith( + II, ConstantInt::get(II.getType(), ST->getWavefrontSize())); } case Intrinsic::amdgcn_wqm_vote: { From 246c22fb2afc9ad600d897771fce8a2dc28b7ed1 Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Mon, 4 Nov 2024 20:38:03 +0200 Subject: [PATCH 09/12] Really fix formatting. --- llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp index ae5b1292921d1e..0b2548af72fc0d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp @@ -1031,8 +1031,7 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { !ST->getFeatureString().contains("+wavefrontsize")) break; return IC.replaceInstUsesWith( - II, ConstantInt::get(II.getType(), - ST->getWavefrontSize())); + II, ConstantInt::get(II.getType(), ST->getWavefrontSize())); } case Intrinsic::amdgcn_wqm_vote: { // wqm_vote is identity when the argument is constant. From 7cf75589441350d6207717ab936ae61582adbc73 Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Wed, 6 Nov 2024 20:55:15 +0200 Subject: [PATCH 10/12] Split tests. --- .../AMDGPU/llvm.amdgcn.wavefrontsize.ll | 76 +----------- .../AMDGPU/llvm.amdgcn.wavefrontsize.ll | 114 ++++++++++++++++++ 2 files changed, 115 insertions(+), 75 deletions(-) create mode 100644 llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.wavefrontsize.ll diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll index f1aed3dc00c100..33dd2bd540ad06 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll @@ -1,21 +1,9 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,W64 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,W32 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,W64 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -verify-machineinstrs -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GCN,W32 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,W64 %s -; RUN: opt -O3 -S < %s | FileCheck -check-prefix=OPT %s -; RUN: opt -mtriple=amdgcn-- -O3 -S < %s | FileCheck -check-prefix=OPT %s -; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT-W32 %s -; RUN: opt -mtriple=amdgcn-- -passes='default' -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT-W32 %s -; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT-W64 %s -; RUN: opt -mtriple=amdgcn-- -mcpu=tonga -O3 -S < %s | FileCheck -check-prefix=OPT-W64 %s -; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT-W32 %s -; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT-W64 %s -; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT-W32 %s -; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT-W64 %s - ; GCN-LABEL: {{^}}fold_wavefrontsize: ; W32: v_mov_b32_e32 [[V:v[0-9]+]], 32 @@ -24,25 +12,7 @@ define amdgpu_kernel void @fold_wavefrontsize(ptr addrspace(1) nocapture %arg) { -; OPT-LABEL: define amdgpu_kernel void @fold_wavefrontsize( -; OPT-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { -; OPT-NEXT: [[BB:.*:]] -; OPT-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.wavefrontsize() #[[ATTR2:[0-9]+]] -; OPT-NEXT: store i32 [[TMP]], ptr addrspace(1) [[ARG]], align 4 -; OPT-NEXT: ret void -; -; OPT-W64-LABEL: define amdgpu_kernel void @fold_wavefrontsize( -; OPT-W64-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { -; OPT-W64-NEXT: [[BB:.*:]] -; OPT-W64-NEXT: store i32 64, ptr addrspace(1) [[ARG]], align 4 -; OPT-W64-NEXT: ret void -; -; OPT-W32-LABEL: define amdgpu_kernel void @fold_wavefrontsize( -; OPT-W32-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { -; OPT-W32-NEXT: [[BB:.*:]] -; OPT-W32-NEXT: store i32 32, ptr addrspace(1) [[ARG]], align 4 -; OPT-W32-NEXT: ret void -; + bb: %tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0 store i32 %tmp, ptr addrspace(1) %arg, align 4 @@ -58,27 +28,6 @@ bb: define amdgpu_kernel void @fold_and_optimize_wavefrontsize(ptr addrspace(1) nocapture %arg) { -; OPT-LABEL: define amdgpu_kernel void @fold_and_optimize_wavefrontsize( -; OPT-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0]] { -; OPT-NEXT: [[BB:.*:]] -; OPT-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.wavefrontsize() #[[ATTR2]] -; OPT-NEXT: [[TMP1:%.*]] = icmp ugt i32 [[TMP]], 32 -; OPT-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 2, i32 1 -; OPT-NEXT: store i32 [[TMP2]], ptr addrspace(1) [[ARG]], align 4 -; OPT-NEXT: ret void -; -; OPT-W64-LABEL: define amdgpu_kernel void @fold_and_optimize_wavefrontsize( -; OPT-W64-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0]] { -; OPT-W64-NEXT: [[BB:.*:]] -; OPT-W64-NEXT: store i32 2, ptr addrspace(1) [[ARG]], align 4 -; OPT-W64-NEXT: ret void -; -; OPT-W32-LABEL: define amdgpu_kernel void @fold_and_optimize_wavefrontsize( -; OPT-W32-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0]] { -; OPT-W32-NEXT: [[BB:.*:]] -; OPT-W32-NEXT: store i32 1, ptr addrspace(1) [[ARG]], align 4 -; OPT-W32-NEXT: ret void -; bb: %tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0 %tmp1 = icmp ugt i32 %tmp, 32 @@ -90,29 +39,6 @@ bb: ; GCN-LABEL: {{^}}fold_and_optimize_if_wavefrontsize: define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(ptr addrspace(1) nocapture %arg) { -; OPT-LABEL: define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize( -; OPT-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0]] { -; OPT-NEXT: [[BB:.*:]] -; OPT-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.wavefrontsize() #[[ATTR2]] -; OPT-NEXT: [[TMP1:%.*]] = icmp ugt i32 [[TMP]], 32 -; OPT-NEXT: br i1 [[TMP1]], label %[[BB2:.*]], label %[[BB3:.*]] -; OPT: [[BB2]]: -; OPT-NEXT: store i32 1, ptr addrspace(1) [[ARG]], align 4 -; OPT-NEXT: br label %[[BB3]] -; OPT: [[BB3]]: -; OPT-NEXT: ret void -; -; OPT-W64-LABEL: define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize( -; OPT-W64-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0]] { -; OPT-W64-NEXT: [[BB:.*:]] -; OPT-W64-NEXT: store i32 1, ptr addrspace(1) [[ARG]], align 4 -; OPT-W64-NEXT: ret void -; -; OPT-W32-LABEL: define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize( -; OPT-W32-SAME: ptr addrspace(1) nocapture readnone [[ARG:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] { -; OPT-W32-NEXT: [[BB:.*:]] -; OPT-W32-NEXT: ret void -; bb: %tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0 %tmp1 = icmp ugt i32 %tmp, 32 diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.wavefrontsize.ll b/llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.wavefrontsize.ll new file mode 100644 index 00000000000000..d9c105f753e264 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.wavefrontsize.ll @@ -0,0 +1,114 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -mtriple=amdgcn-- -passes=instcombine -S < %s | FileCheck -check-prefix=OPT %s +; RUN: opt -mtriple=amdgcn-- -mattr=+wavefrontsize32 -passes=instcombine -S < %s | FileCheck -check-prefix=OPT-W32 %s +; RUN: opt -mtriple=amdgcn-- -mattr=+wavefrontsize64 -passes=instcombine -S < %s | FileCheck -check-prefix=OPT-W64 %s +; RUN: opt -mtriple=amdgcn-- -mcpu=tonga -passes=instcombine -S < %s | FileCheck -check-prefix=OPT-W64 %s +; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=+wavefrontsize32 -passes=instcombine -S < %s | FileCheck -check-prefix=OPT-W32 %s +; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=+wavefrontsize64 -passes=instcombine -S < %s | FileCheck -check-prefix=OPT-W64 %s +; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+wavefrontsize32 -passes=instcombine -S < %s | FileCheck -check-prefix=OPT-W32 %s +; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+wavefrontsize64 -passes=instcombine -S < %s | FileCheck -check-prefix=OPT-W64 %s + +define amdgpu_kernel void @fold_wavefrontsize(ptr addrspace(1) nocapture %arg) { +; OPT-LABEL: define amdgpu_kernel void @fold_wavefrontsize( +; OPT-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) { +; OPT-NEXT: [[BB:.*:]] +; OPT-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.wavefrontsize() #[[ATTR1:[0-9]+]] +; OPT-NEXT: store i32 [[TMP]], ptr addrspace(1) [[ARG]], align 4 +; OPT-NEXT: ret void +; +; OPT-W32-LABEL: define amdgpu_kernel void @fold_wavefrontsize( +; OPT-W32-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) #[[ATTR0:[0-9]+]] { +; OPT-W32-NEXT: [[BB:.*:]] +; OPT-W32-NEXT: store i32 32, ptr addrspace(1) [[ARG]], align 4 +; OPT-W32-NEXT: ret void +; +; OPT-W64-LABEL: define amdgpu_kernel void @fold_wavefrontsize( +; OPT-W64-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) #[[ATTR0:[0-9]+]] { +; OPT-W64-NEXT: [[BB:.*:]] +; OPT-W64-NEXT: store i32 64, ptr addrspace(1) [[ARG]], align 4 +; OPT-W64-NEXT: ret void +; +bb: + %tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0 + store i32 %tmp, ptr addrspace(1) %arg, align 4 + ret void +} + +define amdgpu_kernel void @fold_and_optimize_wavefrontsize(ptr addrspace(1) nocapture %arg) { +; OPT-LABEL: define amdgpu_kernel void @fold_and_optimize_wavefrontsize( +; OPT-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) { +; OPT-NEXT: [[BB:.*:]] +; OPT-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.wavefrontsize() #[[ATTR1]] +; OPT-NEXT: [[TMP1:%.*]] = icmp ugt i32 [[TMP]], 32 +; OPT-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 2, i32 1 +; OPT-NEXT: store i32 [[TMP2]], ptr addrspace(1) [[ARG]], align 4 +; OPT-NEXT: ret void +; +; OPT-W32-LABEL: define amdgpu_kernel void @fold_and_optimize_wavefrontsize( +; OPT-W32-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) #[[ATTR0]] { +; OPT-W32-NEXT: [[BB:.*:]] +; OPT-W32-NEXT: store i32 1, ptr addrspace(1) [[ARG]], align 4 +; OPT-W32-NEXT: ret void +; +; OPT-W64-LABEL: define amdgpu_kernel void @fold_and_optimize_wavefrontsize( +; OPT-W64-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) #[[ATTR0]] { +; OPT-W64-NEXT: [[BB:.*:]] +; OPT-W64-NEXT: store i32 2, ptr addrspace(1) [[ARG]], align 4 +; OPT-W64-NEXT: ret void +; +bb: + %tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0 + %tmp1 = icmp ugt i32 %tmp, 32 + %tmp2 = select i1 %tmp1, i32 2, i32 1 + store i32 %tmp2, ptr addrspace(1) %arg + ret void +} + +define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(ptr addrspace(1) nocapture %arg) { +; OPT-LABEL: define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize( +; OPT-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) { +; OPT-NEXT: [[BB:.*:]] +; OPT-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.wavefrontsize() #[[ATTR1]] +; OPT-NEXT: [[TMP1:%.*]] = icmp ugt i32 [[TMP]], 32 +; OPT-NEXT: br i1 [[TMP1]], label %[[BB2:.*]], label %[[BB3:.*]] +; OPT: [[BB2]]: +; OPT-NEXT: store i32 1, ptr addrspace(1) [[ARG]], align 4 +; OPT-NEXT: br label %[[BB3]] +; OPT: [[BB3]]: +; OPT-NEXT: ret void +; +; OPT-W32-LABEL: define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize( +; OPT-W32-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) #[[ATTR0]] { +; OPT-W32-NEXT: [[BB:.*:]] +; OPT-W32-NEXT: br i1 false, label %[[BB2:.*]], label %[[BB3:.*]] +; OPT-W32: [[BB2]]: +; OPT-W32-NEXT: br label %[[BB3]] +; OPT-W32: [[BB3]]: +; OPT-W32-NEXT: ret void +; +; OPT-W64-LABEL: define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize( +; OPT-W64-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) #[[ATTR0]] { +; OPT-W64-NEXT: [[BB:.*:]] +; OPT-W64-NEXT: br i1 true, label %[[BB2:.*]], label %[[BB3:.*]] +; OPT-W64: [[BB2]]: +; OPT-W64-NEXT: store i32 1, ptr addrspace(1) [[ARG]], align 4 +; OPT-W64-NEXT: br label %[[BB3]] +; OPT-W64: [[BB3]]: +; OPT-W64-NEXT: ret void +; +bb: + %tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0 + %tmp1 = icmp ugt i32 %tmp, 32 + br i1 %tmp1, label %bb2, label %bb3 + +bb2: ; preds = %bb + store i32 1, ptr addrspace(1) %arg, align 4 + br label %bb3 + +bb3: ; preds = %bb2, %bb + ret void +} + +declare i32 @llvm.amdgcn.wavefrontsize() #0 + +attributes #0 = { nounwind readnone speculatable } From ed9f19f9154ae9868a12a78b9740523f727dc98c Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Mon, 18 Nov 2024 16:54:00 +0000 Subject: [PATCH 11/12] Tweak `generic` mcpu handling. --- llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp index 0b2548af72fc0d..688519e1fb6700 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp @@ -1027,7 +1027,7 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { case Intrinsic::amdgcn_wavefrontsize: { // TODO: this is a workaround for the pseudo-generic target one gets with no // specified mcpu, which spoofs its wave size to 64; it should be removed. - if ((ST->getCPU().empty() || ST->getCPU() == "generic") && + if ((ST->getCPU().empty() || ST->getCPU().starts_with("generic")) && !ST->getFeatureString().contains("+wavefrontsize")) break; return IC.replaceInstUsesWith( From dcfe7be50909a01893c105b252e12f92dd4fd2b7 Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Sun, 24 Nov 2024 23:13:59 +0000 Subject: [PATCH 12/12] Use `isWaveSizeKnown` instead of gnarly hack. --- llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp index 78389dcb47f4b5..18a09c39a06387 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp @@ -1025,13 +1025,10 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { break; } case Intrinsic::amdgcn_wavefrontsize: { - // TODO: this is a workaround for the pseudo-generic target one gets with no - // specified mcpu, which spoofs its wave size to 64; it should be removed. - if ((ST->getCPU().empty() || ST->getCPU().starts_with("generic")) && - !ST->getFeatureString().contains("+wavefrontsize")) - break; - return IC.replaceInstUsesWith( - II, ConstantInt::get(II.getType(), ST->getWavefrontSize())); + if (ST->isWaveSizeKnown()) + return IC.replaceInstUsesWith( + II, ConstantInt::get(II.getType(), ST->getWavefrontSize())); + break; } case Intrinsic::amdgcn_wqm_vote: { // wqm_vote is identity when the argument is constant.