Merge branch 'handle_wavefrontsize_early' of https://github.com/AlexV…

…lx/llvm-project into handle_wavefrontsize_early
llvm · Nov 18, 2024 · c7be46f · c7be46f
2 parents c634b4e + dedc593
commit c7be46f
Show file tree

Hide file tree

Showing 7 changed files with 124 additions and 146 deletions.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -345,15 +345,6 @@ extern char &AMDGPUPrintfRuntimeBindingID;
 void initializeAMDGPUResourceUsageAnalysisPass(PassRegistry &);
 extern char &AMDGPUResourceUsageAnalysisID;
 
-struct AMDGPUExpandPseudoIntrinsicsPass
-    : PassInfoMixin<AMDGPUExpandPseudoIntrinsicsPass> {
-  const AMDGPUTargetMachine &TM;
-  AMDGPUExpandPseudoIntrinsicsPass(const AMDGPUTargetMachine &ATM) : TM(ATM) {}
-  PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
-
-  static bool isRequired() { return true; }
-};
-
 struct AMDGPUPrintfRuntimeBindingPass
     : PassInfoMixin<AMDGPUPrintfRuntimeBindingPass> {
   PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp b/llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -1024,6 +1024,15 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
     }
     break;
   }
+  case Intrinsic::amdgcn_wavefrontsize: {
+    // TODO: this is a workaround for the pseudo-generic target one gets with no
+    // specified mcpu, which spoofs its wave size to 64; it should be removed.
+    if ((ST->getCPU().empty() || ST->getCPU() == "generic") &&
+        !ST->getFeatureString().contains("+wavefrontsize"))
+      break;
+    return IC.replaceInstUsesWith(
+        II, ConstantInt::get(II.getType(), ST->getWavefrontSize()));
+  }
   case Intrinsic::amdgcn_wqm_vote: {
     // wqm_vote is identity when the argument is constant.
     if (!isa<Constant>(II.getArgOperand(0)))

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -27,8 +27,6 @@ MODULE_PASS("amdgpu-perf-hint",
               *static_cast<const GCNTargetMachine *>(this)))
 MODULE_PASS("amdgpu-printf-runtime-binding", AMDGPUPrintfRuntimeBindingPass())
 MODULE_PASS("amdgpu-unify-metadata", AMDGPUUnifyMetadataPass())
-MODULE_PASS("amdgpu-expand-pseudo-intrinsics",
-            AMDGPUExpandPseudoIntrinsicsPass(*this))
 #undef MODULE_PASS
 
 #ifndef MODULE_PASS_WITH_PARAMS

diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -54,7 +54,6 @@ add_llvm_target(AMDGPUCodeGen
   AMDGPUCodeGenPrepare.cpp
   AMDGPUCombinerHelper.cpp
   AMDGPUCtorDtorLowering.cpp
-  AMDGPUExpandPseudoIntrinsics.cpp
   AMDGPUExportClustering.cpp
   AMDGPUFrameLowering.cpp
   AMDGPUGlobalISelDivergenceLowering.cpp

diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll
@@ -1,21 +1,9 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,W64 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,W32 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,W64 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -verify-machineinstrs -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GCN,W32 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,W64 %s
 
-; RUN: opt -O3 -S < %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -mtriple=amdgcn-- -O3 -S < %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -mtriple=amdgcn-- -passes='default<O3>' -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -mtriple=amdgcn-- -mcpu=tonga -O3 -S < %s | FileCheck -check-prefix=OPT-W64 %s
-; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT-W32 %s
-; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT-W64 %s
-; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT-W32 %s
-; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT-W64 %s
-
 ; GCN-LABEL: {{^}}fold_wavefrontsize:
 
 ; W32:       v_mov_b32_e32 [[V:v[0-9]+]], 32
@@ -24,25 +12,7 @@
 
 
 define amdgpu_kernel void @fold_wavefrontsize(ptr addrspace(1) nocapture %arg) {
-; OPT-LABEL: define amdgpu_kernel void @fold_wavefrontsize(
-; OPT-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
-; OPT-NEXT:  [[BB:.*:]]
-; OPT-NEXT:    [[TMP:%.*]] = tail call i32 @llvm.amdgcn.wavefrontsize() #[[ATTR2:[0-9]+]]
-; OPT-NEXT:    store i32 [[TMP]], ptr addrspace(1) [[ARG]], align 4
-; OPT-NEXT:    ret void
-;
-; OPT-W64-LABEL: define amdgpu_kernel void @fold_wavefrontsize(
-; OPT-W64-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
-; OPT-W64-NEXT:  [[BB:.*:]]
-; OPT-W64-NEXT:    store i32 64, ptr addrspace(1) [[ARG]], align 4
-; OPT-W64-NEXT:    ret void
-;
-; OPT-W32-LABEL: define amdgpu_kernel void @fold_wavefrontsize(
-; OPT-W32-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
-; OPT-W32-NEXT:  [[BB:.*:]]
-; OPT-W32-NEXT:    store i32 32, ptr addrspace(1) [[ARG]], align 4
-; OPT-W32-NEXT:    ret void
-;
+
 bb:
   %tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0
   store i32 %tmp, ptr addrspace(1) %arg, align 4
@@ -58,27 +28,6 @@ bb:
 
 
 define amdgpu_kernel void @fold_and_optimize_wavefrontsize(ptr addrspace(1) nocapture %arg) {
-; OPT-LABEL: define amdgpu_kernel void @fold_and_optimize_wavefrontsize(
-; OPT-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0]] {
-; OPT-NEXT:  [[BB:.*:]]
-; OPT-NEXT:    [[TMP:%.*]] = tail call i32 @llvm.amdgcn.wavefrontsize() #[[ATTR2]]
-; OPT-NEXT:    [[TMP1:%.*]] = icmp ugt i32 [[TMP]], 32
-; OPT-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 2, i32 1
-; OPT-NEXT:    store i32 [[TMP2]], ptr addrspace(1) [[ARG]], align 4
-; OPT-NEXT:    ret void
-;
-; OPT-W64-LABEL: define amdgpu_kernel void @fold_and_optimize_wavefrontsize(
-; OPT-W64-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0]] {
-; OPT-W64-NEXT:  [[BB:.*:]]
-; OPT-W64-NEXT:    store i32 2, ptr addrspace(1) [[ARG]], align 4
-; OPT-W64-NEXT:    ret void
-;
-; OPT-W32-LABEL: define amdgpu_kernel void @fold_and_optimize_wavefrontsize(
-; OPT-W32-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0]] {
-; OPT-W32-NEXT:  [[BB:.*:]]
-; OPT-W32-NEXT:    store i32 1, ptr addrspace(1) [[ARG]], align 4
-; OPT-W32-NEXT:    ret void
-;
 bb:
   %tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0
   %tmp1 = icmp ugt i32 %tmp, 32
@@ -90,29 +39,6 @@ bb:
 ; GCN-LABEL: {{^}}fold_and_optimize_if_wavefrontsize:
 
 define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(ptr addrspace(1) nocapture %arg) {
-; OPT-LABEL: define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(
-; OPT-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0]] {
-; OPT-NEXT:  [[BB:.*:]]
-; OPT-NEXT:    [[TMP:%.*]] = tail call i32 @llvm.amdgcn.wavefrontsize() #[[ATTR2]]
-; OPT-NEXT:    [[TMP1:%.*]] = icmp ugt i32 [[TMP]], 32
-; OPT-NEXT:    br i1 [[TMP1]], label %[[BB2:.*]], label %[[BB3:.*]]
-; OPT:       [[BB2]]:
-; OPT-NEXT:    store i32 1, ptr addrspace(1) [[ARG]], align 4
-; OPT-NEXT:    br label %[[BB3]]
-; OPT:       [[BB3]]:
-; OPT-NEXT:    ret void
-;
-; OPT-W64-LABEL: define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(
-; OPT-W64-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0]] {
-; OPT-W64-NEXT:  [[BB:.*:]]
-; OPT-W64-NEXT:    store i32 1, ptr addrspace(1) [[ARG]], align 4
-; OPT-W64-NEXT:    ret void
-;
-; OPT-W32-LABEL: define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(
-; OPT-W32-SAME: ptr addrspace(1) nocapture readnone [[ARG:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] {
-; OPT-W32-NEXT:  [[BB:.*:]]
-; OPT-W32-NEXT:    ret void
-;
 bb:
   %tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0
   %tmp1 = icmp ugt i32 %tmp, 32

diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.wavefrontsize.ll b/llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.wavefrontsize.ll
@@ -0,0 +1,114 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -mtriple=amdgcn-- -passes=instcombine -S < %s | FileCheck -check-prefix=OPT %s
+; RUN: opt -mtriple=amdgcn-- -mattr=+wavefrontsize32 -passes=instcombine -S < %s | FileCheck -check-prefix=OPT-W32 %s
+; RUN: opt -mtriple=amdgcn-- -mattr=+wavefrontsize64 -passes=instcombine -S < %s | FileCheck -check-prefix=OPT-W64 %s
+; RUN: opt -mtriple=amdgcn-- -mcpu=tonga -passes=instcombine -S < %s | FileCheck -check-prefix=OPT-W64 %s
+; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=+wavefrontsize32 -passes=instcombine -S < %s | FileCheck -check-prefix=OPT-W32 %s
+; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=+wavefrontsize64 -passes=instcombine -S < %s | FileCheck -check-prefix=OPT-W64 %s
+; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+wavefrontsize32 -passes=instcombine -S < %s | FileCheck -check-prefix=OPT-W32 %s
+; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+wavefrontsize64 -passes=instcombine -S < %s | FileCheck -check-prefix=OPT-W64 %s
+
+define amdgpu_kernel void @fold_wavefrontsize(ptr addrspace(1) nocapture %arg) {
+; OPT-LABEL: define amdgpu_kernel void @fold_wavefrontsize(
+; OPT-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) {
+; OPT-NEXT:  [[BB:.*:]]
+; OPT-NEXT:    [[TMP:%.*]] = tail call i32 @llvm.amdgcn.wavefrontsize() #[[ATTR1:[0-9]+]]
+; OPT-NEXT:    store i32 [[TMP]], ptr addrspace(1) [[ARG]], align 4
+; OPT-NEXT:    ret void
+;
+; OPT-W32-LABEL: define amdgpu_kernel void @fold_wavefrontsize(
+; OPT-W32-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) #[[ATTR0:[0-9]+]] {
+; OPT-W32-NEXT:  [[BB:.*:]]
+; OPT-W32-NEXT:    store i32 32, ptr addrspace(1) [[ARG]], align 4
+; OPT-W32-NEXT:    ret void
+;
+; OPT-W64-LABEL: define amdgpu_kernel void @fold_wavefrontsize(
+; OPT-W64-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) #[[ATTR0:[0-9]+]] {
+; OPT-W64-NEXT:  [[BB:.*:]]
+; OPT-W64-NEXT:    store i32 64, ptr addrspace(1) [[ARG]], align 4
+; OPT-W64-NEXT:    ret void
+;
+bb:
+  %tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0
+  store i32 %tmp, ptr addrspace(1) %arg, align 4
+  ret void
+}
+
+define amdgpu_kernel void @fold_and_optimize_wavefrontsize(ptr addrspace(1) nocapture %arg) {
+; OPT-LABEL: define amdgpu_kernel void @fold_and_optimize_wavefrontsize(
+; OPT-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) {
+; OPT-NEXT:  [[BB:.*:]]
+; OPT-NEXT:    [[TMP:%.*]] = tail call i32 @llvm.amdgcn.wavefrontsize() #[[ATTR1]]
+; OPT-NEXT:    [[TMP1:%.*]] = icmp ugt i32 [[TMP]], 32
+; OPT-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 2, i32 1
+; OPT-NEXT:    store i32 [[TMP2]], ptr addrspace(1) [[ARG]], align 4
+; OPT-NEXT:    ret void
+;
+; OPT-W32-LABEL: define amdgpu_kernel void @fold_and_optimize_wavefrontsize(
+; OPT-W32-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) #[[ATTR0]] {
+; OPT-W32-NEXT:  [[BB:.*:]]
+; OPT-W32-NEXT:    store i32 1, ptr addrspace(1) [[ARG]], align 4
+; OPT-W32-NEXT:    ret void
+;
+; OPT-W64-LABEL: define amdgpu_kernel void @fold_and_optimize_wavefrontsize(
+; OPT-W64-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) #[[ATTR0]] {
+; OPT-W64-NEXT:  [[BB:.*:]]
+; OPT-W64-NEXT:    store i32 2, ptr addrspace(1) [[ARG]], align 4
+; OPT-W64-NEXT:    ret void
+;
+bb:
+  %tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0
+  %tmp1 = icmp ugt i32 %tmp, 32
+  %tmp2 = select i1 %tmp1, i32 2, i32 1
+  store i32 %tmp2, ptr addrspace(1) %arg
+  ret void
+}
+
+define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(ptr addrspace(1) nocapture %arg) {
+; OPT-LABEL: define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(
+; OPT-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) {
+; OPT-NEXT:  [[BB:.*:]]
+; OPT-NEXT:    [[TMP:%.*]] = tail call i32 @llvm.amdgcn.wavefrontsize() #[[ATTR1]]
+; OPT-NEXT:    [[TMP1:%.*]] = icmp ugt i32 [[TMP]], 32
+; OPT-NEXT:    br i1 [[TMP1]], label %[[BB2:.*]], label %[[BB3:.*]]
+; OPT:       [[BB2]]:
+; OPT-NEXT:    store i32 1, ptr addrspace(1) [[ARG]], align 4
+; OPT-NEXT:    br label %[[BB3]]
+; OPT:       [[BB3]]:
+; OPT-NEXT:    ret void
+;
+; OPT-W32-LABEL: define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(
+; OPT-W32-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) #[[ATTR0]] {
+; OPT-W32-NEXT:  [[BB:.*:]]
+; OPT-W32-NEXT:    br i1 false, label %[[BB2:.*]], label %[[BB3:.*]]
+; OPT-W32:       [[BB2]]:
+; OPT-W32-NEXT:    br label %[[BB3]]
+; OPT-W32:       [[BB3]]:
+; OPT-W32-NEXT:    ret void
+;
+; OPT-W64-LABEL: define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(
+; OPT-W64-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) #[[ATTR0]] {
+; OPT-W64-NEXT:  [[BB:.*:]]
+; OPT-W64-NEXT:    br i1 true, label %[[BB2:.*]], label %[[BB3:.*]]
+; OPT-W64:       [[BB2]]:
+; OPT-W64-NEXT:    store i32 1, ptr addrspace(1) [[ARG]], align 4
+; OPT-W64-NEXT:    br label %[[BB3]]
+; OPT-W64:       [[BB3]]:
+; OPT-W64-NEXT:    ret void
+;
+bb:
+  %tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0
+  %tmp1 = icmp ugt i32 %tmp, 32
+  br i1 %tmp1, label %bb2, label %bb3
+
+bb2:                                              ; preds = %bb
+  store i32 1, ptr addrspace(1) %arg, align 4
+  br label %bb3
+
+bb3:                                              ; preds = %bb2, %bb
+  ret void
+}
+
+declare i32 @llvm.amdgcn.wavefrontsize() #0
+
+attributes #0 = { nounwind readnone speculatable }