diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index 17d3e6ab7c65ab..95d0ad0f9dc96a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -345,15 +345,6 @@ extern char &AMDGPUPrintfRuntimeBindingID; void initializeAMDGPUResourceUsageAnalysisPass(PassRegistry &); extern char &AMDGPUResourceUsageAnalysisID; -struct AMDGPUExpandPseudoIntrinsicsPass - : PassInfoMixin { - const AMDGPUTargetMachine &TM; - AMDGPUExpandPseudoIntrinsicsPass(const AMDGPUTargetMachine &ATM) : TM(ATM) {} - PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); - - static bool isRequired() { return true; } -}; - struct AMDGPUPrintfRuntimeBindingPass : PassInfoMixin { PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp b/llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp deleted file mode 100644 index bf0ec39ab6c6e7..00000000000000 --- a/llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp +++ /dev/null @@ -1,59 +0,0 @@ -//===- AMDGPUExpandPseudoIntrinsics.cpp - Pseudo Intrinsic Expander Pass --===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// This file implements a pass that deals with expanding AMDGCN generic pseudo- -// intrinsics into target specific quantities / sequences. In this context, a -// pseudo-intrinsic is an AMDGCN intrinsic that does not directly map to a -// specific instruction, but rather is intended as a mechanism for abstractly -// conveying target specific info to a HLL / the FE, without concretely -// impacting the AST. An example of such an intrinsic is amdgcn.wavefrontsize. -// This pass should run as early as possible / immediately after Clang CodeGen, -// so that the optimisation pipeline and the BE operate with concrete target -// data. -//===----------------------------------------------------------------------===// - -#include "AMDGPU.h" -#include "AMDGPUTargetMachine.h" -#include "GCNSubtarget.h" - -#include "llvm/IR/Constants.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/IntrinsicsAMDGPU.h" -#include "llvm/IR/Module.h" -#include "llvm/Pass.h" - -using namespace llvm; - -static inline PreservedAnalyses expandWaveSizeIntrinsic(const GCNSubtarget &ST, - Function *WaveSize) { - if (WaveSize->hasZeroLiveUses()) - return PreservedAnalyses::all(); - - for (auto &&U : WaveSize->users()) - U->replaceAllUsesWith( - ConstantInt::get(WaveSize->getReturnType(), ST.getWavefrontSize())); - - return PreservedAnalyses::none(); -} - -PreservedAnalyses -AMDGPUExpandPseudoIntrinsicsPass::run(Module &M, ModuleAnalysisManager &) { - if (M.empty()) - return PreservedAnalyses::all(); - - const auto &ST = TM.getSubtarget(*M.begin()); - - // This is not a concrete target, we should not fold early. - if (ST.getCPU().empty() || ST.getCPU() == "generic") - return PreservedAnalyses::all(); - - if (auto WS = Intrinsic::getDeclarationIfExists( - &M, Intrinsic::amdgcn_wavefrontsize)) - return expandWaveSizeIntrinsic(ST, WS); - - return PreservedAnalyses::all(); -} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp index 8beb9defee66a0..0b2548af72fc0d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp @@ -1024,6 +1024,15 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { } break; } + case Intrinsic::amdgcn_wavefrontsize: { + // TODO: this is a workaround for the pseudo-generic target one gets with no + // specified mcpu, which spoofs its wave size to 64; it should be removed. + if ((ST->getCPU().empty() || ST->getCPU() == "generic") && + !ST->getFeatureString().contains("+wavefrontsize")) + break; + return IC.replaceInstUsesWith( + II, ConstantInt::get(II.getType(), ST->getWavefrontSize())); + } case Intrinsic::amdgcn_wqm_vote: { // wqm_vote is identity when the argument is constant. if (!isa(II.getArgOperand(0))) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def index 323c195c329168..174a90f0aa419d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def +++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def @@ -27,8 +27,6 @@ MODULE_PASS("amdgpu-perf-hint", *static_cast(this))) MODULE_PASS("amdgpu-printf-runtime-binding", AMDGPUPrintfRuntimeBindingPass()) MODULE_PASS("amdgpu-unify-metadata", AMDGPUUnifyMetadataPass()) -MODULE_PASS("amdgpu-expand-pseudo-intrinsics", - AMDGPUExpandPseudoIntrinsicsPass(*this)) #undef MODULE_PASS #ifndef MODULE_PASS_WITH_PARAMS diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt index c9d4452b4a035c..fed29c3e14aae2 100644 --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -54,7 +54,6 @@ add_llvm_target(AMDGPUCodeGen AMDGPUCodeGenPrepare.cpp AMDGPUCombinerHelper.cpp AMDGPUCtorDtorLowering.cpp - AMDGPUExpandPseudoIntrinsics.cpp AMDGPUExportClustering.cpp AMDGPUFrameLowering.cpp AMDGPUGlobalISelDivergenceLowering.cpp diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll index 2d060fd4305077..33dd2bd540ad06 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll @@ -1,21 +1,9 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,W64 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,W32 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,W64 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -verify-machineinstrs -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GCN,W32 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,W64 %s -; RUN: opt -O3 -S < %s | FileCheck -check-prefix=OPT %s -; RUN: opt -mtriple=amdgcn-- -O3 -S < %s | FileCheck -check-prefix=OPT %s -; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT %s -; RUN: opt -mtriple=amdgcn-- -passes='default' -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT %s -; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT %s -; RUN: opt -mtriple=amdgcn-- -mcpu=tonga -O3 -S < %s | FileCheck -check-prefix=OPT-W64 %s -; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT-W32 %s -; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT-W64 %s -; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT-W32 %s -; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT-W64 %s - ; GCN-LABEL: {{^}}fold_wavefrontsize: ; W32: v_mov_b32_e32 [[V:v[0-9]+]], 32 @@ -24,25 +12,7 @@ define amdgpu_kernel void @fold_wavefrontsize(ptr addrspace(1) nocapture %arg) { -; OPT-LABEL: define amdgpu_kernel void @fold_wavefrontsize( -; OPT-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { -; OPT-NEXT: [[BB:.*:]] -; OPT-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.wavefrontsize() #[[ATTR2:[0-9]+]] -; OPT-NEXT: store i32 [[TMP]], ptr addrspace(1) [[ARG]], align 4 -; OPT-NEXT: ret void -; -; OPT-W64-LABEL: define amdgpu_kernel void @fold_wavefrontsize( -; OPT-W64-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { -; OPT-W64-NEXT: [[BB:.*:]] -; OPT-W64-NEXT: store i32 64, ptr addrspace(1) [[ARG]], align 4 -; OPT-W64-NEXT: ret void -; -; OPT-W32-LABEL: define amdgpu_kernel void @fold_wavefrontsize( -; OPT-W32-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { -; OPT-W32-NEXT: [[BB:.*:]] -; OPT-W32-NEXT: store i32 32, ptr addrspace(1) [[ARG]], align 4 -; OPT-W32-NEXT: ret void -; + bb: %tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0 store i32 %tmp, ptr addrspace(1) %arg, align 4 @@ -58,27 +28,6 @@ bb: define amdgpu_kernel void @fold_and_optimize_wavefrontsize(ptr addrspace(1) nocapture %arg) { -; OPT-LABEL: define amdgpu_kernel void @fold_and_optimize_wavefrontsize( -; OPT-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0]] { -; OPT-NEXT: [[BB:.*:]] -; OPT-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.wavefrontsize() #[[ATTR2]] -; OPT-NEXT: [[TMP1:%.*]] = icmp ugt i32 [[TMP]], 32 -; OPT-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 2, i32 1 -; OPT-NEXT: store i32 [[TMP2]], ptr addrspace(1) [[ARG]], align 4 -; OPT-NEXT: ret void -; -; OPT-W64-LABEL: define amdgpu_kernel void @fold_and_optimize_wavefrontsize( -; OPT-W64-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0]] { -; OPT-W64-NEXT: [[BB:.*:]] -; OPT-W64-NEXT: store i32 2, ptr addrspace(1) [[ARG]], align 4 -; OPT-W64-NEXT: ret void -; -; OPT-W32-LABEL: define amdgpu_kernel void @fold_and_optimize_wavefrontsize( -; OPT-W32-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0]] { -; OPT-W32-NEXT: [[BB:.*:]] -; OPT-W32-NEXT: store i32 1, ptr addrspace(1) [[ARG]], align 4 -; OPT-W32-NEXT: ret void -; bb: %tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0 %tmp1 = icmp ugt i32 %tmp, 32 @@ -90,29 +39,6 @@ bb: ; GCN-LABEL: {{^}}fold_and_optimize_if_wavefrontsize: define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(ptr addrspace(1) nocapture %arg) { -; OPT-LABEL: define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize( -; OPT-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0]] { -; OPT-NEXT: [[BB:.*:]] -; OPT-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.wavefrontsize() #[[ATTR2]] -; OPT-NEXT: [[TMP1:%.*]] = icmp ugt i32 [[TMP]], 32 -; OPT-NEXT: br i1 [[TMP1]], label %[[BB2:.*]], label %[[BB3:.*]] -; OPT: [[BB2]]: -; OPT-NEXT: store i32 1, ptr addrspace(1) [[ARG]], align 4 -; OPT-NEXT: br label %[[BB3]] -; OPT: [[BB3]]: -; OPT-NEXT: ret void -; -; OPT-W64-LABEL: define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize( -; OPT-W64-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0]] { -; OPT-W64-NEXT: [[BB:.*:]] -; OPT-W64-NEXT: store i32 1, ptr addrspace(1) [[ARG]], align 4 -; OPT-W64-NEXT: ret void -; -; OPT-W32-LABEL: define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize( -; OPT-W32-SAME: ptr addrspace(1) nocapture readnone [[ARG:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] { -; OPT-W32-NEXT: [[BB:.*:]] -; OPT-W32-NEXT: ret void -; bb: %tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0 %tmp1 = icmp ugt i32 %tmp, 32 diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.wavefrontsize.ll b/llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.wavefrontsize.ll new file mode 100644 index 00000000000000..d9c105f753e264 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.wavefrontsize.ll @@ -0,0 +1,114 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -mtriple=amdgcn-- -passes=instcombine -S < %s | FileCheck -check-prefix=OPT %s +; RUN: opt -mtriple=amdgcn-- -mattr=+wavefrontsize32 -passes=instcombine -S < %s | FileCheck -check-prefix=OPT-W32 %s +; RUN: opt -mtriple=amdgcn-- -mattr=+wavefrontsize64 -passes=instcombine -S < %s | FileCheck -check-prefix=OPT-W64 %s +; RUN: opt -mtriple=amdgcn-- -mcpu=tonga -passes=instcombine -S < %s | FileCheck -check-prefix=OPT-W64 %s +; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=+wavefrontsize32 -passes=instcombine -S < %s | FileCheck -check-prefix=OPT-W32 %s +; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=+wavefrontsize64 -passes=instcombine -S < %s | FileCheck -check-prefix=OPT-W64 %s +; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+wavefrontsize32 -passes=instcombine -S < %s | FileCheck -check-prefix=OPT-W32 %s +; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+wavefrontsize64 -passes=instcombine -S < %s | FileCheck -check-prefix=OPT-W64 %s + +define amdgpu_kernel void @fold_wavefrontsize(ptr addrspace(1) nocapture %arg) { +; OPT-LABEL: define amdgpu_kernel void @fold_wavefrontsize( +; OPT-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) { +; OPT-NEXT: [[BB:.*:]] +; OPT-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.wavefrontsize() #[[ATTR1:[0-9]+]] +; OPT-NEXT: store i32 [[TMP]], ptr addrspace(1) [[ARG]], align 4 +; OPT-NEXT: ret void +; +; OPT-W32-LABEL: define amdgpu_kernel void @fold_wavefrontsize( +; OPT-W32-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) #[[ATTR0:[0-9]+]] { +; OPT-W32-NEXT: [[BB:.*:]] +; OPT-W32-NEXT: store i32 32, ptr addrspace(1) [[ARG]], align 4 +; OPT-W32-NEXT: ret void +; +; OPT-W64-LABEL: define amdgpu_kernel void @fold_wavefrontsize( +; OPT-W64-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) #[[ATTR0:[0-9]+]] { +; OPT-W64-NEXT: [[BB:.*:]] +; OPT-W64-NEXT: store i32 64, ptr addrspace(1) [[ARG]], align 4 +; OPT-W64-NEXT: ret void +; +bb: + %tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0 + store i32 %tmp, ptr addrspace(1) %arg, align 4 + ret void +} + +define amdgpu_kernel void @fold_and_optimize_wavefrontsize(ptr addrspace(1) nocapture %arg) { +; OPT-LABEL: define amdgpu_kernel void @fold_and_optimize_wavefrontsize( +; OPT-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) { +; OPT-NEXT: [[BB:.*:]] +; OPT-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.wavefrontsize() #[[ATTR1]] +; OPT-NEXT: [[TMP1:%.*]] = icmp ugt i32 [[TMP]], 32 +; OPT-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 2, i32 1 +; OPT-NEXT: store i32 [[TMP2]], ptr addrspace(1) [[ARG]], align 4 +; OPT-NEXT: ret void +; +; OPT-W32-LABEL: define amdgpu_kernel void @fold_and_optimize_wavefrontsize( +; OPT-W32-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) #[[ATTR0]] { +; OPT-W32-NEXT: [[BB:.*:]] +; OPT-W32-NEXT: store i32 1, ptr addrspace(1) [[ARG]], align 4 +; OPT-W32-NEXT: ret void +; +; OPT-W64-LABEL: define amdgpu_kernel void @fold_and_optimize_wavefrontsize( +; OPT-W64-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) #[[ATTR0]] { +; OPT-W64-NEXT: [[BB:.*:]] +; OPT-W64-NEXT: store i32 2, ptr addrspace(1) [[ARG]], align 4 +; OPT-W64-NEXT: ret void +; +bb: + %tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0 + %tmp1 = icmp ugt i32 %tmp, 32 + %tmp2 = select i1 %tmp1, i32 2, i32 1 + store i32 %tmp2, ptr addrspace(1) %arg + ret void +} + +define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(ptr addrspace(1) nocapture %arg) { +; OPT-LABEL: define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize( +; OPT-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) { +; OPT-NEXT: [[BB:.*:]] +; OPT-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.wavefrontsize() #[[ATTR1]] +; OPT-NEXT: [[TMP1:%.*]] = icmp ugt i32 [[TMP]], 32 +; OPT-NEXT: br i1 [[TMP1]], label %[[BB2:.*]], label %[[BB3:.*]] +; OPT: [[BB2]]: +; OPT-NEXT: store i32 1, ptr addrspace(1) [[ARG]], align 4 +; OPT-NEXT: br label %[[BB3]] +; OPT: [[BB3]]: +; OPT-NEXT: ret void +; +; OPT-W32-LABEL: define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize( +; OPT-W32-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) #[[ATTR0]] { +; OPT-W32-NEXT: [[BB:.*:]] +; OPT-W32-NEXT: br i1 false, label %[[BB2:.*]], label %[[BB3:.*]] +; OPT-W32: [[BB2]]: +; OPT-W32-NEXT: br label %[[BB3]] +; OPT-W32: [[BB3]]: +; OPT-W32-NEXT: ret void +; +; OPT-W64-LABEL: define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize( +; OPT-W64-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) #[[ATTR0]] { +; OPT-W64-NEXT: [[BB:.*:]] +; OPT-W64-NEXT: br i1 true, label %[[BB2:.*]], label %[[BB3:.*]] +; OPT-W64: [[BB2]]: +; OPT-W64-NEXT: store i32 1, ptr addrspace(1) [[ARG]], align 4 +; OPT-W64-NEXT: br label %[[BB3]] +; OPT-W64: [[BB3]]: +; OPT-W64-NEXT: ret void +; +bb: + %tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0 + %tmp1 = icmp ugt i32 %tmp, 32 + br i1 %tmp1, label %bb2, label %bb3 + +bb2: ; preds = %bb + store i32 1, ptr addrspace(1) %arg, align 4 + br label %bb3 + +bb3: ; preds = %bb2, %bb + ret void +} + +declare i32 @llvm.amdgcn.wavefrontsize() #0 + +attributes #0 = { nounwind readnone speculatable }