-
Notifications
You must be signed in to change notification settings - Fork 12.1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[llvm][AMDGPU] Fold llvm.amdgcn.wavefrontsize
early
#114481
base: main
Are you sure you want to change the base?
Changes from 2 commits
3ba88ce
1376596
826c291
ab6f5a2
f8705fb
ed870a8
f5751a5
026ed00
195decc
1a7abaf
9aed76c
246c22f
5a11720
7cf7558
6a77b8a
be414a8
dedc593
c634b4e
c7be46f
ed9f19f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
//===- AMDGPUExpandPseudoIntrinsics.cpp - Pseudo Intrinsic Expander Pass --===// | ||
// | ||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | ||
// See https://llvm.org/LICENSE.txt for license information. | ||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||
// | ||
//===----------------------------------------------------------------------===// | ||
// This file implements a pass that deals with expanding AMDGCN generic pseudo- | ||
// intrinsics into target specific quantities / sequences. In this context, a | ||
// pseudo-intrinsic is an AMDGCN intrinsic that does not directly map to a | ||
// specific instruction, but rather is intended as a mechanism for abstractly | ||
// conveying target specific info to a HLL / the FE, without concretely | ||
// impacting the AST. An example of such an intrinsic is amdgcn.wavefrontsize. | ||
// This pass should run as early as possible / immediately after Clang CodeGen, | ||
// so that the optimisation pipeline and the BE operate with concrete target | ||
// data. | ||
//===----------------------------------------------------------------------===// | ||
|
||
#include "AMDGPU.h" | ||
#include "AMDGPUTargetMachine.h" | ||
#include "GCNSubtarget.h" | ||
|
||
#include "llvm/IR/Constants.h" | ||
#include "llvm/IR/Function.h" | ||
#include "llvm/IR/Module.h" | ||
#include "llvm/Pass.h" | ||
|
||
using namespace llvm; | ||
|
||
/// Folds every call to the wavefront-size intrinsic into an integer constant.
///
/// Subtargets are a per-function property, so the wavefront size is queried
/// from the function that contains each call rather than from the intrinsic
/// declaration itself; the declaration carries no target features of its own.
///
/// \param TM       Target machine used to look up the per-function subtarget.
/// \param WaveSize Declaration of the wavefront-size intrinsic in the module.
/// \returns PreservedAnalyses::all() when the declaration has no live uses,
///          PreservedAnalyses::none() otherwise.
static PreservedAnalyses expandWaveSizeIntrinsic(const TargetMachine &TM,
                                                 Function *WaveSize) {
  if (WaveSize->hasZeroLiveUses())
    return PreservedAnalyses::all();

  Type *ResultTy = WaveSize->getReturnType();
  for (User *U : WaveSize->users()) {
    // Only users that live inside a function have a subtarget to consult.
    auto *Call = dyn_cast<Instruction>(U);
    if (!Call)
      continue;

    const GCNSubtarget &ST =
        TM.getSubtarget<GCNSubtarget>(*Call->getFunction());
    // Only the uses of the call's result are rewritten; the call instruction
    // itself is not erased here, so the use list being walked is not mutated.
    Call->replaceAllUsesWith(
        ConstantInt::get(ResultTy, ST.getWavefrontSize()));
  }

  return PreservedAnalyses::none();
}

/// Module entry point: expands the wavefront-size pseudo-intrinsic if the
/// module declares it, and reports everything preserved otherwise.
PreservedAnalyses
AMDGPUExpandPseudoIntrinsicsPass::run(Module &M, ModuleAnalysisManager &) {
  // NOTE(review): the declaration is still looked up by name; a lookup by
  // intrinsic ID would be more robust if the required headers are available.
  if (Function *WS = M.getFunction("llvm.amdgcn.wavefrontsize"))
    return expandWaveSizeIntrinsic(TM, WS);

  return PreservedAnalyses::all();
}
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -739,7 +739,8 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { | |
#include "llvm/Passes/TargetPassRegistry.inc" | ||
|
||
PB.registerPipelineStartEPCallback( | ||
[](ModulePassManager &PM, OptimizationLevel Level) { | ||
[this](ModulePassManager &PM, OptimizationLevel Level) { | ||
PM.addPass(AMDGPUExpandPseudoIntrinsicsPass(*this)); | ||
FunctionPassManager FPM; | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why do we need to get it run via module pass first, and then function pass again? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. That was a typo, mashed up merge, apologies. |
||
PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); | ||
if (EnableHipStdPar) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,53 +1,84 @@ | ||
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 | ||
AlexVlx marked this conversation as resolved.
Show resolved
Hide resolved
|
||
; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,W64 %s | ||
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,W32 %s | ||
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,W64 %s | ||
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -verify-machineinstrs -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GCN,W32 %s | ||
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,W64 %s | ||
|
||
; RUN: opt -O3 -S < %s | FileCheck -check-prefix=OPT %s | ||
; RUN: opt -mtriple=amdgcn-- -O3 -S < %s | FileCheck -check-prefix=OPT %s | ||
; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT %s | ||
; RUN: opt -mtriple=amdgcn-- -passes='default<O3>' -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT %s | ||
; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT %s | ||
jhuber6 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
; RUN: opt -mtriple=amdgcn-- -mcpu=tonga -O3 -S < %s | FileCheck -check-prefix=OPT %s | ||
; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT %s | ||
; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT %s | ||
; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT %s | ||
; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT %s | ||
; RUN: opt -mtriple=amdgcn-- -O3 -S < %s | FileCheck -check-prefix=OPT-W64 %s | ||
; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT-W32 %s | ||
; RUN: opt -mtriple=amdgcn-- -passes='default<O3>' -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT-W32 %s | ||
; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT-W64 %s | ||
; RUN: opt -mtriple=amdgcn-- -mcpu=tonga -O3 -S < %s | FileCheck -check-prefix=OPT-W64 %s | ||
; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT-W32 %s | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This codegen test shouldn't be running all of these passes There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It already was, mostly? It seems worthwhile to individualise the possible / plausible scenarios. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Simplified. |
||
; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT-W64 %s | ||
; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT-W32 %s | ||
; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT-W64 %s | ||
|
||
; GCN-LABEL: {{^}}fold_wavefrontsize: | ||
; OPT-LABEL: define amdgpu_kernel void @fold_wavefrontsize( | ||
|
||
; W32: v_mov_b32_e32 [[V:v[0-9]+]], 32 | ||
; W64: v_mov_b32_e32 [[V:v[0-9]+]], 64 | ||
; GCN: store_{{dword|b32}} v{{.+}}, [[V]] | ||
|
||
; OPT: %tmp = tail call i32 @llvm.amdgcn.wavefrontsize() | ||
; OPT: store i32 %tmp, ptr addrspace(1) %arg, align 4 | ||
; OPT-NEXT: ret void | ||
|
||
; Kernel that stores the result of llvm.amdgcn.wavefrontsize to %arg. | ||
; The OPT-W64 / OPT-W32 checks expect the call to be folded to the | ||
; constants 64 and 32 respectively. | ||
define amdgpu_kernel void @fold_wavefrontsize(ptr addrspace(1) nocapture %arg) { | ||
; OPT-LABEL: define amdgpu_kernel void @fold_wavefrontsize( | ||
; OPT-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { | ||
; OPT-NEXT: [[BB:.*:]] | ||
; OPT-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.wavefrontsize() #[[ATTR2:[0-9]+]] | ||
; OPT-NEXT: store i32 [[TMP]], ptr addrspace(1) [[ARG]], align 4 | ||
; OPT-NEXT: ret void | ||
; | ||
; OPT-W64-LABEL: define amdgpu_kernel void @fold_wavefrontsize( | ||
; OPT-W64-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { | ||
; OPT-W64-NEXT: [[BB:.*:]] | ||
; OPT-W64-NEXT: store i32 64, ptr addrspace(1) [[ARG]], align 4 | ||
; OPT-W64-NEXT: ret void | ||
; | ||
; OPT-W32-LABEL: define amdgpu_kernel void @fold_wavefrontsize( | ||
; OPT-W32-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { | ||
; OPT-W32-NEXT: [[BB:.*:]] | ||
; OPT-W32-NEXT: store i32 32, ptr addrspace(1) [[ARG]], align 4 | ||
; OPT-W32-NEXT: ret void | ||
; | ||
bb: | ||
%tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0 | ||
store i32 %tmp, ptr addrspace(1) %arg, align 4 | ||
ret void | ||
} | ||
|
||
; GCN-LABEL: {{^}}fold_and_optimize_wavefrontsize: | ||
; OPT-LABEL: define amdgpu_kernel void @fold_and_optimize_wavefrontsize( | ||
|
||
; W32: v_mov_b32_e32 [[V:v[0-9]+]], 1{{$}} | ||
; W64: v_mov_b32_e32 [[V:v[0-9]+]], 2{{$}} | ||
; GCN-NOT: cndmask | ||
; GCN: store_{{dword|b32}} v{{.+}}, [[V]] | ||
|
||
; OPT: %tmp = tail call i32 @llvm.amdgcn.wavefrontsize() | ||
; OPT: %tmp1 = icmp ugt i32 %tmp, 32 | ||
; OPT: %tmp2 = select i1 %tmp1, i32 2, i32 1 | ||
; OPT: store i32 %tmp2, ptr addrspace(1) %arg | ||
; OPT-NEXT: ret void | ||
|
||
define amdgpu_kernel void @fold_and_optimize_wavefrontsize(ptr addrspace(1) nocapture %arg) { | ||
; OPT-LABEL: define amdgpu_kernel void @fold_and_optimize_wavefrontsize( | ||
; OPT-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0]] { | ||
; OPT-NEXT: [[BB:.*:]] | ||
; OPT-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.wavefrontsize() #[[ATTR2]] | ||
; OPT-NEXT: [[TMP1:%.*]] = icmp ugt i32 [[TMP]], 32 | ||
; OPT-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 2, i32 1 | ||
; OPT-NEXT: store i32 [[TMP2]], ptr addrspace(1) [[ARG]], align 4 | ||
; OPT-NEXT: ret void | ||
; | ||
; OPT-W64-LABEL: define amdgpu_kernel void @fold_and_optimize_wavefrontsize( | ||
; OPT-W64-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0]] { | ||
; OPT-W64-NEXT: [[BB:.*:]] | ||
; OPT-W64-NEXT: store i32 2, ptr addrspace(1) [[ARG]], align 4 | ||
; OPT-W64-NEXT: ret void | ||
; | ||
; OPT-W32-LABEL: define amdgpu_kernel void @fold_and_optimize_wavefrontsize( | ||
; OPT-W32-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0]] { | ||
; OPT-W32-NEXT: [[BB:.*:]] | ||
; OPT-W32-NEXT: store i32 1, ptr addrspace(1) [[ARG]], align 4 | ||
; OPT-W32-NEXT: ret void | ||
; | ||
bb: | ||
%tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0 | ||
%tmp1 = icmp ugt i32 %tmp, 32 | ||
|
@@ -57,15 +88,31 @@ bb: | |
} | ||
|
||
; GCN-LABEL: {{^}}fold_and_optimize_if_wavefrontsize: | ||
; OPT-LABEL: define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize( | ||
|
||
; OPT: bb: | ||
; OPT: %tmp = tail call i32 @llvm.amdgcn.wavefrontsize() | ||
; OPT: %tmp1 = icmp ugt i32 %tmp, 32 | ||
; OPT: bb3: | ||
; OPT-NEXT: ret void | ||
|
||
define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(ptr addrspace(1) nocapture %arg) { | ||
; OPT-LABEL: define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize( | ||
; OPT-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0]] { | ||
; OPT-NEXT: [[BB:.*:]] | ||
; OPT-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.wavefrontsize() #[[ATTR2]] | ||
; OPT-NEXT: [[TMP1:%.*]] = icmp ugt i32 [[TMP]], 32 | ||
; OPT-NEXT: br i1 [[TMP1]], label %[[BB2:.*]], label %[[BB3:.*]] | ||
; OPT: [[BB2]]: | ||
; OPT-NEXT: store i32 1, ptr addrspace(1) [[ARG]], align 4 | ||
; OPT-NEXT: br label %[[BB3]] | ||
; OPT: [[BB3]]: | ||
; OPT-NEXT: ret void | ||
; | ||
; OPT-W64-LABEL: define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize( | ||
; OPT-W64-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0]] { | ||
; OPT-W64-NEXT: [[BB:.*:]] | ||
; OPT-W64-NEXT: store i32 1, ptr addrspace(1) [[ARG]], align 4 | ||
; OPT-W64-NEXT: ret void | ||
; | ||
; OPT-W32-LABEL: define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize( | ||
; OPT-W32-SAME: ptr addrspace(1) nocapture readnone [[ARG:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] { | ||
; OPT-W32-NEXT: [[BB:.*:]] | ||
; OPT-W32-NEXT: ret void | ||
; | ||
bb: | ||
%tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0 | ||
%tmp1 = icmp ugt i32 %tmp, 32 | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The pass isn't needed now?