Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[llvm][AMDGPU] Fold llvm.amdgcn.wavefrontsize early #114481

Open
wants to merge 20 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
3ba88ce
Add pass to handle AMDGCN pseudo-intrinsics (abstract placeholders fo…
AlexVlx Oct 31, 2024
1376596
Merge branch 'main' of https://github.com/llvm/llvm-project into hand…
AlexVlx Oct 31, 2024
826c291
Implement review feedback.
AlexVlx Nov 1, 2024
ab6f5a2
Do not fold early for `generic` mcpu.
AlexVlx Nov 1, 2024
f8705fb
Fix formatting (again).
AlexVlx Nov 1, 2024
ed870a8
Merge branch 'main' of https://github.com/llvm/llvm-project into hand…
AlexVlx Nov 1, 2024
f5751a5
Merge branch 'main' of https://github.com/llvm/llvm-project into hand…
AlexVlx Nov 4, 2024
026ed00
Remove pass, fold in InstCombine.
AlexVlx Nov 4, 2024
195decc
Remove leftovers.
AlexVlx Nov 4, 2024
1a7abaf
Remove pass.
AlexVlx Nov 4, 2024
9aed76c
Fix formatting.
AlexVlx Nov 4, 2024
246c22f
Really fix formatting.
AlexVlx Nov 4, 2024
5a11720
Merge branch 'main' of https://github.com/llvm/llvm-project into hand…
AlexVlx Nov 6, 2024
7cf7558
Split tests.
AlexVlx Nov 6, 2024
6a77b8a
Merge branch 'main' of https://github.com/llvm/llvm-project into hand…
AlexVlx Nov 6, 2024
be414a8
Merge branch 'main' of https://github.com/llvm/llvm-project into hand…
AlexVlx Nov 7, 2024
dedc593
Merge branch 'main' of https://github.com/llvm/llvm-project into hand…
AlexVlx Nov 7, 2024
c634b4e
Merge branch 'main' of https://github.com/llvm/llvm-project into hand…
AlexVlx Nov 18, 2024
c7be46f
Merge branch 'handle_wavefrontsize_early' of https://github.com/AlexV…
AlexVlx Nov 18, 2024
ed9f19f
Tweak `generic` mcpu handling.
AlexVlx Nov 18, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions clang/test/CodeGenOpenCL/builtins-amdgcn.cl
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
// REQUIRES: amdgpu-registered-target
// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu tahiti -emit-llvm -o - %s | FileCheck -enable-var-scope --check-prefixes=CHECK-AMDGCN,CHECK %s
// RUN: %clang_cc1 -cl-std=CL2.0 -triple spirv64-amd-amdhsa -emit-llvm -o - %s | FileCheck -enable-var-scope --check-prefix=CHECK %s
// RUN: %clang_cc1 -cl-std=CL2.0 -triple spirv64-amd-amdhsa -emit-llvm -o - %s | FileCheck -enable-var-scope --check-prefixes=CHECK,CHECK-SPIRV %s


#pragma OPENCL EXTENSION cl_khr_fp64 : enable
Expand Down Expand Up @@ -866,7 +866,8 @@ void test_atomic_inc_dec(__attribute__((address_space(3))) uint *lptr, __attribu
// CHECK-LABEL: test_wavefrontsize(
unsigned test_wavefrontsize() {

// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wavefrontsize()
// CHECK-AMDGCN: ret i32 {{[0-9]+}}
// CHECK-SPIRV: {{.*}}call{{.*}} i32 @llvm.amdgcn.wavefrontsize()
return __builtin_amdgcn_wavefrontsize();
}

Expand Down
9 changes: 9 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPU.h
Original file line number Diff line number Diff line change
Expand Up @@ -345,6 +345,15 @@ extern char &AMDGPUPrintfRuntimeBindingID;
void initializeAMDGPUResourceUsageAnalysisPass(PassRegistry &);
extern char &AMDGPUResourceUsageAnalysisID;

struct AMDGPUExpandPseudoIntrinsicsPass
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The pass isn't needed now?

: PassInfoMixin<AMDGPUExpandPseudoIntrinsicsPass> {
const AMDGPUTargetMachine &TM;
AMDGPUExpandPseudoIntrinsicsPass(const AMDGPUTargetMachine &ATM) : TM(ATM) {}
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);

static bool isRequired() { return true; }
};

struct AMDGPUPrintfRuntimeBindingPass
: PassInfoMixin<AMDGPUPrintfRuntimeBindingPass> {
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
Expand Down
49 changes: 49 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
//===- AMDGPUExpandPseudoIntrinsics.cpp - Pseudo Intrinsic Expander Pass --===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// This file implements a pass that deals with expanding AMDGCN generic pseudo-
// intrinsics into target specific quantities / sequences. In this context, a
// pseudo-intrinsic is an AMDGCN intrinsic that does not directly map to a
// specific instruction, but rather is intended as a mechanism for abstractly
// conveying target specific info to a HLL / the FE, without concretely
// impacting the AST. An example of such an intrinsic is amdgcn.wavefrontsize.
// This pass should run as early as possible / immediately after Clang CodeGen,
// so that the optimisation pipeline and the BE operate with concrete target
// data.
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "GCNSubtarget.h"

#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Module.h"
#include "llvm/Pass.h"

using namespace llvm;

/// Substitute the concrete wavefront size for the results of the wavefront
/// size intrinsic.
///
/// \param ST       Subtarget queried (via getWavefrontSize()) for the value to
///                 substitute.
/// \param WaveSize Declaration of the wavefront size intrinsic whose users are
///                 rewritten.
/// \returns PreservedAnalyses::all() when \p WaveSize has no live uses and
///          nothing is touched; PreservedAnalyses::none() once any user has
///          been rewritten.
static inline PreservedAnalyses expandWaveSizeIntrinsic(const GCNSubtarget &ST,
                                                        Function *WaveSize) {
  if (!WaveSize->hasZeroLiveUses()) {
    // Constants are uniqued, so a single lookup serves every user; the
    // constant has the intrinsic's own return type.
    auto *SizeValue =
        ConstantInt::get(WaveSize->getReturnType(), ST.getWavefrontSize());

    // Each user's own uses are redirected to the constant. The users
    // themselves are not erased here.
    for (auto *IntrinsicUser : WaveSize->users())
      IntrinsicUser->replaceAllUsesWith(SizeValue);

    return PreservedAnalyses::none();
  }

  return PreservedAnalyses::all();
}

PreservedAnalyses
AMDGPUExpandPseudoIntrinsicsPass::run(Module &M, ModuleAnalysisManager &) {

if (auto WS = M.getFunction("llvm.amdgcn.wavefrontsize"))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You can query by intrinsic ID now

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.

return expandWaveSizeIntrinsic(TM.getSubtarget<GCNSubtarget>(*WS), WS);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Subtarget is per function, cannot rely on getting the subtarget from an intrinsic declaration

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.


return PreservedAnalyses::all();
}
2 changes: 2 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@ MODULE_PASS("amdgpu-perf-hint",
*static_cast<const GCNTargetMachine *>(this)))
MODULE_PASS("amdgpu-printf-runtime-binding", AMDGPUPrintfRuntimeBindingPass())
MODULE_PASS("amdgpu-unify-metadata", AMDGPUUnifyMetadataPass())
MODULE_PASS("amdgpu-expand-pseudo-intrinsics",
AMDGPUExpandPseudoIntrinsicsPass(*this))
#undef MODULE_PASS

#ifndef MODULE_PASS_WITH_PARAMS
Expand Down
3 changes: 2 additions & 1 deletion llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -739,7 +739,8 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
#include "llvm/Passes/TargetPassRegistry.inc"

PB.registerPipelineStartEPCallback(
[](ModulePassManager &PM, OptimizationLevel Level) {
[this](ModulePassManager &PM, OptimizationLevel Level) {
PM.addPass(AMDGPUExpandPseudoIntrinsicsPass(*this));
FunctionPassManager FPM;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do we need to get it run via module pass first, and then function pass again?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That was a typo, mashed up merge, apologies.

PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
if (EnableHipStdPar)
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ add_llvm_target(AMDGPUCodeGen
AMDGPUCodeGenPrepare.cpp
AMDGPUCombinerHelper.cpp
AMDGPUCtorDtorLowering.cpp
AMDGPUExpandPseudoIntrinsics.cpp
AMDGPUExportClustering.cpp
AMDGPUFrameLowering.cpp
AMDGPUGlobalISelDivergenceLowering.cpp
Expand Down
99 changes: 73 additions & 26 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll
Original file line number Diff line number Diff line change
@@ -1,53 +1,84 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
AlexVlx marked this conversation as resolved.
Show resolved Hide resolved
; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,W64 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,W32 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,W64 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -verify-machineinstrs -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GCN,W32 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,W64 %s

; RUN: opt -O3 -S < %s | FileCheck -check-prefix=OPT %s
; RUN: opt -mtriple=amdgcn-- -O3 -S < %s | FileCheck -check-prefix=OPT %s
; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT %s
; RUN: opt -mtriple=amdgcn-- -passes='default<O3>' -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT %s
; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT %s
jhuber6 marked this conversation as resolved.
Show resolved Hide resolved
; RUN: opt -mtriple=amdgcn-- -mcpu=tonga -O3 -S < %s | FileCheck -check-prefix=OPT %s
; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT %s
; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT %s
; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT %s
; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT %s
; RUN: opt -mtriple=amdgcn-- -O3 -S < %s | FileCheck -check-prefix=OPT-W64 %s
; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT-W32 %s
; RUN: opt -mtriple=amdgcn-- -passes='default<O3>' -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT-W32 %s
; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT-W64 %s
; RUN: opt -mtriple=amdgcn-- -mcpu=tonga -O3 -S < %s | FileCheck -check-prefix=OPT-W64 %s
; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT-W32 %s
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This codegen test shouldn't be running all of these passes

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It already was, mostly? It seems worthwhile to individualise the possible / plausible scenarios.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Simplified.

; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT-W64 %s
; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT-W32 %s
; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT-W64 %s

; GCN-LABEL: {{^}}fold_wavefrontsize:
; OPT-LABEL: define amdgpu_kernel void @fold_wavefrontsize(

; W32: v_mov_b32_e32 [[V:v[0-9]+]], 32
; W64: v_mov_b32_e32 [[V:v[0-9]+]], 64
; GCN: store_{{dword|b32}} v{{.+}}, [[V]]

; OPT: %tmp = tail call i32 @llvm.amdgcn.wavefrontsize()
; OPT: store i32 %tmp, ptr addrspace(1) %arg, align 4
; OPT-NEXT: ret void

define amdgpu_kernel void @fold_wavefrontsize(ptr addrspace(1) nocapture %arg) {
; OPT-LABEL: define amdgpu_kernel void @fold_wavefrontsize(
; OPT-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
; OPT-NEXT: [[BB:.*:]]
; OPT-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.wavefrontsize() #[[ATTR2:[0-9]+]]
; OPT-NEXT: store i32 [[TMP]], ptr addrspace(1) [[ARG]], align 4
; OPT-NEXT: ret void
;
; OPT-W64-LABEL: define amdgpu_kernel void @fold_wavefrontsize(
; OPT-W64-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
; OPT-W64-NEXT: [[BB:.*:]]
; OPT-W64-NEXT: store i32 64, ptr addrspace(1) [[ARG]], align 4
; OPT-W64-NEXT: ret void
;
; OPT-W32-LABEL: define amdgpu_kernel void @fold_wavefrontsize(
; OPT-W32-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
; OPT-W32-NEXT: [[BB:.*:]]
; OPT-W32-NEXT: store i32 32, ptr addrspace(1) [[ARG]], align 4
; OPT-W32-NEXT: ret void
;
bb:
%tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0
store i32 %tmp, ptr addrspace(1) %arg, align 4
ret void
}

; GCN-LABEL: {{^}}fold_and_optimize_wavefrontsize:
; OPT-LABEL: define amdgpu_kernel void @fold_and_optimize_wavefrontsize(

; W32: v_mov_b32_e32 [[V:v[0-9]+]], 1{{$}}
; W64: v_mov_b32_e32 [[V:v[0-9]+]], 2{{$}}
; GCN-NOT: cndmask
; GCN: store_{{dword|b32}} v{{.+}}, [[V]]

; OPT: %tmp = tail call i32 @llvm.amdgcn.wavefrontsize()
; OPT: %tmp1 = icmp ugt i32 %tmp, 32
; OPT: %tmp2 = select i1 %tmp1, i32 2, i32 1
; OPT: store i32 %tmp2, ptr addrspace(1) %arg
; OPT-NEXT: ret void

define amdgpu_kernel void @fold_and_optimize_wavefrontsize(ptr addrspace(1) nocapture %arg) {
; OPT-LABEL: define amdgpu_kernel void @fold_and_optimize_wavefrontsize(
; OPT-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0]] {
; OPT-NEXT: [[BB:.*:]]
; OPT-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.wavefrontsize() #[[ATTR2]]
; OPT-NEXT: [[TMP1:%.*]] = icmp ugt i32 [[TMP]], 32
; OPT-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 2, i32 1
; OPT-NEXT: store i32 [[TMP2]], ptr addrspace(1) [[ARG]], align 4
; OPT-NEXT: ret void
;
; OPT-W64-LABEL: define amdgpu_kernel void @fold_and_optimize_wavefrontsize(
; OPT-W64-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0]] {
; OPT-W64-NEXT: [[BB:.*:]]
; OPT-W64-NEXT: store i32 2, ptr addrspace(1) [[ARG]], align 4
; OPT-W64-NEXT: ret void
;
; OPT-W32-LABEL: define amdgpu_kernel void @fold_and_optimize_wavefrontsize(
; OPT-W32-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0]] {
; OPT-W32-NEXT: [[BB:.*:]]
; OPT-W32-NEXT: store i32 1, ptr addrspace(1) [[ARG]], align 4
; OPT-W32-NEXT: ret void
;
bb:
%tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0
%tmp1 = icmp ugt i32 %tmp, 32
Expand All @@ -57,15 +88,31 @@ bb:
}

; GCN-LABEL: {{^}}fold_and_optimize_if_wavefrontsize:
; OPT-LABEL: define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(

; OPT: bb:
; OPT: %tmp = tail call i32 @llvm.amdgcn.wavefrontsize()
; OPT: %tmp1 = icmp ugt i32 %tmp, 32
; OPT: bb3:
; OPT-NEXT: ret void

define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(ptr addrspace(1) nocapture %arg) {
; OPT-LABEL: define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(
; OPT-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0]] {
; OPT-NEXT: [[BB:.*:]]
; OPT-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.wavefrontsize() #[[ATTR2]]
; OPT-NEXT: [[TMP1:%.*]] = icmp ugt i32 [[TMP]], 32
; OPT-NEXT: br i1 [[TMP1]], label %[[BB2:.*]], label %[[BB3:.*]]
; OPT: [[BB2]]:
; OPT-NEXT: store i32 1, ptr addrspace(1) [[ARG]], align 4
; OPT-NEXT: br label %[[BB3]]
; OPT: [[BB3]]:
; OPT-NEXT: ret void
;
; OPT-W64-LABEL: define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(
; OPT-W64-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0]] {
; OPT-W64-NEXT: [[BB:.*:]]
; OPT-W64-NEXT: store i32 1, ptr addrspace(1) [[ARG]], align 4
; OPT-W64-NEXT: ret void
;
; OPT-W32-LABEL: define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(
; OPT-W32-SAME: ptr addrspace(1) nocapture readnone [[ARG:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] {
; OPT-W32-NEXT: [[BB:.*:]]
; OPT-W32-NEXT: ret void
;
bb:
%tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0
%tmp1 = icmp ugt i32 %tmp, 32
Expand Down
Loading