Skip to content

Commit

Permalink
AMDGPU: Handle gfx950 valu write vdst + permlane read hazard
Browse files Browse the repository at this point in the history
  • Loading branch information
arsenm committed Nov 22, 2024
1 parent 0cbee40 commit 5f59ebe
Show file tree
Hide file tree
Showing 4 changed files with 153 additions and 2 deletions.
30 changes: 28 additions & 2 deletions llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2552,8 +2552,34 @@ int GCNHazardRecognizer::checkPermlaneHazards(MachineInstr *MI) {
return isVCmpXWritesExec(*TII, *TRI, MI);
};

const int NumWaitStates = 4;
return NumWaitStates - getWaitStatesSince(IsVCmpXWritesExecFn, NumWaitStates);
auto IsVALUFn = [](const MachineInstr &MI) {
return SIInstrInfo::isVALU(MI);
};

const int VCmpXWritesExecWaitStates = 4;
const int VALUWritesVDstWaitStates = 2;
int WaitStatesNeeded = 0;

for (const MachineOperand &Op : MI->explicit_uses()) {
if (!Op.isReg() || !TRI->isVGPR(MF.getRegInfo(), Op.getReg()))
continue;
Register Reg = Op.getReg();

int WaitStatesSinceDef =
VALUWritesVDstWaitStates -
getWaitStatesSinceDef(Reg, IsVALUFn,
/*MaxWaitStates=*/VALUWritesVDstWaitStates);
WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesSinceDef);
if (WaitStatesNeeded >= VALUWritesVDstWaitStates)
break;
}

int VCmpXHazardWaits =
VCmpXWritesExecWaitStates -
getWaitStatesSince(IsVCmpXWritesExecFn, VCmpXWritesExecWaitStates);

WaitStatesNeeded = std::max(WaitStatesNeeded, VCmpXHazardWaits);
return WaitStatesNeeded;
}

static int GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(int NumPasses) {
Expand Down
113 changes: 113 additions & 0 deletions llvm/test/CodeGen/AMDGPU/hazards-gfx950.mir
Original file line number Diff line number Diff line change
Expand Up @@ -142,3 +142,116 @@ body: |
$vgpr4 = V_MOV_B32_e32 0, implicit $exec
renamable $vgpr0, renamable $vgpr1 = V_PERMLANE16_SWAP_B32_e32 killed $vgpr0, killed $vgpr1, implicit $exec
...

---
# A V_MOV defines $vgpr0 immediately before a permlane16 swap (e64 form) that
# reads $vgpr0 as its first register source. The check lines below expect an
# S_NOP 1 to be placed between the two instructions.
# GCN-LABEL: name: valu_write_vdst_read_permlane16_swap_0
# GCN: V_MOV_B32
# GCN-NEXT: S_NOP 1
# GCN-NEXT: V_PERMLANE
name: valu_write_vdst_read_permlane16_swap_0
body: |
bb.0:
liveins: $vgpr1
$vgpr0 = V_MOV_B32_e32 0, implicit $exec
renamable $vgpr0, renamable $vgpr1 = V_PERMLANE16_SWAP_B32_e64 killed $vgpr0, killed $vgpr1, -1, 1, implicit $exec
...

---
# A V_MOV defines $vgpr1 immediately before a permlane16 swap (e64 form) that
# reads $vgpr1 as its second register source. The check lines below expect an
# S_NOP 1 to be placed between the two instructions.
# GCN-LABEL: name: valu_write_vdst_read_permlane16_swap_1
# GCN: V_MOV_B32
# GCN-NEXT: S_NOP 1
# GCN-NEXT: V_PERMLANE
name: valu_write_vdst_read_permlane16_swap_1
body: |
bb.0:
liveins: $vgpr0
$vgpr1 = V_MOV_B32_e32 0, implicit $exec
renamable $vgpr0, renamable $vgpr1 = V_PERMLANE16_SWAP_B32_e64 killed $vgpr0, killed $vgpr1, -1, 1, implicit $exec
...

---
# A V_MOV defines $vgpr0 immediately before a permlane32 swap (e64 form) that
# reads $vgpr0 as its first register source. The check lines below expect an
# S_NOP 1 to be placed between the two instructions.
# GCN-LABEL: name: valu_write_vdst_read_permlane32_swap_0
# GCN: V_MOV_B32
# GCN-NEXT: S_NOP 1
# GCN-NEXT: V_PERMLANE
name: valu_write_vdst_read_permlane32_swap_0
body: |
bb.0:
liveins: $vgpr1
$vgpr0 = V_MOV_B32_e32 0, implicit $exec
renamable $vgpr0, renamable $vgpr1 = V_PERMLANE32_SWAP_B32_e64 killed $vgpr0, killed $vgpr1, -1, 1, implicit $exec
...

---
# A V_MOV defines $vgpr1 immediately before a permlane32 swap (e64 form) that
# reads $vgpr1 as its second register source. The check lines below expect an
# S_NOP 1 to be placed between the two instructions.
# GCN-LABEL: name: valu_write_vdst_read_permlane32_swap_1
# GCN: V_MOV_B32
# GCN-NEXT: S_NOP 1
# GCN-NEXT: V_PERMLANE
name: valu_write_vdst_read_permlane32_swap_1
body: |
bb.0:
liveins: $vgpr0
$vgpr1 = V_MOV_B32_e32 0, implicit $exec
renamable $vgpr0, renamable $vgpr1 = V_PERMLANE32_SWAP_B32_e64 killed $vgpr0, killed $vgpr1, -1, 1, implicit $exec
...

---
# No hazard, write of other register
# The V_MOV defines $vgpr2, which the permlane16 swap does not read (its
# register sources are $vgpr0 and $vgpr1). The check lines below expect the
# permlane instruction directly after the V_MOV with no S_NOP in between.
# GCN-LABEL: name: valu_write_vdst_read_permlane16_swap_0_otherreg
# GCN: V_MOV_B32
# GCN-NEXT: V_PERMLANE
name: valu_write_vdst_read_permlane16_swap_0_otherreg
body: |
bb.0:
liveins: $vgpr1
$vgpr2 = V_MOV_B32_e32 0, implicit $exec
renamable $vgpr0, renamable $vgpr1 = V_PERMLANE16_SWAP_B32_e64 killed $vgpr0, killed $vgpr1, -1, 1, implicit $exec
...

---
# Both permlane hazards at once.
# Order: V_MOV defines $vgpr1 (read by the permlane32 swap), then V_CMPX
# writes $exec, then the permlane32 swap (e32 form). The check lines below
# expect a single S_NOP 3 directly after the V_CMPX and before the permlane.
# NOTE(review): presumably the V_CMPX-writes-exec requirement is the larger of
# the two here and therefore determines the nop count - confirm against
# GCNHazardRecognizer::checkPermlaneHazards.
# GCN-LABEL: name: valu_writes_vdst__vcmpx_write_exec__permlane32_swap
# GCN: V_MOV_B32
# GCN: V_CMPX_EQ_I32
# GCN-NEXT: S_NOP 3
# GCN-NEXT: V_PERMLANE
name: valu_writes_vdst__vcmpx_write_exec__permlane32_swap
body: |
bb.0:
liveins: $vgpr0, $vgpr2, $vgpr3
$vgpr1 = V_MOV_B32_e32 0, implicit $exec
$exec = V_CMPX_EQ_I32_e64 $vgpr2, $vgpr3, implicit $exec
renamable $vgpr0, renamable $vgpr1 = V_PERMLANE32_SWAP_B32_e32 killed $vgpr0, killed $vgpr1, implicit $exec
...

---
# Both hazards again, in the opposite order: V_CMPX writes $exec first, then
# V_MOV defines $vgpr1 (read by the permlane32 swap), then the permlane32 swap
# (e32 form). The check lines below expect an S_NOP 2 directly after the V_MOV
# and before the permlane.
# NOTE(review): presumably the V_MOV itself counts toward the distance from
# the V_CMPX, which is why fewer nops are expected than when V_CMPX is the
# instruction right before the permlane - confirm against
# GCNHazardRecognizer::checkPermlaneHazards.
# GCN-LABEL: name: vcmpx_write_exec__valu_writes_vdst___permlane32_swap
# GCN: V_CMPX_EQ_I32
# GCN: V_MOV_B32
# GCN-NEXT: S_NOP 2
# GCN-NEXT: V_PERMLANE
name: vcmpx_write_exec__valu_writes_vdst___permlane32_swap
body: |
bb.0:
liveins: $vgpr0, $vgpr2, $vgpr3
$exec = V_CMPX_EQ_I32_e64 $vgpr2, $vgpr3, implicit $exec
$vgpr1 = V_MOV_B32_e32 0, implicit $exec
renamable $vgpr0, renamable $vgpr1 = V_PERMLANE32_SWAP_B32_e32 killed $vgpr0, killed $vgpr1, implicit $exec
...

---
# V_CMPX writes $exec, followed by two V_MOVs that define $vgpr1 and then
# $vgpr0 (both read by the permlane32 swap), followed by the permlane32 swap
# (e32 form). The check lines below expect an S_NOP 1 directly after the
# second V_MOV and before the permlane.
# NOTE(review): presumably both V_MOVs count toward the distance from the
# V_CMPX, leaving the same nop count as a lone VALU write of a source
# register - confirm against GCNHazardRecognizer::checkPermlaneHazards.
# GCN-LABEL: name: vcmpx_write_exec__valu_writes_vdstx2___permlane32_swap
# GCN: V_CMPX_EQ_I32
# GCN: V_MOV_B32
# GCN: V_MOV_B32
# GCN-NEXT: S_NOP 1
# GCN-NEXT: V_PERMLANE
name: vcmpx_write_exec__valu_writes_vdstx2___permlane32_swap
body: |
bb.0:
liveins: $vgpr0, $vgpr2, $vgpr3
$exec = V_CMPX_EQ_I32_e64 $vgpr2, $vgpr3, implicit $exec
$vgpr1 = V_MOV_B32_e32 0, implicit $exec
$vgpr0 = V_MOV_B32_e32 0, implicit $exec
renamable $vgpr0, renamable $vgpr1 = V_PERMLANE32_SWAP_B32_e32 killed $vgpr0, killed $vgpr1, implicit $exec
...
6 changes: 6 additions & 0 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.swap.ll
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ define { i32, i32 } @v_permlane16_swap_b32_vi(i32 %vdst_old) {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v1, 1
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_permlane16_swap_b32_e32 v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
%v = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 %vdst_old, i32 1, i1 false, i1 false)
Expand All @@ -37,6 +38,7 @@ define { i32, i32 } @v_permlane16_swap_b32_vl(i32 %vdst_old) {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v1, 0xc1d1
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_permlane16_swap_b32_e32 v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
%v = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 %vdst_old, i32 49617, i1 false, i1 false)
Expand All @@ -49,6 +51,7 @@ define { i32, i32 } @v_permlane16_swap_b32_iv(i32 %src0_old) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v1, v0
; GCN-NEXT: v_mov_b32_e32 v0, 1
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_permlane16_swap_b32_e32 v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
%v = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 1, i32 %src0_old, i1 false, i1 false)
Expand All @@ -61,6 +64,7 @@ define { i32, i32 } @v_permlane16_swap_b32_ss(i32 inreg %vdst_old, i32 inreg %sr
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_permlane16_swap_b32_e32 v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
%v = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 %vdst_old, i32 %src0_old, i1 false, i1 false)
Expand All @@ -73,6 +77,7 @@ define { i32, i32 } @v_permlane16_swap_b32_sv(i32 inreg %vdst_old, i32 %src0_old
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v1, v0
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_permlane16_swap_b32_e32 v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
%v = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 %vdst_old, i32 %src0_old, i1 false, i1 false)
Expand All @@ -84,6 +89,7 @@ define { i32, i32 } @v_permlane16_swap_b32_vs(i32 %vdst_old, i32 inreg %src0_old
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v1, s0
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_permlane16_swap_b32_e32 v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
%v = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 %vdst_old, i32 %src0_old, i1 false, i1 false)
Expand Down
6 changes: 6 additions & 0 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane32.swap.ll
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ define { i32, i32 } @v_permlane32_swap_b32_vi(i32 %vdst_old) {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v1, 1
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_permlane32_swap_b32_e32 v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
%v = call { i32, i32 } @llvm.amdgcn.permlane32.swap(i32 %vdst_old, i32 1, i1 false, i1 false)
Expand All @@ -37,6 +38,7 @@ define { i32, i32 } @v_permlane32_swap_b32_vl(i32 %vdst_old) {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v1, 0xc1d1
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_permlane32_swap_b32_e32 v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
%v = call { i32, i32 } @llvm.amdgcn.permlane32.swap(i32 %vdst_old, i32 49617, i1 false, i1 false)
Expand All @@ -49,6 +51,7 @@ define { i32, i32 } @v_permlane32_swap_b32_iv(i32 %src0_old) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v1, v0
; GCN-NEXT: v_mov_b32_e32 v0, 1
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_permlane32_swap_b32_e32 v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
%v = call { i32, i32 } @llvm.amdgcn.permlane32.swap(i32 1, i32 %src0_old, i1 false, i1 false)
Expand All @@ -61,6 +64,7 @@ define { i32, i32 } @v_permlane32_swap_b32_ss(i32 inreg %vdst_old, i32 inreg %sr
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_permlane32_swap_b32_e32 v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
%v = call { i32, i32 } @llvm.amdgcn.permlane32.swap(i32 %vdst_old, i32 %src0_old, i1 false, i1 false)
Expand All @@ -73,6 +77,7 @@ define { i32, i32 } @v_permlane32_swap_b32_sv(i32 inreg %vdst_old, i32 %src0_old
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v1, v0
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_permlane32_swap_b32_e32 v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
%v = call { i32, i32 } @llvm.amdgcn.permlane32.swap(i32 %vdst_old, i32 %src0_old, i1 false, i1 false)
Expand All @@ -84,6 +89,7 @@ define { i32, i32 } @v_permlane32_swap_b32_vs(i32 %vdst_old, i32 inreg %src0_old
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v1, s0
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_permlane32_swap_b32_e32 v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
%v = call { i32, i32 } @llvm.amdgcn.permlane32.swap(i32 %vdst_old, i32 %src0_old, i1 false, i1 false)
Expand Down

0 comments on commit 5f59ebe

Please sign in to comment.