Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

AMDGPU: Handle gfx950 valu write vdst + permlane read hazard #117287

Open
wants to merge 1 commit into
base: users/arsenm/gfx950/vcmpx-permlane-hazard
Choose a base branch
from

Conversation

arsenm
Copy link
Contributor

@arsenm arsenm commented Nov 22, 2024

No description provided.

This was referenced Nov 22, 2024
Copy link
Contributor Author

arsenm commented Nov 22, 2024

Warning

This pull request is not mergeable via GitHub because a downstack PR is open. Once all requirements are satisfied, merge this PR as a stack on Graphite.
Learn more

This stack of pull requests is managed by Graphite. Learn more about stacking.

@arsenm arsenm marked this pull request as ready for review November 22, 2024 04:45
@llvmbot
Copy link
Member

llvmbot commented Nov 22, 2024

@llvm/pr-subscribers-backend-amdgpu

Author: Matt Arsenault (arsenm)

Changes

Full diff: https://github.com/llvm/llvm-project/pull/117287.diff

4 Files Affected:

  • (modified) llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp (+28-2)
  • (modified) llvm/test/CodeGen/AMDGPU/hazards-gfx950.mir (+113)
  • (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.swap.ll (+6)
  • (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane32.swap.ll (+6)
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index 45ff1f4a63cf03..19651f2557b46d 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -2552,8 +2552,34 @@ int GCNHazardRecognizer::checkPermlaneHazards(MachineInstr *MI) {
     return isVCmpXWritesExec(*TII, *TRI, MI);
   };
 
-  const int NumWaitStates = 4;
-  return NumWaitStates - getWaitStatesSince(IsVCmpXWritesExecFn, NumWaitStates);
+  auto IsVALUFn = [](const MachineInstr &MI) {
+    return SIInstrInfo::isVALU(MI);
+  };
+
+  const int VCmpXWritesExecWaitStates = 4;
+  const int VALUWritesVDstWaitStates = 2;
+  int WaitStatesNeeded = 0;
+
+  for (const MachineOperand &Op : MI->explicit_uses()) {
+    if (!Op.isReg() || !TRI->isVGPR(MF.getRegInfo(), Op.getReg()))
+      continue;
+    Register Reg = Op.getReg();
+
+    int WaitStatesSinceDef =
+        VALUWritesVDstWaitStates -
+        getWaitStatesSinceDef(Reg, IsVALUFn,
+                              /*MaxWaitStates=*/VALUWritesVDstWaitStates);
+    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesSinceDef);
+    if (WaitStatesNeeded >= VALUWritesVDstWaitStates)
+      break;
+  }
+
+  int VCmpXHazardWaits =
+      VCmpXWritesExecWaitStates -
+      getWaitStatesSince(IsVCmpXWritesExecFn, VCmpXWritesExecWaitStates);
+
+  WaitStatesNeeded = std::max(WaitStatesNeeded, VCmpXHazardWaits);
+  return WaitStatesNeeded;
 }
 
 static int GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(int NumPasses) {
diff --git a/llvm/test/CodeGen/AMDGPU/hazards-gfx950.mir b/llvm/test/CodeGen/AMDGPU/hazards-gfx950.mir
index 97bef7be711ff2..75834316750951 100644
--- a/llvm/test/CodeGen/AMDGPU/hazards-gfx950.mir
+++ b/llvm/test/CodeGen/AMDGPU/hazards-gfx950.mir
@@ -142,3 +142,116 @@ body:             |
     $vgpr4 = V_MOV_B32_e32 0, implicit $exec
     renamable $vgpr0, renamable $vgpr1 = V_PERMLANE16_SWAP_B32_e32 killed $vgpr0, killed $vgpr1, implicit $exec
 ...
+
+---
+# GCN-LABEL: name: valu_write_vdst_read_permlane16_swap_0
+# GCN:      V_MOV_B32
+# GCN-NEXT: S_NOP 1
+# GCN-NEXT: V_PERMLANE
+name:            valu_write_vdst_read_permlane16_swap_0
+body:             |
+  bb.0:
+    liveins: $vgpr1
+    $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+    renamable $vgpr0, renamable $vgpr1 = V_PERMLANE16_SWAP_B32_e64 killed $vgpr0, killed $vgpr1, -1, 1, implicit $exec
+...
+
+---
+# GCN-LABEL: name: valu_write_vdst_read_permlane16_swap_1
+# GCN:      V_MOV_B32
+# GCN-NEXT: S_NOP 1
+# GCN-NEXT: V_PERMLANE
+name:            valu_write_vdst_read_permlane16_swap_1
+body:             |
+  bb.0:
+    liveins: $vgpr0
+    $vgpr1 = V_MOV_B32_e32 0, implicit $exec
+    renamable $vgpr0, renamable $vgpr1 = V_PERMLANE16_SWAP_B32_e64 killed $vgpr0, killed $vgpr1, -1, 1, implicit $exec
+...
+
+---
+# GCN-LABEL: name: valu_write_vdst_read_permlane32_swap_0
+# GCN:      V_MOV_B32
+# GCN-NEXT: S_NOP 1
+# GCN-NEXT: V_PERMLANE
+name:            valu_write_vdst_read_permlane32_swap_0
+body:             |
+  bb.0:
+    liveins: $vgpr1
+    $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+    renamable $vgpr0, renamable $vgpr1 = V_PERMLANE32_SWAP_B32_e64 killed $vgpr0, killed $vgpr1, -1, 1, implicit $exec
+...
+
+---
+# GCN-LABEL: name: valu_write_vdst_read_permlane32_swap_1
+# GCN:      V_MOV_B32
+# GCN-NEXT: S_NOP 1
+# GCN-NEXT: V_PERMLANE
+name:            valu_write_vdst_read_permlane32_swap_1
+body:             |
+  bb.0:
+    liveins: $vgpr0
+    $vgpr1 = V_MOV_B32_e32 0, implicit $exec
+    renamable $vgpr0, renamable $vgpr1 = V_PERMLANE32_SWAP_B32_e64 killed $vgpr0, killed $vgpr1, -1, 1, implicit $exec
+...
+
+---
+# No hazard: the VALU writes a register that the permlane does not read.
+# GCN-LABEL: name: valu_write_vdst_read_permlane16_swap_0_otherreg
+# GCN:      V_MOV_B32
+# GCN-NEXT: V_PERMLANE
+name:            valu_write_vdst_read_permlane16_swap_0_otherreg
+body:             |
+  bb.0:
+    liveins: $vgpr1
+    $vgpr2 = V_MOV_B32_e32 0, implicit $exec
+    renamable $vgpr0, renamable $vgpr1 = V_PERMLANE16_SWAP_B32_e64 killed $vgpr0, killed $vgpr1, -1, 1, implicit $exec
+...
+
+---
+# Both permlane hazards at once: VALU write of vdst and V_CMPX write of exec.
+# GCN-LABEL: name: valu_writes_vdst__vcmpx_write_exec__permlane32_swap
+# GCN:      V_MOV_B32
+# GCN:      V_CMPX_EQ_I32
+# GCN-NEXT: S_NOP 3
+# GCN-NEXT: V_PERMLANE
+name:            valu_writes_vdst__vcmpx_write_exec__permlane32_swap
+body:             |
+  bb.0:
+    liveins: $vgpr0, $vgpr2, $vgpr3
+    $vgpr1 = V_MOV_B32_e32 0, implicit $exec
+    $exec = V_CMPX_EQ_I32_e64 $vgpr2, $vgpr3, implicit $exec
+    renamable $vgpr0, renamable $vgpr1 = V_PERMLANE32_SWAP_B32_e32 killed $vgpr0, killed $vgpr1, implicit $exec
+...
+
+---
+# GCN-LABEL: name: vcmpx_write_exec__valu_writes_vdst___permlane32_swap
+# GCN:      V_CMPX_EQ_I32
+# GCN:      V_MOV_B32
+# GCN-NEXT: S_NOP 2
+# GCN-NEXT: V_PERMLANE
+name:            vcmpx_write_exec__valu_writes_vdst___permlane32_swap
+body:             |
+  bb.0:
+    liveins: $vgpr0, $vgpr2, $vgpr3
+    $exec = V_CMPX_EQ_I32_e64 $vgpr2, $vgpr3, implicit $exec
+    $vgpr1 = V_MOV_B32_e32 0, implicit $exec
+    renamable $vgpr0, renamable $vgpr1 = V_PERMLANE32_SWAP_B32_e32 killed $vgpr0, killed $vgpr1, implicit $exec
+...
+
+---
+# GCN-LABEL: name: vcmpx_write_exec__valu_writes_vdstx2___permlane32_swap
+# GCN:      V_CMPX_EQ_I32
+# GCN:      V_MOV_B32
+# GCN:      V_MOV_B32
+# GCN-NEXT: S_NOP 1
+# GCN-NEXT: V_PERMLANE
+name:            vcmpx_write_exec__valu_writes_vdstx2___permlane32_swap
+body:             |
+  bb.0:
+    liveins: $vgpr0, $vgpr2, $vgpr3
+    $exec = V_CMPX_EQ_I32_e64 $vgpr2, $vgpr3, implicit $exec
+    $vgpr1 = V_MOV_B32_e32 0, implicit $exec
+    $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+    renamable $vgpr0, renamable $vgpr1 = V_PERMLANE32_SWAP_B32_e32 killed $vgpr0, killed $vgpr1, implicit $exec
+...
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.swap.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.swap.ll
index 0d5dfa46c2c260..e1cebe28f7fe8a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.swap.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.swap.ll
@@ -26,6 +26,7 @@ define { i32, i32 } @v_permlane16_swap_b32_vi(i32 %vdst_old) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v1, 1
+; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_permlane16_swap_b32_e32 v0, v1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %v = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 %vdst_old, i32 1, i1 false, i1 false)
@@ -37,6 +38,7 @@ define { i32, i32 } @v_permlane16_swap_b32_vl(i32 %vdst_old) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0xc1d1
+; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_permlane16_swap_b32_e32 v0, v1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %v = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 %vdst_old, i32 49617, i1 false, i1 false)
@@ -49,6 +51,7 @@ define { i32, i32 } @v_permlane16_swap_b32_iv(i32 %src0_old) {
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v1, v0
 ; GCN-NEXT:    v_mov_b32_e32 v0, 1
+; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_permlane16_swap_b32_e32 v0, v1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %v = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 1, i32 %src0_old, i1 false, i1 false)
@@ -61,6 +64,7 @@ define { i32, i32 } @v_permlane16_swap_b32_ss(i32 inreg %vdst_old, i32 inreg %sr
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NEXT:    v_mov_b32_e32 v1, s1
+; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_permlane16_swap_b32_e32 v0, v1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %v = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 %vdst_old, i32 %src0_old, i1 false, i1 false)
@@ -73,6 +77,7 @@ define { i32, i32 } @v_permlane16_swap_b32_sv(i32 inreg %vdst_old, i32 %src0_old
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v1, v0
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_permlane16_swap_b32_e32 v0, v1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %v = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 %vdst_old, i32 %src0_old, i1 false, i1 false)
@@ -84,6 +89,7 @@ define { i32, i32 } @v_permlane16_swap_b32_vs(i32 %vdst_old, i32 inreg %src0_old
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v1, s0
+; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_permlane16_swap_b32_e32 v0, v1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %v = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 %vdst_old, i32 %src0_old, i1 false, i1 false)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane32.swap.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane32.swap.ll
index e3b0879af4307d..121c379053fcf7 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane32.swap.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane32.swap.ll
@@ -26,6 +26,7 @@ define { i32, i32 } @v_permlane32_swap_b32_vi(i32 %vdst_old) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v1, 1
+; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_permlane32_swap_b32_e32 v0, v1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %v = call { i32, i32 } @llvm.amdgcn.permlane32.swap(i32 %vdst_old, i32 1, i1 false, i1 false)
@@ -37,6 +38,7 @@ define { i32, i32 } @v_permlane32_swap_b32_vl(i32 %vdst_old) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0xc1d1
+; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_permlane32_swap_b32_e32 v0, v1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %v = call { i32, i32 } @llvm.amdgcn.permlane32.swap(i32 %vdst_old, i32 49617, i1 false, i1 false)
@@ -49,6 +51,7 @@ define { i32, i32 } @v_permlane32_swap_b32_iv(i32 %src0_old) {
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v1, v0
 ; GCN-NEXT:    v_mov_b32_e32 v0, 1
+; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_permlane32_swap_b32_e32 v0, v1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %v = call { i32, i32 } @llvm.amdgcn.permlane32.swap(i32 1, i32 %src0_old, i1 false, i1 false)
@@ -61,6 +64,7 @@ define { i32, i32 } @v_permlane32_swap_b32_ss(i32 inreg %vdst_old, i32 inreg %sr
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NEXT:    v_mov_b32_e32 v1, s1
+; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_permlane32_swap_b32_e32 v0, v1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %v = call { i32, i32 } @llvm.amdgcn.permlane32.swap(i32 %vdst_old, i32 %src0_old, i1 false, i1 false)
@@ -73,6 +77,7 @@ define { i32, i32 } @v_permlane32_swap_b32_sv(i32 inreg %vdst_old, i32 %src0_old
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v1, v0
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_permlane32_swap_b32_e32 v0, v1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %v = call { i32, i32 } @llvm.amdgcn.permlane32.swap(i32 %vdst_old, i32 %src0_old, i1 false, i1 false)
@@ -84,6 +89,7 @@ define { i32, i32 } @v_permlane32_swap_b32_vs(i32 %vdst_old, i32 inreg %src0_old
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v1, s0
+; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_permlane32_swap_b32_e32 v0, v1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %v = call { i32, i32 } @llvm.amdgcn.permlane32.swap(i32 %vdst_old, i32 %src0_old, i1 false, i1 false)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging this pull request may close these issues.

2 participants