From 91052169960477fbc39169c10f9fae3bec732510 Mon Sep 17 00:00:00 2001 From: Carl Ritson Date: Wed, 17 Jul 2024 15:07:42 +0900 Subject: [PATCH 1/3] [AMDGPU] Implement workaround for GFX11.5 export priority On GFX11.5, shaders that have completed their exports need to execute/wait at a lower priority than shaders that are still executing exports. Add code to maintain the normal priority of 2 for shaders that export, and to drop to priority 0 after exports. --- llvm/lib/Target/AMDGPU/AMDGPU.td | 15 +- .../lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 112 ++++++ llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h | 1 + llvm/lib/Target/AMDGPU/GCNSubtarget.h | 3 + .../AMDGPU/required-export-priority.ll | 344 ++++++++++++++++++ .../AMDGPU/required-export-priority.mir | 293 +++++++++++++++ 6 files changed, 765 insertions(+), 3 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/required-export-priority.ll create mode 100644 llvm/test/CodeGen/AMDGPU/required-export-priority.mir diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index dfc8eaea66f7b..14fcf6a210a78 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -947,6 +947,12 @@ def FeatureHasRestrictedSOffset : SubtargetFeature<"restricted-soffset", "Has restricted SOffset (immediate not supported)." 
>; +def FeatureRequiredExportPriority : SubtargetFeature<"required-export-priority", + "HasRequiredExportPriority", + "true", + "Export priority must be explicitly manipulated on GFX11.5" +>; + //===------------------------------------------------------------===// // Subtarget Features (options and debugging) //===------------------------------------------------------------===// @@ -1597,14 +1603,16 @@ def FeatureISAVersion11_5_0 : FeatureSet< !listconcat(FeatureISAVersion11_Common.Features, [FeatureSALUFloatInsts, FeatureDPPSrc1SGPR, - FeatureVGPRSingleUseHintInsts])>; + FeatureVGPRSingleUseHintInsts, + FeatureRequiredExportPriority])>; def FeatureISAVersion11_5_1 : FeatureSet< !listconcat(FeatureISAVersion11_Common.Features, [FeatureSALUFloatInsts, FeatureDPPSrc1SGPR, FeatureVGPRSingleUseHintInsts, - FeatureGFX11FullVGPRs])>; + FeatureGFX11FullVGPRs, + FeatureRequiredExportPriority])>; def FeatureISAVersion12 : FeatureSet< [FeatureGFX12, diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index a402fc6d7e611..a8b171aa82840 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -14,6 +14,7 @@ #include "GCNSubtarget.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "SIMachineFunctionInfo.h" +#include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/ScheduleDAG.h" #include "llvm/TargetParser/TargetParser.h" @@ -1104,6 +1105,7 @@ void GCNHazardRecognizer::fixHazards(MachineInstr *MI) { fixWMMAHazards(MI); fixShift64HighRegBug(MI); fixVALUMaskWriteHazard(MI); + fixRequiredExportPriority(MI); } bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) { @@ -2895,3 +2897,113 @@ bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) { return true; } + +static bool ensureEntrySetPrio(MachineFunction *MF, int Priority, + const SIInstrInfo &TII) { + MachineBasicBlock &EntryMBB = 
MF->front(); + if (EntryMBB.begin() != EntryMBB.end()) { + auto &EntryMI = *EntryMBB.begin(); + if (EntryMI.getOpcode() == AMDGPU::S_SETPRIO && + EntryMI.getOperand(0).getImm() >= Priority) + return false; + } + + BuildMI(EntryMBB, EntryMBB.begin(), DebugLoc(), TII.get(AMDGPU::S_SETPRIO)) + .addImm(Priority); + return true; +} + +bool GCNHazardRecognizer::fixRequiredExportPriority(MachineInstr *MI) { + if (!ST.hasRequiredExportPriority()) + return false; + + // Assume the following shader types will never have exports, + // and avoid adding or adjusting S_SETPRIO. + MachineBasicBlock *MBB = MI->getParent(); + MachineFunction *MF = MBB->getParent(); + auto CC = MF->getFunction().getCallingConv(); + switch (CC) { + case CallingConv::AMDGPU_CS: + case CallingConv::AMDGPU_CS_Chain: + case CallingConv::AMDGPU_CS_ChainPreserve: + case CallingConv::AMDGPU_KERNEL: + return false; + default: + break; + } + + const int MaxPriority = 3; + const int NormalPriority = 2; + const int PostExportPriority = 0; + + auto It = MI->getIterator(); + switch (MI->getOpcode()) { + case AMDGPU::S_ENDPGM: + case AMDGPU::S_ENDPGM_SAVED: + case AMDGPU::S_ENDPGM_ORDERED_PS_DONE: + case AMDGPU::SI_RETURN_TO_EPILOG: + // Ensure shader with calls raises priority at entry. + // This ensures correct priority if exports exist in callee. + if (MF->getFrameInfo().hasCalls()) + return ensureEntrySetPrio(MF, NormalPriority, TII); + return false; + case AMDGPU::S_SETPRIO: { + // Raise minimum priority unless in workaround. + auto &PrioOp = MI->getOperand(0); + int Prio = PrioOp.getImm(); + bool InWA = (Prio == PostExportPriority) && + (It != MBB->begin() && TII.isEXP(*std::prev(It))); + if (InWA || Prio >= NormalPriority) + return false; + PrioOp.setImm(std::min(Prio + NormalPriority, MaxPriority)); + return true; + } + default: + if (!TII.isEXP(*MI)) + return false; + break; + } + + // Check entry priority at each export (as there will only be a few). 
+ // Note: amdgpu_gfx can only be a callee, so defer to caller setprio. + bool Changed = false; + if (CC != CallingConv::AMDGPU_Gfx) + Changed = ensureEntrySetPrio(MF, NormalPriority, TII); + + auto NextMI = std::next(It); + bool EndOfShader = false; + if (NextMI != MBB->end()) { + // Only need WA at end of sequence of exports. + if (TII.isEXP(*NextMI)) + return Changed; + // Assume appropriate S_SETPRIO after export means WA already applied. + if (NextMI->getOpcode() == AMDGPU::S_SETPRIO && + NextMI->getOperand(0).getImm() == PostExportPriority) + return Changed; + EndOfShader = NextMI->getOpcode() == AMDGPU::S_ENDPGM; + } + + const DebugLoc &DL = MI->getDebugLoc(); + + // Lower priority. + BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_SETPRIO)) + .addImm(PostExportPriority); + + if (!EndOfShader) { + // Wait for exports to complete. + BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_WAITCNT_EXPCNT)) + .addReg(AMDGPU::SGPR_NULL) + .addImm(0); + } + + BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_NOP)).addImm(0); + BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_NOP)).addImm(0); + + if (!EndOfShader) { + // Return to normal (higher) priority. 
+ BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_SETPRIO)) + .addImm(NormalPriority); + } + + return true; +} diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h index 3ccca527c626b..f2a64ab48e180 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h @@ -107,6 +107,7 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer { bool fixWMMAHazards(MachineInstr *MI); bool fixShift64HighRegBug(MachineInstr *MI); bool fixVALUMaskWriteHazard(MachineInstr *MI); + bool fixRequiredExportPriority(MachineInstr *MI); int checkMAIHazards(MachineInstr *MI); int checkMAIHazards908(MachineInstr *MI); diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index e5817594a4521..def89c785b855 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -238,6 +238,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool HasVOPDInsts = false; bool HasVALUTransUseHazard = false; bool HasForceStoreSC0SC1 = false; + bool HasRequiredExportPriority = false; // Dummy feature to use for assembler in tablegen. bool FeatureDisable = false; @@ -1282,6 +1283,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool hasRestrictedSOffset() const { return HasRestrictedSOffset; } + bool hasRequiredExportPriority() const { return HasRequiredExportPriority; } + /// \returns true if the target uses LOADcnt/SAMPLEcnt/BVHcnt, DScnt/KMcnt /// and STOREcnt rather than VMcnt, LGKMcnt and VScnt respectively. 
bool hasExtendedWaitCounts() const { return getGeneration() >= GFX12; } diff --git a/llvm/test/CodeGen/AMDGPU/required-export-priority.ll b/llvm/test/CodeGen/AMDGPU/required-export-priority.ll new file mode 100644 index 0000000000000..377902f3f0d1a --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/required-export-priority.ll @@ -0,0 +1,344 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s + +define amdgpu_ps void @test_export_zeroes_f32() #0 { +; GCN-LABEL: test_export_zeroes_f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_setprio 2 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: exp mrt0 off, off, off, off +; GCN-NEXT: exp mrt0 off, off, off, off done +; GCN-NEXT: s_setprio 0 +; GCN-NEXT: s_nop 0 +; GCN-NEXT: s_nop 0 +; GCN-NEXT: s_endpgm + call void @llvm.amdgcn.exp.f32(i32 0, i32 0, float 0.0, float 0.0, float 0.0, float 0.0, i1 false, i1 false) + call void @llvm.amdgcn.exp.f32(i32 0, i32 0, float 0.0, float 0.0, float 0.0, float 0.0, i1 true, i1 false) + ret void +} + +define amdgpu_ps void @test_export_en_src0_f32() #0 { +; GCN-LABEL: test_export_en_src0_f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_setprio 2 +; GCN-NEXT: v_mov_b32_e32 v0, 4.0 +; GCN-NEXT: v_mov_b32_e32 v1, 0.5 +; GCN-NEXT: v_mov_b32_e32 v2, 2.0 +; GCN-NEXT: v_mov_b32_e32 v3, 1.0 +; GCN-NEXT: exp mrt0 v3, off, off, off done +; GCN-NEXT: s_setprio 0 +; GCN-NEXT: s_nop 0 +; GCN-NEXT: s_nop 0 +; GCN-NEXT: s_endpgm + call void @llvm.amdgcn.exp.f32(i32 0, i32 1, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false) + ret void +} + +define amdgpu_gs void @test_export_gs() #0 { +; GCN-LABEL: test_export_gs: +; GCN: ; %bb.0: +; GCN-NEXT: s_setprio 2 +; GCN-NEXT: v_mov_b32_e32 v0, 4.0 +; GCN-NEXT: v_mov_b32_e32 v1, 0.5 +; GCN-NEXT: v_mov_b32_e32 v2, 2.0 +; GCN-NEXT: v_mov_b32_e32 v3, 1.0 +; GCN-NEXT: exp mrt0 off, v2, off, off done +; GCN-NEXT: 
s_setprio 0 +; GCN-NEXT: s_nop 0 +; GCN-NEXT: s_nop 0 +; GCN-NEXT: s_endpgm + call void @llvm.amdgcn.exp.f32(i32 0, i32 2, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false) + ret void +} + +define amdgpu_hs void @test_export_hs() #0 { +; GCN-LABEL: test_export_hs: +; GCN: ; %bb.0: +; GCN-NEXT: s_setprio 2 +; GCN-NEXT: v_mov_b32_e32 v0, 4.0 +; GCN-NEXT: v_mov_b32_e32 v1, 0.5 +; GCN-NEXT: v_mov_b32_e32 v2, 2.0 +; GCN-NEXT: v_mov_b32_e32 v3, 1.0 +; GCN-NEXT: exp mrt0 off, v2, off, off done +; GCN-NEXT: s_setprio 0 +; GCN-NEXT: s_nop 0 +; GCN-NEXT: s_nop 0 +; GCN-NEXT: s_endpgm + call void @llvm.amdgcn.exp.f32(i32 0, i32 2, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false) + ret void +} + +define amdgpu_gfx void @test_export_gfx(float %v) #0 { +; GCN-LABEL: test_export_gfx: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v1, 4.0 +; GCN-NEXT: v_mov_b32_e32 v2, 0.5 +; GCN-NEXT: v_mov_b32_e32 v3, 2.0 +; GCN-NEXT: exp mrt0 off, v3, off, off done +; GCN-NEXT: s_setprio 0 +; GCN-NEXT: s_waitcnt_expcnt null, 0x0 +; GCN-NEXT: s_nop 0 +; GCN-NEXT: s_nop 0 +; GCN-NEXT: s_setprio 2 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] + call void @llvm.amdgcn.exp.f32(i32 0, i32 2, float %v, float 2.0, float 0.5, float 4.0, i1 true, i1 false) + ret void +} + +define amdgpu_cs void @test_export_cs() #0 { +; GCN-LABEL: test_export_cs: +; GCN: ; %bb.0: +; GCN-NEXT: v_mov_b32_e32 v0, 4.0 +; GCN-NEXT: v_mov_b32_e32 v1, 0.5 +; GCN-NEXT: v_mov_b32_e32 v2, 2.0 +; GCN-NEXT: v_mov_b32_e32 v3, 1.0 +; GCN-NEXT: exp mrt0 off, v2, off, off done +; GCN-NEXT: s_endpgm + call void @llvm.amdgcn.exp.f32(i32 0, i32 2, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false) + ret void +} + +define amdgpu_kernel void @test_export_kernel() #0 { +; GCN-LABEL: test_export_kernel: +; GCN: ; %bb.0: +; GCN-NEXT: v_mov_b32_e32 v0, 4.0 +; GCN-NEXT: v_mov_b32_e32 v1, 0.5 +; GCN-NEXT: v_mov_b32_e32 v2, 2.0 +; GCN-NEXT: 
v_mov_b32_e32 v3, 1.0 +; GCN-NEXT: exp mrt0 off, v2, off, off done +; GCN-NEXT: s_endpgm + call void @llvm.amdgcn.exp.f32(i32 0, i32 2, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false) + ret void +} + +define amdgpu_gfx void @test_no_export_gfx(float %v) #0 { +; GCN-LABEL: test_no_export_gfx: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] + ret void +} + +define amdgpu_ps void @test_no_export_ps(float %v) #0 { +; GCN-LABEL: test_no_export_ps: +; GCN: ; %bb.0: +; GCN-NEXT: s_endpgm + ret void +} + +define amdgpu_ps void @test_if_export_f32(i32 %flag, float %x, float %y, float %z, float %w) #0 { +; GCN-LABEL: test_if_export_f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_setprio 2 +; GCN-NEXT: s_mov_b32 s0, exec_lo +; GCN-NEXT: v_cmpx_ne_u32_e32 0, v0 +; GCN-NEXT: s_cbranch_execz .LBB9_2 +; GCN-NEXT: ; %bb.1: ; %exp +; GCN-NEXT: exp mrt0 v1, v2, v3, v4 +; GCN-NEXT: s_setprio 0 +; GCN-NEXT: s_waitcnt_expcnt null, 0x0 +; GCN-NEXT: s_nop 0 +; GCN-NEXT: s_nop 0 +; GCN-NEXT: s_setprio 2 +; GCN-NEXT: .LBB9_2: ; %end +; GCN-NEXT: s_endpgm + %cc = icmp eq i32 %flag, 0 + br i1 %cc, label %end, label %exp + +exp: + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float %z, float %w, i1 false, i1 false) + br label %end + +end: + ret void +} + +define amdgpu_ps void @test_if_export_vm_f32(i32 %flag, float %x, float %y, float %z, float %w) #0 { +; GCN-LABEL: test_if_export_vm_f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_setprio 2 +; GCN-NEXT: s_mov_b32 s0, exec_lo +; GCN-NEXT: v_cmpx_ne_u32_e32 0, v0 +; GCN-NEXT: s_cbranch_execz .LBB10_2 +; GCN-NEXT: ; %bb.1: ; %exp +; GCN-NEXT: exp mrt0 v1, v2, v3, v4 +; GCN-NEXT: s_setprio 0 +; GCN-NEXT: s_waitcnt_expcnt null, 0x0 +; GCN-NEXT: s_nop 0 +; GCN-NEXT: s_nop 0 +; GCN-NEXT: s_setprio 2 +; GCN-NEXT: .LBB10_2: ; %end +; GCN-NEXT: s_endpgm + %cc = icmp eq i32 %flag, 0 + br i1 %cc, label %end, label %exp + +exp: + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, 
float %y, float %z, float %w, i1 false, i1 true) + br label %end + +end: + ret void +} + +define amdgpu_ps void @test_if_export_done_f32(i32 %flag, float %x, float %y, float %z, float %w) #0 { +; GCN-LABEL: test_if_export_done_f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_setprio 2 +; GCN-NEXT: s_mov_b32 s0, exec_lo +; GCN-NEXT: v_cmpx_ne_u32_e32 0, v0 +; GCN-NEXT: s_cbranch_execz .LBB11_2 +; GCN-NEXT: ; %bb.1: ; %exp +; GCN-NEXT: exp mrt0 v1, v2, v3, v4 done +; GCN-NEXT: s_setprio 0 +; GCN-NEXT: s_waitcnt_expcnt null, 0x0 +; GCN-NEXT: s_nop 0 +; GCN-NEXT: s_nop 0 +; GCN-NEXT: s_setprio 2 +; GCN-NEXT: .LBB11_2: ; %end +; GCN-NEXT: s_endpgm + %cc = icmp eq i32 %flag, 0 + br i1 %cc, label %end, label %exp + +exp: + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float %z, float %w, i1 true, i1 false) + br label %end + +end: + ret void +} + +define amdgpu_ps void @test_if_export_vm_done_f32(i32 %flag, float %x, float %y, float %z, float %w) #0 { +; GCN-LABEL: test_if_export_vm_done_f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_setprio 2 +; GCN-NEXT: s_mov_b32 s0, exec_lo +; GCN-NEXT: v_cmpx_ne_u32_e32 0, v0 +; GCN-NEXT: s_cbranch_execz .LBB12_2 +; GCN-NEXT: ; %bb.1: ; %exp +; GCN-NEXT: exp mrt0 v1, v2, v3, v4 done +; GCN-NEXT: s_setprio 0 +; GCN-NEXT: s_waitcnt_expcnt null, 0x0 +; GCN-NEXT: s_nop 0 +; GCN-NEXT: s_nop 0 +; GCN-NEXT: s_setprio 2 +; GCN-NEXT: .LBB12_2: ; %end +; GCN-NEXT: s_endpgm + %cc = icmp eq i32 %flag, 0 + br i1 %cc, label %end, label %exp + +exp: + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float %z, float %w, i1 true, i1 true) + br label %end + +end: + ret void +} + +define amdgpu_ps void @test_export_pos_before_param_across_load(i32 %idx) #0 { +; GCN-LABEL: test_export_pos_before_param_across_load: +; GCN: ; %bb.0: +; GCN-NEXT: s_setprio 2 +; GCN-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: v_mov_b32_e32 v2, 1.0 +; GCN-NEXT: v_mov_b32_e32 v3, 0.5 +; GCN-NEXT: s_waitcnt vmcnt(0) 
+; GCN-NEXT: exp pos0 v1, v1, v1, v0 done +; GCN-NEXT: exp invalid_target_32 v2, v2, v2, v2 +; GCN-NEXT: exp invalid_target_33 v2, v2, v2, v3 +; GCN-NEXT: s_setprio 0 +; GCN-NEXT: s_nop 0 +; GCN-NEXT: s_nop 0 +; GCN-NEXT: s_endpgm + call void @llvm.amdgcn.exp.f32(i32 32, i32 15, float 1.0, float 1.0, float 1.0, float 1.0, i1 false, i1 false) + call void @llvm.amdgcn.exp.f32(i32 33, i32 15, float 1.0, float 1.0, float 1.0, float 0.5, i1 false, i1 false) + %load = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx, i32 0, i32 0) + call void @llvm.amdgcn.exp.f32(i32 12, i32 15, float 0.0, float 0.0, float 0.0, float %load, i1 true, i1 false) + ret void +} + +define amdgpu_ps void @test_export_across_store_load(i32 %idx, float %v) #0 { +; GCN-LABEL: test_export_across_store_load: +; GCN: ; %bb.0: +; GCN-NEXT: s_setprio 2 +; GCN-NEXT: v_mov_b32_e32 v2, 24 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GCN-NEXT: v_cndmask_b32_e64 v0, v2, 8, vcc_lo +; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: scratch_store_b32 v0, v1, off +; GCN-NEXT: scratch_load_b32 v0, off, off +; GCN-NEXT: v_mov_b32_e32 v1, 1.0 +; GCN-NEXT: exp pos0 v2, v2, v2, v1 done +; GCN-NEXT: s_setprio 0 +; GCN-NEXT: s_waitcnt_expcnt null, 0x0 +; GCN-NEXT: s_nop 0 +; GCN-NEXT: s_nop 0 +; GCN-NEXT: s_setprio 2 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: exp invalid_target_32 v0, v2, v1, v2 +; GCN-NEXT: exp invalid_target_33 v0, v2, v1, v2 +; GCN-NEXT: s_setprio 0 +; GCN-NEXT: s_nop 0 +; GCN-NEXT: s_nop 0 +; GCN-NEXT: s_endpgm + %data0 = alloca <4 x float>, align 8, addrspace(5) + %data1 = alloca <4 x float>, align 8, addrspace(5) + %cmp = icmp eq i32 %idx, 1 + %data = select i1 %cmp, ptr addrspace(5) %data0, ptr addrspace(5) %data1 + store float %v, ptr addrspace(5) %data, align 8 + call void @llvm.amdgcn.exp.f32(i32 12, i32 15, float 0.0, float 0.0, float 0.0, float 1.0, i1 true, i1 false) + %load0 = load float, ptr addrspace(5) %data0, 
align 8 + call void @llvm.amdgcn.exp.f32(i32 32, i32 15, float %load0, float 0.0, float 1.0, float 0.0, i1 false, i1 false) + call void @llvm.amdgcn.exp.f32(i32 33, i32 15, float %load0, float 0.0, float 1.0, float 0.0, i1 false, i1 false) + ret void +} + +define amdgpu_ps void @test_export_in_callee(float %v) #0 { +; GCN-LABEL: test_export_in_callee: +; GCN: ; %bb.0: +; GCN-NEXT: s_setprio 2 +; GCN-NEXT: s_getpc_b64 s[0:1] +; GCN-NEXT: s_add_u32 s0, s0, test_export_gfx@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s1, s1, test_export_gfx@gotpcrel32@hi+12 +; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GCN-NEXT: s_mov_b32 s32, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GCN-NEXT: s_endpgm + %x = fadd float %v, 1.0 + call void @test_export_gfx(float %x) + ret void +} + +define amdgpu_ps void @test_export_in_callee_prio(float %v) #0 { +; GCN-LABEL: test_export_in_callee_prio: +; GCN: ; %bb.0: +; GCN-NEXT: s_setprio 2 +; GCN-NEXT: s_mov_b32 s32, 0 +; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GCN-NEXT: s_setprio 2 +; GCN-NEXT: s_getpc_b64 s[0:1] +; GCN-NEXT: s_add_u32 s0, s0, test_export_gfx@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s1, s1, test_export_gfx@gotpcrel32@hi+12 +; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GCN-NEXT: s_endpgm + %x = fadd float %v, 1.0 + call void @llvm.amdgcn.s.setprio(i16 0) + call void @test_export_gfx(float %x) + ret void +} + +declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #1 +declare void @llvm.amdgcn.exp.i32(i32, i32, i32, i32, i32, i32, i1, i1) #1 +declare float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8), i32, i32, i32) #2 +declare void @llvm.amdgcn.s.setprio(i16) + +attributes #0 = { nounwind } +attributes #1 = { nounwind inaccessiblememonly } +attributes #2 = { nounwind readnone } diff --git a/llvm/test/CodeGen/AMDGPU/required-export-priority.mir 
b/llvm/test/CodeGen/AMDGPU/required-export-priority.mir new file mode 100644 index 0000000000000..eee04468036e5 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/required-export-priority.mir @@ -0,0 +1,293 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -mattr=-wavefrontsize32,+wavefrontsize64 -run-pass=post-RA-hazard-rec -verify-machineinstrs %s -o - | FileCheck -check-prefixes=GFX1150 %s + +--- | + define amdgpu_ps void @end_of_shader() { + ret void + } + define amdgpu_ps void @end_of_shader_return_to_epilogue() { + ret void + } + define amdgpu_ps void @end_of_block() { + ret void + } + define amdgpu_ps void @start_of_block() { + ret void + } + define amdgpu_ps void @block_of_exports() { + ret void + } + define amdgpu_ps void @sparse_exports() { + ret void + } + define amdgpu_ps void @existing_setprio_1() { + ret void + } + define amdgpu_ps void @existing_setprio_2() { + ret void + } +... + +--- +name: end_of_shader +tracksRegLiveness: true +liveins: + - { reg: '$vgpr0' } +body: | + bb.0: + liveins: $vgpr0 + ; GFX1150-LABEL: name: end_of_shader + ; GFX1150: liveins: $vgpr0 + ; GFX1150-NEXT: {{ $}} + ; GFX1150-NEXT: S_SETPRIO 2 + ; GFX1150-NEXT: EXP_DONE 0, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec + ; GFX1150-NEXT: S_SETPRIO 0 + ; GFX1150-NEXT: S_NOP 0 + ; GFX1150-NEXT: S_NOP 0 + ; GFX1150-NEXT: S_ENDPGM 0 + EXP_DONE 0, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec + S_ENDPGM 0 +... 
+ +--- +name: end_of_shader_return_to_epilogue +tracksRegLiveness: true +liveins: + - { reg: '$vgpr0' } +body: | + bb.0: + liveins: $vgpr0 + ; GFX1150-LABEL: name: end_of_shader_return_to_epilogue + ; GFX1150: liveins: $vgpr0 + ; GFX1150-NEXT: {{ $}} + ; GFX1150-NEXT: S_SETPRIO 2 + ; GFX1150-NEXT: EXP_DONE 0, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec + ; GFX1150-NEXT: S_SETPRIO 0 + ; GFX1150-NEXT: S_WAITCNT_EXPCNT $sgpr_null, 0 + ; GFX1150-NEXT: S_NOP 0 + ; GFX1150-NEXT: S_NOP 0 + ; GFX1150-NEXT: S_SETPRIO 2 + ; GFX1150-NEXT: SI_RETURN_TO_EPILOG $vgpr0 + EXP_DONE 0, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec + SI_RETURN_TO_EPILOG $vgpr0 +... + +--- +name: end_of_block +tracksRegLiveness: true +liveins: + - { reg: '$vgpr0' } +body: | + ; GFX1150-LABEL: name: end_of_block + ; GFX1150: bb.0: + ; GFX1150-NEXT: successors: %bb.1(0x80000000) + ; GFX1150-NEXT: liveins: $vgpr0 + ; GFX1150-NEXT: {{ $}} + ; GFX1150-NEXT: S_SETPRIO 2 + ; GFX1150-NEXT: EXP_DONE 0, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec + ; GFX1150-NEXT: S_SETPRIO 0 + ; GFX1150-NEXT: S_WAITCNT_EXPCNT $sgpr_null, 0 + ; GFX1150-NEXT: S_NOP 0 + ; GFX1150-NEXT: S_NOP 0 + ; GFX1150-NEXT: S_SETPRIO 2 + ; GFX1150-NEXT: {{ $}} + ; GFX1150-NEXT: bb.1: + ; GFX1150-NEXT: S_ENDPGM 0 + bb.0: + liveins: $vgpr0 + EXP_DONE 0, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec + + bb.1: + S_ENDPGM 0 +... 
+ +--- +name: start_of_block +tracksRegLiveness: true +liveins: + - { reg: '$vgpr0' } +body: | + ; GFX1150-LABEL: name: start_of_block + ; GFX1150: bb.0: + ; GFX1150-NEXT: successors: %bb.1(0x80000000) + ; GFX1150-NEXT: liveins: $vgpr0 + ; GFX1150-NEXT: {{ $}} + ; GFX1150-NEXT: S_SETPRIO 2 + ; GFX1150-NEXT: {{ $}} + ; GFX1150-NEXT: bb.1: + ; GFX1150-NEXT: successors: %bb.2(0x80000000) + ; GFX1150-NEXT: liveins: $vgpr0 + ; GFX1150-NEXT: {{ $}} + ; GFX1150-NEXT: EXP_DONE 0, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec + ; GFX1150-NEXT: S_SETPRIO 0 + ; GFX1150-NEXT: S_WAITCNT_EXPCNT $sgpr_null, 0 + ; GFX1150-NEXT: S_NOP 0 + ; GFX1150-NEXT: S_NOP 0 + ; GFX1150-NEXT: S_SETPRIO 2 + ; GFX1150-NEXT: {{ $}} + ; GFX1150-NEXT: bb.2: + ; GFX1150-NEXT: S_ENDPGM 0 + bb.0: + liveins: $vgpr0 + + bb.1: + liveins: $vgpr0 + EXP_DONE 0, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec + + bb.2: + S_ENDPGM 0 +... + +--- +name: block_of_exports +tracksRegLiveness: true +liveins: + - { reg: '$vgpr0' } +body: | + bb.0: + liveins: $vgpr0 + ; GFX1150-LABEL: name: block_of_exports + ; GFX1150: liveins: $vgpr0 + ; GFX1150-NEXT: {{ $}} + ; GFX1150-NEXT: S_SETPRIO 2 + ; GFX1150-NEXT: EXP 2, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec + ; GFX1150-NEXT: EXP 1, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec + ; GFX1150-NEXT: EXP_DONE 0, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec + ; GFX1150-NEXT: S_SETPRIO 0 + ; GFX1150-NEXT: S_NOP 0 + ; GFX1150-NEXT: S_NOP 0 + ; GFX1150-NEXT: S_ENDPGM 0 + EXP 2, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec + EXP 1, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec + EXP_DONE 0, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec + S_ENDPGM 0 +... 
+ +--- +name: sparse_exports +tracksRegLiveness: true +liveins: + - { reg: '$vgpr0' } +body: | + bb.0: + liveins: $vgpr0 + ; GFX1150-LABEL: name: sparse_exports + ; GFX1150: liveins: $vgpr0 + ; GFX1150-NEXT: {{ $}} + ; GFX1150-NEXT: S_SETPRIO 2 + ; GFX1150-NEXT: EXP 2, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec + ; GFX1150-NEXT: S_SETPRIO 0 + ; GFX1150-NEXT: S_WAITCNT_EXPCNT $sgpr_null, 0 + ; GFX1150-NEXT: S_NOP 0 + ; GFX1150-NEXT: S_NOP 0 + ; GFX1150-NEXT: S_SETPRIO 2 + ; GFX1150-NEXT: $vgpr0 = V_AND_B32_e32 1, $vgpr0, implicit $exec + ; GFX1150-NEXT: EXP 1, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec + ; GFX1150-NEXT: S_SETPRIO 0 + ; GFX1150-NEXT: S_WAITCNT_EXPCNT $sgpr_null, 0 + ; GFX1150-NEXT: S_NOP 0 + ; GFX1150-NEXT: S_NOP 0 + ; GFX1150-NEXT: S_SETPRIO 2 + ; GFX1150-NEXT: $vgpr0 = V_OR_B32_e32 2, $vgpr0, implicit $exec + ; GFX1150-NEXT: EXP_DONE 0, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec + ; GFX1150-NEXT: S_SETPRIO 0 + ; GFX1150-NEXT: S_NOP 0 + ; GFX1150-NEXT: S_NOP 0 + ; GFX1150-NEXT: S_ENDPGM 0 + EXP 2, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec + $vgpr0 = V_AND_B32_e32 1, $vgpr0, implicit $exec + EXP 1, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec + $vgpr0 = V_OR_B32_e32 2, $vgpr0, implicit $exec + EXP_DONE 0, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec + S_ENDPGM 0 +... 
+ +--- +name: existing_setprio_1 +tracksRegLiveness: true +liveins: + - { reg: '$vgpr0' } +body: | + ; GFX1150-LABEL: name: existing_setprio_1 + ; GFX1150: bb.0: + ; GFX1150-NEXT: successors: %bb.1(0x80000000) + ; GFX1150-NEXT: liveins: $vgpr0 + ; GFX1150-NEXT: {{ $}} + ; GFX1150-NEXT: S_SETPRIO 2 + ; GFX1150-NEXT: $vgpr0 = V_AND_B32_e32 1, $vgpr0, implicit $exec + ; GFX1150-NEXT: {{ $}} + ; GFX1150-NEXT: bb.1: + ; GFX1150-NEXT: successors: %bb.2(0x80000000) + ; GFX1150-NEXT: liveins: $vgpr0 + ; GFX1150-NEXT: {{ $}} + ; GFX1150-NEXT: S_SETPRIO 3 + ; GFX1150-NEXT: $vgpr0 = V_OR_B32_e32 2, $vgpr0, implicit $exec + ; GFX1150-NEXT: S_SETPRIO 2 + ; GFX1150-NEXT: {{ $}} + ; GFX1150-NEXT: bb.2: + ; GFX1150-NEXT: successors: %bb.3(0x80000000) + ; GFX1150-NEXT: liveins: $vgpr0 + ; GFX1150-NEXT: {{ $}} + ; GFX1150-NEXT: S_SETPRIO 3 + ; GFX1150-NEXT: $vgpr0 = V_OR_B32_e32 3, $vgpr0, implicit $exec + ; GFX1150-NEXT: S_SETPRIO 2 + ; GFX1150-NEXT: {{ $}} + ; GFX1150-NEXT: bb.3: + ; GFX1150-NEXT: liveins: $vgpr0 + ; GFX1150-NEXT: {{ $}} + ; GFX1150-NEXT: EXP 1, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec + ; GFX1150-NEXT: EXP_DONE 0, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec + ; GFX1150-NEXT: S_SETPRIO 0 + ; GFX1150-NEXT: S_NOP 0 + ; GFX1150-NEXT: S_NOP 0 + ; GFX1150-NEXT: S_ENDPGM 0 + bb.0: + liveins: $vgpr0 + $vgpr0 = V_AND_B32_e32 1, $vgpr0, implicit $exec + + bb.1: + liveins: $vgpr0 + S_SETPRIO 3 + $vgpr0 = V_OR_B32_e32 2, $vgpr0, implicit $exec + S_SETPRIO 0 + + bb.2: + liveins: $vgpr0 + S_SETPRIO 1 + $vgpr0 = V_OR_B32_e32 3, $vgpr0, implicit $exec + S_SETPRIO 0 + + bb.3: + liveins: $vgpr0 + EXP 1, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec + EXP_DONE 0, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec + S_ENDPGM 0 +... 
+ +--- +name: existing_setprio_2 +tracksRegLiveness: true +liveins: + - { reg: '$vgpr0' } +body: | + bb.0: + liveins: $vgpr0 + ; GFX1150-LABEL: name: existing_setprio_2 + ; GFX1150: liveins: $vgpr0 + ; GFX1150-NEXT: {{ $}} + ; GFX1150-NEXT: S_SETPRIO 3 + ; GFX1150-NEXT: EXP 1, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec + ; GFX1150-NEXT: EXP_DONE 0, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec + ; GFX1150-NEXT: S_SETPRIO 0 + ; GFX1150-NEXT: S_WAITCNT_EXPCNT $sgpr_null, 0 + ; GFX1150-NEXT: S_NOP 0 + ; GFX1150-NEXT: S_NOP 0 + ; GFX1150-NEXT: S_SETPRIO 2 + ; GFX1150-NEXT: S_SETPRIO 3 + ; GFX1150-NEXT: S_ENDPGM 0 + S_SETPRIO 3 + EXP 1, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec + EXP_DONE 0, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec + S_SETPRIO 3 + S_ENDPGM 0 +... From 8ea44e65f2c19facff751aeb2ac960f907fb210f Mon Sep 17 00:00:00 2001 From: Carl Ritson Date: Wed, 17 Jul 2024 16:18:02 +0900 Subject: [PATCH 2/3] Remove -verify-machineinstrs from test. --- llvm/test/CodeGen/AMDGPU/required-export-priority.ll | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/test/CodeGen/AMDGPU/required-export-priority.ll b/llvm/test/CodeGen/AMDGPU/required-export-priority.ll index 377902f3f0d1a..ebc209bd4d451 100644 --- a/llvm/test/CodeGen/AMDGPU/required-export-priority.ll +++ b/llvm/test/CodeGen/AMDGPU/required-export-priority.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefix=GCN %s define amdgpu_ps void @test_export_zeroes_f32() #0 { ; GCN-LABEL: test_export_zeroes_f32: