From c609205ba0205f996a033807d384e9148d30ea2e Mon Sep 17 00:00:00 2001 From: Stanislav Mekhanoshin Date: Thu, 2 Mar 2023 14:10:01 -0800 Subject: [PATCH] AMDGPU: Match and Select BITOP3 on gfx950 Co-authored-by: Stanislav Mekhanoshin --- llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 170 ++++++++ llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h | 3 + .../AMDGPU/AMDGPUInstructionSelector.cpp | 202 ++++++++++ .../Target/AMDGPU/AMDGPUInstructionSelector.h | 1 + llvm/lib/Target/AMDGPU/VOP3Instructions.td | 13 + llvm/test/CodeGen/AMDGPU/bitop3.ll | 368 ++++++++++++++++++ 6 files changed, 757 insertions(+) create mode 100644 llvm/test/CodeGen/AMDGPU/bitop3.ll diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index 7d78e9cd7eab6f..c0e01a020e0eb9 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -3552,6 +3552,176 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixMods(SDValue In, SDValue &Src, return true; } +// Match BITOP3 operation and return a number of matched instructions plus +// truth table. +static std::pair BitOp3_Op(SDValue In, + SmallVectorImpl &Src) { + unsigned NumOpcodes = 0; + uint8_t LHSBits, RHSBits; + + auto getOperandBits = [&Src, In](SDValue Op, uint8_t &Bits) -> bool { + // Define truth table given Src0, Src1, Src2 bits permutations: + // 0 0 0 + // 0 0 1 + // 0 1 0 + // 0 1 1 + // 1 0 0 + // 1 0 1 + // 1 1 0 + // 1 1 1 + const uint8_t SrcBits[3] = { 0xf0, 0xcc, 0xaa }; + + if (auto *C = dyn_cast(Op)) { + if (C->isAllOnes()) { + Bits = 0xff; + return true; + } + if (C->isZero()) { + Bits = 0; + return true; + } + } + + for (unsigned I = 0; I < Src.size(); ++I) { + // Try to find existing reused operand + if (Src[I] == Op) { + Bits = SrcBits[I]; + return true; + } + // Try to replace parent operator + if (Src[I] == In) { + Bits = SrcBits[I]; + Src[I] = Op; + return true; + } + } + + if (Src.size() == 3) { + // No room left for operands. Try one last time, there can be a 'not' of + // one of our source operands. In this case we can compute the bits + // without growing Src vector. + if (Op.getOpcode() == ISD::XOR) { + if (auto *C = dyn_cast(Op.getOperand(1))) { + if (C->isAllOnes()) { + SDValue LHS = Op.getOperand(0); + for (unsigned I = 0; I < Src.size(); ++I) { + if (Src[I] == LHS) { + Bits = ~SrcBits[I]; + return true; + } + } + } + } + } + + return false; + } + + Bits = SrcBits[Src.size()]; + Src.push_back(Op); + return true; + }; + + switch (In.getOpcode()) { + case ISD::AND: + case ISD::OR: + case ISD::XOR: { + SDValue LHS = In.getOperand(0); + SDValue RHS = In.getOperand(1); + + SmallVector Backup(Src.begin(), Src.end()); + if (!getOperandBits(LHS, LHSBits) || + !getOperandBits(RHS, RHSBits)) { + Src = Backup; + return std::make_pair(0, 0); + } + + // Recursion is naturally limited by the size of the operand vector. 
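+    // The two recursive calls below fold each side into its own 8-bit truth
+    // table over the shared Src operands; the switch after the recursion then
+    // combines the two tables with the current opcode. For example, for
+    // (a & b) ^ c with Src = {a, b, c}: the LHS folds to 0xf0 & 0xcc = 0xc0,
+    // the column for c is 0xaa, and the final table is 0xc0 ^ 0xaa = 0x6a
+    // with two matched operations.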
+    auto Op = BitOp3_Op(LHS, Src);
+    if (Op.first) {
+      NumOpcodes += Op.first;
+      LHSBits = Op.second;
+    }
+
+    Op = BitOp3_Op(RHS, Src);
+    if (Op.first) {
+      NumOpcodes += Op.first;
+      RHSBits = Op.second;
+    }
+    break;
+  }
+  default:
+    return std::make_pair(0, 0);
+  }
+
+  uint8_t TTbl;
+  switch (In.getOpcode()) {
+  case ISD::AND:
+    TTbl = LHSBits & RHSBits;
+    break;
+  case ISD::OR:
+    TTbl = LHSBits | RHSBits;
+    break;
+  case ISD::XOR:
+    TTbl = LHSBits ^ RHSBits;
+    break;
+  default:
+    break;
+  }
+
+  return std::make_pair(NumOpcodes + 1, TTbl);
+}
+
+bool AMDGPUDAGToDAGISel::SelectBITOP3(SDValue In, SDValue &Src0, SDValue &Src1,
+                                      SDValue &Src2, SDValue &Tbl) const {
+  SmallVector<SDValue, 3> Src;
+  uint8_t TTbl;
+  unsigned NumOpcodes;
+
+  std::tie(NumOpcodes, TTbl) = BitOp3_Op(In, Src);
+
+  // The Src.empty() case can happen if all operands are constant all-zeros
+  // or all-ones. Normally this is optimized out before it reaches the
+  // selector.
+  if (NumOpcodes < 2 || Src.empty())
+    return false;
+
+  // For a uniform case the threshold should be higher to account for moves
+  // between VGPRs and SGPRs. It needs one operand in a VGPR, the other two
+  // can be in SGPRs, and a readfirstlane after.
+  if (NumOpcodes < 4 && !In->isDivergent())
+    return false;
+
+  if (NumOpcodes == 2 && In.getValueType() == MVT::i32) {
+    // Avoid using BITOP3 for OR3, XOR3 and AND_OR. This is not faster, but
+    // makes the asm more readable. It cannot be modeled with AddedComplexity
+    // because the selector does not know how many operations were matched.
+    if ((In.getOpcode() == ISD::XOR || In.getOpcode() == ISD::OR) &&
+        (In.getOperand(0).getOpcode() == In.getOpcode() ||
+         In.getOperand(1).getOpcode() == In.getOpcode()))
+      return false;
+
+    if (In.getOpcode() == ISD::OR &&
+        (In.getOperand(0).getOpcode() == ISD::AND ||
+         In.getOperand(1).getOpcode() == ISD::AND))
+      return false;
+  }
+
+  // The last operand can be ignored, turning a ternary operation into a
+  // binary one. For example: (~a & b & c) | (~a & b & ~c) -> (~a & b). We can
+  // replace 'c' with 'a' here without changing the answer. In some
+  // pathological cases even a single-operand operation is possible if the
+  // optimizer did not catch it earlier.
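+  // E.g. for (~a & b) only two sources are matched and TTbl = 0x0c; that
+  // table has no dependence on the src2 column (0xaa), so duplicating Src[0]
+  // below leaves the result unchanged (see test_12: v0, v1, v0, bitop3:0xc).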
+ while (Src.size() < 3) + Src.push_back(Src[0]); + + Src0 = Src[0]; + Src1 = Src[1]; + Src2 = Src[2]; + + Tbl = CurDAG->getTargetConstant(TTbl, SDLoc(In), MVT::i32); + return true; +} + SDValue AMDGPUDAGToDAGISel::getHi16Elt(SDValue In) const { if (In.isUndef()) return CurDAG->getUNDEF(MVT::i32); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h index 5ae0b179d7d0e6..7e61eb470622f1 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h @@ -242,6 +242,9 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel { SDValue &SrcMods) const; bool SelectVOP3PMadMixMods(SDValue In, SDValue &Src, SDValue &SrcMods) const; + bool SelectBITOP3(SDValue In, SDValue &Src0, SDValue &Src1, SDValue &Src2, + SDValue &Tbl) const; + SDValue getHi16Elt(SDValue In) const; SDValue getMaterializedScalarImm32(int64_t Val, const SDLoc &DL) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 7ce7562cdcaa95..71d23f9fe30c49 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -3643,6 +3643,206 @@ bool AMDGPUInstructionSelector::selectWaveAddress(MachineInstr &MI) const { return true; } +// Match BITOP3 operation and return a number of matched instructions plus +// truth table. +static std::pair BitOp3_Op(Register R, + SmallVectorImpl &Src, + const MachineRegisterInfo &MRI) { + unsigned NumOpcodes = 0; + uint8_t LHSBits, RHSBits; + + auto getOperandBits = [&Src, R, &MRI](Register Op, uint8_t &Bits) -> bool { + // Define truth table given Src0, Src1, Src2 bits permutations: + // 0 0 0 + // 0 0 1 + // 0 1 0 + // 0 1 1 + // 1 0 0 + // 1 0 1 + // 1 1 0 + // 1 1 1 + const uint8_t SrcBits[3] = { 0xf0, 0xcc, 0xaa }; + + if (mi_match(Op, MRI, m_AllOnesInt())) { + Bits = 0xff; + return true; + } + if (mi_match(Op, MRI, m_ZeroInt())) { + Bits = 0; + return true; + } + + for (unsigned I = 0; I < Src.size(); ++I) { + // Try to find existing reused operand + if (Src[I] == Op) { + Bits = SrcBits[I]; + return true; + } + // Try to replace parent operator + if (Src[I] == R) { + Bits = SrcBits[I]; + Src[I] = Op; + return true; + } + } + + if (Src.size() == 3) { + // No room left for operands. Try one last time, there can be a 'not' of + // one of our source operands. In this case we can compute the bits + // without growing Src vector. + Register LHS; + if (mi_match(Op, MRI, m_Not(m_Reg(LHS)))) { + LHS = getSrcRegIgnoringCopies(LHS, MRI); + for (unsigned I = 0; I < Src.size(); ++I) { + if (Src[I] == LHS) { + Bits = ~SrcBits[I]; + return true; + } + } + } + + return false; + } + + Bits = SrcBits[Src.size()]; + Src.push_back(Op); + return true; + }; + + MachineInstr *MI = MRI.getVRegDef(R); + switch (MI->getOpcode()) { + case TargetOpcode::G_AND: + case TargetOpcode::G_OR: + case TargetOpcode::G_XOR: { + Register LHS = getSrcRegIgnoringCopies(MI->getOperand(1).getReg(), MRI); + Register RHS = getSrcRegIgnoringCopies(MI->getOperand(2).getReg(), MRI); + + SmallVector Backup(Src.begin(), Src.end()); + if (!getOperandBits(LHS, LHSBits) || + !getOperandBits(RHS, RHSBits)) { + Src = Backup; + return std::make_pair(0, 0); + } + + // Recursion is naturally limited by the size of the operand vector. 
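+    // As in the SelectionDAG path, each side is folded into an 8-bit table
+    // before the combine below. For example, for (~a | ~b) the inner xors
+    // with -1 give 0xf0 ^ 0xff = 0x0f and 0xcc ^ 0xff = 0x33, and the outer
+    // or gives 0x0f | 0x33 = 0x3f -- the bitop3:0x3f immediate checked by
+    // test_63 in the new bitop3.ll test.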
+    auto Op = BitOp3_Op(LHS, Src, MRI);
+    if (Op.first) {
+      NumOpcodes += Op.first;
+      LHSBits = Op.second;
+    }
+
+    Op = BitOp3_Op(RHS, Src, MRI);
+    if (Op.first) {
+      NumOpcodes += Op.first;
+      RHSBits = Op.second;
+    }
+    break;
+  }
+  default:
+    return std::make_pair(0, 0);
+  }
+
+  uint8_t TTbl;
+  switch (MI->getOpcode()) {
+  case TargetOpcode::G_AND:
+    TTbl = LHSBits & RHSBits;
+    break;
+  case TargetOpcode::G_OR:
+    TTbl = LHSBits | RHSBits;
+    break;
+  case TargetOpcode::G_XOR:
+    TTbl = LHSBits ^ RHSBits;
+    break;
+  default:
+    break;
+  }
+
+  return std::make_pair(NumOpcodes + 1, TTbl);
+}
+
+bool AMDGPUInstructionSelector::selectBITOP3(MachineInstr &MI) const {
+  if (!Subtarget->hasBitOp3Insts())
+    return false;
+
+  SmallVector<Register, 3> Src;
+  uint8_t TTbl;
+  unsigned NumOpcodes;
+  Register DstReg = MI.getOperand(0).getReg();
+
+  std::tie(NumOpcodes, TTbl) = BitOp3_Op(DstReg, Src, *MRI);
+
+  // The Src.empty() case can happen if all operands are constant all-zeros
+  // or all-ones. Normally this is optimized out before it reaches the
+  // selector.
+  if (NumOpcodes < 2 || Src.empty())
+    return false;
+
+  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
+  const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
+
+  // For a uniform case the threshold should be higher to account for moves
+  // between VGPRs and SGPRs. It needs one operand in a VGPR, the other two
+  // can be in SGPRs, and a readfirstlane after.
+  if (NumOpcodes < 4 && !IsVALU)
+    return false;
+
+  bool IsB32 = MRI->getType(DstReg) == LLT::scalar(32);
+  if (NumOpcodes == 2 && IsB32) {
+    // Avoid using BITOP3 for OR3, XOR3 and AND_OR. This is not faster, but
+    // makes the asm more readable. It cannot be modeled with AddedComplexity
+    // because the selector does not know how many operations were matched.
+    if (mi_match(MI, *MRI, m_GXor(m_GXor(m_Reg(), m_Reg()), m_Reg())) ||
+        mi_match(MI, *MRI, m_GOr(m_GOr(m_Reg(), m_Reg()), m_Reg())) ||
+        mi_match(MI, *MRI, m_GOr(m_GAnd(m_Reg(), m_Reg()), m_Reg())))
+      return false;
+  }
+
+  unsigned Opc = IsB32 ? AMDGPU::V_BITOP3_B32_e64 : AMDGPU::V_BITOP3_B16_e64;
+  unsigned CBL = STI.getConstantBusLimit(Opc);
+  MachineBasicBlock *MBB = MI.getParent();
+  const DebugLoc &DL = MI.getDebugLoc();
+
+  // Copy SGPR sources into VGPRs once the constant bus limit for the opcode
+  // is exhausted.
+  for (unsigned I = 0; I < Src.size(); ++I) {
+    const RegisterBank *RB = RBI.getRegBank(Src[I], *MRI, TRI);
+    if (RB->getID() != AMDGPU::SGPRRegBankID)
+      continue;
+    if (CBL > 0) {
+      --CBL;
+      continue;
+    }
+    Register NewReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+    BuildMI(*MBB, MI, DL, TII.get(AMDGPU::COPY), NewReg)
+        .addReg(Src[I]);
+    Src[I] = NewReg;
+  }
+
+  // The last operand can be ignored, turning a ternary operation into a
+  // binary one. For example: (~a & b & c) | (~a & b & ~c) -> (~a & b). We can
+  // replace 'c' with 'a' here without changing the answer. In some
+  // pathological cases even a single-operand operation is possible if the
+  // optimizer did not catch it earlier.
+ while (Src.size() < 3) + Src.push_back(Src[0]); + + auto MIB = BuildMI(*MBB, MI, DL, TII.get(Opc), DstReg); + if (!IsB32) + MIB.addImm(0); // src_mod0 + MIB.addReg(Src[0]); + if (!IsB32) + MIB.addImm(0); // src_mod1 + MIB.addReg(Src[1]); + if (!IsB32) + MIB.addImm(0); // src_mod2 + MIB.addReg(Src[2]) + .addImm(TTbl); + if (!IsB32) + MIB.addImm(0); // op_sel + + constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); + MI.eraseFromParent(); + + return true; +} + bool AMDGPUInstructionSelector::selectStackRestore(MachineInstr &MI) const { Register SrcReg = MI.getOperand(0).getReg(); if (!RBI.constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, *MRI)) @@ -3682,6 +3882,8 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) { case TargetOpcode::G_AND: case TargetOpcode::G_OR: case TargetOpcode::G_XOR: + if (selectBITOP3(I)) + return true; if (selectImpl(I, *CoverageInfo)) return true; return selectG_AND_OR_XOR(I); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h index a81f1579fb9f33..d294300be40497 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -147,6 +147,7 @@ class AMDGPUInstructionSelector final : public InstructionSelector { bool selectSMFMACIntrin(MachineInstr &I) const; bool selectPermlaneSwapIntrin(MachineInstr &I, Intrinsic::ID IntrID) const; bool selectWaveAddress(MachineInstr &I) const; + bool selectBITOP3(MachineInstr &I) const; bool selectStackRestore(MachineInstr &MI) const; bool selectNamedBarrierInit(MachineInstr &I, Intrinsic::ID IID) const; bool selectNamedBarrierInst(MachineInstr &I, Intrinsic::ID IID) const; diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index 34850e42a3d605..c8c36714909adf 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -6,6 +6,9 @@ // //===----------------------------------------------------------------------===// +def BITOP3_32 : ComplexPattern; +def BITOP3_16 : ComplexPattern; + // Special case for v_div_fmas_{f32|f64}, since it seems to be the // only VOP instruction that implicitly reads VCC. 
let Asm64 = " $vdst, $src0_modifiers, $src1_modifiers, $src2_modifiers$clamp$omod" in { @@ -1275,6 +1278,16 @@ let SubtargetPredicate = HasBitOp3Insts in { (i16 (int_amdgcn_bitop3 i16:$src0, i16:$src1, i16:$src2, i8:$bitop3)), (i16 (V_BITOP3_B16_e64 0, VSrc_b16:$src0, 0, VSrc_b16:$src1, 0, VSrc_b16:$src2, timm:$bitop3, 0)) >; + + def : GCNPat< + (i32 (BITOP3_32 i32:$src0, i32:$src1, i32:$src2, i8:$bitop3)), + (i32 (V_BITOP3_B32_e64 VSrc_b32:$src0, VSrc_b32:$src1, VSrc_b32:$src2, timm:$bitop3)) + >; + + def : GCNPat< + (i16 (BITOP3_16 i16:$src0, i16:$src1, i16:$src2, i8:$bitop3)), + (i16 (V_BITOP3_B16_e64 0, VSrc_b16:$src0, 0, VSrc_b16:$src1, 0, VSrc_b16:$src2, timm:$bitop3, 0)) + >; } // End SubtargetPredicate = HasBitOp3Insts class DivFmasPat : GCNPat< diff --git a/llvm/test/CodeGen/AMDGPU/bitop3.ll b/llvm/test/CodeGen/AMDGPU/bitop3.ll new file mode 100644 index 00000000000000..dd608ef0e5a53d --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/bitop3.ll @@ -0,0 +1,368 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx950 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX950,GFX950-SDAG %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx950 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX950,GFX950-GISEL %s + +; ========= Single bit functions ========= + +define amdgpu_ps float @not_and_not_and_not_and(i32 %a, i32 %b, i32 %c) { +; GCN-LABEL: not_and_not_and_not_and: +; GCN: ; %bb.0: +; GCN-NEXT: v_bitop3_b32 v0, v0, v1, v2 bitop3:1 +; GCN-NEXT: ; return to shader part epilog + %nota = xor i32 %a, -1 + %notb = xor i32 %b, -1 + %notc = xor i32 %c, -1 + %and1 = and i32 %nota, %notc + %and2 = and i32 %and1, %notb + %ret_cast = bitcast i32 %and2 to float + ret float %ret_cast +} + +define amdgpu_ps float @not_and_not_and_and(i32 %a, i32 %b, i32 %c) { +; GCN-LABEL: not_and_not_and_and: +; GCN: ; %bb.0: +; GCN-NEXT: v_bitop3_b32 v0, v0, v1, v2 bitop3:2 +; GCN-NEXT: ; return to shader part epilog + %nota = xor i32 %a, -1 + %notb = xor i32 %b, -1 + %and1 = and i32 %nota, %c + %and2 = and i32 %and1, %notb + %ret_cast = bitcast i32 %and2 to float + ret float %ret_cast +} + +define amdgpu_ps float @not_and_and_not_and(i32 %a, i32 %b, i32 %c) { +; GCN-LABEL: not_and_and_not_and: +; GCN: ; %bb.0: +; GCN-NEXT: v_bitop3_b32 v0, v0, v1, v2 bitop3:4 +; GCN-NEXT: ; return to shader part epilog + %nota = xor i32 %a, -1 + %notc = xor i32 %c, -1 + %and1 = and i32 %nota, %notc + %and2 = and i32 %and1, %b + %ret_cast = bitcast i32 %and2 to float + ret float %ret_cast +} + +define amdgpu_ps float @not_and_and_and(i32 %a, i32 %b, i32 %c) { +; GCN-LABEL: not_and_and_and: +; GCN: ; %bb.0: +; GCN-NEXT: v_bitop3_b32 v0, v0, v1, v2 bitop3:8 +; GCN-NEXT: ; return to shader part epilog + %nota = xor i32 %a, -1 + %and1 = and i32 %nota, %c + %and2 = and i32 %and1, %b + %ret_cast = bitcast i32 %and2 to float + ret float %ret_cast +} + +define amdgpu_ps float @and_not_and_not_and(i32 %a, i32 %b, i32 %c) { +; GCN-LABEL: and_not_and_not_and: +; GCN: ; %bb.0: +; GCN-NEXT: v_bitop3_b32 v0, v0, v1, v2 bitop3:0x10 +; GCN-NEXT: ; return to shader part epilog + %notb = xor i32 %b, -1 + %notc = xor i32 %c, -1 + %and1 = and i32 %a, %notc + %and2 = and i32 %and1, %notb + %ret_cast = bitcast i32 %and2 to float + ret float %ret_cast +} + +define amdgpu_ps float @and_not_and_and(i32 %a, i32 %b, i32 %c) { +; GCN-LABEL: and_not_and_and: +; GCN: ; %bb.0: +; GCN-NEXT: v_bitop3_b32 v0, v0, v1, v2 bitop3:0x20 +; GCN-NEXT: ; return to shader part epilog + %notb = xor 
i32 %b, -1 + %and1 = and i32 %a, %c + %and2 = and i32 %and1, %notb + %ret_cast = bitcast i32 %and2 to float + ret float %ret_cast +} + +define amdgpu_ps float @and_and_not_and(i32 %a, i32 %b, i32 %c) { +; GCN-LABEL: and_and_not_and: +; GCN: ; %bb.0: +; GCN-NEXT: v_bitop3_b32 v0, v0, v1, v2 bitop3:0x40 +; GCN-NEXT: ; return to shader part epilog + %notc = xor i32 %c, -1 + %and1 = and i32 %a, %notc + %and2 = and i32 %and1, %b + %ret_cast = bitcast i32 %and2 to float + ret float %ret_cast +} + +define amdgpu_ps float @and_and_and(i32 %a, i32 %b, i32 %c) { +; GCN-LABEL: and_and_and: +; GCN: ; %bb.0: +; GCN-NEXT: v_bitop3_b32 v0, v0, v1, v2 bitop3:0x80 +; GCN-NEXT: ; return to shader part epilog + %and1 = and i32 %a, %c + %and2 = and i32 %and1, %b + %ret_cast = bitcast i32 %and2 to float + ret float %ret_cast +} + +; ========= Multi bit functions ========= + +define amdgpu_ps float @test_12(i32 %a, i32 %b) { +; GCN-LABEL: test_12: +; GCN: ; %bb.0: +; GCN-NEXT: v_bitop3_b32 v0, v0, v1, v0 bitop3:0xc +; GCN-NEXT: ; return to shader part epilog + %nota = xor i32 %a, -1 + %and1 = and i32 %nota, %b + %ret_cast = bitcast i32 %and1 to float + ret float %ret_cast +} + +define amdgpu_ps float @test_63(i32 %a, i32 %b) { +; GCN-LABEL: test_63: +; GCN: ; %bb.0: +; GCN-NEXT: v_bitop3_b32 v0, v0, v1, v0 bitop3:0x3f +; GCN-NEXT: ; return to shader part epilog + %nota = xor i32 %a, -1 + %notb = xor i32 %b, -1 + %or = or i32 %nota, %notb + %ret_cast = bitcast i32 %or to float + ret float %ret_cast +} + +define amdgpu_ps float @test_59(i32 %a, i32 %b, i32 %c) { +; GCN-LABEL: test_59: +; GCN: ; %bb.0: +; GCN-NEXT: v_bitop3_b32 v0, v0, v1, v2 bitop3:0x3b +; GCN-NEXT: ; return to shader part epilog + %nota = xor i32 %a, -1 + %notb = xor i32 %b, -1 + %and1 = and i32 %nota, %c + %or = or i32 %and1, %notb + %ret_cast = bitcast i32 %or to float + ret float %ret_cast +} + +define amdgpu_ps float @test_126(i32 %a, i32 %b, i32 %c) { +; GCN-LABEL: test_126: +; GCN: ; %bb.0: +; GCN-NEXT: v_bitop3_b32 v0, v0, v2, v1 bitop3:0x7e +; GCN-NEXT: ; return to shader part epilog + %xor1 = xor i32 %a, %b + %xor2 = xor i32 %a, %c + %or = or i32 %xor1, %xor2 + %ret_cast = bitcast i32 %or to float + ret float %ret_cast +} + +; Src vector exhausted during search but recovered using 'not' lookahead. +; GlobalISel has slightly different input, so it does not happen. + +; FIXME: Improve global isel code. + +define amdgpu_ps float @test_12_src_overflow(i32 %a, i32 %b, i32 %c) { +; GFX950-SDAG-LABEL: test_12_src_overflow: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_bitop3_b32 v0, v0, v1, v0 bitop3:0xc +; GFX950-SDAG-NEXT: ; return to shader part epilog +; +; GFX950-GISEL-LABEL: test_12_src_overflow: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_bitop3_b32 v3, v0, v2, v0 bitop3:0xc +; GFX950-GISEL-NEXT: v_bitop3_b32 v0, v0, v2, v0 bitop3:3 +; GFX950-GISEL-NEXT: v_bitop3_b32 v0, v3, v1, v0 bitop3:0xc8 +; GFX950-GISEL-NEXT: ; return to shader part epilog + %nota = xor i32 %a, -1 + %notc = xor i32 %c, -1 + %and1 = and i32 %nota, %c + %and2 = and i32 %and1, %b + %and3 = and i32 %nota, %notc + %and4 = and i32 %and3, %b + %or = or i32 %and2, %and4 + %ret_cast = bitcast i32 %or to float + ret float %ret_cast +} + +; This could be a single LOP3 operation with tbl = 100, but Src vector exhausted during search. 
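+; With a = 0xf0, b = 0xcc, c = 0xaa the three product terms below evaluate to
+; (b & ~(a | c)) = 0x04, (a & ~b & c) = 0x20 and (a & b & ~c) = 0x40, so the
+; combined table would be 0x64 (decimal 100).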
+ +define amdgpu_ps float @test_100_src_overflow(i32 %a, i32 %b, i32 %c) { +; GCN-LABEL: test_100_src_overflow: +; GCN: ; %bb.0: +; GCN-NEXT: v_bitop3_b32 v3, v1, v2, v0 bitop3:0x10 +; GCN-NEXT: v_bitop3_b32 v4, v0, v2, v1 bitop3:0x40 +; GCN-NEXT: v_bitop3_b32 v0, v1, v2, v0 bitop3:0x20 +; GCN-NEXT: v_or3_b32 v0, v3, v4, v0 +; GCN-NEXT: ; return to shader part epilog + %or1 = or i32 %c, %a + %not1 = xor i32 %or1, -1 + %and1 = and i32 %b, %not1 + %not2 = xor i32 %b, -1 + %and2 = and i32 %a, %not2 + %and3 = and i32 %and2, %c + %and4 = and i32 %b, %a + %not3 = xor i32 %c, -1 + %and5 = and i32 %and4, %not3 + %or2 = or i32 %and1, %and3 + %or3 = or i32 %or2, %and5 + %ret_cast = bitcast i32 %or3 to float + ret float %ret_cast +} + +; ========= Ternary logical operations take precedence ========= + +define amdgpu_ps float @test_xor3(i32 %a, i32 %b, i32 %c) { +; GCN-LABEL: test_xor3: +; GCN: ; %bb.0: +; GCN-NEXT: v_xor_b32_e32 v0, v0, v1 +; GCN-NEXT: v_xor_b32_e32 v0, v0, v2 +; GCN-NEXT: ; return to shader part epilog + %xor1 = xor i32 %a, %b + %xor2 = xor i32 %xor1, %c + %ret_cast = bitcast i32 %xor2 to float + ret float %ret_cast +} + +define amdgpu_ps float @test_or3(i32 %a, i32 %b, i32 %c) { +; GCN-LABEL: test_or3: +; GCN: ; %bb.0: +; GCN-NEXT: v_or3_b32 v0, v0, v1, v2 +; GCN-NEXT: ; return to shader part epilog + %or1 = or i32 %a, %b + %or2 = or i32 %or1, %c + %ret_cast = bitcast i32 %or2 to float + ret float %ret_cast +} + +define amdgpu_ps float @test_and_or(i32 %a, i32 %b, i32 %c) { +; GCN-LABEL: test_and_or: +; GCN: ; %bb.0: +; GCN-NEXT: v_and_or_b32 v0, v0, v1, v2 +; GCN-NEXT: ; return to shader part epilog + %and1 = and i32 %a, %b + %or1 = or i32 %and1, %c + %ret_cast = bitcast i32 %or1 to float + ret float %ret_cast +} + +; ========= Uniform cases ========= + +define amdgpu_ps float @uniform_3_op(i32 inreg %a, i32 inreg %b, i32 inreg %c) { +; GCN-LABEL: uniform_3_op: +; GCN: ; %bb.0: +; GCN-NEXT: s_andn2_b32 s0, s2, s0 +; GCN-NEXT: s_and_b32 s0, s0, s1 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: ; return to shader part epilog + %nota = xor i32 %a, -1 + %and1 = and i32 %nota, %c + %and2 = and i32 %and1, %b + %ret_cast = bitcast i32 %and2 to float + ret float %ret_cast +} + +define amdgpu_ps float @uniform_4_op(i32 inreg %a, i32 inreg %b, i32 inreg %c) { +; GCN-LABEL: uniform_4_op: +; GCN: ; %bb.0: +; GCN-NEXT: v_mov_b32_e32 v0, s1 +; GCN-NEXT: v_mov_b32_e32 v1, s2 +; GCN-NEXT: v_bitop3_b32 v0, s0, v0, v1 bitop3:2 +; GCN-NEXT: ; return to shader part epilog + %nota = xor i32 %a, -1 + %notb = xor i32 %b, -1 + %and1 = and i32 %nota, %c + %and2 = and i32 %and1, %notb + %ret_cast = bitcast i32 %and2 to float + ret float %ret_cast +} + +; ========= 16 bit tests ========= + +define amdgpu_ps half @not_and_not_and_not_and_b16(i16 %a, i16 %b, i16 %c) { +; GCN-LABEL: not_and_not_and_not_and_b16: +; GCN: ; %bb.0: +; GCN-NEXT: v_bitop3_b16 v0, v0, v1, v2 bitop3:1 +; GCN-NEXT: ; return to shader part epilog + %nota = xor i16 %a, -1 + %notb = xor i16 %b, -1 + %notc = xor i16 %c, -1 + %and1 = and i16 %nota, %notc + %and2 = and i16 %and1, %notb + %ret_cast = bitcast i16 %and2 to half + ret half %ret_cast +} + +define amdgpu_ps half @not_and_not_and_and_b16(i16 %a, i16 %b, i16 %c) { +; GCN-LABEL: not_and_not_and_and_b16: +; GCN: ; %bb.0: +; GCN-NEXT: v_bitop3_b16 v0, v0, v1, v2 bitop3:2 +; GCN-NEXT: ; return to shader part epilog + %nota = xor i16 %a, -1 + %notb = xor i16 %b, -1 + %and1 = and i16 %nota, %c + %and2 = and i16 %and1, %notb + %ret_cast = bitcast i16 %and2 to half + ret half %ret_cast +} + 
+define amdgpu_ps half @not_and_and_not_and_b16(i16 %a, i16 %b, i16 %c) { +; GCN-LABEL: not_and_and_not_and_b16: +; GCN: ; %bb.0: +; GCN-NEXT: v_bitop3_b16 v0, v0, v1, v2 bitop3:4 +; GCN-NEXT: ; return to shader part epilog + %nota = xor i16 %a, -1 + %notc = xor i16 %c, -1 + %and1 = and i16 %nota, %notc + %and2 = and i16 %and1, %b + %ret_cast = bitcast i16 %and2 to half + ret half %ret_cast +} + +define amdgpu_ps half @test_xor3_b16(i16 %a, i16 %b, i16 %c) { +; GFX950-SDAG-LABEL: test_xor3_b16: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_bitop3_b16 v0, v0, v2, v1 bitop3:0x96 +; GFX950-SDAG-NEXT: ; return to shader part epilog +; +; GFX950-GISEL-LABEL: test_xor3_b16: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_xor_b32_e32 v0, v0, v1 +; GFX950-GISEL-NEXT: v_xor_b32_e32 v0, v0, v2 +; GFX950-GISEL-NEXT: ; return to shader part epilog + %xor1 = xor i16 %a, %b + %xor2 = xor i16 %xor1, %c + %ret_cast = bitcast i16 %xor2 to half + ret half %ret_cast +} + +define amdgpu_ps half @test_or3_b16(i16 %a, i16 %b, i16 %c) { +; GFX950-SDAG-LABEL: test_or3_b16: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_bitop3_b16 v0, v0, v2, v1 bitop3:0xfe +; GFX950-SDAG-NEXT: ; return to shader part epilog +; +; GFX950-GISEL-LABEL: test_or3_b16: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_or3_b32 v0, v0, v1, v2 +; GFX950-GISEL-NEXT: ; return to shader part epilog + %or1 = or i16 %a, %b + %or2 = or i16 %or1, %c + %ret_cast = bitcast i16 %or2 to half + ret half %ret_cast +} + +define amdgpu_ps half @test_and_or_b16(i16 %a, i16 %b, i16 %c) { +; GFX950-SDAG-LABEL: test_and_or_b16: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_bitop3_b16 v0, v0, v2, v1 bitop3:0xec +; GFX950-SDAG-NEXT: ; return to shader part epilog +; +; GFX950-GISEL-LABEL: test_and_or_b16: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_and_or_b32 v0, v0, v1, v2 +; GFX950-GISEL-NEXT: ; return to shader part epilog + %and1 = and i16 %a, %b + %or1 = or i16 %and1, %c + %ret_cast = bitcast i16 %or1 to half + ret half %ret_cast +} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX950: {{.*}}
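
The table immediates checked above can be reproduced outside the selector by
evaluating a boolean expression bitwise over the column constants 0xf0/0xcc/0xaa
used for src0/src1/src2. The sketch below is illustrative only and is not part
of the patch; the file name is hypothetical.

// bitop3_table_sketch.cpp -- illustrative sketch, not part of the patch.
// Derives a V_BITOP3 truth-table immediate by evaluating a ternary boolean
// expression bitwise over the same column constants the selector uses.
#include <cstdint>
#include <cstdio>

int main() {
  const uint8_t A = 0xf0, B = 0xcc, C = 0xaa; // src0, src1, src2 columns

  // (~a & ~b & ~c) -> 0x01, the bitop3:1 immediate checked in
  // not_and_not_and_not_and above.
  uint8_t AllNots = ~A & ~B & ~C;

  // ((a & b) ^ c) -> 0x6a, the example worked through in the BitOp3_Op
  // comments.
  uint8_t AndXor = (A & B) ^ C;

  std::printf("0x%02x 0x%02x\n", unsigned(AllNots), unsigned(AndXor));
  return 0;
}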