From d97cf1f88902026b6ebe7fb9d844a285c3b113c5 Mon Sep 17 00:00:00 2001 From: Sjoerd Meijer Date: Wed, 11 Dec 2019 10:11:48 +0000 Subject: [PATCH] [ARM][LowOverheadLoops] Remove dead loop update instructions. After creating a low-overhead loop, the loop update instruction was still lingering around hurting performance. This removes dead loop update instructions, which in our case are mostly SUBS instructions. To support this, some helper functions were added to MachineLoopUtils and ReachingDefAnalysis to analyse live-ins of loop exit blocks and find uses before a particular loop instruction, respectively. This is a first version that removes a SUBS instruction when there are no other uses inside and outside the loop block, but there are some more interesting cases in test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll which shows that there is room for improvement. For example, we can't handle this case yet: .. dlstp.32 lr, r2 .LBB0_1: mov r3, r2 subs r2, #4 vldrh.u32 q2, [r1], #8 vmov q1, q0 vmla.u32 q0, q2, r0 letp lr, .LBB0_1 @ %bb.2: vctp.32 r3 .. which is a lot more tricky because r2 is not only used by the subs, but also by the mov to r3, which is used outside the low-overhead loop by the vctp instruction, and that requires a bit of a different approach, and I will follow up on this. 
Differential Revision: https://reviews.llvm.org/D71007 --- llvm/include/llvm/CodeGen/MachineLoopUtils.h | 5 + .../llvm/CodeGen/ReachingDefAnalysis.h | 7 + llvm/lib/CodeGen/MachineLoopUtils.cpp | 12 ++ llvm/lib/CodeGen/ReachingDefAnalysis.cpp | 26 ++- llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp | 75 +++++++- .../dont-remove-loop-update.mir | 171 +++++++++++++++++ .../dont-remove-loop-update2.mir | 172 +++++++++++++++++ .../dont-remove-loop-update3.mir | 173 ++++++++++++++++++ .../Thumb2/LowOverheadLoops/fast-fp-loops.ll | 1 - .../LowOverheadLoops/mve-tail-data-types.ll | 1 - .../LowOverheadLoops/vector-arith-codegen.ll | 4 - 11 files changed, 638 insertions(+), 9 deletions(-) create mode 100644 llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-remove-loop-update.mir create mode 100644 llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-remove-loop-update2.mir create mode 100644 llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-remove-loop-update3.mir diff --git a/llvm/include/llvm/CodeGen/MachineLoopUtils.h b/llvm/include/llvm/CodeGen/MachineLoopUtils.h index 41379b75d00a6a..2cb0134ca84835 100644 --- a/llvm/include/llvm/CodeGen/MachineLoopUtils.h +++ b/llvm/include/llvm/CodeGen/MachineLoopUtils.h @@ -10,6 +10,7 @@ #define LLVM_LIB_CODEGEN_MACHINELOOPUTILS_H namespace llvm { +class MachineLoop; class MachineBasicBlock; class MachineRegisterInfo; class TargetInstrInfo; @@ -36,6 +37,10 @@ MachineBasicBlock *PeelSingleBlockLoop(LoopPeelDirection Direction, MachineRegisterInfo &MRI, const TargetInstrInfo *TII); +/// Return true if PhysReg is live outside the loop, i.e. determine if it +/// is live in the loop exit blocks, and false otherwise. 
+bool isRegLiveInExitBlocks(MachineLoop *Loop, int PhysReg); + } // namespace llvm #endif // LLVM_LIB_CODEGEN_MACHINELOOPUTILS_H diff --git a/llvm/include/llvm/CodeGen/ReachingDefAnalysis.h b/llvm/include/llvm/CodeGen/ReachingDefAnalysis.h index ac001e326c5701..685ba94e57aaba 100644 --- a/llvm/include/llvm/CodeGen/ReachingDefAnalysis.h +++ b/llvm/include/llvm/CodeGen/ReachingDefAnalysis.h @@ -110,6 +110,13 @@ class ReachingDefAnalysis : public MachineFunctionPass { /// use or a live out. bool isRegUsedAfter(MachineInstr *MI, int PhysReg); + /// Provides the first instruction before MI that uses PhysReg + MachineInstr *getInstWithUseBefore(MachineInstr *MI, int PhysReg); + + /// Provides all instructions before MI that use PhysReg + void getAllInstWithUseBefore(MachineInstr *MI, int PhysReg, + SmallVectorImpl &Uses); + /// Provides the clearance - the number of instructions since the closest /// reaching def instuction of PhysReg that reaches MI. int getClearance(MachineInstr *MI, MCPhysReg PhysReg); diff --git a/llvm/lib/CodeGen/MachineLoopUtils.cpp b/llvm/lib/CodeGen/MachineLoopUtils.cpp index e074b76082f0e2..cf30e28449cd61 100644 --- a/llvm/lib/CodeGen/MachineLoopUtils.cpp +++ b/llvm/lib/CodeGen/MachineLoopUtils.cpp @@ -6,6 +6,7 @@ // //===----------------------------------------------------------------------===// +#include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineLoopUtils.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineRegisterInfo.h" @@ -130,3 +131,14 @@ MachineBasicBlock *llvm::PeelSingleBlockLoop(LoopPeelDirection Direction, return NewBB; } + +bool llvm::isRegLiveInExitBlocks(MachineLoop *Loop, int PhysReg) { + SmallVector ExitBlocks; + Loop->getExitBlocks(ExitBlocks); + + for (auto *MBB : ExitBlocks) + if (MBB->isLiveIn(PhysReg)) + return true; + + return false; +} diff --git a/llvm/lib/CodeGen/ReachingDefAnalysis.cpp b/llvm/lib/CodeGen/ReachingDefAnalysis.cpp index ad7f910be4c52e..e5b422e0b7ed86 100644 
--- a/llvm/lib/CodeGen/ReachingDefAnalysis.cpp +++ b/llvm/lib/CodeGen/ReachingDefAnalysis.cpp @@ -227,7 +227,7 @@ int ReachingDefAnalysis::getClearance(MachineInstr *MI, MCPhysReg PhysReg) { } void ReachingDefAnalysis::getReachingLocalUses(MachineInstr *Def, int PhysReg, - SmallVectorImpl &Uses) { + SmallVectorImpl &Uses) { MachineBasicBlock *MBB = Def->getParent(); MachineBasicBlock::iterator MI = MachineBasicBlock::iterator(Def); while (++MI != MBB->end()) { @@ -272,3 +272,27 @@ bool ReachingDefAnalysis::isRegUsedAfter(MachineInstr *MI, int PhysReg) { return false; } +MachineInstr *ReachingDefAnalysis::getInstWithUseBefore(MachineInstr *MI, + int PhysReg) { + auto I = MachineBasicBlock::reverse_iterator(MI); + auto E = MI->getParent()->rend(); + I++; + + for ( ; I != E; I++) + for (auto &MO : I->operands()) + if (MO.isReg() && MO.isUse() && MO.getReg() == PhysReg) + return &*I; + + return nullptr; +} + +void ReachingDefAnalysis::getAllInstWithUseBefore(MachineInstr *MI, + int PhysReg, SmallVectorImpl &Uses) { + MachineInstr *Use = nullptr; + MachineInstr *Pos = MI; + + while ((Use = getInstWithUseBefore(Pos, PhysReg))) { + Uses.push_back(Use); + Pos = Use; + } +} diff --git a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp index 756d0fdb557026..1abd510588f668 100644 --- a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp +++ b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp @@ -24,6 +24,7 @@ #include "ARMSubtarget.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachineLoopUtils.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/ReachingDefAnalysis.h" @@ -163,6 +164,7 @@ namespace { ReachingDefAnalysis *RDA = nullptr; const ARMBaseInstrInfo *TII = nullptr; MachineRegisterInfo *MRI = nullptr; + const TargetRegisterInfo *TRI = nullptr; std::unique_ptr BBUtils = nullptr; public: @@ -200,6 +202,8 @@ namespace { void 
RevertLoopEnd(MachineInstr *MI, bool SkipCmp = false) const; + void RemoveLoopUpdate(LowOverheadLoop &LoLoop); + void RemoveVPTBlocks(LowOverheadLoop &LoLoop); MachineInstr *ExpandLoopStart(LowOverheadLoop &LoLoop); @@ -383,6 +387,7 @@ bool ARMLowOverheadLoops::runOnMachineFunction(MachineFunction &mf) { MF->getProperties().set(MachineFunctionProperties::Property::TracksLiveness); MRI = &MF->getRegInfo(); TII = static_cast(ST.getInstrInfo()); + TRI = ST.getRegisterInfo(); BBUtils = std::unique_ptr(new ARMBasicBlockUtils(*MF)); BBUtils->computeAllBlockSizes(); BBUtils->adjustBBOffsetsAfter(&MF->front()); @@ -511,7 +516,7 @@ void ARMLowOverheadLoops::RevertWhile(MachineInstr *MI) const { MIB.addImm(0); MIB.addImm(ARMCC::AL); MIB.addReg(ARM::NoRegister); - + MachineBasicBlock *DestBB = MI->getOperand(1).getMBB(); unsigned BrOpc = BBUtils->isBBInRange(MI, DestBB, 254) ? ARM::tBcc : ARM::t2Bcc; @@ -631,6 +636,70 @@ MachineInstr* ARMLowOverheadLoops::ExpandLoopStart(LowOverheadLoop &LoLoop) { return &*MIB; } +// Goal is to optimise and clean-up these loops: +// +// vector.body: +// renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg +// renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3(tied-def 0), 4 +// .. +// $lr = MVE_DLSTP_32 renamable $r3 +// +// The SUB is the old update of the loop iteration count expression, which +// is no longer needed. This sub is removed when the element count, which is in +// r3 in this example, is defined by an instruction in the loop, and it has +// no uses. 
+// +void ARMLowOverheadLoops::RemoveLoopUpdate(LowOverheadLoop &LoLoop) { + Register ElemCount = LoLoop.VCTP->getOperand(1).getReg(); + MachineInstr *LastInstrInBlock = &LoLoop.VCTP->getParent()->back(); + + LLVM_DEBUG(dbgs() << "ARM Loops: Trying to remove loop update stmt\n"); + + if (LoLoop.ML->getNumBlocks() != 1) { + LLVM_DEBUG(dbgs() << "ARM Loops: single block loop expected\n"); + return; + } + + LLVM_DEBUG(dbgs() << "ARM Loops: Analyzing MO: "; + LoLoop.VCTP->getOperand(1).dump()); + + // Find the definition we are interested in removing, if there is one. + MachineInstr *Def = RDA->getReachingMIDef(LastInstrInBlock, ElemCount); + if (!Def) + return; + + // Bail unless Def has a dead definition of CPSR + if (!Def->registerDefIsDead(ARM::CPSR, TRI)) { + LLVM_DEBUG(dbgs() << "ARM Loops: CPSR is not dead\n"); + return; + } + + // Bail if elemcount is used in exit blocks, i.e. if it is live-in. + if (isRegLiveInExitBlocks(LoLoop.ML, ElemCount)) { + LLVM_DEBUG(dbgs() << "ARM Loops: Elemcount is live-out, can't remove stmt\n"); + return; + } + + // Bail if there are uses after this Def in the block. + SmallVector Uses; + RDA->getReachingLocalUses(Def, ElemCount, Uses); + if (Uses.size()) { + LLVM_DEBUG(dbgs() << "ARM Loops: Local uses in block, can't remove stmt\n"); + return; + } + + Uses.clear(); + RDA->getAllInstWithUseBefore(Def, ElemCount, Uses); + + // Remove Def if there are no uses, or if the only use is the VCTP + // instruction. 
+ if (!Uses.size() || (Uses.size() == 1 && Uses[0] == LoLoop.VCTP)) { + LLVM_DEBUG(dbgs() << "ARM Loops: Removing loop update instruction: "; + Def->dump()); + Def->eraseFromParent(); + } +} + void ARMLowOverheadLoops::RemoveVPTBlocks(LowOverheadLoop &LoLoop) { LLVM_DEBUG(dbgs() << "ARM Loops: Removing VCTP: " << *LoLoop.VCTP); LoLoop.VCTP->eraseFromParent(); @@ -703,8 +772,10 @@ void ARMLowOverheadLoops::Expand(LowOverheadLoop &LoLoop) { RemoveDeadBranch(LoLoop.Start); LoLoop.End = ExpandLoopEnd(LoLoop); RemoveDeadBranch(LoLoop.End); - if (LoLoop.IsTailPredicationLegal()) + if (LoLoop.IsTailPredicationLegal()) { + RemoveLoopUpdate(LoLoop); RemoveVPTBlocks(LoLoop); + } } } diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-remove-loop-update.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-remove-loop-update.mir new file mode 100644 index 00000000000000..1087d7ca6e3406 --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-remove-loop-update.mir @@ -0,0 +1,171 @@ +# RUN: llc -mtriple=thumbv8.1m.main -mattr=+lob -run-pass=arm-low-overhead-loops --verify-machineinstrs %s -o - | FileCheck %s + +# There are 2 SUBS, and the 2nd one is identified as the def. +# Thus, the 1st is a use, and we shouldn't optimise away the SUBS. 
+ +# CHECK: bb.1.vector.body: +# CHECK: renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14, $noreg +# CHECK: renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14, $noreg +# CHECK: $lr = MVE_LETP renamable $lr, %bb.1 + +--- | + target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" + target triple = "thumbv8.1m.main-arm-unknown-eabi" + + define dso_local void @use_before_def(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 { + entry: + %cmp8 = icmp sgt i32 %N, 0 + %0 = add i32 %N, 3 + %1 = lshr i32 %0, 2 + %2 = shl nuw i32 %1, 2 + %3 = add i32 %2, -4 + %4 = lshr i32 %3, 2 + %5 = add nuw nsw i32 %4, 1 + br i1 %cmp8, label %vector.ph, label %for.cond.cleanup + + vector.ph: ; preds = %entry + call void @llvm.set.loop.iterations.i32(i32 %5) + br label %vector.body + + vector.body: ; preds = %vector.body, %vector.ph + %lsr.iv17 = phi i32* [ %scevgep18, %vector.body ], [ %A, %vector.ph ] + %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %C, %vector.ph ] + %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %vector.ph ] + %6 = phi i32 [ %5, %vector.ph ], [ %11, %vector.body ] + %7 = phi i32 [ %N, %vector.ph ], [ %9, %vector.body ] + %lsr.iv13 = bitcast i32* %lsr.iv to <4 x i32>* + %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>* + %lsr.iv1719 = bitcast i32* %lsr.iv17 to <4 x i32>* + %8 = call <4 x i1> @llvm.arm.vctp32(i32 %7) + %9 = sub i32 %7, 4 + %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv13, i32 4, <4 x i1> %8, <4 x i32> undef), !tbaa !3 + %wide.masked.load12 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %8, <4 x i32> undef), !tbaa !3 + %10 = add nsw <4 x i32> %wide.masked.load12, %wide.masked.load + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %10, <4 x i32>* %lsr.iv1719, i32 4, <4 x i1> %8), !tbaa !3 + %scevgep = getelementptr i32, i32* %lsr.iv, i32 
4 + %scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4 + %scevgep18 = getelementptr i32, i32* %lsr.iv17, i32 4 + %11 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %6, i32 1) + %12 = icmp ne i32 %11, 0 + br i1 %12, label %vector.body, label %for.cond.cleanup, !llvm.loop !7 + + for.cond.cleanup: ; preds = %vector.body, %entry + ret void + } + declare void @llvm.set.loop.iterations.i32(i32) #1 + declare <4 x i1> @llvm.arm.vctp32(i32) #2 + declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #1 + declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #3 + declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) #4 + declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) #3 + declare void @llvm.stackprotector(i8*, i8**) #5 + + attributes #0 = { nofree norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+armv8.1-m.main,+fp-armv8d16sp,+fp16,+fpregs,+fullfp16,+hwdiv,+lob,+mve.fp,+ras,+strict-align,+thumb-mode,+vfp2sp,+vfp3d16sp,+vfp4d16sp" "unsafe-fp-math"="true" "use-soft-float"="false" } + attributes #1 = { noduplicate nounwind } + attributes #2 = { nounwind readnone } + attributes #3 = { argmemonly nounwind willreturn } + attributes #4 = { argmemonly nounwind readonly willreturn } + attributes #5 = { nounwind } + + !llvm.module.flags = !{!0, !1} + !llvm.ident = !{!2} + + !0 = !{i32 1, !"wchar_size", i32 4} + !1 = !{i32 1, !"min_enum_size", i32 4} + !2 = !{!"clang version 10.0.0 (http://github.com/llvm/llvm-project 2589b6d9edda73280fe1dc1d944ee34e22fe9a6f)"} + !3 = !{!4, !4, i64 0} + !4 = !{!"int", !5, i64 0} + !5 = !{!"omnipotent char", !6, i64 
0} + !6 = !{!"Simple C++ TBAA"} + !7 = distinct !{!7, !8} + !8 = !{!"llvm.loop.isvectorized", i32 1} + +... +--- +name: use_before_def +alignment: 2 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +failedISel: false +tracksRegLiveness: true +hasWinCFI: false +registers: [] +liveins: + - { reg: '$r0', virtual-reg: '' } + - { reg: '$r1', virtual-reg: '' } + - { reg: '$r2', virtual-reg: '' } + - { reg: '$r3', virtual-reg: '' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 8 + offsetAdjustment: 0 + maxAlignment: 4 + adjustsStack: false + hasCalls: false + stackProtector: '' + maxCallFrameSize: 0 + cvBytesOfCalleeSavedRegisters: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + localFrameSize: 0 + savePoint: '' + restorePoint: '' +fixedStack: [] +stack: + - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +callSites: [] +constants: [] +machineFunctionInfo: {} +body: | + bb.0.entry: + successors: %bb.1(0x80000000) + liveins: $r0, $r1, $r2, $r3, $lr + + frame-setup tPUSH 14, $noreg, $r7, killed $lr, implicit-def $sp, implicit $sp + frame-setup CFI_INSTRUCTION def_cfa_offset 8 + frame-setup CFI_INSTRUCTION offset $lr, -4 + frame-setup CFI_INSTRUCTION offset $r7, -8 + $r7 = frame-setup tMOVr $sp, 14, $noreg + frame-setup CFI_INSTRUCTION def_cfa_register $r7 + tCMPi8 renamable $r3, 1, 14, $noreg, implicit-def $cpsr + t2IT 11, 8, implicit-def $itstate + tPOP_RET 11, killed $cpsr, def $r7, def $pc, implicit 
killed $itstate + renamable $r12 = t2ADDri renamable $r3, 3, 14, $noreg, $noreg + renamable $lr = t2MOVi 1, 14, $noreg, $noreg + renamable $r12 = t2BICri killed renamable $r12, 3, 14, $noreg, $noreg + renamable $r12 = t2SUBri killed renamable $r12, 4, 14, $noreg, $noreg + renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 19, 14, $noreg, $noreg + t2DoLoopStart renamable $lr + + bb.1.vector.body: + successors: %bb.1(0x7c000000), %bb.2(0x04000000) + liveins: $lr, $r0, $r1, $r2, $r3 + + renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg + MVE_VPST 4, implicit $vpr + renamable $r1, renamable $q0 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv13, align 4, !tbaa !3) + renamable $r2, renamable $q1 = MVE_VLDRWU32_post killed renamable $r2, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv1416, align 4, !tbaa !3) + renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14, $noreg + renamable $q0 = nsw MVE_VADDi32 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0 + MVE_VPST 8, implicit $vpr + renamable $r0 = MVE_VSTRWU32_post killed renamable $q0, killed renamable $r0, 16, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1719, align 4, !tbaa !3) + renamable $lr = t2LoopDec killed renamable $lr, 1 + renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14, $noreg + t2LoopEnd renamable $lr, %bb.1, implicit-def dead $cpsr + tB %bb.2, 14, $noreg + + bb.2.for.cond.cleanup: + tPOP_RET 14, $noreg, def $r7, def $pc + +... 
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-remove-loop-update2.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-remove-loop-update2.mir new file mode 100644 index 00000000000000..a43a228a2e4bf4 --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-remove-loop-update2.mir @@ -0,0 +1,172 @@ +# RUN: llc -mtriple=thumbv8.1m.main -mattr=+lob -run-pass=arm-low-overhead-loops --verify-machineinstrs %s -o - | FileCheck %s + +# The CPSR is not dead: +# +# renamable $r3, $cpsr = tSUBi8 killed renamable $r3, 4, 14, $noreg +# +# We shouldn't optimise away the SUB. + +# CHECK: bb.1.vector.body: +# CHECK: renamable $r3, $cpsr = tSUBi8 killed renamable $r3, 4, 14, $noreg +# CHECK: $lr = MVE_LETP renamable $lr, %bb.1 + +--- | + target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" + target triple = "thumbv8.1m.main-arm-unknown-eabi" + + define dso_local void @CPSR_not_dead(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 { + entry: + %cmp8 = icmp sgt i32 %N, 0 + %0 = add i32 %N, 3 + %1 = lshr i32 %0, 2 + %2 = shl nuw i32 %1, 2 + %3 = add i32 %2, -4 + %4 = lshr i32 %3, 2 + %5 = add nuw nsw i32 %4, 1 + br i1 %cmp8, label %vector.ph, label %for.cond.cleanup + + vector.ph: ; preds = %entry + call void @llvm.set.loop.iterations.i32(i32 %5) + br label %vector.body + + vector.body: ; preds = %vector.body, %vector.ph + %lsr.iv17 = phi i32* [ %scevgep18, %vector.body ], [ %A, %vector.ph ] + %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %C, %vector.ph ] + %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %vector.ph ] + %6 = phi i32 [ %5, %vector.ph ], [ %11, %vector.body ] + %7 = phi i32 [ %N, %vector.ph ], [ %9, %vector.body ] + %lsr.iv13 = bitcast i32* %lsr.iv to <4 x i32>* + %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>* + %lsr.iv1719 = bitcast i32* %lsr.iv17 to <4 x i32>* + %8 = call <4 x i1> @llvm.arm.vctp32(i32 %7) + %9 = sub i32 %7, 4 + 
%wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv13, i32 4, <4 x i1> %8, <4 x i32> undef), !tbaa !3 + %wide.masked.load12 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %8, <4 x i32> undef), !tbaa !3 + %10 = add nsw <4 x i32> %wide.masked.load12, %wide.masked.load + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %10, <4 x i32>* %lsr.iv1719, i32 4, <4 x i1> %8), !tbaa !3 + %scevgep = getelementptr i32, i32* %lsr.iv, i32 4 + %scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4 + %scevgep18 = getelementptr i32, i32* %lsr.iv17, i32 4 + %11 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %6, i32 1) + %12 = icmp ne i32 %11, 0 + br i1 %12, label %vector.body, label %for.cond.cleanup, !llvm.loop !7 + + for.cond.cleanup: ; preds = %vector.body, %entry + ret void + } + declare void @llvm.set.loop.iterations.i32(i32) #1 + declare <4 x i1> @llvm.arm.vctp32(i32) #2 + declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #1 + declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #3 + declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) #4 + declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) #3 + declare void @llvm.stackprotector(i8*, i8**) #5 + + attributes #0 = { nofree norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+armv8.1-m.main,+fp-armv8d16sp,+fp16,+fpregs,+fullfp16,+hwdiv,+lob,+mve.fp,+ras,+strict-align,+thumb-mode,+vfp2sp,+vfp3d16sp,+vfp4d16sp" "unsafe-fp-math"="true" "use-soft-float"="false" } + attributes #1 = { noduplicate nounwind } + attributes #2 = 
{ nounwind readnone } + attributes #3 = { argmemonly nounwind willreturn } + attributes #4 = { argmemonly nounwind readonly willreturn } + attributes #5 = { nounwind } + + !llvm.module.flags = !{!0, !1} + !llvm.ident = !{!2} + + !0 = !{i32 1, !"wchar_size", i32 4} + !1 = !{i32 1, !"min_enum_size", i32 4} + !2 = !{!"clang version 10.0.0 (http://github.com/llvm/llvm-project 2589b6d9edda73280fe1dc1d944ee34e22fe9a6f)"} + !3 = !{!4, !4, i64 0} + !4 = !{!"int", !5, i64 0} + !5 = !{!"omnipotent char", !6, i64 0} + !6 = !{!"Simple C++ TBAA"} + !7 = distinct !{!7, !8} + !8 = !{!"llvm.loop.isvectorized", i32 1} + +... +--- +name: CPSR_not_dead +alignment: 2 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +failedISel: false +tracksRegLiveness: true +hasWinCFI: false +registers: [] +liveins: + - { reg: '$r0', virtual-reg: '' } + - { reg: '$r1', virtual-reg: '' } + - { reg: '$r2', virtual-reg: '' } + - { reg: '$r3', virtual-reg: '' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 8 + offsetAdjustment: 0 + maxAlignment: 4 + adjustsStack: false + hasCalls: false + stackProtector: '' + maxCallFrameSize: 0 + cvBytesOfCalleeSavedRegisters: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + localFrameSize: 0 + savePoint: '' + restorePoint: '' +fixedStack: [] +stack: + - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +callSites: [] +constants: [] +machineFunctionInfo: {} +body: | + bb.0.entry: + successors: 
%bb.1(0x80000000) + liveins: $r0, $r1, $r2, $r3, $lr + + frame-setup tPUSH 14, $noreg, $r7, killed $lr, implicit-def $sp, implicit $sp + frame-setup CFI_INSTRUCTION def_cfa_offset 8 + frame-setup CFI_INSTRUCTION offset $lr, -4 + frame-setup CFI_INSTRUCTION offset $r7, -8 + $r7 = frame-setup tMOVr $sp, 14, $noreg + frame-setup CFI_INSTRUCTION def_cfa_register $r7 + tCMPi8 renamable $r3, 1, 14, $noreg, implicit-def $cpsr + t2IT 11, 8, implicit-def $itstate + tPOP_RET 11, killed $cpsr, def $r7, def $pc, implicit killed $itstate + renamable $r12 = t2ADDri renamable $r3, 3, 14, $noreg, $noreg + renamable $lr = t2MOVi 1, 14, $noreg, $noreg + renamable $r12 = t2BICri killed renamable $r12, 3, 14, $noreg, $noreg + renamable $r12 = t2SUBri killed renamable $r12, 4, 14, $noreg, $noreg + renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 19, 14, $noreg, $noreg + t2DoLoopStart renamable $lr + + bb.1.vector.body: + successors: %bb.1(0x7c000000), %bb.2(0x04000000) + liveins: $lr, $r0, $r1, $r2, $r3 + + renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg + MVE_VPST 4, implicit $vpr + renamable $r1, renamable $q0 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv13, align 4, !tbaa !3) + renamable $r2, renamable $q1 = MVE_VLDRWU32_post killed renamable $r2, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv1416, align 4, !tbaa !3) + renamable $r3, $cpsr = tSUBi8 killed renamable $r3, 4, 14, $noreg + renamable $q0 = nsw MVE_VADDi32 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0 + MVE_VPST 8, implicit $vpr + renamable $r0 = MVE_VSTRWU32_post killed renamable $q0, killed renamable $r0, 16, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1719, align 4, !tbaa !3) + renamable $lr = t2LoopDec killed renamable $lr, 1 + t2LoopEnd renamable $lr, %bb.1, implicit-def dead $cpsr + tB %bb.2, 14, $noreg + + bb.2.for.cond.cleanup: + tPOP_RET 14, $noreg, def $r7, def $pc + +... 
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-remove-loop-update3.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-remove-loop-update3.mir new file mode 100644 index 00000000000000..addeedb34ceab8 --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-remove-loop-update3.mir @@ -0,0 +1,173 @@ +# RUN: llc -mtriple=thumbv8.1m.main -mattr=+lob -run-pass=arm-low-overhead-loops --verify-machineinstrs %s -o - | FileCheck %s + +# Local use after def, this mov is using r3: +# +# $r2 = tMOVr killed $r3, 14, $noreg +# +# We shouldn't optimise away the SUB + +# CHECK: bb.1.vector.body: +# CHECK: renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14, $noreg +# CHECK: $lr = MVE_LETP renamable $lr, %bb.1 + +--- | + target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" + target triple = "thumbv8.1m.main-arm-unknown-eabi" + + define dso_local void @local_use_after_def(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 { + entry: + %cmp8 = icmp sgt i32 %N, 0 + %0 = add i32 %N, 3 + %1 = lshr i32 %0, 2 + %2 = shl nuw i32 %1, 2 + %3 = add i32 %2, -4 + %4 = lshr i32 %3, 2 + %5 = add nuw nsw i32 %4, 1 + br i1 %cmp8, label %vector.ph, label %for.cond.cleanup + + vector.ph: ; preds = %entry + call void @llvm.set.loop.iterations.i32(i32 %5) + br label %vector.body + + vector.body: ; preds = %vector.body, %vector.ph + %lsr.iv17 = phi i32* [ %scevgep18, %vector.body ], [ %A, %vector.ph ] + %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %C, %vector.ph ] + %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %vector.ph ] + %6 = phi i32 [ %5, %vector.ph ], [ %11, %vector.body ] + %7 = phi i32 [ %N, %vector.ph ], [ %9, %vector.body ] + %lsr.iv13 = bitcast i32* %lsr.iv to <4 x i32>* + %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>* + %lsr.iv1719 = bitcast i32* %lsr.iv17 to <4 x i32>* + %8 = call <4 x i1> @llvm.arm.vctp32(i32 %7) + %9 = sub i32 %7, 4 + 
%wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv13, i32 4, <4 x i1> %8, <4 x i32> undef), !tbaa !3 + %wide.masked.load12 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %8, <4 x i32> undef), !tbaa !3 + %10 = add nsw <4 x i32> %wide.masked.load12, %wide.masked.load + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %10, <4 x i32>* %lsr.iv1719, i32 4, <4 x i1> %8), !tbaa !3 + %scevgep = getelementptr i32, i32* %lsr.iv, i32 4 + %scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4 + %scevgep18 = getelementptr i32, i32* %lsr.iv17, i32 4 + %11 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %6, i32 1) + %12 = icmp ne i32 %11, 0 + br i1 %12, label %vector.body, label %for.cond.cleanup, !llvm.loop !7 + + for.cond.cleanup: ; preds = %vector.body, %entry + ret void + } + declare void @llvm.set.loop.iterations.i32(i32) #1 + declare <4 x i1> @llvm.arm.vctp32(i32) #2 + declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #1 + declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #3 + declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) #4 + declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) #3 + declare void @llvm.stackprotector(i8*, i8**) #5 + + attributes #0 = { nofree norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+armv8.1-m.main,+fp-armv8d16sp,+fp16,+fpregs,+fullfp16,+hwdiv,+lob,+mve.fp,+ras,+strict-align,+thumb-mode,+vfp2sp,+vfp3d16sp,+vfp4d16sp" "unsafe-fp-math"="true" "use-soft-float"="false" } + attributes #1 = { noduplicate nounwind } + attributes #2 = 
{ nounwind readnone } + attributes #3 = { argmemonly nounwind willreturn } + attributes #4 = { argmemonly nounwind readonly willreturn } + attributes #5 = { nounwind } + + !llvm.module.flags = !{!0, !1} + !llvm.ident = !{!2} + + !0 = !{i32 1, !"wchar_size", i32 4} + !1 = !{i32 1, !"min_enum_size", i32 4} + !2 = !{!"clang version 10.0.0 (http://github.com/llvm/llvm-project 2589b6d9edda73280fe1dc1d944ee34e22fe9a6f)"} + !3 = !{!4, !4, i64 0} + !4 = !{!"int", !5, i64 0} + !5 = !{!"omnipotent char", !6, i64 0} + !6 = !{!"Simple C++ TBAA"} + !7 = distinct !{!7, !8} + !8 = !{!"llvm.loop.isvectorized", i32 1} + +... +--- +name: local_use_after_def +alignment: 2 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +failedISel: false +tracksRegLiveness: true +hasWinCFI: false +registers: [] +liveins: + - { reg: '$r0', virtual-reg: '' } + - { reg: '$r1', virtual-reg: '' } + - { reg: '$r2', virtual-reg: '' } + - { reg: '$r3', virtual-reg: '' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 8 + offsetAdjustment: 0 + maxAlignment: 4 + adjustsStack: false + hasCalls: false + stackProtector: '' + maxCallFrameSize: 0 + cvBytesOfCalleeSavedRegisters: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + localFrameSize: 0 + savePoint: '' + restorePoint: '' +fixedStack: [] +stack: + - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +callSites: [] +constants: [] +machineFunctionInfo: {} +body: | + bb.0.entry: + successors: 
%bb.1(0x80000000) + liveins: $r0, $r1, $r2, $r3, $lr + + frame-setup tPUSH 14, $noreg, $r7, killed $lr, implicit-def $sp, implicit $sp + frame-setup CFI_INSTRUCTION def_cfa_offset 8 + frame-setup CFI_INSTRUCTION offset $lr, -4 + frame-setup CFI_INSTRUCTION offset $r7, -8 + $r7 = frame-setup tMOVr $sp, 14, $noreg + frame-setup CFI_INSTRUCTION def_cfa_register $r7 + tCMPi8 renamable $r3, 1, 14, $noreg, implicit-def $cpsr + t2IT 11, 8, implicit-def $itstate + tPOP_RET 11, killed $cpsr, def $r7, def $pc, implicit killed $itstate + renamable $r12 = t2ADDri renamable $r3, 3, 14, $noreg, $noreg + renamable $lr = t2MOVi 1, 14, $noreg, $noreg + renamable $r12 = t2BICri killed renamable $r12, 3, 14, $noreg, $noreg + renamable $r12 = t2SUBri killed renamable $r12, 4, 14, $noreg, $noreg + renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 19, 14, $noreg, $noreg + t2DoLoopStart renamable $lr + + bb.1.vector.body: + successors: %bb.1(0x7c000000), %bb.2(0x04000000) + liveins: $lr, $r0, $r1, $r2, $r3 + + renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg + MVE_VPST 4, implicit $vpr + renamable $r1, renamable $q0 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv13, align 4, !tbaa !3) + renamable $r2, renamable $q1 = MVE_VLDRWU32_post killed renamable $r2, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv1416, align 4, !tbaa !3) + renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14, $noreg + renamable $q0 = nsw MVE_VADDi32 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0 + MVE_VPST 8, implicit $vpr + renamable $r0 = MVE_VSTRWU32_post killed renamable $q0, killed renamable $r0, 16, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1719, align 4, !tbaa !3) + renamable $lr = t2LoopDec killed renamable $lr, 1 + $r2 = tMOVr killed $r3, 14, $noreg + t2LoopEnd renamable $lr, %bb.1, implicit-def dead $cpsr + tB %bb.2, 14, $noreg + + bb.2.for.cond.cleanup: + tPOP_RET 14, $noreg, def 
$r7, def $pc + +... diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll index 6b226c4c8605d9..a2a20f626e6fd9 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll @@ -41,7 +41,6 @@ define arm_aapcs_vfpcc void @fast_float_mul(float* nocapture %a, float* nocaptur ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r1], #16 ; CHECK-NEXT: vldrw.u32 q1, [r2], #16 -; CHECK-NEXT: subs r3, #4 ; CHECK-NEXT: vmul.f32 q0, q1, q0 ; CHECK-NEXT: vstrw.32 q0, [r0], #16 ; CHECK-NEXT: letp lr, .LBB0_5 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll index b41068eba74eb2..46d6fb2635ae12 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll @@ -1176,7 +1176,6 @@ define dso_local arm_aapcs_vfpcc void @test_v8i8_to_v8i16(i16* noalias nocapture ; CHECK-NEXT: vldrb.u16 q0, [r4] ; CHECK-NEXT: add.w r4, r2, r12 ; CHECK-NEXT: add.w r12, r12, #8 -; CHECK-NEXT: subs r3, #8 ; CHECK-NEXT: vldrb.u16 q1, [r4] ; CHECK-NEXT: vmul.i16 q0, q1, q0 ; CHECK-NEXT: vstrh.16 q0, [r0], #16 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll index 0b9ab240487371..bb297393568b0b 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll @@ -198,7 +198,6 @@ define dso_local void @vector_mul_const(i32* noalias nocapture %a, i32* noalias ; CHECK-NEXT: .LBB3_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r1], #16 -; CHECK-NEXT: subs r3, #4 ; CHECK-NEXT: vmul.i32 q0, q0, r2 ; CHECK-NEXT: vstrw.32 q0, [r0], 
#16 ; CHECK-NEXT: letp lr, .LBB3_1 @@ -250,7 +249,6 @@ define dso_local void @vector_add_const(i32* noalias nocapture %a, i32* noalias ; CHECK-NEXT: .LBB4_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r1], #16 -; CHECK-NEXT: subs r3, #4 ; CHECK-NEXT: vadd.i32 q0, q0, r2 ; CHECK-NEXT: vstrw.32 q0, [r0], #16 ; CHECK-NEXT: letp lr, .LBB4_1 @@ -308,7 +306,6 @@ define dso_local arm_aapcs_vfpcc void @vector_mul_vector_i8(i8* noalias nocaptur ; CHECK-NEXT: vldrb.u8 q1, [r4] ; CHECK-NEXT: add.w r4, r0, r12 ; CHECK-NEXT: add.w r12, r12, #16 -; CHECK-NEXT: subs r3, #16 ; CHECK-NEXT: vmul.i8 q0, q1, q0 ; CHECK-NEXT: vstrb.8 q0, [r4] ; CHECK-NEXT: letp lr, .LBB5_1 @@ -363,7 +360,6 @@ define dso_local arm_aapcs_vfpcc void @vector_mul_vector_i16(i16* noalias nocapt ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrh.u16 q0, [r1], #16 ; CHECK-NEXT: vldrh.u16 q1, [r2], #16 -; CHECK-NEXT: subs r3, #8 ; CHECK-NEXT: vmul.i16 q0, q1, q0 ; CHECK-NEXT: vstrh.16 q0, [r0], #16 ; CHECK-NEXT: letp lr, .LBB6_1