From 8a4c601abc2854f98b2586d7d24f28f522bc3f38 Mon Sep 17 00:00:00 2001 From: Krzysztof Parzyszek Date: Mon, 6 Mar 2017 17:24:04 +0000 Subject: [PATCH] [Hexagon] Early-if-convert branches that may exit the loop Merge the tail block into the loop in cases where the main loop body exits early, subject to profitability constraints. This will coalesce the loop body into fewer blocks. For example: loop: loop: // loop body // loop body if (...) jump exit --> // more body more: if (...) jump exit // more body jump loop jump loop llvm-svn: 297033 --- .../lib/Target/Hexagon/HexagonEarlyIfConv.cpp | 163 +++++++++++------- llvm/test/CodeGen/Hexagon/block-addr.ll | 2 +- .../CodeGen/Hexagon/early-if-merge-loop.ll | 91 ++++++++++ 3 files changed, 195 insertions(+), 61 deletions(-) create mode 100644 llvm/test/CodeGen/Hexagon/early-if-merge-loop.ll diff --git a/llvm/lib/Target/Hexagon/HexagonEarlyIfConv.cpp b/llvm/lib/Target/Hexagon/HexagonEarlyIfConv.cpp index c18916747212..4418b57f6a06 100644 --- a/llvm/lib/Target/Hexagon/HexagonEarlyIfConv.cpp +++ b/llvm/lib/Target/Hexagon/HexagonEarlyIfConv.cpp @@ -105,6 +105,8 @@ namespace { cl::init(false), cl::desc("Enable branch probability info")); cl::opt SizeLimit("eif-limit", cl::init(6), cl::Hidden, cl::desc("Size limit in Hexagon early if-conversion")); + cl::opt SkipExitBranches("eif-no-loop-exit", cl::init(false), + cl::Hidden, cl::desc("Do not convert branches that may exit the loop")); struct PrintMB { PrintMB(const MachineBasicBlock *B) : MB(B) {} @@ -142,8 +144,8 @@ namespace { raw_ostream &operator<<(raw_ostream &OS, const PrintFP &P) { OS << "{ SplitB:" << PrintMB(P.FP.SplitB) << ", PredR:" << PrintReg(P.FP.PredR, &P.TRI) - << ", TrueB:" << PrintMB(P.FP.TrueB) << ", FalseB:" - << PrintMB(P.FP.FalseB) + << ", TrueB:" << PrintMB(P.FP.TrueB) + << ", FalseB:" << PrintMB(P.FP.FalseB) << ", JoinB:" << PrintMB(P.FP.JoinB) << " }"; return OS; } @@ -187,7 +189,8 @@ namespace { bool usesUndefVReg(const MachineInstr *MI) const; bool 
isValid(const FlowPattern &FP) const; unsigned countPredicateDefs(const MachineBasicBlock *B) const; - unsigned computePhiCost(MachineBasicBlock *B) const; + unsigned computePhiCost(const MachineBasicBlock *B, + const FlowPattern &FP) const; bool isProfitable(const FlowPattern &FP) const; bool isPredicableStore(const MachineInstr *MI) const; bool isSafeToSpeculate(const MachineInstr *MI) const; @@ -199,6 +202,9 @@ namespace { MachineBasicBlock::iterator At, MachineBasicBlock *FromB, unsigned PredR, bool IfTrue); + unsigned buildMux(MachineBasicBlock *B, MachineBasicBlock::iterator At, + const TargetRegisterClass *DRC, unsigned PredR, unsigned TR, + unsigned TSR, unsigned FR, unsigned FSR); void updatePhiNodes(MachineBasicBlock *WhereB, const FlowPattern &FP); void convert(const FlowPattern &FP); @@ -230,7 +236,7 @@ bool HexagonEarlyIfConversion::isPreheader(const MachineBasicBlock *B) const { return false; MachineBasicBlock *SB = *B->succ_begin(); MachineLoop *L = MLI->getLoopFor(SB); - return L && SB == L->getHeader(); + return L && SB == L->getHeader() && MDT->dominates(B, SB); } bool HexagonEarlyIfConversion::matchFlowPattern(MachineBasicBlock *B, @@ -264,9 +270,6 @@ bool HexagonEarlyIfConversion::matchFlowPattern(MachineBasicBlock *B, // mark as diamond with both sides equal? return false; } - // Loop could be null for both. - if (MLI->getLoopFor(T1B) != L || MLI->getLoopFor(T2B) != L) - return false; // Record the true/false blocks in such a way that "true" means "if (PredR)", // and "false" means "if (!PredR)". @@ -289,8 +292,14 @@ bool HexagonEarlyIfConversion::matchFlowPattern(MachineBasicBlock *B, // it has a single successor. In fact, the block has to end either with // an unconditional branch (which can be predicated), or with a fall- // through. - bool TOk = (TNP == 1) && (TNS == 1); - bool FOk = (FNP == 1) && (FNS == 1); + // Also, skip blocks that do not belong to the same loop. 
+ bool TOk = (TNP == 1 && TNS == 1 && MLI->getLoopFor(TB) == L); + bool FOk = (FNP == 1 && FNS == 1 && MLI->getLoopFor(FB) == L); + + // If requested (via an option), do not consider branches where the + // true and false targets do not belong to the same loop. + if (SkipExitBranches && MLI->getLoopFor(TB) != MLI->getLoopFor(FB)) + return false; // If neither is predicable, there is nothing interesting. if (!TOk && !FOk) @@ -307,17 +316,15 @@ bool HexagonEarlyIfConversion::matchFlowPattern(MachineBasicBlock *B, // Diamond: "if (P) then TB; else FB;". } else { // TOk && !FOk - if (TSB == FB) { + if (TSB == FB) JB = FB; - FB = nullptr; - } + FB = nullptr; } } else { // !TOk && FOk (at least one must be true by now). - if (FSB == TB) { + if (FSB == TB) JB = TB; - TB = nullptr; - } + TB = nullptr; } // Don't try to predicate loop preheaders. if ((TB && isPreheader(TB)) || (FB && isPreheader(FB))) { @@ -448,24 +455,39 @@ bool HexagonEarlyIfConversion::isValid(const FlowPattern &FP) const { return true; } -unsigned HexagonEarlyIfConversion::computePhiCost(MachineBasicBlock *B) const { - assert(B->pred_size() <= 2); +unsigned HexagonEarlyIfConversion::computePhiCost(const MachineBasicBlock *B, + const FlowPattern &FP) const { if (B->pred_size() < 2) return 0; unsigned Cost = 0; - MachineBasicBlock::const_iterator I, E = B->getFirstNonPHI(); - for (I = B->begin(); I != E; ++I) { - const MachineOperand &RO1 = I->getOperand(1); - const MachineOperand &RO3 = I->getOperand(3); - assert(RO1.isReg() && RO3.isReg()); + for (const MachineInstr &MI : *B) { + if (!MI.isPHI()) + break; + // If both incoming blocks are one of the TrueB/FalseB/SplitB, then + // a MUX may be needed. Otherwise the PHI will need to be updated at + // no extra cost. + // Find the interesting PHI operands for further checks. 
+ SmallVector<unsigned,2> Inc; + for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) { + const MachineBasicBlock *BB = MI.getOperand(i+1).getMBB(); + if (BB == FP.SplitB || BB == FP.TrueB || BB == FP.FalseB) + Inc.push_back(i); + } + assert(Inc.size() <= 2); + if (Inc.size() < 2) + continue; + + const MachineOperand &RA = MI.getOperand(Inc[0]); + const MachineOperand &RB = MI.getOperand(Inc[1]); + assert(RA.isReg() && RB.isReg()); // Must have a MUX if the phi uses a subregister. - if (RO1.getSubReg() != 0 || RO3.getSubReg() != 0) { + if (RA.getSubReg() != 0 || RB.getSubReg() != 0) { Cost++; continue; } - MachineInstr *Def1 = MRI->getVRegDef(RO1.getReg()); - MachineInstr *Def3 = MRI->getVRegDef(RO3.getReg()); + const MachineInstr *Def1 = MRI->getVRegDef(RA.getReg()); + const MachineInstr *Def3 = MRI->getVRegDef(RB.getReg()); if (!HII->isPredicable(*Def1) || !HII->isPredicable(*Def3)) Cost++; } @@ -491,7 +513,6 @@ unsigned HexagonEarlyIfConversion::countPredicateDefs( bool HexagonEarlyIfConversion::isProfitable(const FlowPattern &FP) const { if (FP.TrueB && FP.FalseB) { - // Do not IfCovert if the branch is one sided. 
if (MBPI) { BranchProbability Prob(9, 10); @@ -542,17 +563,17 @@ bool HexagonEarlyIfConversion::isProfitable(const FlowPattern &FP) const { unsigned TotalPh = 0; unsigned PredDefs = countPredicateDefs(FP.SplitB); if (FP.JoinB) { - TotalPh = computePhiCost(FP.JoinB); + TotalPh = computePhiCost(FP.JoinB, FP); PredDefs += countPredicateDefs(FP.JoinB); } else { if (FP.TrueB && FP.TrueB->succ_size() > 0) { MachineBasicBlock *SB = *FP.TrueB->succ_begin(); - TotalPh += computePhiCost(SB); + TotalPh += computePhiCost(SB, FP); PredDefs += countPredicateDefs(SB); } if (FP.FalseB && FP.FalseB->succ_size() > 0) { MachineBasicBlock *SB = *FP.FalseB->succ_begin(); - TotalPh += computePhiCost(SB); + TotalPh += computePhiCost(SB, FP); PredDefs += countPredicateDefs(SB); } } @@ -739,6 +760,43 @@ void HexagonEarlyIfConversion::predicateBlockNB(MachineBasicBlock *ToB, } } +unsigned HexagonEarlyIfConversion::buildMux(MachineBasicBlock *B, + MachineBasicBlock::iterator At, const TargetRegisterClass *DRC, + unsigned PredR, unsigned TR, unsigned TSR, unsigned FR, unsigned FSR) { + unsigned Opc = 0; + switch (DRC->getID()) { + case Hexagon::IntRegsRegClassID: + Opc = Hexagon::C2_mux; + break; + case Hexagon::DoubleRegsRegClassID: + Opc = Hexagon::PS_pselect; + break; + case Hexagon::VectorRegsRegClassID: + Opc = Hexagon::PS_vselect; + break; + case Hexagon::VecDblRegsRegClassID: + Opc = Hexagon::PS_wselect; + break; + case Hexagon::VectorRegs128BRegClassID: + Opc = Hexagon::PS_vselect_128B; + break; + case Hexagon::VecDblRegs128BRegClassID: + Opc = Hexagon::PS_wselect_128B; + break; + default: + llvm_unreachable("unexpected register type"); + } + const MCInstrDesc &D = HII->get(Opc); + + DebugLoc DL = B->findBranchDebugLoc(); + unsigned MuxR = MRI->createVirtualRegister(DRC); + BuildMI(*B, At, DL, D, MuxR) + .addReg(PredR) + .addReg(TR, 0, TSR) + .addReg(FR, 0, FSR); + return MuxR; +} + void HexagonEarlyIfConversion::updatePhiNodes(MachineBasicBlock *WhereB, const FlowPattern &FP) { // 
Visit all PHI nodes in the WhereB block and generate MUX instructions @@ -765,40 +823,25 @@ void HexagonEarlyIfConversion::updatePhiNodes(MachineBasicBlock *WhereB, TR = SR, TSR = SSR; else if (FR == 0) FR = SR, FSR = SSR; - assert(TR && FR); - using namespace Hexagon; + assert(TR || FR); + unsigned MuxR = 0, MuxSR = 0; - unsigned DR = PN->getOperand(0).getReg(); - const TargetRegisterClass *RC = MRI->getRegClass(DR); - unsigned Opc = 0; - if (RC == &IntRegsRegClass) - Opc = C2_mux; - else if (RC == &DoubleRegsRegClass) - Opc = PS_pselect; - else if (RC == &VectorRegsRegClass) - Opc = PS_vselect; - else if (RC == &VecDblRegsRegClass) - Opc = PS_wselect; - else if (RC == &VectorRegs128BRegClass) - Opc = PS_vselect_128B; - else if (RC == &VecDblRegs128BRegClass) - Opc = PS_wselect_128B; - else - llvm_unreachable("unexpected register type"); - const MCInstrDesc &D = HII->get(Opc); + if (TR && FR) { + unsigned DR = PN->getOperand(0).getReg(); + const TargetRegisterClass *RC = MRI->getRegClass(DR); + MuxR = buildMux(FP.SplitB, FP.SplitB->getFirstTerminator(), RC, + FP.PredR, TR, TSR, FR, FSR); + } else if (TR) { + MuxR = TR; + MuxSR = TSR; + } else { + MuxR = FR; + MuxSR = FSR; + } - MachineBasicBlock::iterator MuxAt = FP.SplitB->getFirstTerminator(); - DebugLoc DL; - if (MuxAt != FP.SplitB->end()) - DL = MuxAt->getDebugLoc(); - unsigned MuxR = MRI->createVirtualRegister(RC); - BuildMI(*FP.SplitB, MuxAt, DL, D, MuxR) - .addReg(FP.PredR) - .addReg(TR, 0, TSR) - .addReg(FR, 0, FSR); - - PN->addOperand(MachineOperand::CreateReg(MuxR, false)); + PN->addOperand(MachineOperand::CreateReg(MuxR, false, false, false, false, + false, false, MuxSR)); PN->addOperand(MachineOperand::CreateMBB(FP.SplitB)); } } diff --git a/llvm/test/CodeGen/Hexagon/block-addr.ll b/llvm/test/CodeGen/Hexagon/block-addr.ll index c0db2cef545e..5af3a69f8aab 100644 --- a/llvm/test/CodeGen/Hexagon/block-addr.ll +++ b/llvm/test/CodeGen/Hexagon/block-addr.ll @@ -2,7 +2,7 @@ ; CHECK: .LJTI ; CHECK-DAG: 
r[[REG:[0-9]+]] = memw(r{{[0-9]+}}{{ *}}+{{ *}}r{{[0-9]+<<#[0-9]+}}) -; CHECK-DAG: jumpr:nt r[[REG]] +; CHECK-DAG: jumpr r[[REG]] define void @main() #0 { entry: diff --git a/llvm/test/CodeGen/Hexagon/early-if-merge-loop.ll b/llvm/test/CodeGen/Hexagon/early-if-merge-loop.ll new file mode 100644 index 000000000000..f45058f029dd --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/early-if-merge-loop.ll @@ -0,0 +1,91 @@ +; RUN: llc -march=hexagon < %s | FileCheck %s +; Make sure that the loop in the end has only one basic block. + +; CHECK-LABEL: fred +; Rely on the comments, make sure the one for the loop header is present. +; CHECK: %loop +; CHECK-NOT: %should_merge + +target triple = "hexagon" + +define i32 @fred(i32 %a0, i64* nocapture readonly %a1) #0 { +b2: + %v3 = bitcast i64* %a1 to i32* + %v4 = getelementptr inbounds i32, i32* %v3, i32 1 + %v5 = zext i32 %a0 to i64 + br label %loop + +loop: ; preds = %should_merge, %b2 + %v7 = phi i32 [ 0, %b2 ], [ %v49, %should_merge ] + %v8 = phi i32 [ 0, %b2 ], [ %v42, %should_merge ] + %v9 = phi i32* [ %v4, %b2 ], [ %v53, %should_merge ] + %v10 = phi i32 [ 0, %b2 ], [ %v30, %should_merge ] + %v11 = phi i32* [ %v3, %b2 ], [ %v51, %should_merge ] + %v12 = phi i32 [ 0, %b2 ], [ %v23, %should_merge ] + %v13 = phi i32 [ 2, %b2 ], [ %v54, %should_merge ] + %v14 = load i32, i32* %v11, align 4, !tbaa !0 + %v15 = load i32, i32* %v9, align 4, !tbaa !0 + %v16 = icmp ult i32 %v13, 30 + %v17 = zext i32 %v12 to i64 + %v18 = shl nuw i64 %v17, 32 + %v19 = zext i32 %v14 to i64 + %v20 = or i64 %v18, %v19 + %v21 = tail call i64 @llvm.hexagon.A2.addp(i64 %v20, i64 %v5) + %v22 = lshr i64 %v21, 32 + %v23 = trunc i64 %v22 to i32 + %v24 = zext i32 %v10 to i64 + %v25 = shl nuw i64 %v24, 32 + %v26 = zext i32 %v15 to i64 + %v27 = or i64 %v25, %v26 + %v28 = tail call i64 @llvm.hexagon.A2.addp(i64 %v27, i64 %v5) + %v29 = lshr i64 %v28, 32 + %v30 = trunc i64 %v29 to i32 + %v31 = getelementptr inbounds i32, i32* %v3, i32 %v13 + %v32 = load i32, i32* %v31, 
align 4, !tbaa !0 + %v33 = or i32 %v13, 1 + %v34 = getelementptr inbounds i32, i32* %v3, i32 %v33 + %v35 = load i32, i32* %v34, align 4, !tbaa !0 + %v36 = zext i32 %v8 to i64 + %v37 = shl nuw i64 %v36, 32 + %v38 = zext i32 %v32 to i64 + %v39 = or i64 %v37, %v38 + %v40 = tail call i64 @llvm.hexagon.A2.subp(i64 %v39, i64 %v5) + %v41 = lshr i64 %v40, 32 + %v42 = trunc i64 %v41 to i32 + %v43 = zext i32 %v7 to i64 + %v44 = shl nuw i64 %v43, 32 + %v45 = zext i32 %v35 to i64 + %v46 = or i64 %v44, %v45 + %v47 = tail call i64 @llvm.hexagon.A2.subp(i64 %v46, i64 %v5) + %v48 = lshr i64 %v47, 32 + %v49 = trunc i64 %v48 to i32 + br i1 %v16, label %should_merge, label %exit + +should_merge: ; preds = %loop + %v50 = add nuw nsw i32 %v13, 2 + %v51 = getelementptr inbounds i32, i32* %v3, i32 %v50 + %v52 = add nuw nsw i32 %v13, 3 + %v53 = getelementptr inbounds i32, i32* %v3, i32 %v52 + %v54 = add nuw nsw i32 %v13, 4 + br label %loop + +exit: ; preds = %loop + %v57 = tail call i64 @llvm.hexagon.A2.combinew(i32 %v42, i32 %v23) + %v58 = tail call i64 @llvm.hexagon.A2.combinew(i32 %v49, i32 %v30) + %v59 = tail call i64 @llvm.hexagon.A2.addp(i64 %v57, i64 %v58) + %v60 = lshr i64 %v59, 32 + %v61 = trunc i64 %v60 to i32 + ret i32 %v61 +} + +declare i64 @llvm.hexagon.A2.addp(i64, i64) #1 +declare i64 @llvm.hexagon.A2.subp(i64, i64) #1 +declare i64 @llvm.hexagon.A2.combinew(i32, i32) #1 + +attributes #0 = { nounwind readonly "target-cpu"="hexagonv60" "target-features"="-hvx,-hvx-double,-long-calls" } +attributes #1 = { nounwind readnone } + +!0 = !{!1, !1, i64 0} +!1 = !{!"long", !2, i64 0} +!2 = !{!"omnipotent char", !3, i64 0} +!3 = !{!"Simple C/C++ TBAA"}