Patch summary. Free-form text placed before the first "diff --git" header is
skipped by patch tools, so this note does not affect what is applied.

MachinePipeliner.cpp: applyInstrChange() now returns void, drops its UpdateDAG
parameter and always records the cloned instruction in the SUnit and in
MISUnitMap. A new SwingSchedulerDAG::fixupRegisterOverlaps() is called on each
cycle's instruction list after finalizeSchedule() has reordered it; when an
instruction with a tied def/use pair is followed by an InstrChanges entry that
uses the tied use register, that later instruction is switched to the tied def
register and its immediate offset is reduced by the recorded amount.

HexagonInstrInfo.cpp: for a post-increment instruction, getIncrementValue()
now reports the access size as the increment instead of the offset.

HexagonVLIWPacketizer.{h,cpp}: updateOffset() adds the increment value of MJ to
the immediate offset of MI when both use the same base register, remembering
the old offset in ChangedOffset; undoChangedOffset() restores it. INT64_MAX in
ChangedOffset means "no offset has been changed".

swp-order-copies.ll: new test that expects the loop body to be one packet.

diff --git a/llvm/lib/CodeGen/MachinePipeliner.cpp b/llvm/lib/CodeGen/MachinePipeliner.cpp
index d0616fd678bd..20141f7f8d60 100644
--- a/llvm/lib/CodeGen/MachinePipeliner.cpp
+++ b/llvm/lib/CodeGen/MachinePipeliner.cpp
@@ -369,8 +369,9 @@ public:
   /// Set the Minimum Initiation Interval for this schedule attempt.
   void setMII(unsigned mii) { MII = mii; }
 
-  MachineInstr *applyInstrChange(MachineInstr *MI, SMSchedule &Schedule,
-                                 bool UpdateDAG = false);
+  void applyInstrChange(MachineInstr *MI, SMSchedule &Schedule);
+
+  void fixupRegisterOverlaps(std::deque<SUnit *> &Instrs);
 
   /// Return the new base register that was stored away for the changed
   /// instruction.
@@ -3390,9 +3391,8 @@ bool SwingSchedulerDAG::canUseLastOffsetValue(MachineInstr *MI,
 
 /// Apply changes to the instruction if needed. The changes are need
 /// to improve the scheduling and depend up on the final schedule.
-MachineInstr *SwingSchedulerDAG::applyInstrChange(MachineInstr *MI,
-                                                  SMSchedule &Schedule,
-                                                  bool UpdateDAG) {
+void SwingSchedulerDAG::applyInstrChange(MachineInstr *MI,
+                                         SMSchedule &Schedule) {
   SUnit *SU = getSUnit(MI);
   DenseMap<SUnit *, std::pair<unsigned, int64_t>>::iterator It =
       InstrChanges.find(SU);
@@ -3400,7 +3400,7 @@ MachineInstr *SwingSchedulerDAG::applyInstrChange(MachineInstr *MI,
     std::pair<unsigned, int64_t> RegAndOffset = It->second;
     unsigned BasePos, OffsetPos;
     if (!TII->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos))
-      return nullptr;
+      return;
     unsigned BaseReg = MI->getOperand(BasePos).getReg();
     MachineInstr *LoopDef = findDefInLoop(BaseReg);
     int DefStageNum = Schedule.stageScheduled(getSUnit(LoopDef));
@@ -3418,15 +3418,11 @@ MachineInstr *SwingSchedulerDAG::applyInstrChange(MachineInstr *MI,
       int64_t NewOffset =
           MI->getOperand(OffsetPos).getImm() + RegAndOffset.second * OffsetDiff;
       NewMI->getOperand(OffsetPos).setImm(NewOffset);
-      if (UpdateDAG) {
-        SU->setInstr(NewMI);
-        MISUnitMap[NewMI] = SU;
-      }
+      SU->setInstr(NewMI);
+      MISUnitMap[NewMI] = SU;
       NewMIs.insert(NewMI);
-      return NewMI;
     }
   }
-  return nullptr;
 }
 
 /// Return true for an order dependence that is loop carried potentially.
@@ -3872,6 +3868,53 @@ bool SMSchedule::isValidSchedule(SwingSchedulerDAG *SSD) {
   return true;
 }
 
+/// Attempt to fix the degenerate cases when the instruction serialization
+/// causes the register lifetimes to overlap. For example,
+///   p' = store_pi(p, b)
+///      = load p, offset
+/// In this case p and p' overlap, which means that two registers are needed.
+/// Instead, this function changes the load to use p' and updates the offset.
+void SwingSchedulerDAG::fixupRegisterOverlaps(std::deque<SUnit *> &Instrs) {
+  unsigned OverlapReg = 0;
+  unsigned NewBaseReg = 0;
+  for (SUnit *SU : Instrs) {
+    MachineInstr *MI = SU->getInstr();
+    for (unsigned i = 0, e = MI->getNumOperands(); i < e; ++i) {
+      const MachineOperand &MO = MI->getOperand(i);
+      // Look for an instruction that uses p. The instruction occurs in the
+      // same cycle but occurs later in the serialized order.
+      if (MO.isReg() && MO.isUse() && MO.getReg() == OverlapReg) {
+        // Check that the instruction appears in the InstrChanges structure,
+        // which contains instructions that can have the offset updated.
+        DenseMap<SUnit *, std::pair<unsigned, int64_t>>::iterator It =
+            InstrChanges.find(SU);
+        if (It != InstrChanges.end()) {
+          unsigned BasePos, OffsetPos;
+          // Update the base register and adjust the offset.
+          if (TII->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos)) {
+            MI->getOperand(BasePos).setReg(NewBaseReg);
+            int64_t Offset = MI->getOperand(OffsetPos).getImm();
+            MI->getOperand(OffsetPos).setImm(Offset - It->second.second);
+          }
+        }
+        OverlapReg = 0;
+        NewBaseReg = 0;
+        break;
+      }
+      // Look for an instruction of the form p' = op(p), which uses and defines
+      // two virtual registers that get allocated to the same physical register.
+      unsigned TiedUseIdx = 0;
+      if (MI->isRegTiedToUseOperand(i, &TiedUseIdx)) {
+        // OverlapReg is p in the example above.
+        OverlapReg = MI->getOperand(TiedUseIdx).getReg();
+        // NewBaseReg is p' in the example above.
+        NewBaseReg = MI->getOperand(i).getReg();
+        break;
+      }
+    }
+  }
+}
+
 /// After the schedule has been formed, call this function to combine
 /// the instructions from the different stages/cycles. That is, this
 /// function creates a schedule that represents a single iteration.
@@ -3932,7 +3975,7 @@ void SMSchedule::finalizeSchedule(SwingSchedulerDAG *SSD) {
   // map. We need to use the new registers to create the correct order.
   for (int i = 0, e = SSD->SUnits.size(); i != e; ++i) {
     SUnit *SU = &SSD->SUnits[i];
-    SSD->applyInstrChange(SU->getInstr(), *this, true);
+    SSD->applyInstrChange(SU->getInstr(), *this);
   }
 
   // Reorder the instructions in each cycle to fix and improve the
@@ -3956,6 +3999,7 @@ void SMSchedule::finalizeSchedule(SwingSchedulerDAG *SSD) {
     // Replace the old order with the new order.
     cycleInstrs.swap(newOrderZC);
     cycleInstrs.insert(cycleInstrs.end(), newOrderI.begin(), newOrderI.end());
+    SSD->fixupRegisterOverlaps(cycleInstrs);
   }
 
   DEBUG(dump(););
diff --git a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
index bcd1a5089c71..c2125bec3a52 100644
--- a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
@@ -1651,8 +1651,13 @@ bool HexagonInstrInfo::areMemAccessesTriviallyDisjoint(
 bool HexagonInstrInfo::getIncrementValue(const MachineInstr &MI,
       int &Value) const {
   if (isPostIncrement(MI)) {
+    // For a post-increment, the offset is zero and the increment value is
+    // determined by the instruction's access size.
+    int Zero;
     unsigned AccessSize;
-    return getBaseAndOffset(MI, Value, AccessSize);
+    bool RetVal = getBaseAndOffset(MI, Zero, AccessSize);
+    Value = static_cast<int>(AccessSize);
+    return RetVal;
   }
   if (MI.getOpcode() == Hexagon::A2_addi) {
     Value = MI.getOperand(2).getImm();
diff --git a/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp b/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
index 0f407c2d8366..2555b50f91cf 100644
--- a/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
@@ -496,6 +496,48 @@ void HexagonPacketizerList::useCalleesSP(MachineInstr &MI) {
   Off.setImm(Off.getImm() + FrameSize + HEXAGON_LRFP_SIZE);
 }
 
+/// Return true if we can update the offset in MI so that MI and MJ
+/// can be packetized together.
+bool HexagonPacketizerList::updateOffset(SUnit *SUI, SUnit *SUJ) {
+  assert(SUI->getInstr() && SUJ->getInstr());
+  MachineInstr &MI = *SUI->getInstr();
+  MachineInstr &MJ = *SUJ->getInstr();
+
+  unsigned BPI, OPI;
+  if (!HII->getBaseAndOffsetPosition(MI, BPI, OPI))
+    return false;
+  unsigned BPJ, OPJ;
+  if (!HII->getBaseAndOffsetPosition(MJ, BPJ, OPJ))
+    return false;
+  unsigned Reg = MI.getOperand(BPI).getReg();
+  if (Reg != MJ.getOperand(BPJ).getReg())
+    return false;
+  // Make sure that the dependences do not restrict adding MI to the packet.
+  // That is, ignore anti dependences, and make sure the only data dependence
+  // involves the specific register.
+  for (const auto &PI : SUI->Preds)
+    if (PI.getKind() != SDep::Anti &&
+        (PI.getKind() != SDep::Data || PI.getReg() != Reg))
+      return false;
+  int Incr;
+  if (!HII->getIncrementValue(MJ, Incr))
+    return false;
+
+  int64_t Offset = MI.getOperand(OPI).getImm();
+  MI.getOperand(OPI).setImm(Offset + Incr);
+  ChangedOffset = Offset;
+  return true;
+}
+
+/// Undo the changed offset. This is needed if the instruction cannot be
+/// added to the current packet due to a different instruction.
+void HexagonPacketizerList::undoChangedOffset(MachineInstr &MI) {
+  unsigned BP, OP;
+  if (!HII->getBaseAndOffsetPosition(MI, BP, OP))
+    llvm_unreachable("Unable to find base and offset operands.");
+  MI.getOperand(OP).setImm(ChangedOffset);
+}
+
 enum PredicateKind {
   PK_False,
   PK_True,
@@ -980,6 +1022,7 @@ void HexagonPacketizerList::initPacketizerState() {
   GlueToNewValueJump = false;
   GlueAllocframeStore = false;
   FoundSequentialDependence = false;
+  ChangedOffset = INT64_MAX;
 }
 
 // Ignore bundling of pseudo instructions.
@@ -1567,6 +1610,15 @@ bool HexagonPacketizerList::isLegalToPruneDependencies(SUnit *SUI, SUnit *SUJ) {
     useCalleesSP(I);
     GlueAllocframeStore = false;
   }
+
+  if (ChangedOffset != INT64_MAX)
+    undoChangedOffset(I);
+  else if (updateOffset(SUI, SUJ)) {
+    FoundSequentialDependence = false;
+    Dependence = false;
+    return true;
+  }
+
   return false;
 }
 
diff --git a/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.h b/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.h
index 3a7bdf5101e6..cbdd2367429d 100644
--- a/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.h
+++ b/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.h
@@ -38,6 +38,9 @@ class HexagonPacketizerList : public VLIWPacketizerList {
   // Has the feeder instruction been glued to new value jump.
   bool GlueToNewValueJump;
 
+  // This holds the offset value, when pruning the dependences.
+  int64_t ChangedOffset;
+
   // Check if there is a dependence between some instruction already in this
   // packet and this instruction.
   bool Dependence;
@@ -117,6 +120,8 @@ protected:
   bool demoteToDotOld(MachineInstr &MI);
   bool useCallersSP(MachineInstr &MI);
   void useCalleesSP(MachineInstr &MI);
+  bool updateOffset(SUnit *SUI, SUnit *SUJ);
+  void undoChangedOffset(MachineInstr &MI);
   bool arePredicatesComplements(MachineInstr &MI1, MachineInstr &MI2);
   bool restrictingDepExistInPacket(MachineInstr&, unsigned);
   bool isNewifiable(const MachineInstr &MI, const TargetRegisterClass *NewRC);
diff --git a/llvm/test/CodeGen/Hexagon/swp-order-copies.ll b/llvm/test/CodeGen/Hexagon/swp-order-copies.ll
new file mode 100644
index 000000000000..5de0717654ff
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/swp-order-copies.ll
@@ -0,0 +1,37 @@
+; RUN: llc -march=hexagon < %s | FileCheck %s
+
+; Test that the instruction ordering code in the pipeliner fixes up dependences
+; between post-increment register definitions and uses so that the register
+; allocator does not allocate an additional register. The following test case
+; should generate a single packet.
+
+; CHECK: loop0(.LBB0_[[LOOP:.]],
+; CHECK: .LBB0_[[LOOP]]:
+; CHECK: {
+; CHECK-NOT: {
+; CHECK: :endloop0
+
+define void @test(i64* nocapture %v1, i64 %v2, i32 %len) local_unnamed_addr #0 {
+entry:
+  %cmp7 = icmp sgt i32 %len, 0
+  br i1 %cmp7, label %for.body, label %for.end
+
+for.body:
+  %arrayidx.phi = phi i64* [ %arrayidx.inc, %for.body ], [ %v1, %entry ]
+  %i.08 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+  %0 = load i64, i64* %arrayidx.phi, align 8
+  %1 = tail call i64 @llvm.hexagon.M2.mmpyul.rs1(i64 %0, i64 %v2)
+  store i64 %1, i64* %arrayidx.phi, align 8
+  %inc = add nuw nsw i32 %i.08, 1
+  %exitcond = icmp eq i32 %inc, %len
+  %arrayidx.inc = getelementptr i64, i64* %arrayidx.phi, i32 1
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+declare i64 @llvm.hexagon.M2.mmpyul.rs1(i64, i64) #1
+
+attributes #0 = { nounwind "target-cpu"="hexagonv60" }
+attributes #1 = { nounwind readnone }