[PowerPC] Add a post-RA peephole to transform instructions that are fed by an add

If the arch is P8, we select XFLOAD to load floating-point values and then expand it to a VSX or non-VSX X-form instruction post-RA. This patch converts the X-form instruction to its D-form when it meets the requirement that one operand of the X-form instruction is the special ZERO register and the other operand is fed by an add-immediate instruction, i.e. y = add imm, reg; LFDX 0, y --> LFD imm(reg). Reviewers: Nemanjai Differential Revision: https://reviews.llvm.org/D49007 llvm-svn: 340149
2018-08-20 02:52:55 +00:00 · 2018-08-20 02:52:55 +00:00 · f8f9af7ba5
parent fdca0c6d2e
commit f8f9af7ba5
20 changed files with 462 additions and 157 deletions
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
@ -2088,11 +2088,9 @@ bool PPCInstrInfo::expandVSXMemPseudo(MachineInstr &MI) const {
    return true;
 }

-#ifndef NDEBUG
 static bool isAnImmediateOperand(const MachineOperand &MO) {
  return MO.isCPI() || MO.isGlobal() || MO.isImm();
 }
-#endif

 bool PPCInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
  auto &MBB = *MI.getParent();
@ -2256,10 +2254,11 @@ void PPCInstrInfo::replaceInstrWithLI(MachineInstr &MI,
      .addImm(LII.Imm);
 }

-MachineInstr *PPCInstrInfo::getConstantDefMI(MachineInstr &MI,
-                                             unsigned &ConstOp,
-                                             bool &SeenIntermediateUse) const {
-  ConstOp = ~0U;
+MachineInstr *PPCInstrInfo::getForwardingDefMI(
+  MachineInstr &MI,
+  unsigned &OpNoForForwarding,
+  bool &SeenIntermediateUse) const {
+  OpNoForForwarding = ~0U;
  MachineInstr *DefMI = nullptr;
  MachineRegisterInfo *MRI = &MI.getParent()->getParent()->getRegInfo();
  const TargetRegisterInfo *TRI = &getRegisterInfo();
@ -2276,7 +2275,7 @@ MachineInstr *PPCInstrInfo::getConstantDefMI(MachineInstr &MI,
      if (TargetRegisterInfo::isVirtualRegister(TrueReg)) {
        DefMI = MRI->getVRegDef(TrueReg);
        if (DefMI->getOpcode() == PPC::LI || DefMI->getOpcode() == PPC::LI8) {
-          ConstOp = i;
+          OpNoForForwarding = i;
          break;
        }
      }
@ -2319,15 +2318,22 @@ MachineInstr *PPCInstrInfo::getConstantDefMI(MachineInstr &MI,
        if (PPC::G8RCRegClass.contains(Reg))
          Reg = Reg - PPC::X0 + PPC::R0;

-        // Is this register defined by a load-immediate in this block?
+        // Is this register defined by some form of add-immediate (including
+        // load-immediate) within this basic block?
        for ( ; It != E; ++It) {
          if (It->modifiesRegister(Reg, &getRegisterInfo())) {
-            if (It->getOpcode() == PPC::LI || It->getOpcode() == PPC::LI8) {
-              ConstOp = i;
+            switch (It->getOpcode()) {
+            default: break;
+            case PPC::LI:
+            case PPC::LI8:
+            case PPC::ADDItocL:
+            case PPC::ADDI:
+            case PPC::ADDI8:
+              OpNoForForwarding = i;
              return &*It;
-            } else
-              break;
-          } else if (It->readsRegister(Reg, &getRegisterInfo()))
+            }
+            break;
+          } else if (It->readsRegister(Reg, &getRegisterInfo())) 
            // If we see another use of this reg between the def and the MI,
            // we want to flat it so the def isn't deleted.
            SeenIntermediateUse = true;
@ -2335,7 +2341,7 @@ MachineInstr *PPCInstrInfo::getConstantDefMI(MachineInstr &MI,
      }
    }
  }
-  return ConstOp == ~0U ? nullptr : DefMI;
+  return OpNoForForwarding == ~0U ? nullptr : DefMI;
 }

 const unsigned *PPCInstrInfo::getStoreOpcodesForSpillArray() const {
@ -2371,35 +2377,48 @@ const unsigned *PPCInstrInfo::getLoadOpcodesForSpillArray() const {
 }

 // If this instruction has an immediate form and one of its operands is a
-// result of a load-immediate, convert it to the immediate form if the constant
-// is in range.
+// result of a load-immediate or an add-immediate, convert it to
+// the immediate form if the constant is in range.
 bool PPCInstrInfo::convertToImmediateForm(MachineInstr &MI,
                                          MachineInstr **KilledDef) const {
  MachineFunction *MF = MI.getParent()->getParent();
  MachineRegisterInfo *MRI = &MF->getRegInfo();
  bool PostRA = !MRI->isSSA();
  bool SeenIntermediateUse = true;
-  unsigned ConstantOperand = ~0U;
-  MachineInstr *DefMI = getConstantDefMI(MI, ConstantOperand,
-                                         SeenIntermediateUse);
-  if (!DefMI || !DefMI->getOperand(1).isImm())
+  unsigned ForwardingOperand = ~0U;
+  MachineInstr *DefMI = getForwardingDefMI(MI, ForwardingOperand,
+                                           SeenIntermediateUse);
+  if (!DefMI)
+    return false;
+  assert(ForwardingOperand < MI.getNumOperands() &&
+         "The forwarding operand needs to be valid at this point");
+  bool KillFwdDefMI = !SeenIntermediateUse &&
+    MI.getOperand(ForwardingOperand).isKill();
+  if (KilledDef && KillFwdDefMI)
+    *KilledDef = DefMI;
+
+  ImmInstrInfo III;
+  bool HasImmForm = instrHasImmForm(MI, III);
+  // If this is a reg+reg instruction that has a reg+imm form,
+  // and one of the operands is produced by an add-immediate,
+  // try to convert it.
+  if (HasImmForm && transformToImmFormFedByAdd(MI, III, ForwardingOperand,
+                                               *DefMI, KillFwdDefMI))
+    return true;
+
+  if ((DefMI->getOpcode() != PPC::LI && DefMI->getOpcode() != PPC::LI8) ||
+      !DefMI->getOperand(1).isImm())
    return false;
-  assert(ConstantOperand < MI.getNumOperands() &&
-         "The constant operand needs to be valid at this point");

  int64_t Immediate = DefMI->getOperand(1).getImm();
  // Sign-extend to 64-bits.
  int64_t SExtImm = ((uint64_t)Immediate & ~0x7FFFuLL) != 0 ?
    (Immediate | 0xFFFFFFFFFFFF0000) : Immediate;

-  if (KilledDef && MI.getOperand(ConstantOperand).isKill() &&
-      !SeenIntermediateUse)
-    *KilledDef = DefMI;
-
-  // If this is a reg+reg instruction that has a reg+imm form, convert it now.
-  ImmInstrInfo III;
-  if (instrHasImmForm(MI, III))
-    return transformToImmForm(MI, III, ConstantOperand, SExtImm);
+  // If this is a reg+reg instruction that has a reg+imm form,
+  // and one of the operands is produced by LI, convert it now.
+  if (HasImmForm)
+    return transformToImmFormFedByLI(MI, III, ForwardingOperand, SExtImm);

  bool ReplaceWithLI = false;
  bool Is64BitLI = false;
@ -2610,10 +2629,11 @@ bool PPCInstrInfo::instrHasImmForm(const MachineInstr &MI,
  // are the update form loads/stores for which a constant operand 2 would need
  // to turn into a displacement and move operand 1 to the operand 2 position.
  III.ImmOpNo = 2;
-  III.ConstantOpNo = 2;
+  III.OpNoForForwarding = 2;
  III.ImmWidth = 16;
  III.ImmMustBeMultipleOf = 1;
  III.TruncateImmTo = 0;
+  III.IsSummingOperands = false;
  switch (Opc) {
  default: return false;
  case PPC::ADD4:
@ -2622,6 +2642,7 @@ bool PPCInstrInfo::instrHasImmForm(const MachineInstr &MI,
    III.ZeroIsSpecialOrig = 0;
    III.ZeroIsSpecialNew = 1;
    III.IsCommutative = true;
+    III.IsSummingOperands = true;
    III.ImmOpcode = Opc == PPC::ADD4 ? PPC::ADDI : PPC::ADDI8;
    break;
  case PPC::ADDC:
@ -2630,6 +2651,7 @@ bool PPCInstrInfo::instrHasImmForm(const MachineInstr &MI,
    III.ZeroIsSpecialOrig = 0;
    III.ZeroIsSpecialNew = 0;
    III.IsCommutative = true;
+    III.IsSummingOperands = true;
    III.ImmOpcode = Opc == PPC::ADDC ? PPC::ADDIC : PPC::ADDIC8;
    break;
  case PPC::ADDCo:
@ -2637,6 +2659,7 @@ bool PPCInstrInfo::instrHasImmForm(const MachineInstr &MI,
    III.ZeroIsSpecialOrig = 0;
    III.ZeroIsSpecialNew = 0;
    III.IsCommutative = true;
+    III.IsSummingOperands = true;
    III.ImmOpcode = PPC::ADDICo;
    break;
  case PPC::SUBFC:
@ -2809,8 +2832,9 @@ bool PPCInstrInfo::instrHasImmForm(const MachineInstr &MI,
    III.ZeroIsSpecialOrig = 1;
    III.ZeroIsSpecialNew = 2;
    III.IsCommutative = true;
+    III.IsSummingOperands = true;
    III.ImmOpNo = 1;
-    III.ConstantOpNo = 2;
+    III.OpNoForForwarding = 2;
    switch(Opc) {
    default: llvm_unreachable("Unknown opcode");
    case PPC::LBZX: III.ImmOpcode = PPC::LBZ; break;
@ -2866,8 +2890,9 @@ bool PPCInstrInfo::instrHasImmForm(const MachineInstr &MI,
    III.ZeroIsSpecialOrig = 2;
    III.ZeroIsSpecialNew = 3;
    III.IsCommutative = false;
+    III.IsSummingOperands = true;
    III.ImmOpNo = 2;
-    III.ConstantOpNo = 3;
+    III.OpNoForForwarding = 3;
    switch(Opc) {
    default: llvm_unreachable("Unknown opcode");
    case PPC::LBZUX: III.ImmOpcode = PPC::LBZU; break;
@ -2911,8 +2936,9 @@ bool PPCInstrInfo::instrHasImmForm(const MachineInstr &MI,
    III.ZeroIsSpecialOrig = 1;
    III.ZeroIsSpecialNew = 2;
    III.IsCommutative = true;
+    III.IsSummingOperands = true;
    III.ImmOpNo = 1;
-    III.ConstantOpNo = 2;
+    III.OpNoForForwarding = 2;
    switch(Opc) {
    default: llvm_unreachable("Unknown opcode");
    case PPC::LXVX:
@ -2984,13 +3010,256 @@ static void swapMIOperands(MachineInstr &MI, unsigned Op1, unsigned Op2) {
  }
 }

-bool PPCInstrInfo::transformToImmForm(MachineInstr &MI, const ImmInstrInfo &III,
-                                      unsigned ConstantOpNo,
-                                      int64_t Imm) const {
+// Check whether the user 'MI' can have its operand at index
+// OpNoForForwarding forwarded, per the requirements in the ImmInstrInfo.
+bool PPCInstrInfo::isUseMIElgibleForForwarding(MachineInstr &MI,
+                                               const ImmInstrInfo &III,
+                                               unsigned OpNoForForwarding
+                                               ) const {
+  // The check for PPC::ZERO/PPC::ZERO8 below does not work pre-RA,
+  // so bail out unless we are post-RA (i.e. no longer in SSA form).
+  MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+  if (MRI.isSSA())
+    return false;
+
+  // Cannot do the transform if MI isn't summing its operands.
+  if (!III.IsSummingOperands)
+    return false;
+
+  // The instruction we are trying to replace must have ZeroIsSpecialOrig set.
+  if (!III.ZeroIsSpecialOrig)
+    return false;
+
+  // We cannot do the transform if the operand we are trying to replace
+  // isn't the operand that III designates for forwarding.
+  if (OpNoForForwarding != III.OpNoForForwarding)
+    return false;
+
+  // Check that the instruction we are trying to transform really has
+  // the special zero register in its ZeroIsSpecialOrig operand.
+  if (MI.getOperand(III.ZeroIsSpecialOrig).getReg() != PPC::ZERO &&
+      MI.getOperand(III.ZeroIsSpecialOrig).getReg() != PPC::ZERO8)
+    return false;
+
+  // This machine instruction is convertible because:
+  // 1. it sums its operands;
+  // 2. one of its operands is the special zero register;
+  // 3. the operand we are trying to replace is the one III allows.
+  return true;
+}
+
+// Check if DefMI is an add-immediate (ADDItocL/ADDI/ADDI8). If so, set RegMO
+// to its operand 1 and ImmMO to its operand 2. III is not consulted here.
+bool PPCInstrInfo::isDefMIElgibleForForwarding(MachineInstr &DefMI,
+                                               const ImmInstrInfo &III,
+                                               MachineOperand *&ImmMO,
+                                               MachineOperand *&RegMO) const {
+  unsigned Opc = DefMI.getOpcode();
+  if (Opc != PPC::ADDItocL && Opc != PPC::ADDI && Opc != PPC::ADDI8)
+    return false; 
+
+  assert(DefMI.getNumOperands() >= 3 &&
+         "Add inst must have at least three operands");
+  RegMO = &DefMI.getOperand(1);
+  ImmMO = &DefMI.getOperand(2);
+
+  // This DefMI is eligible for forwarding if:
+  // 1. it is an add-immediate instruction (checked above), and
+  // 2. its operand 2 is an Imm/CPI/Global.
+  return isAnImmediateOperand(*ImmMO);
+}
+
+bool PPCInstrInfo::isRegElgibleForForwarding(const MachineOperand &RegMO,
+                                             const MachineInstr &DefMI,
+                                             const MachineInstr &MI,
+                                             bool KillDefMI
+                                             ) const {
+  // x = addi y, imm      <- DefMI (RegMO is "y")
+  // ...
+  // z = lfdx 0, x   -> z = lfd imm(y)
+  // The register "y" can be forwarded to MI (the def of "z") only when there
+  // is no other DEF of "y" between the DEF of "x" and MI.
+  // The query is only valid post-RA, so give up while still in SSA form.
+  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+  if (MRI.isSSA())
+    return false;
+
+  // MachineInstr::readsRegister only returns true if the machine
+  // instruction reads the exact register or its super-register. It
+  // does not consider uses of sub-registers, which seems like strange
+  // behaviour. Nonetheless, if we end up with a 64-bit register here,
+  // get the corresponding 32-bit register to check.
+  unsigned Reg = RegMO.getReg();
+  if (PPC::G8RCRegClass.contains(Reg))
+    Reg = Reg - PPC::X0 + PPC::R0;
+
+  // Walk the block in reverse (MI --> DefMI) looking for another DEF of Reg.
+  MachineBasicBlock::const_reverse_iterator It = MI;
+  MachineBasicBlock::const_reverse_iterator E = MI.getParent()->rend();
+  It++;
+  for (; It != E; ++It) {
+    if (It->modifiesRegister(Reg, &getRegisterInfo()) && (&*It) != &DefMI)
+      return false;
+    // Made it to DefMI without encountering a clobber.
+    if ((&*It) == &DefMI)
+      break;
+  }
+  assert((&*It) == &DefMI && "DefMI is missing");
+
+  // If DefMI also reads the register to be forwarded, we can only forward it
+  // if DefMI is being erased (KillDefMI).
+  if (DefMI.readsRegister(Reg, &getRegisterInfo()))
+    return KillDefMI;
+
+  return true;
+}
+
+bool PPCInstrInfo::isImmElgibleForForwarding(const MachineOperand &ImmMO,
+                                             const MachineInstr &DefMI,
+                                             const ImmInstrInfo &III,
+                                             int64_t &Imm) const {
+  assert(isAnImmediateOperand(ImmMO) && "ImmMO is NOT an immediate");
+  if (DefMI.getOpcode() == PPC::ADDItocL) {
+    // The operand for ADDItocL is a CPI, which isn't an immediate at compile
+    // time. However, we know that it is 16 bits wide and has an alignment of 4.
+    // Only III's constraints are checked; Imm is not written on this path.
+    if (III.ImmMustBeMultipleOf > 4 ||
+       III.TruncateImmTo || III.ImmWidth != 16)
+      return false;
+
+    return true;
+  }
+
+  if (ImmMO.isImm()) {
+    // It is an Imm, so we need to check whether the value fits the range.
+    int64_t Immediate = ImmMO.getImm();
+    // Sign-extend to 64-bits.
+    Imm = ((uint64_t)Immediate & ~0x7FFFuLL) != 0 ?
+      (Immediate | 0xFFFFFFFFFFFF0000) : Immediate;
+
+    if (Imm % III.ImmMustBeMultipleOf)
+      return false;
+    if (III.TruncateImmTo)
+      Imm &= ((1 << III.TruncateImmTo) - 1);
+    if (III.SignedImm) {
+      APInt ActualValue(64, Imm, true);
+      if (!ActualValue.isSignedIntN(III.ImmWidth))
+        return false;
+    } else {
+      uint64_t UnsignedMax = (1 << III.ImmWidth) - 1;
+      if ((uint64_t)Imm > UnsignedMax)
+        return false;
+    }
+  }
+  else
+    return false;
+
+  // This ImmMO can be forwarded: it meets the requirements described
+  // in the ImmInstrInfo, and Imm now holds the value to use.
+  return true;
+}
+
+// If an X-Form instruction is fed by an add-immediate and one of its operands
+// is the literal zero, attempt to forward the source of the add-immediate to
+// the corresponding D-Form instruction with the displacement coming from
+// the immediate being added. Returns true if MI was rewritten.
+bool PPCInstrInfo::transformToImmFormFedByAdd(MachineInstr &MI,
+                                              const ImmInstrInfo &III,
+                                              unsigned OpNoForForwarding,
+                                              MachineInstr &DefMI,
+                                              bool KillDefMI) const {
+  //         RegMO ImmMO
+  //           |    |
+  // x = addi reg, imm  <----- DefMI
+  // y = op    0 ,  x   <----- MI
+  //                |
+  //         OpNoForForwarding
+  // Check if the MI meets the requirements described in the III.
+  if (!isUseMIElgibleForForwarding(MI, III, OpNoForForwarding))
+    return false;
+
+  // Check if the DefMI is an eligible add-immediate.
+  // If yes, ImmMO and RegMO are set to its immediate and register operands.
+  MachineOperand *ImmMO = nullptr;
+  MachineOperand *RegMO = nullptr;
+  if (!isDefMIElgibleForForwarding(DefMI, III, ImmMO, RegMO))
+    return false;
+  assert(ImmMO && RegMO && "Imm and Reg operand must have been set");
+
+  // Now that we have the Imm operand, check whether ImmMO meets
+  // the requirements described in the III. If yes, Imm is set for isImm().
+  int64_t Imm = 0;
+  if (!isImmElgibleForForwarding(*ImmMO, DefMI, III, Imm))
+    return false;
+
+  // Check if the RegMO can be forwarded to MI.
+  if (!isRegElgibleForForwarding(*RegMO, DefMI, MI, KillDefMI))
+    return false;
+
+  // We know that the MI and DefMI both match the pattern, and
+  // the Imm also meets the requirements of the new Imm-form.
+  // It is safe to do the transformation now.
+  LLVM_DEBUG(dbgs() << "Replacing instruction:\n");
+  LLVM_DEBUG(MI.dump());
+  LLVM_DEBUG(dbgs() << "Fed by:\n");
+  LLVM_DEBUG(DefMI.dump());
+
+  // Update the base reg first, carrying over RegMO's kill flag.
+  MI.getOperand(III.OpNoForForwarding).ChangeToRegister(RegMO->getReg(),
+                                                        false, false,
+                                                        RegMO->isKill());
+
+  // Then, update the imm.
+  if (ImmMO->isImm()) {
+    // If the ImmMO is an Imm, change the operand that has ZERO to that Imm
+    // directly.
+    MI.getOperand(III.ZeroIsSpecialOrig).ChangeToImmediate(Imm);
+  }
+  else {
+    // Otherwise, it is a Constant Pool Index (CPI) or Global,
+    // which is in fact a relocation. We need to replace the special zero
+    // register with ImmMO.
+    // Before that, we need to fix up the target flags for the imm.
+    // For some reason, the flag is not already set on ImmMO if it is a CPI.
+    if (DefMI.getOpcode() == PPC::ADDItocL)
+      ImmMO->setTargetFlags(PPCII::MO_TOC_LO);
+
+    // MI has no interface such as MI.setOperand(i), though
+    // it has MI.getOperand(i). To replace the ZERO MachineOperand with
+    // ImmMO, we remove the ZERO operand and all the operands after it,
+    // add the ImmMO, and then re-add the operands that followed ZERO.
+    SmallVector<MachineOperand, 2> MOps;
+    for (unsigned i = MI.getNumOperands() - 1; i >= III.ZeroIsSpecialOrig; i--) {
+      MOps.push_back(MI.getOperand(i));
+      MI.RemoveOperand(i);
+    }
+
+    // Remove the last MO in the list, which is in fact the ZERO operand.
+    MOps.pop_back();
+    // Add the imm operand.
+    MI.addOperand(*ImmMO);
+    // Now add the rest back.
+    for (auto &MO : MOps)
+      MI.addOperand(MO);
+  }
+
+  // Update the opcode.
+  MI.setDesc(get(III.ImmOpcode));
+
+  LLVM_DEBUG(dbgs() << "With:\n");
+  LLVM_DEBUG(MI.dump());
+
+  return true;
+}
+
+bool PPCInstrInfo::transformToImmFormFedByLI(MachineInstr &MI,
+                                             const ImmInstrInfo &III,
+                                             unsigned ConstantOpNo,
+                                             int64_t Imm) const {
  MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
  bool PostRA = !MRI.isSSA();
  // Exit early if we can't convert this.
-  if ((ConstantOpNo != III.ConstantOpNo) && !III.IsCommutative)
+  if ((ConstantOpNo != III.OpNoForForwarding) && !III.IsCommutative)
    return false;
  if (Imm % III.ImmMustBeMultipleOf)
    return false;
@ -3035,7 +3304,7 @@ bool PPCInstrInfo::transformToImmForm(MachineInstr &MI, const ImmInstrInfo &III,
    Opc == PPC::SRW || Opc == PPC::SRWo || Opc == PPC::SRD || Opc == PPC::SRDo;

  MI.setDesc(get(III.ImmOpcode));
-  if (ConstantOpNo == III.ConstantOpNo) {
+  if (ConstantOpNo == III.OpNoForForwarding) {
    // Converting shifts to immediate form is a bit tricky since they may do
    // one of three things:
    // 1. If the shift amount is between OpSize and 2*OpSize, the result is zero
@ -3063,14 +3332,14 @@ bool PPCInstrInfo::transformToImmForm(MachineInstr &MI, const ImmInstrInfo &III,
          uint64_t SH = RightShift ? 32 - ShAmt : ShAmt;
          uint64_t MB = RightShift ? ShAmt : 0;
          uint64_t ME = RightShift ? 31 : 31 - ShAmt;
-          MI.getOperand(III.ConstantOpNo).ChangeToImmediate(SH);
+          MI.getOperand(III.OpNoForForwarding).ChangeToImmediate(SH);
          MachineInstrBuilder(*MI.getParent()->getParent(), MI).addImm(MB)
            .addImm(ME);
        } else {
          // Left shifts use (N, 63-N), right shifts use (64-N, N).
          uint64_t SH = RightShift ? 64 - ShAmt : ShAmt;
          uint64_t ME = RightShift ? ShAmt : 63 - ShAmt;
-          MI.getOperand(III.ConstantOpNo).ChangeToImmediate(SH);
+          MI.getOperand(III.OpNoForForwarding).ChangeToImmediate(SH);
          MachineInstrBuilder(*MI.getParent()->getParent(), MI).addImm(ME);
        }
      }
@ -3081,14 +3350,14 @@ bool PPCInstrInfo::transformToImmForm(MachineInstr &MI, const ImmInstrInfo &III,
  // desired one to an immediate.
  else if (III.IsCommutative) {
    MI.getOperand(ConstantOpNo).ChangeToImmediate(Imm);
-    swapMIOperands(MI, ConstantOpNo, III.ConstantOpNo);
+    swapMIOperands(MI, ConstantOpNo, III.OpNoForForwarding);
  } else
    llvm_unreachable("Should have exited early!");

  // For instructions for which the constant register replaces a different
  // operand than where the immediate goes, we need to swap them.
-  if (III.ConstantOpNo != III.ImmOpNo)
-    swapMIOperands(MI, III.ConstantOpNo, III.ImmOpNo);
+  if (III.OpNoForForwarding != III.ImmOpNo)
+    swapMIOperands(MI, III.OpNoForForwarding, III.ImmOpNo);

  // If the R0/X0 register is special for the original instruction and not for
  // the new instruction (or vice versa), we need to fix up the register class.
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.h
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.h
@ -91,8 +91,8 @@ struct ImmInstrInfo {
  uint64_t ZeroIsSpecialNew : 3;
  // Is the operation commutative?
  uint64_t IsCommutative : 1;
-  // The operand number to check for load immediate.
-  uint64_t ConstantOpNo : 3;
+  // The operand number to check for add-immediate def.
+  uint64_t OpNoForForwarding : 3;
  // The operand number for the immediate.
  uint64_t ImmOpNo : 3;
  // The opcode of the new instruction.
@ -101,6 +101,8 @@ struct ImmInstrInfo {
  uint64_t ImmWidth : 5;
  // The immediate should be truncated to N bits.
  uint64_t TruncateImmTo : 5;
+  // Is the instruction summing its operands?
+  uint64_t IsSummingOperands : 1;
 };

 // Information required to convert an instruction to just a materialized
@ -123,10 +125,42 @@ class PPCInstrInfo : public PPCGenInstrInfo {
                            unsigned DestReg, int FrameIdx,
                            const TargetRegisterClass *RC,
                            SmallVectorImpl<MachineInstr *> &NewMIs) const;
-  bool transformToImmForm(MachineInstr &MI, const ImmInstrInfo &III,
-                          unsigned ConstantOpNo, int64_t Imm) const;
-  MachineInstr *getConstantDefMI(MachineInstr &MI, unsigned &ConstOp,
-                                 bool &SeenIntermediateUse) const;
+
+  // If the inst has imm-form and one of its operand is produced by a LI,
+  // put the imm into the inst directly and remove the LI if possible.
+  bool transformToImmFormFedByLI(MachineInstr &MI, const ImmInstrInfo &III,
+                                 unsigned ConstantOpNo, int64_t Imm) const;
+  // If the inst has imm-form and one of its operand is produced by an
+  // add-immediate, try to transform it when possible.
+  bool transformToImmFormFedByAdd(MachineInstr &MI, const ImmInstrInfo &III,
+                                  unsigned ConstantOpNo,
+                                  MachineInstr &DefMI,
+                                  bool KillDefMI) const;
+  // Try to find that, if the instruction 'MI' contains any operand that 
+  // could be forwarded from some inst that feeds it. If yes, return the
+  // Def of that operand. And OpNoForForwarding is the operand index in
+  // the 'MI' for that 'Def'. If we see another use of this Def between
+  // the Def and the MI, SeenIntermediateUse becomes 'true'.
+  MachineInstr *getForwardingDefMI(MachineInstr &MI,
+                                   unsigned &OpNoForForwarding,
+                                   bool &SeenIntermediateUse) const;
+
+  // Can the user MI have its source at index \p OpNoForForwarding
+  // forwarded from an add-immediate that feeds it?
+  bool isUseMIElgibleForForwarding(MachineInstr &MI, const ImmInstrInfo &III,
+                                   unsigned OpNoForForwarding) const;
+  bool isDefMIElgibleForForwarding(MachineInstr &DefMI,
+                                   const ImmInstrInfo &III,
+                                   MachineOperand *&ImmMO,
+                                   MachineOperand *&RegMO) const;
+  bool isImmElgibleForForwarding(const MachineOperand &ImmMO,
+                                 const MachineInstr &DefMI,
+                                 const ImmInstrInfo &III,
+                                 int64_t &Imm) const;
+  bool isRegElgibleForForwarding(const MachineOperand &RegMO,
+                                 const MachineInstr &DefMI,
+                                 const MachineInstr &MI,
+                                 bool KillDefMI) const;
  const unsigned *getStoreOpcodesForSpillArray() const;
  const unsigned *getLoadOpcodesForSpillArray() const;
  virtual void anchor();
--- a/llvm/test/CodeGen/PowerPC/bitcasts-direct-move.ll
+++ b/llvm/test/CodeGen/PowerPC/bitcasts-direct-move.ll
@ -18,7 +18,7 @@ define i64 @f64toi64(double %a) {
 entry:
  %0 = bitcast double %a to i64
  ret i64 %0
-; CHECK-P7: stfdx 1,
+; CHECK-P7: stfd 1,
 ; CHECK-P7: ld 3,
 ; CHECK: mffprd 3, 1
 }
@ -39,7 +39,7 @@ entry:
  %0 = bitcast i64 %a to double
  ret double %0
 ; CHECK-P7: std 3,
-; CHECK-P7: lfdx 1,
+; CHECK-P7: lfd 1,
 ; CHECK: mtvsrd 1, 3
 }

@ -58,7 +58,7 @@ define i64 @f64toi64u(double %a) {
 entry:
  %0 = bitcast double %a to i64
  ret i64 %0
-; CHECK-P7: stfdx 1,
+; CHECK-P7: stfd 1,
 ; CHECK-P7: ld 3,
 ; CHECK: mffprd 3, 1
 }
@ -79,6 +79,6 @@ entry:
  %0 = bitcast i64 %a to double
  ret double %0
 ; CHECK-P7: std 3,
-; CHECK-P7: lfdx 1,
+; CHECK-P7: lfd 1,
 ; CHECK: mtvsrd 1, 3
 }
--- a/llvm/test/CodeGen/PowerPC/branch_coalesce.ll
+++ b/llvm/test/CodeGen/PowerPC/branch_coalesce.ll
@ -13,10 +13,8 @@ define double @testBranchCoal(double %a, double %b, double %c, i32 %x) {
 ; CHECK-DAG: addis [[LD2REG:[0-9]+]], 2, .LCPI0_1@toc@ha
 ; CHECK-DAG: xxlxor 2, 2, 2
 ; CHECK-NOT: beq
-; CHECK-DAG: addi [[LD1BASE:[0-9]+]], [[LD1REG]]
-; CHECK-DAG: addi [[LD2BASE:[0-9]+]], [[LD2REG]]
-; CHECK-DAG: lfdx 1, 0, [[LD1BASE]]
-; CHECK-DAG: lfdx 3, 0, [[LD2BASE]]
+; CHECK-DAG: lfd 1, .LCPI0_0@toc@l([[LD1REG]])
+; CHECK-DAG: lfd 3, .LCPI0_1@toc@l([[LD2REG]])
 ; CHECK: .LBB[[LAB1]]
 ; CHECK: xsadddp 0, 1, 2
 ; CHECK: xsadddp 1, 0, 3
@ -32,16 +30,14 @@ define double @testBranchCoal(double %a, double %b, double %c, i32 %x) {
 ; CHECK-NOCOALESCE-NEXT:    beq 0, .LBB0_4
 ; CHECK-NOCOALESCE-NEXT:  .LBB0_3: # %entry
 ; CHECK-NOCOALESCE-NEXT:    addis 3, 2, .LCPI0_1@toc@ha
-; CHECK-NOCOALESCE-NEXT:    addi 3, 3, .LCPI0_1@toc@l
-; CHECK-NOCOALESCE-NEXT:    lfdx 3, 0, 3
+; CHECK-NOCOALESCE-NEXT:    lfd 3, .LCPI0_1@toc@l(3)
 ; CHECK-NOCOALESCE-NEXT:  .LBB0_4: # %entry
 ; CHECK-NOCOALESCE-NEXT:    xsadddp 0, 1, 2
 ; CHECK-NOCOALESCE-NEXT:    xsadddp 1, 0, 3
 ; CHECK-NOCOALESCE-NEXT:    blr
 ; CHECK-NOCOALESCE-NEXT:  .LBB0_5: # %entry
 ; CHECK-NOCOALESCE-NEXT:    addis 3, 2, .LCPI0_0@toc@ha
-; CHECK-NOCOALESCE-NEXT:    addi 3, 3, .LCPI0_0@toc@l
-; CHECK-NOCOALESCE-NEXT:    lfdx 1, 0, 3
+; CHECK-NOCOALESCE-NEXT:    lfd 1, .LCPI0_0@toc@l(3)
 ; CHECK-NOCOALESCE-NEXT:    beq 0, .LBB0_2
 ; CHECK-NOCOALESCE-NEXT:  .LBB0_6: # %entry
 ; CHECK-NOCOALESCE-NEXT:    xxlxor 2, 2, 2
--- a/llvm/test/CodeGen/PowerPC/fast-isel-load-store-vsx.ll
+++ b/llvm/test/CodeGen/PowerPC/fast-isel-load-store-vsx.ll
@ -1,4 +1,4 @@
-; RUN: llc < %s -O0 -fast-isel -mattr=+vsx -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 | FileCheck %s --check-prefix=ELF64VSX
+; RUN: llc < %s -O0 -fast-isel -mattr=+vsx -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -ppc-late-peephole=false | FileCheck %s --check-prefix=ELF64VSX

 ;; The semantics of VSX stores for when R0 is used is different depending on
 ;; whether it is used as base or offset. If used as base, the effective
--- a/llvm/test/CodeGen/PowerPC/float-to-int.ll
+++ b/llvm/test/CodeGen/PowerPC/float-to-int.ll
@ -21,7 +21,7 @@ define i64 @foo(float %a) nounwind {

 ; CHECK-VSX: @foo
 ; CHECK-VSX: xscvdpsxds [[REG:[0-9]+]], 1
-; CHECK-VSX: stfdx [[REG]],
+; CHECK-VSX: stfd [[REG]],
 ; CHECK-VSX: ld 3,
 ; CHECK-VSX: blr

@ -44,7 +44,7 @@ define i64 @foo2(double %a) nounwind {

 ; CHECK-VSX: @foo2
 ; CHECK-VSX: xscvdpsxds [[REG:[0-9]+]], 1
-; CHECK-VSX: stfdx [[REG]],
+; CHECK-VSX: stfd [[REG]],
 ; CHECK-VSX: ld 3,
 ; CHECK-VSX: blr

@ -67,7 +67,7 @@ define i64 @foo3(float %a) nounwind {

 ; CHECK-VSX: @foo3
 ; CHECK-VSX: xscvdpuxds [[REG:[0-9]+]], 1
-; CHECK-VSX: stfdx [[REG]],
+; CHECK-VSX: stfd [[REG]],
 ; CHECK-VSX: ld 3,
 ; CHECK-VSX: blr

@ -90,7 +90,7 @@ define i64 @foo4(double %a) nounwind {

 ; CHECK-VSX: @foo4
 ; CHECK-VSX: xscvdpuxds [[REG:[0-9]+]], 1
-; CHECK-VSX: stfdx [[REG]],
+; CHECK-VSX: stfd [[REG]],
 ; CHECK-VSX: ld 3,
 ; CHECK-VSX: blr

--- a/llvm/test/CodeGen/PowerPC/fmf-propagation.ll
+++ b/llvm/test/CodeGen/PowerPC/fmf-propagation.ll
@ -165,16 +165,14 @@ define float @fmul_fma_reassoc1(float %x) {
 ; FMF-LABEL: fmul_fma_reassoc1:
 ; FMF:       # %bb.0:
 ; FMF-NEXT:    addis 3, 2, .LCPI6_0@toc@ha
-; FMF-NEXT:    addi 3, 3, .LCPI6_0@toc@l
-; FMF-NEXT:    lfsx 0, 0, 3
+; FMF-NEXT:    lfs 0, .LCPI6_0@toc@l(3)
 ; FMF-NEXT:    xsmulsp 1, 1, 0
 ; FMF-NEXT:    blr
 ;
 ; GLOBAL-LABEL: fmul_fma_reassoc1:
 ; GLOBAL:       # %bb.0:
 ; GLOBAL-NEXT:    addis 3, 2, .LCPI6_0@toc@ha
-; GLOBAL-NEXT:    addi 3, 3, .LCPI6_0@toc@l
-; GLOBAL-NEXT:    lfsx 0, 0, 3
+; GLOBAL-NEXT:    lfs 0, .LCPI6_0@toc@l(3)
 ; GLOBAL-NEXT:    xsmulsp 1, 1, 0
 ; GLOBAL-NEXT:    blr
  %mul = fmul float %x, 42.0
@ -196,16 +194,14 @@ define float @fmul_fma_reassoc2(float %x) {
 ; FMF-LABEL: fmul_fma_reassoc2:
 ; FMF:       # %bb.0:
 ; FMF-NEXT:    addis 3, 2, .LCPI7_0@toc@ha
-; FMF-NEXT:    addi 3, 3, .LCPI7_0@toc@l
-; FMF-NEXT:    lfsx 0, 0, 3
+; FMF-NEXT:    lfs 0, .LCPI7_0@toc@l(3)
 ; FMF-NEXT:    xsmulsp 1, 1, 0
 ; FMF-NEXT:    blr
 ;
 ; GLOBAL-LABEL: fmul_fma_reassoc2:
 ; GLOBAL:       # %bb.0:
 ; GLOBAL-NEXT:    addis 3, 2, .LCPI7_0@toc@ha
-; GLOBAL-NEXT:    addi 3, 3, .LCPI7_0@toc@l
-; GLOBAL-NEXT:    lfsx 0, 0, 3
+; GLOBAL-NEXT:    lfs 0, .LCPI7_0@toc@l(3)
 ; GLOBAL-NEXT:    xsmulsp 1, 1, 0
 ; GLOBAL-NEXT:    blr
  %mul = fmul reassoc float %x, 42.0
@ -227,16 +223,14 @@ define float @fmul_fma_fast1(float %x) {
 ; FMF-LABEL: fmul_fma_fast1:
 ; FMF:       # %bb.0:
 ; FMF-NEXT:    addis 3, 2, .LCPI8_0@toc@ha
-; FMF-NEXT:    addi 3, 3, .LCPI8_0@toc@l
-; FMF-NEXT:    lfsx 0, 0, 3
+; FMF-NEXT:    lfs 0, .LCPI8_0@toc@l(3)
 ; FMF-NEXT:    xsmulsp 1, 1, 0
 ; FMF-NEXT:    blr
 ;
 ; GLOBAL-LABEL: fmul_fma_fast1:
 ; GLOBAL:       # %bb.0:
 ; GLOBAL-NEXT:    addis 3, 2, .LCPI8_0@toc@ha
-; GLOBAL-NEXT:    addi 3, 3, .LCPI8_0@toc@l
-; GLOBAL-NEXT:    lfsx 0, 0, 3
+; GLOBAL-NEXT:    lfs 0, .LCPI8_0@toc@l(3)
 ; GLOBAL-NEXT:    xsmulsp 1, 1, 0
 ; GLOBAL-NEXT:    blr
  %mul = fmul float %x, 42.0
@ -258,16 +252,14 @@ define float @fmul_fma_fast2(float %x) {
 ; FMF-LABEL: fmul_fma_fast2:
 ; FMF:       # %bb.0:
 ; FMF-NEXT:    addis 3, 2, .LCPI9_0@toc@ha
-; FMF-NEXT:    addi 3, 3, .LCPI9_0@toc@l
-; FMF-NEXT:    lfsx 0, 0, 3
+; FMF-NEXT:    lfs 0, .LCPI9_0@toc@l(3)
 ; FMF-NEXT:    xsmulsp 1, 1, 0
 ; FMF-NEXT:    blr
 ;
 ; GLOBAL-LABEL: fmul_fma_fast2:
 ; GLOBAL:       # %bb.0:
 ; GLOBAL-NEXT:    addis 3, 2, .LCPI9_0@toc@ha
-; GLOBAL-NEXT:    addi 3, 3, .LCPI9_0@toc@l
-; GLOBAL-NEXT:    lfsx 0, 0, 3
+; GLOBAL-NEXT:    lfs 0, .LCPI9_0@toc@l(3)
 ; GLOBAL-NEXT:    xsmulsp 1, 1, 0
 ; GLOBAL-NEXT:    blr
  %mul = fmul fast float %x, 42.0
@ -294,8 +286,7 @@ define float @sqrt_afn(float %x) {
 ; FMF-NEXT:  # %bb.1:
 ; FMF-NEXT:    addis 3, 2, .LCPI10_0@toc@ha
 ; FMF-NEXT:    xsrsqrtesp 3, 1
-; FMF-NEXT:    addi 3, 3, .LCPI10_0@toc@l
-; FMF-NEXT:    lfsx 0, 0, 3
+; FMF-NEXT:    lfs 0, .LCPI10_0@toc@l(3)
 ; FMF-NEXT:    xsmulsp 2, 1, 0
 ; FMF-NEXT:    xsmulsp 4, 3, 3
 ; FMF-NEXT:    xssubsp 2, 2, 1
@ -317,8 +308,7 @@ define float @sqrt_afn(float %x) {
 ; GLOBAL-NEXT:    fneg 0, 1
 ; GLOBAL-NEXT:    addis 3, 2, .LCPI10_0@toc@ha
 ; GLOBAL-NEXT:    fmr 4, 1
-; GLOBAL-NEXT:    addi 3, 3, .LCPI10_0@toc@l
-; GLOBAL-NEXT:    lfsx 3, 0, 3
+; GLOBAL-NEXT:    lfs 3, .LCPI10_0@toc@l(3)
 ; GLOBAL-NEXT:    xsmaddasp 4, 0, 3
 ; GLOBAL-NEXT:    xsmulsp 0, 2, 2
 ; GLOBAL-NEXT:    xsmaddasp 3, 4, 0
@ -352,8 +342,7 @@ define float @sqrt_fast(float %x) {
 ; FMF-NEXT:    fneg 0, 1
 ; FMF-NEXT:    addis 3, 2, .LCPI11_0@toc@ha
 ; FMF-NEXT:    fmr 4, 1
-; FMF-NEXT:    addi 3, 3, .LCPI11_0@toc@l
-; FMF-NEXT:    lfsx 3, 0, 3
+; FMF-NEXT:    lfs 3, .LCPI11_0@toc@l(3)
 ; FMF-NEXT:    xsmaddasp 4, 0, 3
 ; FMF-NEXT:    xsmulsp 0, 2, 2
 ; FMF-NEXT:    xsmaddasp 3, 4, 0
@ -373,8 +362,7 @@ define float @sqrt_fast(float %x) {
 ; GLOBAL-NEXT:    fneg 0, 1
 ; GLOBAL-NEXT:    addis 3, 2, .LCPI11_0@toc@ha
 ; GLOBAL-NEXT:    fmr 4, 1
-; GLOBAL-NEXT:    addi 3, 3, .LCPI11_0@toc@l
-; GLOBAL-NEXT:    lfsx 3, 0, 3
+; GLOBAL-NEXT:    lfs 3, .LCPI11_0@toc@l(3)
 ; GLOBAL-NEXT:    xsmaddasp 4, 0, 3
 ; GLOBAL-NEXT:    xsmulsp 0, 2, 2
 ; GLOBAL-NEXT:    xsmaddasp 3, 4, 0
--- a/llvm/test/CodeGen/PowerPC/fp128-bitcast-after-operation.ll
+++ b/llvm/test/CodeGen/PowerPC/fp128-bitcast-after-operation.ll
@ -7,10 +7,8 @@
 define i128 @test_abs(ppc_fp128 %x) nounwind  {
 entry:
 ; PPC64-LABEL: test_abs:
-; PPC64-DAG: stfdx 2, 0, [[ADDR_HI:[0-9]+]]
-; PPC64-DAG: stfdx 1, 0, [[ADDR_LO:[0-9]+]]
-; PPC64-DAG: addi [[ADDR_HI]], [[SP:[0-9]+]], [[OFFSET_HI:-?[0-9]+]]
-; PPC64-DAG: addi [[ADDR_LO]], [[SP]], [[OFFSET_LO:-?[0-9]+]]
+; PPC64-DAG: stfd 2, [[OFFSET_HI:-?[0-9]+]]([[SP:[0-9]+]])
+; PPC64-DAG: stfd 1, [[OFFSET_LO:-?[0-9]+]]([[SP]]) 
 ; PPC64-DAG: ld [[HI:[0-9]+]], [[OFFSET_LO]]([[SP]])
 ; PPC64-DAG: ld [[LO:[0-9]+]], [[OFFSET_HI]]([[SP]])
 ; PPC64-DAG: rldicr [[FLIP_BIT:[0-9]+]], [[HI]], 0, 0
@ -44,10 +42,8 @@ entry:
 define i128 @test_neg(ppc_fp128 %x) nounwind  {
 entry:
 ; PPC64-LABEL: test_neg:
-; PPC64-DAG: stfdx 2, 0, [[ADDR_HI:[0-9]+]]
-; PPC64-DAG: stfdx 1, 0, [[ADDR_LO:[0-9]+]]
-; PPC64-DAG: addi [[ADDR_HI]], [[SP:[0-9]+]], [[OFFSET_HI:-?[0-9]+]]
-; PPC64-DAG: addi [[ADDR_LO]], [[SP]], [[OFFSET_LO:-?[0-9]+]]
+; PPC64-DAG: stfd 2, [[OFFSET_HI:-?[0-9]+]]([[SP:[0-9]+]])
+; PPC64-DAG: stfd 1, [[OFFSET_LO:-?[0-9]+]]([[SP]]) 
 ; PPC64-DAG: li [[FLIP_BIT:[0-9]+]], 1
 ; PPC64-DAG: sldi [[FLIP_BIT]], [[FLIP_BIT]], 63
 ; PPC64-DAG: ld [[HI:[0-9]+]], [[OFFSET_LO]]([[SP]])
@ -85,8 +81,7 @@ entry:
 define i128 @test_copysign(ppc_fp128 %x) nounwind  {
 entry:
 ; PPC64-LABEL: test_copysign:
-; PPC64-DAG: stfdx 1, 0, [[ADDR_REG:[0-9]+]]
-; PPC64-DAG: addi [[ADDR_REG]], 1, [[OFFSET:-?[0-9]+]]
+; PPC64-DAG: stfd 1, [[OFFSET:-?[0-9]+]](1)
 ; PPC64-DAG: li [[HI_TMP:[0-9]+]], 16399
 ; PPC64-DAG: li [[LO_TMP:[0-9]+]], 3019
 ; PPC64-NOT: BARRIER
--- a/llvm/test/CodeGen/PowerPC/i64-to-float.ll
+++ b/llvm/test/CodeGen/PowerPC/i64-to-float.ll
@ -20,7 +20,7 @@ entry:

 ; CHECK-VSX: @foo
 ; CHECK-VSX: std 3,
-; CHECK-VSX: lfdx [[REG:[0-9]+]],
+; CHECK-VSX: lfd [[REG:[0-9]+]],
 ; CHECK-VSX: fcfids 1, [[REG]]
 ; CHECK-VSX: blr

@ -44,7 +44,7 @@ entry:

 ; CHECK-VSX: @goo
 ; CHECK-VSX: std 3,
-; CHECK-VSX: lfdx [[REG:[0-9]+]],
+; CHECK-VSX: lfd [[REG:[0-9]+]],
 ; CHECK-VSX: xscvsxddp 1, [[REG]]
 ; CHECK-VSX: blr

@ -68,7 +68,7 @@ entry:

 ; CHECK-VSX: @foou
 ; CHECK-VSX: std 3,
-; CHECK-VSX: lfdx [[REG:[0-9]+]],
+; CHECK-VSX: lfd [[REG:[0-9]+]],
 ; CHECK-VSX: fcfidus 1, [[REG]]
 ; CHECK-VSX: blr

@ -92,7 +92,7 @@ entry:

 ; CHECK-VSX: @goou
 ; CHECK-VSX: std 3,
-; CHECK-VSX: lfdx [[REG:[0-9]+]],
+; CHECK-VSX: lfd [[REG:[0-9]+]],
 ; CHECK-VSX: xscvuxddp 1, [[REG]]
 ; CHECK-VSX: blr

--- a/llvm/test/CodeGen/PowerPC/lxv-aligned-stack-slots.ll
+++ b/llvm/test/CodeGen/PowerPC/lxv-aligned-stack-slots.ll
@ -1,4 +1,4 @@
-; RUN: llc -O3 -o - %s | FileCheck %s
+; RUN: llc -O3 -ppc-late-peephole=false -o - %s | FileCheck %s
 target datalayout = "e-m:e-i64:64-n32:64"
 target triple = "powerpc64le-unknown-linux-gnu"

--- a/llvm/test/CodeGen/PowerPC/mcm-12.ll
+++ b/llvm/test/CodeGen/PowerPC/mcm-12.ll
@ -26,8 +26,7 @@ entry:
 ; CHECK-VSX: .quad 4562098671269285104
 ; CHECK-VSX-LABEL: test_double_const:
 ; CHECK-VSX: addis [[REG1:[0-9]+]], 2, [[VAR]]@toc@ha
-; CHECK-VSX: addi [[REG1]], {{[0-9]+}}, [[VAR]]@toc@l
-; CHECK-VSX: lfdx {{[0-9]+}}, 0, [[REG1]]
+; CHECK-VSX: lfd {{[0-9]+}}, [[VAR]]@toc@l({{[0-9]+}}) 

 ; CHECK-P9: [[VAR:[a-z0-9A-Z_.]+]]:
 ; CHECK-P9: .quad 4562098671269285104
--- a/llvm/test/CodeGen/PowerPC/mcm-4.ll
+++ b/llvm/test/CodeGen/PowerPC/mcm-4.ll
@ -33,8 +33,7 @@ entry:
 ; MEDIUM-VSX: .quad 4562098671269285104
 ; MEDIUM-VSX-LABEL: test_double_const:
 ; MEDIUM-VSX: addis [[REG1:[0-9]+]], 2, [[VAR]]@toc@ha
-; MEDIUM-VSX: addi [[REG2:[0-9]+]], [[REG1]], [[VAR]]@toc@l
-; MEDIUM-VSX: lfdx {{[0-9]+}}, 0, [[REG2]]
+; MEDIUM-VSX: lfd {{[0-9]+}}, [[VAR]]@toc@l([[REG1]])

 ; LARGE: [[VAR:[a-z0-9A-Z_.]+]]:
 ; LARGE: .quad 4562098671269285104
--- a/llvm/test/CodeGen/PowerPC/ppc64-align-long-double.ll
+++ b/llvm/test/CodeGen/PowerPC/ppc64-align-long-double.ll
@ -44,10 +44,8 @@ entry:
 ; CHECK-VSX-DAG: std 3, 48(1)
 ; CHECK-VSX-DAG: std 5, -16(1)
 ; CHECK-VSX-DAG: std 6, -8(1)
-; CHECK-VSX-DAG: addi [[REG1:[0-9]+]], 1, -16
-; CHECK-VSX-DAG: addi 3, 1, -8
-; CHECK-VSX: lfdx 1, 0, [[REG1]]
-; CHECK-VSX: lfdx 2, 0, 3
+; CHECK-VSX: lfd 1, -16(1)
+; CHECK-VSX: lfd 2, -8(1)

 ; FIXME-VSX: addi 4, 1, 48
 ; FIXME-VSX: lxsdx 1, 4, 3
--- a/llvm/test/CodeGen/PowerPC/ppc64le-smallarg.ll
+++ b/llvm/test/CodeGen/PowerPC/ppc64le-smallarg.ll
@ -42,8 +42,7 @@ entry:
  ret float %x
 }
 ; CHECK: @callee2
-; CHECK: addi [[TOCREG:[0-9]+]], 1, 136
-; CHECK: lfsx {{[0-9]+}}, {{[0-9]+}}, [[TOCREG]]
+; CHECK: lfs {{[0-9]+}}, 136(1) 
 ; CHECK: blr

 define void @caller2() {
@ -53,8 +52,7 @@ entry:
  ret void
 }
 ; CHECK: @caller2
-; CHECK: addi [[TOCOFF:[0-9]+]], {{[0-9]+}}, 136
-; CHECK: stfsx {{[0-9]+}}, 0, [[TOCOFF]]
+; CHECK: stfs {{[0-9]+}}, 136({{[0-9]+}})
 ; CHECK: bl test2

 declare float @test2(float, float, float, float, float, float, float, float, float, float, float, float, float, float)
--- a/llvm/test/CodeGen/PowerPC/pr25157-peephole.ll
+++ b/llvm/test/CodeGen/PowerPC/pr25157-peephole.ll
@ -57,7 +57,7 @@ L.LB38_2452:
 }

 ; CHECK-LABEL: @aercalc_
-; CHECK: lfsx
+; CHECK: lfs
 ; CHECK: xxspltd
 ; CHECK: stxvd2x
 ; CHECK-NOT: xxswapd
--- a/llvm/test/CodeGen/PowerPC/pr25157.ll
+++ b/llvm/test/CodeGen/PowerPC/pr25157.ll
@ -57,6 +57,6 @@ L.LB38_2452:
 }

 ; CHECK-LABEL: @aercalc_
-; CHECK: lfsx
+; CHECK: lfs
 ; CHECK-P9-LABEL: @aercalc_
 ; CHECK-P9: lfs
--- a/llvm/test/CodeGen/PowerPC/scalar_vector_test_1.ll
+++ b/llvm/test/CodeGen/PowerPC/scalar_vector_test_1.ll
@ -163,16 +163,14 @@ define <2 x double> @s2v_test_f2(double* nocapture readonly %f64, <2 x double> %

 ; P8LE-LABEL: s2v_test_f2:
 ; P8LE:       # %bb.0: # %entry
-; P8LE-NEXT:    addi r3, r3, 8
-; P8LE-NEXT:    lfdx f0, 0, r3
+; P8LE-NEXT:    lfd f0, 8(r3)
 ; P8LE-NEXT:    xxspltd vs0, vs0, 0
 ; P8LE-NEXT:    xxpermdi v2, v2, vs0, 1
 ; P8LE-NEXT:    blr

 ; P8BE-LABEL: s2v_test_f2:
 ; P8BE:       # %bb.0: # %entry
-; P8BE-NEXT:    addi r3, r3, 8
-; P8BE-NEXT:    lfdx f0, 0, r3
+; P8BE-NEXT:    lfd f0, 8(r3)
 ; P8BE-NEXT:    xxpermdi v2, vs0, v2, 1
 ; P8BE-NEXT:    blr
 entry:
@ -238,16 +236,14 @@ define <2 x double> @s2v_test_f4(double* nocapture readonly %f64, <2 x double> %

 ; P8LE-LABEL: s2v_test_f4:
 ; P8LE:       # %bb.0: # %entry
-; P8LE-NEXT:    addi r3, r3, 8
-; P8LE-NEXT:    lfdx f0, 0, r3
+; P8LE-NEXT:    lfd f0, 8(r3)
 ; P8LE-NEXT:    xxspltd vs0, vs0, 0
 ; P8LE-NEXT:    xxpermdi v2, v2, vs0, 1
 ; P8LE-NEXT:    blr

 ; P8BE-LABEL: s2v_test_f4:
 ; P8BE:       # %bb.0: # %entry
-; P8BE-NEXT:    addi r3, r3, 8
-; P8BE-NEXT:    lfdx f0, 0, r3
+; P8BE-NEXT:    lfd f0, 8(r3)
 ; P8BE-NEXT:    xxpermdi v2, vs0, v2, 1
 ; P8BE-NEXT:    blr
 entry:
--- a/llvm/test/CodeGen/PowerPC/select_const.ll
+++ b/llvm/test/CodeGen/PowerPC/select_const.ll
@ -992,13 +992,11 @@ define double @sel_constants_frem_constant(i1 %cond) {
 ; ALL-NEXT:    bc 12, 1, .LBB48_2
 ; ALL-NEXT:  # %bb.1:
 ; ALL-NEXT:    addis 3, 2, .LCPI48_0@toc@ha
-; ALL-NEXT:    addi 3, 3, .LCPI48_0@toc@l
-; ALL-NEXT:    lfdx 1, 0, 3
+; ALL-NEXT:    lfd 1, .LCPI48_0@toc@l(3)
 ; ALL-NEXT:    blr
 ; ALL-NEXT:  .LBB48_2:
 ; ALL-NEXT:    addis 3, 2, .LCPI48_1@toc@ha
-; ALL-NEXT:    addi 3, 3, .LCPI48_1@toc@l
-; ALL-NEXT:    lfsx 1, 0, 3
+; ALL-NEXT:    lfs 1, .LCPI48_1@toc@l(3)
 ; ALL-NEXT:    blr
  %sel = select i1 %cond, double -4.0, double 23.3
  %bo = frem double %sel, 5.1
--- a/llvm/test/CodeGen/PowerPC/toc-float.ll
+++ b/llvm/test/CodeGen/PowerPC/toc-float.ll
@ -1,24 +1,29 @@
-; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr9 <%s | FileCheck %s
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr9 <%s | FileCheck -check-prefix=CHECK-P9 %s
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 <%s | FileCheck -check-prefix=CHECK-P8 %s

 ; As the constant could be represented as float, a float is
 ; loaded from constant pool.
 define double @doubleConstant1() {
  ret double 1.400000e+01
-}

 ; CHECK-LABEL: doubleConstant1:
-; CHECK: addis [[REG1:[0-9]+]], 2, [[VAR:[a-z0-9A-Z_.]+]]@toc@ha
-; CHECK: lfs {{[0-9]+}}, [[VAR]]@toc@l([[REG1]])
+; CHECK-P9: addis [[REG1:[0-9]+]], 2, [[VAR:[a-z0-9A-Z_.]+]]@toc@ha
+; CHECK-P9: lfs {{[0-9]+}}, [[VAR]]@toc@l([[REG1]])
+; CHECK-P8: addis [[REG1:[0-9]+]], 2, [[VAR:[a-z0-9A-Z_.]+]]@toc@ha
+; CHECK-P8: lfs {{[0-9]+}}, [[VAR]]@toc@l([[REG1]])
+}

 ; As the constant couldn't be represented as float, a double is
 ; loaded from constant pool.
 define double @doubleConstant2() {
  ret double 2.408904e+01
-}

 ; CHECK-LABEL: doubleConstant2:
-; CHECK: addis [[REG1:[0-9]+]], 2, [[VAR:[a-z0-9A-Z_.]+]]@toc@ha
-; CHECK: lfd {{[0-9]+}}, [[VAR]]@toc@l([[REG1]])
+; CHECK-P9: addis [[REG1:[0-9]+]], 2, [[VAR:[a-z0-9A-Z_.]+]]@toc@ha
+; CHECK-P9: lfd {{[0-9]+}}, [[VAR]]@toc@l([[REG1]])
+; CHECK-P8: addis [[REG1:[0-9]+]], 2, [[VAR:[a-z0-9A-Z_.]+]]@toc@ha
+; CHECK-P8: lfd {{[0-9]+}}, [[VAR]]@toc@l([[REG1]])
+}

@FArr = hidden local_unnamed_addr global [10 x float] zeroinitializer, align 4

@ -26,19 +31,24 @@ define float @floatConstantArray() local_unnamed_addr  {
  %1 = load float, float* getelementptr inbounds ([10 x float], [10 x float]* @FArr, i64 0, i64 3), align 4
  %2 = fadd float %1, 0x400B333340000000
  ret float %2
-}

 ; CHECK-LABEL: floatConstantArray 
-; CHECK: addis [[REG1:[0-9]+]], 2, [[VAR:[a-z0-9A-Z_.]+]]@toc@ha+[[REG2:[0-9]+]]
-; CHECK: lfs {{[0-9]+}}, [[VAR]]@toc@l+[[REG2]]([[REG1]])
+; CHECK-P9: addis [[REG1:[0-9]+]], 2, [[VAR:[a-z0-9A-Z_.]+]]@toc@ha+[[REG2:[0-9]+]]
+; CHECK-P9: lfs {{[0-9]+}}, [[VAR]]@toc@l+[[REG2]]([[REG1]])
+; CHECK-P8: addis [[REG1:[0-9]+]], 2, [[VAR:[a-z0-9A-Z_.]+]]@toc@ha
+; CHECK-P8: addi [[REG2:[0-9]+]], [[REG1]], [[VAR]]@toc@l
+; CHECK-P8: lfs {{[0-9]+}}, 12([[REG2]])
+}

 define float @floatConstant() {
  ret float 0x400470A3E0000000
-}

 ; CHECK-LABEL: floatConstant:
-; CHECK: addis [[REG1:[0-9]+]], 2, [[VAR:[a-z0-9A-Z_.]+]]@toc@ha
-; CHECK: lfs {{[0-9]+}}, [[VAR]]@toc@l([[REG1]])
+; CHECK-P9: addis [[REG1:[0-9]+]], 2, [[VAR:[a-z0-9A-Z_.]+]]@toc@ha
+; CHECK-P9: lfs {{[0-9]+}}, [[VAR]]@toc@l([[REG1]])
+; CHECK-P8: addis [[REG1:[0-9]+]], 2, [[VAR:[a-z0-9A-Z_.]+]]@toc@ha
+; CHECK-P8: lfs {{[0-9]+}}, [[VAR]]@toc@l([[REG1]])
+}

 ; llvm put the hidden globals into the TOC table.
 ; TODO - do some analysis and decide which globals could be put into TOC.
@ -48,11 +58,14 @@ define double @doubleConstantArray()  {
  %1 = load double, double* getelementptr inbounds ([200 x double], [200 x double]* @d, i64 0, i64 3), align 8
  %2 = fadd double %1, 6.880000e+00
  ret double %2
-}

 ; CHECK-LABEL: doubleConstantArray
-; CHECK: addis [[REG1:[0-9]+]], 2, [[VAR:[a-z0-9A-Z_.]+]]@toc@ha+[[REG2:[0-9]+]]
-; CHECK: lfd {{[0-9]+}}, [[VAR]]@toc@l+[[REG2]]([[REG1]])
+; CHECK-P9: addis [[REG1:[0-9]+]], 2, [[VAR:[a-z0-9A-Z_.]+]]@toc@ha+[[REG2:[0-9]+]]
+; CHECK-P9: lfd {{[0-9]+}}, [[VAR]]@toc@l+[[REG2]]([[REG1]])
+; CHECK-P8: addis [[REG1:[0-9]+]], 2, [[VAR:[a-z0-9A-Z_.]+]]@toc@ha
+; CHECK-P8: addi [[REG2:[0-9]+]], [[REG1]], [[VAR]]@toc@l
+; CHECK-P8: lfd {{[0-9]+}}, 24([[REG2]])
+}

@arr = hidden local_unnamed_addr global [20000 x double] zeroinitializer, align 8

@ -60,12 +73,34 @@ define double @doubleLargeConstantArray()  {
  %1 = load double, double* getelementptr inbounds ([20000 x double], [20000 x double]* @arr, i64 0, i64 4096), align 8
  %2 = fadd double %1, 6.880000e+00
  ret double %2
+
+; Access an element with an offset that doesn't fit in the displacement field of LFD. 
+; CHECK-LABEL: doubleLargeConstantArray
+; CHECK-P9: addis [[REG1:[0-9]+]], 2, [[VAR:[a-z0-9A-Z_.]+]]@toc@ha
+; CHECK-P9: li [[REG2:[0-9]+]], 0 
+; CHECK-P9: addi [[REG3:[0-9]+]], [[REG1]], [[VAR:[a-z0-9A-Z_.]+]]@toc@l
+; CHECK-P9: ori [[REG4:[0-9]+]], [[REG2]], 32768 
+; CHECK-P9: lfdx {{[0-9]+}}, [[REG3]], [[REG4]] 
+; CHECK-P8: addis [[REG1:[0-9]+]], 2, [[VAR:[a-z0-9A-Z_.]+]]@toc@ha
+; CHECK-P8: li [[REG2:[0-9]+]], 0 
+; CHECK-P8: addi [[REG3:[0-9]+]], [[REG1]], [[VAR:[a-z0-9A-Z_.]+]]@toc@l
+; CHECK-P8: ori [[REG4:[0-9]+]], [[REG2]], 32768 
+; CHECK-P8: lfdx {{[0-9]+}}, [[REG3]], [[REG4]] 
 }

-; access element that out of range
-; CHECK-LABEL: doubleLargeConstantArray
-; CHECK: addis [[REG1:[0-9]+]], 2, [[VAR:[a-z0-9A-Z_.]+]]@toc@ha
-; CHECK: li [[REG2:[0-9]+]], 0 
-; CHECK: addi [[REG3:[0-9]+]], [[REG1]], [[VAR:[a-z0-9A-Z_.]+]]@toc@l
-; CHECK: ori [[REG4:[0-9]+]], [[REG2]], 32768 
-; CHECK: lfdx {{[0-9]+}}, [[REG3]], [[REG4]] 
+@vec_arr = global [10 x <4 x i32>] zeroinitializer, align 16
+
+define <4 x i32> @vectorArray() #0 {
+entry:
+  %0 = load <4 x i32>, <4 x i32>* getelementptr inbounds ([10 x <4 x i32>], [10 x <4 x i32>]* @vec_arr, i64 0, i64 2), align 16
+  ret <4 x i32> %0
+
+; CHECK-LABEL: vectorArray
+; CHECK-P9: addis [[REG1:[0-9]+]], 2, [[VAR:[a-z0-9A-Z_.]+]]@toc@ha
+; CHECK-P9: ld [[REG2:[0-9]+]], [[VAR]]@toc@l([[REG1]])
+; CHECK-P9: lxv {{[0-9]+}}, 32([[REG2]])
+; CHECK-P8: addis [[REG1:[0-9]+]], 2, [[VAR:[a-z0-9A-Z_.]+]]@toc@ha
+; CHECK-P8: ld [[REG2:[0-9]+]], [[VAR]]@toc@l([[REG1]])
+; CHECK-P8: addi [[REG3:[0-9]+]], [[REG2]], 32
+; CHECK-P8: lvx {{[0-9]+}}, 0, [[REG3]]
+}
--- a/llvm/test/CodeGen/PowerPC/vsx_scalar_ld_st.ll
+++ b/llvm/test/CodeGen/PowerPC/vsx_scalar_ld_st.ll
@ -124,7 +124,7 @@ entry:
  ret void
 ; CHECK-LABEL: @dblToFloat
 ; CHECK: lfdx [[REGLD5:[0-9]+]],
-; CHECK: stfsx [[REGLD5]],
+; CHECK: stfs [[REGLD5]],
 ; CHECK-P9-LABEL: @dblToFloat
 ; CHECK-P9: lfd [[REGLD5:[0-9]+]],
 ; CHECK-P9: stfs [[REGLD5]],
@ -140,7 +140,7 @@ entry:
  ret void
 ; CHECK-LABEL: @floatToDbl
 ; CHECK: lfsx [[REGLD5:[0-9]+]],
-; CHECK: stfdx [[REGLD5]],
+; CHECK: stfd [[REGLD5]],
 ; CHECK-P9-LABEL: @floatToDbl
 ; CHECK-P9: lfs [[REGLD5:[0-9]+]],
 ; CHECK-P9: stfd [[REGLD5]],