Reapply the new LoopStrengthReduction code, with compile time and

bug fixes, and with improved heuristics for analyzing foreign-loop addrecs. This change also flattens IVUsers, eliminating the stride-oriented groupings, which makes it easier to work with. llvm-svn: 95975
2010-02-12 10:34:29 +00:00 · 2010-02-12 10:34:29 +00:00 · 45774ce0ad
parent c7ef4cc9fc
commit 45774ce0ad
35 changed files with 3523 additions and 2689 deletions
--- a/llvm/include/llvm/Analysis/IVUsers.h
+++ b/llvm/include/llvm/Analysis/IVUsers.h
@ -16,29 +16,27 @@
 #define LLVM_ANALYSIS_IVUSERS_H

 #include "llvm/Analysis/LoopPass.h"
-#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/ADT/SmallVector.h"
-#include <map>
+#include "llvm/Support/ValueHandle.h"

 namespace llvm {

 class DominatorTree;
 class Instruction;
 class Value;
-struct IVUsersOfOneStride;
+class IVUsers;
+class ScalarEvolution;
+class SCEV;

-/// IVStrideUse - Keep track of one use of a strided induction variable, where
-/// the stride is stored externally.  The Offset member keeps track of the
-/// offset from the IV, User is the actual user of the operand, and
-/// 'OperandValToReplace' is the operand of the User that is the use.
+/// IVStrideUse - Keep track of one use of a strided induction variable.
+/// The Expr member keeps track of the expression, User is the actual user
+/// instruction of the operand, and 'OperandValToReplace' is the operand of
+/// the User that is the use.
 class IVStrideUse : public CallbackVH, public ilist_node<IVStrideUse> {
 public:
-  IVStrideUse(IVUsersOfOneStride *parent,
-              const SCEV *offset,
+  IVStrideUse(IVUsers *P, const SCEV *S, const SCEV *Off,
              Instruction* U, Value *O)
-    : CallbackVH(U), Parent(parent), Offset(offset),
-      OperandValToReplace(O),
-      IsUseOfPostIncrementedValue(false) {
+    : CallbackVH(U), Parent(P), Stride(S), Offset(Off),
+      OperandValToReplace(O), IsUseOfPostIncrementedValue(false) {
  }

  /// getUser - Return the user instruction for this use.
@ -51,9 +49,17 @@ public:
    setValPtr(NewUser);
  }

-  /// getParent - Return a pointer to the IVUsersOfOneStride that owns
+  /// getParent - Return a pointer to the IVUsers that owns
  /// this IVStrideUse.
-  IVUsersOfOneStride *getParent() const { return Parent; }
+  IVUsers *getParent() const { return Parent; }
+
+  /// getStride - Return the expression for the stride for the use.
+  const SCEV *getStride() const { return Stride; }
+
+  /// setStride - Assign a new stride to this use.
+  void setStride(const SCEV *Val) {
+    Stride = Val;
+  }

  /// getOffset - Return the offset to add to a theoeretical induction
  /// variable that starts at zero and counts up by the stride to compute
@ -92,8 +98,11 @@ public:
  }

 private:
-  /// Parent - a pointer to the IVUsersOfOneStride that owns this IVStrideUse.
-  IVUsersOfOneStride *Parent;
+  /// Parent - a pointer to the IVUsers that owns this IVStrideUse.
+  IVUsers *Parent;
+
+  /// Stride - The stride for this use.
+  const SCEV *Stride;

  /// Offset - The offset to add to the base induction expression.
  const SCEV *Offset;
@ -138,42 +147,8 @@ private:
  mutable ilist_node<IVStrideUse> Sentinel;
 };

-/// IVUsersOfOneStride - This structure keeps track of all instructions that
-/// have an operand that is based on the trip count multiplied by some stride.
-struct IVUsersOfOneStride : public ilist_node<IVUsersOfOneStride> {
-private:
-  IVUsersOfOneStride(const IVUsersOfOneStride &I); // do not implement
-  void operator=(const IVUsersOfOneStride &I);     // do not implement
-
-public:
-  IVUsersOfOneStride() : Stride(0) {}
-
-  explicit IVUsersOfOneStride(const SCEV *stride) : Stride(stride) {}
-
-  /// Stride - The stride for all the contained IVStrideUses. This is
-  /// a constant for affine strides.
-  const SCEV *Stride;
-
-  /// Users - Keep track of all of the users of this stride as well as the
-  /// initial value and the operand that uses the IV.
-  ilist<IVStrideUse> Users;
-
-  void addUser(const SCEV *Offset, Instruction *User, Value *Operand) {
-    Users.push_back(new IVStrideUse(this, Offset, User, Operand));
-  }
-
-  void removeUser(IVStrideUse *User) {
-    Users.erase(User);
-  }
-
-  void print(raw_ostream &OS) const;
-
-  /// dump - This method is used for debugging.
-  void dump() const;
-};
-
 class IVUsers : public LoopPass {
-  friend class IVStrideUserVH;
+  friend class IVStrideUse;
  Loop *L;
  LoopInfo *LI;
  DominatorTree *DT;
@ -182,19 +157,8 @@ class IVUsers : public LoopPass {

  /// IVUses - A list of all tracked IV uses of induction variable expressions
  /// we are interested in.
-  ilist<IVUsersOfOneStride> IVUses;
+  ilist<IVStrideUse> IVUses;

-public:
-  /// IVUsesByStride - A mapping from the strides in StrideOrder to the
-  /// uses in IVUses.
-  std::map<const SCEV *, IVUsersOfOneStride*> IVUsesByStride;
-
-  /// StrideOrder - An ordering of the keys in IVUsesByStride that is stable:
-  /// We use this to iterate over the IVUsesByStride collection without being
-  /// dependent on random ordering of pointers in the process.
-  SmallVector<const SCEV *, 16> StrideOrder;
-
-private:
  virtual void getAnalysisUsage(AnalysisUsage &AU) const;

  virtual bool runOnLoop(Loop *L, LPPassManager &LPM);
@ -210,7 +174,7 @@ public:
  /// return true.  Otherwise, return false.
  bool AddUsersIfInteresting(Instruction *I);

-  void AddUser(const SCEV *Stride, const SCEV *Offset,
+  IVStrideUse &AddUser(const SCEV *Stride, const SCEV *Offset,
                       Instruction *User, Value *Operand);

  /// getReplacementExpr - Return a SCEV expression which computes the
@ -222,6 +186,14 @@ public:
  /// isUseOfPostIncrementedValue flag.
  const SCEV *getCanonicalExpr(const IVStrideUse &U) const;

+  typedef ilist<IVStrideUse>::iterator iterator;
+  typedef ilist<IVStrideUse>::const_iterator const_iterator;
+  iterator begin() { return IVUses.begin(); }
+  iterator end()   { return IVUses.end(); }
+  const_iterator begin() const { return IVUses.begin(); }
+  const_iterator end() const   { return IVUses.end(); }
+  bool empty() const { return IVUses.empty(); }
+
  void print(raw_ostream &OS, const Module* = 0) const;

  /// dump - This method is used for debugging.
--- a/llvm/include/llvm/Analysis/ScalarEvolutionExpander.h
+++ b/llvm/include/llvm/Analysis/ScalarEvolutionExpander.h
@ -27,10 +27,7 @@ namespace llvm {
  /// and destroy it when finished to allow the release of the associated
  /// memory.
  class SCEVExpander : public SCEVVisitor<SCEVExpander, Value*> {
-  public:
    ScalarEvolution &SE;
-
-  private:
    std::map<std::pair<const SCEV *, Instruction *>, AssertingVH<Value> >
      InsertedExpressions;
    std::set<Value*> InsertedValues;
--- a/llvm/lib/Analysis/IVUsers.cpp
+++ b/llvm/lib/Analysis/IVUsers.cpp
@ -36,42 +36,30 @@ Pass *llvm::createIVUsersPass() {
  return new IVUsers();
 }

-/// containsAddRecFromDifferentLoop - Determine whether expression S involves a
-/// subexpression that is an AddRec from a loop other than L.  An outer loop
-/// of L is OK, but not an inner loop nor a disjoint loop.
-static bool containsAddRecFromDifferentLoop(const SCEV *S, Loop *L) {
-  // This is very common, put it first.
-  if (isa<SCEVConstant>(S))
-    return false;
-  if (const SCEVCommutativeExpr *AE = dyn_cast<SCEVCommutativeExpr>(S)) {
-    for (unsigned int i=0; i< AE->getNumOperands(); i++)
-      if (containsAddRecFromDifferentLoop(AE->getOperand(i), L))
-        return true;
-    return false;
+/// CollectSubexprs - Split S into subexpressions which can be pulled out into
+/// separate registers.
+static void CollectSubexprs(const SCEV *S,
+                            SmallVectorImpl<const SCEV *> &Ops,
+                            ScalarEvolution &SE) {
+  if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
+    // Break out add operands.
+    for (SCEVAddExpr::op_iterator I = Add->op_begin(), E = Add->op_end();
+         I != E; ++I)
+      CollectSubexprs(*I, Ops, SE);
+    return;
+  } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
+    // Split a non-zero base out of an addrec.
+    if (!AR->getStart()->isZero()) {
+      CollectSubexprs(AR->getStart(), Ops, SE);
+      CollectSubexprs(SE.getAddRecExpr(SE.getIntegerSCEV(0, AR->getType()),
+                                       AR->getStepRecurrence(SE),
+                                       AR->getLoop()), Ops, SE);
+      return;
    }
-  if (const SCEVAddRecExpr *AE = dyn_cast<SCEVAddRecExpr>(S)) {
-    if (const Loop *newLoop = AE->getLoop()) {
-      if (newLoop == L)
-        return false;
-      // if newLoop is an outer loop of L, this is OK.
-      if (newLoop->contains(L))
-        return false;
  }
-    return true;
-  }
-  if (const SCEVUDivExpr *DE = dyn_cast<SCEVUDivExpr>(S))
-    return containsAddRecFromDifferentLoop(DE->getLHS(), L) ||
-           containsAddRecFromDifferentLoop(DE->getRHS(), L);
-#if 0
-  // SCEVSDivExpr has been backed out temporarily, but will be back; we'll
-  // need this when it is.
-  if (const SCEVSDivExpr *DE = dyn_cast<SCEVSDivExpr>(S))
-    return containsAddRecFromDifferentLoop(DE->getLHS(), L) ||
-           containsAddRecFromDifferentLoop(DE->getRHS(), L);
-#endif
-  if (const SCEVCastExpr *CE = dyn_cast<SCEVCastExpr>(S))
-    return containsAddRecFromDifferentLoop(CE->getOperand(), L);
-  return false;
+
+  // Otherwise use the value itself.
+  Ops.push_back(S);
 }

 /// getSCEVStartAndStride - Compute the start and stride of this expression,
@ -90,35 +78,42 @@ static bool getSCEVStartAndStride(const SCEV *&SH, Loop *L, Loop *UseLoop,
  if (const SCEVAddExpr *AE = dyn_cast<SCEVAddExpr>(SH)) {
    for (unsigned i = 0, e = AE->getNumOperands(); i != e; ++i)
      if (const SCEVAddRecExpr *AddRec =
-             dyn_cast<SCEVAddRecExpr>(AE->getOperand(i))) {
-        if (AddRec->getLoop() == L)
+             dyn_cast<SCEVAddRecExpr>(AE->getOperand(i)))
        TheAddRec = SE->getAddExpr(AddRec, TheAddRec);
      else
-          return false;  // Nested IV of some sort?
-      } else {
        Start = SE->getAddExpr(Start, AE->getOperand(i));
-      }
  } else if (isa<SCEVAddRecExpr>(SH)) {
    TheAddRec = SH;
  } else {
    return false;  // not analyzable.
  }

-  const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(TheAddRec);
-  if (!AddRec || AddRec->getLoop() != L) return false;
+  // Break down TheAddRec into its component parts.
+  SmallVector<const SCEV *, 4> Subexprs;
+  CollectSubexprs(TheAddRec, Subexprs, *SE);
+
+  // Look for an addrec on the current loop among the parts.
+  const SCEV *AddRecStride = 0;
+  for (SmallVectorImpl<const SCEV *>::iterator I = Subexprs.begin(),
+       E = Subexprs.end(); I != E; ++I) {
+    const SCEV *S = *I;
+    if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S))
+      if (AR->getLoop() == L) {
+        *I = AR->getStart();
+        AddRecStride = AR->getStepRecurrence(*SE);
+        break;
+      }
+  }
+  if (!AddRecStride)
+    return false;
+
+  // Add up everything else into a start value (which may not be
+  // loop-invariant).
+  const SCEV *AddRecStart = SE->getAddExpr(Subexprs);

  // Use getSCEVAtScope to attempt to simplify other loops out of
  // the picture.
-  const SCEV *AddRecStart = AddRec->getStart();
  AddRecStart = SE->getSCEVAtScope(AddRecStart, UseLoop);
-  const SCEV *AddRecStride = AddRec->getStepRecurrence(*SE);
-
-  // FIXME: If Start contains an SCEVAddRecExpr from a different loop, other
-  // than an outer loop of the current loop, reject it.  LSR has no concept of
-  // operating on more than one loop at a time so don't confuse it with such
-  // expressions.
-  if (containsAddRecFromDifferentLoop(AddRecStart, L))
-    return false;

  Start = SE->getAddExpr(Start, AddRecStart);

@ -131,7 +126,7 @@ static bool getSCEVStartAndStride(const SCEV *&SH, Loop *L, Loop *UseLoop,

    DEBUG(dbgs() << "[";
          WriteAsOperand(dbgs(), L->getHeader(), /*PrintType=*/false);
-          dbgs() << "] Variable stride: " << *AddRec << "\n");
+          dbgs() << "] Variable stride: " << *AddRecStride << "\n");
  }

  Stride = AddRecStride;
@ -247,14 +242,6 @@ bool IVUsers::AddUsersIfInteresting(Instruction *I) {
    }

    if (AddUserToIVUsers) {
-      IVUsersOfOneStride *StrideUses = IVUsesByStride[Stride];
-      if (!StrideUses) {    // First occurrence of this stride?
-        StrideOrder.push_back(Stride);
-        StrideUses = new IVUsersOfOneStride(Stride);
-        IVUses.push_back(StrideUses);
-        IVUsesByStride[Stride] = StrideUses;
-      }
-
      // Okay, we found a user that we cannot reduce.  Analyze the instruction
      // and decide what to do with it.  If we are a use inside of the loop, use
      // the value before incrementation, otherwise use it after incrementation.
@ -262,27 +249,21 @@ bool IVUsers::AddUsersIfInteresting(Instruction *I) {
        // The value used will be incremented by the stride more than we are
        // expecting, so subtract this off.
        const SCEV *NewStart = SE->getMinusSCEV(Start, Stride);
-        StrideUses->addUser(NewStart, User, I);
-        StrideUses->Users.back().setIsUseOfPostIncrementedValue(true);
+        IVUses.push_back(new IVStrideUse(this, Stride, NewStart, User, I));
+        IVUses.back().setIsUseOfPostIncrementedValue(true);
        DEBUG(dbgs() << "   USING POSTINC SCEV, START=" << *NewStart<< "\n");
      } else {
-        StrideUses->addUser(Start, User, I);
+        IVUses.push_back(new IVStrideUse(this, Stride, Start, User, I));
      }
    }
  }
  return true;
 }

-void IVUsers::AddUser(const SCEV *Stride, const SCEV *Offset,
+IVStrideUse &IVUsers::AddUser(const SCEV *Stride, const SCEV *Offset,
                              Instruction *User, Value *Operand) {
-  IVUsersOfOneStride *StrideUses = IVUsesByStride[Stride];
-  if (!StrideUses) {    // First occurrence of this stride?
-    StrideOrder.push_back(Stride);
-    StrideUses = new IVUsersOfOneStride(Stride);
-    IVUses.push_back(StrideUses);
-    IVUsesByStride[Stride] = StrideUses;
-  }
-  IVUsesByStride[Stride]->addUser(Offset, User, Operand);
+  IVUses.push_back(new IVStrideUse(this, Stride, Offset, User, Operand));
+  return IVUses.back();
 }

 IVUsers::IVUsers()
@ -316,15 +297,15 @@ bool IVUsers::runOnLoop(Loop *l, LPPassManager &LPM) {
 /// value of the OperandValToReplace of the given IVStrideUse.
 const SCEV *IVUsers::getReplacementExpr(const IVStrideUse &U) const {
  // Start with zero.
-  const SCEV *RetVal = SE->getIntegerSCEV(0, U.getParent()->Stride->getType());
+  const SCEV *RetVal = SE->getIntegerSCEV(0, U.getStride()->getType());
  // Create the basic add recurrence.
-  RetVal = SE->getAddRecExpr(RetVal, U.getParent()->Stride, L);
+  RetVal = SE->getAddRecExpr(RetVal, U.getStride(), L);
  // Add the offset in a separate step, because it may be loop-variant.
  RetVal = SE->getAddExpr(RetVal, U.getOffset());
  // For uses of post-incremented values, add an extra stride to compute
  // the actual replacement value.
  if (U.isUseOfPostIncrementedValue())
-    RetVal = SE->getAddExpr(RetVal, U.getParent()->Stride);
+    RetVal = SE->getAddExpr(RetVal, U.getStride());
  return RetVal;
 }

@ -333,9 +314,9 @@ const SCEV *IVUsers::getReplacementExpr(const IVStrideUse &U) const {
 /// isUseOfPostIncrementedValue flag.
 const SCEV *IVUsers::getCanonicalExpr(const IVStrideUse &U) const {
  // Start with zero.
-  const SCEV *RetVal = SE->getIntegerSCEV(0, U.getParent()->Stride->getType());
+  const SCEV *RetVal = SE->getIntegerSCEV(0, U.getStride()->getType());
  // Create the basic add recurrence.
-  RetVal = SE->getAddRecExpr(RetVal, U.getParent()->Stride, L);
+  RetVal = SE->getAddRecExpr(RetVal, U.getStride(), L);
  // Add the offset in a separate step, because it may be loop-variant.
  RetVal = SE->getAddExpr(RetVal, U.getOffset());
  return RetVal;
@ -358,18 +339,12 @@ void IVUsers::print(raw_ostream &OS, const Module *M) const {
  OS << ":\n";

  IVUsersAsmAnnotator Annotator;
-  for (unsigned Stride = 0, e = StrideOrder.size(); Stride != e; ++Stride) {
-    std::map<const SCEV *, IVUsersOfOneStride*>::const_iterator SI =
-      IVUsesByStride.find(StrideOrder[Stride]);
-    assert(SI != IVUsesByStride.end() && "Stride doesn't exist!");
-    OS << "  Stride " << *SI->first->getType() << " " << *SI->first << ":\n";
-
-    for (ilist<IVStrideUse>::const_iterator UI = SI->second->Users.begin(),
-         E = SI->second->Users.end(); UI != E; ++UI) {
+  for (ilist<IVStrideUse>::const_iterator UI = IVUses.begin(),
+       E = IVUses.end(); UI != E; ++UI) {
    OS << "  ";
    WriteAsOperand(OS, UI->getOperandValToReplace(), false);
-      OS << " = ";
-      OS << *getReplacementExpr(*UI);
+    OS << " = "
+       << *getReplacementExpr(*UI);
    if (UI->isUseOfPostIncrementedValue())
      OS << " (post-inc)";
    OS << " in  ";
@ -377,44 +352,18 @@ void IVUsers::print(raw_ostream &OS, const Module *M) const {
    OS << '\n';
  }
 }
-}

 void IVUsers::dump() const {
  print(dbgs());
 }

 void IVUsers::releaseMemory() {
-  IVUsesByStride.clear();
-  StrideOrder.clear();
  Processed.clear();
  IVUses.clear();
 }

 void IVStrideUse::deleted() {
  // Remove this user from the list.
-  Parent->Users.erase(this);
+  Parent->IVUses.erase(this);
  // this now dangles!
 }
-
-void IVUsersOfOneStride::print(raw_ostream &OS) const {
-  OS << "IV Users of one stride:\n";
-
-  if (Stride)
-    OS << "    Stride: " << *Stride << '\n';
-
-  OS << "    Users:\n";
-
-  unsigned Count = 1;
-
-  for (ilist<IVStrideUse>::const_iterator
-         I = Users.begin(), E = Users.end(); I != E; ++I) {
-    const IVStrideUse &SU = *I;
-    OS << "      " << Count++ << '\n';
-    OS << "        Offset: " << *SU.getOffset() << '\n';
-    OS << "         Instr: " << *SU << '\n';
-  }
-}
-
-void IVUsersOfOneStride::dump() const {
-  print(dbgs());
-}
--- a/llvm/lib/Analysis/ScalarEvolutionExpander.cpp
+++ b/llvm/lib/Analysis/ScalarEvolutionExpander.cpp
@ -641,8 +641,24 @@ SCEVExpander::getAddRecExprPHILiterally(const SCEVAddRecExpr *Normalized,
  // Reuse a previously-inserted PHI, if present.
  for (BasicBlock::iterator I = L->getHeader()->begin();
       PHINode *PN = dyn_cast<PHINode>(I); ++I)
-    if (isInsertedInstruction(PN) && SE.getSCEV(PN) == Normalized)
+    if (SE.isSCEVable(PN->getType()) &&
+        (SE.getEffectiveSCEVType(PN->getType()) ==
+         SE.getEffectiveSCEVType(Normalized->getType())) &&
+        SE.getSCEV(PN) == Normalized)
+      if (BasicBlock *LatchBlock = L->getLoopLatch()) {
+        // Remember this PHI, even in post-inc mode.
+        InsertedValues.insert(PN);
+        // Remember the increment.
+        Instruction *IncV =
+          cast<Instruction>(PN->getIncomingValueForBlock(LatchBlock)
+                                  ->stripPointerCasts());
+        rememberInstruction(IncV);
+        // Make sure the increment is where we want it. But don't move it
+        // down past a potential existing post-inc user.
+        if (L == IVIncInsertLoop && !SE.DT->dominates(IncV, IVIncInsertPos))
+          IncV->moveBefore(IVIncInsertPos);
        return PN;
+      }

  // Save the original insertion point so we can restore it when we're done.
  BasicBlock *SaveInsertBB = Builder.GetInsertBlock();
--- a/llvm/lib/CodeGen/LLVMTargetMachine.cpp
+++ b/llvm/lib/CodeGen/LLVMTargetMachine.cpp
@ -14,6 +14,7 @@
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/PassManager.h"
 #include "llvm/Pass.h"
+#include "llvm/Analysis/Verifier.h"
 #include "llvm/Assembly/PrintModulePass.h"
 #include "llvm/CodeGen/AsmPrinter.h"
 #include "llvm/CodeGen/Passes.h"
@ -234,6 +235,9 @@ bool LLVMTargetMachine::addCommonCodeGenPasses(PassManagerBase &PM,
    PM.add(createLoopStrengthReducePass(getTargetLowering()));
    if (PrintLSR)
      PM.add(createPrintFunctionPass("\n\n*** Code after LSR ***\n", &dbgs()));
+#ifndef NDEBUG
+    PM.add(createVerifierPass());
+#endif
  }

  // Turn exception handling constructs into something the code generators can
--- a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
+++ b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
@ -364,19 +364,13 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) {
    if (ExitingBlock)
      NeedCannIV = true;
  }
-  for (unsigned i = 0, e = IU->StrideOrder.size(); i != e; ++i) {
-    const SCEV *Stride = IU->StrideOrder[i];
-    const Type *Ty = SE->getEffectiveSCEVType(Stride->getType());
+  for (IVUsers::const_iterator I = IU->begin(), E = IU->end(); I != E; ++I) {
+    const Type *Ty =
+      SE->getEffectiveSCEVType(I->getOperandValToReplace()->getType());
    if (!LargestType ||
        SE->getTypeSizeInBits(Ty) >
          SE->getTypeSizeInBits(LargestType))
      LargestType = Ty;
-
-    std::map<const SCEV *, IVUsersOfOneStride *>::iterator SI =
-      IU->IVUsesByStride.find(IU->StrideOrder[i]);
-    assert(SI != IU->IVUsesByStride.end() && "Stride doesn't exist!");
-
-    if (!SI->second->Users.empty())
    NeedCannIV = true;
  }

@ -455,15 +449,8 @@ void IndVarSimplify::RewriteIVExpressions(Loop *L, const Type *LargestType,
  // add the offsets to the primary induction variable and cast, avoiding
  // the need for the code evaluation methods to insert induction variables
  // of different sizes.
-  for (unsigned i = 0, e = IU->StrideOrder.size(); i != e; ++i) {
-    const SCEV *Stride = IU->StrideOrder[i];
-
-    std::map<const SCEV *, IVUsersOfOneStride *>::iterator SI =
-      IU->IVUsesByStride.find(IU->StrideOrder[i]);
-    assert(SI != IU->IVUsesByStride.end() && "Stride doesn't exist!");
-    ilist<IVStrideUse> &List = SI->second->Users;
-    for (ilist<IVStrideUse>::iterator UI = List.begin(),
-         E = List.end(); UI != E; ++UI) {
+  for (IVUsers::iterator UI = IU->begin(), E = IU->end(); UI != E; ++UI) {
+    const SCEV *Stride = UI->getStride();
    Value *Op = UI->getOperandValToReplace();
    const Type *UseTy = Op->getType();
    Instruction *User = UI->getUser();
@ -521,7 +508,6 @@ void IndVarSimplify::RewriteIVExpressions(Loop *L, const Type *LargestType,
    // The old value may be dead now.
    DeadInsts.push_back(Op);
  }
-  }

  // Clear the rewriter cache, because values that are in the rewriter's cache
  // can be deleted in the loop below, causing the AssertingVH in the cache to
--- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
--- a/llvm/test/CodeGen/ARM/arm-negative-stride.ll
+++ b/llvm/test/CodeGen/ARM/arm-negative-stride.ll
@ -1,8 +1,11 @@
 ; RUN: llc < %s -march=arm | FileCheck %s

+; This loop is rewritten with an indvar which counts down, which
+; frees up a register from holding the trip count.
+
 define void @test(i32* %P, i32 %A, i32 %i) nounwind {
 entry:
-; CHECK: str r1, [{{r.*}}, -{{r.*}}, lsl #2]
+; CHECK: str r1, [{{r.*}}, +{{r.*}}, lsl #2]
        icmp eq i32 %i, 0               ; <i1>:0 [#uses=1]
        br i1 %0, label %return, label %bb

@ -19,3 +22,26 @@ return:         ; preds = %bb, %entry
        ret void
 }

+; This loop has a non-address use of the count-up indvar, so
+; it'll remain. Now the original store uses a negative-stride address.
+
+define void @test_with_forced_iv(i32* %P, i32 %A, i32 %i) nounwind {
+entry:
+; CHECK: str r1, [{{r.*}}, -{{r.*}}, lsl #2]
+        icmp eq i32 %i, 0               ; <i1>:0 [#uses=1]
+        br i1 %0, label %return, label %bb
+
+bb:             ; preds = %bb, %entry
+        %indvar = phi i32 [ 0, %entry ], [ %indvar.next, %bb ]          ; <i32> [#uses=2]
+        %i_addr.09.0 = sub i32 %i, %indvar              ; <i32> [#uses=1]
+        %tmp2 = getelementptr i32* %P, i32 %i_addr.09.0         ; <i32*> [#uses=1]
+        store i32 %A, i32* %tmp2
+        store i32 %indvar, i32* null
+        %indvar.next = add i32 %indvar, 1               ; <i32> [#uses=2]
+        icmp eq i32 %indvar.next, %i            ; <i1>:1 [#uses=1]
+        br i1 %1, label %return, label %bb
+
+return:         ; preds = %bb, %entry
+        ret void
+}
+
--- a/llvm/test/CodeGen/ARM/lsr-code-insertion.ll
+++ b/llvm/test/CodeGen/ARM/lsr-code-insertion.ll
@ -1,5 +1,5 @@
-; RUN: llc < %s -stats |& grep {40.*Number of machine instrs printed}
-; RUN: llc < %s -stats |& grep {.*Number of re-materialization}
+; RUN: llc < %s -stats |& grep {39.*Number of machine instrs printed}
+; RUN: llc < %s -stats |& not grep {.*Number of re-materialization}
 ; This test really wants to check that the resultant "cond_true" block only 
 ; has a single store in it, and that cond_true55 only has code to materialize 
 ; the constant and do a store.  We do *not* want something like this:
--- a/llvm/test/CodeGen/Thumb2/lsr-deficiency.ll
+++ b/llvm/test/CodeGen/Thumb2/lsr-deficiency.ll
@ -1,25 +1,29 @@
 ; RUN: llc < %s -mtriple=thumbv7-apple-darwin10 -relocation-model=pic | FileCheck %s
 ; rdar://7387640

-; FIXME: We still need to rewrite array reference iv of stride -4 with loop
-; count iv of stride -1.
+; This now reduces to a single induction variable.
+
+; TODO: It still gets a GPR shuffle at the end of the loop
+; This is because something in instruction selection has decided
+; that comparing the pre-incremented value with zero is better
+; than comparing the post-incremented value with -4.

@G = external global i32                          ; <i32*> [#uses=2]
@array = external global i32*                     ; <i32**> [#uses=1]

 define arm_apcscc void @t() nounwind optsize {
 ; CHECK: t:
-; CHECK: mov.w r2, #4000
-; CHECK: movw r3, #1001
+; CHECK: mov.w r2, #1000
 entry:
  %.pre = load i32* @G, align 4                   ; <i32> [#uses=1]
  br label %bb

 bb:                                               ; preds = %bb, %entry
 ; CHECK: LBB1_1:
-; CHECK: subs r3, #1
-; CHECK: cmp r3, #0
-; CHECK: sub.w r2, r2, #4
+; CHECK: cmp r2, #0
+; CHECK: sub.w r9, r2, #1
+; CHECK: mov r2, r9
+
  %0 = phi i32 [ %.pre, %entry ], [ %3, %bb ]     ; <i32> [#uses=1]
  %indvar = phi i32 [ 0, %entry ], [ %indvar.next, %bb ] ; <i32> [#uses=2]
  %tmp5 = sub i32 1000, %indvar                   ; <i32> [#uses=1]
--- a/llvm/test/CodeGen/Thumb2/thumb2-ifcvt1.ll
+++ b/llvm/test/CodeGen/Thumb2/thumb2-ifcvt1.ll
@ -1,6 +1,6 @@
 ; RUN: llc < %s -mtriple=thumbv7-apple-darwin | FileCheck %s

-define i32 @t1(i32 %a, i32 %b, i32 %c, i32 %d) {
+define i32 @t1(i32 %a, i32 %b, i32 %c, i32 %d) nounwind {
 ; CHECK: t1:
 ; CHECK: it ne
 ; CHECK: cmpne
@ -20,12 +20,12 @@ cond_next:
 }

 ; FIXME: Check for # of unconditional branch after adding branch folding post ifcvt.
-define i32 @t2(i32 %a, i32 %b) {
+define i32 @t2(i32 %a, i32 %b) nounwind {
 entry:
 ; CHECK: t2:
-; CHECK: ite le
-; CHECK: suble
+; CHECK: ite gt
 ; CHECK: subgt
+; CHECK: suble
 	%tmp1434 = icmp eq i32 %a, %b		; <i1> [#uses=1]
 	br i1 %tmp1434, label %bb17, label %bb.outer

@ -60,14 +60,14 @@ bb17:		; preds = %cond_false, %cond_true, %entry

@x = external global i32*		; <i32**> [#uses=1]

-define void @foo(i32 %a) {
+define void @foo(i32 %a) nounwind {
 entry:
 	%tmp = load i32** @x		; <i32*> [#uses=1]
 	store i32 %a, i32* %tmp
 	ret void
 }

-define void @t3(i32 %a, i32 %b) {
+define void @t3(i32 %a, i32 %b) nounwind {
 entry:
 ; CHECK: t3:
 ; CHECK: it lt
--- a/llvm/test/CodeGen/X86/2006-05-11-InstrSched.ll
+++ b/llvm/test/CodeGen/X86/2006-05-11-InstrSched.ll
@ -1,5 +1,5 @@
 ; RUN: llc < %s -march=x86 -mattr=+sse2 -stats -realign-stack=0 |&\
-; RUN:     grep {asm-printer} | grep 31
+; RUN:     grep {asm-printer} | grep 34

 target datalayout = "e-p:32:32"
 define void @foo(i32* %mc, i32* %bp, i32* %ms, i32* %xmb, i32* %mpp, i32* %tpmm, i32* %ip, i32* %tpim, i32* %dpp, i32* %tpdm, i32* %bpi, i32 %M) nounwind {
@ -40,7 +40,7 @@ cond_true:		; preds = %cond_true, %entry
 	%tmp137.upgrd.7 = bitcast i32* %tmp137 to <2 x i64>*		; <<2 x i64>*> [#uses=1]
 	store <2 x i64> %tmp131, <2 x i64>* %tmp137.upgrd.7
 	%tmp147 = add nsw i32 %tmp.10, 8		; <i32> [#uses=1]
-	%tmp.upgrd.8 = icmp slt i32 %tmp147, %M		; <i1> [#uses=1]
+	%tmp.upgrd.8 = icmp ne i32 %tmp147, %M		; <i1> [#uses=1]
 	%indvar.next = add i32 %indvar, 1		; <i32> [#uses=1]
 	br i1 %tmp.upgrd.8, label %cond_true, label %return

--- a/llvm/test/CodeGen/X86/2007-03-15-GEP-Idx-Sink.ll
+++ b/llvm/test/CodeGen/X86/2007-03-15-GEP-Idx-Sink.ll
@ -1,7 +1,7 @@
 ; RUN: llc < %s -march=x86 -mtriple=i686-darwin | \
 ; RUN:   grep push | count 3

-define void @foo(i8** %buf, i32 %size, i32 %col, i8* %p) {
+define void @foo(i8** %buf, i32 %size, i32 %col, i8* %p) nounwind {
 entry:
 	icmp sgt i32 %size, 0		; <i1>:0 [#uses=1]
 	br i1 %0, label %bb.preheader, label %return
--- a/llvm/test/CodeGen/X86/2007-11-30-LoadFolding-Bug.ll
+++ b/llvm/test/CodeGen/X86/2007-11-30-LoadFolding-Bug.ll
@ -35,7 +35,7 @@ cond_next36.i:		; preds = %cond_next.i
 bb.i28.i:		; preds = %bb.i28.i, %cond_next36.i
 ; CHECK: %bb.i28.i
 ; CHECK: addl $2
-; CHECK: addl $2
+; CHECK: addl $-2
 	%j.0.reg2mem.0.i16.i = phi i32 [ 0, %cond_next36.i ], [ %indvar.next39.i, %bb.i28.i ]		; <i32> [#uses=2]
 	%din_addr.1.reg2mem.0.i17.i = phi double [ 0.000000e+00, %cond_next36.i ], [ %tmp16.i25.i, %bb.i28.i ]		; <double> [#uses=1]
 	%tmp1.i18.i = fptosi double %din_addr.1.reg2mem.0.i17.i to i32		; <i32> [#uses=2]
--- a/llvm/test/CodeGen/X86/full-lsr.ll
+++ b/llvm/test/CodeGen/X86/full-lsr.ll
@ -1,12 +1,7 @@
 ; RUN: llc < %s -march=x86 >%t

-; TODO: Enhance full lsr mode to get this:
-; RUNX: grep {addl	\\\$4,} %t | count 3
-; RUNX: not grep {,%} %t
-
-; For now, it should find this, which is still pretty good:
-; RUN: not grep {addl	\\\$4,} %t
-; RUN: grep {,%} %t | count 6
+; RUN: grep {addl	\\\$4,} %t | count 3
+; RUN: not grep {,%} %t

 define void @foo(float* nocapture %A, float* nocapture %B, float* nocapture %C, i32 %N) nounwind {
 entry:
--- a/llvm/test/CodeGen/X86/iv-users-in-other-loops.ll
+++ b/llvm/test/CodeGen/X86/iv-users-in-other-loops.ll
@ -1,11 +1,11 @@
 ; RUN: llc < %s -march=x86-64 -o %t
-; RUN: grep inc %t | count 1
+; RUN: not grep inc %t
 ; RUN: grep dec %t | count 2
 ; RUN: grep addq %t | count 13
 ; RUN: not grep addb %t
-; RUN: grep leaq %t | count 9
-; RUN: grep leal %t | count 3
-; RUN: grep movq %t | count 5
+; RUN: not grep leaq %t
+; RUN: not grep leal %t
+; RUN: not grep movq %t

 ; IV users in each of the loops from other loops shouldn't cause LSR
 ; to insert new induction variables. Previously it would create a
--- a/llvm/test/CodeGen/X86/loop-strength-reduce-2.ll
+++ b/llvm/test/CodeGen/X86/loop-strength-reduce-2.ll
@ -1,11 +1,24 @@
-; RUN: llc < %s -march=x86 -relocation-model=pic | \
-; RUN:   grep {, 4} | count 1
-; RUN: llc < %s -march=x86 | not grep lea
+; RUN: llc < %s -march=x86 -relocation-model=pic | FileCheck %s -check-prefix=PIC
+; RUN: llc < %s -march=x86 -relocation-model=static | FileCheck %s -check-prefix=STATIC
 ;
 ; Make sure the common loop invariant A is hoisted up to preheader,
 ; since too many registers are needed to subsume it into the addressing modes.
 ; It's safe to sink A in when it's not pic.

+; PIC:  align
+; PIC:  movl  $4, -4([[REG:%e[a-z]+]])
+; PIC:  movl  $5, ([[REG]])
+; PIC:  addl  $4, [[REG]]
+; PIC:  decl  {{%e[[a-z]+}}
+; PIC:  jne
+
+; STATIC: align
+; STATIC: movl  $4, -4(%ecx)
+; STATIC: movl  $5, (%ecx)
+; STATIC: addl  $4, %ecx
+; STATIC: decl  %eax
+; STATIC: jne
+
@A = global [16 x [16 x i32]] zeroinitializer, align 32		; <[16 x [16 x i32]]*> [#uses=2]

 define void @test(i32 %row, i32 %N.in) nounwind {
--- a/llvm/test/CodeGen/X86/loop-strength-reduce-3.ll
+++ b/llvm/test/CodeGen/X86/loop-strength-reduce-3.ll
@ -1,8 +1,11 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin -relocation-model=dynamic-no-pic | \
-; RUN:   grep {A+} | count 2
-;
-; Make sure the common loop invariant A is not hoisted up to preheader,
-; since it can be subsumed it into the addressing modes.
+; RUN: llc < %s -mtriple=i386-apple-darwin -relocation-model=dynamic-no-pic | FileCheck %s
+
+; CHECK: align
+; CHECK: movl  $4, -4(%ecx)
+; CHECK: movl  $5, (%ecx)
+; CHECK: addl  $4, %ecx
+; CHECK: decl  %eax
+; CHECK: jne

@A = global [16 x [16 x i32]] zeroinitializer, align 32		; <[16 x [16 x i32]]*> [#uses=2]

--- a/llvm/test/CodeGen/X86/loop-strength-reduce.ll
+++ b/llvm/test/CodeGen/X86/loop-strength-reduce.ll
@ -1,8 +1,11 @@
-; RUN: llc < %s -march=x86 -relocation-model=static | \
-; RUN:   grep {A+} | count 2
-;
-; Make sure the common loop invariant A is not hoisted up to preheader,
-; since it can be subsumed into the addressing mode in all uses.
+; RUN: llc < %s -march=x86 -relocation-model=static | FileCheck %s
+
+; CHECK: align
+; CHECK: movl  $4, -4(%ecx)
+; CHECK: movl  $5, (%ecx)
+; CHECK: addl  $4, %ecx
+; CHECK: decl  %eax
+; CHECK: jne

@A = internal global [16 x [16 x i32]] zeroinitializer, align 32		; <[16 x [16 x i32]]*> [#uses=2]

--- a/llvm/test/CodeGen/X86/loop-strength-reduce4.ll
+++ b/llvm/test/CodeGen/X86/loop-strength-reduce4.ll
@ -1,5 +1,19 @@
-; RUN: llc < %s -march=x86 | grep cmp | grep 64
-; RUN: llc < %s -march=x86 | not grep inc
+; RUN: llc < %s -march=x86 -relocation-model=static -mtriple=i686-apple-darwin | FileCheck %s -check-prefix=STATIC
+; RUN: llc < %s -march=x86 -relocation-model=pic | FileCheck %s -check-prefix=PIC
+
+; By starting the IV at -64 instead of 0, a cmp is eliminated,
+; as the flags from the add can be used directly.
+
+; STATIC: movl    $-64, %ecx
+
+; STATIC: movl    %eax, _state+76(%ecx)
+; STATIC: addl    $16, %ecx
+; STATIC: jne
+
+; In PIC mode the symbol can't be folded, so the change-compare-stride
+; trick applies.
+
+; PIC: cmpl $64

@state = external global [0 x i32]		; <[0 x i32]*> [#uses=4]
@S = external global [0 x i32]		; <[0 x i32]*> [#uses=4]
--- a/llvm/test/CodeGen/X86/loop-strength-reduce8.ll
+++ b/llvm/test/CodeGen/X86/loop-strength-reduce8.ll
@ -1,4 +1,10 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin | grep leal | not grep 16
+; RUN: llc < %s -mtriple=i386-apple-darwin | FileCheck %s
+
+; CHECK: leal 16(%eax), %edx
+; CHECK: align
+; CHECK: addl    $4, %edx
+; CHECK: decl    %ecx
+; CHECK: jne     LBB1_2

 	%struct.CUMULATIVE_ARGS = type { i32, i32, i32, i32, i32, i32, i32 }
 	%struct.bitmap_element = type { %struct.bitmap_element*, %struct.bitmap_element*, i32, [2 x i64] }
--- a/llvm/test/CodeGen/X86/lsr-reuse.ll
+++ b/llvm/test/CodeGen/X86/lsr-reuse.ll
@ -0,0 +1,386 @@
+; RUN: llc < %s -march=x86-64 -O3 | FileCheck %s
+target datalayout = "e-p:64:64:64"
+target triple = "x86_64-unknown-unknown"
+
+; Full strength reduction reduces register pressure from 5 to 4 here.
+; Instruction selection should use the FLAGS value from the dec for
+; the branch. Scheduling should push the adds upwards.
+
+; CHECK: full_me_0:
+; CHECK: movsd   (%rsi), %xmm0
+; CHECK: addq    $8, %rsi
+; CHECK: mulsd   (%rdx), %xmm0
+; CHECK: addq    $8, %rdx
+; CHECK: movsd   %xmm0, (%rdi)
+; CHECK: addq    $8, %rdi
+; CHECK: decq    %rcx
+; CHECK: jne
+
+define void @full_me_0(double* nocapture %A, double* nocapture %B, double* nocapture %C, i64 %n) nounwind {
+entry:
+  %t0 = icmp sgt i64 %n, 0
+  br i1 %t0, label %loop, label %return
+
+loop:
+  %i = phi i64 [ %i.next, %loop ], [ 0, %entry ]
+  %Ai = getelementptr inbounds double* %A, i64 %i
+  %Bi = getelementptr inbounds double* %B, i64 %i
+  %Ci = getelementptr inbounds double* %C, i64 %i
+  %t1 = load double* %Bi
+  %t2 = load double* %Ci
+  %m = fmul double %t1, %t2
+  store double %m, double* %Ai
+  %i.next = add nsw i64 %i, 1
+  %exitcond = icmp eq i64 %i.next, %n
+  br i1 %exitcond, label %return, label %loop
+
+return:
+  ret void
+}
+
+; Mostly-full strength reduction means we do full strength reduction on all
+; except for the offsets.
+;
+; Given a choice between constant offsets -2048 and 2048, choose the negative
+; value, because at boundary conditions it has a smaller encoding.
+; TODO: That's an over-general heuristic. It would be better for the target
+; to indicate what the encoding cost would be. Then using a 2048 offset
+; would be better on x86-64, since the start value would be 0 instead of
+; 2048.
+
+; CHECK: mostly_full_me_0:
+; CHECK: movsd   -2048(%rsi), %xmm0
+; CHECK: mulsd   -2048(%rdx), %xmm0
+; CHECK: movsd   %xmm0, -2048(%rdi)
+; CHECK: movsd   (%rsi), %xmm0
+; CHECK: addq    $8, %rsi
+; CHECK: divsd   (%rdx), %xmm0
+; CHECK: addq    $8, %rdx
+; CHECK: movsd   %xmm0, (%rdi)
+; CHECK: addq    $8, %rdi
+; CHECK: decq    %rcx
+; CHECK: jne
+
+define void @mostly_full_me_0(double* nocapture %A, double* nocapture %B, double* nocapture %C, i64 %n) nounwind {
+entry:
+  %t0 = icmp sgt i64 %n, 0
+  br i1 %t0, label %loop, label %return
+
+loop:
+  %i = phi i64 [ %i.next, %loop ], [ 0, %entry ]
+  %Ai = getelementptr inbounds double* %A, i64 %i
+  %Bi = getelementptr inbounds double* %B, i64 %i
+  %Ci = getelementptr inbounds double* %C, i64 %i
+  %t1 = load double* %Bi
+  %t2 = load double* %Ci
+  %m = fmul double %t1, %t2
+  store double %m, double* %Ai
+  %j = add i64 %i, 256
+  %Aj = getelementptr inbounds double* %A, i64 %j
+  %Bj = getelementptr inbounds double* %B, i64 %j
+  %Cj = getelementptr inbounds double* %C, i64 %j
+  %t3 = load double* %Bj
+  %t4 = load double* %Cj
+  %o = fdiv double %t3, %t4
+  store double %o, double* %Aj
+  %i.next = add nsw i64 %i, 1
+  %exitcond = icmp eq i64 %i.next, %n
+  br i1 %exitcond, label %return, label %loop
+
+return:
+  ret void
+}
+
+; A minor variation on mostly_full_me_0.
+; Prefer to start the indvar at 0.
+
+; CHECK: mostly_full_me_1:
+; CHECK: movsd   (%rsi), %xmm0
+; CHECK: mulsd   (%rdx), %xmm0
+; CHECK: movsd   %xmm0, (%rdi)
+; CHECK: movsd   -2048(%rsi), %xmm0
+; CHECK: addq    $8, %rsi
+; CHECK: divsd   -2048(%rdx), %xmm0
+; CHECK: addq    $8, %rdx
+; CHECK: movsd   %xmm0, -2048(%rdi)
+; CHECK: addq    $8, %rdi
+; CHECK: decq    %rcx
+; CHECK: jne
+
+define void @mostly_full_me_1(double* nocapture %A, double* nocapture %B, double* nocapture %C, i64 %n) nounwind {
+entry:
+  %t0 = icmp sgt i64 %n, 0
+  br i1 %t0, label %loop, label %return
+
+loop:
+  %i = phi i64 [ %i.next, %loop ], [ 0, %entry ]
+  %Ai = getelementptr inbounds double* %A, i64 %i
+  %Bi = getelementptr inbounds double* %B, i64 %i
+  %Ci = getelementptr inbounds double* %C, i64 %i
+  %t1 = load double* %Bi
+  %t2 = load double* %Ci
+  %m = fmul double %t1, %t2
+  store double %m, double* %Ai
+  %j = sub i64 %i, 256
+  %Aj = getelementptr inbounds double* %A, i64 %j
+  %Bj = getelementptr inbounds double* %B, i64 %j
+  %Cj = getelementptr inbounds double* %C, i64 %j
+  %t3 = load double* %Bj
+  %t4 = load double* %Cj
+  %o = fdiv double %t3, %t4
+  store double %o, double* %Aj
+  %i.next = add nsw i64 %i, 1
+  %exitcond = icmp eq i64 %i.next, %n
+  br i1 %exitcond, label %return, label %loop
+
+return:
+  ret void
+}
+
+; A slightly less minor variation on mostly_full_me_0.
+
+; CHECK: mostly_full_me_2:
+; CHECK: movsd   (%rsi), %xmm0
+; CHECK: mulsd   (%rdx), %xmm0
+; CHECK: movsd   %xmm0, (%rdi)
+; CHECK: movsd   -4096(%rsi), %xmm0
+; CHECK: addq    $8, %rsi
+; CHECK: divsd   -4096(%rdx), %xmm0
+; CHECK: addq    $8, %rdx
+; CHECK: movsd   %xmm0, -4096(%rdi)
+; CHECK: addq    $8, %rdi
+; CHECK: decq    %rcx
+; CHECK: jne
+
+define void @mostly_full_me_2(double* nocapture %A, double* nocapture %B, double* nocapture %C, i64 %n) nounwind {
+entry:
+  %t0 = icmp sgt i64 %n, 0
+  br i1 %t0, label %loop, label %return
+
+loop:
+  %i = phi i64 [ %i.next, %loop ], [ 0, %entry ]
+  %k = add i64 %i, 256
+  %Ak = getelementptr inbounds double* %A, i64 %k
+  %Bk = getelementptr inbounds double* %B, i64 %k
+  %Ck = getelementptr inbounds double* %C, i64 %k
+  %t1 = load double* %Bk
+  %t2 = load double* %Ck
+  %m = fmul double %t1, %t2
+  store double %m, double* %Ak
+  %j = sub i64 %i, 256
+  %Aj = getelementptr inbounds double* %A, i64 %j
+  %Bj = getelementptr inbounds double* %B, i64 %j
+  %Cj = getelementptr inbounds double* %C, i64 %j
+  %t3 = load double* %Bj
+  %t4 = load double* %Cj
+  %o = fdiv double %t3, %t4
+  store double %o, double* %Aj
+  %i.next = add nsw i64 %i, 1
+  %exitcond = icmp eq i64 %i.next, %n
+  br i1 %exitcond, label %return, label %loop
+
+return:
+  ret void
+}
+
+; In this test, the counting IV exit value is used, so full strength reduction
+; would not reduce register pressure. IndVarSimplify ought to simplify such
+; cases away, but it's useful here to verify that LSR's register pressure
+; heuristics are working as expected.
+
+; CHECK: count_me_0:
+; CHECK: movsd   (%rsi,%rax,8), %xmm0
+; CHECK: mulsd   (%rdx,%rax,8), %xmm0
+; CHECK: movsd   %xmm0, (%rdi,%rax,8)
+; CHECK: incq    %rax
+; CHECK: cmpq    %rax, %rcx
+; CHECK: jne
+
+define i64 @count_me_0(double* nocapture %A, double* nocapture %B, double* nocapture %C, i64 %n) nounwind {
+entry:
+  %t0 = icmp sgt i64 %n, 0
+  br i1 %t0, label %loop, label %return
+
+loop:
+  %i = phi i64 [ %i.next, %loop ], [ 0, %entry ]
+  %Ai = getelementptr inbounds double* %A, i64 %i
+  %Bi = getelementptr inbounds double* %B, i64 %i
+  %Ci = getelementptr inbounds double* %C, i64 %i
+  %t1 = load double* %Bi
+  %t2 = load double* %Ci
+  %m = fmul double %t1, %t2
+  store double %m, double* %Ai
+  %i.next = add nsw i64 %i, 1
+  %exitcond = icmp eq i64 %i.next, %n
+  br i1 %exitcond, label %return, label %loop
+
+return:
+  %q = phi i64 [ 0, %entry ], [ %i.next, %loop ]
+  ret i64 %q
+}
+
+; In this test, the trip count value is used, so full strength reduction
+; would not reduce register pressure.
+; (though it would reduce register pressure inside the loop...)
+
+; CHECK: count_me_1:
+; CHECK: movsd   (%rsi,%rax,8), %xmm0
+; CHECK: mulsd   (%rdx,%rax,8), %xmm0
+; CHECK: movsd   %xmm0, (%rdi,%rax,8)
+; CHECK: incq    %rax
+; CHECK: cmpq    %rax, %rcx
+; CHECK: jne
+
+define i64 @count_me_1(double* nocapture %A, double* nocapture %B, double* nocapture %C, i64 %n) nounwind {
+entry:
+  %t0 = icmp sgt i64 %n, 0
+  br i1 %t0, label %loop, label %return
+
+loop:
+  %i = phi i64 [ %i.next, %loop ], [ 0, %entry ]
+  %Ai = getelementptr inbounds double* %A, i64 %i
+  %Bi = getelementptr inbounds double* %B, i64 %i
+  %Ci = getelementptr inbounds double* %C, i64 %i
+  %t1 = load double* %Bi
+  %t2 = load double* %Ci
+  %m = fmul double %t1, %t2
+  store double %m, double* %Ai
+  %i.next = add nsw i64 %i, 1
+  %exitcond = icmp eq i64 %i.next, %n
+  br i1 %exitcond, label %return, label %loop
+
+return:
+  %q = phi i64 [ 0, %entry ], [ %n, %loop ]
+  ret i64 %q
+}
+
+; Full strength reduction doesn't save any registers here because the
+; loop tripcount is a constant.
+
+; CHECK: count_me_2:
+; CHECK: movl    $10, %eax
+; CHECK: align
+; CHECK: BB7_1:
+; CHECK: movsd   -40(%rdi,%rax,8), %xmm0
+; CHECK: addsd   -40(%rsi,%rax,8), %xmm0
+; CHECK: movsd   %xmm0, -40(%rdx,%rax,8)
+; CHECK: movsd   (%rdi,%rax,8), %xmm0
+; CHECK: subsd   (%rsi,%rax,8), %xmm0
+; CHECK: movsd   %xmm0, (%rdx,%rax,8)
+; CHECK: incq    %rax
+; CHECK: cmpq    $5010, %rax
+; CHECK: jne
+
+define void @count_me_2(double* nocapture %A, double* nocapture %B, double* nocapture %C) nounwind {
+entry:
+  br label %loop
+
+loop:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %loop ]
+  %i5 = add i64 %i, 5
+  %Ai = getelementptr double* %A, i64 %i5
+  %t2 = load double* %Ai
+  %Bi = getelementptr double* %B, i64 %i5
+  %t4 = load double* %Bi
+  %t5 = fadd double %t2, %t4
+  %Ci = getelementptr double* %C, i64 %i5
+  store double %t5, double* %Ci
+  %i10 = add i64 %i, 10
+  %Ai10 = getelementptr double* %A, i64 %i10
+  %t9 = load double* %Ai10
+  %Bi10 = getelementptr double* %B, i64 %i10
+  %t11 = load double* %Bi10
+  %t12 = fsub double %t9, %t11
+  %Ci10 = getelementptr double* %C, i64 %i10
+  store double %t12, double* %Ci10
+  %i.next = add i64 %i, 1
+  %exitcond = icmp eq i64 %i.next, 5000
+  br i1 %exitcond, label %return, label %loop
+
+return:
+  ret void
+}
+
+; This should be fully strength-reduced to reduce register pressure.
+
+; CHECK: full_me_1:
+; CHECK: align
+; CHECK: BB8_1:
+; CHECK: movsd   (%rdi), %xmm0
+; CHECK: addsd   (%rsi), %xmm0
+; CHECK: movsd   %xmm0, (%rdx)
+; CHECK: movsd   40(%rdi), %xmm0
+; CHECK: addq    $8, %rdi
+; CHECK: subsd   40(%rsi), %xmm0
+; CHECK: addq    $8, %rsi
+; CHECK: movsd   %xmm0, 40(%rdx)
+; CHECK: addq    $8, %rdx
+; CHECK: decq    %rcx
+; CHECK: jne
+
+define void @full_me_1(double* nocapture %A, double* nocapture %B, double* nocapture %C, i64 %n) nounwind {
+entry:
+  br label %loop
+
+loop:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %loop ]
+  %i5 = add i64 %i, 5
+  %Ai = getelementptr double* %A, i64 %i5
+  %t2 = load double* %Ai
+  %Bi = getelementptr double* %B, i64 %i5
+  %t4 = load double* %Bi
+  %t5 = fadd double %t2, %t4
+  %Ci = getelementptr double* %C, i64 %i5
+  store double %t5, double* %Ci
+  %i10 = add i64 %i, 10
+  %Ai10 = getelementptr double* %A, i64 %i10
+  %t9 = load double* %Ai10
+  %Bi10 = getelementptr double* %B, i64 %i10
+  %t11 = load double* %Bi10
+  %t12 = fsub double %t9, %t11
+  %Ci10 = getelementptr double* %C, i64 %i10
+  store double %t12, double* %Ci10
+  %i.next = add i64 %i, 1
+  %exitcond = icmp eq i64 %i.next, %n
+  br i1 %exitcond, label %return, label %loop
+
+return:
+  ret void
+}
+
+; This is a variation on full_me_0 in which the 0,+,1 induction variable
+; has a non-address use, pinning that value in a register.
+
+; CHECK: count_me_3:
+; CHECK: call
+; CHECK: movsd   (%r15,%r13,8), %xmm0
+; CHECK: mulsd   (%r14,%r13,8), %xmm0
+; CHECK: movsd   %xmm0, (%r12,%r13,8)
+; CHECK: incq    %r13
+; CHECK: cmpq    %r13, %rbx
+; CHECK: jne
+
+declare void @use(i64)
+
+define void @count_me_3(double* nocapture %A, double* nocapture %B, double* nocapture %C, i64 %n) nounwind {
+entry:
+  %t0 = icmp sgt i64 %n, 0
+  br i1 %t0, label %loop, label %return
+
+loop:
+  %i = phi i64 [ %i.next, %loop ], [ 0, %entry ]
+  call void @use(i64 %i)
+  %Ai = getelementptr inbounds double* %A, i64 %i
+  %Bi = getelementptr inbounds double* %B, i64 %i
+  %Ci = getelementptr inbounds double* %C, i64 %i
+  %t1 = load double* %Bi
+  %t2 = load double* %Ci
+  %m = fmul double %t1, %t2
+  store double %m, double* %Ai
+  %i.next = add nsw i64 %i, 1
+  %exitcond = icmp eq i64 %i.next, %n
+  br i1 %exitcond, label %return, label %loop
+
+return:
+  ret void
+}
--- a/llvm/test/CodeGen/X86/masked-iv-safe.ll
+++ b/llvm/test/CodeGen/X86/masked-iv-safe.ll
@ -169,7 +169,7 @@ loop:
 	%indvar.i24 = and i64 %indvar, 16777215
 	%t3 = getelementptr double* %d, i64 %indvar.i24
 	%t4 = load double* %t3
-	%t5 = fmul double %t4, 2.3
+	%t5 = fdiv double %t4, 2.3
 	store double %t5, double* %t3
 	%t6 = getelementptr double* %d, i64 %indvar
 	%t7 = load double* %t6
@ -199,7 +199,7 @@ loop:
 	%indvar.i24 = ashr i64 %s1, 24
 	%t3 = getelementptr double* %d, i64 %indvar.i24
 	%t4 = load double* %t3
-	%t5 = fmul double %t4, 2.3
+	%t5 = fdiv double %t4, 2.3
 	store double %t5, double* %t3
 	%t6 = getelementptr double* %d, i64 %indvar
 	%t7 = load double* %t6
@ -229,7 +229,7 @@ loop:
 	%indvar.i24 = ashr i64 %s1, 24
 	%t3 = getelementptr double* %d, i64 %indvar.i24
 	%t4 = load double* %t3
-	%t5 = fmul double %t4, 2.3
+	%t5 = fdiv double %t4, 2.3
 	store double %t5, double* %t3
 	%t6 = getelementptr double* %d, i64 %indvar
 	%t7 = load double* %t6
--- a/llvm/test/CodeGen/X86/pr3495.ll
+++ b/llvm/test/CodeGen/X86/pr3495.ll
@ -1,8 +1,7 @@
 ; RUN: llc < %s -march=x86 -stats |& grep {Number of loads added} | grep 2
 ; RUN: llc < %s -march=x86 -stats |& grep {Number of register spills} | grep 1
-; RUN: llc < %s -march=x86 -stats |& grep {Number of machine instrs printed} | grep 37
+; RUN: llc < %s -march=x86 -stats |& grep {Number of machine instrs printed} | grep 34
 ; PR3495
-; The loop reversal kicks in once here, resulting in one fewer instruction.

 target triple = "i386-pc-linux-gnu"
@x = external global [8 x i32], align 32		; <[8 x i32]*> [#uses=1]
--- a/llvm/test/Transforms/IndVarSimplify/addrec-gep.ll
+++ b/llvm/test/Transforms/IndVarSimplify/addrec-gep.ll
@ -25,7 +25,7 @@ bb1:		; preds = %bb2, %bb.nph
 	%j.01 = phi i64 [ %tmp9, %bb2 ], [ 0, %bb.nph ]		; <i64> [#uses=3]
 	%tmp3 = add i64 %j.01, %tmp1		; <i64> [#uses=1]
 	%tmp4 = add i64 %j.01, %tmp2		; <i64> [#uses=1]
-        %z0 = add i64 %tmp4, 5203
+        %z0 = add i64 %tmp3, 5203
 	%tmp5 = getelementptr double* %p, i64 %z0		; <double*> [#uses=1]
 	%tmp6 = load double* %tmp5, align 8		; <double> [#uses=1]
 	%tmp7 = fdiv double %tmp6, 2.100000e+00		; <double> [#uses=1]
--- a/llvm/test/Transforms/LoopStrengthReduce/2008-08-06-CmpStride.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/2008-08-06-CmpStride.ll
@ -1,5 +1,4 @@
-; RUN: opt < %s -loop-reduce -S | grep ugt
-; PR2535
+; RUN: llc -march=x86-64 < %s -o - | grep {cmpl	\\$\[1\], %}

@.str = internal constant [4 x i8] c"%d\0A\00"

@ -16,7 +15,7 @@ forbody:
        %add166 = or i32 %mul15, 1              ; <i32> [#uses=1] *
        call i32 (i8*, ...)* @printf( i8* noalias  getelementptr ([4 x i8]* @.str, i32 0, i32 0), i32 %add166 ) nounwind
        %inc = add i32 %i.0, 1          ; <i32> [#uses=3]
-        %cmp = icmp ult i32 %inc, 1027          ; <i1> [#uses=1]
+        %cmp = icmp ne i32 %inc, 1027          ; <i1> [#uses=1]
        br i1 %cmp, label %forbody, label %afterfor

 afterfor:               ; preds = %forcond
--- a/llvm/test/Transforms/LoopStrengthReduce/change-compare-stride-trickiness-0.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/change-compare-stride-trickiness-0.ll
@ -1,10 +1,15 @@
-; RUN: llc %s -o - --x86-asm-syntax=att | grep {cmpl	\$4}
+; RUN: llc < %s -o - | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
 target triple = "x86_64-apple-darwin9"

-; This is like change-compare-stride-trickiness-1.ll except the comparison
-; happens before the relevant use, so the comparison stride can't be
-; easily changed.
+; The comparison happens before the relevant use, but it can still be rewritten
+; to compare with zero.
+
+; CHECK: foo:
+; CHECK: align
+; CHECK: incl  %eax
+; CHECK-NEXT: decl  %ecx
+; CHECK-NEXT: jne

 define void @foo() nounwind {
 entry:
--- a/llvm/test/Transforms/LoopStrengthReduce/change-compare-stride-trickiness-1.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/change-compare-stride-trickiness-1.ll
@ -1,10 +1,12 @@
-; RUN: llc %s -o - --x86-asm-syntax=att | grep {cmp.	\$8}
+; RUN: llc %s -o - --x86-asm-syntax=att | grep {cmp.	\$10}
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
 target triple = "x86_64-apple-darwin9"

 ; The comparison happens after the relevant use, so the stride can easily
 ; be changed. The comparison can be done in a narrower mode than the
 ; induction variable.
+; TODO: By making the first store post-increment as well, the loop setup
+; could be made simpler.

 define void @foo() nounwind {
 entry:
--- a/llvm/test/Transforms/LoopStrengthReduce/count-to-zero.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/count-to-zero.ll
@ -19,7 +19,7 @@ bb3:                                              ; preds = %bb1
  %tmp4 = add i32 %c_addr.1, -1                   ; <i32> [#uses=1]
  %c_addr.1.be = select i1 %tmp2, i32 %tmp3, i32 %tmp4 ; <i32> [#uses=1]
  %indvar.next = add i32 %indvar, 1               ; <i32> [#uses=1]
-; CHECK: sub i32 %lsr.iv, 1
+; CHECK: add i32 %lsr.iv, -1
  br label %bb6

 bb6:                                              ; preds = %bb3, %entry
--- a/llvm/test/Transforms/LoopStrengthReduce/invariant_value_first.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/invariant_value_first.ll
@ -1,5 +1,5 @@
 ; Check that the index of 'P[outer]' is pulled out of the loop.
-; RUN: opt < %s -loop-reduce -S | \
+; RUN: opt < %s -loop-reduce -S -default-data-layout="e-p:32:32:32" | \
 ; RUN:   not grep {getelementptr.*%outer.*%INDVAR}

 declare i1 @pred()
--- a/llvm/test/Transforms/LoopStrengthReduce/invariant_value_first_arg.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/invariant_value_first_arg.ll
@ -1,5 +1,5 @@
 ; Check that the index of 'P[outer]' is pulled out of the loop.
-; RUN: opt < %s -loop-reduce -S | \
+; RUN: opt < %s -loop-reduce -S -default-data-layout="e-p:32:32:32" | \
 ; RUN:   not grep {getelementptr.*%outer.*%INDVAR}

 declare i1 @pred()
--- a/llvm/test/Transforms/LoopStrengthReduce/ops_after_indvar.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/ops_after_indvar.ll
@ -1,7 +1,7 @@
 ; Check that this test makes INDVAR and related stuff dead, because P[indvar]
 ; gets reduced, making INDVAR dead.

-; RUN: opt < %s -loop-reduce -S | not grep INDVAR
+; RUN: opt < %s -loop-reduce -S -default-data-layout="e-p:32:32:32" | not grep INDVAR

 declare i1 @pred()

--- a/llvm/test/Transforms/LoopStrengthReduce/quadradic-exit-value.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/quadradic-exit-value.ll
@ -1,4 +1,4 @@
-; RUN: opt < %s -analyze -iv-users | grep {Stride i64 {3,+,2}<%loop>:}
+; RUN: opt < %s -analyze -iv-users | grep {\{1,+,3,+,2\}<%loop> (post-inc)}

 ; The value of %r is dependent on a polynomial iteration expression.

--- a/llvm/test/Transforms/LoopStrengthReduce/remove_indvar.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/remove_indvar.ll
@ -7,10 +7,12 @@ define void @test(i32* %P) {
 ; <label>:0
 	br label %Loop
 Loop:		; preds = %Loop, %0
+        %i = phi i32 [ 0, %0 ], [ %i.next, %Loop ]
 	%INDVAR = phi i32 [ 0, %0 ], [ %INDVAR2, %Loop ]		; <i32> [#uses=2]
 	%STRRED = getelementptr i32* %P, i32 %INDVAR		; <i32*> [#uses=1]
 	store i32 0, i32* %STRRED
 	%INDVAR2 = add i32 %INDVAR, 1		; <i32> [#uses=1]
+        %i.next = add i32 %i, 1
 	%cond = call i1 @pred( )		; <i1> [#uses=1]
 	br i1 %cond, label %Loop, label %Out
 Out:		; preds = %Loop
--- a/llvm/test/Transforms/LoopStrengthReduce/use_postinc_value_outside_loop.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/use_postinc_value_outside_loop.ll
@ -1,5 +1,5 @@
 ; RUN: opt < %s -loop-reduce -S | \
-; RUN:   grep {add i32 %lsr.iv.next, 1}
+; RUN:   grep {add i32 %indvar630.ui, 1}
 ;
 ; Make sure that the use of the IV outside of the loop (the store) uses the 
 ; post incremented value of the IV, not the preincremented value.  This