Revert [SLP] Look-ahead operand reordering heuristic.

This reverts r364084 (git commit 5698921be2)

It caused crashes while compiling a file in Chrome. Reduction
forthcoming.

llvm-svn: 364111
Reid Kleckner 2019-06-21 23:10:25 +00:00
parent a9bfda08ca
commit 592a193285
2 changed files with 94 additions and 277 deletions


@@ -147,12 +147,6 @@ static cl::opt<unsigned> MinTreeSize(
"slp-min-tree-size", cl::init(3), cl::Hidden,
cl::desc("Only vectorize small trees if they are fully vectorizable"));
// The maximum depth that the look-ahead score heuristic will explore.
// The higher this value, the higher the compilation time overhead.
static cl::opt<int> LookAheadMaxDepth(
"slp-max-look-ahead-depth", cl::init(2), cl::Hidden,
cl::desc("The maximum look-ahead depth for operand reordering scores"));
static cl::opt<bool>
ViewSLPTree("view-slp-tree", cl::Hidden,
cl::desc("Display the SLP trees with Graphviz"));
@@ -714,7 +708,6 @@ public:
const DataLayout &DL;
ScalarEvolution &SE;
const BoUpSLP &R;
/// \returns the operand data at \p OpIdx and \p Lane.
OperandData &getData(unsigned OpIdx, unsigned Lane) {
@@ -740,207 +733,6 @@ public:
std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
}
// The hard-coded scores listed here are not very important. When computing
// the scores of matching one sub-tree with another, we are basically
// counting the number of values that are matching. So even if all scores
// are set to 1, we would still get a decent matching result.
// However, sometimes we have to break ties. For example, we may have to
// choose between matching loads vs matching opcodes. This is what these
// scores are helping us with: they provide the order of preference.
/// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
static const int ScoreConsecutiveLoads = 3;
/// Constants.
static const int ScoreConstants = 2;
/// Instructions with the same opcode.
static const int ScoreSameOpcode = 2;
/// Instructions with alt opcodes (e.g, add + sub).
static const int ScoreAltOpcodes = 1;
/// Identical instructions (a.k.a. splat or broadcast).
static const int ScoreSplat = 1;
/// Matching with an undef is preferable to failing.
static const int ScoreUndef = 1;
/// Score for failing to find a decent match.
static const int ScoreFail = 0;
/// User external to the vectorized code.
static const int ExternalUseCost = 1;
/// The user is internal but in a different lane.
static const int UserInDiffLaneCost = ExternalUseCost;
/// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
static int getShallowScore(Value *V1, Value *V2, const DataLayout &DL,
ScalarEvolution &SE) {
auto *LI1 = dyn_cast<LoadInst>(V1);
auto *LI2 = dyn_cast<LoadInst>(V2);
if (LI1 && LI2)
return isConsecutiveAccess(LI1, LI2, DL, SE)
? VLOperands::ScoreConsecutiveLoads
: VLOperands::ScoreFail;
auto *C1 = dyn_cast<Constant>(V1);
auto *C2 = dyn_cast<Constant>(V2);
if (C1 && C2)
return VLOperands::ScoreConstants;
auto *I1 = dyn_cast<Instruction>(V1);
auto *I2 = dyn_cast<Instruction>(V2);
if (I1 && I2) {
if (I1 == I2)
return VLOperands::ScoreSplat;
InstructionsState S = getSameOpcode({I1, I2});
// Note: Only consider instructions with <= 2 operands to avoid
// complexity explosion.
if (S.getOpcode() && S.MainOp->getNumOperands() <= 2)
return S.isAltShuffle() ? VLOperands::ScoreAltOpcodes
: VLOperands::ScoreSameOpcode;
}
if (isa<UndefValue>(V2))
return VLOperands::ScoreUndef;
return VLOperands::ScoreFail;
}
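To make the resulting preference order easy to eyeball, here is a standalone, illustrative-only C++ sketch. Match and shallowScore are made-up stand-ins, not LLVM API; the numbers simply restate the constants above:

// Standalone sketch, not LLVM code: plain tags stand in for Value pairs,
// and the switch mirrors the preference order that getShallowScore encodes.
#include <cstdio>

enum class Match { ConsecutiveLoads, Constants, SameOpcode, AltOpcodes, Splat, Undef, Fail };

static int shallowScore(Match M) {
  switch (M) {
  case Match::ConsecutiveLoads: return 3; // ScoreConsecutiveLoads
  case Match::Constants:        return 2; // ScoreConstants
  case Match::SameOpcode:       return 2; // ScoreSameOpcode
  case Match::AltOpcodes:       return 1; // ScoreAltOpcodes
  case Match::Splat:            return 1; // ScoreSplat
  case Match::Undef:            return 1; // ScoreUndef
  case Match::Fail:             return 0; // ScoreFail
  }
  return 0;
}

int main() {
  // Consecutive loads beat a mere opcode match -- exactly the tie-break
  // that the comment block above describes.
  std::printf("consecutive loads: %d\n", shallowScore(Match::ConsecutiveLoads));
  std::printf("same opcode:       %d\n", shallowScore(Match::SameOpcode));
}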
/// Holds the values and their lane that are taking part in the look-ahead
/// score calculation. This is used in the external uses cost calculation.
SmallDenseMap<Value *, int> InLookAheadValues;
/// \returns the additional cost due to uses of \p LHS and \p RHS that are
/// either external to the vectorized code, or require shuffling.
int getExternalUsesCost(const std::pair<Value *, int> &LHS,
const std::pair<Value *, int> &RHS) {
int Cost = 0;
SmallVector<std::pair<Value *, int>, 2> Values = {LHS, RHS};
for (int Idx = 0, IdxE = Values.size(); Idx != IdxE; ++Idx) {
Value *V = Values[Idx].first;
// Calculate the absolute lane, using the minimum relative lane of LHS
// and RHS as base and Idx as the offset.
int Ln = std::min(LHS.second, RHS.second) + Idx;
assert(Ln >= 0 && "Bad lane calculation");
for (User *U : V->users()) {
if (const TreeEntry *UserTE = R.getTreeEntry(U)) {
// The user is in the VectorizableTree. Check if we need to insert.
auto It = llvm::find(UserTE->Scalars, U);
assert(It != UserTE->Scalars.end() && "U is in UserTE");
int UserLn = std::distance(UserTE->Scalars.begin(), It);
assert(UserLn >= 0 && "Bad lane");
if (UserLn != Ln)
Cost += UserInDiffLaneCost;
} else {
// Check if the user is in the look-ahead code.
auto It2 = InLookAheadValues.find(U);
if (It2 != InLookAheadValues.end()) {
// The user is in the look-ahead code. Check the lane.
if (It2->second != Ln)
Cost += UserInDiffLaneCost;
} else {
// The user is neither in SLP tree nor in the look-ahead code.
Cost += ExternalUseCost;
}
}
}
}
return Cost;
}
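In concrete terms: for each value being scored, a user that sits in the vectorizable tree or in the look-ahead code but in a different lane is charged UserInDiffLaneCost (1), a user outside both is charged ExternalUseCost (1), and a same-lane user is free. A value with, say, one same-lane in-tree user and one external user therefore contributes a total cost of 1, which getScoreAtLevelRec below subtracts from the shallow score.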
/// Go through the operands of \p LHS and \p RHS recursively until \p
/// MaxLevel, and return the cumulative score. For example:
/// \verbatim
/// A[0] B[0] A[1] B[1] C[0] D[0] B[1] A[1]
/// \ / \ / \ / \ /
/// + + + +
/// G1 G2 G3 G4
/// \endverbatim
/// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at
/// each level recursively, accumulating the score. It starts from matching
/// the additions at level 0, then moves on to the loads (level 1). The
/// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and
/// {B[0],B[1]} match with VLOperands::ScoreConsecutiveLoads, while
/// {A[0],C[0]} has a score of VLOperands::ScoreFail.
/// Please note that the order of the operands does not matter, as we
/// evaluate the score of all profitable combinations of operands. In
/// other words, the score of G1 and G4 is the same as that of G1 and G2. This
/// heuristic is based on ideas described in:
/// Look-ahead SLP: Auto-vectorization in the presence of commutative
/// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
/// Luís F. W. Góes
int getScoreAtLevelRec(const std::pair<Value *, int> &LHS,
const std::pair<Value *, int> &RHS, int CurrLevel,
int MaxLevel) {
Value *V1 = LHS.first;
Value *V2 = RHS.first;
// Get the shallow score of V1 and V2.
int ShallowScoreAtThisLevel =
std::max((int)ScoreFail, getShallowScore(V1, V2, DL, SE) -
getExternalUsesCost(LHS, RHS));
int Lane1 = LHS.second;
int Lane2 = RHS.second;
// Early return the current score if we have reached MaxLevel, if V1 and
// V2 are not instructions, if they are a splat (I1 == I2), if the shallow
// match failed, or if they are consecutive loads (there is no point in
// looking past a matched load pair).
auto *I1 = dyn_cast<Instruction>(V1);
auto *I2 = dyn_cast<Instruction>(V2);
if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
ShallowScoreAtThisLevel == VLOperands::ScoreFail ||
(isa<LoadInst>(I1) && isa<LoadInst>(I2) && ShallowScoreAtThisLevel))
return ShallowScoreAtThisLevel;
assert(I1 && I2 && "Should have early exited.");
// Keep track of in-tree values for determining the external-use cost.
InLookAheadValues[V1] = Lane1;
InLookAheadValues[V2] = Lane2;
// Contains the I2 operand indexes that got matched with I1 operands.
SmallSet<int, 4> Op2Used;
// Recursion towards the operands of I1 and I2. We are trying all possible
// operand pairs, and keeping track of the best score.
for (int OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
OpIdx1 != NumOperands1; ++OpIdx1) {
// Try to pair I1's operand OpIdx1 with the best-matching operand of I2.
int MaxTmpScore = 0;
int MaxOpIdx2 = -1;
// If I2 is commutative try all combinations.
int FromIdx = isCommutative(I2) ? 0 : OpIdx1;
int ToIdx = isCommutative(I2) ? I2->getNumOperands() : OpIdx1 + 1;
assert(FromIdx < ToIdx && "Bad index");
for (int OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
// Skip operands already paired with OpIdx1.
if (Op2Used.count(OpIdx2))
continue;
// Recursively calculate the cost at each level
int TmpScore = getScoreAtLevelRec({I1->getOperand(OpIdx1), Lane1},
{I2->getOperand(OpIdx2), Lane2},
CurrLevel + 1, MaxLevel);
// Look for the best score.
if (TmpScore > VLOperands::ScoreFail && TmpScore > MaxTmpScore) {
MaxTmpScore = TmpScore;
MaxOpIdx2 = OpIdx2;
}
}
if (MaxOpIdx2 >= 0) {
// Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
Op2Used.insert(MaxOpIdx2);
ShallowScoreAtThisLevel += MaxTmpScore;
}
}
return ShallowScoreAtThisLevel;
}
/// \returns the look-ahead score, which tells us how much the sub-trees
/// rooted at \p LHS and \p RHS match; the more they match, the higher the
/// score. This helps break ties in an informed way when we cannot decide on
/// the order of the operands by just considering the immediate
/// predecessors.
int getLookAheadScore(const std::pair<Value *, int> &LHS,
const std::pair<Value *, int> &RHS) {
InLookAheadValues.clear();
return getScoreAtLevelRec(LHS, RHS, 1, LookAheadMaxDepth);
}
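For a concrete reading of the recursion, here is the arithmetic for the G1..G4 example from the comment above, with the default look-ahead depth of 2 and external-use costs ignored for simplicity:

getLookAheadScore(G1, G2):
  additions match (same opcode)      -> ScoreSameOpcode       = 2
  A[0]/A[1] are consecutive loads    -> ScoreConsecutiveLoads = 3
  B[0]/B[1] are consecutive loads    -> ScoreConsecutiveLoads = 3
  cumulative score                                            = 8

getLookAheadScore(G1, G3):
  additions match (same opcode)      -> ScoreSameOpcode = 2
  A[0] vs. C[0], B[0] vs. D[0] fail  -> ScoreFail       = 0
  cumulative score                                      = 2

So G2 is the preferred lane partner for G1, exactly as the comment describes.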
// Search all operands in Ops[*][Lane] for the one that best matches
// Ops[OpIdx][LastLane] and return its operand index.
// If no good match can be found, return None.
@@ -958,6 +750,9 @@ public:
// The linearized opcode of the operand at OpIdx, Lane.
bool OpIdxAPO = getData(OpIdx, Lane).APO;
const unsigned BestScore = 2;
const unsigned GoodScore = 1;
// The best operand index and its score.
// Sometimes we have more than one option (e.g., Opcode and Undefs), so we
// are using the score to differentiate between the two.
@@ -986,19 +781,41 @@ public:
// Look for an operand that matches the current mode.
switch (RMode) {
case ReorderingMode::Load:
case ReorderingMode::Constant:
case ReorderingMode::Opcode: {
bool LeftToRight = Lane > LastLane;
Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
Value *OpRight = (LeftToRight) ? Op : OpLastLane;
unsigned Score =
getLookAheadScore({OpLeft, LastLane}, {OpRight, Lane});
if (Score > BestOp.Score) {
BestOp.Idx = Idx;
BestOp.Score = Score;
if (isa<LoadInst>(Op)) {
// Figure out which is left and right, so that we can check for
// consecutive loads
bool LeftToRight = Lane > LastLane;
Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
Value *OpRight = (LeftToRight) ? Op : OpLastLane;
if (isConsecutiveAccess(cast<LoadInst>(OpLeft),
cast<LoadInst>(OpRight), DL, SE))
BestOp.Idx = Idx;
}
break;
case ReorderingMode::Opcode:
// We accept both Instructions and Undefs, but with different scores.
if ((isa<Instruction>(Op) && isa<Instruction>(OpLastLane) &&
cast<Instruction>(Op)->getOpcode() ==
cast<Instruction>(OpLastLane)->getOpcode()) ||
(isa<UndefValue>(OpLastLane) && isa<Instruction>(Op)) ||
isa<UndefValue>(Op)) {
// An instruction has a higher score than an undef.
unsigned Score = (isa<UndefValue>(Op)) ? GoodScore : BestScore;
if (Score > BestOp.Score) {
BestOp.Idx = Idx;
BestOp.Score = Score;
}
}
break;
case ReorderingMode::Constant:
if (isa<Constant>(Op)) {
unsigned Score = (isa<UndefValue>(Op)) ? GoodScore : BestScore;
if (Score > BestOp.Score) {
BestOp.Idx = Idx;
BestOp.Score = Score;
}
}
break;
}
case ReorderingMode::Splat:
if (Op == OpLastLane)
BestOp.Idx = Idx;
@@ -1129,8 +946,8 @@ public:
public:
/// Initialize with all the operands of the instruction vector \p RootVL.
VLOperands(ArrayRef<Value *> RootVL, const DataLayout &DL,
ScalarEvolution &SE, const BoUpSLP &R)
: DL(DL), SE(SE), R(R) {
ScalarEvolution &SE)
: DL(DL), SE(SE) {
// Append all the operands of RootVL.
appendOperandsOfVL(RootVL);
}
@@ -1352,8 +1169,7 @@ private:
SmallVectorImpl<Value *> &Left,
SmallVectorImpl<Value *> &Right,
const DataLayout &DL,
ScalarEvolution &SE,
const BoUpSLP &R);
ScalarEvolution &SE);
struct TreeEntry {
using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
TreeEntry(VecTreeTy &Container) : Container(Container) {}
@@ -2555,7 +2371,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
// Commutative predicate - collect + sort operands of the instructions
// so that each side is more likely to have the same opcode.
assert(P0 == SwapP0 && "Commutative Predicate mismatch");
reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this);
reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE);
} else {
// Collect operands - commute if it uses the swapped predicate.
for (Value *V : VL) {
@@ -2599,7 +2415,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
// have the same opcode.
if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) {
ValueList Left, Right;
reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this);
reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE);
buildTree_rec(Left, Depth + 1, {TE, 0});
buildTree_rec(Right, Depth + 1, {TE, 1});
return;
@@ -2768,7 +2584,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
// Reorder operands if reordering would enable vectorization.
if (isa<BinaryOperator>(VL0)) {
ValueList Left, Right;
reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this);
reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE);
buildTree_rec(Left, Depth + 1, {TE, 0});
buildTree_rec(Right, Depth + 1, {TE, 1});
return;
@@ -3483,15 +3299,13 @@ int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) const {
// Perform operand reordering on the instructions in VL and return the reordered
// operands in Left and Right.
void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
SmallVectorImpl<Value *> &Left,
SmallVectorImpl<Value *> &Right,
const DataLayout &DL,
ScalarEvolution &SE,
const BoUpSLP &R) {
void BoUpSLP::reorderInputsAccordingToOpcode(
ArrayRef<Value *> VL, SmallVectorImpl<Value *> &Left,
SmallVectorImpl<Value *> &Right, const DataLayout &DL,
ScalarEvolution &SE) {
if (VL.empty())
return;
VLOperands Ops(VL, DL, SE, R);
VLOperands Ops(VL, DL, SE);
// Reorder the operands in place.
Ops.reorder();
Left = Ops.getVL(0);


@@ -27,19 +27,22 @@ define void @lookahead_basic(double* %array) {
; CHECK-NEXT: [[IDX5:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 5
; CHECK-NEXT: [[IDX6:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 6
; CHECK-NEXT: [[IDX7:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 7
; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[IDX0]] to <2 x double>*
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8
; CHECK-NEXT: [[TMP2:%.*]] = bitcast double* [[IDX2]] to <2 x double>*
; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8
; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* [[IDX4]] to <2 x double>*
; CHECK-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[TMP4]], align 8
; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[IDX6]] to <2 x double>*
; CHECK-NEXT: [[TMP7:%.*]] = load <2 x double>, <2 x double>* [[TMP6]], align 8
; CHECK-NEXT: [[TMP8:%.*]] = fsub fast <2 x double> [[TMP1]], [[TMP3]]
; CHECK-NEXT: [[TMP9:%.*]] = fsub fast <2 x double> [[TMP5]], [[TMP7]]
; CHECK-NEXT: [[TMP10:%.*]] = fadd fast <2 x double> [[TMP8]], [[TMP9]]
; CHECK-NEXT: [[TMP11:%.*]] = bitcast double* [[IDX0]] to <2 x double>*
; CHECK-NEXT: store <2 x double> [[TMP10]], <2 x double>* [[TMP11]], align 8
; CHECK-NEXT: [[A_0:%.*]] = load double, double* [[IDX0]], align 8
; CHECK-NEXT: [[A_1:%.*]] = load double, double* [[IDX1]], align 8
; CHECK-NEXT: [[B_0:%.*]] = load double, double* [[IDX2]], align 8
; CHECK-NEXT: [[B_1:%.*]] = load double, double* [[IDX3]], align 8
; CHECK-NEXT: [[C_0:%.*]] = load double, double* [[IDX4]], align 8
; CHECK-NEXT: [[C_1:%.*]] = load double, double* [[IDX5]], align 8
; CHECK-NEXT: [[D_0:%.*]] = load double, double* [[IDX6]], align 8
; CHECK-NEXT: [[D_1:%.*]] = load double, double* [[IDX7]], align 8
; CHECK-NEXT: [[SUBAB_0:%.*]] = fsub fast double [[A_0]], [[B_0]]
; CHECK-NEXT: [[SUBCD_0:%.*]] = fsub fast double [[C_0]], [[D_0]]
; CHECK-NEXT: [[SUBAB_1:%.*]] = fsub fast double [[A_1]], [[B_1]]
; CHECK-NEXT: [[SUBCD_1:%.*]] = fsub fast double [[C_1]], [[D_1]]
; CHECK-NEXT: [[ADDABCD_0:%.*]] = fadd fast double [[SUBAB_0]], [[SUBCD_0]]
; CHECK-NEXT: [[ADDCDAB_1:%.*]] = fadd fast double [[SUBCD_1]], [[SUBAB_1]]
; CHECK-NEXT: store double [[ADDABCD_0]], double* [[IDX0]], align 8
; CHECK-NEXT: store double [[ADDCDAB_1]], double* [[IDX1]], align 8
; CHECK-NEXT: ret void
;
entry:
@@ -161,23 +164,22 @@ define void @lookahead_alt2(double* %array) {
; CHECK-NEXT: [[IDX5:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 5
; CHECK-NEXT: [[IDX6:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 6
; CHECK-NEXT: [[IDX7:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 7
; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[IDX0]] to <2 x double>*
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8
; CHECK-NEXT: [[TMP2:%.*]] = bitcast double* [[IDX2]] to <2 x double>*
; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8
; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* [[IDX4]] to <2 x double>*
; CHECK-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[TMP4]], align 8
; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[IDX6]] to <2 x double>*
; CHECK-NEXT: [[TMP7:%.*]] = load <2 x double>, <2 x double>* [[TMP6]], align 8
; CHECK-NEXT: [[TMP8:%.*]] = fsub fast <2 x double> [[TMP5]], [[TMP7]]
; CHECK-NEXT: [[TMP9:%.*]] = fadd fast <2 x double> [[TMP5]], [[TMP7]]
; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x double> [[TMP8]], <2 x double> [[TMP9]], <2 x i32> <i32 0, i32 3>
; CHECK-NEXT: [[TMP11:%.*]] = fadd fast <2 x double> [[TMP1]], [[TMP3]]
; CHECK-NEXT: [[TMP12:%.*]] = fsub fast <2 x double> [[TMP1]], [[TMP3]]
; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x double> [[TMP11]], <2 x double> [[TMP12]], <2 x i32> <i32 0, i32 3>
; CHECK-NEXT: [[TMP14:%.*]] = fadd fast <2 x double> [[TMP13]], [[TMP10]]
; CHECK-NEXT: [[TMP15:%.*]] = bitcast double* [[IDX0]] to <2 x double>*
; CHECK-NEXT: store <2 x double> [[TMP14]], <2 x double>* [[TMP15]], align 8
; CHECK-NEXT: [[A_0:%.*]] = load double, double* [[IDX0]], align 8
; CHECK-NEXT: [[A_1:%.*]] = load double, double* [[IDX1]], align 8
; CHECK-NEXT: [[B_0:%.*]] = load double, double* [[IDX2]], align 8
; CHECK-NEXT: [[B_1:%.*]] = load double, double* [[IDX3]], align 8
; CHECK-NEXT: [[C_0:%.*]] = load double, double* [[IDX4]], align 8
; CHECK-NEXT: [[C_1:%.*]] = load double, double* [[IDX5]], align 8
; CHECK-NEXT: [[D_0:%.*]] = load double, double* [[IDX6]], align 8
; CHECK-NEXT: [[D_1:%.*]] = load double, double* [[IDX7]], align 8
; CHECK-NEXT: [[ADDAB_0:%.*]] = fadd fast double [[A_0]], [[B_0]]
; CHECK-NEXT: [[SUBCD_0:%.*]] = fsub fast double [[C_0]], [[D_0]]
; CHECK-NEXT: [[ADDCD_1:%.*]] = fadd fast double [[C_1]], [[D_1]]
; CHECK-NEXT: [[SUBAB_1:%.*]] = fsub fast double [[A_1]], [[B_1]]
; CHECK-NEXT: [[ADDABCD_0:%.*]] = fadd fast double [[ADDAB_0]], [[SUBCD_0]]
; CHECK-NEXT: [[ADDCDAB_1:%.*]] = fadd fast double [[ADDCD_1]], [[SUBAB_1]]
; CHECK-NEXT: store double [[ADDABCD_0]], double* [[IDX0]], align 8
; CHECK-NEXT: store double [[ADDCDAB_1]], double* [[IDX1]], align 8
; CHECK-NEXT: ret void
;
entry:
@@ -237,28 +239,29 @@ define void @lookahead_external_uses(double* %A, double *%B, double *%C, double
; CHECK-NEXT: [[IDXB2:%.*]] = getelementptr inbounds double, double* [[B]], i64 2
; CHECK-NEXT: [[IDXA2:%.*]] = getelementptr inbounds double, double* [[A]], i64 2
; CHECK-NEXT: [[IDXB1:%.*]] = getelementptr inbounds double, double* [[B]], i64 1
; CHECK-NEXT: [[A0:%.*]] = load double, double* [[IDXA0]], align 8
; CHECK-NEXT: [[B0:%.*]] = load double, double* [[IDXB0]], align 8
; CHECK-NEXT: [[C0:%.*]] = load double, double* [[IDXC0]], align 8
; CHECK-NEXT: [[D0:%.*]] = load double, double* [[IDXD0]], align 8
; CHECK-NEXT: [[A1:%.*]] = load double, double* [[IDXA1]], align 8
; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[IDXA0]] to <2 x double>*
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8
; CHECK-NEXT: [[B2:%.*]] = load double, double* [[IDXB2]], align 8
; CHECK-NEXT: [[A2:%.*]] = load double, double* [[IDXA2]], align 8
; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[IDXB0]] to <2 x double>*
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> undef, double [[C0]], i32 0
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[A1]], i32 1
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> undef, double [[D0]], i32 0
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[B2]], i32 1
; CHECK-NEXT: [[TMP6:%.*]] = fsub fast <2 x double> [[TMP3]], [[TMP5]]
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> undef, double [[A0]], i32 0
; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[A2]], i32 1
; CHECK-NEXT: [[TMP9:%.*]] = fsub fast <2 x double> [[TMP8]], [[TMP1]]
; CHECK-NEXT: [[TMP10:%.*]] = fadd fast <2 x double> [[TMP9]], [[TMP6]]
; CHECK-NEXT: [[B1:%.*]] = load double, double* [[IDXB1]], align 8
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> undef, double [[B0]], i32 0
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[B2]], i32 1
; CHECK-NEXT: [[TMP4:%.*]] = fsub fast <2 x double> [[TMP1]], [[TMP3]]
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> undef, double [[C0]], i32 0
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[A2]], i32 1
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> undef, double [[D0]], i32 0
; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[B1]], i32 1
; CHECK-NEXT: [[TMP9:%.*]] = fsub fast <2 x double> [[TMP6]], [[TMP8]]
; CHECK-NEXT: [[TMP10:%.*]] = fadd fast <2 x double> [[TMP4]], [[TMP9]]
; CHECK-NEXT: [[IDXS0:%.*]] = getelementptr inbounds double, double* [[S:%.*]], i64 0
; CHECK-NEXT: [[IDXS1:%.*]] = getelementptr inbounds double, double* [[S]], i64 1
; CHECK-NEXT: [[TMP11:%.*]] = bitcast double* [[IDXS0]] to <2 x double>*
; CHECK-NEXT: store <2 x double> [[TMP10]], <2 x double>* [[TMP11]], align 8
; CHECK-NEXT: store double [[A1]], double* [[EXT1:%.*]], align 8
; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
; CHECK-NEXT: store double [[TMP12]], double* [[EXT1:%.*]], align 8
; CHECK-NEXT: ret void
;
entry:
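A closing note on the test diff: these CHECK lines have the shape produced by LLVM's update_test_checks.py utility. Assuming the file carries the usual autogeneration header, they can be regenerated after a change like this with something along the lines of:

python llvm/utils/update_test_checks.py --opt-binary=<build>/bin/opt <this test file>

where the opt binary path and the test path are placeholders.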