[X86] Improve EmitLoweredSelect for contiguous CMOV pseudo instructions.

This change improves EmitLoweredSelect() so that multiple contiguous CMOV pseudo instructions with the same (or exactly opposite) conditions get lowered using a single new basic-block. This eliminates unnecessary extra basic-blocks (and CFG merge points) when contiguous CMOVs are being lowered. Patch by: kevin.b.smith@intel.com Differential Revision: http://reviews.llvm.org/D11428 llvm-svn: 244202
2015-08-06 08:45:34 +00:00 · 2015-08-06 08:45:34 +00:00 · 868dc65444
parent d7b9392f59
commit 868dc65444
4 changed files with 567 additions and 35 deletions
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@ -19947,6 +19947,39 @@ static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
  return true;
 }

+// Return true if it is OK for this CMOV pseudo-opcode to be cascaded
+// together with other CMOV pseudo-opcodes into a single basic-block with
+// conditional jump around it.
+static bool isCMOVPseudo(MachineInstr *MI) {
+  switch (MI->getOpcode()) {
+  case X86::CMOV_FR32:
+  case X86::CMOV_FR64:
+  case X86::CMOV_GR8:
+  case X86::CMOV_GR16:
+  case X86::CMOV_GR32:
+  case X86::CMOV_RFP32:
+  case X86::CMOV_RFP64:
+  case X86::CMOV_RFP80:
+  case X86::CMOV_V2F64:
+  case X86::CMOV_V2I64:
+  case X86::CMOV_V4F32:
+  case X86::CMOV_V4F64:
+  case X86::CMOV_V4I64:
+  case X86::CMOV_V16F32:
+  case X86::CMOV_V8F32:
+  case X86::CMOV_V8F64:
+  case X86::CMOV_V8I64:
+  case X86::CMOV_V8I1:
+  case X86::CMOV_V16I1:
+  case X86::CMOV_V32I1:
+  case X86::CMOV_V64I1:
+    return true;
+
+  default:
+    return false;
+  }
+}
+
 MachineBasicBlock *
 X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
                                     MachineBasicBlock *BB) const {
@ -19970,8 +20003,41 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
  MachineBasicBlock *thisMBB = BB;
  MachineFunction *F = BB->getParent();

-  // We also lower double CMOVs:
+  // This code lowers all pseudo-CMOV instructions. Generally it lowers these
+  // as described above, by inserting a BB, and then making a PHI at the join
+  // point to select the true and false operands of the CMOV in the PHI.
+  //
+  // The code also handles two different cases of multiple CMOV opcodes
+  // in a row.
+  //
+  // Case 1:
+  // In this case, there are multiple CMOVs in a row, all of which are based
+  // on the same condition setting (or the exact opposite condition setting).
+  // In this case we can lower all the CMOVs using a single inserted BB, and
+  // then make a number of PHIs at the join point to model the CMOVs. The only
+  // trickiness here is that in a case like:
+  //
+  // t2 = CMOV cond1 t1, f1
+  // t3 = CMOV cond1 t2, f2
+  //
+  // when rewriting this into PHIs, we have to perform some renaming on the
+  // temps since you cannot have a PHI operand refer to a PHI result earlier
+  // in the same block.  The "simple" but wrong lowering would be:
+  //
+  // t2 = PHI t1(BB1), f1(BB2)
+  // t3 = PHI t2(BB1), f2(BB2)
+  //
+  // but clearly t2 is not defined in BB1, so that is incorrect. The proper
+  // renaming is to note that on the path through BB1, t2 is really just a
+  // copy of t1, and do that renaming, properly generating:
+  //
+  // t2 = PHI t1(BB1), f1(BB2)
+  // t3 = PHI t1(BB1), f2(BB2)
+  //
+  // Case 2: we lower cascaded CMOVs such as
+  //
  //   (CMOV (CMOV F, T, cc1), T, cc2)
+  //
  // to two successives branches.  For that, we look for another CMOV as the
  // following instruction.
  //
@ -20037,19 +20103,42 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
  // .LBB5_4:
  //         retq
  //
-  MachineInstr *NextCMOV = nullptr;
+  MachineInstr *CascadedCMOV = nullptr;
+  MachineInstr *LastCMOV = MI;
+  X86::CondCode CC = X86::CondCode(MI->getOperand(3).getImm());
+  X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
  MachineBasicBlock::iterator NextMIIt =
      std::next(MachineBasicBlock::iterator(MI));
-  if (NextMIIt != BB->end() && NextMIIt->getOpcode() == MI->getOpcode() &&
+
+  // First check for case 1, where there are multiple CMOVs with the same
+  // condition.  Of the two cases of multiple CMOV lowerings, case 1 reduces
+  // the number of jumps the most.
+
+  if (isCMOVPseudo(MI)) {
+    // See if we have a string of CMOVs with the same condition.
+    while (NextMIIt != BB->end() &&
+           isCMOVPseudo(NextMIIt) &&
+           (NextMIIt->getOperand(3).getImm() == CC ||
+            NextMIIt->getOperand(3).getImm() == OppCC)) {
+      LastCMOV = &*NextMIIt;
+      ++NextMIIt;
+    }
+  }
+
+  // This checks for case 2, but only do this if we didn't already find
+  // case 1, as indicated by LastCMOV == MI.
+  if (LastCMOV == MI &&
+      NextMIIt != BB->end() && NextMIIt->getOpcode() == MI->getOpcode() &&
      NextMIIt->getOperand(2).getReg() == MI->getOperand(2).getReg() &&
-      NextMIIt->getOperand(1).getReg() == MI->getOperand(0).getReg())
-    NextCMOV = &*NextMIIt;
+      NextMIIt->getOperand(1).getReg() == MI->getOperand(0).getReg()) {
+    CascadedCMOV = &*NextMIIt;
+  }

  MachineBasicBlock *jcc1MBB = nullptr;

-  // If we have a double CMOV, we lower it to two successive branches to
+  // If we have a cascaded CMOV, we lower it to two successive branches to
  // the same block.  EFLAGS is used by both, so mark it as live in the second.
-  if (NextCMOV) {
+  if (CascadedCMOV) {
    jcc1MBB = F->CreateMachineBasicBlock(LLVM_BB);
    F->insert(It, jcc1MBB);
    jcc1MBB->addLiveIn(X86::EFLAGS);
@ -20064,7 +20153,7 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
  // live into the sink and copy blocks.
  const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();

-  MachineInstr *LastEFLAGSUser = NextCMOV ? NextCMOV : MI;
+  MachineInstr *LastEFLAGSUser = CascadedCMOV ? CascadedCMOV : LastCMOV;
  if (!LastEFLAGSUser->killsRegister(X86::EFLAGS) &&
      !checkAndUpdateEFLAGSKill(LastEFLAGSUser, BB, TRI)) {
    copy0MBB->addLiveIn(X86::EFLAGS);
@ -20073,12 +20162,12 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,

  // Transfer the remainder of BB and its successor edges to sinkMBB.
  sinkMBB->splice(sinkMBB->begin(), BB,
-                  std::next(MachineBasicBlock::iterator(MI)), BB->end());
+                  std::next(MachineBasicBlock::iterator(LastCMOV)), BB->end());
  sinkMBB->transferSuccessorsAndUpdatePHIs(BB);

  // Add the true and fallthrough blocks as its successors.
-  if (NextCMOV) {
-    // The fallthrough block may be jcc1MBB, if we have a double CMOV.
+  if (CascadedCMOV) {
+    // The fallthrough block may be jcc1MBB, if we have a cascaded CMOV.
    BB->addSuccessor(jcc1MBB);

    // In that case, jcc1MBB will itself fallthrough the copy0MBB, and
@ -20093,13 +20182,12 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
  BB->addSuccessor(sinkMBB);

  // Create the conditional branch instruction.
-  unsigned Opc =
-    X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm());
+  unsigned Opc = X86::GetCondBranchFromCond(CC);
  BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB);

-  if (NextCMOV) {
+  if (CascadedCMOV) {
    unsigned Opc2 = X86::GetCondBranchFromCond(
-        (X86::CondCode)NextCMOV->getOperand(3).getImm());
+        (X86::CondCode)CascadedCMOV->getOperand(3).getImm());
    BuildMI(jcc1MBB, DL, TII->get(Opc2)).addMBB(sinkMBB);
  }

@ -20111,24 +20199,62 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
  //  sinkMBB:
  //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
  //  ...
-  MachineInstrBuilder MIB =
-      BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(X86::PHI),
-              MI->getOperand(0).getReg())
-          .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB)
-          .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB);
+  MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
+  MachineBasicBlock::iterator MIItEnd =
+    std::next(MachineBasicBlock::iterator(LastCMOV));
+  MachineBasicBlock::iterator SinkInsertionPoint = sinkMBB->begin();
+  DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
+  MachineInstrBuilder MIB;

-  // If we have a double CMOV, the second Jcc provides the same incoming
+  // As we are creating the PHIs, we have to be careful if there is more than
+  // one.  Later CMOVs may reference the results of earlier CMOVs, but later
+  // PHIs have to reference the individual true/false inputs from earlier PHIs.
+  // That also means that PHI construction must work forward from earlier to
+  // later, and that the code must maintain a mapping from earlier PHIs'
+  // destination registers to the registers that went into those PHIs.
+
+  for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
+    unsigned DestReg = MIIt->getOperand(0).getReg();
+    unsigned Op1Reg = MIIt->getOperand(1).getReg();
+    unsigned Op2Reg = MIIt->getOperand(2).getReg();
+
+    // If this CMOV we are generating is the opposite condition from
+    // the jump we generated, then we have to swap the operands for the
+    // PHI that is going to be generated.
+    if (MIIt->getOperand(3).getImm() == OppCC)
+        std::swap(Op1Reg, Op2Reg);
+
+    if (RegRewriteTable.find(Op1Reg) != RegRewriteTable.end())
+      Op1Reg = RegRewriteTable[Op1Reg].first;
+
+    if (RegRewriteTable.find(Op2Reg) != RegRewriteTable.end())
+      Op2Reg = RegRewriteTable[Op2Reg].second;
+
+    MIB = BuildMI(*sinkMBB, SinkInsertionPoint, DL,
+                  TII->get(X86::PHI), DestReg)
+          .addReg(Op1Reg).addMBB(copy0MBB)
+          .addReg(Op2Reg).addMBB(thisMBB);
+
+    // Add this PHI to the rewrite table.
+    RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
+  }
+
+  // If we have a cascaded CMOV, the second Jcc provides the same incoming
  // value as the first Jcc (the True operand of the SELECT_CC/CMOV nodes).
-  if (NextCMOV) {
+  if (CascadedCMOV) {
    MIB.addReg(MI->getOperand(2).getReg()).addMBB(jcc1MBB);
    // Copy the PHI result to the register defined by the second CMOV.
    BuildMI(*sinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())),
-            DL, TII->get(TargetOpcode::COPY), NextCMOV->getOperand(0).getReg())
+            DL, TII->get(TargetOpcode::COPY),
+            CascadedCMOV->getOperand(0).getReg())
        .addReg(MI->getOperand(0).getReg());
-    NextCMOV->eraseFromParent();
+    CascadedCMOV->eraseFromParent();
  }

-  MI->eraseFromParent();   // The pseudo instruction is gone now.
+  // Now remove the CMOV(s).
+  for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; )
+    (MIIt++)->eraseFromParent();
+
  return sinkMBB;
 }

@ -20703,23 +20829,23 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
  case X86::TLSCall_32:
  case X86::TLSCall_64:
    return EmitLoweredTLSCall(MI, BB);
-  case X86::CMOV_GR8:
  case X86::CMOV_FR32:
  case X86::CMOV_FR64:
-  case X86::CMOV_V4F32:
-  case X86::CMOV_V2F64:
-  case X86::CMOV_V2I64:
-  case X86::CMOV_V8F32:
-  case X86::CMOV_V4F64:
-  case X86::CMOV_V4I64:
-  case X86::CMOV_V16F32:
-  case X86::CMOV_V8F64:
-  case X86::CMOV_V8I64:
+  case X86::CMOV_GR8:
  case X86::CMOV_GR16:
  case X86::CMOV_GR32:
  case X86::CMOV_RFP32:
  case X86::CMOV_RFP64:
  case X86::CMOV_RFP80:
+  case X86::CMOV_V2F64:
+  case X86::CMOV_V2I64:
+  case X86::CMOV_V4F32:
+  case X86::CMOV_V4F64:
+  case X86::CMOV_V4I64:
+  case X86::CMOV_V16F32:
+  case X86::CMOV_V8F32:
+  case X86::CMOV_V8F64:
+  case X86::CMOV_V8I64:
  case X86::CMOV_V8I1:
  case X86::CMOV_V16I1:
  case X86::CMOV_V32I1:
--- a/llvm/test/CodeGen/X86/pseudo_cmov_lower.ll
+++ b/llvm/test/CodeGen/X86/pseudo_cmov_lower.ll
@ -0,0 +1,267 @@
+; RUN: llc < %s -mtriple=i386-linux-gnu -o - | FileCheck %s 
+
+; This test checks that only a single js gets generated in the final code
+; for lowering the CMOV pseudos that get created for this IR.
+; CHECK-LABEL: foo1:
+; CHECK: js
+; CHECK-NOT: js
+define i32 @foo1(i32 %v1, i32 %v2, i32 %v3) nounwind {
+entry:
+  %cmp = icmp slt i32 %v1, 0
+  %v2.v3 = select i1 %cmp, i32 %v2, i32 %v3
+  %v1.v2 = select i1 %cmp, i32 %v1, i32 %v2
+  %sub = sub i32 %v1.v2, %v2.v3
+  ret i32 %sub
+}
+
+; This test checks that only a single js gets generated in the final code
+; for lowering the CMOV pseudos that get created for this IR. This makes
+; sure the code for the lowering for opposite conditions gets tested.
+; CHECK-LABEL: foo11:
+; CHECK: js
+; CHECK-NOT: js
+; CHECK-NOT: jns
+define i32 @foo11(i32 %v1, i32 %v2, i32 %v3) nounwind {
+entry:
+  %cmp1 = icmp slt i32 %v1, 0
+  %v2.v3 = select i1 %cmp1, i32 %v2, i32 %v3
+  %cmp2 = icmp sge i32 %v1, 0
+  %v1.v2 = select i1 %cmp2, i32 %v1, i32 %v2
+  %sub = sub i32 %v1.v2, %v2.v3
+  ret i32 %sub
+}
+
+; This test checks that only a single js gets generated in the final code
+; for lowering the CMOV pseudos that get created for this IR.
+; CHECK-LABEL: foo2:
+; CHECK: js
+; CHECK-NOT: js
+define i32 @foo2(i8 %v1, i8 %v2, i8 %v3) nounwind {
+entry:
+  %cmp = icmp slt i8 %v1, 0
+  %v2.v3 = select i1 %cmp, i8 %v2, i8 %v3
+  %v1.v2 = select i1 %cmp, i8 %v1, i8 %v2
+  %t1 = sext i8 %v2.v3 to i32
+  %t2 = sext i8 %v1.v2 to i32
+  %sub = sub i32 %t1, %t2
+  ret i32 %sub
+}
+
+; This test checks that only a single js gets generated in the final code
+; for lowering the CMOV pseudos that get created for this IR.
+; CHECK-LABEL: foo3:
+; CHECK: js
+; CHECK-NOT: js
+define i32 @foo3(i16 %v1, i16 %v2, i16 %v3) nounwind {
+entry:
+  %cmp = icmp slt i16 %v1, 0
+  %v2.v3 = select i1 %cmp, i16 %v2, i16 %v3
+  %v1.v2 = select i1 %cmp, i16 %v1, i16 %v2
+  %t1 = sext i16 %v2.v3 to i32
+  %t2 = sext i16 %v1.v2 to i32
+  %sub = sub i32 %t1, %t2
+  ret i32 %sub
+}
+
+; This test checks that only a single js gets generated in the final code
+; for lowering the CMOV pseudos that get created for this IR.
+; CHECK-LABEL: foo4:
+; CHECK: js
+; CHECK-NOT: js
+define float @foo4(i32 %v1, float %v2, float %v3, float %v4) nounwind {
+entry:
+  %cmp = icmp slt i32 %v1, 0
+  %t1 = select i1 %cmp, float %v2, float %v3
+  %t2 = select i1 %cmp, float %v3, float %v4
+  %sub = fsub float %t1, %t2
+  ret float %sub
+}
+
+; This test checks that only a single je gets generated in the final code
+; for lowering the CMOV pseudos that get created for this IR.
+; CHECK-LABEL: foo5:
+; CHECK: je
+; CHECK-NOT: je
+define double @foo5(i32 %v1, double %v2, double %v3, double %v4) nounwind {
+entry:
+  %cmp = icmp eq i32 %v1, 0
+  %t1 = select i1 %cmp, double %v2, double %v3
+  %t2 = select i1 %cmp, double %v3, double %v4
+  %sub = fsub double %t1, %t2
+  ret double %sub
+}
+
+; This test checks that only a single je gets generated in the final code
+; for lowering the CMOV pseudos that get created for this IR.
+; CHECK-LABEL: foo6:
+; CHECK: je
+; CHECK-NOT: je
+define <4 x float> @foo6(i32 %v1, <4 x float> %v2, <4 x float> %v3, <4 x float> %v4) nounwind {
+entry:
+  %cmp = icmp eq i32 %v1, 0
+  %t1 = select i1 %cmp, <4 x float> %v2, <4 x float> %v3
+  %t2 = select i1 %cmp, <4 x float> %v3, <4 x float> %v4
+  %sub = fsub <4 x float> %t1, %t2
+  ret <4 x float> %sub
+}
+
+; This test checks that only a single je gets generated in the final code
+; for lowering the CMOV pseudos that get created for this IR.
+; CHECK-LABEL: foo7:
+; CHECK: je
+; CHECK-NOT: je
+define <2 x double> @foo7(i32 %v1, <2 x double> %v2, <2 x double> %v3, <2 x double> %v4) nounwind {
+entry:
+  %cmp = icmp eq i32 %v1, 0
+  %t1 = select i1 %cmp, <2 x double> %v2, <2 x double> %v3
+  %t2 = select i1 %cmp, <2 x double> %v3, <2 x double> %v4
+  %sub = fsub <2 x double> %t1, %t2
+  ret <2 x double> %sub
+}
+
+; This test checks that only a single ja gets generated in the final code
+; for lowering the CMOV pseudos that get created for this IR. This combines
+; many of the supported types (but not i8: %v2 and %v3 are unused) together
+; into one long string of selects based on the same condition.
+; CHECK-LABEL: foo8:
+; CHECK: ja
+; CHECK-NOT: ja
+define void @foo8(i32 %v1,
+                  i8 %v2, i8 %v3,
+                  i16 %v12, i16 %v13,
+                  i32 %v22, i32 %v23,
+                  float %v32, float %v33,
+                  double %v42, double %v43,
+                  <4 x float> %v52, <4 x float> %v53,
+                  <2 x double> %v62, <2 x double> %v63,
+                  <8 x float> %v72, <8 x float> %v73,
+                  <4 x double> %v82, <4 x double> %v83,
+                  <16 x float> %v92, <16 x float> %v93,
+                  <8 x double> %v102, <8 x double> %v103,
+                  i8 * %dst) nounwind {
+entry:
+  %add.ptr11 = getelementptr inbounds i8, i8* %dst, i32 2
+  %a11 = bitcast i8* %add.ptr11 to i16*
+
+  %add.ptr21 = getelementptr inbounds i8, i8* %dst, i32 4
+  %a21 = bitcast i8* %add.ptr21 to i32*
+
+  %add.ptr31 = getelementptr inbounds i8, i8* %dst, i32 8
+  %a31 = bitcast i8* %add.ptr31 to float*
+
+  %add.ptr41 = getelementptr inbounds i8, i8* %dst, i32 16
+  %a41 = bitcast i8* %add.ptr41 to double*
+
+  %add.ptr51 = getelementptr inbounds i8, i8* %dst, i32 32
+  %a51 = bitcast i8* %add.ptr51 to <4 x float>*
+
+  %add.ptr61 = getelementptr inbounds i8, i8* %dst, i32 48
+  %a61 = bitcast i8* %add.ptr61 to <2 x double>*
+
+  %add.ptr71 = getelementptr inbounds i8, i8* %dst, i32 64
+  %a71 = bitcast i8* %add.ptr71 to <8 x float>*
+
+  %add.ptr81 = getelementptr inbounds i8, i8* %dst, i32 128
+  %a81 = bitcast i8* %add.ptr81 to <4 x double>*
+
+  %add.ptr91 = getelementptr inbounds i8, i8* %dst, i32 64
+  %a91 = bitcast i8* %add.ptr91 to <16 x float>*
+
+  %add.ptr101 = getelementptr inbounds i8, i8* %dst, i32 128
+  %a101 = bitcast i8* %add.ptr101 to <8 x double>*
+
+  ; These operations are necessary, because select of two single use loads
+  ; ends up getting optimized into a select of two leas, followed by a
+  ; single load of the selected address.
+  %t13 = xor i16 %v13, 11
+  %t23 = xor i32 %v23, 1234
+  %t33 = fadd float %v33, %v32
+  %t43 = fadd double %v43, %v42
+  %t53 = fadd <4 x float> %v53, %v52
+  %t63 = fadd <2 x double> %v63, %v62
+  %t73 = fsub <8 x float> %v73, %v72
+  %t83 = fsub <4 x double> %v83, %v82
+  %t93 = fsub <16 x float> %v93, %v92
+  %t103 = fsub <8 x double> %v103, %v102
+
+  %cmp = icmp ugt i32 %v1, 31
+  %t11 = select i1 %cmp, i16 %v12, i16 %t13
+  %t21 = select i1 %cmp, i32 %v22, i32 %t23
+  %t31 = select i1 %cmp, float %v32, float %t33
+  %t41 = select i1 %cmp, double %v42, double %t43
+  %t51 = select i1 %cmp, <4 x float> %v52, <4 x float> %t53
+  %t61 = select i1 %cmp, <2 x double> %v62, <2 x double> %t63
+  %t71 = select i1 %cmp, <8 x float> %v72, <8 x float> %t73
+  %t81 = select i1 %cmp, <4 x double> %v82, <4 x double> %t83
+  %t91 = select i1 %cmp, <16 x float> %v92, <16 x float> %t93
+  %t101 = select i1 %cmp, <8 x double> %v102, <8 x double> %t103
+
+  store i16 %t11, i16* %a11, align 2
+  store i32 %t21, i32* %a21, align 4
+  store float %t31, float* %a31, align 4
+  store double %t41, double* %a41, align 8
+  store <4 x float> %t51, <4 x float>* %a51, align 16
+  store <2 x double> %t61, <2 x double>* %a61, align 16
+  store <8 x float> %t71, <8 x float>* %a71, align 32
+  store <4 x double> %t81, <4 x double>* %a81, align 32
+  store <16 x float> %t91, <16 x float>* %a91, align 32
+  store <8 x double> %t101, <8 x double>* %a101, align 32
+
+  ret void
+}
+
+; This test checks that only a single ja gets generated in the final code
+; for lowering the CMOV pseudos that get created for this IR, which are all
+; based on the same condition.
+; Contrary to my expectations, this doesn't exercise the code for
+; CMOV_V8I1, CMOV_V16I1, CMOV_V32I1, or CMOV_V64I1.  Instead the selects all
+; get lowered into vector length number of selects, which all eventually turn
+; into a huge number of CMOV_GR8, which are all contiguous, so the optimization
+; kicks in as long as CMOV_GR8 is supported. I couldn't find a way to get
+; CMOV_V*I1 pseudo-opcodes to get generated. If a way exists to get CMOV_V*I1
+; pseudo-opcodes to be generated, this test should be replaced with one that
+; tests those opcodes.
+;
+; CHECK-LABEL: foo9:
+; CHECK: ja
+; CHECK-NOT: ja
+define void @foo9(i32 %v1,
+                  <8 x i1> %v12, <8 x i1> %v13,
+                  <16 x i1> %v22, <16 x i1> %v23,
+                  <32 x i1> %v32, <32 x i1> %v33,
+                  <64 x i1> %v42, <64 x i1> %v43,
+                  i8 * %dst) nounwind {
+entry:
+  %add.ptr11 = getelementptr inbounds i8, i8* %dst, i32 0
+  %a11 = bitcast i8* %add.ptr11 to <8 x i1>*
+
+  %add.ptr21 = getelementptr inbounds i8, i8* %dst, i32 4
+  %a21 = bitcast i8* %add.ptr21 to <16 x i1>*
+
+  %add.ptr31 = getelementptr inbounds i8, i8* %dst, i32 8
+  %a31 = bitcast i8* %add.ptr31 to <32 x i1>*
+
+  %add.ptr41 = getelementptr inbounds i8, i8* %dst, i32 16
+  %a41 = bitcast i8* %add.ptr41 to <64 x i1>*
+
+  ; These operations are necessary, because select of two single use loads
+  ; ends up getting optimized into a select of two leas, followed by a
+  ; single load of the selected address.
+  %t13 = xor <8 x i1> %v13, %v12
+  %t23 = xor <16 x i1> %v23, %v22
+  %t33 = xor <32 x i1> %v33, %v32
+  %t43 = xor <64 x i1> %v43, %v42
+
+  %cmp = icmp ugt i32 %v1, 31
+  %t11 = select i1 %cmp, <8 x i1> %v12, <8 x i1> %t13
+  %t21 = select i1 %cmp, <16 x i1> %v22, <16 x i1> %t23
+  %t31 = select i1 %cmp, <32 x i1> %v32, <32 x i1> %t33
+  %t41 = select i1 %cmp, <64 x i1> %v42, <64 x i1> %t43
+
+  store <8 x i1> %t11, <8 x i1>* %a11, align 16
+  store <16 x i1> %t21, <16 x i1>* %a21, align 4
+  store <32 x i1> %t31, <32 x i1>* %a31, align 8
+  store <64 x i1> %t41, <64 x i1>* %a41, align 16
+
+  ret void
+}
--- a/llvm/test/CodeGen/X86/pseudo_cmov_lower1.ll
+++ b/llvm/test/CodeGen/X86/pseudo_cmov_lower1.ll
@ -0,0 +1,39 @@
+; RUN: llc < %s -mtriple=i386-linux-gnu -mattr=+sse2 -o - | FileCheck %s 
+; RUN: llc < %s -mtriple=x86_64-linux-gnu -o - | FileCheck %s 
+
+; This test checks that only a single jae gets generated in the final code
+; for lowering the CMOV pseudos that get created for this IR.
+; CHECK-LABEL: foo1:
+; CHECK: jae
+; CHECK-NOT: jae
+define double @foo1(float %p1, double %p2, double %p3) nounwind {
+entry:
+  %c1 = fcmp oge float %p1, 0.000000e+00
+  %d0 = fadd double %p2, 1.25e0
+  %d1 = fadd double %p3, 1.25e0
+  %d2 = select i1 %c1, double %d0, double %d1
+  %d3 = select i1 %c1, double %d0, double %p2
+  %d4 = select i1 %c1, double %p3, double %d1
+  %d5 = fsub double %d2, %d3
+  %d6 = fadd double %d5, %d4
+  ret double %d6
+}
+
+; This test checks that only a single jae gets generated in the final code
+; for lowering the CMOV pseudos that get created for this IR.
+; CHECK-LABEL: foo2:
+; CHECK: jae
+; CHECK-NOT: jae
+define float @foo2(float %p1, float %p2, float %p3) nounwind {
+entry:
+  %c1 = fcmp oge float %p1, 0.000000e+00
+  %d0 = fadd float %p2, 1.25e0
+  %d1 = fadd float %p3, 1.25e0
+  %d2 = select i1 %c1, float %d0, float %d1
+  %d3 = select i1 %c1, float %d1, float %p2
+  %d4 = select i1 %c1, float %d0, float %p3
+  %d5 = fsub float %d2, %d3
+  %d6 = fadd float %d5, %d4
+  ret float %d6
+}
+
--- a/llvm/test/CodeGen/X86/pseudo_cmov_lower2.ll
+++ b/llvm/test/CodeGen/X86/pseudo_cmov_lower2.ll
@ -0,0 +1,100 @@
+; RUN: llc < %s -mtriple=x86_64-linux-gnu -o - | FileCheck %s 
+
+; This test checks that only a single jae gets generated in the final code
+; for lowering the CMOV pseudos that get created for this IR.  The tricky part
+; of this test is that it tests the special PHI operand rewriting code in
+; X86TargetLowering::EmitLoweredSelect.
+;
+; CHECK-LABEL: foo1:
+; CHECK: jae
+; CHECK-NOT: jae
+define double @foo1(float %p1, double %p2, double %p3) nounwind {
+entry:
+  %c1 = fcmp oge float %p1, 0.000000e+00
+  %d0 = fadd double %p2, 1.25e0
+  %d1 = fadd double %p3, 1.25e0
+  %d2 = select i1 %c1, double %d0, double %d1
+  %d3 = select i1 %c1, double %d2, double %p2
+  %d4 = select i1 %c1, double %d3, double %p3
+  %d5 = fsub double %d2, %d3
+  %d6 = fadd double %d5, %d4
+  ret double %d6
+}
+
+; This test checks that only a single jae gets generated in the final code
+; for lowering the CMOV pseudos that get created for this IR.  The tricky part
+; of this test is that it tests the special PHI operand rewriting code in
+; X86TargetLowering::EmitLoweredSelect.
+;
+; CHECK-LABEL: foo2:
+; CHECK: jae
+; CHECK-NOT: jae
+define double @foo2(float %p1, double %p2, double %p3) nounwind {
+entry:
+  %c1 = fcmp oge float %p1, 0.000000e+00
+  %d0 = fadd double %p2, 1.25e0
+  %d1 = fadd double %p3, 1.25e0
+  %d2 = select i1 %c1, double %d0, double %d1
+  %d3 = select i1 %c1, double %p2, double %d2
+  %d4 = select i1 %c1, double %p3, double %d3
+  %d5 = fsub double %d2, %d3
+  %d6 = fadd double %d5, %d4
+  ret double %d6
+}
+
+; This test checks that only a single js gets generated in the final code
+; for lowering the CMOV pseudos that get created for this IR.  The tricky part
+; of this test is that it tests the special PHI operand rewriting code in
+; X86TargetLowering::EmitLoweredSelect.  It also tests to make sure all
+; the operands of the resulting instructions are from the proper places.
+;
+; CHECK-LABEL: foo3:
+; CHECK:          js
+; CHECK-NOT: js
+; CHECK-LABEL: # BB#1:
+; CHECK-DAG:      movapd  %xmm2, %xmm1
+; CHECK-DAG:      movapd  %xmm2, %xmm0
+; CHECK-LABEL:.LBB2_2:
+; CHECK:          divsd   %xmm1, %xmm0
+; CHECK:          ret
+define double @foo3(i32 %p1, double %p2, double %p3,
+                             double %p4, double %p5) nounwind {
+entry:
+  %c1 = icmp slt i32 %p1, 0
+  %d2 = select i1 %c1, double %p2, double %p3
+  %d3 = select i1 %c1, double %p3, double %p4
+  %d4 = select i1 %c1, double %d2, double %d3
+  %d5 = fdiv double %d4, %d3
+  ret double %d5
+}
+
+; This test checks that only a single js gets generated in the final code
+; for lowering the CMOV pseudos that get created for this IR.  The tricky part
+; of this test is that it tests the special PHI operand rewriting code in
+; X86TargetLowering::EmitLoweredSelect.  It also tests to make sure all
+; the operands of the resulting instructions are from the proper places
+; when the "opposite condition" handling code in the compiler is used.
+; This should be the same code as foo3 above, because we use the opposite
+; condition code in the second two selects, but we also swap the operands
+; of the selects to give the same actual computation.
+;
+; CHECK-LABEL: foo4:
+; CHECK:          js
+; CHECK-NOT: js
+; CHECK-LABEL: # BB#1:
+; CHECK-DAG:      movapd  %xmm2, %xmm1
+; CHECK-DAG:      movapd  %xmm2, %xmm0
+; CHECK-LABEL:.LBB3_2:
+; CHECK:          divsd   %xmm1, %xmm0
+; CHECK:          ret
+define double @foo4(i32 %p1, double %p2, double %p3,
+                             double %p4, double %p5) nounwind {
+entry:
+  %c1 = icmp slt i32 %p1, 0
+  %d2 = select i1 %c1, double %p2, double %p3
+  %c2 = icmp sge i32 %p1, 0
+  %d3 = select i1 %c2, double %p4, double %p3
+  %d4 = select i1 %c2, double %d3, double %d2
+  %d5 = fdiv double %d4, %d3
+  ret double %d5
+}