[X86] Improve EmitLoweredSelect for contiguous CMOV pseudo instructions.

This change improves EmitLoweredSelect() so that multiple contiguous CMOV pseudo
instructions with the same (or exactly opposite) conditions get lowered using a single
new basic-block. This eliminates unnecessary extra basic-blocks (and CFG merge points)
when contiguous CMOVs are being lowered.

Patch by: kevin.b.smith@intel.com
Differential Revision: http://reviews.llvm.org/D11428

llvm-svn: 244202
This commit is contained in:
Michael Kuperstein 2015-08-06 08:45:34 +00:00
parent d7b9392f59
commit 868dc65444
4 changed files with 567 additions and 35 deletions

View File

@ -19947,6 +19947,39 @@ static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
return true;
}
// Returns true when MI is one of the CMOV pseudo-opcodes that may be
// cascaded: contiguous runs of these (with the same or exactly opposite
// condition) can share a single inserted basic-block and conditional jump.
static bool isCMOVPseudo(MachineInstr *MI) {
  switch (MI->getOpcode()) {
  default:
    return false;
  // Scalar integer and FP pseudo-CMOVs.
  case X86::CMOV_GR8:
  case X86::CMOV_GR16:
  case X86::CMOV_GR32:
  case X86::CMOV_FR32:
  case X86::CMOV_FR64:
  case X86::CMOV_RFP32:
  case X86::CMOV_RFP64:
  case X86::CMOV_RFP80:
  // Vector pseudo-CMOVs.
  case X86::CMOV_V2F64:
  case X86::CMOV_V2I64:
  case X86::CMOV_V4F32:
  case X86::CMOV_V4F64:
  case X86::CMOV_V4I64:
  case X86::CMOV_V8F32:
  case X86::CMOV_V8F64:
  case X86::CMOV_V8I64:
  case X86::CMOV_V16F32:
  // Mask-vector pseudo-CMOVs.
  case X86::CMOV_V8I1:
  case X86::CMOV_V16I1:
  case X86::CMOV_V32I1:
  case X86::CMOV_V64I1:
    return true;
  }
}
MachineBasicBlock *
X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
MachineBasicBlock *BB) const {
@ -19970,8 +20003,41 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
MachineBasicBlock *thisMBB = BB;
MachineFunction *F = BB->getParent();
// We also lower double CMOVs:
// This code lowers all pseudo-CMOV instructions. Generally it lowers these
// as described above, by inserting a BB, and then making a PHI at the join
// point to select the true and false operands of the CMOV in the PHI.
//
// The code also handles two different cases of multiple CMOV opcodes
// in a row.
//
// Case 1:
// In this case, there are multiple CMOVs in a row, all which are based on
// the same condition setting (or the exact opposite condition setting).
// In this case we can lower all the CMOVs using a single inserted BB, and
// then make a number of PHIs at the join point to model the CMOVs. The only
// trickiness here, is that in a case like:
//
// t2 = CMOV cond1 t1, f1
// t3 = CMOV cond1 t2, f2
//
// when rewriting this into PHIs, we have to perform some renaming on the
// temps since you cannot have a PHI operand refer to a PHI result earlier
// in the same block. The "simple" but wrong lowering would be:
//
// t2 = PHI t1(BB1), f1(BB2)
// t3 = PHI t2(BB1), f2(BB2)
//
// but clearly t2 is not defined in BB1, so that is incorrect. The proper
// renaming is to note that on the path through BB1, t2 is really just a
// copy of t1, and do that renaming, properly generating:
//
// t2 = PHI t1(BB1), f1(BB2)
// t3 = PHI t1(BB1), f2(BB2)
//
// Case 2, we lower cascaded CMOVs such as
//
// (CMOV (CMOV F, T, cc1), T, cc2)
//
// to two successive branches. For that, we look for another CMOV as the
// following instruction.
//
@ -20037,19 +20103,42 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
// .LBB5_4:
// retq
//
MachineInstr *NextCMOV = nullptr;
MachineInstr *CascadedCMOV = nullptr;
MachineInstr *LastCMOV = MI;
X86::CondCode CC = X86::CondCode(MI->getOperand(3).getImm());
X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
MachineBasicBlock::iterator NextMIIt =
std::next(MachineBasicBlock::iterator(MI));
if (NextMIIt != BB->end() && NextMIIt->getOpcode() == MI->getOpcode() &&
// Check for case 1, where there are multiple CMOVs with the same condition
// first. Of the two cases of multiple CMOV lowerings, case 1 reduces the
// number of jumps the most.
if (isCMOVPseudo(MI)) {
// See if we have a string of CMOVS with the same condition.
while (NextMIIt != BB->end() &&
isCMOVPseudo(NextMIIt) &&
(NextMIIt->getOperand(3).getImm() == CC ||
NextMIIt->getOperand(3).getImm() == OppCC)) {
LastCMOV = &*NextMIIt;
++NextMIIt;
}
}
// This checks for case 2, but only do this if we didn't already find
// case 1, as indicated by LastCMOV == MI.
if (LastCMOV == MI &&
NextMIIt != BB->end() && NextMIIt->getOpcode() == MI->getOpcode() &&
NextMIIt->getOperand(2).getReg() == MI->getOperand(2).getReg() &&
NextMIIt->getOperand(1).getReg() == MI->getOperand(0).getReg())
NextCMOV = &*NextMIIt;
NextMIIt->getOperand(1).getReg() == MI->getOperand(0).getReg()) {
CascadedCMOV = &*NextMIIt;
}
MachineBasicBlock *jcc1MBB = nullptr;
// If we have a double CMOV, we lower it to two successive branches to
// If we have a cascaded CMOV, we lower it to two successive branches to
// the same block. EFLAGS is used by both, so mark it as live in the second.
if (NextCMOV) {
if (CascadedCMOV) {
jcc1MBB = F->CreateMachineBasicBlock(LLVM_BB);
F->insert(It, jcc1MBB);
jcc1MBB->addLiveIn(X86::EFLAGS);
@ -20064,7 +20153,7 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
// live into the sink and copy blocks.
const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
MachineInstr *LastEFLAGSUser = NextCMOV ? NextCMOV : MI;
MachineInstr *LastEFLAGSUser = CascadedCMOV ? CascadedCMOV : LastCMOV;
if (!LastEFLAGSUser->killsRegister(X86::EFLAGS) &&
!checkAndUpdateEFLAGSKill(LastEFLAGSUser, BB, TRI)) {
copy0MBB->addLiveIn(X86::EFLAGS);
@ -20073,12 +20162,12 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
// Transfer the remainder of BB and its successor edges to sinkMBB.
sinkMBB->splice(sinkMBB->begin(), BB,
std::next(MachineBasicBlock::iterator(MI)), BB->end());
std::next(MachineBasicBlock::iterator(LastCMOV)), BB->end());
sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
// Add the true and fallthrough blocks as its successors.
if (NextCMOV) {
// The fallthrough block may be jcc1MBB, if we have a double CMOV.
if (CascadedCMOV) {
// The fallthrough block may be jcc1MBB, if we have a cascaded CMOV.
BB->addSuccessor(jcc1MBB);
// In that case, jcc1MBB will itself fallthrough the copy0MBB, and
@ -20093,13 +20182,12 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
BB->addSuccessor(sinkMBB);
// Create the conditional branch instruction.
unsigned Opc =
X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm());
unsigned Opc = X86::GetCondBranchFromCond(CC);
BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB);
if (NextCMOV) {
if (CascadedCMOV) {
unsigned Opc2 = X86::GetCondBranchFromCond(
(X86::CondCode)NextCMOV->getOperand(3).getImm());
(X86::CondCode)CascadedCMOV->getOperand(3).getImm());
BuildMI(jcc1MBB, DL, TII->get(Opc2)).addMBB(sinkMBB);
}
@ -20111,24 +20199,62 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
// sinkMBB:
// %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
// ...
MachineInstrBuilder MIB =
BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(X86::PHI),
MI->getOperand(0).getReg())
.addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB)
.addReg(MI->getOperand(2).getReg()).addMBB(thisMBB);
MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
MachineBasicBlock::iterator MIItEnd =
std::next(MachineBasicBlock::iterator(LastCMOV));
MachineBasicBlock::iterator SinkInsertionPoint = sinkMBB->begin();
DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
MachineInstrBuilder MIB;
// If we have a double CMOV, the second Jcc provides the same incoming
// As we are creating the PHIs, we have to be careful if there is more than
// one. Later CMOVs may reference the results of earlier CMOVs, but later
// PHIs have to reference the individual true/false inputs from earlier PHIs.
// That also means that PHI construction must work forward from earlier to
// later, and that the code must maintain a mapping from earlier PHI's
// destination registers, and the registers that went into the PHI.
for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
unsigned DestReg = MIIt->getOperand(0).getReg();
unsigned Op1Reg = MIIt->getOperand(1).getReg();
unsigned Op2Reg = MIIt->getOperand(2).getReg();
// If this CMOV we are generating is the opposite condition from
// the jump we generated, then we have to swap the operands for the
// PHI that is going to be generated.
if (MIIt->getOperand(3).getImm() == OppCC)
std::swap(Op1Reg, Op2Reg);
if (RegRewriteTable.find(Op1Reg) != RegRewriteTable.end())
Op1Reg = RegRewriteTable[Op1Reg].first;
if (RegRewriteTable.find(Op2Reg) != RegRewriteTable.end())
Op2Reg = RegRewriteTable[Op2Reg].second;
MIB = BuildMI(*sinkMBB, SinkInsertionPoint, DL,
TII->get(X86::PHI), DestReg)
.addReg(Op1Reg).addMBB(copy0MBB)
.addReg(Op2Reg).addMBB(thisMBB);
// Add this PHI to the rewrite table.
RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
}
// If we have a cascaded CMOV, the second Jcc provides the same incoming
// value as the first Jcc (the True operand of the SELECT_CC/CMOV nodes).
if (NextCMOV) {
if (CascadedCMOV) {
MIB.addReg(MI->getOperand(2).getReg()).addMBB(jcc1MBB);
// Copy the PHI result to the register defined by the second CMOV.
BuildMI(*sinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())),
DL, TII->get(TargetOpcode::COPY), NextCMOV->getOperand(0).getReg())
DL, TII->get(TargetOpcode::COPY),
CascadedCMOV->getOperand(0).getReg())
.addReg(MI->getOperand(0).getReg());
NextCMOV->eraseFromParent();
CascadedCMOV->eraseFromParent();
}
MI->eraseFromParent(); // The pseudo instruction is gone now.
// Now remove the CMOV(s).
for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; )
(MIIt++)->eraseFromParent();
return sinkMBB;
}
@ -20703,23 +20829,23 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
case X86::TLSCall_32:
case X86::TLSCall_64:
return EmitLoweredTLSCall(MI, BB);
case X86::CMOV_GR8:
case X86::CMOV_FR32:
case X86::CMOV_FR64:
case X86::CMOV_V4F32:
case X86::CMOV_V2F64:
case X86::CMOV_V2I64:
case X86::CMOV_V8F32:
case X86::CMOV_V4F64:
case X86::CMOV_V4I64:
case X86::CMOV_V16F32:
case X86::CMOV_V8F64:
case X86::CMOV_V8I64:
case X86::CMOV_GR8:
case X86::CMOV_GR16:
case X86::CMOV_GR32:
case X86::CMOV_RFP32:
case X86::CMOV_RFP64:
case X86::CMOV_RFP80:
case X86::CMOV_V2F64:
case X86::CMOV_V2I64:
case X86::CMOV_V4F32:
case X86::CMOV_V4F64:
case X86::CMOV_V4I64:
case X86::CMOV_V16F32:
case X86::CMOV_V8F32:
case X86::CMOV_V8F64:
case X86::CMOV_V8I64:
case X86::CMOV_V8I1:
case X86::CMOV_V16I1:
case X86::CMOV_V32I1:

View File

@ -0,0 +1,267 @@
; RUN: llc < %s -mtriple=i386-linux-gnu -o - | FileCheck %s
; This test checks that only a single js gets generated in the final code
; for lowering the CMOV pseudos that get created for this IR.
; CHECK-LABEL: foo1:
; CHECK: js
; CHECK-NOT: js
define i32 @foo1(i32 %v1, i32 %v2, i32 %v3) nounwind {
entry:
; Both selects use the same condition %cmp, so their CMOV pseudos are
; contiguous and should be lowered with a single branch.
%cmp = icmp slt i32 %v1, 0
%v2.v3 = select i1 %cmp, i32 %v2, i32 %v3
%v1.v2 = select i1 %cmp, i32 %v1, i32 %v2
%sub = sub i32 %v1.v2, %v2.v3
ret i32 %sub
}
; This test checks that only a single js gets generated in the final code
; for lowering the CMOV pseudos that get created for this IR. This makes
; sure the code for the lowering for opposite conditions gets tested.
; CHECK-LABEL: foo11:
; CHECK: js
; CHECK-NOT: js
; CHECK-NOT: jns
define i32 @foo11(i32 %v1, i32 %v2, i32 %v3) nounwind {
entry:
; %cmp2 (sge) is the exact opposite of %cmp1 (slt), exercising the
; opposite-condition handling in the CMOV-run lowering.
%cmp1 = icmp slt i32 %v1, 0
%v2.v3 = select i1 %cmp1, i32 %v2, i32 %v3
%cmp2 = icmp sge i32 %v1, 0
%v1.v2 = select i1 %cmp2, i32 %v1, i32 %v2
%sub = sub i32 %v1.v2, %v2.v3
ret i32 %sub
}
; This test checks that only a single js gets generated in the final code
; for lowering the CMOV pseudos that get created for this IR.
; CHECK-LABEL: foo2:
; CHECK: js
; CHECK-NOT: js
define i32 @foo2(i8 %v1, i8 %v2, i8 %v3) nounwind {
entry:
; i8 variant: both selects share %cmp.
%cmp = icmp slt i8 %v1, 0
%v2.v3 = select i1 %cmp, i8 %v2, i8 %v3
%v1.v2 = select i1 %cmp, i8 %v1, i8 %v2
%t1 = sext i8 %v2.v3 to i32
%t2 = sext i8 %v1.v2 to i32
%sub = sub i32 %t1, %t2
ret i32 %sub
}
; This test checks that only a single js gets generated in the final code
; for lowering the CMOV pseudos that get created for this IR.
; CHECK-LABEL: foo3:
; CHECK: js
; CHECK-NOT: js
define i32 @foo3(i16 %v1, i16 %v2, i16 %v3) nounwind {
entry:
; i16 variant: both selects share %cmp.
%cmp = icmp slt i16 %v1, 0
%v2.v3 = select i1 %cmp, i16 %v2, i16 %v3
%v1.v2 = select i1 %cmp, i16 %v1, i16 %v2
%t1 = sext i16 %v2.v3 to i32
%t2 = sext i16 %v1.v2 to i32
%sub = sub i32 %t1, %t2
ret i32 %sub
}
; This test checks that only a single js gets generated in the final code
; for lowering the CMOV pseudos that get created for this IR.
; CHECK-LABEL: foo4:
; CHECK: js
; CHECK-NOT: js
define float @foo4(i32 %v1, float %v2, float %v3, float %v4) nounwind {
entry:
; float variant: both selects share %cmp.
%cmp = icmp slt i32 %v1, 0
%t1 = select i1 %cmp, float %v2, float %v3
%t2 = select i1 %cmp, float %v3, float %v4
%sub = fsub float %t1, %t2
ret float %sub
}
; This test checks that only a single je gets generated in the final code
; for lowering the CMOV pseudos that get created for this IR.
; CHECK-LABEL: foo5:
; CHECK: je
; CHECK-NOT: je
define double @foo5(i32 %v1, double %v2, double %v3, double %v4) nounwind {
entry:
; double variant: both selects share the eq condition %cmp.
%cmp = icmp eq i32 %v1, 0
%t1 = select i1 %cmp, double %v2, double %v3
%t2 = select i1 %cmp, double %v3, double %v4
%sub = fsub double %t1, %t2
ret double %sub
}
; This test checks that only a single je gets generated in the final code
; for lowering the CMOV pseudos that get created for this IR.
; CHECK-LABEL: foo6:
; CHECK: je
; CHECK-NOT: je
define <4 x float> @foo6(i32 %v1, <4 x float> %v2, <4 x float> %v3, <4 x float> %v4) nounwind {
entry:
; <4 x float> variant: both selects share %cmp.
%cmp = icmp eq i32 %v1, 0
%t1 = select i1 %cmp, <4 x float> %v2, <4 x float> %v3
%t2 = select i1 %cmp, <4 x float> %v3, <4 x float> %v4
%sub = fsub <4 x float> %t1, %t2
ret <4 x float> %sub
}
; This test checks that only a single je gets generated in the final code
; for lowering the CMOV pseudos that get created for this IR.
; CHECK-LABEL: foo7:
; CHECK: je
; CHECK-NOT: je
define <2 x double> @foo7(i32 %v1, <2 x double> %v2, <2 x double> %v3, <2 x double> %v4) nounwind {
entry:
; <2 x double> variant: both selects share %cmp.
%cmp = icmp eq i32 %v1, 0
%t1 = select i1 %cmp, <2 x double> %v2, <2 x double> %v3
%t2 = select i1 %cmp, <2 x double> %v3, <2 x double> %v4
%sub = fsub <2 x double> %t1, %t2
ret <2 x double> %sub
}
; This test checks that only a single ja gets generated in the final code
; for lowering the CMOV pseudos that get created for this IR. This combines
; all the supported types together into one long string of selects based
; on the same condition.
; CHECK-LABEL: foo8:
; CHECK: ja
; CHECK-NOT: ja
; All supported scalar and vector types selected on one condition, so every
; resulting CMOV pseudo lands in one contiguous run.
define void @foo8(i32 %v1,
i8 %v2, i8 %v3,
i16 %v12, i16 %v13,
i32 %v22, i32 %v23,
float %v32, float %v33,
double %v42, double %v43,
<4 x float> %v52, <4 x float> %v53,
<2 x double> %v62, <2 x double> %v63,
<8 x float> %v72, <8 x float> %v73,
<4 x double> %v82, <4 x double> %v83,
<16 x float> %v92, <16 x float> %v93,
<8 x double> %v102, <8 x double> %v103,
i8 * %dst) nounwind {
entry:
%add.ptr11 = getelementptr inbounds i8, i8* %dst, i32 2
%a11 = bitcast i8* %add.ptr11 to i16*
%add.ptr21 = getelementptr inbounds i8, i8* %dst, i32 4
%a21 = bitcast i8* %add.ptr21 to i32*
%add.ptr31 = getelementptr inbounds i8, i8* %dst, i32 8
%a31 = bitcast i8* %add.ptr31 to float*
%add.ptr41 = getelementptr inbounds i8, i8* %dst, i32 16
%a41 = bitcast i8* %add.ptr41 to double*
%add.ptr51 = getelementptr inbounds i8, i8* %dst, i32 32
%a51 = bitcast i8* %add.ptr51 to <4 x float>*
%add.ptr61 = getelementptr inbounds i8, i8* %dst, i32 48
%a61 = bitcast i8* %add.ptr61 to <2 x double>*
%add.ptr71 = getelementptr inbounds i8, i8* %dst, i32 64
%a71 = bitcast i8* %add.ptr71 to <8 x float>*
%add.ptr81 = getelementptr inbounds i8, i8* %dst, i32 128
%a81 = bitcast i8* %add.ptr81 to <4 x double>*
%add.ptr91 = getelementptr inbounds i8, i8* %dst, i32 64
%a91 = bitcast i8* %add.ptr91 to <16 x float>*
%add.ptr101 = getelementptr inbounds i8, i8* %dst, i32 128
%a101 = bitcast i8* %add.ptr101 to <8 x double>*
; These operations are necessary, because select of two single use loads
; ends up getting optimized into a select of two leas, followed by a
; single load of the selected address.
%t13 = xor i16 %v13, 11
%t23 = xor i32 %v23, 1234
%t33 = fadd float %v33, %v32
%t43 = fadd double %v43, %v42
%t53 = fadd <4 x float> %v53, %v52
%t63 = fadd <2 x double> %v63, %v62
%t73 = fsub <8 x float> %v73, %v72
%t83 = fsub <4 x double> %v83, %v82
%t93 = fsub <16 x float> %v93, %v92
%t103 = fsub <8 x double> %v103, %v102
; One condition feeding every select below.
%cmp = icmp ugt i32 %v1, 31
%t11 = select i1 %cmp, i16 %v12, i16 %t13
%t21 = select i1 %cmp, i32 %v22, i32 %t23
%t31 = select i1 %cmp, float %v32, float %t33
%t41 = select i1 %cmp, double %v42, double %t43
%t51 = select i1 %cmp, <4 x float> %v52, <4 x float> %t53
%t61 = select i1 %cmp, <2 x double> %v62, <2 x double> %t63
%t71 = select i1 %cmp, <8 x float> %v72, <8 x float> %t73
%t81 = select i1 %cmp, <4 x double> %v82, <4 x double> %t83
%t91 = select i1 %cmp, <16 x float> %v92, <16 x float> %t93
%t101 = select i1 %cmp, <8 x double> %v102, <8 x double> %t103
store i16 %t11, i16* %a11, align 2
store i32 %t21, i32* %a21, align 4
store float %t31, float* %a31, align 4
store double %t41, double* %a41, align 8
store <4 x float> %t51, <4 x float>* %a51, align 16
store <2 x double> %t61, <2 x double>* %a61, align 16
store <8 x float> %t71, <8 x float>* %a71, align 32
store <4 x double> %t81, <4 x double>* %a81, align 32
store <16 x float> %t91, <16 x float>* %a91, align 32
store <8 x double> %t101, <8 x double>* %a101, align 32
ret void
}
; This test checks that only a single ja gets generated in the final code
; for lowering the CMOV pseudos that get created for this IR, all of which
; are based on the same condition.
; Contrary to my expectations, this doesn't exercise the code for
; CMOV_V8I1, CMOV_V16I1, CMOV_V32I1, or CMOV_V64I1. Instead the selects all
; get lowered into vector length number of selects, which all eventually turn
; into a huge number of CMOV_GR8, which are all contiguous, so the optimization
; kicks in as long as CMOV_GR8 is supported. I couldn't find a way to get
; CMOV_V*I1 pseudo-opcodes to get generated. If a way exists to get CMOV_V*I1
; pseudo-opcodes to be generated, this test should be replaced with one that
; tests those opcodes.
;
; CHECK-LABEL: foo9:
; CHECK: ja
; CHECK-NOT: ja
; i1-vector selects on one shared condition (see the comment above about
; these legalizing to many contiguous CMOV_GR8 pseudos rather than CMOV_V*I1).
define void @foo9(i32 %v1,
<8 x i1> %v12, <8 x i1> %v13,
<16 x i1> %v22, <16 x i1> %v23,
<32 x i1> %v32, <32 x i1> %v33,
<64 x i1> %v42, <64 x i1> %v43,
i8 * %dst) nounwind {
entry:
%add.ptr11 = getelementptr inbounds i8, i8* %dst, i32 0
%a11 = bitcast i8* %add.ptr11 to <8 x i1>*
%add.ptr21 = getelementptr inbounds i8, i8* %dst, i32 4
%a21 = bitcast i8* %add.ptr21 to <16 x i1>*
%add.ptr31 = getelementptr inbounds i8, i8* %dst, i32 8
%a31 = bitcast i8* %add.ptr31 to <32 x i1>*
%add.ptr41 = getelementptr inbounds i8, i8* %dst, i32 16
%a41 = bitcast i8* %add.ptr41 to <64 x i1>*
; These operations are necessary, because select of two single use loads
; ends up getting optimized into a select of two leas, followed by a
; single load of the selected address.
%t13 = xor <8 x i1> %v13, %v12
%t23 = xor <16 x i1> %v23, %v22
%t33 = xor <32 x i1> %v33, %v32
%t43 = xor <64 x i1> %v43, %v42
; One condition feeding every select below.
%cmp = icmp ugt i32 %v1, 31
%t11 = select i1 %cmp, <8 x i1> %v12, <8 x i1> %t13
%t21 = select i1 %cmp, <16 x i1> %v22, <16 x i1> %t23
%t31 = select i1 %cmp, <32 x i1> %v32, <32 x i1> %t33
%t41 = select i1 %cmp, <64 x i1> %v42, <64 x i1> %t43
store <8 x i1> %t11, <8 x i1>* %a11, align 16
store <16 x i1> %t21, <16 x i1>* %a21, align 4
store <32 x i1> %t31, <32 x i1>* %a31, align 8
store <64 x i1> %t41, <64 x i1>* %a41, align 16
ret void
}

View File

@ -0,0 +1,39 @@
; RUN: llc < %s -mtriple=i386-linux-gnu -mattr=+sse2 -o - | FileCheck %s
; RUN: llc < %s -mtriple=x86_64-linux-gnu -o - | FileCheck %s
; This test checks that only a single jae gets generated in the final code
; for lowering the CMOV pseudos that get created for this IR.
; CHECK-LABEL: foo1:
; CHECK: jae
; CHECK-NOT: jae
define double @foo1(float %p1, double %p2, double %p3) nounwind {
entry:
; Three independent selects all keyed on %c1; no select reads another
; select's result.
%c1 = fcmp oge float %p1, 0.000000e+00
%d0 = fadd double %p2, 1.25e0
%d1 = fadd double %p3, 1.25e0
%d2 = select i1 %c1, double %d0, double %d1
%d3 = select i1 %c1, double %d0, double %p2
%d4 = select i1 %c1, double %p3, double %d1
%d5 = fsub double %d2, %d3
%d6 = fadd double %d5, %d4
ret double %d6
}
; This test checks that only a single jae gets generated in the final code
; for lowering the CMOV pseudos that get created for this IR.
; CHECK-LABEL: foo2:
; CHECK: jae
; CHECK-NOT: jae
define float @foo2(float %p1, float %p2, float %p3) nounwind {
entry:
; Float variant of foo1: three independent selects on %c1.
%c1 = fcmp oge float %p1, 0.000000e+00
%d0 = fadd float %p2, 1.25e0
%d1 = fadd float %p3, 1.25e0
%d2 = select i1 %c1, float %d0, float %d1
%d3 = select i1 %c1, float %d1, float %p2
%d4 = select i1 %c1, float %d0, float %p3
%d5 = fsub float %d2, %d3
%d6 = fadd float %d5, %d4
ret float %d6
}

View File

@ -0,0 +1,100 @@
; RUN: llc < %s -mtriple=x86_64-linux-gnu -o - | FileCheck %s
; This test checks that only a single jae gets generated in the final code
; for lowering the CMOV pseudos that get created for this IR. The tricky part
; of this test is that it tests the special PHI operand rewriting code in
; X86TargetLowering::EmitLoweredSelect.
;
; CHECK-LABEL: foo1:
; CHECK: jae
; CHECK-NOT: jae
define double @foo1(float %p1, double %p2, double %p3) nounwind {
entry:
; Chained selects: %d3 reads %d2 and %d4 reads %d3, so lowering must
; rewrite PHI operands (a PHI may not use a same-block PHI's result).
%c1 = fcmp oge float %p1, 0.000000e+00
%d0 = fadd double %p2, 1.25e0
%d1 = fadd double %p3, 1.25e0
%d2 = select i1 %c1, double %d0, double %d1
%d3 = select i1 %c1, double %d2, double %p2
%d4 = select i1 %c1, double %d3, double %p3
%d5 = fsub double %d2, %d3
%d6 = fadd double %d5, %d4
ret double %d6
}
; This test checks that only a single jae gets generated in the final code
; for lowering the CMOV pseudos that get created for this IR. The tricky part
; of this test is that it tests the special PHI operand rewriting code in
; X86TargetLowering::EmitLoweredSelect.
;
; CHECK-LABEL: foo2:
; CHECK: jae
; CHECK-NOT: jae
define double @foo2(float %p1, double %p2, double %p3) nounwind {
entry:
; Like foo1, but the chained values appear as the false operands of the
; later selects, exercising the other side of the PHI rewriting.
%c1 = fcmp oge float %p1, 0.000000e+00
%d0 = fadd double %p2, 1.25e0
%d1 = fadd double %p3, 1.25e0
%d2 = select i1 %c1, double %d0, double %d1
%d3 = select i1 %c1, double %p2, double %d2
%d4 = select i1 %c1, double %p3, double %d3
%d5 = fsub double %d2, %d3
%d6 = fadd double %d5, %d4
ret double %d6
}
; This test checks that only a single js gets generated in the final code
; for lowering the CMOV pseudos that get created for this IR. The tricky part
; of this test is that it tests the special PHI operand rewriting code in
; X86TargetLowering::EmitLoweredSelect. It also tests to make sure all
; the operands of the resulting instructions are from the proper places.
;
; CHECK-LABEL: foo3:
; CHECK: js
; CHECK-NOT: js
; CHECK-LABEL: # BB#1:
; CHECK-DAG: movapd %xmm2, %xmm1
; CHECK-DAG: movapd %xmm2, %xmm0
; CHECK-LABEL:.LBB2_2:
; CHECK: divsd %xmm1, %xmm0
; CHECK: ret
define double @foo3(i32 %p1, double %p2, double %p3,
double %p4, double %p5) nounwind {
entry:
; %d4 selects between the results of the two earlier selects, so both of
; its PHI operands need rewriting.
%c1 = icmp slt i32 %p1, 0
%d2 = select i1 %c1, double %p2, double %p3
%d3 = select i1 %c1, double %p3, double %p4
%d4 = select i1 %c1, double %d2, double %d3
%d5 = fdiv double %d4, %d3
ret double %d5
}
; This test checks that only a single js gets generated in the final code
; for lowering the CMOV pseudos that get created for this IR. The tricky part
; of this test is that it tests the special PHI operand rewriting code in
; X86TargetLowering::EmitLoweredSelect. It also tests to make sure all
; the operands of the resulting instructions are from the proper places
; when the "opposite condition" handling code in the compiler is used.
; This should be the same code as foo3 above, because we use the opposite
; condition code in the second two selects, but we also swap the operands
; of the selects to give the same actual computation.
;
; CHECK-LABEL: foo4:
; CHECK: js
; CHECK-NOT: js
; CHECK-LABEL: # BB#1:
; CHECK-DAG: movapd %xmm2, %xmm1
; CHECK-DAG: movapd %xmm2, %xmm0
; CHECK-LABEL:.LBB3_2:
; CHECK: divsd %xmm1, %xmm0
; CHECK: ret
define double @foo4(i32 %p1, double %p2, double %p3,
double %p4, double %p5) nounwind {
entry:
; Same computation as foo3, but %c2 (sge) is the opposite of %c1 (slt) and
; the later selects swap their operands accordingly, exercising the
; opposite-condition operand-swap path together with PHI rewriting.
%c1 = icmp slt i32 %p1, 0
%d2 = select i1 %c1, double %p2, double %p3
%c2 = icmp sge i32 %p1, 0
%d3 = select i1 %c2, double %p4, double %p3
%d4 = select i1 %c2, double %d3, double %d2
%d5 = fdiv double %d4, %d3
ret double %d5
}