[X86] Improve EmitLoweredSelect for contiguous CMOV pseudo instructions.
This change improves EmitLoweredSelect() so that multiple contiguous CMOV pseudo instructions with the same (or exactly opposite) conditions get lowered using a single new basic-block. This eliminates unnecessary extra basic-blocks (and CFG merge points) when contiguous CMOVs are being lowered. Patch by: kevin.b.smith@intel.com Differential Revision: http://reviews.llvm.org/D11428 llvm-svn: 244202
This commit is contained in:
parent
d7b9392f59
commit
868dc65444
|
@ -19947,6 +19947,39 @@ static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
|
|||
return true;
|
||||
}
|
||||
|
||||
// Return true if it is OK for this CMOV pseudo-opcode to be cascaded
|
||||
// together with other CMOV pseudo-opcodes into a single basic-block with
|
||||
// conditional jump around it.
|
||||
static bool isCMOVPseudo(MachineInstr *MI) {
|
||||
switch (MI->getOpcode()) {
|
||||
case X86::CMOV_FR32:
|
||||
case X86::CMOV_FR64:
|
||||
case X86::CMOV_GR8:
|
||||
case X86::CMOV_GR16:
|
||||
case X86::CMOV_GR32:
|
||||
case X86::CMOV_RFP32:
|
||||
case X86::CMOV_RFP64:
|
||||
case X86::CMOV_RFP80:
|
||||
case X86::CMOV_V2F64:
|
||||
case X86::CMOV_V2I64:
|
||||
case X86::CMOV_V4F32:
|
||||
case X86::CMOV_V4F64:
|
||||
case X86::CMOV_V4I64:
|
||||
case X86::CMOV_V16F32:
|
||||
case X86::CMOV_V8F32:
|
||||
case X86::CMOV_V8F64:
|
||||
case X86::CMOV_V8I64:
|
||||
case X86::CMOV_V8I1:
|
||||
case X86::CMOV_V16I1:
|
||||
case X86::CMOV_V32I1:
|
||||
case X86::CMOV_V64I1:
|
||||
return true;
|
||||
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
MachineBasicBlock *
|
||||
X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
|
||||
MachineBasicBlock *BB) const {
|
||||
|
@ -19970,8 +20003,41 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
|
|||
MachineBasicBlock *thisMBB = BB;
|
||||
MachineFunction *F = BB->getParent();
|
||||
|
||||
// We also lower double CMOVs:
|
||||
// This code lowers all pseudo-CMOV instructions. Generally it lowers these
|
||||
// as described above, by inserting a BB, and then making a PHI at the join
|
||||
// point to select the true and false operands of the CMOV in the PHI.
|
||||
//
|
||||
// The code also handles two different cases of multiple CMOV opcodes
|
||||
// in a row.
|
||||
//
|
||||
// Case 1:
|
||||
// In this case, there are multiple CMOVs in a row, all of which are based on
|
||||
// the same condition setting (or the exact opposite condition setting).
|
||||
// In this case we can lower all the CMOVs using a single inserted BB, and
|
||||
// then make a number of PHIs at the join point to model the CMOVs. The only
|
||||
// trickiness here is that in a case like:
|
||||
//
|
||||
// t2 = CMOV cond1 t1, f1
|
||||
// t3 = CMOV cond1 t2, f2
|
||||
//
|
||||
// when rewriting this into PHIs, we have to perform some renaming on the
|
||||
// temps since you cannot have a PHI operand refer to a PHI result earlier
|
||||
// in the same block. The "simple" but wrong lowering would be:
|
||||
//
|
||||
// t2 = PHI t1(BB1), f1(BB2)
|
||||
// t3 = PHI t2(BB1), f2(BB2)
|
||||
//
|
||||
// but clearly t2 is not defined in BB1, so that is incorrect. The proper
|
||||
// renaming is to note that on the path through BB1, t2 is really just a
|
||||
// copy of t1, and do that renaming, properly generating:
|
||||
//
|
||||
// t2 = PHI t1(BB1), f1(BB2)
|
||||
// t3 = PHI t1(BB1), f2(BB2)
|
||||
//
|
||||
// Case 2, we lower cascaded CMOVs such as
|
||||
//
|
||||
// (CMOV (CMOV F, T, cc1), T, cc2)
|
||||
//
|
||||
// to two successive branches. For that, we look for another CMOV as the
|
||||
// following instruction.
|
||||
//
|
||||
|
@ -20037,19 +20103,42 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
|
|||
// .LBB5_4:
|
||||
// retq
|
||||
//
|
||||
MachineInstr *NextCMOV = nullptr;
|
||||
MachineInstr *CascadedCMOV = nullptr;
|
||||
MachineInstr *LastCMOV = MI;
|
||||
X86::CondCode CC = X86::CondCode(MI->getOperand(3).getImm());
|
||||
X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
|
||||
MachineBasicBlock::iterator NextMIIt =
|
||||
std::next(MachineBasicBlock::iterator(MI));
|
||||
if (NextMIIt != BB->end() && NextMIIt->getOpcode() == MI->getOpcode() &&
|
||||
|
||||
// Check for case 1, where there are multiple CMOVs with the same condition
|
||||
// first. Of the two cases of multiple CMOV lowerings, case 1 reduces the
|
||||
// number of jumps the most.
|
||||
|
||||
if (isCMOVPseudo(MI)) {
|
||||
// See if we have a string of CMOVs with the same condition.
|
||||
while (NextMIIt != BB->end() &&
|
||||
isCMOVPseudo(NextMIIt) &&
|
||||
(NextMIIt->getOperand(3).getImm() == CC ||
|
||||
NextMIIt->getOperand(3).getImm() == OppCC)) {
|
||||
LastCMOV = &*NextMIIt;
|
||||
++NextMIIt;
|
||||
}
|
||||
}
|
||||
|
||||
// This checks for case 2, but only do this if we didn't already find
|
||||
// case 1, as indicated by LastCMOV == MI.
|
||||
if (LastCMOV == MI &&
|
||||
NextMIIt != BB->end() && NextMIIt->getOpcode() == MI->getOpcode() &&
|
||||
NextMIIt->getOperand(2).getReg() == MI->getOperand(2).getReg() &&
|
||||
NextMIIt->getOperand(1).getReg() == MI->getOperand(0).getReg())
|
||||
NextCMOV = &*NextMIIt;
|
||||
NextMIIt->getOperand(1).getReg() == MI->getOperand(0).getReg()) {
|
||||
CascadedCMOV = &*NextMIIt;
|
||||
}
|
||||
|
||||
MachineBasicBlock *jcc1MBB = nullptr;
|
||||
|
||||
// If we have a double CMOV, we lower it to two successive branches to
|
||||
// If we have a cascaded CMOV, we lower it to two successive branches to
|
||||
// the same block. EFLAGS is used by both, so mark it as live in the second.
|
||||
if (NextCMOV) {
|
||||
if (CascadedCMOV) {
|
||||
jcc1MBB = F->CreateMachineBasicBlock(LLVM_BB);
|
||||
F->insert(It, jcc1MBB);
|
||||
jcc1MBB->addLiveIn(X86::EFLAGS);
|
||||
|
@ -20064,7 +20153,7 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
|
|||
// live into the sink and copy blocks.
|
||||
const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
|
||||
|
||||
MachineInstr *LastEFLAGSUser = NextCMOV ? NextCMOV : MI;
|
||||
MachineInstr *LastEFLAGSUser = CascadedCMOV ? CascadedCMOV : LastCMOV;
|
||||
if (!LastEFLAGSUser->killsRegister(X86::EFLAGS) &&
|
||||
!checkAndUpdateEFLAGSKill(LastEFLAGSUser, BB, TRI)) {
|
||||
copy0MBB->addLiveIn(X86::EFLAGS);
|
||||
|
@ -20073,12 +20162,12 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
|
|||
|
||||
// Transfer the remainder of BB and its successor edges to sinkMBB.
|
||||
sinkMBB->splice(sinkMBB->begin(), BB,
|
||||
std::next(MachineBasicBlock::iterator(MI)), BB->end());
|
||||
std::next(MachineBasicBlock::iterator(LastCMOV)), BB->end());
|
||||
sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
|
||||
|
||||
// Add the true and fallthrough blocks as its successors.
|
||||
if (NextCMOV) {
|
||||
// The fallthrough block may be jcc1MBB, if we have a double CMOV.
|
||||
if (CascadedCMOV) {
|
||||
// The fallthrough block may be jcc1MBB, if we have a cascaded CMOV.
|
||||
BB->addSuccessor(jcc1MBB);
|
||||
|
||||
// In that case, jcc1MBB will itself fallthrough the copy0MBB, and
|
||||
|
@ -20093,13 +20182,12 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
|
|||
BB->addSuccessor(sinkMBB);
|
||||
|
||||
// Create the conditional branch instruction.
|
||||
unsigned Opc =
|
||||
X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm());
|
||||
unsigned Opc = X86::GetCondBranchFromCond(CC);
|
||||
BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB);
|
||||
|
||||
if (NextCMOV) {
|
||||
if (CascadedCMOV) {
|
||||
unsigned Opc2 = X86::GetCondBranchFromCond(
|
||||
(X86::CondCode)NextCMOV->getOperand(3).getImm());
|
||||
(X86::CondCode)CascadedCMOV->getOperand(3).getImm());
|
||||
BuildMI(jcc1MBB, DL, TII->get(Opc2)).addMBB(sinkMBB);
|
||||
}
|
||||
|
||||
|
@ -20111,24 +20199,62 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
|
|||
// sinkMBB:
|
||||
// %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
|
||||
// ...
|
||||
MachineInstrBuilder MIB =
|
||||
BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(X86::PHI),
|
||||
MI->getOperand(0).getReg())
|
||||
.addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB)
|
||||
.addReg(MI->getOperand(2).getReg()).addMBB(thisMBB);
|
||||
MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
|
||||
MachineBasicBlock::iterator MIItEnd =
|
||||
std::next(MachineBasicBlock::iterator(LastCMOV));
|
||||
MachineBasicBlock::iterator SinkInsertionPoint = sinkMBB->begin();
|
||||
DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
|
||||
MachineInstrBuilder MIB;
|
||||
|
||||
// If we have a double CMOV, the second Jcc provides the same incoming
|
||||
// As we are creating the PHIs, we have to be careful if there is more than
|
||||
// one. Later CMOVs may reference the results of earlier CMOVs, but later
|
||||
// PHIs have to reference the individual true/false inputs from earlier PHIs.
|
||||
// That also means that PHI construction must work forward from earlier to
|
||||
// later, and that the code must maintain a mapping from earlier PHI's
|
||||
// destination registers to the registers that went into the PHI.
|
||||
|
||||
for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
|
||||
unsigned DestReg = MIIt->getOperand(0).getReg();
|
||||
unsigned Op1Reg = MIIt->getOperand(1).getReg();
|
||||
unsigned Op2Reg = MIIt->getOperand(2).getReg();
|
||||
|
||||
// If this CMOV we are generating is the opposite condition from
|
||||
// the jump we generated, then we have to swap the operands for the
|
||||
// PHI that is going to be generated.
|
||||
if (MIIt->getOperand(3).getImm() == OppCC)
|
||||
std::swap(Op1Reg, Op2Reg);
|
||||
|
||||
if (RegRewriteTable.find(Op1Reg) != RegRewriteTable.end())
|
||||
Op1Reg = RegRewriteTable[Op1Reg].first;
|
||||
|
||||
if (RegRewriteTable.find(Op2Reg) != RegRewriteTable.end())
|
||||
Op2Reg = RegRewriteTable[Op2Reg].second;
|
||||
|
||||
MIB = BuildMI(*sinkMBB, SinkInsertionPoint, DL,
|
||||
TII->get(X86::PHI), DestReg)
|
||||
.addReg(Op1Reg).addMBB(copy0MBB)
|
||||
.addReg(Op2Reg).addMBB(thisMBB);
|
||||
|
||||
// Add this PHI to the rewrite table.
|
||||
RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
|
||||
}
|
||||
|
||||
// If we have a cascaded CMOV, the second Jcc provides the same incoming
|
||||
// value as the first Jcc (the True operand of the SELECT_CC/CMOV nodes).
|
||||
if (NextCMOV) {
|
||||
if (CascadedCMOV) {
|
||||
MIB.addReg(MI->getOperand(2).getReg()).addMBB(jcc1MBB);
|
||||
// Copy the PHI result to the register defined by the second CMOV.
|
||||
BuildMI(*sinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())),
|
||||
DL, TII->get(TargetOpcode::COPY), NextCMOV->getOperand(0).getReg())
|
||||
DL, TII->get(TargetOpcode::COPY),
|
||||
CascadedCMOV->getOperand(0).getReg())
|
||||
.addReg(MI->getOperand(0).getReg());
|
||||
NextCMOV->eraseFromParent();
|
||||
CascadedCMOV->eraseFromParent();
|
||||
}
|
||||
|
||||
MI->eraseFromParent(); // The pseudo instruction is gone now.
|
||||
// Now remove the CMOV(s).
|
||||
for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; )
|
||||
(MIIt++)->eraseFromParent();
|
||||
|
||||
return sinkMBB;
|
||||
}
|
||||
|
||||
|
@ -20703,23 +20829,23 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
|
|||
case X86::TLSCall_32:
|
||||
case X86::TLSCall_64:
|
||||
return EmitLoweredTLSCall(MI, BB);
|
||||
case X86::CMOV_GR8:
|
||||
case X86::CMOV_FR32:
|
||||
case X86::CMOV_FR64:
|
||||
case X86::CMOV_V4F32:
|
||||
case X86::CMOV_V2F64:
|
||||
case X86::CMOV_V2I64:
|
||||
case X86::CMOV_V8F32:
|
||||
case X86::CMOV_V4F64:
|
||||
case X86::CMOV_V4I64:
|
||||
case X86::CMOV_V16F32:
|
||||
case X86::CMOV_V8F64:
|
||||
case X86::CMOV_V8I64:
|
||||
case X86::CMOV_GR8:
|
||||
case X86::CMOV_GR16:
|
||||
case X86::CMOV_GR32:
|
||||
case X86::CMOV_RFP32:
|
||||
case X86::CMOV_RFP64:
|
||||
case X86::CMOV_RFP80:
|
||||
case X86::CMOV_V2F64:
|
||||
case X86::CMOV_V2I64:
|
||||
case X86::CMOV_V4F32:
|
||||
case X86::CMOV_V4F64:
|
||||
case X86::CMOV_V4I64:
|
||||
case X86::CMOV_V16F32:
|
||||
case X86::CMOV_V8F32:
|
||||
case X86::CMOV_V8F64:
|
||||
case X86::CMOV_V8I64:
|
||||
case X86::CMOV_V8I1:
|
||||
case X86::CMOV_V16I1:
|
||||
case X86::CMOV_V32I1:
|
||||
|
|
|
@ -0,0 +1,267 @@
|
|||
; RUN: llc < %s -mtriple=i386-linux-gnu -o - | FileCheck %s
|
||||
|
||||
; This test checks that only a single js gets generated in the final code
|
||||
; for lowering the CMOV pseudos that get created for this IR.
|
||||
; CHECK-LABEL: foo1:
|
||||
; CHECK: js
|
||||
; CHECK-NOT: js
|
||||
define i32 @foo1(i32 %v1, i32 %v2, i32 %v3) nounwind {
|
||||
entry:
|
||||
%cmp = icmp slt i32 %v1, 0
|
||||
%v2.v3 = select i1 %cmp, i32 %v2, i32 %v3
|
||||
%v1.v2 = select i1 %cmp, i32 %v1, i32 %v2
|
||||
%sub = sub i32 %v1.v2, %v2.v3
|
||||
ret i32 %sub
|
||||
}
|
||||
|
||||
; This test checks that only a single js gets generated in the final code
|
||||
; for lowering the CMOV pseudos that get created for this IR. This makes
|
||||
; sure the code for the lowering for opposite conditions gets tested.
|
||||
; CHECK-LABEL: foo11:
|
||||
; CHECK: js
|
||||
; CHECK-NOT: js
|
||||
; CHECK-NOT: jns
|
||||
define i32 @foo11(i32 %v1, i32 %v2, i32 %v3) nounwind {
|
||||
entry:
|
||||
%cmp1 = icmp slt i32 %v1, 0
|
||||
%v2.v3 = select i1 %cmp1, i32 %v2, i32 %v3
|
||||
%cmp2 = icmp sge i32 %v1, 0
|
||||
%v1.v2 = select i1 %cmp2, i32 %v1, i32 %v2
|
||||
%sub = sub i32 %v1.v2, %v2.v3
|
||||
ret i32 %sub
|
||||
}
|
||||
|
||||
; This test checks that only a single js gets generated in the final code
|
||||
; for lowering the CMOV pseudos that get created for this IR.
|
||||
; CHECK-LABEL: foo2:
|
||||
; CHECK: js
|
||||
; CHECK-NOT: js
|
||||
define i32 @foo2(i8 %v1, i8 %v2, i8 %v3) nounwind {
|
||||
entry:
|
||||
%cmp = icmp slt i8 %v1, 0
|
||||
%v2.v3 = select i1 %cmp, i8 %v2, i8 %v3
|
||||
%v1.v2 = select i1 %cmp, i8 %v1, i8 %v2
|
||||
%t1 = sext i8 %v2.v3 to i32
|
||||
%t2 = sext i8 %v1.v2 to i32
|
||||
%sub = sub i32 %t1, %t2
|
||||
ret i32 %sub
|
||||
}
|
||||
|
||||
; This test checks that only a single js gets generated in the final code
|
||||
; for lowering the CMOV pseudos that get created for this IR.
|
||||
; CHECK-LABEL: foo3:
|
||||
; CHECK: js
|
||||
; CHECK-NOT: js
|
||||
define i32 @foo3(i16 %v1, i16 %v2, i16 %v3) nounwind {
|
||||
entry:
|
||||
%cmp = icmp slt i16 %v1, 0
|
||||
%v2.v3 = select i1 %cmp, i16 %v2, i16 %v3
|
||||
%v1.v2 = select i1 %cmp, i16 %v1, i16 %v2
|
||||
%t1 = sext i16 %v2.v3 to i32
|
||||
%t2 = sext i16 %v1.v2 to i32
|
||||
%sub = sub i32 %t1, %t2
|
||||
ret i32 %sub
|
||||
}
|
||||
|
||||
; This test checks that only a single js gets generated in the final code
|
||||
; for lowering the CMOV pseudos that get created for this IR.
|
||||
; CHECK-LABEL: foo4:
|
||||
; CHECK: js
|
||||
; CHECK-NOT: js
|
||||
define float @foo4(i32 %v1, float %v2, float %v3, float %v4) nounwind {
|
||||
entry:
|
||||
%cmp = icmp slt i32 %v1, 0
|
||||
%t1 = select i1 %cmp, float %v2, float %v3
|
||||
%t2 = select i1 %cmp, float %v3, float %v4
|
||||
%sub = fsub float %t1, %t2
|
||||
ret float %sub
|
||||
}
|
||||
|
||||
; This test checks that only a single je gets generated in the final code
|
||||
; for lowering the CMOV pseudos that get created for this IR.
|
||||
; CHECK-LABEL: foo5:
|
||||
; CHECK: je
|
||||
; CHECK-NOT: je
|
||||
define double @foo5(i32 %v1, double %v2, double %v3, double %v4) nounwind {
|
||||
entry:
|
||||
%cmp = icmp eq i32 %v1, 0
|
||||
%t1 = select i1 %cmp, double %v2, double %v3
|
||||
%t2 = select i1 %cmp, double %v3, double %v4
|
||||
%sub = fsub double %t1, %t2
|
||||
ret double %sub
|
||||
}
|
||||
|
||||
; This test checks that only a single je gets generated in the final code
|
||||
; for lowering the CMOV pseudos that get created for this IR.
|
||||
; CHECK-LABEL: foo6:
|
||||
; CHECK: je
|
||||
; CHECK-NOT: je
|
||||
define <4 x float> @foo6(i32 %v1, <4 x float> %v2, <4 x float> %v3, <4 x float> %v4) nounwind {
|
||||
entry:
|
||||
%cmp = icmp eq i32 %v1, 0
|
||||
%t1 = select i1 %cmp, <4 x float> %v2, <4 x float> %v3
|
||||
%t2 = select i1 %cmp, <4 x float> %v3, <4 x float> %v4
|
||||
%sub = fsub <4 x float> %t1, %t2
|
||||
ret <4 x float> %sub
|
||||
}
|
||||
|
||||
; This test checks that only a single je gets generated in the final code
|
||||
; for lowering the CMOV pseudos that get created for this IR.
|
||||
; CHECK-LABEL: foo7:
|
||||
; CHECK: je
|
||||
; CHECK-NOT: je
|
||||
define <2 x double> @foo7(i32 %v1, <2 x double> %v2, <2 x double> %v3, <2 x double> %v4) nounwind {
|
||||
entry:
|
||||
%cmp = icmp eq i32 %v1, 0
|
||||
%t1 = select i1 %cmp, <2 x double> %v2, <2 x double> %v3
|
||||
%t2 = select i1 %cmp, <2 x double> %v3, <2 x double> %v4
|
||||
%sub = fsub <2 x double> %t1, %t2
|
||||
ret <2 x double> %sub
|
||||
}
|
||||
|
||||
; This test checks that only a single ja gets generated in the final code
|
||||
; for lowering the CMOV pseudos that get created for this IR. This combines
|
||||
; all the supported types together into one long string of selects based
|
||||
; on the same condition.
|
||||
; CHECK-LABEL: foo8:
|
||||
; CHECK: ja
|
||||
; CHECK-NOT: ja
|
||||
define void @foo8(i32 %v1,
|
||||
i8 %v2, i8 %v3,
|
||||
i16 %v12, i16 %v13,
|
||||
i32 %v22, i32 %v23,
|
||||
float %v32, float %v33,
|
||||
double %v42, double %v43,
|
||||
<4 x float> %v52, <4 x float> %v53,
|
||||
<2 x double> %v62, <2 x double> %v63,
|
||||
<8 x float> %v72, <8 x float> %v73,
|
||||
<4 x double> %v82, <4 x double> %v83,
|
||||
<16 x float> %v92, <16 x float> %v93,
|
||||
<8 x double> %v102, <8 x double> %v103,
|
||||
i8 * %dst) nounwind {
|
||||
entry:
|
||||
%add.ptr11 = getelementptr inbounds i8, i8* %dst, i32 2
|
||||
%a11 = bitcast i8* %add.ptr11 to i16*
|
||||
|
||||
%add.ptr21 = getelementptr inbounds i8, i8* %dst, i32 4
|
||||
%a21 = bitcast i8* %add.ptr21 to i32*
|
||||
|
||||
%add.ptr31 = getelementptr inbounds i8, i8* %dst, i32 8
|
||||
%a31 = bitcast i8* %add.ptr31 to float*
|
||||
|
||||
%add.ptr41 = getelementptr inbounds i8, i8* %dst, i32 16
|
||||
%a41 = bitcast i8* %add.ptr41 to double*
|
||||
|
||||
%add.ptr51 = getelementptr inbounds i8, i8* %dst, i32 32
|
||||
%a51 = bitcast i8* %add.ptr51 to <4 x float>*
|
||||
|
||||
%add.ptr61 = getelementptr inbounds i8, i8* %dst, i32 48
|
||||
%a61 = bitcast i8* %add.ptr61 to <2 x double>*
|
||||
|
||||
%add.ptr71 = getelementptr inbounds i8, i8* %dst, i32 64
|
||||
%a71 = bitcast i8* %add.ptr71 to <8 x float>*
|
||||
|
||||
%add.ptr81 = getelementptr inbounds i8, i8* %dst, i32 128
|
||||
%a81 = bitcast i8* %add.ptr81 to <4 x double>*
|
||||
|
||||
%add.ptr91 = getelementptr inbounds i8, i8* %dst, i32 64
|
||||
%a91 = bitcast i8* %add.ptr91 to <16 x float>*
|
||||
|
||||
%add.ptr101 = getelementptr inbounds i8, i8* %dst, i32 128
|
||||
%a101 = bitcast i8* %add.ptr101 to <8 x double>*
|
||||
|
||||
; These operations are necessary, because select of two single use loads
|
||||
; ends up getting optimized into a select of two leas, followed by a
|
||||
; single load of the selected address.
|
||||
%t13 = xor i16 %v13, 11
|
||||
%t23 = xor i32 %v23, 1234
|
||||
%t33 = fadd float %v33, %v32
|
||||
%t43 = fadd double %v43, %v42
|
||||
%t53 = fadd <4 x float> %v53, %v52
|
||||
%t63 = fadd <2 x double> %v63, %v62
|
||||
%t73 = fsub <8 x float> %v73, %v72
|
||||
%t83 = fsub <4 x double> %v83, %v82
|
||||
%t93 = fsub <16 x float> %v93, %v92
|
||||
%t103 = fsub <8 x double> %v103, %v102
|
||||
|
||||
%cmp = icmp ugt i32 %v1, 31
|
||||
%t11 = select i1 %cmp, i16 %v12, i16 %t13
|
||||
%t21 = select i1 %cmp, i32 %v22, i32 %t23
|
||||
%t31 = select i1 %cmp, float %v32, float %t33
|
||||
%t41 = select i1 %cmp, double %v42, double %t43
|
||||
%t51 = select i1 %cmp, <4 x float> %v52, <4 x float> %t53
|
||||
%t61 = select i1 %cmp, <2 x double> %v62, <2 x double> %t63
|
||||
%t71 = select i1 %cmp, <8 x float> %v72, <8 x float> %t73
|
||||
%t81 = select i1 %cmp, <4 x double> %v82, <4 x double> %t83
|
||||
%t91 = select i1 %cmp, <16 x float> %v92, <16 x float> %t93
|
||||
%t101 = select i1 %cmp, <8 x double> %v102, <8 x double> %t103
|
||||
|
||||
store i16 %t11, i16* %a11, align 2
|
||||
store i32 %t21, i32* %a21, align 4
|
||||
store float %t31, float* %a31, align 4
|
||||
store double %t41, double* %a41, align 8
|
||||
store <4 x float> %t51, <4 x float>* %a51, align 16
|
||||
store <2 x double> %t61, <2 x double>* %a61, align 16
|
||||
store <8 x float> %t71, <8 x float>* %a71, align 32
|
||||
store <4 x double> %t81, <4 x double>* %a81, align 32
|
||||
store <16 x float> %t91, <16 x float>* %a91, align 32
|
||||
store <8 x double> %t101, <8 x double>* %a101, align 32
|
||||
|
||||
ret void
|
||||
}
|
||||
|
||||
; This test checks that only a single ja gets generated in the final code
|
||||
; for lowering the CMOV pseudos that get created for this IR.
|
||||
; All of the selects are based on the same condition.
|
||||
; Contrary to my expectations, this doesn't exercise the code for
|
||||
; CMOV_V8I1, CMOV_V16I1, CMOV_V32I1, or CMOV_V64I1. Instead the selects all
|
||||
; get lowered into vector length number of selects, which all eventually turn
|
||||
; into a huge number of CMOV_GR8, which are all contiguous, so the optimization
|
||||
; kicks in as long as CMOV_GR8 is supported. I couldn't find a way to get
|
||||
; CMOV_V*I1 pseudo-opcodes to get generated. If a way exists to get CMOV_V*I1
|
||||
; pseudo-opcodes to be generated, this test should be replaced with one that
|
||||
; tests those opcodes.
|
||||
;
|
||||
; CHECK-LABEL: foo9:
|
||||
; CHECK: ja
|
||||
; CHECK-NOT: ja
|
||||
define void @foo9(i32 %v1,
|
||||
<8 x i1> %v12, <8 x i1> %v13,
|
||||
<16 x i1> %v22, <16 x i1> %v23,
|
||||
<32 x i1> %v32, <32 x i1> %v33,
|
||||
<64 x i1> %v42, <64 x i1> %v43,
|
||||
i8 * %dst) nounwind {
|
||||
entry:
|
||||
%add.ptr11 = getelementptr inbounds i8, i8* %dst, i32 0
|
||||
%a11 = bitcast i8* %add.ptr11 to <8 x i1>*
|
||||
|
||||
%add.ptr21 = getelementptr inbounds i8, i8* %dst, i32 4
|
||||
%a21 = bitcast i8* %add.ptr21 to <16 x i1>*
|
||||
|
||||
%add.ptr31 = getelementptr inbounds i8, i8* %dst, i32 8
|
||||
%a31 = bitcast i8* %add.ptr31 to <32 x i1>*
|
||||
|
||||
%add.ptr41 = getelementptr inbounds i8, i8* %dst, i32 16
|
||||
%a41 = bitcast i8* %add.ptr41 to <64 x i1>*
|
||||
|
||||
; These operations are necessary, because select of two single use loads
|
||||
; ends up getting optimized into a select of two leas, followed by a
|
||||
; single load of the selected address.
|
||||
%t13 = xor <8 x i1> %v13, %v12
|
||||
%t23 = xor <16 x i1> %v23, %v22
|
||||
%t33 = xor <32 x i1> %v33, %v32
|
||||
%t43 = xor <64 x i1> %v43, %v42
|
||||
|
||||
%cmp = icmp ugt i32 %v1, 31
|
||||
%t11 = select i1 %cmp, <8 x i1> %v12, <8 x i1> %t13
|
||||
%t21 = select i1 %cmp, <16 x i1> %v22, <16 x i1> %t23
|
||||
%t31 = select i1 %cmp, <32 x i1> %v32, <32 x i1> %t33
|
||||
%t41 = select i1 %cmp, <64 x i1> %v42, <64 x i1> %t43
|
||||
|
||||
store <8 x i1> %t11, <8 x i1>* %a11, align 16
|
||||
store <16 x i1> %t21, <16 x i1>* %a21, align 4
|
||||
store <32 x i1> %t31, <32 x i1>* %a31, align 8
|
||||
store <64 x i1> %t41, <64 x i1>* %a41, align 16
|
||||
|
||||
ret void
|
||||
}
|
|
@ -0,0 +1,39 @@
|
|||
; RUN: llc < %s -mtriple=i386-linux-gnu -mattr=+sse2 -o - | FileCheck %s
|
||||
; RUN: llc < %s -mtriple=x86_64-linux-gnu -o - | FileCheck %s
|
||||
|
||||
; This test checks that only a single jae gets generated in the final code
|
||||
; for lowering the CMOV pseudos that get created for this IR.
|
||||
; CHECK-LABEL: foo1:
|
||||
; CHECK: jae
|
||||
; CHECK-NOT: jae
|
||||
define double @foo1(float %p1, double %p2, double %p3) nounwind {
|
||||
entry:
|
||||
%c1 = fcmp oge float %p1, 0.000000e+00
|
||||
%d0 = fadd double %p2, 1.25e0
|
||||
%d1 = fadd double %p3, 1.25e0
|
||||
%d2 = select i1 %c1, double %d0, double %d1
|
||||
%d3 = select i1 %c1, double %d0, double %p2
|
||||
%d4 = select i1 %c1, double %p3, double %d1
|
||||
%d5 = fsub double %d2, %d3
|
||||
%d6 = fadd double %d5, %d4
|
||||
ret double %d6
|
||||
}
|
||||
|
||||
; This test checks that only a single jae gets generated in the final code
|
||||
; for lowering the CMOV pseudos that get created for this IR.
|
||||
; CHECK-LABEL: foo2:
|
||||
; CHECK: jae
|
||||
; CHECK-NOT: jae
|
||||
define float @foo2(float %p1, float %p2, float %p3) nounwind {
|
||||
entry:
|
||||
%c1 = fcmp oge float %p1, 0.000000e+00
|
||||
%d0 = fadd float %p2, 1.25e0
|
||||
%d1 = fadd float %p3, 1.25e0
|
||||
%d2 = select i1 %c1, float %d0, float %d1
|
||||
%d3 = select i1 %c1, float %d1, float %p2
|
||||
%d4 = select i1 %c1, float %d0, float %p3
|
||||
%d5 = fsub float %d2, %d3
|
||||
%d6 = fadd float %d5, %d4
|
||||
ret float %d6
|
||||
}
|
||||
|
|
@ -0,0 +1,100 @@
|
|||
; RUN: llc < %s -mtriple=x86_64-linux-gnu -o - | FileCheck %s
|
||||
|
||||
; This test checks that only a single jae gets generated in the final code
|
||||
; for lowering the CMOV pseudos that get created for this IR. The tricky part
|
||||
; of this test is that it tests the special PHI operand rewriting code in
|
||||
; X86TargetLowering::EmitLoweredSelect.
|
||||
;
|
||||
; CHECK-LABEL: foo1:
|
||||
; CHECK: jae
|
||||
; CHECK-NOT: jae
|
||||
define double @foo1(float %p1, double %p2, double %p3) nounwind {
|
||||
entry:
|
||||
%c1 = fcmp oge float %p1, 0.000000e+00
|
||||
%d0 = fadd double %p2, 1.25e0
|
||||
%d1 = fadd double %p3, 1.25e0
|
||||
%d2 = select i1 %c1, double %d0, double %d1
|
||||
%d3 = select i1 %c1, double %d2, double %p2
|
||||
%d4 = select i1 %c1, double %d3, double %p3
|
||||
%d5 = fsub double %d2, %d3
|
||||
%d6 = fadd double %d5, %d4
|
||||
ret double %d6
|
||||
}
|
||||
|
||||
; This test checks that only a single jae gets generated in the final code
|
||||
; for lowering the CMOV pseudos that get created for this IR. The tricky part
|
||||
; of this test is that it tests the special PHI operand rewriting code in
|
||||
; X86TargetLowering::EmitLoweredSelect.
|
||||
;
|
||||
; CHECK-LABEL: foo2:
|
||||
; CHECK: jae
|
||||
; CHECK-NOT: jae
|
||||
define double @foo2(float %p1, double %p2, double %p3) nounwind {
|
||||
entry:
|
||||
%c1 = fcmp oge float %p1, 0.000000e+00
|
||||
%d0 = fadd double %p2, 1.25e0
|
||||
%d1 = fadd double %p3, 1.25e0
|
||||
%d2 = select i1 %c1, double %d0, double %d1
|
||||
%d3 = select i1 %c1, double %p2, double %d2
|
||||
%d4 = select i1 %c1, double %p3, double %d3
|
||||
%d5 = fsub double %d2, %d3
|
||||
%d6 = fadd double %d5, %d4
|
||||
ret double %d6
|
||||
}
|
||||
|
||||
; This test checks that only a single js gets generated in the final code
|
||||
; for lowering the CMOV pseudos that get created for this IR. The tricky part
|
||||
; of this test is that it tests the special PHI operand rewriting code in
|
||||
; X86TargetLowering::EmitLoweredSelect. It also tests to make sure all
|
||||
; the operands of the resulting instructions are from the proper places.
|
||||
;
|
||||
; CHECK-LABEL: foo3:
|
||||
; CHECK: js
|
||||
; CHECK-NOT: js
|
||||
; CHECK-LABEL: # BB#1:
|
||||
; CHECK-DAG: movapd %xmm2, %xmm1
|
||||
; CHECK-DAG: movapd %xmm2, %xmm0
|
||||
; CHECK-LABEL:.LBB2_2:
|
||||
; CHECK: divsd %xmm1, %xmm0
|
||||
; CHECK: ret
|
||||
define double @foo3(i32 %p1, double %p2, double %p3,
|
||||
double %p4, double %p5) nounwind {
|
||||
entry:
|
||||
%c1 = icmp slt i32 %p1, 0
|
||||
%d2 = select i1 %c1, double %p2, double %p3
|
||||
%d3 = select i1 %c1, double %p3, double %p4
|
||||
%d4 = select i1 %c1, double %d2, double %d3
|
||||
%d5 = fdiv double %d4, %d3
|
||||
ret double %d5
|
||||
}
|
||||
|
||||
; This test checks that only a single js gets generated in the final code
|
||||
; for lowering the CMOV pseudos that get created for this IR. The tricky part
|
||||
; of this test is that it tests the special PHI operand rewriting code in
|
||||
; X86TargetLowering::EmitLoweredSelect. It also tests to make sure all
|
||||
; the operands of the resulting instructions are from the proper places
|
||||
; when the "opposite condition" handling code in the compiler is used.
|
||||
; This should be the same code as foo3 above, because we use the opposite
|
||||
; condition code in the second two selects, but we also swap the operands
|
||||
; of the selects to give the same actual computation.
|
||||
;
|
||||
; CHECK-LABEL: foo4:
|
||||
; CHECK: js
|
||||
; CHECK-NOT: js
|
||||
; CHECK-LABEL: # BB#1:
|
||||
; CHECK-DAG: movapd %xmm2, %xmm1
|
||||
; CHECK-DAG: movapd %xmm2, %xmm0
|
||||
; CHECK-LABEL:.LBB3_2:
|
||||
; CHECK: divsd %xmm1, %xmm0
|
||||
; CHECK: ret
|
||||
define double @foo4(i32 %p1, double %p2, double %p3,
|
||||
double %p4, double %p5) nounwind {
|
||||
entry:
|
||||
%c1 = icmp slt i32 %p1, 0
|
||||
%d2 = select i1 %c1, double %p2, double %p3
|
||||
%c2 = icmp sge i32 %p1, 0
|
||||
%d3 = select i1 %c2, double %p4, double %p3
|
||||
%d4 = select i1 %c2, double %d3, double %d2
|
||||
%d5 = fdiv double %d4, %d3
|
||||
ret double %d5
|
||||
}
|
Loading…
Reference in New Issue