[LV] Refactor ILV to provide vectorizeInstruction(); NFC
Refactoring InnerLoopVectorizer's vectorizeBlockInLoop() to provide
vectorizeInstruction(). Aligning DeadInstructions with its only user.
Facilitates driving the transformation by VPlan - follows
https://reviews.llvm.org/D28975 and its tentative breakdown.

Differential Revision: https://reviews.llvm.org/D31997

llvm-svn: 300183
commit cd712b6c49
parent cec1e260c3
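In short, the per-instruction driver loop moves out of the removed
vectorizeBlockInLoop() and into vectorizeLoop(), and DeadInstructions becomes
a local set owned by the caller. A minimal sketch of the resulting driver
shape, distilled from the hunks below (all names come from this patch; no new
API is introduced, and a later VPlan-based driver could replace the traversal
while reusing vectorizeInstruction() unchanged):

    // Sketch of the new driver shape in vectorizeLoop(), per this patch.
    SmallPtrSet<Instruction *, 4> DeadInstructions; // now a local, not a member
    collectTriviallyDeadInstructions(DeadInstructions);

    LoopBlocksDFS DFS(OrigLoop);
    DFS.perform(LI);

    // Reverse post-order keeps defs ahead of users; the filter replaces the
    // DeadInstructions check that used to live inside vectorizeBlockInLoop().
    for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO()))
      for (Instruction &I : *BB)
        if (!DeadInstructions.count(&I))
          vectorizeInstruction(I);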
@@ -465,7 +465,8 @@ protected:
   /// Collect the instructions from the original loop that would be trivially
   /// dead in the vectorized loop if generated.
-  void collectTriviallyDeadInstructions();
+  void collectTriviallyDeadInstructions(
+      SmallPtrSetImpl<Instruction *> &DeadInstructions);
 
   /// Shrinks vector element sizes to the smallest bitwidth they can be legally
   /// represented as.
 
@@ -479,8 +480,9 @@ protected:
   /// and DST.
   VectorParts createEdgeMask(BasicBlock *Src, BasicBlock *Dst);
 
-  /// A helper function to vectorize a single BB within the innermost loop.
-  void vectorizeBlockInLoop(BasicBlock *BB);
+  /// A helper function to vectorize a single instruction within the innermost
+  /// loop.
+  void vectorizeInstruction(Instruction &I);
 
   /// Vectorize a single PHINode in a block. This method handles the induction
   /// variable canonicalization. It supports both VF = 1 for unrolled loops and
@@ -798,14 +800,6 @@ protected:
   // Record whether runtime checks are added.
   bool AddedSafetyChecks;
 
-  // Holds instructions from the original loop whose counterparts in the
-  // vectorized loop would be trivially dead if generated. For example,
-  // original induction update instructions can become dead because we
-  // separately emit induction "steps" when generating code for the new loop.
-  // Similarly, we create a new latch condition when setting up the structure
-  // of the new loop, so the old one can become dead.
-  SmallPtrSet<Instruction *, 4> DeadInstructions;
-
   // Holds the end values for each induction variable. We save the end values
   // so we can later fix-up the external users of the induction variables.
   DenseMap<PHINode *, Value *> IVEndValues;
@@ -3892,19 +3886,26 @@ void InnerLoopVectorizer::vectorizeLoop() {
   //
   //===------------------------------------------------===//
 
-  // Collect instructions from the original loop that will become trivially
-  // dead in the vectorized loop. We don't need to vectorize these
-  // instructions.
-  collectTriviallyDeadInstructions();
+  // Collect instructions from the original loop that will become trivially dead
+  // in the vectorized loop. We don't need to vectorize these instructions. For
+  // example, original induction update instructions can become dead because we
+  // separately emit induction "steps" when generating code for the new loop.
+  // Similarly, we create a new latch condition when setting up the structure
+  // of the new loop, so the old one can become dead.
+  SmallPtrSet<Instruction *, 4> DeadInstructions;
+  collectTriviallyDeadInstructions(DeadInstructions);
 
   // Scan the loop in a topological order to ensure that defs are vectorized
   // before users.
   LoopBlocksDFS DFS(OrigLoop);
   DFS.perform(LI);
 
-  // Vectorize all of the blocks in the original loop.
+  // Vectorize all instructions in the original loop that will not become
+  // trivially dead when vectorized.
   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO()))
-    vectorizeBlockInLoop(BB);
+    for (Instruction &I : *BB)
+      if (!DeadInstructions.count(&I))
+        vectorizeInstruction(I);
 
   // Insert truncates and extends for any truncated instructions as hints to
   // InstCombine.
@@ -4334,7 +4335,8 @@ void InnerLoopVectorizer::fixLCSSAPHIs() {
   }
 }
 
-void InnerLoopVectorizer::collectTriviallyDeadInstructions() {
+void InnerLoopVectorizer::collectTriviallyDeadInstructions(
+    SmallPtrSetImpl<Instruction *> &DeadInstructions) {
   BasicBlock *Latch = OrigLoop->getLoopLatch();
 
   // We create new control-flow for the vectorized loop, so the original
@@ -4720,333 +4722,323 @@ static bool mayDivideByZero(Instruction &I) {
   return !CInt || CInt->isZero();
 }
 
-void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB) {
-  // For each instruction in the old loop.
-  for (Instruction &I : *BB) {
-
-    // If the instruction will become trivially dead when vectorized, we don't
-    // need to generate it.
-    if (DeadInstructions.count(&I))
-      continue;
-
-    // Scalarize instructions that should remain scalar after vectorization.
-    if (VF > 1 &&
-        !(isa<BranchInst>(&I) || isa<PHINode>(&I) ||
-          isa<DbgInfoIntrinsic>(&I)) &&
-        shouldScalarizeInstruction(&I)) {
-      scalarizeInstruction(&I, Legal->isScalarWithPredication(&I));
-      continue;
-    }
-
-    switch (I.getOpcode()) {
-    case Instruction::Br:
-      // Nothing to do for PHIs and BR, since we already took care of the
-      // loop control flow instructions.
-      continue;
-    case Instruction::PHI: {
-      // Vectorize PHINodes.
-      widenPHIInstruction(&I, UF, VF);
-      continue;
-    } // End of PHI.
-    case Instruction::GetElementPtr: {
-      // Construct a vector GEP by widening the operands of the scalar GEP as
-      // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
-      // results in a vector of pointers when at least one operand of the GEP
-      // is vector-typed. Thus, to keep the representation compact, we only use
-      // vector-typed operands for loop-varying values.
-      auto *GEP = cast<GetElementPtrInst>(&I);
-      VectorParts Entry(UF);
-
-      if (VF > 1 && OrigLoop->hasLoopInvariantOperands(GEP)) {
-        // If we are vectorizing, but the GEP has only loop-invariant operands,
-        // the GEP we build (by only using vector-typed operands for
-        // loop-varying values) would be a scalar pointer. Thus, to ensure we
-        // produce a vector of pointers, we need to either arbitrarily pick an
-        // operand to broadcast, or broadcast a clone of the original GEP.
-        // Here, we broadcast a clone of the original.
-        //
-        // TODO: If at some point we decide to scalarize instructions having
-        //       loop-invariant operands, this special case will no longer be
-        //       required. We would add the scalarization decision to
-        //       collectLoopScalars() and teach getVectorValue() to broadcast
-        //       the lane-zero scalar value.
-        auto *Clone = Builder.Insert(GEP->clone());
-        for (unsigned Part = 0; Part < UF; ++Part)
-          Entry[Part] = Builder.CreateVectorSplat(VF, Clone);
-      } else {
-        // If the GEP has at least one loop-varying operand, we are sure to
-        // produce a vector of pointers. But if we are only unrolling, we want
-        // to produce a scalar GEP for each unroll part. Thus, the GEP we
-        // produce with the code below will be scalar (if VF == 1) or vector
-        // (otherwise). Note that for the unroll-only case, we still maintain
-        // values in the vector mapping with initVector, as we do for other
-        // instructions.
-        for (unsigned Part = 0; Part < UF; ++Part) {
-
-          // The pointer operand of the new GEP. If it's loop-invariant, we
-          // won't broadcast it.
-          auto *Ptr = OrigLoop->isLoopInvariant(GEP->getPointerOperand())
-                          ? GEP->getPointerOperand()
-                          : getVectorValue(GEP->getPointerOperand())[Part];
-
-          // Collect all the indices for the new GEP. If any index is
-          // loop-invariant, we won't broadcast it.
-          SmallVector<Value *, 4> Indices;
-          for (auto &U : make_range(GEP->idx_begin(), GEP->idx_end())) {
-            if (OrigLoop->isLoopInvariant(U.get()))
-              Indices.push_back(U.get());
-            else
-              Indices.push_back(getVectorValue(U.get())[Part]);
-          }
-
-          // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
-          // but it should be a vector, otherwise.
-          auto *NewGEP = GEP->isInBounds()
-                             ? Builder.CreateInBoundsGEP(Ptr, Indices)
-                             : Builder.CreateGEP(Ptr, Indices);
-          assert((VF == 1 || NewGEP->getType()->isVectorTy()) &&
-                 "NewGEP is not a pointer vector");
-          Entry[Part] = NewGEP;
-        }
-      }
-
-      VectorLoopValueMap.initVector(&I, Entry);
-      addMetadata(Entry, GEP);
-      break;
-    }
-    case Instruction::UDiv:
-    case Instruction::SDiv:
-    case Instruction::SRem:
-    case Instruction::URem:
-      // Scalarize with predication if this instruction may divide by zero and
-      // block execution is conditional, otherwise fallthrough.
-      if (Legal->isScalarWithPredication(&I)) {
-        scalarizeInstruction(&I, true);
-        continue;
-      }
-    case Instruction::Add:
-    case Instruction::FAdd:
-    case Instruction::Sub:
-    case Instruction::FSub:
-    case Instruction::Mul:
-    case Instruction::FMul:
-    case Instruction::FDiv:
-    case Instruction::FRem:
-    case Instruction::Shl:
-    case Instruction::LShr:
-    case Instruction::AShr:
-    case Instruction::And:
-    case Instruction::Or:
-    case Instruction::Xor: {
-      // Just widen binops.
-      auto *BinOp = cast<BinaryOperator>(&I);
-      setDebugLocFromInst(Builder, BinOp);
-      const VectorParts &A = getVectorValue(BinOp->getOperand(0));
-      const VectorParts &B = getVectorValue(BinOp->getOperand(1));
-
-      // Use this vector value for all users of the original instruction.
-      VectorParts Entry(UF);
-      for (unsigned Part = 0; Part < UF; ++Part) {
-        Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A[Part], B[Part]);
-
-        if (BinaryOperator *VecOp = dyn_cast<BinaryOperator>(V))
-          VecOp->copyIRFlags(BinOp);
-
-        Entry[Part] = V;
-      }
-
-      VectorLoopValueMap.initVector(&I, Entry);
-      addMetadata(Entry, BinOp);
-      break;
-    }
-    case Instruction::Select: {
-      // Widen selects.
-      // If the selector is loop invariant we can create a select
-      // instruction with a scalar condition. Otherwise, use vector-select.
-      auto *SE = PSE.getSE();
-      bool InvariantCond =
-          SE->isLoopInvariant(PSE.getSCEV(I.getOperand(0)), OrigLoop);
-      setDebugLocFromInst(Builder, &I);
-
-      // The condition can be loop invariant but still defined inside the
-      // loop. This means that we can't just use the original 'cond' value.
-      // We have to take the 'vectorized' value and pick the first lane.
-      // Instcombine will make this a no-op.
-      const VectorParts &Cond = getVectorValue(I.getOperand(0));
-      const VectorParts &Op0 = getVectorValue(I.getOperand(1));
-      const VectorParts &Op1 = getVectorValue(I.getOperand(2));
-
-      auto *ScalarCond = getScalarValue(I.getOperand(0), 0, 0);
-
-      VectorParts Entry(UF);
-      for (unsigned Part = 0; Part < UF; ++Part) {
-        Entry[Part] = Builder.CreateSelect(
-            InvariantCond ? ScalarCond : Cond[Part], Op0[Part], Op1[Part]);
-      }
-
-      VectorLoopValueMap.initVector(&I, Entry);
-      addMetadata(Entry, &I);
-      break;
-    }
-
-    case Instruction::ICmp:
-    case Instruction::FCmp: {
-      // Widen compares. Generate vector compares.
-      bool FCmp = (I.getOpcode() == Instruction::FCmp);
-      auto *Cmp = dyn_cast<CmpInst>(&I);
-      setDebugLocFromInst(Builder, Cmp);
-      const VectorParts &A = getVectorValue(Cmp->getOperand(0));
-      const VectorParts &B = getVectorValue(Cmp->getOperand(1));
-      VectorParts Entry(UF);
-      for (unsigned Part = 0; Part < UF; ++Part) {
-        Value *C = nullptr;
-        if (FCmp) {
-          C = Builder.CreateFCmp(Cmp->getPredicate(), A[Part], B[Part]);
-          cast<FCmpInst>(C)->copyFastMathFlags(Cmp);
-        } else {
-          C = Builder.CreateICmp(Cmp->getPredicate(), A[Part], B[Part]);
-        }
-        Entry[Part] = C;
-      }
-
-      VectorLoopValueMap.initVector(&I, Entry);
-      addMetadata(Entry, &I);
-      break;
-    }
-
-    case Instruction::Store:
-    case Instruction::Load:
-      vectorizeMemoryInstruction(&I);
-      break;
-    case Instruction::ZExt:
-    case Instruction::SExt:
-    case Instruction::FPToUI:
-    case Instruction::FPToSI:
-    case Instruction::FPExt:
-    case Instruction::PtrToInt:
-    case Instruction::IntToPtr:
-    case Instruction::SIToFP:
-    case Instruction::UIToFP:
-    case Instruction::Trunc:
-    case Instruction::FPTrunc:
-    case Instruction::BitCast: {
-      auto *CI = dyn_cast<CastInst>(&I);
-      setDebugLocFromInst(Builder, CI);
-
-      // Optimize the special case where the source is a constant integer
-      // induction variable. Notice that we can only optimize the 'trunc' case
-      // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
-      // (c) other casts depend on pointer size.
-      if (Cost->isOptimizableIVTruncate(CI, VF)) {
-        widenIntOrFpInduction(cast<PHINode>(CI->getOperand(0)),
-                              cast<TruncInst>(CI));
-        break;
-      }
-
-      /// Vectorize casts.
-      Type *DestTy =
-          (VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF);
-
-      const VectorParts &A = getVectorValue(CI->getOperand(0));
-      VectorParts Entry(UF);
-      for (unsigned Part = 0; Part < UF; ++Part)
-        Entry[Part] = Builder.CreateCast(CI->getOpcode(), A[Part], DestTy);
-      VectorLoopValueMap.initVector(&I, Entry);
-      addMetadata(Entry, &I);
-      break;
-    }
-
-    case Instruction::Call: {
-      // Ignore dbg intrinsics.
-      if (isa<DbgInfoIntrinsic>(I))
-        break;
-      setDebugLocFromInst(Builder, &I);
-
-      Module *M = BB->getParent()->getParent();
-      auto *CI = cast<CallInst>(&I);
-
-      StringRef FnName = CI->getCalledFunction()->getName();
-      Function *F = CI->getCalledFunction();
-      Type *RetTy = ToVectorTy(CI->getType(), VF);
-      SmallVector<Type *, 4> Tys;
-      for (Value *ArgOperand : CI->arg_operands())
-        Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));
-
-      Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
-      if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
-                 ID == Intrinsic::lifetime_start)) {
-        scalarizeInstruction(&I);
-        break;
-      }
-      // The flag shows whether we use Intrinsic or a usual Call for vectorized
-      // version of the instruction.
-      // Is it beneficial to perform intrinsic call compared to lib call?
-      bool NeedToScalarize;
-      unsigned CallCost = getVectorCallCost(CI, VF, *TTI, TLI, NeedToScalarize);
-      bool UseVectorIntrinsic =
-          ID && getVectorIntrinsicCost(CI, VF, *TTI, TLI) <= CallCost;
-      if (!UseVectorIntrinsic && NeedToScalarize) {
-        scalarizeInstruction(&I);
-        break;
-      }
-
-      VectorParts Entry(UF);
-      for (unsigned Part = 0; Part < UF; ++Part) {
-        SmallVector<Value *, 4> Args;
-        for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
-          Value *Arg = CI->getArgOperand(i);
-          // Some intrinsics have a scalar argument - don't replace it with a
-          // vector.
-          if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i)) {
-            const VectorParts &VectorArg = getVectorValue(CI->getArgOperand(i));
-            Arg = VectorArg[Part];
-          }
-          Args.push_back(Arg);
-        }
-
-        Function *VectorF;
-        if (UseVectorIntrinsic) {
-          // Use vector version of the intrinsic.
-          Type *TysForDecl[] = {CI->getType()};
-          if (VF > 1)
-            TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
-          VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
-        } else {
-          // Use vector version of the library call.
-          StringRef VFnName = TLI->getVectorizedFunction(FnName, VF);
-          assert(!VFnName.empty() && "Vector function name is empty.");
-          VectorF = M->getFunction(VFnName);
-          if (!VectorF) {
-            // Generate a declaration
-            FunctionType *FTy = FunctionType::get(RetTy, Tys, false);
-            VectorF =
-                Function::Create(FTy, Function::ExternalLinkage, VFnName, M);
-            VectorF->copyAttributesFrom(F);
-          }
-        }
-        assert(VectorF && "Can't create vector function.");
-
-        SmallVector<OperandBundleDef, 1> OpBundles;
-        CI->getOperandBundlesAsDefs(OpBundles);
-        CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);
-
-        if (isa<FPMathOperator>(V))
-          V->copyFastMathFlags(CI);
-
-        Entry[Part] = V;
-      }
-
-      VectorLoopValueMap.initVector(&I, Entry);
-      addMetadata(Entry, &I);
-      break;
-    }
-
-    default:
-      // All other instructions are unsupported. Scalarize them.
-      scalarizeInstruction(&I);
-      break;
-    } // end of switch.
-  } // end of for_each instr.
-}
+void InnerLoopVectorizer::vectorizeInstruction(Instruction &I) {
+  // Scalarize instructions that should remain scalar after vectorization.
+  if (VF > 1 &&
+      !(isa<BranchInst>(&I) || isa<PHINode>(&I) || isa<DbgInfoIntrinsic>(&I)) &&
+      shouldScalarizeInstruction(&I)) {
+    scalarizeInstruction(&I, Legal->isScalarWithPredication(&I));
+    return;
+  }
+
+  switch (I.getOpcode()) {
+  case Instruction::Br:
+    // Nothing to do for PHIs and BR, since we already took care of the
+    // loop control flow instructions.
+    break;
+  case Instruction::PHI: {
+    // Vectorize PHINodes.
+    widenPHIInstruction(&I, UF, VF);
+    break;
+  } // End of PHI.
+  case Instruction::GetElementPtr: {
+    // Construct a vector GEP by widening the operands of the scalar GEP as
+    // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
+    // results in a vector of pointers when at least one operand of the GEP
+    // is vector-typed. Thus, to keep the representation compact, we only use
+    // vector-typed operands for loop-varying values.
+    auto *GEP = cast<GetElementPtrInst>(&I);
+    VectorParts Entry(UF);
+
+    if (VF > 1 && OrigLoop->hasLoopInvariantOperands(GEP)) {
+      // If we are vectorizing, but the GEP has only loop-invariant operands,
+      // the GEP we build (by only using vector-typed operands for
+      // loop-varying values) would be a scalar pointer. Thus, to ensure we
+      // produce a vector of pointers, we need to either arbitrarily pick an
+      // operand to broadcast, or broadcast a clone of the original GEP.
+      // Here, we broadcast a clone of the original.
+      //
+      // TODO: If at some point we decide to scalarize instructions having
+      //       loop-invariant operands, this special case will no longer be
+      //       required. We would add the scalarization decision to
+      //       collectLoopScalars() and teach getVectorValue() to broadcast
+      //       the lane-zero scalar value.
+      auto *Clone = Builder.Insert(GEP->clone());
+      for (unsigned Part = 0; Part < UF; ++Part)
+        Entry[Part] = Builder.CreateVectorSplat(VF, Clone);
+    } else {
+      // If the GEP has at least one loop-varying operand, we are sure to
+      // produce a vector of pointers. But if we are only unrolling, we want
+      // to produce a scalar GEP for each unroll part. Thus, the GEP we
+      // produce with the code below will be scalar (if VF == 1) or vector
+      // (otherwise). Note that for the unroll-only case, we still maintain
+      // values in the vector mapping with initVector, as we do for other
+      // instructions.
+      for (unsigned Part = 0; Part < UF; ++Part) {
+
+        // The pointer operand of the new GEP. If it's loop-invariant, we
+        // won't broadcast it.
+        auto *Ptr = OrigLoop->isLoopInvariant(GEP->getPointerOperand())
+                        ? GEP->getPointerOperand()
+                        : getVectorValue(GEP->getPointerOperand())[Part];
+
+        // Collect all the indices for the new GEP. If any index is
+        // loop-invariant, we won't broadcast it.
+        SmallVector<Value *, 4> Indices;
+        for (auto &U : make_range(GEP->idx_begin(), GEP->idx_end())) {
+          if (OrigLoop->isLoopInvariant(U.get()))
+            Indices.push_back(U.get());
+          else
+            Indices.push_back(getVectorValue(U.get())[Part]);
+        }
+
+        // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
+        // but it should be a vector, otherwise.
+        auto *NewGEP = GEP->isInBounds()
+                           ? Builder.CreateInBoundsGEP(Ptr, Indices)
+                           : Builder.CreateGEP(Ptr, Indices);
+        assert((VF == 1 || NewGEP->getType()->isVectorTy()) &&
+               "NewGEP is not a pointer vector");
+        Entry[Part] = NewGEP;
+      }
+    }
+
+    VectorLoopValueMap.initVector(&I, Entry);
+    addMetadata(Entry, GEP);
+    break;
+  }
+  case Instruction::UDiv:
+  case Instruction::SDiv:
+  case Instruction::SRem:
+  case Instruction::URem:
+    // Scalarize with predication if this instruction may divide by zero and
+    // block execution is conditional, otherwise fallthrough.
+    if (Legal->isScalarWithPredication(&I)) {
+      scalarizeInstruction(&I, true);
+      break;
+    }
+  case Instruction::Add:
+  case Instruction::FAdd:
+  case Instruction::Sub:
+  case Instruction::FSub:
+  case Instruction::Mul:
+  case Instruction::FMul:
+  case Instruction::FDiv:
+  case Instruction::FRem:
+  case Instruction::Shl:
+  case Instruction::LShr:
+  case Instruction::AShr:
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor: {
+    // Just widen binops.
+    auto *BinOp = cast<BinaryOperator>(&I);
+    setDebugLocFromInst(Builder, BinOp);
+    const VectorParts &A = getVectorValue(BinOp->getOperand(0));
+    const VectorParts &B = getVectorValue(BinOp->getOperand(1));
+
+    // Use this vector value for all users of the original instruction.
+    VectorParts Entry(UF);
+    for (unsigned Part = 0; Part < UF; ++Part) {
+      Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A[Part], B[Part]);
+
+      if (BinaryOperator *VecOp = dyn_cast<BinaryOperator>(V))
+        VecOp->copyIRFlags(BinOp);
+
+      Entry[Part] = V;
+    }
+
+    VectorLoopValueMap.initVector(&I, Entry);
+    addMetadata(Entry, BinOp);
+    break;
+  }
+  case Instruction::Select: {
+    // Widen selects.
+    // If the selector is loop invariant we can create a select
+    // instruction with a scalar condition. Otherwise, use vector-select.
+    auto *SE = PSE.getSE();
+    bool InvariantCond =
+        SE->isLoopInvariant(PSE.getSCEV(I.getOperand(0)), OrigLoop);
+    setDebugLocFromInst(Builder, &I);
+
+    // The condition can be loop invariant but still defined inside the
+    // loop. This means that we can't just use the original 'cond' value.
+    // We have to take the 'vectorized' value and pick the first lane.
+    // Instcombine will make this a no-op.
+    const VectorParts &Cond = getVectorValue(I.getOperand(0));
+    const VectorParts &Op0 = getVectorValue(I.getOperand(1));
+    const VectorParts &Op1 = getVectorValue(I.getOperand(2));
+
+    auto *ScalarCond = getScalarValue(I.getOperand(0), 0, 0);
+
+    VectorParts Entry(UF);
+    for (unsigned Part = 0; Part < UF; ++Part) {
+      Entry[Part] = Builder.CreateSelect(
+          InvariantCond ? ScalarCond : Cond[Part], Op0[Part], Op1[Part]);
+    }
+
+    VectorLoopValueMap.initVector(&I, Entry);
+    addMetadata(Entry, &I);
+    break;
+  }
+
+  case Instruction::ICmp:
+  case Instruction::FCmp: {
+    // Widen compares. Generate vector compares.
+    bool FCmp = (I.getOpcode() == Instruction::FCmp);
+    auto *Cmp = dyn_cast<CmpInst>(&I);
+    setDebugLocFromInst(Builder, Cmp);
+    const VectorParts &A = getVectorValue(Cmp->getOperand(0));
+    const VectorParts &B = getVectorValue(Cmp->getOperand(1));
+    VectorParts Entry(UF);
+    for (unsigned Part = 0; Part < UF; ++Part) {
+      Value *C = nullptr;
+      if (FCmp) {
+        C = Builder.CreateFCmp(Cmp->getPredicate(), A[Part], B[Part]);
+        cast<FCmpInst>(C)->copyFastMathFlags(Cmp);
+      } else {
+        C = Builder.CreateICmp(Cmp->getPredicate(), A[Part], B[Part]);
+      }
+      Entry[Part] = C;
+    }
+
+    VectorLoopValueMap.initVector(&I, Entry);
+    addMetadata(Entry, &I);
+    break;
+  }
+
+  case Instruction::Store:
+  case Instruction::Load:
+    vectorizeMemoryInstruction(&I);
+    break;
+  case Instruction::ZExt:
+  case Instruction::SExt:
+  case Instruction::FPToUI:
+  case Instruction::FPToSI:
+  case Instruction::FPExt:
+  case Instruction::PtrToInt:
+  case Instruction::IntToPtr:
+  case Instruction::SIToFP:
+  case Instruction::UIToFP:
+  case Instruction::Trunc:
+  case Instruction::FPTrunc:
+  case Instruction::BitCast: {
+    auto *CI = dyn_cast<CastInst>(&I);
+    setDebugLocFromInst(Builder, CI);
+
+    // Optimize the special case where the source is a constant integer
+    // induction variable. Notice that we can only optimize the 'trunc' case
+    // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
+    // (c) other casts depend on pointer size.
+    if (Cost->isOptimizableIVTruncate(CI, VF)) {
+      widenIntOrFpInduction(cast<PHINode>(CI->getOperand(0)),
+                            cast<TruncInst>(CI));
+      break;
+    }
+
+    /// Vectorize casts.
+    Type *DestTy =
+        (VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF);
+
+    const VectorParts &A = getVectorValue(CI->getOperand(0));
+    VectorParts Entry(UF);
+    for (unsigned Part = 0; Part < UF; ++Part)
+      Entry[Part] = Builder.CreateCast(CI->getOpcode(), A[Part], DestTy);
+    VectorLoopValueMap.initVector(&I, Entry);
+    addMetadata(Entry, &I);
+    break;
+  }
+
+  case Instruction::Call: {
+    // Ignore dbg intrinsics.
+    if (isa<DbgInfoIntrinsic>(I))
+      break;
+    setDebugLocFromInst(Builder, &I);
+
+    Module *M = I.getParent()->getParent()->getParent();
+    auto *CI = cast<CallInst>(&I);
+
+    StringRef FnName = CI->getCalledFunction()->getName();
+    Function *F = CI->getCalledFunction();
+    Type *RetTy = ToVectorTy(CI->getType(), VF);
+    SmallVector<Type *, 4> Tys;
+    for (Value *ArgOperand : CI->arg_operands())
+      Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));
+
+    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
+    if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
+               ID == Intrinsic::lifetime_start)) {
+      scalarizeInstruction(&I);
+      break;
+    }
+    // The flag shows whether we use Intrinsic or a usual Call for vectorized
+    // version of the instruction.
+    // Is it beneficial to perform intrinsic call compared to lib call?
+    bool NeedToScalarize;
+    unsigned CallCost = getVectorCallCost(CI, VF, *TTI, TLI, NeedToScalarize);
+    bool UseVectorIntrinsic =
+        ID && getVectorIntrinsicCost(CI, VF, *TTI, TLI) <= CallCost;
+    if (!UseVectorIntrinsic && NeedToScalarize) {
+      scalarizeInstruction(&I);
+      break;
+    }
+
+    VectorParts Entry(UF);
+    for (unsigned Part = 0; Part < UF; ++Part) {
+      SmallVector<Value *, 4> Args;
+      for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
+        Value *Arg = CI->getArgOperand(i);
+        // Some intrinsics have a scalar argument - don't replace it with a
+        // vector.
+        if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i)) {
+          const VectorParts &VectorArg = getVectorValue(CI->getArgOperand(i));
+          Arg = VectorArg[Part];
+        }
+        Args.push_back(Arg);
+      }
+
+      Function *VectorF;
+      if (UseVectorIntrinsic) {
+        // Use vector version of the intrinsic.
+        Type *TysForDecl[] = {CI->getType()};
+        if (VF > 1)
+          TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
+        VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
+      } else {
+        // Use vector version of the library call.
+        StringRef VFnName = TLI->getVectorizedFunction(FnName, VF);
+        assert(!VFnName.empty() && "Vector function name is empty.");
+        VectorF = M->getFunction(VFnName);
+        if (!VectorF) {
+          // Generate a declaration
+          FunctionType *FTy = FunctionType::get(RetTy, Tys, false);
+          VectorF =
+              Function::Create(FTy, Function::ExternalLinkage, VFnName, M);
+          VectorF->copyAttributesFrom(F);
+        }
+      }
+      assert(VectorF && "Can't create vector function.");
+
+      SmallVector<OperandBundleDef, 1> OpBundles;
+      CI->getOperandBundlesAsDefs(OpBundles);
+      CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);
+
+      if (isa<FPMathOperator>(V))
+        V->copyFastMathFlags(CI);
+
+      Entry[Part] = V;
+    }
+
+    VectorLoopValueMap.initVector(&I, Entry);
+    addMetadata(Entry, &I);
+    break;
+  }
+
+  default:
+    // All other instructions are unsupported. Scalarize them.
+    scalarizeInstruction(&I);
+    break;
+  } // end of switch.
+}
 
 void InnerLoopVectorizer::updateAnalysis() {