Allow vectorization of division by uniform power of 2.

This patch adds support for recognizing division by a uniform power of 2 and modifies the cost table so that division by a uniform power of 2 is vectorized whenever possible.
It updates the cost model for the Loop and SLP vectorizers. The cost table is currently only updated for the X86 backend.
Thanks to Hal, Andrea, Sanjay for the review. (http://reviews.llvm.org/D4971)

llvm-svn: 216371
This commit is contained in:
Karthik Bhat 2014-08-25 04:56:54 +00:00
parent c6914d068f
commit 7f33ff7dea
11 changed files with 187 additions and 50 deletions

View File

@ -335,6 +335,9 @@ public:
OK_NonUniformConstantValue // Operand is a non uniform constant value.
};
/// \brief Additional properties of an operand's values.
///
/// OP_None is the default and states that no additional property is known.
/// OP_PowerOf2 marks an operand whose constant value is a power of 2; cost
/// queries may use it to price an operation (e.g. a division) more cheaply.
enum OperandValueProperties { OP_None = 0, OP_PowerOf2 = 1 };
/// \return The number of scalar or vector registers that the target has.
/// If 'Vectors' is true, it returns the number of vector registers. If it is
/// set to false, it returns the number of scalar registers.
@ -349,9 +352,12 @@ public:
virtual unsigned getMaximumUnrollFactor() const;
/// \return The expected cost of arithmetic ops, such as mul, xor, fsub, etc.
virtual unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty,
OperandValueKind Opd1Info = OK_AnyValue,
OperandValueKind Opd2Info = OK_AnyValue) const;
virtual unsigned
getArithmeticInstrCost(unsigned Opcode, Type *Ty,
OperandValueKind Opd1Info = OK_AnyValue,
OperandValueKind Opd2Info = OK_AnyValue,
OperandValueProperties Opd1PropInfo = OP_None,
OperandValueProperties Opd2PropInfo = OP_None) const;
/// \return The cost of a shuffle instruction of kind Kind and of type Tp.
/// The index and subtype parameters are used by the subvector insertion and

View File

@ -171,11 +171,12 @@ unsigned TargetTransformInfo::getMaximumUnrollFactor() const {
return PrevTTI->getMaximumUnrollFactor();
}
unsigned TargetTransformInfo::getArithmeticInstrCost(unsigned Opcode,
Type *Ty,
OperandValueKind Op1Info,
OperandValueKind Op2Info) const {
return PrevTTI->getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info);
unsigned TargetTransformInfo::getArithmeticInstrCost(
unsigned Opcode, Type *Ty, OperandValueKind Op1Info,
OperandValueKind Op2Info, OperandValueProperties Opd1PropInfo,
OperandValueProperties Opd2PropInfo) const {
return PrevTTI->getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info,
Opd1PropInfo, Opd2PropInfo);
}
unsigned TargetTransformInfo::getShuffleCost(ShuffleKind Kind, Type *Tp,
@ -569,7 +570,8 @@ struct NoTTI final : ImmutablePass, TargetTransformInfo {
}
unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty, OperandValueKind,
OperandValueKind) const override {
OperandValueKind, OperandValueProperties,
OperandValueProperties) const override {
return 1;
}

View File

@ -104,7 +104,8 @@ public:
unsigned getMaximumUnrollFactor() const override;
unsigned getRegisterBitWidth(bool Vector) const override;
unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty, OperandValueKind,
OperandValueKind) const override;
OperandValueKind, OperandValueProperties,
OperandValueProperties) const override;
unsigned getShuffleCost(ShuffleKind Kind, Type *Tp,
int Index, Type *SubTp) const override;
unsigned getCastInstrCost(unsigned Opcode, Type *Dst,
@ -289,8 +290,9 @@ unsigned BasicTTI::getMaximumUnrollFactor() const {
}
unsigned BasicTTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
OperandValueKind,
OperandValueKind) const {
OperandValueKind, OperandValueKind,
OperandValueProperties,
OperandValueProperties) const {
// Check if any of the operands are vector operands.
const TargetLoweringBase *TLI = getTLI();
int ISD = TLI->InstructionOpcodeToISD(Opcode);

View File

@ -112,10 +112,11 @@ public:
unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) const
override;
unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty,
OperandValueKind Opd1Info = OK_AnyValue,
OperandValueKind Opd2Info = OK_AnyValue) const
override;
unsigned getArithmeticInstrCost(
unsigned Opcode, Type *Ty, OperandValueKind Opd1Info = OK_AnyValue,
OperandValueKind Opd2Info = OK_AnyValue,
OperandValueProperties Opd1PropInfo = OP_None,
OperandValueProperties Opd2PropInfo = OP_None) const override;
unsigned getAddressComputationCost(Type *Ty, bool IsComplex) const override;
@ -403,9 +404,10 @@ unsigned AArch64TTI::getVectorInstrCost(unsigned Opcode, Type *Val,
return 2;
}
unsigned AArch64TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
OperandValueKind Opd1Info,
OperandValueKind Opd2Info) const {
unsigned AArch64TTI::getArithmeticInstrCost(
unsigned Opcode, Type *Ty, OperandValueKind Opd1Info,
OperandValueKind Opd2Info, OperandValueProperties Opd1PropInfo,
OperandValueProperties Opd2PropInfo) const {
// Legalize the type.
std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Ty);
@ -413,8 +415,8 @@ unsigned AArch64TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
switch (ISD) {
default:
return TargetTransformInfo::getArithmeticInstrCost(Opcode, Ty, Opd1Info,
Opd2Info);
return TargetTransformInfo::getArithmeticInstrCost(
Opcode, Ty, Opd1Info, Opd2Info, Opd1PropInfo, Opd2PropInfo);
case ISD::ADD:
case ISD::MUL:
case ISD::XOR:

View File

@ -126,10 +126,11 @@ public:
unsigned getAddressComputationCost(Type *Val,
bool IsComplex) const override;
unsigned
getArithmeticInstrCost(unsigned Opcode, Type *Ty,
OperandValueKind Op1Info = OK_AnyValue,
OperandValueKind Op2Info = OK_AnyValue) const override;
unsigned getArithmeticInstrCost(
unsigned Opcode, Type *Ty, OperandValueKind Op1Info = OK_AnyValue,
OperandValueKind Op2Info = OK_AnyValue,
OperandValueProperties Opd1PropInfo = OP_None,
OperandValueProperties Opd2PropInfo = OP_None) const override;
unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
unsigned AddressSpace) const override;
@ -497,9 +498,10 @@ unsigned ARMTTI::getShuffleCost(ShuffleKind Kind, Type *Tp, int Index,
return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp);
}
unsigned ARMTTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
OperandValueKind Op1Info,
OperandValueKind Op2Info) const {
unsigned ARMTTI::getArithmeticInstrCost(
unsigned Opcode, Type *Ty, OperandValueKind Op1Info,
OperandValueKind Op2Info, OperandValueProperties Opd1PropInfo,
OperandValueProperties Opd2PropInfo) const {
int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Ty);
@ -555,8 +557,8 @@ unsigned ARMTTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
if (Idx != -1)
return LT.first * CostTbl[Idx].Cost;
unsigned Cost =
TargetTransformInfo::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info);
unsigned Cost = TargetTransformInfo::getArithmeticInstrCost(
Opcode, Ty, Op1Info, Op2Info, Opd1PropInfo, Opd2PropInfo);
// This is somewhat of a hack. The problem that we are facing is that SROA
// creates a sequence of shift, and, or instructions to construct values.

View File

@ -92,9 +92,10 @@ public:
virtual unsigned getNumberOfRegisters(bool Vector) const override;
virtual unsigned getRegisterBitWidth(bool Vector) const override;
virtual unsigned getMaximumUnrollFactor() const override;
virtual unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty,
OperandValueKind,
OperandValueKind) const override;
virtual unsigned
getArithmeticInstrCost(unsigned Opcode, Type *Ty, OperandValueKind,
OperandValueKind, OperandValueProperties,
OperandValueProperties) const override;
virtual unsigned getShuffleCost(ShuffleKind Kind, Type *Tp,
int Index, Type *SubTp) const override;
virtual unsigned getCastInstrCost(unsigned Opcode, Type *Dst,
@ -318,14 +319,15 @@ unsigned PPCTTI::getMaximumUnrollFactor() const {
return 2;
}
unsigned PPCTTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
OperandValueKind Op1Info,
OperandValueKind Op2Info) const {
unsigned PPCTTI::getArithmeticInstrCost(
unsigned Opcode, Type *Ty, OperandValueKind Op1Info,
OperandValueKind Op2Info, OperandValueProperties Opd1PropInfo,
OperandValueProperties Opd2PropInfo) const {
assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode");
// Fallback to the default implementation.
return TargetTransformInfo::getArithmeticInstrCost(Opcode, Ty, Op1Info,
Op2Info);
return TargetTransformInfo::getArithmeticInstrCost(
Opcode, Ty, Op1Info, Op2Info, Opd1PropInfo, Opd2PropInfo);
}
unsigned PPCTTI::getShuffleCost(ShuffleKind Kind, Type *Tp, int Index,

View File

@ -84,7 +84,8 @@ public:
unsigned getRegisterBitWidth(bool Vector) const override;
unsigned getMaximumUnrollFactor() const override;
unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty, OperandValueKind,
OperandValueKind) const override;
OperandValueKind, OperandValueProperties,
OperandValueProperties) const override;
unsigned getShuffleCost(ShuffleKind Kind, Type *Tp,
int Index, Type *SubTp) const override;
unsigned getCastInstrCost(unsigned Opcode, Type *Dst,
@ -178,15 +179,37 @@ unsigned X86TTI::getMaximumUnrollFactor() const {
return 2;
}
unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
OperandValueKind Op1Info,
OperandValueKind Op2Info) const {
unsigned X86TTI::getArithmeticInstrCost(
unsigned Opcode, Type *Ty, OperandValueKind Op1Info,
OperandValueKind Op2Info, OperandValueProperties Opd1PropInfo,
OperandValueProperties Opd2PropInfo) const {
// Legalize the type.
std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Ty);
int ISD = TLI->InstructionOpcodeToISD(Opcode);
assert(ISD && "Invalid opcode");
if (ISD == ISD::SDIV &&
Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) {
// On X86, vector signed division by a power-of-two constant is
// normally expanded to the sequence SRA + SRL + ADD + SRA.
// The OperandValue properties may not be the same as those of the
// previous operation; conservatively assume OP_None.
unsigned Cost =
2 * getArithmeticInstrCost(Instruction::AShr, Ty, Op1Info, Op2Info,
TargetTransformInfo::OP_None,
TargetTransformInfo::OP_None);
Cost += getArithmeticInstrCost(Instruction::LShr, Ty, Op1Info, Op2Info,
TargetTransformInfo::OP_None,
TargetTransformInfo::OP_None);
Cost += getArithmeticInstrCost(Instruction::Add, Ty, Op1Info, Op2Info,
TargetTransformInfo::OP_None,
TargetTransformInfo::OP_None);
return Cost;
}
static const CostTblEntry<MVT::SimpleValueType>
AVX2UniformConstCostTable[] = {
{ ISD::SDIV, MVT::v16i16, 6 }, // vpmulhw sequence

View File

@ -5837,18 +5837,31 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
TargetTransformInfo::OK_AnyValue;
TargetTransformInfo::OperandValueKind Op2VK =
TargetTransformInfo::OK_AnyValue;
TargetTransformInfo::OperandValueProperties Op1VP =
TargetTransformInfo::OP_None;
TargetTransformInfo::OperandValueProperties Op2VP =
TargetTransformInfo::OP_None;
Value *Op2 = I->getOperand(1);
// Check for a splat of a constant or for a non uniform vector of constants.
if (isa<ConstantInt>(Op2))
if (isa<ConstantInt>(Op2)) {
ConstantInt *CInt = cast<ConstantInt>(Op2);
if (CInt && CInt->getValue().isPowerOf2())
Op2VP = TargetTransformInfo::OP_PowerOf2;
Op2VK = TargetTransformInfo::OK_UniformConstantValue;
else if (isa<ConstantVector>(Op2) || isa<ConstantDataVector>(Op2)) {
} else if (isa<ConstantVector>(Op2) || isa<ConstantDataVector>(Op2)) {
Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
if (cast<Constant>(Op2)->getSplatValue() != nullptr)
Constant *SplatValue = cast<Constant>(Op2)->getSplatValue();
if (SplatValue) {
ConstantInt *CInt = dyn_cast<ConstantInt>(SplatValue);
if (CInt && CInt->getValue().isPowerOf2())
Op2VP = TargetTransformInfo::OP_PowerOf2;
Op2VK = TargetTransformInfo::OK_UniformConstantValue;
}
}
return TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, Op1VK, Op2VK);
return TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, Op1VK, Op2VK,
Op1VP, Op2VP);
}
case Instruction::Select: {
SelectInst *SI = cast<SelectInst>(I);

View File

@ -1425,6 +1425,10 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
TargetTransformInfo::OK_AnyValue;
TargetTransformInfo::OperandValueKind Op2VK =
TargetTransformInfo::OK_UniformConstantValue;
TargetTransformInfo::OperandValueProperties Op1VP =
TargetTransformInfo::OP_None;
TargetTransformInfo::OperandValueProperties Op2VP =
TargetTransformInfo::OP_None;
// If all operands are exactly the same ConstantInt then set the
// operand kind to OK_UniformConstantValue.
@ -1446,11 +1450,17 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
CInt != cast<ConstantInt>(I->getOperand(1)))
Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
}
// FIXME: Currently the cost model modification for division by a
// power of 2 is handled only for X86. Add support for other targets.
if (Op2VK == TargetTransformInfo::OK_UniformConstantValue && CInt &&
CInt->getValue().isPowerOf2())
Op2VP = TargetTransformInfo::OP_PowerOf2;
ScalarCost =
VecTy->getNumElements() *
TTI->getArithmeticInstrCost(Opcode, ScalarTy, Op1VK, Op2VK);
VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy, Op1VK, Op2VK);
ScalarCost = VecTy->getNumElements() *
TTI->getArithmeticInstrCost(Opcode, ScalarTy, Op1VK, Op2VK,
Op1VP, Op2VP);
VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy, Op1VK, Op2VK,
Op1VP, Op2VP);
}
return VecCost - ScalarCost;
}

View File

@ -0,0 +1,32 @@
; RUN: opt < %s -loop-vectorize -mtriple=x86_64-unknown-linux-gnu -S | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
%struct.anon = type { [100 x i32], i32, [100 x i32] }
@Foo = common global %struct.anon zeroinitializer, align 4
;CHECK-LABEL: @foo(
;CHECK: load <4 x i32>*
;CHECK: sdiv <4 x i32>
;CHECK: store <4 x i32>
; Counted loop of 100 iterations (index 0..99). Each iteration loads an i32
; from the array in field 2 of @Foo, divides it by the constant 2 with sdiv,
; and stores the quotient into the array in field 0 of @Foo at the same index.
define void @foo(){
entry:
br label %for.body
for.body: ; preds = %for.body, %entry
; Induction variable: 0 on entry from %entry, otherwise the incremented value.
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
; Source element: field 2 of @Foo, indexed by the induction variable.
%arrayidx = getelementptr inbounds %struct.anon* @Foo, i64 0, i32 2, i64 %indvars.iv
%0 = load i32* %arrayidx, align 4
; Signed division by the constant 2, a power of two.
%div = sdiv i32 %0, 2
; Destination element: field 0 of @Foo, same index.
%arrayidx2 = getelementptr inbounds %struct.anon* @Foo, i64 0, i32 0, i64 %indvars.iv
store i32 %div, i32* %arrayidx2, align 4
; Advance the index and leave the loop once it reaches 100.
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond = icmp eq i64 %indvars.iv.next, 100
br i1 %exitcond, label %for.end, label %for.body
for.end: ; preds = %for.body
ret void
}

View File

@ -0,0 +1,43 @@
; RUN: opt < %s -basicaa -slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7-avx | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
;CHECK-LABEL: @powof2div(
;CHECK: load <4 x i32>*
;CHECK: add <4 x i32>
;CHECK: sdiv <4 x i32>
; Straight-line code that computes a[i] = (c[i] + b[i]) / 2 for i = 0..3.
; The four groups below are identical apart from the element offset, and
; every sdiv uses the same constant divisor 2, a power of two.
define void @powof2div(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32* noalias nocapture readonly %c){
entry:
; Element 0: a[0] = (c[0] + b[0]) / 2
%0 = load i32* %b, align 4
%1 = load i32* %c, align 4
%add = add nsw i32 %1, %0
%div = sdiv i32 %add, 2
store i32 %div, i32* %a, align 4
; Element 1: a[1] = (c[1] + b[1]) / 2
%arrayidx3 = getelementptr inbounds i32* %b, i64 1
%2 = load i32* %arrayidx3, align 4
%arrayidx4 = getelementptr inbounds i32* %c, i64 1
%3 = load i32* %arrayidx4, align 4
%add5 = add nsw i32 %3, %2
%div6 = sdiv i32 %add5, 2
%arrayidx7 = getelementptr inbounds i32* %a, i64 1
store i32 %div6, i32* %arrayidx7, align 4
; Element 2: a[2] = (c[2] + b[2]) / 2
%arrayidx8 = getelementptr inbounds i32* %b, i64 2
%4 = load i32* %arrayidx8, align 4
%arrayidx9 = getelementptr inbounds i32* %c, i64 2
%5 = load i32* %arrayidx9, align 4
%add10 = add nsw i32 %5, %4
%div11 = sdiv i32 %add10, 2
%arrayidx12 = getelementptr inbounds i32* %a, i64 2
store i32 %div11, i32* %arrayidx12, align 4
; Element 3: a[3] = (c[3] + b[3]) / 2
%arrayidx13 = getelementptr inbounds i32* %b, i64 3
%6 = load i32* %arrayidx13, align 4
%arrayidx14 = getelementptr inbounds i32* %c, i64 3
%7 = load i32* %arrayidx14, align 4
%add15 = add nsw i32 %7, %6
%div16 = sdiv i32 %add15, 2
%arrayidx17 = getelementptr inbounds i32* %a, i64 3
store i32 %div16, i32* %arrayidx17, align 4
ret void
}