Cost model support for lowered math builtins.

We make the cost of calling libm functions extremely high, as emitting the
calls is expensive and causes spills (on x86), so performance suffers. We still
vectorize important calls such as fabs, and ceilf and friends on SSE4.1.

Differential Revision: http://llvm-reviews.chandlerc.com/D466

llvm-svn: 176287
This commit is contained in:
Benjamin Kramer 2013-02-28 19:09:33 +00:00
parent 882987f30c
commit f7cfac7a14
3 changed files with 110 additions and 12 deletions

View File

@ -23,6 +23,7 @@
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Value.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
@ -189,6 +190,16 @@ unsigned CostModelAnalysis::getInstructionCost(const Instruction *I) const {
0);
return -1;
}
case Instruction::Call:
if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
SmallVector<Type*, 4> Tys;
for (unsigned J = 0, JE = II->getNumArgOperands(); J != JE; ++J)
Tys.push_back(II->getArgOperand(J)->getType());
return TTI->getIntrinsicInstrCost(II->getIntrinsicID(), II->getType(),
Tys);
}
return -1;
default:
// We don't have any information on this instruction.
return -1;

View File

@ -379,9 +379,12 @@ unsigned BasicTTI::getMemoryOpCost(unsigned Opcode, Type *Src,
return LT.first;
}
unsigned BasicTTI::getIntrinsicInstrCost(Intrinsic::ID, Type *RetTy,
unsigned BasicTTI::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
ArrayRef<Type *> Tys) const {
// assume that we need to scalarize this intrinsic.
unsigned ISD = 0;
switch (IID) {
default: {
// Assume that we need to scalarize this intrinsic.
unsigned ScalarizationCost = 0;
unsigned ScalarCalls = 1;
if (RetTy->isVectorTy()) {
@ -394,7 +397,59 @@ unsigned BasicTTI::getIntrinsicInstrCost(Intrinsic::ID, Type *RetTy,
ScalarCalls = std::max(ScalarCalls, RetTy->getVectorNumElements());
}
}
return ScalarCalls + ScalarizationCost;
}
// Look for intrinsics that can be lowered directly or turned into a scalar
// intrinsic call.
case Intrinsic::sqrt: ISD = ISD::FSQRT; break;
case Intrinsic::sin: ISD = ISD::FSIN; break;
case Intrinsic::cos: ISD = ISD::FCOS; break;
case Intrinsic::exp: ISD = ISD::FEXP; break;
case Intrinsic::exp2: ISD = ISD::FEXP2; break;
case Intrinsic::log: ISD = ISD::FLOG; break;
case Intrinsic::log10: ISD = ISD::FLOG10; break;
case Intrinsic::log2: ISD = ISD::FLOG2; break;
case Intrinsic::fabs: ISD = ISD::FABS; break;
case Intrinsic::floor: ISD = ISD::FFLOOR; break;
case Intrinsic::ceil: ISD = ISD::FCEIL; break;
case Intrinsic::trunc: ISD = ISD::FTRUNC; break;
case Intrinsic::rint: ISD = ISD::FRINT; break;
case Intrinsic::pow: ISD = ISD::FPOW; break;
case Intrinsic::fma: ISD = ISD::FMA; break;
case Intrinsic::fmuladd: ISD = ISD::FMA; break; // FIXME: mul + add?
}
std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(RetTy);
if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
// The operation is legal. Assume it costs 1.
// If the type is split to multiple registers, assume that there is some
// overhead to this.
// TODO: Once we have extract/insert subvector cost we need to use them.
if (LT.first > 1)
return LT.first * 2;
return LT.first * 1;
}
if (!TLI->isOperationExpand(ISD, LT.second)) {
// If the operation is custom lowered then assume
// that the code is twice as expensive.
return LT.first * 2;
}
// Else, assume that we need to scalarize this intrinsic. For math builtins
// this will emit a costly libcall, adding call overhead and spills. Make it
// very expensive.
if (RetTy->isVectorTy()) {
unsigned Num = RetTy->getVectorNumElements();
unsigned Cost = TopTTI->getIntrinsicInstrCost(IID, RetTy->getScalarType(),
Tys);
return 10 * Cost * Num;
}
// This is going to be turned into a library call, make it expensive.
return 10;
}
unsigned BasicTTI::getNumberOfParts(Type *Tp) const {

View File

@ -0,0 +1,32 @@
; RUN: opt -S -mtriple=x86_64-apple-darwin -mcpu=core2 -cost-model -analyze < %s | FileCheck %s -check-prefix=CORE2
; RUN: opt -S -mtriple=x86_64-apple-darwin -mcpu=corei7 -cost-model -analyze < %s | FileCheck %s -check-prefix=COREI7
; If SSE4.1 roundps instruction is available it is cheap to lower, otherwise
; it'll be scalarized into calls which are expensive.
; test1: a loop that applies the llvm.ceil.v4f32 intrinsic in place to the
; floats pointed to by %f, four elements per iteration, until the index
; reaches 1024. The FileCheck directives at the end of the body pin the cost
; that the cost model reports for the ceil call under each -mcpu run line.
define void @test1(float* nocapture %f) nounwind {
vector.ph:
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
; %index starts at 0 and advances by 4 on each trip through the loop.
%index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
; Address of element %index, reinterpreted as a pointer to <4 x float>.
%0 = getelementptr inbounds float* %f, i64 %index
%1 = bitcast float* %0 to <4 x float>*
%wide.load = load <4 x float>* %1, align 4
; The instruction under test: its estimated cost is what the checks match.
%2 = call <4 x float> @llvm.ceil.v4f32(<4 x float> %wide.load)
; Write the result back to the same address it was loaded from.
store <4 x float> %2, <4 x float>* %1, align 4
%index.next = add i64 %index, 4
; Leave the loop once the next index equals 1024.
%3 = icmp eq i64 %index.next, 1024
br i1 %3, label %for.end, label %vector.body
for.end: ; preds = %vector.body
ret void
; Expected costs for the ceil call: 400 with -mcpu=core2, 1 with -mcpu=corei7.
; CORE2: Printing analysis 'Cost Model Analysis' for function 'test1':
; CORE2: Cost Model: Found an estimated cost of 400 for instruction: %2 = call <4 x float> @llvm.ceil.v4f32(<4 x float> %wide.load)
; COREI7: Printing analysis 'Cost Model Analysis' for function 'test1':
; COREI7: Cost Model: Found an estimated cost of 1 for instruction: %2 = call <4 x float> @llvm.ceil.v4f32(<4 x float> %wide.load)
}
declare <4 x float> @llvm.ceil.v4f32(<4 x float>) nounwind readnone