[ARM] Implement TTI::isHardwareLoopProfitable

Implement the backend target hook to drive the HardwareLoops pass.
The low-overhead branch extension for Arm M-class cores is flexible
enough that we don't have to ensure correctness at this point, except
for checking that the loop counter variable can be stored in LR, a
32-bit register. For a hardware loop to be profitable, we want to avoid
loops that contain function calls or any other instruction that alters
the PC.
    
This implementation uses TargetLoweringInfo to query type and
operation actions, looks at intrinsic calls, and also performs some
manual checks for remainder/division and FP operations.
    
I think this should be a good base to start and extra details can be
filled out later.

Differential Revision: https://reviews.llvm.org/D62907

llvm-svn: 363149
This commit is contained in:
Sam Parker 2019-06-12 12:00:42 +00:00
parent 91bb72a337
commit 757ac02dc8
8 changed files with 1332 additions and 0 deletions

View File

@ -36,6 +36,10 @@ using namespace llvm;
#define DEBUG_TYPE "armtti"
// Hidden command-line switch controlling low-overhead loop generation.
// It is initialised to true, so the feature stays off unless the flag is
// explicitly passed as -disable-arm-loloops=false.
static cl::opt<bool> DisableLowOverheadLoops(
"disable-arm-loloops", cl::Hidden, cl::init(true),
cl::desc("Disable the generation of low-overhead loops"));
bool ARMTTIImpl::areInlineCompatible(const Function *Caller,
const Function *Callee) const {
const TargetMachine &TM = getTLI()->getTargetMachine();
@ -628,6 +632,196 @@ int ARMTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
UseMaskForCond, UseMaskForGaps);
}
/// Return true if a call to \p F is expected to be lowered to a real
/// function call on the current subtarget, and false if it is expected to be
/// lowered to inline instructions.
///
/// Non-intrinsic functions, and intrinsics that are not listed explicitly
/// below, are answered by BaseT::isLoweredToCall.
bool ARMTTIImpl::isLoweredToCall(const Function *F) {
  // Only intrinsics receive Arm-specific treatment. Return the generic answer
  // for everything else rather than computing it and discarding the result.
  if (!F->isIntrinsic())
    return BaseT::isLoweredToCall(F);

  // Assume all Arm-specific intrinsics map to an instruction.
  if (F->getName().startswith("llvm.arm"))
    return false;

  switch (F->getIntrinsicID()) {
  default:
    break;
  // These math intrinsics are always treated as calls.
  case Intrinsic::powi:
  case Intrinsic::sin:
  case Intrinsic::cos:
  case Intrinsic::pow:
  case Intrinsic::log:
  case Intrinsic::log10:
  case Intrinsic::log2:
  case Intrinsic::exp:
  case Intrinsic::exp2:
    return true;
  // These FP intrinsics are treated as calls only when the subtarget lacks
  // the FP support required for the return type.
  case Intrinsic::sqrt:
  case Intrinsic::fabs:
  case Intrinsic::copysign:
  case Intrinsic::floor:
  case Intrinsic::ceil:
  case Intrinsic::trunc:
  case Intrinsic::rint:
  case Intrinsic::nearbyint:
  case Intrinsic::round:
  case Intrinsic::canonicalize:
  case Intrinsic::lround:
  case Intrinsic::llround:
  case Intrinsic::lrint:
  case Intrinsic::llrint:
    if (F->getReturnType()->isDoubleTy() && !ST->hasFP64())
      return true;
    if (F->getReturnType()->isHalfTy() && !ST->hasFullFP16())
      return true;
    // Some operations can be handled by vector instructions and assume
    // unsupported vectors will be expanded into supported scalar ones.
    // TODO Handle scalar operations properly.
    return !ST->hasFPARMv8Base() && !ST->hasVFP2Base();
  // Masked memory operations are treated as calls unless MVE integer
  // operations are available.
  case Intrinsic::masked_store:
  case Intrinsic::masked_load:
  case Intrinsic::masked_gather:
  case Intrinsic::masked_scatter:
    return !ST->hasMVEIntegerOps();
  // Overflow-checking and saturating add/sub are never treated as calls.
  case Intrinsic::sadd_with_overflow:
  case Intrinsic::uadd_with_overflow:
  case Intrinsic::ssub_with_overflow:
  case Intrinsic::usub_with_overflow:
  case Intrinsic::sadd_sat:
  case Intrinsic::uadd_sat:
  case Intrinsic::ssub_sat:
  case Intrinsic::usub_sat:
    return false;
  }

  // Any other intrinsic: fall back to the generic implementation.
  return BaseT::isLoweredToCall(F);
}
/// Decide whether loop \p L should be turned into a hardware (low-overhead)
/// loop and, if so, describe the loop counter in \p HWLoopInfo.
///
/// Returns false when the subtarget lacks the LOB extension or the
/// disable-arm-loloops option is set, when the loop has no single exit block,
/// when no loop-invariant backedge-taken count is available, when the trip
/// count is wider than 32 bits, or when any instruction in the loop may be
/// lowered to a call. Otherwise fills in \p HWLoopInfo (counter kept in a
/// register, i32 count type, decrement of 1) and returns true.
///
/// \p AC and \p LibInfo are not used by this implementation.
bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
AssumptionCache &AC,
TargetLibraryInfo *LibInfo,
TTI::HardwareLoopInfo &HWLoopInfo) {
// Low-overhead branches are only supported in the 'low-overhead branch'
// extension of v8.1-m.
if (!ST->hasLOB() || DisableLowOverheadLoops)
return false;
// For now, for simplicity, only support loops with one exit block.
if (!L->getExitBlock())
return false;
if (!SE.hasLoopInvariantBackedgeTakenCount(L))
return false;
const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
return false;
// The trip count is the backedge-taken count plus one.
const SCEV *TripCountSCEV =
SE.getAddExpr(BackedgeTakenCount,
SE.getOne(BackedgeTakenCount->getType()));
// We need to store the trip count in LR, a 32-bit register.
// NOTE(review): getBitWidth() presumably reports the width of the trip-count
// type rather than the magnitude of its maximum value - confirm intended.
if (SE.getUnsignedRangeMax(TripCountSCEV).getBitWidth() > 32)
return false;
// Making a call will trash LR and clear LO_BRANCH_INFO, so there's little
// point in generating a hardware loop if that's going to happen.
// MaybeCall returns true when instruction I may be lowered to a call.
auto MaybeCall = [this](Instruction &I) {
const ARMTargetLowering *TLI = getTLI();
unsigned ISD = TLI->InstructionOpcodeToISD(I.getOpcode());
EVT VT = TLI->getValueType(DL, I.getType(), true);
if (TLI->getOperationAction(ISD, VT) == TargetLowering::LibCall)
return true;
// Check if an intrinsic will be lowered to a call and assume that any
// other CallInst will generate a bl.
if (auto *Call = dyn_cast<CallInst>(&I)) {
if (isa<IntrinsicInst>(Call)) {
if (const Function *F = Call->getCalledFunction())
return isLoweredToCall(F);
}
return true;
}
// FPv5 provides conversions between integer, double-precision,
// single-precision, and half-precision formats.
switch (I.getOpcode()) {
default:
break;
case Instruction::FPToSI:
case Instruction::FPToUI:
case Instruction::SIToFP:
case Instruction::UIToFP:
case Instruction::FPTrunc:
case Instruction::FPExt:
return !ST->hasFPARMv8Base();
}
// FIXME: Unfortunately the approach of checking the Operation Action does
// not catch all cases of Legalization that use library calls. Our
// Legalization step categorizes some transformations into library calls as
// Custom, Expand or even Legal when doing type legalization. So for now
// we have to special case for instance the SDIV of 64bit integers and the
// use of floating point emulation.
if (VT.isInteger() && VT.getSizeInBits() >= 64) {
switch (ISD) {
default:
break;
case ISD::SDIV:
case ISD::UDIV:
case ISD::SREM:
case ISD::UREM:
case ISD::SDIVREM:
case ISD::UDIVREM:
return true;
}
}
// Assume all other non-float operations are supported.
if (!VT.isFloatingPoint())
return false;
// We'll need a library call to handle most floats when using soft.
// Only the data-movement style opcodes listed below are exempt.
if (TLI->useSoftFloat()) {
switch (I.getOpcode()) {
default:
return true;
case Instruction::Alloca:
case Instruction::Load:
case Instruction::Store:
case Instruction::Select:
case Instruction::PHI:
return false;
}
}
// We'll need a libcall to perform double precision operations on a single
// precision only FPU.
if (I.getType()->isDoubleTy() && !ST->hasFP64())
return true;
// Likewise for half precision arithmetic.
if (I.getType()->isHalfTy() && !ST->hasFullFP16())
return true;
return false;
};
// Scan the instructions to see if there's any that we know will turn into a
// call.
for (auto *BB : L->getBlocks())
for (auto &I : *BB)
if (MaybeCall(I))
return false;
// TODO: Check whether the trip count calculation is expensive. If L is the
// inner loop but we know it has a low trip count, calculating that trip
// count (in the parent loop) may be detrimental.
// Describe the counter: held in a register, i32 typed, decremented by 1.
LLVMContext &C = L->getHeader()->getContext();
HWLoopInfo.CounterInReg = true;
HWLoopInfo.CountType = Type::getInt32Ty(C);
HWLoopInfo.LoopDecrement = ConstantInt::get(HWLoopInfo.CountType, 1);
return true;
}
void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
TTI::UnrollingPreferences &UP) {
// Only currently enable these preferences for M-Class cores.

View File

@ -180,6 +180,12 @@ public:
bool UseMaskForCond = false,
bool UseMaskForGaps = false);
/// Returns true if a call to \p F is expected to be lowered to a real call.
bool isLoweredToCall(const Function *F);
/// Decides whether \p L should become a hardware (low-overhead) loop and, if
/// so, fills in \p HWLoopInfo with the loop counter description.
bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
AssumptionCache &AC,
TargetLibraryInfo *LibInfo,
TTI::HardwareLoopInfo &HWLoopInfo);
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
TTI::UnrollingPreferences &UP);

View File

@ -0,0 +1,404 @@
; Each invocation below enables a different feature set and adds its own
; FileCheck prefix on top of the common one: MAIN (no extra features),
; FP (+fullfp16), FP64 (+fp-armv8,+fullfp16), MVE (+mve) and MVEFP (+mve.fp).
; RUN: opt -mtriple=thumbv8.1m.main-arm-none-eabi -hardware-loops -disable-arm-loloops=false %s -S -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-MAIN
; RUN: opt -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+fullfp16 -hardware-loops -disable-arm-loloops=false %s -S -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FP
; RUN: opt -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+fp-armv8,+fullfp16 -hardware-loops -disable-arm-loloops=false %s -S -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FP64
; RUN: opt -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve -hardware-loops -disable-arm-loloops=false %s -S -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-MVE
; RUN: opt -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve.fp -hardware-loops -disable-arm-loloops=false %s -S -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-MVEFP
; The loop body calls the external function @bar, so no hardware-loop
; intrinsics are expected in the output.
; CHECK-LABEL: skip_call
; CHECK-NOT: call void @llvm.set.loop.iterations
; CHECK-NOT: call i32 @llvm.loop.decrement
define i32 @skip_call(i32 %n) {
entry:
%cmp6 = icmp eq i32 %n, 0
br i1 %cmp6, label %while.end, label %while.body.preheader
while.body.preheader:
br label %while.body
while.body:
%i.08 = phi i32 [ %inc1, %while.body ], [ 0, %while.body.preheader ]
%res.07 = phi i32 [ %add, %while.body ], [ 0, %while.body.preheader ]
%call = tail call i32 bitcast (i32 (...)* @bar to i32 ()*)() #2
%add = add nsw i32 %call, %res.07
%inc1 = add nuw i32 %i.08, 1
%exitcond = icmp eq i32 %inc1, %n
br i1 %exitcond, label %while.end.loopexit, label %while.body
while.end.loopexit:
br label %while.end
while.end:
%res.0.lcssa = phi i32 [ 0, %entry ], [ %add, %while.end.loopexit ]
ret i32 %res.0.lcssa
}
; The only call in the loop is the Arm intrinsic @llvm.arm.smlad. A hardware
; loop is expected with an iteration count of 50, since the counter steps by 2
; until it reaches 100.
; CHECK-LABEL: test_target_specific
; CHECK: call void @llvm.set.loop.iterations.i32(i32 50)
; CHECK: [[COUNT:%[^ ]+]] = phi i32 [ 50, %entry ], [ [[LOOP_DEC:%[^ ]+]], %loop ]
; CHECK: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[COUNT]], i32 1)
; CHECK: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
; CHECK: br i1 [[CMP]], label %loop, label %exit
define i32 @test_target_specific(i32* %a, i32* %b) {
entry:
br label %loop
loop:
%acc = phi i32 [ 0, %entry ], [ %res, %loop ]
%count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
%addr.a = getelementptr i32, i32* %a, i32 %count
%addr.b = getelementptr i32, i32* %b, i32 %count
%load.a = load i32, i32* %addr.a
%load.b = load i32, i32* %addr.b
%res = call i32 @llvm.arm.smlad(i32 %load.a, i32 %load.b, i32 %acc)
%count.next = add nuw i32 %count, 2
%cmp = icmp ne i32 %count.next, 100
br i1 %cmp, label %loop, label %exit
exit:
ret i32 %res
}
; Half-precision llvm.fabs in the loop body: a hardware loop is expected for
; the FP and MVEFP runs and none for the MAIN and MVE runs.
; CHECK-LABEL: test_fabs_f16
; CHECK-MAIN-NOT: call void @llvm.set.loop.iterations
; CHECK-MVE-NOT: call void @llvm.set.loop.iterations
; CHECK-FP: call void @llvm.set.loop.iterations.i32(i32 100)
; CHECK-MVEFP: call void @llvm.set.loop.iterations.i32(i32 100)
define void @test_fabs_f16(half* %a, half* %b) {
entry:
br label %loop
loop:
%count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
%addr.a = getelementptr half, half* %a, i32 %count
%load.a = load half, half* %addr.a
%abs = call half @llvm.fabs.f16(half %load.a)
%addr.b = getelementptr half, half* %b, i32 %count
store half %abs, half *%addr.b
%count.next = add nuw i32 %count, 1
%cmp = icmp ne i32 %count.next, 100
br i1 %cmp, label %loop, label %exit
exit:
ret void
}
; Single-precision llvm.fabs feeding an fadd reduction: a hardware loop is
; expected for the FP and MVEFP runs and none for the MAIN and MVE runs.
; CHECK-LABEL: test_fabs
; CHECK-MAIN-NOT: call void @llvm.set.loop.iterations
; CHECK-MVE-NOT: call void @llvm.set.loop.iterations
; CHECK-FP: call void @llvm.set.loop.iterations.i32(i32 100)
; CHECK-MVEFP: call void @llvm.set.loop.iterations.i32(i32 100)
define float @test_fabs(float* %a) {
entry:
br label %loop
loop:
%acc = phi float [ 0.0, %entry ], [ %res, %loop ]
%count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
%addr.a = getelementptr float, float* %a, i32 %count
%load.a = load float, float* %addr.a
%abs = call float @llvm.fabs.f32(float %load.a)
%res = fadd float %abs, %acc
%count.next = add nuw i32 %count, 1
%cmp = icmp ne i32 %count.next, 100
br i1 %cmp, label %loop, label %exit
exit:
ret float %res
}
; Double-precision llvm.fabs in the loop body: a hardware loop is expected
; only for the FP64 run; every other run must not produce one.
; CHECK-LABEL: test_fabs_64
; CHECK-MAIN-NOT: call void @llvm.set.loop.iterations
; CHECK-MVE-NOT: call void @llvm.set.loop.iterations
; CHECK-FP-NOT: call void @llvm.set.loop.iterations.i32(i32 100)
; CHECK-FP64: call void @llvm.set.loop.iterations.i32(i32 100)
; CHECK-MVEFP-NOT: call void @llvm.set.loop.iterations.i32(i32 100)
define void @test_fabs_64(double* %a, double* %b) {
entry:
br label %loop
loop:
%count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
%addr.a = getelementptr double, double* %a, i32 %count
%load.a = load double, double* %addr.a
%abs = call double @llvm.fabs.f64(double %load.a)
%addr.b = getelementptr double, double* %b, i32 %count
store double %abs, double *%addr.b
%count.next = add nuw i32 %count, 1
%cmp = icmp ne i32 %count.next, 100
br i1 %cmp, label %loop, label %exit
exit:
ret void
}
; llvm.fabs on <4 x float>: the MVE run must not produce a hardware loop,
; while the MVEFP run must produce one with an iteration count of 100.
; CHECK-LABEL: test_fabs_vec
; CHECK-MVE-NOT: call void @llvm.set.loop.iterations
; CHECK-MVEFP: call void @llvm.set.loop.iterations.i32(i32 100)
; CHECK-MVEFP: [[COUNT:%[^ ]+]] = phi i32 [ 100, %entry ], [ [[LOOP_DEC:%[^ ]+]], %loop ]
; CHECK-MVEFP: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[COUNT]], i32 1)
; CHECK-MVEFP: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
; CHECK-MVEFP: br i1 [[CMP]], label %loop, label %exit
define <4 x float> @test_fabs_vec(<4 x float>* %a) {
entry:
br label %loop
loop:
%acc = phi <4 x float> [ zeroinitializer, %entry ], [ %res, %loop ]
%count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
%addr.a = getelementptr <4 x float>, <4 x float>* %a, i32 %count
%load.a = load <4 x float>, <4 x float>* %addr.a
%abs = call <4 x float> @llvm.fabs.v4f32(<4 x float> %load.a)
%res = fadd <4 x float> %abs, %acc
%count.next = add nuw i32 %count, 1
%cmp = icmp ne i32 %count.next, 100
br i1 %cmp, label %loop, label %exit
exit:
ret <4 x float> %res
}
; llvm.log in the loop body: no hardware-loop intrinsics are expected for any
; of the tested configurations.
; CHECK-LABEL: test_log
; CHECK-NOT: call void @llvm.set.loop.iterations
; CHECK-NOT: llvm.loop.decrement
define float @test_log(float* %a) {
entry:
br label %loop
loop:
%acc = phi float [ 0.0, %entry ], [ %res, %loop ]
%count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
%addr.a = getelementptr float, float* %a, i32 %count
%load.a = load float, float* %addr.a
%abs = call float @llvm.log.f32(float %load.a)
%res = fadd float %abs, %acc
%count.next = add nuw i32 %count, 1
%cmp = icmp ne i32 %count.next, 100
br i1 %cmp, label %loop, label %exit
exit:
ret float %res
}
; Half-precision llvm.sqrt in the loop body: a hardware loop is expected for
; the FP, MVEFP and FP64 runs and none for the MAIN and MVE runs.
; CHECK-LABEL: test_sqrt_16
; CHECK-MAIN-NOT: call void @llvm.set.loop.iterations
; CHECK-MVE-NOT: call void @llvm.set.loop.iterations
; CHECK-FP: call void @llvm.set.loop.iterations.i32(i32 100)
; CHECK-MVEFP: call void @llvm.set.loop.iterations.i32(i32 100)
; CHECK-FP64: call void @llvm.set.loop.iterations.i32(i32 100)
define void @test_sqrt_16(half* %a, half* %b) {
entry:
br label %loop
loop:
%count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
%addr.a = getelementptr half, half* %a, i32 %count
%load.a = load half, half* %addr.a
%sqrt = call half @llvm.sqrt.f16(half %load.a)
%addr.b = getelementptr half, half* %b, i32 %count
store half %sqrt, half *%addr.b
%count.next = add nuw i32 %count, 1
%cmp = icmp ne i32 %count.next, 100
br i1 %cmp, label %loop, label %exit
exit:
ret void
}
; Single-precision llvm.sqrt in the loop body: a hardware loop is expected for
; the FP and MVEFP runs and none for the MAIN and MVE runs.
; CHECK-LABEL: test_sqrt
; CHECK-MAIN-NOT: call void @llvm.set.loop.iterations
; CHECK-MVE-NOT: call void @llvm.set.loop.iterations
; CHECK-FP: call void @llvm.set.loop.iterations
; CHECK-MVEFP: call void @llvm.set.loop.iterations.i32(i32 100)
; CHECK-MVEFP: [[COUNT:%[^ ]+]] = phi i32 [ 100, %entry ], [ [[LOOP_DEC:%[^ ]+]], %loop ]
; CHECK-MVEFP: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[COUNT]], i32 1)
; CHECK-MVEFP: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
; CHECK-MVEFP: br i1 [[CMP]], label %loop, label %exit
define void @test_sqrt(float* %a, float* %b) {
entry:
br label %loop
loop:
%count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
%addr.a = getelementptr float, float* %a, i32 %count
%load.a = load float, float* %addr.a
%sqrt = call float @llvm.sqrt.f32(float %load.a)
%addr.b = getelementptr float, float* %b, i32 %count
store float %sqrt, float* %addr.b
%count.next = add nuw i32 %count, 1
%cmp = icmp ne i32 %count.next, 100
br i1 %cmp, label %loop, label %exit
exit:
ret void
}
; Double-precision llvm.sqrt in the loop body: a hardware loop is expected
; only for the FP64 run; every other run must not produce one.
; CHECK-LABEL: test_sqrt_64
; CHECK-MAIN-NOT: call void @llvm.set.loop.iterations
; CHECK-MVE-NOT: call void @llvm.set.loop.iterations
; CHECK-FP-NOT: call void @llvm.set.loop.iterations.i32(i32 100)
; CHECK-MVEFP-NOT: call void @llvm.set.loop.iterations.i32(i32 100)
; CHECK-FP64: call void @llvm.set.loop.iterations.i32(i32 100)
define void @test_sqrt_64(double* %a, double* %b) {
entry:
br label %loop
loop:
%count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
%addr.a = getelementptr double, double* %a, i32 %count
%load.a = load double, double* %addr.a
%sqrt = call double @llvm.sqrt.f64(double %load.a)
%addr.b = getelementptr double, double* %b, i32 %count
store double %sqrt, double *%addr.b
%count.next = add nuw i32 %count, 1
%cmp = icmp ne i32 %count.next, 100
br i1 %cmp, label %loop, label %exit
exit:
ret void
}
; llvm.sqrt on <4 x float>: a hardware loop is expected for the FP and MVEFP
; runs and none for the MAIN and MVE runs.
; CHECK-LABEL: test_sqrt_vec
; CHECK-MAIN-NOT: call void @llvm.set.loop.iterations
; CHECK-MVE-NOT: call void @llvm.set.loop.iterations
; CHECK-FP: call void @llvm.set.loop.iterations.i32(i32 100)
; CHECK-MVEFP: call void @llvm.set.loop.iterations.i32(i32 100)
define void @test_sqrt_vec(<4 x float>* %a, <4 x float>* %b) {
entry:
br label %loop
loop:
%count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
%addr.a = getelementptr <4 x float>, <4 x float>* %a, i32 %count
%load.a = load <4 x float>, <4 x float>* %addr.a
%sqrt = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %load.a)
%addr.b = getelementptr <4 x float>, <4 x float>* %b, i32 %count
store <4 x float> %sqrt, <4 x float>* %addr.b
%count.next = add nuw i32 %count, 1
%cmp = icmp ne i32 %count.next, 100
br i1 %cmp, label %loop, label %exit
exit:
ret void
}
; llvm.sadd.with.overflow in the loop body: a hardware loop is expected for
; every tested configuration.
; CHECK-LABEL: test_overflow
; CHECK: call void @llvm.set.loop.iterations
define i32 @test_overflow(i32* %a, i32* %b) {
entry:
br label %loop
loop:
%acc = phi i32 [ 0, %entry ], [ %res, %loop ]
%count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
%addr.a = getelementptr i32, i32* %a, i32 %count
%addr.b = getelementptr i32, i32* %b, i32 %count
%load.a = load i32, i32* %addr.a
%load.b = load i32, i32* %addr.b
%sadd = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %load.a, i32 %load.b)
%res = extractvalue {i32, i1} %sadd, 0
%count.next = add nuw i32 %count, 1
%cmp = icmp ne i32 %count.next, 100
br i1 %cmp, label %loop, label %exit
exit:
ret i32 %res
}
; llvm.sadd.sat in the loop body: a hardware loop with 100 iterations is
; expected for every tested configuration.
; TODO: We should be able to generate a qadd/sub
; CHECK-LABEL: test_sat
; CHECK: call void @llvm.set.loop.iterations.i32(i32 100)
define i32 @test_sat(i32* %a, i32* %b) {
entry:
br label %loop
loop:
%acc = phi i32 [ 0, %entry ], [ %res, %loop ]
%count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
%addr.a = getelementptr i32, i32* %a, i32 %count
%addr.b = getelementptr i32, i32* %b, i32 %count
%load.a = load i32, i32* %addr.a
%load.b = load i32, i32* %addr.b
%res = call i32 @llvm.sadd.sat.i32(i32 %load.a, i32 %load.b)
%count.next = add nuw i32 %count, 1
%cmp = icmp ne i32 %count.next, 100
br i1 %cmp, label %loop, label %exit
exit:
ret i32 %res
}
; Masked load/store of <4 x i32>: a hardware loop is expected only for the MVE
; and MVEFP runs; the remaining runs must not produce one.
; CHECK-LABEL: test_masked_i32
; CHECK-NOT: call void @llvm.set.loop.iterations
; CHECK-MVEFP: call void @llvm.set.loop.iterations
; CHECK-MVE: call void @llvm.set.loop.iterations.i32(i32 100)
; CHECK-MVE: [[COUNT:%[^ ]+]] = phi i32 [ 100, %entry ], [ [[LOOP_DEC:%[^ ]+]], %loop ]
; CHECK-MVE: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[COUNT]], i32 1)
; CHECK-MVE: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
; CHECK-MVE: br i1 [[CMP]], label %loop, label %exit
define void @test_masked_i32(<4 x i1> %mask, <4 x i32>* %a, <4 x i32>* %b, <4 x i32>* %c, <4 x i32> %passthru) {
entry:
br label %loop
loop:
%count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
%addr.a = getelementptr <4 x i32>, <4 x i32>* %a, i32 %count
%addr.b = getelementptr <4 x i32>, <4 x i32>* %b, i32 %count
%addr.c = getelementptr <4 x i32>, <4 x i32>* %c, i32 %count
%load.a = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr.a, i32 4, <4 x i1> %mask, <4 x i32> %passthru)
%load.b = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr.b, i32 4, <4 x i1> %mask, <4 x i32> %passthru)
%res = add <4 x i32> %load.a, %load.b
call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %res, <4 x i32>* %addr.c, i32 4, <4 x i1> %mask)
%count.next = add nuw i32 %count, 1
%cmp = icmp ne i32 %count.next, 100
br i1 %cmp, label %loop, label %exit
exit:
ret void
}
; Masked load/store of <4 x float> with an fadd in between: a hardware loop is
; expected only for the MVE and MVEFP runs.
; CHECK-LABEL: test_masked_f32
; CHECK-NOT: call void @llvm.set.loop.iterations
; CHECK-MVEFP: call void @llvm.set.loop.iterations
; CHECK-MVE: call void @llvm.set.loop.iterations.i32(i32 100)
; CHECK-MVE: [[COUNT:%[^ ]+]] = phi i32 [ 100, %entry ], [ [[LOOP_DEC:%[^ ]+]], %loop ]
; CHECK-MVE: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[COUNT]], i32 1)
; CHECK-MVE: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
; CHECK-MVE: br i1 [[CMP]], label %loop, label %exit
define void @test_masked_f32(<4 x i1> %mask, <4 x float>* %a, <4 x float>* %b, <4 x float>* %c, <4 x float> %passthru) {
entry:
br label %loop
loop:
%count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
%addr.a = getelementptr <4 x float>, <4 x float>* %a, i32 %count
%addr.b = getelementptr <4 x float>, <4 x float>* %b, i32 %count
%addr.c = getelementptr <4 x float>, <4 x float>* %c, i32 %count
%load.a = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %addr.a, i32 4, <4 x i1> %mask, <4 x float> %passthru)
%load.b = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %addr.b, i32 4, <4 x i1> %mask, <4 x float> %passthru)
%res = fadd <4 x float> %load.a, %load.b
call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %res, <4 x float>* %addr.c, i32 4, <4 x i1> %mask)
%count.next = add nuw i32 %count, 1
%cmp = icmp ne i32 %count.next, 100
br i1 %cmp, label %loop, label %exit
exit:
ret void
}
; Masked gather/scatter of <4 x float>: a hardware loop is expected only for
; the MVE and MVEFP runs.
; CHECK-LABEL: test_gather_scatter
; CHECK-NOT: call void @llvm.set.loop.iterations
; CHECK-MVEFP: call void @llvm.set.loop.iterations
; CHECK-MVE: call void @llvm.set.loop.iterations.i32(i32 100)
; CHECK-MVE: [[COUNT:%[^ ]+]] = phi i32 [ 100, %entry ], [ [[LOOP_DEC:%[^ ]+]], %loop ]
; CHECK-MVE: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[COUNT]], i32 1)
; CHECK-MVE: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
; CHECK-MVE: br i1 [[CMP]], label %loop, label %exit
define void @test_gather_scatter(<4 x i1> %mask, <4 x float*> %a, <4 x float*> %b, <4 x float*> %c, <4 x float> %passthru) {
entry:
br label %loop
loop:
%count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
%load.a = call <4 x float> @llvm.masked.gather.v4f32.p0v4f32(<4 x float*> %a, i32 4, <4 x i1> %mask, <4 x float> %passthru)
%load.b = call <4 x float> @llvm.masked.gather.v4f32.p0v4f32(<4 x float*> %b, i32 4, <4 x i1> %mask, <4 x float> %passthru)
%res = fadd <4 x float> %load.a, %load.b
call void @llvm.masked.scatter.v4f32.p0v4f32(<4 x float> %res, <4 x float*> %c, i32 4, <4 x i1> %mask)
%count.next = add nuw i32 %count, 1
%cmp = icmp ne i32 %count.next, 100
br i1 %cmp, label %loop, label %exit
exit:
ret void
}
; Declarations of the external function and the intrinsics called by the test
; functions in this file.
declare i32 @bar(...) local_unnamed_addr #1
declare i32 @llvm.arm.smlad(i32, i32, i32)
declare half @llvm.fabs.f16(half)
declare float @llvm.fabs.f32(float)
declare double @llvm.fabs.f64(double)
declare float @llvm.log.f32(float)
declare <4 x float> @llvm.fabs.v4f32(<4 x float>)
declare half @llvm.sqrt.f16(half)
declare float @llvm.sqrt.f32(float)
declare double @llvm.sqrt.f64(double)
declare <4 x float> @llvm.sqrt.v4f32(<4 x float>)
declare i32 @llvm.sadd.sat.i32(i32, i32)
declare {i32, i1} @llvm.sadd.with.overflow.i32(i32, i32)
declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>)
declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32, <4 x i1>)
declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32, <4 x i1>, <4 x float>)
declare void @llvm.masked.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32, <4 x i1>)
declare <4 x float> @llvm.masked.gather.v4f32.p0v4f32(<4 x float*>, i32, <4 x i1>, <4 x float>)
declare void @llvm.masked.scatter.v4f32.p0v4f32(<4 x float>, <4 x float*>, i32, <4 x i1>)

View File

@ -0,0 +1,35 @@
; Emit textual IR (-S) so that FileCheck inspects readable output; without it
; opt writes bitcode to the pipe and the negative checks can never fail.
; RUN: opt -mtriple=thumbv8.1m.main-arm-none-eabi -hardware-loops -disable-arm-loloops=false %s -S -o - | FileCheck %s
@g = common local_unnamed_addr global i32* null, align 4
; The induction variable and the loop bound %n are i64, so no hardware-loop
; intrinsics are expected in the output.
; CHECK-LABEL: counter_too_large
; CHECK-NOT: call void @llvm.set.loop.iterations
; CHECK-NOT: call i32 @llvm.loop.decrement
define i32 @counter_too_large(i64 %n) {
entry:
%cmp7 = icmp eq i64 %n, 0
br i1 %cmp7, label %while.end, label %while.body.lr.ph
while.body.lr.ph:
%0 = load i32*, i32** @g, align 4
br label %while.body
while.body:
%i.09 = phi i64 [ 0, %while.body.lr.ph ], [ %inc1, %while.body ]
%res.08 = phi i32 [ 0, %while.body.lr.ph ], [ %add, %while.body ]
%idxprom = trunc i64 %i.09 to i32
%arrayidx = getelementptr inbounds i32, i32* %0, i32 %idxprom
%1 = load i32, i32* %arrayidx, align 4
%add = add nsw i32 %1, %res.08
%inc1 = add nuw i64 %i.09, 1
%cmp = icmp ult i64 %inc1, %n
br i1 %cmp, label %while.body, label %while.end.loopexit
while.end.loopexit:
br label %while.end
while.end:
%res.0.lcssa = phi i32 [ 0, %entry ], [ %add, %while.end.loopexit ]
ret i32 %res.0.lcssa
}

View File

@ -0,0 +1,259 @@
; Checks that 32-bit integer division and remainder in a loop body still allow
; a hardware loop, while the 64-bit forms prevent one.
; RUN: opt -mtriple=thumbv8.1m.main-arm-none-eabi -hardware-loops -disable-arm-loloops=false %s -S -o - | FileCheck %s
@g = common local_unnamed_addr global i32* null, align 4
; 32-bit urem in the loop body: a hardware loop with iteration count %n is
; expected, set up in the preheader.
; CHECK-LABEL: do_with_i32_urem
; CHECK: while.body.preheader:
; CHECK: call void @llvm.set.loop.iterations.i32(i32 %n)
; CHECK-NEXT: br label %while.body
; CHECK: [[REM:%[^ ]+]] = phi i32 [ %n, %while.body.preheader ], [ [[LOOP_DEC:%[^ ]+]], %while.body ]
; CHECK: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[REM]], i32 1)
; CHECK: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
; CHECK: br i1 [[CMP]], label %while.body, label %while.end.loopexit
define i32 @do_with_i32_urem(i32 %n) {
entry:
%cmp7 = icmp eq i32 %n, 0
br i1 %cmp7, label %while.end, label %while.body.preheader
while.body.preheader:
br label %while.body
while.body:
%i.09 = phi i32 [ %inc1, %while.body ], [ 0, %while.body.preheader ]
%res.08 = phi i32 [ %add, %while.body ], [ 0, %while.body.preheader ]
%rem = urem i32 %i.09, 5
%add = add i32 %rem, %res.08
%inc1 = add nuw i32 %i.09, 1
%exitcond = icmp eq i32 %inc1, %n
br i1 %exitcond, label %while.end.loopexit, label %while.body
while.end.loopexit:
br label %while.end
while.end:
%res.0.lcssa = phi i32 [ 0, %entry ], [ %add, %while.end.loopexit ]
ret i32 %res.0.lcssa
}
; 32-bit srem in the loop body: a hardware loop with iteration count %n is
; expected, set up in the preheader.
; CHECK-LABEL: do_with_i32_srem
; CHECK: while.body.preheader:
; CHECK: call void @llvm.set.loop.iterations.i32(i32 %n)
; CHECK-NEXT: br label %while.body
; CHECK: [[REM:%[^ ]+]] = phi i32 [ %n, %while.body.preheader ], [ [[LOOP_DEC:%[^ ]+]], %while.body ]
; CHECK: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[REM]], i32 1)
; CHECK: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
; CHECK: br i1 [[CMP]], label %while.body, label %while.end.loopexit
define i32 @do_with_i32_srem(i32 %n) {
entry:
%cmp7 = icmp eq i32 %n, 0
br i1 %cmp7, label %while.end, label %while.body.preheader
while.body.preheader:
br label %while.body
while.body:
%i.09 = phi i32 [ %inc1, %while.body ], [ 0, %while.body.preheader ]
%res.08 = phi i32 [ %add, %while.body ], [ 0, %while.body.preheader ]
%rem = srem i32 %i.09, 5
%add = sub i32 %rem, %res.08
%inc1 = add nuw i32 %i.09, 1
%exitcond = icmp eq i32 %inc1, %n
br i1 %exitcond, label %while.end.loopexit, label %while.body
while.end.loopexit:
br label %while.end
while.end:
%res.0.lcssa = phi i32 [ 0, %entry ], [ %add, %while.end.loopexit ]
ret i32 %res.0.lcssa
}
; 32-bit udiv in the loop body: a hardware loop with iteration count %n is
; expected, set up in the preheader.
; CHECK-LABEL: do_with_i32_udiv
; CHECK: while.body.preheader:
; CHECK: call void @llvm.set.loop.iterations.i32(i32 %n)
; CHECK-NEXT: br label %while.body
; CHECK: [[REM:%[^ ]+]] = phi i32 [ %n, %while.body.preheader ], [ [[LOOP_DEC:%[^ ]+]], %while.body ]
; CHECK: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[REM]], i32 1)
; CHECK: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
; CHECK: br i1 [[CMP]], label %while.body, label %while.end.loopexit
define i32 @do_with_i32_udiv(i32 %n) {
entry:
%cmp7 = icmp eq i32 %n, 0
br i1 %cmp7, label %while.end, label %while.body.preheader
while.body.preheader:
br label %while.body
while.body:
%i.09 = phi i32 [ %inc1, %while.body ], [ 0, %while.body.preheader ]
%res.08 = phi i32 [ %add, %while.body ], [ 0, %while.body.preheader ]
%rem = udiv i32 %i.09, 5
%add = add i32 %rem, %res.08
%inc1 = add nuw i32 %i.09, 1
%exitcond = icmp eq i32 %inc1, %n
br i1 %exitcond, label %while.end.loopexit, label %while.body
while.end.loopexit:
br label %while.end
while.end:
%res.0.lcssa = phi i32 [ 0, %entry ], [ %add, %while.end.loopexit ]
ret i32 %res.0.lcssa
}
; 32-bit sdiv in the loop body: a hardware loop with iteration count %n is
; expected, set up in the preheader.
; CHECK-LABEL: do_with_i32_sdiv
; CHECK: while.body.preheader:
; CHECK: call void @llvm.set.loop.iterations.i32(i32 %n)
; CHECK-NEXT: br label %while.body
; CHECK: [[REM:%[^ ]+]] = phi i32 [ %n, %while.body.preheader ], [ [[LOOP_DEC:%[^ ]+]], %while.body ]
; CHECK: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[REM]], i32 1)
; CHECK: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
; CHECK: br i1 [[CMP]], label %while.body, label %while.end.loopexit
define i32 @do_with_i32_sdiv(i32 %n) {
entry:
%cmp7 = icmp eq i32 %n, 0
br i1 %cmp7, label %while.end, label %while.body.preheader
while.body.preheader:
br label %while.body
while.body:
%i.09 = phi i32 [ %inc1, %while.body ], [ 0, %while.body.preheader ]
%res.08 = phi i32 [ %add, %while.body ], [ 0, %while.body.preheader ]
%rem = sdiv i32 %i.09, 5
%add = sub i32 %rem, %res.08
%inc1 = add nuw i32 %i.09, 1
%exitcond = icmp eq i32 %inc1, %n
br i1 %exitcond, label %while.end.loopexit, label %while.body
while.end.loopexit:
br label %while.end
while.end:
%res.0.lcssa = phi i32 [ 0, %entry ], [ %add, %while.end.loopexit ]
ret i32 %res.0.lcssa
}
; 64-bit urem in the loop body: no hardware-loop intrinsics are expected.
; CHECK-LABEL: do_with_i64_urem
; CHECK-NOT: llvm.set.loop.iterations
; CHECK-NOT: llvm.loop.decrement
define i64 @do_with_i64_urem(i32 %n) {
entry:
%cmp7 = icmp eq i32 %n, 0
br i1 %cmp7, label %while.end, label %while.body.preheader
while.body.preheader:
br label %while.body
while.body:
%i.09 = phi i32 [ %inc1, %while.body ], [ 0, %while.body.preheader ]
%res.08 = phi i64 [ %add, %while.body ], [ 0, %while.body.preheader ]
%conv = zext i32 %i.09 to i64
%rem = urem i64 %conv, 5
%add = add i64 %rem, %res.08
%inc1 = add nuw i32 %i.09, 1
%exitcond = icmp eq i32 %inc1, %n
br i1 %exitcond, label %while.end.loopexit, label %while.body
while.end.loopexit:
br label %while.end
while.end:
%res.0.lcssa = phi i64 [ 0, %entry ], [ %add, %while.end.loopexit ]
ret i64 %res.0.lcssa
}
; 64-bit srem in the loop body: no hardware-loop intrinsics are expected.
; CHECK-LABEL: do_with_i64_srem
; CHECK-NOT: llvm.set.loop.iterations
; CHECK-NOT: llvm.loop.decrement
define i64 @do_with_i64_srem(i32 %n) {
entry:
%cmp7 = icmp eq i32 %n, 0
br i1 %cmp7, label %while.end, label %while.body.preheader
while.body.preheader:
br label %while.body
while.body:
%i.09 = phi i32 [ %inc1, %while.body ], [ 0, %while.body.preheader ]
%res.08 = phi i64 [ %add, %while.body ], [ 0, %while.body.preheader ]
%conv = zext i32 %i.09 to i64
%rem = srem i64 %conv, 5
%add = sub i64 %rem, %res.08
%inc1 = add nuw i32 %i.09, 1
%exitcond = icmp eq i32 %inc1, %n
br i1 %exitcond, label %while.end.loopexit, label %while.body
while.end.loopexit:
br label %while.end
while.end:
%res.0.lcssa = phi i64 [ 0, %entry ], [ %add, %while.end.loopexit ]
ret i64 %res.0.lcssa
}
; 64-bit udiv in the loop body: no hardware-loop intrinsics are expected.
; CHECK-LABEL: do_with_i64_udiv
; CHECK-NOT: llvm.set.loop.iterations
; CHECK-NOT: llvm.loop.decrement
define i64 @do_with_i64_udiv(i32 %n) {
entry:
%cmp7 = icmp eq i32 %n, 0
br i1 %cmp7, label %while.end, label %while.body.preheader
while.body.preheader:
br label %while.body
while.body:
%i.09 = phi i32 [ %inc1, %while.body ], [ 0, %while.body.preheader ]
%res.08 = phi i64 [ %add, %while.body ], [ 0, %while.body.preheader ]
%conv = zext i32 %i.09 to i64
%rem = udiv i64 %conv, 5
%add = add i64 %rem, %res.08
%inc1 = add nuw i32 %i.09, 1
%exitcond = icmp eq i32 %inc1, %n
br i1 %exitcond, label %while.end.loopexit, label %while.body
while.end.loopexit:
br label %while.end
while.end:
%res.0.lcssa = phi i64 [ 0, %entry ], [ %add, %while.end.loopexit ]
ret i64 %res.0.lcssa
}
; 64-bit sdiv in the loop body: no hardware-loop intrinsics are expected.
; CHECK-LABEL: do_with_i64_sdiv
; CHECK-NOT: call void @llvm.set.loop.iterations
; CHECK-NOT: call i32 @llvm.loop.decrement
define i64 @do_with_i64_sdiv(i32 %n) {
entry:
%cmp7 = icmp eq i32 %n, 0
br i1 %cmp7, label %while.end, label %while.body.preheader
while.body.preheader:
br label %while.body
while.body:
%i.09 = phi i32 [ %inc1, %while.body ], [ 0, %while.body.preheader ]
%res.08 = phi i64 [ %add, %while.body ], [ 0, %while.body.preheader ]
%conv = zext i32 %i.09 to i64
%rem = sdiv i64 %conv, 5
%add = sub i64 %rem, %res.08
%inc1 = add nuw i32 %i.09, 1
%exitcond = icmp eq i32 %inc1, %n
br i1 %exitcond, label %while.end.loopexit, label %while.body
while.end.loopexit:
br label %while.end
while.end:
%res.0.lcssa = phi i64 [ 0, %entry ], [ %add, %while.end.loopexit ]
ret i64 %res.0.lcssa
}

View File

@ -0,0 +1,207 @@
; Runs the hardware-loops pass through opt on loops that contain
; floating-point operations, for a thumbv8.1m.main triple. The first
; invocation enables +fp-armv8 and is verified with the common and FP check
; prefixes; the second uses +soft-float and is verified with the common and
; SOFT check prefixes. Both pass -disable-arm-loloops=false so that
; low-overhead loop generation is switched on.
; RUN: opt -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+fp-armv8 -hardware-loops -disable-arm-loloops=false %s -S -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FP
; RUN: opt -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+soft-float -hardware-loops -disable-arm-loloops=false %s -S -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-SOFT
; Loop that conditionally converts a double to a signed i32 (fptosi).
; With FP hardware the loop is expected to become a hardware loop: the
; iteration count is select(%n ugt 1, %n, 1), it is set in while.body.lr.ph,
; carried in a phi, decremented by 1 through the decrement.reg intrinsic and
; compared against zero to form the latch branch.
; With soft-float no call to the iteration-count setup intrinsic may appear.
; CHECK-LABEL: test_fptosi
; CHECK: while.body.lr.ph:
; CHECK-FP: [[CMP:%[^ ]+]] = icmp ugt i32 %n, 1
; CHECK-FP: [[COUNT:%[^ ]+]] = select i1 [[CMP]], i32 %n, i32 1
; CHECK-FP: call void @llvm.set.loop.iterations.i32(i32 [[COUNT]])
; CHECK-FP-NEXT: br label %while.body
; CHECK-FP: [[REM:%[^ ]+]] = phi i32 [ [[COUNT]], %while.body.lr.ph ], [ [[LOOP_DEC:%[^ ]+]], %if.end4 ]
; CHECK-FP: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[REM]], i32 1)
; CHECK-FP: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
; CHECK-FP: br i1 [[CMP]], label %while.body, label %cleanup.loopexit
; CHECK-SOFT-NOT: call void @llvm.set.loop.iterations
define void @test_fptosi(i32 %n, i32** %g, double** %d) {
entry:
; Unsigned range check: the loop is entered only when 1 <= %n <= 500.
%n.off = add i32 %n, -1
%0 = icmp ult i32 %n.off, 500
br i1 %0, label %while.body.lr.ph, label %cleanup
while.body.lr.ph:
%1 = load double*, double** %d, align 4
%2 = load i32*, i32** %g, align 4
br label %while.body
while.body:
%i.012 = phi i32 [ 0, %while.body.lr.ph ], [ %inc, %if.end4 ]
; Iterations whose index is a multiple of 10 skip the conversion.
%rem = urem i32 %i.012, 10
%tobool = icmp eq i32 %rem, 0
br i1 %tobool, label %if.end4, label %if.then2
if.then2:
%arrayidx = getelementptr inbounds double, double* %1, i32 %i.012
%3 = load double, double* %arrayidx, align 8
; The double -> signed i32 conversion under test.
%conv = fptosi double %3 to i32
%arrayidx3 = getelementptr inbounds i32, i32* %2, i32 %i.012
store i32 %conv, i32* %arrayidx3, align 4
br label %if.end4
if.end4:
; Latch: the only loop exit, taken once %inc reaches %n.
%inc = add nuw i32 %i.012, 1
%cmp1 = icmp ult i32 %inc, %n
br i1 %cmp1, label %while.body, label %cleanup.loopexit
cleanup.loopexit:
br label %cleanup
cleanup:
ret void
}
; Loop that conditionally converts a double to an unsigned i32 (fptoui).
; With FP hardware the loop is expected to become a hardware loop whose
; iteration count is select(%n ugt 1, %n, 1), set in while.body.lr.ph and
; decremented by 1 per iteration through the decrement.reg intrinsic.
; With soft-float no call to the iteration-count setup intrinsic may appear.
; CHECK-LABEL: test_fptoui
; CHECK-FP: while.body.lr.ph:
; CHECK-FP: [[CMP:%[^ ]+]] = icmp ugt i32 %n, 1
; CHECK-FP: [[COUNT:%[^ ]+]] = select i1 [[CMP]], i32 %n, i32 1
; CHECK-FP: call void @llvm.set.loop.iterations.i32(i32 [[COUNT]])
; CHECK-FP-NEXT: br label %while.body
; CHECK-FP: [[REM:%[^ ]+]] = phi i32 [ [[COUNT]], %while.body.lr.ph ], [ [[LOOP_DEC:%[^ ]+]], %if.end4 ]
; CHECK-FP: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[REM]], i32 1)
; CHECK-FP: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
; CHECK-FP: br i1 [[CMP]], label %while.body, label %cleanup.loopexit
; CHECK-SOFT-NOT: call void @llvm.set.loop.iterations
define void @test_fptoui(i32 %n, i32** %g, double** %d) {
entry:
; Unsigned range check: the loop is entered only when 1 <= %n <= 500.
%n.off = add i32 %n, -1
%0 = icmp ult i32 %n.off, 500
br i1 %0, label %while.body.lr.ph, label %cleanup
while.body.lr.ph:
%1 = load double*, double** %d, align 4
%2 = load i32*, i32** %g, align 4
br label %while.body
while.body:
%i.012 = phi i32 [ 0, %while.body.lr.ph ], [ %inc, %if.end4 ]
; Iterations whose index is a multiple of 10 skip the conversion.
%rem = urem i32 %i.012, 10
%tobool = icmp eq i32 %rem, 0
br i1 %tobool, label %if.end4, label %if.then2
if.then2:
%arrayidx = getelementptr inbounds double, double* %1, i32 %i.012
%3 = load double, double* %arrayidx, align 8
; The double -> unsigned i32 conversion under test.
%conv = fptoui double %3 to i32
%arrayidx3 = getelementptr inbounds i32, i32* %2, i32 %i.012
store i32 %conv, i32* %arrayidx3, align 4
br label %if.end4
if.end4:
; Latch: the only loop exit, taken once %inc reaches %n.
%inc = add nuw i32 %i.012, 1
%cmp1 = icmp ult i32 %inc, %n
br i1 %cmp1, label %while.body, label %cleanup.loopexit
cleanup.loopexit:
br label %cleanup
cleanup:
ret void
}
; Loop that only loads and stores double values, with no floating-point
; arithmetic or conversion. The expectations use the common prefix, so both
; the FP and the soft-float runs must produce a hardware loop: iteration
; count select(%n ugt 1, %n, 1) set in while.body.lr.ph, decremented by 1
; through the decrement.reg intrinsic and tested against zero in the latch.
; CHECK-LABEL: load_store_float
; CHECK: while.body.lr.ph:
; CHECK: [[CMP:%[^ ]+]] = icmp ugt i32 %n, 1
; CHECK: [[COUNT:%[^ ]+]] = select i1 [[CMP]], i32 %n, i32 1
; CHECK: call void @llvm.set.loop.iterations.i32(i32 [[COUNT]])
; CHECK-NEXT: br label %while.body
; CHECK: [[REM:%[^ ]+]] = phi i32 [ [[COUNT]], %while.body.lr.ph ], [ [[LOOP_DEC:%[^ ]+]], %if.end4 ]
; CHECK: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[REM]], i32 1)
; CHECK: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
; CHECK: br i1 [[CMP]], label %while.body, label %cleanup.loopexit
define void @load_store_float(i32 %n, double** %d, double** %g) {
entry:
; Unsigned range check: the loop is entered only when 1 <= %n <= 500.
%n.off = add i32 %n, -1
%0 = icmp ult i32 %n.off, 500
br i1 %0, label %while.body.lr.ph, label %cleanup
while.body.lr.ph:
%1 = load double*, double** %d, align 4
%2 = load double*, double** %g, align 4
br label %while.body
while.body:
%i.012 = phi i32 [ 0, %while.body.lr.ph ], [ %inc, %if.end4 ]
; Iterations whose index is a multiple of 10 skip the copy.
%rem = urem i32 %i.012, 10
%tobool = icmp eq i32 %rem, 0
br i1 %tobool, label %if.end4, label %if.then2
if.then2:
; Plain copy of one double from the source array to the destination array.
%arrayidx = getelementptr inbounds double, double* %1, i32 %i.012
%3 = load double, double* %arrayidx, align 8
%arrayidx3 = getelementptr inbounds double, double* %2, i32 %i.012
store double %3, double* %arrayidx3, align 8
br label %if.end4
if.end4:
; Latch: the only loop exit, taken once %inc reaches %n.
%inc = add nuw i32 %i.012, 1
%cmp1 = icmp ult i32 %inc, %n
br i1 %cmp1, label %while.body, label %cleanup.loopexit
cleanup.loopexit:
br label %cleanup
cleanup:
ret void
}
; Loop that conditionally performs a single-precision fadd.
; With FP hardware the loop is expected to become a hardware loop with
; iteration count select(%n ugt 1, %n, 1). With soft-float neither the
; iteration-count setup call (before the branch into the loop) nor a
; loop-decrement call (after it) may appear.
; CHECK-LABEL: fp_add
; CHECK: while.body.lr.ph:
; CHECK-SOFT-NOT: call void @llvm.set.loop.iterations
; CHECK-FP: [[CMP:%[^ ]+]] = icmp ugt i32 %n, 1
; CHECK-FP: [[COUNT:%[^ ]+]] = select i1 [[CMP]], i32 %n, i32 1
; CHECK-FP: call void @llvm.set.loop.iterations.i32(i32 [[COUNT]])
; CHECK: br label %while.body
; CHECK-SOFT-NOT: call i32 @llvm.loop.decrement
; CHECK-FP: [[REM:%[^ ]+]] = phi i32 [ [[COUNT]], %while.body.lr.ph ], [ [[LOOP_DEC:%[^ ]+]], %if.end4 ]
; CHECK-FP: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[REM]], i32 1)
; CHECK-FP: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
; CHECK-FP: br i1 [[CMP]], label %while.body, label %cleanup.loopexit
define void @fp_add(i32 %n, float** %d, float** %g) {
entry:
; Unsigned range check: the loop is entered only when 1 <= %n <= 500.
%n.off = add i32 %n, -1
%0 = icmp ult i32 %n.off, 500
br i1 %0, label %while.body.lr.ph, label %cleanup
while.body.lr.ph:
%1 = load float*, float** %d, align 4
%2 = load float*, float** %g, align 4
br label %while.body
while.body:
%i.012 = phi i32 [ 0, %while.body.lr.ph ], [ %inc, %if.end4 ]
; Iterations whose index is a multiple of 10 skip the addition.
%rem = urem i32 %i.012, 10
%tobool = icmp eq i32 %rem, 0
br i1 %tobool, label %if.end4, label %if.then2
if.then2:
%arrayidx = getelementptr inbounds float, float* %1, i32 %i.012
%3 = load float, float* %arrayidx, align 4
%arrayidx3 = getelementptr inbounds float, float* %2, i32 %i.012
%4 = load float, float* %arrayidx3, align 4
; The floating-point addition under test; the sum is stored back in place.
%add = fadd float %3, %4
store float %add, float* %arrayidx3, align 4
br label %if.end4
if.end4:
; Latch: the only loop exit, taken once %inc reaches %n.
%inc = add nuw i32 %i.012, 1
%cmp1 = icmp ult i32 %inc, %n
br i1 %cmp1, label %while.body, label %cleanup.loopexit
cleanup.loopexit:
br label %cleanup
cleanup:
ret void
}

View File

@ -0,0 +1,155 @@
; Runs the hardware-loops pass over simple counted loops in three
; configurations: with low-overhead loops enabled
; (-disable-arm-loloops=false), with them explicitly disabled
; (-disable-arm-loloops=true), and with the lob subtarget feature removed
; (-mattr=-lob) while leaving the option at its default.
; NOTE(review): "lob" presumably names the low-overhead-branch extension -
; confirm against the ARM subtarget feature list.
; RUN: opt -mtriple=thumbv8.1m.main-arm-none-eabi -hardware-loops -disable-arm-loloops=false %s -S -o - | FileCheck %s
; RUN: opt -mtriple=thumbv8.1m.main-arm-none-eabi -hardware-loops -disable-arm-loloops=true %s -S -o - | FileCheck %s --check-prefix=DISABLED
; RUN: opt -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=-lob -hardware-loops %s -S -o - | FileCheck %s --check-prefix=DISABLED
; In the two disabled configurations neither hardware-loop intrinsic may
; appear anywhere in the output.
; DISABLED-NOT: llvm.set.loop.iterations
; DISABLED-NOT: llvm.loop.decrement
; Global holding a pointer to i32; the loops below that read an array load
; their base pointer from it in the loop preheader.
@g = common local_unnamed_addr global i32* null, align 4
; Do-while style copy loop that counts %n down by one until it reaches zero.
; Expected output: the iteration count is %n itself, carried in a phi fed from
; the entry block, decremented by 1 through the decrement.reg intrinsic and
; compared against zero to form the latch branch.
; CHECK-LABEL: do_copy
; CHECK: call void @llvm.set.loop.iterations.i32(i32 %n)
; CHECK: br label %while.body
; CHECK: [[REM:%[^ ]+]] = phi i32 [ %n, %entry ], [ [[LOOP_DEC:%[^ ]+]], %while.body ]
; CHECK: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[REM]], i32 1)
; CHECK: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
; CHECK: br i1 [[CMP]], label %while.body, label %while.end
define i32 @do_copy(i32 %n, i32* nocapture %p, i32* nocapture readonly %q) {
entry:
; No guard: the body always executes at least once.
br label %while.body
while.body:
%q.addr.05 = phi i32* [ %incdec.ptr, %while.body ], [ %q, %entry ]
%p.addr.04 = phi i32* [ %incdec.ptr1, %while.body ], [ %p, %entry ]
%x.addr.03 = phi i32 [ %dec, %while.body ], [ %n, %entry ]
%dec = add nsw i32 %x.addr.03, -1
; Copy one i32 from the source pointer to the destination pointer and
; advance both pointers by one element.
%incdec.ptr = getelementptr inbounds i32, i32* %q.addr.05, i32 1
%0 = load i32, i32* %q.addr.05, align 4
%incdec.ptr1 = getelementptr inbounds i32, i32* %p.addr.04, i32 1
store i32 %0, i32* %p.addr.04, align 4
; Exit once the down-counter reaches zero.
%tobool = icmp eq i32 %dec, 0
br i1 %tobool, label %while.end, label %while.body
while.end:
ret i32 0
}
; Guarded loop counting an index up from 0 by 1 until it equals %n, summing
; i32 elements read through the pointer stored in @g.
; Expected output: the iteration count is %n, set in while.body.lr.ph right
; before the branch into the loop, then decremented by 1 per iteration
; through the decrement.reg intrinsic.
; CHECK-LABEL: do_inc1
; CHECK: while.body.lr.ph:
; CHECK: call void @llvm.set.loop.iterations.i32(i32 %n)
; CHECK-NEXT: br label %while.body
; CHECK: [[REM:%[^ ]+]] = phi i32 [ %n, %while.body.lr.ph ], [ [[LOOP_DEC:%[^ ]+]], %while.body ]
; CHECK: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[REM]], i32 1)
; CHECK: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
; CHECK: br i1 [[CMP]], label %while.body, label %while.end.loopexit
define i32 @do_inc1(i32 %n) {
entry:
; Skip the loop entirely when %n is zero.
%cmp7 = icmp eq i32 %n, 0
br i1 %cmp7, label %while.end, label %while.body.lr.ph
while.body.lr.ph:
%0 = load i32*, i32** @g, align 4
br label %while.body
while.body:
%i.09 = phi i32 [ 0, %while.body.lr.ph ], [ %inc1, %while.body ]
%res.08 = phi i32 [ 0, %while.body.lr.ph ], [ %add, %while.body ]
%arrayidx = getelementptr inbounds i32, i32* %0, i32 %i.09
%1 = load i32, i32* %arrayidx, align 4
%add = add nsw i32 %1, %res.08
%inc1 = add nuw i32 %i.09, 1
; Single latch exit: leave once the incremented index equals %n.
%exitcond = icmp eq i32 %inc1, %n
br i1 %exitcond, label %while.end.loopexit, label %while.body
while.end.loopexit:
br label %while.end
while.end:
%res.0.lcssa = phi i32 [ 0, %entry ], [ %add, %while.end.loopexit ]
ret i32 %res.0.lcssa
}
; Guarded loop counting an index up from 0 in steps of 2 while the stepped
; index is (signed) less than %n, summing i32 elements read through the
; pointer stored in @g.
; Expected output: the iteration count is computed in while.body.lr.ph as
; ((%n - 1) >> 1) + 1 using a logical shift, passed to the iteration-count
; setup intrinsic and then decremented by 1 per iteration.
; CHECK-LABEL: do_inc2
; CHECK: while.body.lr.ph:
; CHECK: [[ROUND:%[^ ]+]] = add i32 %n, -1
; CHECK: [[HALVE:%[^ ]+]] = lshr i32 [[ROUND]], 1
; CHECK: [[COUNT:%[^ ]+]] = add i32 [[HALVE]], 1
; CHECK: call void @llvm.set.loop.iterations.i32(i32 [[COUNT]])
; CHECK-NEXT: br label %while.body
; CHECK: [[REM:%[^ ]+]] = phi i32 [ [[COUNT]], %while.body.lr.ph ], [ [[LOOP_DEC:%[^ ]+]], %while.body ]
; CHECK: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[REM]], i32 1)
; CHECK: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
; CHECK: br i1 [[CMP]], label %while.body, label %while.end.loopexit
define i32 @do_inc2(i32 %n) {
entry:
; The loop is entered only when %n is strictly positive (signed).
%cmp7 = icmp sgt i32 %n, 0
br i1 %cmp7, label %while.body.lr.ph, label %while.end
while.body.lr.ph:
%0 = load i32*, i32** @g, align 4
br label %while.body
while.body:
%i.09 = phi i32 [ 0, %while.body.lr.ph ], [ %add1, %while.body ]
%res.08 = phi i32 [ 0, %while.body.lr.ph ], [ %add, %while.body ]
%arrayidx = getelementptr inbounds i32, i32* %0, i32 %i.09
%1 = load i32, i32* %arrayidx, align 4
%add = add nsw i32 %1, %res.08
; Step the index by 2 and continue while it is still below %n.
%add1 = add nuw nsw i32 %i.09, 2
%cmp = icmp slt i32 %add1, %n
br i1 %cmp, label %while.body, label %while.end.loopexit
while.end.loopexit:
br label %while.end
while.end:
%res.0.lcssa = phi i32 [ 0, %entry ], [ %add, %while.end.loopexit ]
ret i32 %res.0.lcssa
}
; Guarded loop counting an index down from %n in steps of 2 while the current
; index is (signed) greater than 2, summing i32 elements read through the
; pointer stored in @g.
; Expected output: the iteration count is computed in while.body.lr.ph as
; (((%n + 1) - smin(%n, 2)) >> 1) + 1, where the signed minimum is expressed
; as an icmp slt / select pair and the shift is logical.
; CHECK-LABEL: do_dec2
; CHECK: while.body.lr.ph:
; CHECK: [[ROUND:%[^ ]+]] = add i32 %n, 1
; CHECK: [[CMP:%[^ ]+]] = icmp slt i32 %n, 2
; CHECK: [[SMIN:%[^ ]+]] = select i1 [[CMP]], i32 %n, i32 2
; CHECK: [[SUB:%[^ ]+]] = sub i32 [[ROUND]], [[SMIN]]
; CHECK: [[HALVE:%[^ ]+]] = lshr i32 [[SUB]], 1
; CHECK: [[COUNT:%[^ ]+]] = add i32 [[HALVE]], 1
; CHECK: call void @llvm.set.loop.iterations.i32(i32 [[COUNT]])
; CHECK-NEXT: br label %while.body
; CHECK: [[REM:%[^ ]+]] = phi i32 [ [[COUNT]], %while.body.lr.ph ], [ [[LOOP_DEC:%[^ ]+]], %while.body ]
; CHECK: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[REM]], i32 1)
; CHECK: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
; CHECK: br i1 [[CMP]], label %while.body, label %while.end.loopexit
define i32 @do_dec2(i32 %n) {
entry:
; The loop is entered only when %n is strictly positive (signed).
%cmp6 = icmp sgt i32 %n, 0
br i1 %cmp6, label %while.body.lr.ph, label %while.end
while.body.lr.ph:
%0 = load i32*, i32** @g, align 4
br label %while.body
while.body:
%i.08 = phi i32 [ %n, %while.body.lr.ph ], [ %sub, %while.body ]
%res.07 = phi i32 [ 0, %while.body.lr.ph ], [ %add, %while.body ]
%arrayidx = getelementptr inbounds i32, i32* %0, i32 %i.08
%1 = load i32, i32* %arrayidx, align 4
%add = add nsw i32 %1, %res.07
%sub = add nsw i32 %i.08, -2
; The exit test compares the index from before the decrement against 2.
%cmp = icmp sgt i32 %i.08, 2
br i1 %cmp, label %while.body, label %while.end.loopexit
while.end.loopexit:
br label %while.end
while.end:
%res.0.lcssa = phi i32 [ 0, %entry ], [ %add, %while.end.loopexit ]
ret i32 %res.0.lcssa
}

View File

@ -0,0 +1,72 @@
; Runs the hardware-loops pass with low-overhead loops enabled
; (-disable-arm-loloops=false) over loops whose structure should prevent, or
; only partly allow, conversion into hardware loops.
; RUN: opt -mtriple=thumbv8.1m.main-arm-none-eabi -hardware-loops -disable-arm-loloops=false %s -S -o - | FileCheck %s
; Negative test: the loop has two exits - an early exit from do.body when the
; loaded element is greater than %max, and the latch exit from if.end when the
; incremented index is no longer below %n. Neither hardware-loop intrinsic may
; appear in this function's output.
; CHECK-LABEL: early_exit
; CHECK-NOT: llvm.set.loop.iterations
; CHECK-NOT: llvm.loop.decrement
define i32 @early_exit(i32* nocapture readonly %a, i32 %max, i32 %n) {
entry:
br label %do.body
do.body:
%i.0 = phi i32 [ 0, %entry ], [ %inc, %if.end ]
%arrayidx = getelementptr inbounds i32, i32* %a, i32 %i.0
%0 = load i32, i32* %arrayidx, align 4
; Early exit: leave the loop as soon as an element exceeds %max.
%cmp = icmp sgt i32 %0, %max
br i1 %cmp, label %do.end, label %if.end
if.end:
; Latch exit: leave once the incremented index is no longer below %n.
%inc = add nuw i32 %i.0, 1
%cmp1 = icmp ult i32 %inc, %n
br i1 %cmp1, label %do.body, label %if.end.do.end_crit_edge
if.end.do.end_crit_edge:
; On the latch exit path, the element at the final index is loaded.
%arrayidx2.phi.trans.insert = getelementptr inbounds i32, i32* %a, i32 %inc
%.pre = load i32, i32* %arrayidx2.phi.trans.insert, align 4
br label %do.end
do.end:
%1 = phi i32 [ %.pre, %if.end.do.end_crit_edge ], [ %0, %do.body ]
ret i32 %1
}
; Nested loops: only the inner loop (while.body3.us) is expected to become a
; hardware loop, with %N as its iteration count set in the inner preheader.
; The outer loop must be left alone: no iteration-count setup may precede the
; entry branch, and neither an i1-returning loop-decrement call nor an outer
; latch branch rewritten to target the outer header first may follow the
; inner loop.
; The two trailing negative patterns use an inline regex rather than a
; captured variable: a variable defined inside a negative pattern is never
; bound when the test passes, so a later pattern referring to it could not
; match anything.
; CHECK-LABEL: nested
; CHECK-NOT: call void @llvm.set.loop.iterations.i32(i32 %N)
; CHECK: br i1 %cmp20, label %while.end7, label %while.cond1.preheader.us
; CHECK: call void @llvm.set.loop.iterations.i32(i32 %N)
; CHECK: br label %while.body3.us
; CHECK: [[REM:%[^ ]+]] = phi i32 [ %N, %while.cond1.preheader.us ], [ [[LOOP_DEC:%[^ ]+]], %while.body3.us ]
; CHECK: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[REM]], i32 1)
; CHECK: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
; CHECK: br i1 [[CMP]], label %while.body3.us, label %while.cond1.while.end_crit_edge.us
; CHECK-NOT: call i1 @llvm.loop.decrement.i32(i32 1)
; CHECK-NOT: br i1 {{%[^ ]+}}, label %while.cond1.preheader.us, label %while.end7
define void @nested(i32* nocapture %A, i32 %N) {
entry:
; Skip both loops when %N is zero.
%cmp20 = icmp eq i32 %N, 0
br i1 %cmp20, label %while.end7, label %while.cond1.preheader.us
while.cond1.preheader.us:
; Outer loop header, which also serves as the inner loop's preheader.
%i.021.us = phi i32 [ %inc6.us, %while.cond1.while.end_crit_edge.us ], [ 0, %entry ]
%mul.us = mul i32 %i.021.us, %N
br label %while.body3.us
while.body3.us:
; Inner loop: stores the value i * N + j at A[i * N + j] for j in [0, N).
%j.019.us = phi i32 [ 0, %while.cond1.preheader.us ], [ %inc.us, %while.body3.us ]
%add.us = add i32 %j.019.us, %mul.us
%arrayidx.us = getelementptr inbounds i32, i32* %A, i32 %add.us
store i32 %add.us, i32* %arrayidx.us, align 4
%inc.us = add nuw i32 %j.019.us, 1
%exitcond = icmp eq i32 %inc.us, %N
br i1 %exitcond, label %while.cond1.while.end_crit_edge.us, label %while.body3.us
while.cond1.while.end_crit_edge.us:
; Outer latch: advance the outer index and exit once it equals %N.
%inc6.us = add nuw i32 %i.021.us, 1
%exitcond23 = icmp eq i32 %inc6.us, %N
br i1 %exitcond23, label %while.end7, label %while.cond1.preheader.us
while.end7:
ret void
}