AMDGPU/SI: Move the local memory usage related checking after calling convention checking in PromoteAlloca

Summary:
  Promoting Alloca to Vector and Promoting Alloca to LDS are two independent handling of Alloca and should not affect each other.
As a result, we should not give up promoting to vector if there is not enough LDS. This patch factors out the local memory usage
related checking out and replace it after the calling convention checking.

Reviewer:
  arsenm

Differential Revision:
  http://reviews.llvm.org/D33139

llvm-svn: 303684
This commit is contained in:
Changpeng Fang 2017-05-23 20:25:41 +00:00
parent 3334cc017e
commit 1dbace195d
2 changed files with 136 additions and 99 deletions

View File

@ -97,6 +97,9 @@ private:
Instruction *UseInst,
int OpIdx0, int OpIdx1) const;
/// Check whether we have enough local memory for promotion.
bool hasSufficientLocalMem(const Function &F);
public:
static char ID;
@ -107,7 +110,7 @@ public:
StringRef getPassName() const override { return "AMDGPU Promote Alloca"; }
void handleAlloca(AllocaInst &I);
bool handleAlloca(AllocaInst &I, bool SufficientLDS);
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
@ -147,105 +150,21 @@ bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(F);
if (!ST.isPromoteAllocaEnabled())
return false;
AS = AMDGPU::getAMDGPUAS(*F.getParent());
FunctionType *FTy = F.getFunctionType();
// If the function has any arguments in the local address space, then it's
// possible these arguments require the entire local memory space, so
// we cannot use local memory in the pass.
for (Type *ParamTy : FTy->params()) {
PointerType *PtrTy = dyn_cast<PointerType>(ParamTy);
if (PtrTy && PtrTy->getAddressSpace() == AS.LOCAL_ADDRESS) {
LocalMemLimit = 0;
DEBUG(dbgs() << "Function has local memory argument. Promoting to "
"local memory disabled.\n");
return false;
}
}
LocalMemLimit = ST.getLocalMemorySize();
if (LocalMemLimit == 0)
return false;
const DataLayout &DL = Mod->getDataLayout();
// Check how much local memory is being used by global objects
CurrentLocalMemUsage = 0;
for (GlobalVariable &GV : Mod->globals()) {
if (GV.getType()->getAddressSpace() != AS.LOCAL_ADDRESS)
continue;
for (const User *U : GV.users()) {
const Instruction *Use = dyn_cast<Instruction>(U);
if (!Use)
continue;
if (Use->getParent()->getParent() == &F) {
unsigned Align = GV.getAlignment();
if (Align == 0)
Align = DL.getABITypeAlignment(GV.getValueType());
// FIXME: Try to account for padding here. The padding is currently
// determined from the inverse order of uses in the function. I'm not
// sure if the use list order is in any way connected to this, so the
// total reported size is likely incorrect.
uint64_t AllocSize = DL.getTypeAllocSize(GV.getValueType());
CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Align);
CurrentLocalMemUsage += AllocSize;
break;
}
}
}
unsigned MaxOccupancy = ST.getOccupancyWithLocalMemSize(CurrentLocalMemUsage,
F);
// Restrict local memory usage so that we don't drastically reduce occupancy,
// unless it is already significantly reduced.
// TODO: Have some sort of hint or other heuristics to guess occupancy based
// on other factors..
unsigned OccupancyHint = ST.getWavesPerEU(F).second;
if (OccupancyHint == 0)
OccupancyHint = 7;
// Clamp to max value.
OccupancyHint = std::min(OccupancyHint, ST.getMaxWavesPerEU());
// Check the hint but ignore it if it's obviously wrong from the existing LDS
// usage.
MaxOccupancy = std::min(OccupancyHint, MaxOccupancy);
// Round up to the next tier of usage.
unsigned MaxSizeWithWaveCount
= ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy, F);
// Program is possibly broken by using more local mem than available.
if (CurrentLocalMemUsage > MaxSizeWithWaveCount)
return false;
LocalMemLimit = MaxSizeWithWaveCount;
DEBUG(
dbgs() << F.getName() << " uses " << CurrentLocalMemUsage << " bytes of LDS\n"
<< " Rounding size to " << MaxSizeWithWaveCount
<< " with a maximum occupancy of " << MaxOccupancy << '\n'
<< " and " << (LocalMemLimit - CurrentLocalMemUsage)
<< " available for promotion\n"
);
bool SufficientLDS = hasSufficientLocalMem(F);
bool Changed = false;
BasicBlock &EntryBB = *F.begin();
for (auto I = EntryBB.begin(), E = EntryBB.end(); I != E; ) {
AllocaInst *AI = dyn_cast<AllocaInst>(I);
++I;
if (AI)
handleAlloca(*AI);
Changed |= handleAlloca(*AI, SufficientLDS);
}
return true;
return Changed;
}
std::pair<Value *, Value *>
@ -661,12 +580,105 @@ bool AMDGPUPromoteAlloca::collectUsesWithPtrTypes(
return true;
}
bool AMDGPUPromoteAlloca::hasSufficientLocalMem(const Function &F) {
FunctionType *FTy = F.getFunctionType();
const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(F);
// If the function has any arguments in the local address space, then it's
// possible these arguments require the entire local memory space, so
// we cannot use local memory in the pass.
for (Type *ParamTy : FTy->params()) {
PointerType *PtrTy = dyn_cast<PointerType>(ParamTy);
if (PtrTy && PtrTy->getAddressSpace() == AS.LOCAL_ADDRESS) {
LocalMemLimit = 0;
DEBUG(dbgs() << "Function has local memory argument. Promoting to "
"local memory disabled.\n");
return false;
}
}
LocalMemLimit = ST.getLocalMemorySize();
if (LocalMemLimit == 0)
return false;
const DataLayout &DL = Mod->getDataLayout();
// Check how much local memory is being used by global objects
CurrentLocalMemUsage = 0;
for (GlobalVariable &GV : Mod->globals()) {
if (GV.getType()->getAddressSpace() != AS.LOCAL_ADDRESS)
continue;
for (const User *U : GV.users()) {
const Instruction *Use = dyn_cast<Instruction>(U);
if (!Use)
continue;
if (Use->getParent()->getParent() == &F) {
unsigned Align = GV.getAlignment();
if (Align == 0)
Align = DL.getABITypeAlignment(GV.getValueType());
// FIXME: Try to account for padding here. The padding is currently
// determined from the inverse order of uses in the function. I'm not
// sure if the use list order is in any way connected to this, so the
// total reported size is likely incorrect.
uint64_t AllocSize = DL.getTypeAllocSize(GV.getValueType());
CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Align);
CurrentLocalMemUsage += AllocSize;
break;
}
}
}
unsigned MaxOccupancy = ST.getOccupancyWithLocalMemSize(CurrentLocalMemUsage,
F);
// Restrict local memory usage so that we don't drastically reduce occupancy,
// unless it is already significantly reduced.
// TODO: Have some sort of hint or other heuristics to guess occupancy based
// on other factors..
unsigned OccupancyHint = ST.getWavesPerEU(F).second;
if (OccupancyHint == 0)
OccupancyHint = 7;
// Clamp to max value.
OccupancyHint = std::min(OccupancyHint, ST.getMaxWavesPerEU());
// Check the hint but ignore it if it's obviously wrong from the existing LDS
// usage.
MaxOccupancy = std::min(OccupancyHint, MaxOccupancy);
// Round up to the next tier of usage.
unsigned MaxSizeWithWaveCount
= ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy, F);
// Program is possibly broken by using more local mem than available.
if (CurrentLocalMemUsage > MaxSizeWithWaveCount)
return false;
LocalMemLimit = MaxSizeWithWaveCount;
DEBUG(
dbgs() << F.getName() << " uses " << CurrentLocalMemUsage << " bytes of LDS\n"
<< " Rounding size to " << MaxSizeWithWaveCount
<< " with a maximum occupancy of " << MaxOccupancy << '\n'
<< " and " << (LocalMemLimit - CurrentLocalMemUsage)
<< " available for promotion\n"
);
return true;
}
// FIXME: Should try to pick the most likely to be profitable allocas first.
void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) {
// Array allocations are probably not worth handling, since an allocation of
// the array type is the canonical form.
if (!I.isStaticAlloca() || I.isArrayAllocation())
return;
return false;
IRBuilder<> Builder(&I);
@ -675,10 +687,8 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
DEBUG(dbgs() << "Trying to promote " << I << '\n');
if (tryPromoteAllocaToVector(&I, AS)) {
DEBUG(dbgs() << " alloca is not a candidate for vectorization.\n");
return;
}
if (tryPromoteAllocaToVector(&I, AS))
return true; // Promoted to vector.
const Function &ContainingFunction = *I.getParent()->getParent();
CallingConv::ID CC = ContainingFunction.getCallingConv();
@ -692,9 +702,13 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
break;
default:
DEBUG(dbgs() << " promote alloca to LDS not supported with calling convention.\n");
return;
return false;
}
// Not likely to have sufficient local memory for promotion.
if (!SufficientLDS)
return false;
const AMDGPUSubtarget &ST =
TM->getSubtarget<AMDGPUSubtarget>(ContainingFunction);
unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(ContainingFunction).second;
@ -718,7 +732,7 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
if (NewSize > LocalMemLimit) {
DEBUG(dbgs() << " " << AllocSize
<< " bytes of local memory not available to promote\n");
return;
return false;
}
CurrentLocalMemUsage = NewSize;
@ -727,7 +741,7 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
if (!collectUsesWithPtrTypes(&I, &I, WorkList)) {
DEBUG(dbgs() << " Do not know how to convert all uses\n");
return;
return false;
}
DEBUG(dbgs() << "Promoting alloca to local memory\n");
@ -873,6 +887,7 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
llvm_unreachable("Don't know how to promote alloca intrinsic use.");
}
}
return true;
}
FunctionPass *llvm::createAMDGPUPromoteAlloca() {

View File

@ -138,3 +138,25 @@ entry:
store float %tmp2, float addrspace(1)* %out
ret void
}
; The pointer arguments in local address space should not affect promotion to vector.
; OPT-LABEL: @vector_read_with_local_arg(
; OPT: %0 = extractelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 %index
; OPT: store i32 %0, i32 addrspace(1)* %out, align 4
define amdgpu_kernel void @vector_read_with_local_arg(i32 addrspace(3)* %stopper, i32 addrspace(1)* %out, i32 %index) {
entry:
%tmp = alloca [4 x i32]
%x = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 0
%y = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 1
%z = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 2
%w = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 3
store i32 0, i32* %x
store i32 1, i32* %y
store i32 2, i32* %z
store i32 3, i32* %w
%tmp1 = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 %index
%tmp2 = load i32, i32* %tmp1
store i32 %tmp2, i32 addrspace(1)* %out
ret void
}