AMDGPU: allow specifying a workgroup size that needs to fit in a compute unit

Summary:
For GL_ARB_compute_shader we need to support workgroup sizes of at least 1024. However, if we want to allow such large workgroup sizes, we may need to use fewer registers per wave, as more waves then have to run on each SIMD.

This patch adds an attribute to specify the maximum workgroup size the compiled program needs to support. It defaults to 256, as that size imposes no wave restrictions.
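As a minimal sketch of how a frontend might request this (the helper below is hypothetical; only the attribute name comes from this patch):

// Hypothetical frontend helper: tag a kernel so the backend budgets
// registers for workgroups of up to Size work items.
#include "llvm/IR/Function.h"
#include <string>

static void setMaxWorkGroupSize(llvm::Function &F, unsigned Size) {
  F.addFnAttr("amdgpu-max-work-group-size", std::to_string(Size));
}
// e.g. setMaxWorkGroupSize(*Kernel, 1024); for a GL_ARB_compute_shader limit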

Reducing the number of available registers is done in the same way registers were already reserved for chips with the SGPR init bug.
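For example, assuming the usual GCN figures of four SIMDs per compute unit and 64-lane waves: a 256-item workgroup needs only one wave per SIMD, so each wave keeps the full register file, whereas a 1024-item workgroup needs 1024 / (4 * 64) = 4 waves per SIMD, leaving each wave at most alignDown(256 / 4, 4) = 64 VGPRs; the SGPR budget is shrunk analogously from the chip's per-SIMD SGPR file.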

Reviewers: mareko, arsenm, tstellarAMD, nhaehnle

Subscribers: FireBurn, kerberizer, llvm-commits, arsenm

Differential Revision: http://reviews.llvm.org/D18340

Patch By: Bas Nieuwenhuizen

llvm-svn: 266337
Commit 79a1fd718c (parent f110f8f9f7)
Tom Stellard, 2016-04-14 16:27:07 +00:00
9 changed files with 213 additions and 62 deletions


@@ -621,6 +621,13 @@ inline uint64_t alignTo(uint64_t Value, uint64_t Align, uint64_t Skew = 0) {
return (Value + Align - 1 - Skew) / Align * Align + Skew;
}
/// Returns the largest uint64_t less than or equal to \p Value and is
/// \p Skew mod \p Align. \p Align must be non-zero
inline uint64_t alignDown(uint64_t Value, uint64_t Align, uint64_t Skew = 0) {
Skew %= Align;
return (Value - Skew) / Align * Align + Skew;
}
/// Returns the offset to the next integer (mod 2**64) that is greater than
/// or equal to \p Value and is a multiple of \p Align. \p Align must be
/// non-zero.
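A quick standalone illustration of the new alignDown helper (a sketch assuming only this header and a hosted toolchain):

#include "llvm/Support/MathExtras.h"
#include <cassert>

int main() {
  // Largest multiple of the alignment that does not exceed the value.
  assert(llvm::alignDown(200, 16) == 192);
  assert(llvm::alignDown(102, 4) == 100);
  // With a skew, the result is the largest value <= 7 that is 1 mod 4.
  assert(llvm::alignDown(7, 4, 1) == 5);
  return 0;
}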


@@ -496,10 +496,12 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
DEBUG(dbgs() << " alloca is not a candidate for vectorization.\n");
// FIXME: This is the maximum work group size. We should try to get
// value from the reqd_work_group_size function attribute if it is
// available.
unsigned WorkGroupSize = 256;
const Function &ContainingFunction = *I.getParent()->getParent();
// FIXME: We should also try to get this value from the reqd_work_group_size
// function attribute if it is available.
unsigned WorkGroupSize = AMDGPU::getMaximumWorkGroupSize(ContainingFunction);
int AllocaSize =
WorkGroupSize * Mod->getDataLayout().getTypeAllocSize(AllocaTy);
@@ -520,7 +522,7 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
Function *F = I.getParent()->getParent();
Type *GVTy = ArrayType::get(I.getAllocatedType(), 256);
Type *GVTy = ArrayType::get(I.getAllocatedType(), WorkGroupSize);
GlobalVariable *GV = new GlobalVariable(
*Mod, GVTy, false, GlobalValue::InternalLinkage,
UndefValue::get(GVTy),
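The effect is easiest to see in the new promote-alloca test below: a [5 x i32] (20-byte) alloca is promoted to an LDS array sized by the attribute, so "amdgpu-max-work-group-size"="63" produces a [63 x [5 x i32]] global (1260 bytes of LDS per workgroup), 256 produces 5120 bytes, and 1600 produces 32000 bytes.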


@@ -48,6 +48,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
PrivateSegmentWaveByteOffsetSystemSGPR(AMDGPU::NoRegister),
PSInputAddr(0),
ReturnsVoid(true),
MaximumWorkGroupSize(0),
LDSWaveSpillSize(0),
PSInputEna(0),
NumUserSGPRs(0),
@@ -123,6 +124,11 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
if (HasStackObjects && ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS &&
ST.isAmdHsaOS())
FlatScratchInit = true;
if (AMDGPU::isCompute(F->getCallingConv()))
MaximumWorkGroupSize = AMDGPU::getMaximumWorkGroupSize(*F);
else
MaximumWorkGroupSize = ST.getWavefrontSize();
}
unsigned SIMachineFunctionInfo::addPrivateSegmentBuffer(
@@ -202,10 +208,5 @@ SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg(
unsigned SIMachineFunctionInfo::getMaximumWorkGroupSize(
const MachineFunction &MF) const {
const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
// FIXME: We should get this information from kernel attributes if it
// is available.
if (AMDGPU::isCompute(MF.getFunction()->getCallingConv()))
return 256;
return ST.getWavefrontSize();
return MaximumWorkGroupSize;
}
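So only functions with a compute calling convention consult the attribute; a graphics shader such as an AMDGPU_PS pixel shader falls back to ST.getWavefrontSize(), i.e. a single 64-lane wave on GCN.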


@@ -60,6 +60,8 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction {
unsigned PSInputAddr;
bool ReturnsVoid;
unsigned MaximumWorkGroupSize;
public:
// FIXME: Make private
unsigned LDSWaveSpillSize;


@@ -23,6 +23,53 @@
using namespace llvm;
static unsigned getMaxWaveCountPerSIMD(const MachineFunction &MF) {
const SIMachineFunctionInfo& MFI = *MF.getInfo<SIMachineFunctionInfo>();
const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
unsigned SIMDPerCU = 4;
unsigned MaxInvocationsPerWave = SIMDPerCU * ST.getWavefrontSize();
return alignTo(MFI.getMaximumWorkGroupSize(MF), MaxInvocationsPerWave) /
MaxInvocationsPerWave;
}
static unsigned getMaxWorkGroupSGPRCount(const MachineFunction &MF) {
const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
unsigned MaxWaveCountPerSIMD = getMaxWaveCountPerSIMD(MF);
unsigned TotalSGPRCountPerSIMD, AddressableSGPRCount, SGPRUsageAlignment;
unsigned ReservedSGPRCount;
if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
TotalSGPRCountPerSIMD = 800;
AddressableSGPRCount = 102;
SGPRUsageAlignment = 16;
ReservedSGPRCount = 6; // VCC, FLAT_SCRATCH, XNACK
} else {
TotalSGPRCountPerSIMD = 512;
AddressableSGPRCount = 104;
SGPRUsageAlignment = 8;
ReservedSGPRCount = 2; // VCC
}
unsigned MaxSGPRCount = (TotalSGPRCountPerSIMD / MaxWaveCountPerSIMD);
MaxSGPRCount = alignDown(MaxSGPRCount, SGPRUsageAlignment);
if (ST.hasSGPRInitBug())
MaxSGPRCount = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG;
return std::min(MaxSGPRCount - ReservedSGPRCount, AddressableSGPRCount);
}
static unsigned getMaxWorkGroupVGPRCount(const MachineFunction &MF) {
unsigned MaxWaveCountPerSIMD = getMaxWaveCountPerSIMD(MF);
unsigned TotalVGPRCountPerSIMD = 256;
unsigned VGPRUsageAlignment = 4;
return alignDown(TotalVGPRCountPerSIMD / MaxWaveCountPerSIMD,
VGPRUsageAlignment);
}
static bool hasPressureSet(const int *PSets, unsigned PSetID) {
for (unsigned i = 0; PSets[i] != -1; ++i) {
if (PSets[i] == (int)PSetID)
@@ -71,38 +118,27 @@ void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved, unsigned Reg) co
unsigned SIRegisterInfo::reservedPrivateSegmentBufferReg(
const MachineFunction &MF) const {
const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
if (ST.hasSGPRInitBug()) {
// Leave space for flat_scr, xnack_mask, vcc, and alignment
unsigned BaseIdx = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 8 - 4;
unsigned BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx));
return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SReg_128RegClass);
}
if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
// 96/97 need to be reserved for flat_scr, 98/99 for xnack_mask, and
// 100/101 for vcc. This is the next sgpr128 down.
return AMDGPU::SGPR92_SGPR93_SGPR94_SGPR95;
}
return AMDGPU::SGPR96_SGPR97_SGPR98_SGPR99;
unsigned BaseIdx = alignDown(getMaxWorkGroupSGPRCount(MF), 4) - 4;
unsigned BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx));
return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SReg_128RegClass);
}
unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg(
const MachineFunction &MF) const {
const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
if (ST.hasSGPRInitBug()) {
unsigned Idx = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 6 - 1;
return AMDGPU::SGPR_32RegClass.getRegister(Idx);
}
unsigned RegCount = getMaxWorkGroupSGPRCount(MF);
unsigned Reg;
if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
// Next register before reservations for flat_scr, xnack_mask, vcc,
// and scratch resource.
return AMDGPU::SGPR91;
// Try to place it in a hole after PrivateSegmentbufferReg.
if (RegCount & 3) {
// We cannot put the segment buffer in (Idx - 4) ... (Idx - 1) due to
// alignment constraints, so we have a hole where we can put the wave offset.
Reg = RegCount - 1;
} else {
// We can put the segment buffer in (Idx - 4) ... (Idx - 1) and put the
// wave offset before it.
Reg = RegCount - 5;
}
return AMDGPU::SGPR95;
return AMDGPU::SGPR_32RegClass.getRegister(Reg);
}
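// Illustrative worked example (not part of the diff): on Tonga/Iceland, which
// have the SGPR init bug, the default 256-item workgroup gives
// getMaxWorkGroupSGPRCount = min(80 - 6, 102) = 74. The scratch resource
// descriptor then starts at alignDown(74, 4) - 4 = 68, i.e.
// SGPR68_SGPR69_SGPR70_SGPR71, and because 74 & 3 != 0 the wave byte offset
// lands in the hole at SGPR73, matching the registers the old hard-coded
// init-bug path chose (80 - 8 - 4 = 68 and 80 - 6 - 1 = 73).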
BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
@@ -124,35 +160,20 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
reserveRegisterTuples(Reserved, AMDGPU::TTMP8_TTMP9);
reserveRegisterTuples(Reserved, AMDGPU::TTMP10_TTMP11);
// Reserve the last 2 registers so we will always have at least 2 more that
// will physically contain VCC.
reserveRegisterTuples(Reserved, AMDGPU::SGPR102_SGPR103);
unsigned MaxWorkGroupSGPRCount = getMaxWorkGroupSGPRCount(MF);
unsigned MaxWorkGroupVGPRCount = getMaxWorkGroupVGPRCount(MF);
const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
// SI/CI have 104 SGPRs. VI has 102. We need to shift down the reservation
// for VCC/XNACK_MASK/FLAT_SCR.
//
// TODO The SGPRs that alias to XNACK_MASK could be used as general purpose
// SGPRs when the XNACK feature is not used. This is currently not done
// because the code that counts SGPRs cannot account for such holes.
reserveRegisterTuples(Reserved, AMDGPU::SGPR96_SGPR97);
reserveRegisterTuples(Reserved, AMDGPU::SGPR98_SGPR99);
reserveRegisterTuples(Reserved, AMDGPU::SGPR100_SGPR101);
unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
unsigned NumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
for (unsigned i = MaxWorkGroupSGPRCount; i < NumSGPRs; ++i) {
unsigned Reg = AMDGPU::SGPR_32RegClass.getRegister(i);
reserveRegisterTuples(Reserved, Reg);
}
// Tonga and Iceland can only allocate a fixed number of SGPRs due
// to a hw bug.
if (ST.hasSGPRInitBug()) {
unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
// Reserve some SGPRs for FLAT_SCRATCH, XNACK_MASK, and VCC (6 SGPRs).
unsigned Limit = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 6;
for (unsigned i = Limit; i < NumSGPRs; ++i) {
unsigned Reg = AMDGPU::SGPR_32RegClass.getRegister(i);
reserveRegisterTuples(Reserved, Reg);
}
for (unsigned i = MaxWorkGroupVGPRCount; i < NumVGPRs; ++i) {
unsigned Reg = AMDGPU::VGPR_32RegClass.getRegister(i);
reserveRegisterTuples(Reserved, Reg);
}
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();


@@ -124,6 +124,10 @@ static unsigned getIntegerAttribute(const Function &F, const char *Name,
return Result;
}
unsigned getMaximumWorkGroupSize(const Function &F) {
return getIntegerAttribute(F, "amdgpu-max-work-group-size", 256);
}
unsigned getInitialPSInputAddr(const Function &F) {
return getIntegerAttribute(F, "InitialPSInputAddr", 0);
}


@@ -45,6 +45,7 @@ bool isGroupSegment(const GlobalValue *GV);
bool isGlobalSegment(const GlobalValue *GV);
bool isReadOnlySegment(const GlobalValue *GV);
unsigned getMaximumWorkGroupSize(const Function &F);
unsigned getInitialPSInputAddr(const Function &F);
bool isShader(CallingConv::ID cc);


@@ -0,0 +1,72 @@
; RUN: opt -S -mtriple=amdgcn-unknown-unknown -amdgpu-promote-alloca < %s | FileCheck %s
; CHECK: @promote_alloca_size_63.stack = internal unnamed_addr addrspace(3) global [63 x [5 x i32]] undef, align 4
define void @promote_alloca_size_63(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
entry:
%stack = alloca [5 x i32], align 4
%0 = load i32, i32 addrspace(1)* %in, align 4
%arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0
store i32 4, i32* %arrayidx1, align 4
%arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
%1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
%arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1
store i32 5, i32* %arrayidx3, align 4
%arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
%2 = load i32, i32* %arrayidx10, align 4
store i32 %2, i32 addrspace(1)* %out, align 4
%arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
%3 = load i32, i32* %arrayidx12
%arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
store i32 %3, i32 addrspace(1)* %arrayidx13
ret void
}
; CHECK: @promote_alloca_size_256.stack = internal unnamed_addr addrspace(3) global [256 x [5 x i32]] undef, align 4
define void @promote_alloca_size_256(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #1 {
entry:
%stack = alloca [5 x i32], align 4
%0 = load i32, i32 addrspace(1)* %in, align 4
%arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0
store i32 4, i32* %arrayidx1, align 4
%arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
%1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
%arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1
store i32 5, i32* %arrayidx3, align 4
%arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
%2 = load i32, i32* %arrayidx10, align 4
store i32 %2, i32 addrspace(1)* %out, align 4
%arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
%3 = load i32, i32* %arrayidx12
%arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
store i32 %3, i32 addrspace(1)* %arrayidx13
ret void
}
; CHECK: @promote_alloca_size_1600.stack = internal unnamed_addr addrspace(3) global [1600 x [5 x i32]] undef, align 4
define void @promote_alloca_size_1600(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #2 {
entry:
%stack = alloca [5 x i32], align 4
%0 = load i32, i32 addrspace(1)* %in, align 4
%arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0
store i32 4, i32* %arrayidx1, align 4
%arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
%1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
%arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1
store i32 5, i32* %arrayidx3, align 4
%arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
%2 = load i32, i32* %arrayidx10, align 4
store i32 %2, i32 addrspace(1)* %out, align 4
%arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
%3 = load i32, i32* %arrayidx12
%arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
store i32 %3, i32 addrspace(1)* %arrayidx13
ret void
}
attributes #0 = { nounwind "amdgpu-max-work-group-size"="63" }
attributes #1 = { nounwind "amdgpu-max-work-group-size"="256" }
attributes #2 = { nounwind "amdgpu-max-work-group-size"="1600" }


@@ -0,0 +1,41 @@
; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck %s
; CHECK: NumVgprs: 63
define void @main([9 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <8 x i32>] addrspace(2)* byval, [16 x <8 x i32>] addrspace(2)* byval, [16 x <4 x i32>] addrspace(2)* byval, <3 x i32> inreg, <3 x i32> inreg, <3 x i32>) #0 {
main_body:
%8 = getelementptr [16 x <4 x i32>], [16 x <4 x i32>] addrspace(2)* %4, i64 0, i64 8
%9 = load <4 x i32>, <4 x i32> addrspace(2)* %8, align 16, !tbaa !0
%10 = extractelement <3 x i32> %7, i32 0
%11 = extractelement <3 x i32> %7, i32 1
%12 = mul i32 %10, %11
%bc = bitcast <3 x i32> %7 to <3 x float>
%13 = extractelement <3 x float> %bc, i32 1
%14 = insertelement <512 x float> undef, float %13, i32 %12
call void @llvm.amdgcn.s.barrier()
%15 = extractelement <3 x i32> %6, i32 0
%16 = extractelement <3 x i32> %7, i32 0
%17 = shl i32 %15, 5
%18 = add i32 %17, %16
%19 = shl i32 %18, 4
%20 = extractelement <3 x i32> %7, i32 1
%21 = shl i32 %20, 2
%22 = sext i32 %21 to i64
%23 = getelementptr i8, i8 addrspace(3)* null, i64 %22
%24 = bitcast i8 addrspace(3)* %23 to i32 addrspace(3)*
%25 = load i32, i32 addrspace(3)* %24, align 4
%26 = extractelement <512 x float> %14, i32 %25
%27 = insertelement <4 x float> undef, float %26, i32 0
call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %27, <4 x i32> %9, i32 0, i32 %19, i1 false, i1 false)
ret void
}
declare void @llvm.amdgcn.s.barrier() #1
declare void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #2
attributes #0 = { "amdgpu-max-work-group-size"="1024" }
attributes #1 = { convergent nounwind }
attributes #2 = { nounwind }
!0 = !{!1, !1, i64 0, i32 1}
!1 = !{!"const", null}