AMDGPU: Analyze callee resource usage in AsmPrinter

llvm-svn: 309781
Matt Arsenault, 2017-08-02 01:31:28 +00:00
commit 6ed7b9bfc0 (parent 89d3226019)
7 changed files with 425 additions and 22 deletions

@@ -509,20 +509,154 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
}
}
MCPhysReg HighestSGPRReg = AMDGPU::NoRegister;
for (MCPhysReg Reg : reverse(AMDGPU::SGPR_32RegClass.getRegisters())) {
if (MRI.isPhysRegUsed(Reg)) {
HighestSGPRReg = Reg;
break;
int32_t MaxVGPR = -1;
int32_t MaxSGPR = -1;
uint32_t CalleeFrameSize = 0;
for (const MachineBasicBlock &MBB : MF) {
for (const MachineInstr &MI : MBB) {
// TODO: Check regmasks? Do they occur anywhere except calls?
for (const MachineOperand &MO : MI.operands()) {
unsigned Width = 0;
bool IsSGPR = false;
if (!MO.isReg())
continue;
unsigned Reg = MO.getReg();
switch (Reg) {
case AMDGPU::EXEC:
case AMDGPU::EXEC_LO:
case AMDGPU::EXEC_HI:
case AMDGPU::SCC:
case AMDGPU::M0:
case AMDGPU::SRC_SHARED_BASE:
case AMDGPU::SRC_SHARED_LIMIT:
case AMDGPU::SRC_PRIVATE_BASE:
case AMDGPU::SRC_PRIVATE_LIMIT:
continue;
case AMDGPU::NoRegister:
assert(MI.isDebugValue());
continue;
case AMDGPU::VCC:
case AMDGPU::VCC_LO:
case AMDGPU::VCC_HI:
Info.UsesVCC = true;
continue;
case AMDGPU::FLAT_SCR:
case AMDGPU::FLAT_SCR_LO:
case AMDGPU::FLAT_SCR_HI:
continue;
case AMDGPU::TBA:
case AMDGPU::TBA_LO:
case AMDGPU::TBA_HI:
case AMDGPU::TMA:
case AMDGPU::TMA_LO:
case AMDGPU::TMA_HI:
llvm_unreachable("trap handler registers should not be used");
default:
break;
}
if (AMDGPU::SReg_32RegClass.contains(Reg)) {
assert(!AMDGPU::TTMP_32RegClass.contains(Reg) &&
"trap handler registers should not be used");
IsSGPR = true;
Width = 1;
} else if (AMDGPU::VGPR_32RegClass.contains(Reg)) {
IsSGPR = false;
Width = 1;
} else if (AMDGPU::SReg_64RegClass.contains(Reg)) {
assert(!AMDGPU::TTMP_64RegClass.contains(Reg) &&
"trap handler registers should not be used");
IsSGPR = true;
Width = 2;
} else if (AMDGPU::VReg_64RegClass.contains(Reg)) {
IsSGPR = false;
Width = 2;
} else if (AMDGPU::VReg_96RegClass.contains(Reg)) {
IsSGPR = false;
Width = 3;
} else if (AMDGPU::SReg_128RegClass.contains(Reg)) {
IsSGPR = true;
Width = 4;
} else if (AMDGPU::VReg_128RegClass.contains(Reg)) {
IsSGPR = false;
Width = 4;
} else if (AMDGPU::SReg_256RegClass.contains(Reg)) {
IsSGPR = true;
Width = 8;
} else if (AMDGPU::VReg_256RegClass.contains(Reg)) {
IsSGPR = false;
Width = 8;
} else if (AMDGPU::SReg_512RegClass.contains(Reg)) {
IsSGPR = true;
Width = 16;
} else if (AMDGPU::VReg_512RegClass.contains(Reg)) {
IsSGPR = false;
Width = 16;
} else {
llvm_unreachable("Unknown register class");
}
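// Note: Width counts 32-bit registers and getHWRegIndex gives the first one,
// so e.g. an operand using s[4:7] yields HWReg 4 and Width 4, touching s7 at
// the high end.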
unsigned HWReg = TRI.getHWRegIndex(Reg);
int MaxUsed = HWReg + Width - 1;
if (IsSGPR) {
MaxSGPR = MaxUsed > MaxSGPR ? MaxUsed : MaxSGPR;
} else {
MaxVGPR = MaxUsed > MaxVGPR ? MaxUsed : MaxVGPR;
}
}
if (MI.isCall()) {
assert(MI.getOpcode() == AMDGPU::SI_CALL);
// Pseudo used just to encode the underlying global. Is there a better
// way to track this?
const Function *Callee = cast<Function>(MI.getOperand(2).getGlobal());
if (Callee->isDeclaration()) {
// If this is a call to an external function, we can't do much. Make
// conservative guesses.
// Conservatively assume 48 SGPRs, less the extra SGPRs for vcc, flat_scr
// and xnack counted by getNumExtraSGPRs; 47 is the highest register index.
int MaxSGPRGuess = 47 - getNumExtraSGPRs(ST, true,
ST.hasFlatAddressSpace());
MaxSGPR = std::max(MaxSGPR, MaxSGPRGuess);
MaxVGPR = std::max(MaxVGPR, 23);
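// Likewise guess a 16 KiB frame for the unknown callee, and flag the stack
// as dynamically sized since the real usage cannot be known here.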
CalleeFrameSize = std::max(CalleeFrameSize, 16384u);
Info.UsesVCC = true;
Info.UsesFlatScratch = ST.hasFlatAddressSpace();
Info.HasDynamicallySizedStack = true;
} else {
// We force CodeGen to run in SCC order, so the callee's register
// usage etc. should be the cumulative usage of all callees.
auto I = CallGraphResourceInfo.find(Callee);
assert(I != CallGraphResourceInfo.end() &&
"callee should have been handled before caller");
MaxSGPR = std::max(I->second.NumExplicitSGPR - 1, MaxSGPR);
MaxVGPR = std::max(I->second.NumVGPR - 1, MaxVGPR);
CalleeFrameSize
= std::max(I->second.PrivateSegmentSize, CalleeFrameSize);
Info.UsesVCC |= I->second.UsesVCC;
Info.UsesFlatScratch |= I->second.UsesFlatScratch;
Info.HasDynamicallySizedStack |= I->second.HasDynamicallySizedStack;
Info.HasRecursion |= I->second.HasRecursion;
}
if (!Callee->doesNotRecurse())
Info.HasRecursion = true;
}
}
}
// We found the maximum register index. They start at 0, so add one to get the
// number of registers.
Info.NumVGPR = HighestVGPRReg == AMDGPU::NoRegister ? 0 :
TRI.getHWRegIndex(HighestVGPRReg) + 1;
Info.NumExplicitSGPR = HighestSGPRReg == AMDGPU::NoRegister ? 0 :
TRI.getHWRegIndex(HighestSGPRReg) + 1;
Info.NumExplicitSGPR = MaxSGPR + 1;
Info.NumVGPR = MaxVGPR + 1;
Info.PrivateSegmentSize += CalleeFrameSize;
return Info;
}
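For intuition on the frame accounting above: sibling calls are never live at
the same time, so a caller needs its own frame plus the maximum over its
callees, not the sum. A minimal sketch of that rule in plain C++ (not code
from this patch; the numbers are ScratchSize values from the new test below,
with indirect_use_stack's own frame inferred as 2120 - 2052 = 68):

#include <algorithm>
#include <cstdint>
#include <initializer_list>

// Caller scratch = caller's own frame + max over direct callees.
uint32_t callerScratch(uint32_t OwnFrame,
                       std::initializer_list<uint32_t> Callees) {
  uint32_t CalleeFrameSize = 0;
  for (uint32_t F : Callees)
    CalleeFrameSize = std::max(CalleeFrameSize, F);
  return OwnFrame + CalleeFrameSize;
}

// multi_call_use_use_stack: callerScratch(0, {2052, 404}) == 2052
// indirect_use_stack:       callerScratch(68, {2052})     == 2120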

@@ -135,6 +135,11 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
// do that with a single pseudo source operation.
if (Opcode == AMDGPU::S_SETPC_B64_return)
Opcode = AMDGPU::S_SETPC_B64;
else if (Opcode == AMDGPU::SI_CALL) {
// SI_CALL is just S_SWAPPC_B64 with an additional operand to track the
// called function.
Opcode = AMDGPU::S_SWAPPC_B64;
}
int MCOpcode = ST.getInstrInfo()->pseudoToMCOpcode(Opcode);
if (MCOpcode == -1) {

@@ -486,7 +486,10 @@ public:
class GCNPassConfig final : public AMDGPUPassConfig {
public:
GCNPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
: AMDGPUPassConfig(TM, PM) {}
: AMDGPUPassConfig(TM, PM) {
// It is necessary to know the register usage of the entire call graph.
setRequiresCodeGenSCCOrder(EnableAMDGPUFunctionCalls);
}
GCNTargetMachine &getGCNTargetMachine() const {
return getTM<GCNTargetMachine>();

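For context: setRequiresCodeGenSCCOrder makes the pass manager walk functions
bottom-up over the call graph, which is what lets analyzeResourceUsage assume
CallGraphResourceInfo is already populated for every non-recursive callee. A
sketch of that visitation order using LLVM's generic SCC iterator (an
illustration only, not code from this patch):

#include "llvm/ADT/SCCIterator.h"
#include "llvm/Analysis/CallGraph.h"

using namespace llvm;

void visitBottomUp(CallGraph &CG) {
  // scc_iterator enumerates strongly connected components in post-order, so
  // callees outside the current SCC have always been visited already.
  for (scc_iterator<CallGraph *> I = scc_begin(&CG); !I.isAtEnd(); ++I) {
    for (CallGraphNode *Node : *I) {
      if (Function *F = Node->getFunction()) {
        // Analyze F here and record its resource usage for its callers.
        (void)F;
      }
    }
  }
}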
@@ -2650,14 +2650,27 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
.addReg(Info->getStackPtrOffsetReg(), RegState::Implicit);
return BB;
}
case AMDGPU::SI_CALL: {
case AMDGPU::SI_CALL_ISEL: {
const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
const DebugLoc &DL = MI.getDebugLoc();
unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
MachineRegisterInfo &MRI = MF->getRegInfo();
unsigned GlobalAddrReg = MI.getOperand(0).getReg();
MachineInstr *PCRel = MRI.getVRegDef(GlobalAddrReg);
assert(PCRel->getOpcode() == AMDGPU::SI_PC_ADD_REL_OFFSET);
const GlobalValue *G = PCRel->getOperand(1).getGlobal();
MachineInstrBuilder MIB =
BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_SWAPPC_B64), ReturnAddrReg);
for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg)
.add(MI.getOperand(0))
.addGlobalAddress(G);
for (unsigned I = 1, E = MI.getNumOperands(); I != E; ++I)
MIB.add(MI.getOperand(I));
MIB.setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
MI.eraseFromParent();

@@ -327,16 +327,28 @@ def SI_RETURN : SPseudoInstSI <
let SchedRW = [WriteBranch];
}
// Call a function.
def SI_CALL : SPseudoInstSI <
(outs), (ins SSrc_b64:$src0), [(AMDGPUcall i64:$src0)],
"; call $src0"> {
// Call pseudo used only during instruction selection; it has no output
// register.
//
// This version is only needed so we can fill in the output register in
// the custom inserter.
def SI_CALL_ISEL : SPseudoInstSI <
(outs), (ins SSrc_b64:$src0), [(AMDGPUcall i64:$src0)]> {
let Size = 4;
let isCall = 1;
let SchedRW = [WriteBranch];
let usesCustomInserter = 1;
}
// Wrapper around s_swappc_b64 with extra $callee parameter to track
// the called function after regalloc.
def SI_CALL : SPseudoInstSI <
(outs SReg_64:$dst), (ins SSrc_b64:$src0, unknown:$callee)> {
let Size = 4;
let isCall = 1;
let SchedRW = [WriteBranch];
}
def ADJCALLSTACKUP : SPseudoInstSI<
(outs), (ins i32imm:$amt0, i32imm:$amt1),
[(callseq_start timm:$amt0, timm:$amt1)],

@@ -0,0 +1,230 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -amdgpu-function-calls -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CI %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -amdgpu-function-calls -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,VI-NOBUG %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=iceland -amdgpu-function-calls -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,VI-BUG %s
; The iceland run exercises a GPU with the SGPR allocation bug.
; GCN-LABEL: {{^}}use_vcc:
; GCN: ; NumSgprs: 34
; GCN: ; NumVgprs: 0
define void @use_vcc() #1 {
call void asm sideeffect "", "~{vcc}" () #0
ret void
}
; GCN-LABEL: {{^}}indirect_use_vcc:
; GCN: v_writelane_b32 v32, s33, 0
; GCN: v_writelane_b32 v32, s34, 1
; GCN: v_writelane_b32 v32, s35, 2
; GCN: s_swappc_b64
; GCN: v_readlane_b32 s35, v32, 2
; GCN: v_readlane_b32 s34, v32, 1
; GCN: v_readlane_b32 s33, v32, 0
; GCN: ; NumSgprs: 38
; GCN: ; NumVgprs: 33
define void @indirect_use_vcc() #1 {
call void @use_vcc()
ret void
}
; GCN-LABEL: {{^}}indirect_2level_use_vcc_kernel:
; GCN: is_dynamic_callstack = 0
; CI: ; NumSgprs: 40
; VI-NOBUG: ; NumSgprs: 42
; VI-BUG: ; NumSgprs: 96
; GCN: ; NumVgprs: 33
define amdgpu_kernel void @indirect_2level_use_vcc_kernel(i32 addrspace(1)* %out) #0 {
call void @indirect_use_vcc()
ret void
}
; GCN-LABEL: {{^}}use_flat_scratch:
; CI: ; NumSgprs: 36
; VI: ; NumSgprs: 38
; GCN: ; NumVgprs: 0
define void @use_flat_scratch() #1 {
call void asm sideeffect "", "~{flat_scratch}" () #0
ret void
}
; GCN-LABEL: {{^}}indirect_use_flat_scratch:
; CI: ; NumSgprs: 40
; VI: ; NumSgprs: 42
; GCN: ; NumVgprs: 33
define void @indirect_use_flat_scratch() #1 {
call void @use_flat_scratch()
ret void
}
; GCN-LABEL: {{^}}indirect_2level_use_flat_scratch_kernel:
; GCN: is_dynamic_callstack = 0
; CI: ; NumSgprs: 40
; VI-NOBUG: ; NumSgprs: 42
; VI-BUG: ; NumSgprs: 96
; GCN: ; NumVgprs: 33
define amdgpu_kernel void @indirect_2level_use_flat_scratch_kernel(i32 addrspace(1)* %out) #0 {
call void @indirect_use_flat_scratch()
ret void
}
; GCN-LABEL: {{^}}use_10_vgpr:
; GCN: ; NumVgprs: 10
define void @use_10_vgpr() #1 {
call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4}"() #0
call void asm sideeffect "", "~{v5},~{v6},~{v7},~{v8},~{v9}"() #0
ret void
}
; GCN-LABEL: {{^}}indirect_use_10_vgpr:
; GCN: ; NumVgprs: 33
define void @indirect_use_10_vgpr() #0 {
call void @use_10_vgpr()
ret void
}
; GCN-LABEL: {{^}}indirect_2_level_use_10_vgpr:
; GCN: is_dynamic_callstack = 0
; GCN: ; NumVgprs: 10
define amdgpu_kernel void @indirect_2_level_use_10_vgpr() #0 {
call void @indirect_use_10_vgpr()
ret void
}
; GCN-LABEL: {{^}}use_40_vgpr:
; GCN: ; NumVgprs: 40
define void @use_40_vgpr() #1 {
call void asm sideeffect "", "~{v39}"() #0
ret void
}
; GCN-LABEL: {{^}}indirect_use_40_vgpr:
; GCN: ; NumVgprs: 40
define void @indirect_use_40_vgpr() #0 {
call void @use_40_vgpr()
ret void
}
; GCN-LABEL: {{^}}use_80_sgpr:
; GCN: ; NumSgprs: 80
define void @use_80_sgpr() #1 {
call void asm sideeffect "", "~{s79}"() #0
ret void
}
; GCN-LABEL: {{^}}indirect_use_80_sgpr:
; GCN: ; NumSgprs: 82
define void @indirect_use_80_sgpr() #1 {
call void @use_80_sgpr()
ret void
}
; GCN-LABEL: {{^}}indirect_2_level_use_80_sgpr:
; GCN: is_dynamic_callstack = 0
; CI: ; NumSgprs: 84
; VI-NOBUG: ; NumSgprs: 86
; VI-BUG: ; NumSgprs: 96
define amdgpu_kernel void @indirect_2_level_use_80_sgpr() #0 {
call void @indirect_use_80_sgpr()
ret void
}
; GCN-LABEL: {{^}}use_stack0:
; GCN: ScratchSize: 2052
define void @use_stack0() #1 {
%alloca = alloca [512 x i32], align 4
call void asm sideeffect "; use $0", "v"([512 x i32]* %alloca) #0
ret void
}
; GCN-LABEL: {{^}}use_stack1:
; GCN: ScratchSize: 404
define void @use_stack1() #1 {
%alloca = alloca [100 x i32], align 4
call void asm sideeffect "; use $0", "v"([100 x i32]* %alloca) #0
ret void
}
; GCN-LABEL: {{^}}indirect_use_stack:
; GCN: ScratchSize: 2120
define void @indirect_use_stack() #1 {
%alloca = alloca [16 x i32], align 4
call void asm sideeffect "; use $0", "v"([16 x i32]* %alloca) #0
call void @use_stack0()
ret void
}
; GCN-LABEL: {{^}}indirect_2_level_use_stack:
; GCN: is_dynamic_callstack = 0
; GCN: ScratchSize: 2120
define amdgpu_kernel void @indirect_2_level_use_stack() #0 {
call void @indirect_use_stack()
ret void
}
; Should be the maximum of the callees' stack usage
; GCN-LABEL: {{^}}multi_call_use_use_stack:
; GCN: is_dynamic_callstack = 0
; GCN: ScratchSize: 2052
define amdgpu_kernel void @multi_call_use_use_stack() #0 {
call void @use_stack0()
call void @use_stack1()
ret void
}
declare void @external() #0
; GCN-LABEL: {{^}}usage_external:
; GCN: is_dynamic_callstack = 1
; NumSgprs: 48
; NumVgprs: 24
; GCN: ScratchSize: 16384
define amdgpu_kernel void @usage_external() #0 {
call void @external()
ret void
}
declare void @external_recurse() #2
; GCN-LABEL: {{^}}usage_external_recurse:
; GCN: is_dynamic_callstack = 1
; NumSgprs: 48
; NumVgprs: 24
; GCN: ScratchSize: 16384
define amdgpu_kernel void @usage_external_recurse() #0 {
call void @external_recurse()
ret void
}
; GCN-LABEL: {{^}}direct_recursion_use_stack:
; GCN: ScratchSize: 2052
define void @direct_recursion_use_stack(i32 %val) #2 {
%alloca = alloca [512 x i32], align 4
call void asm sideeffect "; use $0", "v"([512 x i32]* %alloca) #0
%cmp = icmp eq i32 %val, 0
br i1 %cmp, label %ret, label %call
call:
%val.sub1 = sub i32 %val, 1
call void @direct_recursion_use_stack(i32 %val.sub1)
br label %ret
ret:
ret void
}
; GCN-LABEL: {{^}}usage_direct_recursion:
; GCN: is_ptr64 = 1
; GCN: is_dynamic_callstack = 1
; GCN: workitem_private_segment_byte_size = 2052
define amdgpu_kernel void @usage_direct_recursion(i32 %n) #0 {
call void @direct_recursion_use_stack(i32 %n)
ret void
}
attributes #0 = { nounwind norecurse }
attributes #1 = { nounwind noinline norecurse }
attributes #2 = { nounwind noinline }

@@ -3,9 +3,9 @@
; IR-LABEL: define amdgpu_vs void @promote_alloca_shaders(i32 addrspace(1)* inreg %out, i32 addrspace(1)* inreg %in) #0 {
; IR: alloca [5 x i32]
; ASM-LABEL: {{^}}promote_alloca_shaders:
; ASM: ; LDSByteSize: 0 bytes/workgroup (compile time only)
; ASM-LABEL: {{^}}promote_alloca_shaders:
; ASM: ; ScratchSize: 24
define amdgpu_vs void @promote_alloca_shaders(i32 addrspace(1)* inreg %out, i32 addrspace(1)* inreg %in) #0 {
entry:
%stack = alloca [5 x i32], align 4
@@ -29,7 +29,10 @@ entry:
; OPT-LABEL: @promote_to_vector_call_c(
; OPT-NOT: alloca
; OPT: extractelement <2 x i32> %{{[0-9]+}}, i32 %in
; ASM-LABEL: {{^}}promote_to_vector_call_c:
; ASM-NOT: LDSByteSize
; ASM: ; ScratchSize: 0
define void @promote_to_vector_call_c(i32 addrspace(1)* %out, i32 %in) #0 {
entry:
%tmp = alloca [2 x i32]
@@ -47,8 +47,11 @@ entry:
; OPT-LABEL: @no_promote_to_lds_c(
; OPT: alloca
; ASM-LABEL: {{^}}no_promote_to_lds_c:
; ASM-NOT: LDSByteSize
define void @no_promote_to_lds(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
; ASM: ; ScratchSize: 24
define void @no_promote_to_lds_c(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
entry:
%stack = alloca [5 x i32], align 4
%0 = load i32, i32 addrspace(1)* %in, align 4