From 6ed7b9bfc09b98b0987c6d87bd250620050ff2d5 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Wed, 2 Aug 2017 01:31:28 +0000 Subject: [PATCH] AMDGPU: Analyze callee resource usage in AsmPrinter llvm-svn: 309781 --- llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 156 +++++++++++- llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp | 5 + .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 5 +- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 19 +- llvm/lib/Target/AMDGPU/SIInstructions.td | 20 +- .../AMDGPU/call-graph-register-usage.ll | 230 ++++++++++++++++++ .../AMDGPU/promote-alloca-calling-conv.ll | 12 +- 7 files changed, 425 insertions(+), 22 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index 2247814cfe55..1eab96d5e188 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -509,20 +509,154 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( } } - MCPhysReg HighestSGPRReg = AMDGPU::NoRegister; - for (MCPhysReg Reg : reverse(AMDGPU::SGPR_32RegClass.getRegisters())) { - if (MRI.isPhysRegUsed(Reg)) { - HighestSGPRReg = Reg; - break; + int32_t MaxVGPR = -1; + int32_t MaxSGPR = -1; + uint32_t CalleeFrameSize = 0; + + for (const MachineBasicBlock &MBB : MF) { + for (const MachineInstr &MI : MBB) { + // TODO: Check regmasks? Do they occur anywhere except calls? + for (const MachineOperand &MO : MI.operands()) { + unsigned Width = 0; + bool IsSGPR = false; + + if (!MO.isReg()) + continue; + + unsigned Reg = MO.getReg(); + switch (Reg) { + case AMDGPU::EXEC: + case AMDGPU::EXEC_LO: + case AMDGPU::EXEC_HI: + case AMDGPU::SCC: + case AMDGPU::M0: + case AMDGPU::SRC_SHARED_BASE: + case AMDGPU::SRC_SHARED_LIMIT: + case AMDGPU::SRC_PRIVATE_BASE: + case AMDGPU::SRC_PRIVATE_LIMIT: + continue; + + case AMDGPU::NoRegister: + assert(MI.isDebugValue()); + continue; + + case AMDGPU::VCC: + case AMDGPU::VCC_LO: + case AMDGPU::VCC_HI: + Info.UsesVCC = true; + continue; + + case AMDGPU::FLAT_SCR: + case AMDGPU::FLAT_SCR_LO: + case AMDGPU::FLAT_SCR_HI: + continue; + + case AMDGPU::TBA: + case AMDGPU::TBA_LO: + case AMDGPU::TBA_HI: + case AMDGPU::TMA: + case AMDGPU::TMA_LO: + case AMDGPU::TMA_HI: + llvm_unreachable("trap handler registers should not be used"); + + default: + break; + } + + if (AMDGPU::SReg_32RegClass.contains(Reg)) { + assert(!AMDGPU::TTMP_32RegClass.contains(Reg) && + "trap handler registers should not be used"); + IsSGPR = true; + Width = 1; + } else if (AMDGPU::VGPR_32RegClass.contains(Reg)) { + IsSGPR = false; + Width = 1; + } else if (AMDGPU::SReg_64RegClass.contains(Reg)) { + assert(!AMDGPU::TTMP_64RegClass.contains(Reg) && + "trap handler registers should not be used"); + IsSGPR = true; + Width = 2; + } else if (AMDGPU::VReg_64RegClass.contains(Reg)) { + IsSGPR = false; + Width = 2; + } else if (AMDGPU::VReg_96RegClass.contains(Reg)) { + IsSGPR = false; + Width = 3; + } else if (AMDGPU::SReg_128RegClass.contains(Reg)) { + IsSGPR = true; + Width = 4; + } else if (AMDGPU::VReg_128RegClass.contains(Reg)) { + IsSGPR = false; + Width = 4; + } else if (AMDGPU::SReg_256RegClass.contains(Reg)) { + IsSGPR = true; + Width = 8; + } else if (AMDGPU::VReg_256RegClass.contains(Reg)) { + IsSGPR = false; + Width = 8; + } else if (AMDGPU::SReg_512RegClass.contains(Reg)) { + IsSGPR = true; + Width = 16; + } else if (AMDGPU::VReg_512RegClass.contains(Reg)) { + IsSGPR = false; + Width = 16; + } 
else {
+          llvm_unreachable("Unknown register class");
+        }
+        unsigned HWReg = TRI.getHWRegIndex(Reg);
+        int MaxUsed = HWReg + Width - 1;
+        if (IsSGPR) {
+          MaxSGPR = MaxUsed > MaxSGPR ? MaxUsed : MaxSGPR;
+        } else {
+          MaxVGPR = MaxUsed > MaxVGPR ? MaxUsed : MaxVGPR;
+        }
+      }
+
+      if (MI.isCall()) {
+        assert(MI.getOpcode() == AMDGPU::SI_CALL);
+        // Pseudo used just to encode the underlying global. Is there a better
+        // way to track this?
+        const Function *Callee = cast<Function>(MI.getOperand(2).getGlobal());
+        if (Callee->isDeclaration()) {
+          // If this is a call to an external function, we can't do much. Make
+          // conservative guesses.
+
+          // 48 SGPRs - vcc, - flat_scr, -xnack
+          int MaxSGPRGuess = 47 - getNumExtraSGPRs(ST, true,
+                                                   ST.hasFlatAddressSpace());
+          MaxSGPR = std::max(MaxSGPR, MaxSGPRGuess);
+          MaxVGPR = std::max(MaxVGPR, 23);
+
+          CalleeFrameSize = std::max(CalleeFrameSize, 16384u);
+          Info.UsesVCC = true;
+          Info.UsesFlatScratch = ST.hasFlatAddressSpace();
+          Info.HasDynamicallySizedStack = true;
+        } else {
+          // We force CodeGen to run in SCC order, so the callee's register
+          // usage etc. should be the cumulative usage of all callees.
+          auto I = CallGraphResourceInfo.find(Callee);
+          assert(I != CallGraphResourceInfo.end() &&
+                 "callee should have been handled before caller");
+
+          MaxSGPR = std::max(I->second.NumExplicitSGPR - 1, MaxSGPR);
+          MaxVGPR = std::max(I->second.NumVGPR - 1, MaxVGPR);
+          CalleeFrameSize
+            = std::max(I->second.PrivateSegmentSize, CalleeFrameSize);
+          Info.UsesVCC |= I->second.UsesVCC;
+          Info.UsesFlatScratch |= I->second.UsesFlatScratch;
+          Info.HasDynamicallySizedStack |= I->second.HasDynamicallySizedStack;
+          Info.HasRecursion |= I->second.HasRecursion;
+        }
+
+        if (!Callee->doesNotRecurse())
+          Info.HasRecursion = true;
+      }
     }
   }
 
-  // We found the maximum register index. They start at 0, so add one to get the
-  // number of registers.
-  Info.NumVGPR = HighestVGPRReg == AMDGPU::NoRegister ? 0 :
-    TRI.getHWRegIndex(HighestVGPRReg) + 1;
-  Info.NumExplicitSGPR = HighestSGPRReg == AMDGPU::NoRegister ? 0 :
-    TRI.getHWRegIndex(HighestSGPRReg) + 1;
+  Info.NumExplicitSGPR = MaxSGPR + 1;
+  Info.NumVGPR = MaxVGPR + 1;
+  Info.PrivateSegmentSize += CalleeFrameSize;
 
   return Info;
 }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
index c665bc38f4b2..ba52c3ae1a42 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
@@ -135,6 +135,11 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
   // do that with a single pseudo source operation.
   if (Opcode == AMDGPU::S_SETPC_B64_return)
     Opcode = AMDGPU::S_SETPC_B64;
+  else if (Opcode == AMDGPU::SI_CALL) {
+    // SI_CALL is just S_SWAPPC_B64 with an additional operand to track the
+    // called function.
+    Opcode = AMDGPU::S_SWAPPC_B64;
+  }
 
   int MCOpcode = ST.getInstrInfo()->pseudoToMCOpcode(Opcode);
   if (MCOpcode == -1) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 714aebbafaeb..854000d1c413 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -486,7 +486,10 @@ public:
 class GCNPassConfig final : public AMDGPUPassConfig {
 public:
   GCNPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
-    : AMDGPUPassConfig(TM, PM) {}
+    : AMDGPUPassConfig(TM, PM) {
+    // It is necessary to know the register usage of the entire call graph.
+    setRequiresCodeGenSCCOrder(EnableAMDGPUFunctionCalls);
+  }
 
   GCNTargetMachine &getGCNTargetMachine() const {
     return getTM<GCNTargetMachine>();
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 1d88b5d78d74..47a5aa4b0cea 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2650,14 +2650,27 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
       .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit);
     return BB;
   }
-  case AMDGPU::SI_CALL: {
+  case AMDGPU::SI_CALL_ISEL: {
     const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
     const DebugLoc &DL = MI.getDebugLoc();
     unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
+
+    MachineRegisterInfo &MRI = MF->getRegInfo();
+    unsigned GlobalAddrReg = MI.getOperand(0).getReg();
+    MachineInstr *PCRel = MRI.getVRegDef(GlobalAddrReg);
+    assert(PCRel->getOpcode() == AMDGPU::SI_PC_ADD_REL_OFFSET);
+
+    const GlobalValue *G = PCRel->getOperand(1).getGlobal();
+
     MachineInstrBuilder MIB =
-      BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_SWAPPC_B64), ReturnAddrReg);
-    for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
+      BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg)
+      .add(MI.getOperand(0))
+      .addGlobalAddress(G);
+
+    for (unsigned I = 1, E = MI.getNumOperands(); I != E; ++I)
       MIB.add(MI.getOperand(I));
+
+    MIB.setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
     MI.eraseFromParent();
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index c8b208e69a51..50e806188a93 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -327,16 +327,28 @@ def SI_RETURN : SPseudoInstSI <
   let SchedRW = [WriteBranch];
 }
 
-// Return for returning function calls.
-def SI_CALL : SPseudoInstSI <
-  (outs), (ins SSrc_b64:$src0), [(AMDGPUcall i64:$src0)],
-  "; call $src0"> {
+// Return for returning function calls without output register.
+//
+// This version is only needed so we can fill in the output register in
+// the custom inserter.
+def SI_CALL_ISEL : SPseudoInstSI <
+  (outs), (ins SSrc_b64:$src0), [(AMDGPUcall i64:$src0)]> {
   let Size = 4;
   let isCall = 1;
   let SchedRW = [WriteBranch];
   let usesCustomInserter = 1;
 }
 
+// Wrapper around s_swappc_b64 with extra $callee parameter to track
+// the called function after regalloc.
+def SI_CALL : SPseudoInstSI <
+  (outs SReg_64:$dst), (ins SSrc_b64:$src0, unknown:$callee)> {
+  let Size = 4;
+  let isCall = 1;
+  let SchedRW = [WriteBranch];
+}
+
+
 def ADJCALLSTACKUP : SPseudoInstSI<
   (outs), (ins i32imm:$amt0, i32imm:$amt1),
   [(callseq_start timm:$amt0, timm:$amt1)],
diff --git a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
new file mode 100644
index 000000000000..60616639ea80
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
@@ -0,0 +1,230 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -amdgpu-function-calls -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CI %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -amdgpu-function-calls -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,VI-NOBUG %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=iceland -amdgpu-function-calls -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,VI-BUG %s
+
+; Make sure to run on a GPU with the SGPR allocation bug.
+
+; GCN-LABEL: {{^}}use_vcc:
+; GCN: ; NumSgprs: 34
+; GCN: ; NumVgprs: 0
+define void @use_vcc() #1 {
+  call void asm sideeffect "", "~{vcc}" () #0
+  ret void
+}
+
+; GCN-LABEL: {{^}}indirect_use_vcc:
+; GCN: v_writelane_b32 v32, s33, 0
+; GCN: v_writelane_b32 v32, s34, 1
+; GCN: v_writelane_b32 v32, s35, 2
+; GCN: s_swappc_b64
+; GCN: v_readlane_b32 s35, v32, 2
+; GCN: v_readlane_b32 s34, v32, 1
+; GCN: v_readlane_b32 s33, v32, 0
+; GCN: ; NumSgprs: 38
+; GCN: ; NumVgprs: 33
+define void @indirect_use_vcc() #1 {
+  call void @use_vcc()
+  ret void
+}
+
+; GCN-LABEL: {{^}}indirect_2level_use_vcc_kernel:
+; GCN: is_dynamic_callstack = 0
+; CI: ; NumSgprs: 40
+; VI-NOBUG: ; NumSgprs: 42
+; VI-BUG: ; NumSgprs: 96
+; GCN: ; NumVgprs: 33
+define amdgpu_kernel void @indirect_2level_use_vcc_kernel(i32 addrspace(1)* %out) #0 {
+  call void @indirect_use_vcc()
+  ret void
+}
+
+; GCN-LABEL: {{^}}use_flat_scratch:
+; CI: ; NumSgprs: 36
+; VI: ; NumSgprs: 38
+; GCN: ; NumVgprs: 0
+define void @use_flat_scratch() #1 {
+  call void asm sideeffect "", "~{flat_scratch}" () #0
+  ret void
+}
+
+; GCN-LABEL: {{^}}indirect_use_flat_scratch:
+; CI: ; NumSgprs: 40
+; VI: ; NumSgprs: 42
+; GCN: ; NumVgprs: 33
+define void @indirect_use_flat_scratch() #1 {
+  call void @use_flat_scratch()
+  ret void
+}
+
+; GCN-LABEL: {{^}}indirect_2level_use_flat_scratch_kernel:
+; GCN: is_dynamic_callstack = 0
+; CI: ; NumSgprs: 40
+; VI-NOBUG: ; NumSgprs: 42
+; VI-BUG: ; NumSgprs: 96
+; GCN: ; NumVgprs: 33
+define amdgpu_kernel void @indirect_2level_use_flat_scratch_kernel(i32 addrspace(1)* %out) #0 {
+  call void @indirect_use_flat_scratch()
+  ret void
+}
+
+; GCN-LABEL: {{^}}use_10_vgpr:
+; GCN: ; NumVgprs: 10
+define void @use_10_vgpr() #1 {
+  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4}"() #0
+  call void asm sideeffect "", "~{v5},~{v6},~{v7},~{v8},~{v9}"() #0
+  ret void
+}
+
+; GCN-LABEL: {{^}}indirect_use_10_vgpr:
+; GCN: ; NumVgprs: 33
+define void @indirect_use_10_vgpr() #0 {
+  call void @use_10_vgpr()
+  ret void
+}
+
+; GCN-LABEL: {{^}}indirect_2_level_use_10_vgpr:
+; GCN: is_dynamic_callstack = 0
+; GCN: ; NumVgprs: 33
+define amdgpu_kernel void @indirect_2_level_use_10_vgpr() #0 {
+  call void @indirect_use_10_vgpr()
+  ret void
+}
+
+; GCN-LABEL: {{^}}use_40_vgpr:
+; GCN: ; NumVgprs: 40
+define void @use_40_vgpr() #1 {
+  call void asm sideeffect "", "~{v39}"() #0
+  ret void
+}
+
+; GCN-LABEL: {{^}}indirect_use_40_vgpr:
+; GCN: ; NumVgprs: 40
+define void @indirect_use_40_vgpr() #0 {
+  call void @use_40_vgpr()
+  ret void
+}
+
+; GCN-LABEL: {{^}}use_80_sgpr:
+; GCN: ; NumSgprs: 80
+define void @use_80_sgpr() #1 {
+  call void asm sideeffect "", "~{s79}"() #0
+  ret void
+}
+
+; GCN-LABEL: {{^}}indirect_use_80_sgpr:
+; GCN: ; NumSgprs: 82
+define void @indirect_use_80_sgpr() #1 {
+  call void @use_80_sgpr()
+  ret void
+}
+
+; GCN-LABEL: {{^}}indirect_2_level_use_80_sgpr:
+; GCN: is_dynamic_callstack = 0
+; CI: ; NumSgprs: 84
+; VI-NOBUG: ; NumSgprs: 86
+; VI-BUG: ; NumSgprs: 96
+define amdgpu_kernel void @indirect_2_level_use_80_sgpr() #0 {
+  call void @indirect_use_80_sgpr()
+  ret void
+}
+
+
+; GCN-LABEL: {{^}}use_stack0:
+; GCN: ScratchSize: 2052
+define void @use_stack0() #1 {
+  %alloca = alloca [512 x i32], align 4
+  call void asm sideeffect "; use $0", "v"([512 x i32]* %alloca) #0
+  ret void
+}
+
+; GCN-LABEL: {{^}}use_stack1:
+; GCN: ScratchSize: 404
+define void @use_stack1() #1 {
+  %alloca = alloca [100 x i32], align 4
+  call void asm sideeffect "; use $0", "v"([100 x i32]* %alloca) #0
+  ret void
+}
+
+; 
GCN-LABEL: {{^}}indirect_use_stack: +; GCN: ScratchSize: 2120 +define void @indirect_use_stack() #1 { + %alloca = alloca [16 x i32], align 4 + call void asm sideeffect "; use $0", "v"([16 x i32]* %alloca) #0 + call void @use_stack0() + ret void +} + +; GCN-LABEL: {{^}}indirect_2_level_use_stack: +; GCN: is_dynamic_callstack = 0 +; GCN: ScratchSize: 2120 +define amdgpu_kernel void @indirect_2_level_use_stack() #0 { + call void @indirect_use_stack() + ret void +} + + +; Should be maximum of callee usage +; GCN-LABEL: {{^}}multi_call_use_use_stack: +; GCN: is_dynamic_callstack = 0 +; GCN: ScratchSize: 2052 +define amdgpu_kernel void @multi_call_use_use_stack() #0 { + call void @use_stack0() + call void @use_stack1() + ret void +} + + +declare void @external() #0 + +; GCN-LABEL: {{^}}usage_external: +; GCN: is_dynamic_callstack = 1 +; NumSgprs: 48 +; NumVgprs: 24 +; GCN: ScratchSize: 16384 +define amdgpu_kernel void @usage_external() #0 { + call void @external() + ret void +} + +declare void @external_recurse() #2 + +; GCN-LABEL: {{^}}usage_external_recurse: +; GCN: is_dynamic_callstack = 1 +; NumSgprs: 48 +; NumVgprs: 24 +; GCN: ScratchSize: 16384 +define amdgpu_kernel void @usage_external_recurse() #0 { + call void @external_recurse() + ret void +} + +; GCN-LABEL: {{^}}direct_recursion_use_stack: +; GCN: ScratchSize: 2052 +define void @direct_recursion_use_stack(i32 %val) #2 { + %alloca = alloca [512 x i32], align 4 + call void asm sideeffect "; use $0", "v"([512 x i32]* %alloca) #0 + %cmp = icmp eq i32 %val, 0 + br i1 %cmp, label %ret, label %call + +call: + %val.sub1 = sub i32 %val, 1 + call void @direct_recursion_use_stack(i32 %val.sub1) + br label %ret + +ret: + ret void +} + +; GCN-LABEL: {{^}}usage_direct_recursion: +; GCN: is_ptr64 = 1 +; GCN: is_dynamic_callstack = 1 +; GCN: workitem_private_segment_byte_size = 2052 +define amdgpu_kernel void @usage_direct_recursion(i32 %n) #0 { + call void @direct_recursion_use_stack(i32 %n) + ret void +} + + +attributes #0 = { nounwind norecurse } +attributes #1 = { nounwind noinline norecurse } +attributes #2 = { nounwind noinline } diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-calling-conv.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-calling-conv.ll index a95e9f828b61..f5a8b65998cf 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-alloca-calling-conv.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-calling-conv.ll @@ -3,9 +3,9 @@ ; IR-LABEL: define amdgpu_vs void @promote_alloca_shaders(i32 addrspace(1)* inreg %out, i32 addrspace(1)* inreg %in) #0 { ; IR: alloca [5 x i32] -; ASM-LABEL: {{^}}promote_alloca_shaders: -; ASM: ; LDSByteSize: 0 bytes/workgroup (compile time only) +; ASM-LABEL: {{^}}promote_alloca_shaders: +; ASM: ; ScratchSize: 24 define amdgpu_vs void @promote_alloca_shaders(i32 addrspace(1)* inreg %out, i32 addrspace(1)* inreg %in) #0 { entry: %stack = alloca [5 x i32], align 4 @@ -29,7 +29,10 @@ entry: ; OPT-LABEL: @promote_to_vector_call_c( ; OPT-NOT: alloca ; OPT: extractelement <2 x i32> %{{[0-9]+}}, i32 %in + +; ASM-LABEL: {{^}}promote_to_vector_call_c: ; ASM-NOT: LDSByteSize +; ASM: ; ScratchSize: 0 define void @promote_to_vector_call_c(i32 addrspace(1)* %out, i32 %in) #0 { entry: %tmp = alloca [2 x i32] @@ -47,8 +50,11 @@ entry: ; OPT-LABEL: @no_promote_to_lds_c( ; OPT: alloca + +; ASM-LABEL: {{^}}no_promote_to_lds_c: ; ASM-NOT: LDSByteSize -define void @no_promote_to_lds(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 { +; ASM: ; ScratchSize: 24 +define void @no_promote_to_lds_c(i32 addrspace(1)* 
nocapture %out, i32 addrspace(1)* nocapture %in) #0 { entry: %stack = alloca [5 x i32], align 4 %0 = load i32, i32 addrspace(1)* %in, align 4
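
The combining rules the new test pins down can be summarized apart from the LLVM sources. Below is a minimal, self-contained C++ sketch of how analyzeResourceUsage folds callee results into a caller once functions are emitted callee-first: register counts take the maximum across callees, boolean flags OR together, and the largest callee frame is added to the caller's own frame exactly once, so sibling calls share scratch rather than stacking it. The names here (ResourceInfo, analyze) are illustrative stand-ins, not the patch's actual API; the frame sizes come from call-graph-register-usage.ll above.

#include <algorithm>
#include <cstdint>
#include <initializer_list>
#include <iostream>

// Illustrative stand-in for SIFunctionResourceInfo; only the fields the
// combining rules touch.
struct ResourceInfo {
  int32_t NumExplicitSGPR = 0;
  int32_t NumVGPR = 0;
  uint32_t PrivateSegmentSize = 0; // own frame plus largest callee frame
  bool UsesVCC = false;
  bool HasDynamicallySizedStack = false;
};

// Fold already-final callee infos into a function with the given own-frame
// size and register counts, mirroring the MI.isCall() handling in the patch.
ResourceInfo analyze(uint32_t OwnFrameBytes, int32_t OwnSGPR, int32_t OwnVGPR,
                     std::initializer_list<ResourceInfo> Callees) {
  ResourceInfo Info;
  Info.NumExplicitSGPR = OwnSGPR;
  Info.NumVGPR = OwnVGPR;
  Info.PrivateSegmentSize = OwnFrameBytes;

  uint32_t CalleeFrameSize = 0;
  for (const ResourceInfo &C : Callees) {
    // Register counts are maxima, not sums: the callee reuses the caller's
    // register file.
    Info.NumExplicitSGPR = std::max(Info.NumExplicitSGPR, C.NumExplicitSGPR);
    Info.NumVGPR = std::max(Info.NumVGPR, C.NumVGPR);
    // Sibling calls are never live at the same time, so their frames share
    // one scratch region.
    CalleeFrameSize = std::max(CalleeFrameSize, C.PrivateSegmentSize);
    Info.UsesVCC |= C.UsesVCC;
    Info.HasDynamicallySizedStack |= C.HasDynamicallySizedStack;
  }
  // Corresponds to Info.PrivateSegmentSize += CalleeFrameSize after the walk.
  Info.PrivateSegmentSize += CalleeFrameSize;
  return Info;
}

int main() {
  // Frame sizes from the test: use_stack0 needs 2052 bytes of scratch,
  // use_stack1 needs 404.
  ResourceInfo UseStack0 = analyze(2052, 0, 0, {});
  ResourceInfo UseStack1 = analyze(404, 0, 0, {});

  // multi_call_use_use_stack calls both: max(2052, 404) = 2052, not the sum.
  ResourceInfo MultiCall = analyze(0, 0, 0, {UseStack0, UseStack1});
  std::cout << MultiCall.PrivateSegmentSize << '\n'; // 2052

  // indirect_use_stack adds its own 68-byte frame on top of use_stack0's
  // 2052 bytes, giving the ScratchSize of 2120 the test expects.
  ResourceInfo Indirect = analyze(68, 0, 0, {UseStack0});
  std::cout << Indirect.PrivateSegmentSize << '\n'; // 2120
  return 0;
}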