AMDGPU: Analyze callee resource usage in AsmPrinter
llvm-svn: 309781
This commit is contained in:
parent
89d3226019
commit
6ed7b9bfc0
|
@ -509,20 +509,154 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
|
|||
}
|
||||
}
|
||||
|
||||
MCPhysReg HighestSGPRReg = AMDGPU::NoRegister;
|
||||
for (MCPhysReg Reg : reverse(AMDGPU::SGPR_32RegClass.getRegisters())) {
|
||||
if (MRI.isPhysRegUsed(Reg)) {
|
||||
HighestSGPRReg = Reg;
|
||||
break;
|
||||
int32_t MaxVGPR = -1;
|
||||
int32_t MaxSGPR = -1;
|
||||
uint32_t CalleeFrameSize = 0;
|
||||
|
||||
for (const MachineBasicBlock &MBB : MF) {
|
||||
for (const MachineInstr &MI : MBB) {
|
||||
// TODO: Check regmasks? Do they occur anywhere except calls?
|
||||
for (const MachineOperand &MO : MI.operands()) {
|
||||
unsigned Width = 0;
|
||||
bool IsSGPR = false;
|
||||
|
||||
if (!MO.isReg())
|
||||
continue;
|
||||
|
||||
unsigned Reg = MO.getReg();
|
||||
switch (Reg) {
|
||||
case AMDGPU::EXEC:
|
||||
case AMDGPU::EXEC_LO:
|
||||
case AMDGPU::EXEC_HI:
|
||||
case AMDGPU::SCC:
|
||||
case AMDGPU::M0:
|
||||
case AMDGPU::SRC_SHARED_BASE:
|
||||
case AMDGPU::SRC_SHARED_LIMIT:
|
||||
case AMDGPU::SRC_PRIVATE_BASE:
|
||||
case AMDGPU::SRC_PRIVATE_LIMIT:
|
||||
continue;
|
||||
|
||||
case AMDGPU::NoRegister:
|
||||
assert(MI.isDebugValue());
|
||||
continue;
|
||||
|
||||
case AMDGPU::VCC:
|
||||
case AMDGPU::VCC_LO:
|
||||
case AMDGPU::VCC_HI:
|
||||
Info.UsesVCC = true;
|
||||
continue;
|
||||
|
||||
case AMDGPU::FLAT_SCR:
|
||||
case AMDGPU::FLAT_SCR_LO:
|
||||
case AMDGPU::FLAT_SCR_HI:
|
||||
continue;
|
||||
|
||||
case AMDGPU::TBA:
|
||||
case AMDGPU::TBA_LO:
|
||||
case AMDGPU::TBA_HI:
|
||||
case AMDGPU::TMA:
|
||||
case AMDGPU::TMA_LO:
|
||||
case AMDGPU::TMA_HI:
|
||||
llvm_unreachable("trap handler registers should not be used");
|
||||
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
if (AMDGPU::SReg_32RegClass.contains(Reg)) {
|
||||
assert(!AMDGPU::TTMP_32RegClass.contains(Reg) &&
|
||||
"trap handler registers should not be used");
|
||||
IsSGPR = true;
|
||||
Width = 1;
|
||||
} else if (AMDGPU::VGPR_32RegClass.contains(Reg)) {
|
||||
IsSGPR = false;
|
||||
Width = 1;
|
||||
} else if (AMDGPU::SReg_64RegClass.contains(Reg)) {
|
||||
assert(!AMDGPU::TTMP_64RegClass.contains(Reg) &&
|
||||
"trap handler registers should not be used");
|
||||
IsSGPR = true;
|
||||
Width = 2;
|
||||
} else if (AMDGPU::VReg_64RegClass.contains(Reg)) {
|
||||
IsSGPR = false;
|
||||
Width = 2;
|
||||
} else if (AMDGPU::VReg_96RegClass.contains(Reg)) {
|
||||
IsSGPR = false;
|
||||
Width = 3;
|
||||
} else if (AMDGPU::SReg_128RegClass.contains(Reg)) {
|
||||
IsSGPR = true;
|
||||
Width = 4;
|
||||
} else if (AMDGPU::VReg_128RegClass.contains(Reg)) {
|
||||
IsSGPR = false;
|
||||
Width = 4;
|
||||
} else if (AMDGPU::SReg_256RegClass.contains(Reg)) {
|
||||
IsSGPR = true;
|
||||
Width = 8;
|
||||
} else if (AMDGPU::VReg_256RegClass.contains(Reg)) {
|
||||
IsSGPR = false;
|
||||
Width = 8;
|
||||
} else if (AMDGPU::SReg_512RegClass.contains(Reg)) {
|
||||
IsSGPR = true;
|
||||
Width = 16;
|
||||
} else if (AMDGPU::VReg_512RegClass.contains(Reg)) {
|
||||
IsSGPR = false;
|
||||
Width = 16;
|
||||
} else {
|
||||
llvm_unreachable("Unknown register class");
|
||||
}
|
||||
unsigned HWReg = TRI.getHWRegIndex(Reg);
|
||||
int MaxUsed = HWReg + Width - 1;
|
||||
if (IsSGPR) {
|
||||
MaxSGPR = MaxUsed > MaxSGPR ? MaxUsed : MaxSGPR;
|
||||
} else {
|
||||
MaxVGPR = MaxUsed > MaxVGPR ? MaxUsed : MaxVGPR;
|
||||
}
|
||||
}
|
||||
|
||||
if (MI.isCall()) {
|
||||
assert(MI.getOpcode() == AMDGPU::SI_CALL);
|
||||
// Pseudo used just to encode the underlying global. Is there a better
|
||||
// way to track this?
|
||||
const Function *Callee = cast<Function>(MI.getOperand(2).getGlobal());
|
||||
if (Callee->isDeclaration()) {
|
||||
// If this is a call to an external function, we can't do much. Make
|
||||
// conservative guesses.
|
||||
|
||||
// 48 SGPRs - vcc, - flat_scr, -xnack
|
||||
int MaxSGPRGuess = 47 - getNumExtraSGPRs(ST, true,
|
||||
ST.hasFlatAddressSpace());
|
||||
MaxSGPR = std::max(MaxSGPR, MaxSGPRGuess);
|
||||
MaxVGPR = std::max(MaxVGPR, 23);
|
||||
|
||||
CalleeFrameSize = std::max(CalleeFrameSize, 16384u);
|
||||
Info.UsesVCC = true;
|
||||
Info.UsesFlatScratch = ST.hasFlatAddressSpace();
|
||||
Info.HasDynamicallySizedStack = true;
|
||||
} else {
|
||||
// We force CodeGen to run in SCC order, so the callee's register
|
||||
// usage etc. should be the cumulative usage of all callees.
|
||||
auto I = CallGraphResourceInfo.find(Callee);
|
||||
assert(I != CallGraphResourceInfo.end() &&
|
||||
"callee should have been handled before caller");
|
||||
|
||||
MaxSGPR = std::max(I->second.NumExplicitSGPR - 1, MaxSGPR);
|
||||
MaxVGPR = std::max(I->second.NumVGPR - 1, MaxVGPR);
|
||||
CalleeFrameSize
|
||||
= std::max(I->second.PrivateSegmentSize, CalleeFrameSize);
|
||||
Info.UsesVCC |= I->second.UsesVCC;
|
||||
Info.UsesFlatScratch |= I->second.UsesFlatScratch;
|
||||
Info.HasDynamicallySizedStack |= I->second.HasDynamicallySizedStack;
|
||||
Info.HasRecursion |= I->second.HasRecursion;
|
||||
}
|
||||
|
||||
if (!Callee->doesNotRecurse())
|
||||
Info.HasRecursion = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// We found the maximum register index. They start at 0, so add one to get the
|
||||
// number of registers.
|
||||
Info.NumVGPR = HighestVGPRReg == AMDGPU::NoRegister ? 0 :
|
||||
TRI.getHWRegIndex(HighestVGPRReg) + 1;
|
||||
Info.NumExplicitSGPR = HighestSGPRReg == AMDGPU::NoRegister ? 0 :
|
||||
TRI.getHWRegIndex(HighestSGPRReg) + 1;
|
||||
Info.NumExplicitSGPR = MaxSGPR + 1;
|
||||
Info.NumVGPR = MaxVGPR + 1;
|
||||
Info.PrivateSegmentSize += CalleeFrameSize;
|
||||
|
||||
return Info;
|
||||
}
|
||||
|
|
|
@ -135,6 +135,11 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
|
|||
// do that with a single pseudo source operation.
|
||||
if (Opcode == AMDGPU::S_SETPC_B64_return)
|
||||
Opcode = AMDGPU::S_SETPC_B64;
|
||||
else if (Opcode == AMDGPU::SI_CALL) {
|
||||
// SI_CALL is just S_SWAPPC_B64 with an additional operand to track the
|
||||
// called function.
|
||||
Opcode = AMDGPU::S_SWAPPC_B64;
|
||||
}
|
||||
|
||||
int MCOpcode = ST.getInstrInfo()->pseudoToMCOpcode(Opcode);
|
||||
if (MCOpcode == -1) {
|
||||
|
|
|
@ -486,7 +486,10 @@ public:
|
|||
class GCNPassConfig final : public AMDGPUPassConfig {
|
||||
public:
|
||||
GCNPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
|
||||
: AMDGPUPassConfig(TM, PM) {}
|
||||
: AMDGPUPassConfig(TM, PM) {
|
||||
// It is necessary to know the register usage of the entire call graph.
|
||||
setRequiresCodeGenSCCOrder(EnableAMDGPUFunctionCalls);
|
||||
}
|
||||
|
||||
GCNTargetMachine &getGCNTargetMachine() const {
|
||||
return getTM<GCNTargetMachine>();
|
||||
|
|
|
@ -2650,14 +2650,27 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
|
|||
.addReg(Info->getStackPtrOffsetReg(), RegState::Implicit);
|
||||
return BB;
|
||||
}
|
||||
case AMDGPU::SI_CALL: {
|
||||
case AMDGPU::SI_CALL_ISEL: {
|
||||
const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
|
||||
const DebugLoc &DL = MI.getDebugLoc();
|
||||
unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
|
||||
|
||||
MachineRegisterInfo &MRI = MF->getRegInfo();
|
||||
unsigned GlobalAddrReg = MI.getOperand(0).getReg();
|
||||
MachineInstr *PCRel = MRI.getVRegDef(GlobalAddrReg);
|
||||
assert(PCRel->getOpcode() == AMDGPU::SI_PC_ADD_REL_OFFSET);
|
||||
|
||||
const GlobalValue *G = PCRel->getOperand(1).getGlobal();
|
||||
|
||||
MachineInstrBuilder MIB =
|
||||
BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_SWAPPC_B64), ReturnAddrReg);
|
||||
for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
|
||||
BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg)
|
||||
.add(MI.getOperand(0))
|
||||
.addGlobalAddress(G);
|
||||
|
||||
for (unsigned I = 1, E = MI.getNumOperands(); I != E; ++I)
|
||||
MIB.add(MI.getOperand(I));
|
||||
|
||||
|
||||
MIB.setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
|
||||
|
||||
MI.eraseFromParent();
|
||||
|
|
|
@ -327,16 +327,28 @@ def SI_RETURN : SPseudoInstSI <
|
|||
let SchedRW = [WriteBranch];
|
||||
}
|
||||
|
||||
// Return for returning function calls.
|
||||
def SI_CALL : SPseudoInstSI <
|
||||
(outs), (ins SSrc_b64:$src0), [(AMDGPUcall i64:$src0)],
|
||||
"; call $src0"> {
|
||||
// Return for returning function calls without output register.
|
||||
//
|
||||
// This version is only needed so we can fill in the output register in
|
||||
// the custom inserter.
|
||||
def SI_CALL_ISEL : SPseudoInstSI <
|
||||
(outs), (ins SSrc_b64:$src0), [(AMDGPUcall i64:$src0)]> {
|
||||
let Size = 4;
|
||||
let isCall = 1;
|
||||
let SchedRW = [WriteBranch];
|
||||
let usesCustomInserter = 1;
|
||||
}
|
||||
|
||||
// Wrapper around s_swappc_b64 with extra $callee parameter to track
|
||||
// the called function after regalloc.
|
||||
def SI_CALL : SPseudoInstSI <
|
||||
(outs SReg_64:$dst), (ins SSrc_b64:$src0, unknown:$callee)> {
|
||||
let Size = 4;
|
||||
let isCall = 1;
|
||||
let SchedRW = [WriteBranch];
|
||||
}
|
||||
|
||||
|
||||
def ADJCALLSTACKUP : SPseudoInstSI<
|
||||
(outs), (ins i32imm:$amt0, i32imm:$amt1),
|
||||
[(callseq_start timm:$amt0, timm:$amt1)],
|
||||
|
|
|
@ -0,0 +1,230 @@
|
|||
; RUN: llc -mtriple=amdgcn-amd-amdhsa -amdgpu-function-calls -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CI %s
|
||||
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -amdgpu-function-calls -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,VI-NOBUG %s
|
||||
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=iceland -amdgpu-function-calls -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,VI-BUG %s
|
||||
|
||||
; Make sure to run on a GPU with the SGPR allocation bug.
|
||||
|
||||
; GCN-LABEL: {{^}}use_vcc:
|
||||
; GCN: ; NumSgprs: 34
|
||||
; GCN: ; NumVgprs: 0
|
||||
define void @use_vcc() #1 {
|
||||
call void asm sideeffect "", "~{vcc}" () #0
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}indirect_use_vcc:
|
||||
; GCN: v_writelane_b32 v32, s33, 0
|
||||
; GCN: v_writelane_b32 v32, s34, 1
|
||||
; GCN: v_writelane_b32 v32, s35, 2
|
||||
; GCN: s_swappc_b64
|
||||
; GCN: v_readlane_b32 s35, v32, 2
|
||||
; GCN: v_readlane_b32 s34, v32, 1
|
||||
; GCN: v_readlane_b32 s33, v32, 0
|
||||
; GCN: ; NumSgprs: 38
|
||||
; GCN: ; NumVgprs: 33
|
||||
define void @indirect_use_vcc() #1 {
|
||||
call void @use_vcc()
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}indirect_2level_use_vcc_kernel:
|
||||
; GCN: is_dynamic_callstack = 0
|
||||
; CI: ; NumSgprs: 40
|
||||
; VI-NOBUG: ; NumSgprs: 42
|
||||
; VI-BUG: ; NumSgprs: 96
|
||||
; GCN: ; NumVgprs: 33
|
||||
define amdgpu_kernel void @indirect_2level_use_vcc_kernel(i32 addrspace(1)* %out) #0 {
|
||||
call void @indirect_use_vcc()
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}use_flat_scratch:
|
||||
; CI: ; NumSgprs: 36
|
||||
; VI: ; NumSgprs: 38
|
||||
; GCN: ; NumVgprs: 0
|
||||
define void @use_flat_scratch() #1 {
|
||||
call void asm sideeffect "", "~{flat_scratch}" () #0
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}indirect_use_flat_scratch:
|
||||
; CI: ; NumSgprs: 40
|
||||
; VI: ; NumSgprs: 42
|
||||
; GCN: ; NumVgprs: 33
|
||||
define void @indirect_use_flat_scratch() #1 {
|
||||
call void @use_flat_scratch()
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}indirect_2level_use_flat_scratch_kernel:
|
||||
; GCN: is_dynamic_callstack = 0
|
||||
; CI: ; NumSgprs: 40
|
||||
; VI-NOBUG: ; NumSgprs: 42
|
||||
; VI-BUG: ; NumSgprs: 96
|
||||
; GCN: ; NumVgprs: 33
|
||||
define amdgpu_kernel void @indirect_2level_use_flat_scratch_kernel(i32 addrspace(1)* %out) #0 {
|
||||
call void @indirect_use_flat_scratch()
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}use_10_vgpr:
|
||||
; GCN: ; NumVgprs: 10
|
||||
define void @use_10_vgpr() #1 {
|
||||
call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4}"() #0
|
||||
call void asm sideeffect "", "~{v5},~{v6},~{v7},~{v8},~{v9}"() #0
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}indirect_use_10_vgpr:
|
||||
; GCN: ; NumVgprs: 33
|
||||
define void @indirect_use_10_vgpr() #0 {
|
||||
call void @use_10_vgpr()
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}indirect_2_level_use_10_vgpr:
|
||||
; GCN: is_dynamic_callstack = 0
|
||||
; GCN: ; NumVgprs: 10
|
||||
define amdgpu_kernel void @indirect_2_level_use_10_vgpr() #0 {
|
||||
call void @indirect_use_10_vgpr()
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}use_40_vgpr:
|
||||
; GCN: ; NumVgprs: 40
|
||||
define void @use_40_vgpr() #1 {
|
||||
call void asm sideeffect "", "~{v39}"() #0
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}indirect_use_40_vgpr:
|
||||
; GCN: ; NumVgprs: 40
|
||||
define void @indirect_use_40_vgpr() #0 {
|
||||
call void @use_40_vgpr()
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}use_80_sgpr:
|
||||
; GCN: ; NumSgprs: 80
|
||||
define void @use_80_sgpr() #1 {
|
||||
call void asm sideeffect "", "~{s79}"() #0
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}indirect_use_80_sgpr:
|
||||
; GCN: ; NumSgprs: 82
|
||||
define void @indirect_use_80_sgpr() #1 {
|
||||
call void @use_80_sgpr()
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}indirect_2_level_use_80_sgpr:
|
||||
; GCN: is_dynamic_callstack = 0
|
||||
; CI: ; NumSgprs: 84
|
||||
; VI-NOBUG: ; NumSgprs: 86
|
||||
; VI-BUG: ; NumSgprs: 96
|
||||
define amdgpu_kernel void @indirect_2_level_use_80_sgpr() #0 {
|
||||
call void @indirect_use_80_sgpr()
|
||||
ret void
|
||||
}
|
||||
|
||||
|
||||
; GCN-LABEL: {{^}}use_stack0:
|
||||
; GCN: ScratchSize: 2052
|
||||
define void @use_stack0() #1 {
|
||||
%alloca = alloca [512 x i32], align 4
|
||||
call void asm sideeffect "; use $0", "v"([512 x i32]* %alloca) #0
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}use_stack1:
|
||||
; GCN: ScratchSize: 404
|
||||
define void @use_stack1() #1 {
|
||||
%alloca = alloca [100 x i32], align 4
|
||||
call void asm sideeffect "; use $0", "v"([100 x i32]* %alloca) #0
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}indirect_use_stack:
|
||||
; GCN: ScratchSize: 2120
|
||||
define void @indirect_use_stack() #1 {
|
||||
%alloca = alloca [16 x i32], align 4
|
||||
call void asm sideeffect "; use $0", "v"([16 x i32]* %alloca) #0
|
||||
call void @use_stack0()
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}indirect_2_level_use_stack:
|
||||
; GCN: is_dynamic_callstack = 0
|
||||
; GCN: ScratchSize: 2120
|
||||
define amdgpu_kernel void @indirect_2_level_use_stack() #0 {
|
||||
call void @indirect_use_stack()
|
||||
ret void
|
||||
}
|
||||
|
||||
|
||||
; Should be maximum of callee usage
|
||||
; GCN-LABEL: {{^}}multi_call_use_use_stack:
|
||||
; GCN: is_dynamic_callstack = 0
|
||||
; GCN: ScratchSize: 2052
|
||||
define amdgpu_kernel void @multi_call_use_use_stack() #0 {
|
||||
call void @use_stack0()
|
||||
call void @use_stack1()
|
||||
ret void
|
||||
}
|
||||
|
||||
|
||||
declare void @external() #0
|
||||
|
||||
; GCN-LABEL: {{^}}usage_external:
|
||||
; GCN: is_dynamic_callstack = 1
|
||||
; NumSgprs: 48
|
||||
; NumVgprs: 24
|
||||
; GCN: ScratchSize: 16384
|
||||
define amdgpu_kernel void @usage_external() #0 {
|
||||
call void @external()
|
||||
ret void
|
||||
}
|
||||
|
||||
declare void @external_recurse() #2
|
||||
|
||||
; GCN-LABEL: {{^}}usage_external_recurse:
|
||||
; GCN: is_dynamic_callstack = 1
|
||||
; NumSgprs: 48
|
||||
; NumVgprs: 24
|
||||
; GCN: ScratchSize: 16384
|
||||
define amdgpu_kernel void @usage_external_recurse() #0 {
|
||||
call void @external_recurse()
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}direct_recursion_use_stack:
|
||||
; GCN: ScratchSize: 2052
|
||||
define void @direct_recursion_use_stack(i32 %val) #2 {
|
||||
%alloca = alloca [512 x i32], align 4
|
||||
call void asm sideeffect "; use $0", "v"([512 x i32]* %alloca) #0
|
||||
%cmp = icmp eq i32 %val, 0
|
||||
br i1 %cmp, label %ret, label %call
|
||||
|
||||
call:
|
||||
%val.sub1 = sub i32 %val, 1
|
||||
call void @direct_recursion_use_stack(i32 %val.sub1)
|
||||
br label %ret
|
||||
|
||||
ret:
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}usage_direct_recursion:
|
||||
; GCN: is_ptr64 = 1
|
||||
; GCN: is_dynamic_callstack = 1
|
||||
; GCN: workitem_private_segment_byte_size = 2052
|
||||
define amdgpu_kernel void @usage_direct_recursion(i32 %n) #0 {
|
||||
call void @direct_recursion_use_stack(i32 %n)
|
||||
ret void
|
||||
}
|
||||
|
||||
|
||||
attributes #0 = { nounwind norecurse }
|
||||
attributes #1 = { nounwind noinline norecurse }
|
||||
attributes #2 = { nounwind noinline }
|
|
@ -3,9 +3,9 @@
|
|||
|
||||
; IR-LABEL: define amdgpu_vs void @promote_alloca_shaders(i32 addrspace(1)* inreg %out, i32 addrspace(1)* inreg %in) #0 {
|
||||
; IR: alloca [5 x i32]
|
||||
; ASM-LABEL: {{^}}promote_alloca_shaders:
|
||||
; ASM: ; LDSByteSize: 0 bytes/workgroup (compile time only)
|
||||
|
||||
; ASM-LABEL: {{^}}promote_alloca_shaders:
|
||||
; ASM: ; ScratchSize: 24
|
||||
define amdgpu_vs void @promote_alloca_shaders(i32 addrspace(1)* inreg %out, i32 addrspace(1)* inreg %in) #0 {
|
||||
entry:
|
||||
%stack = alloca [5 x i32], align 4
|
||||
|
@ -29,7 +29,10 @@ entry:
|
|||
; OPT-LABEL: @promote_to_vector_call_c(
|
||||
; OPT-NOT: alloca
|
||||
; OPT: extractelement <2 x i32> %{{[0-9]+}}, i32 %in
|
||||
|
||||
; ASM-LABEL: {{^}}promote_to_vector_call_c:
|
||||
; ASM-NOT: LDSByteSize
|
||||
; ASM: ; ScratchSize: 0
|
||||
define void @promote_to_vector_call_c(i32 addrspace(1)* %out, i32 %in) #0 {
|
||||
entry:
|
||||
%tmp = alloca [2 x i32]
|
||||
|
@ -47,8 +50,11 @@ entry:
|
|||
|
||||
; OPT-LABEL: @no_promote_to_lds_c(
|
||||
; OPT: alloca
|
||||
|
||||
; ASM-LABEL: {{^}}no_promote_to_lds_c:
|
||||
; ASM-NOT: LDSByteSize
|
||||
define void @no_promote_to_lds(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
|
||||
; ASM: ; ScratchSize: 24
|
||||
define void @no_promote_to_lds_c(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
|
||||
entry:
|
||||
%stack = alloca [5 x i32], align 4
|
||||
%0 = load i32, i32 addrspace(1)* %in, align 4
|
||||
|
|
Loading…
Reference in New Issue