AMDGPU: Fix clobbering CSR VGPRs when spilling SGPR to it
llvm-svn: 309783
This commit is contained in:
parent
1d6317c3ad
commit
8e8f8f43b0
|
@ -454,6 +454,15 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
|
|||
.addImm(NumBytes * ST.getWavefrontSize())
|
||||
.setMIFlag(MachineInstr::FrameSetup);
|
||||
}
|
||||
|
||||
for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg
|
||||
: FuncInfo->getSGPRSpillVGPRs()) {
|
||||
if (!Reg.FI.hasValue())
|
||||
continue;
|
||||
TII->storeRegToStackSlot(MBB, MBBI, Reg.VGPR, true,
|
||||
Reg.FI.getValue(), &AMDGPU::VGPR_32RegClass,
|
||||
&TII->getRegisterInfo());
|
||||
}
|
||||
}
|
||||
|
||||
void SIFrameLowering::emitEpilogue(MachineFunction &MF,
|
||||
|
@ -462,6 +471,19 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
|
|||
if (FuncInfo->isEntryFunction())
|
||||
return;
|
||||
|
||||
const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
|
||||
const SIInstrInfo *TII = ST.getInstrInfo();
|
||||
MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
|
||||
|
||||
for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg
|
||||
: FuncInfo->getSGPRSpillVGPRs()) {
|
||||
if (!Reg.FI.hasValue())
|
||||
continue;
|
||||
TII->loadRegFromStackSlot(MBB, MBBI, Reg.VGPR,
|
||||
Reg.FI.getValue(), &AMDGPU::VGPR_32RegClass,
|
||||
&TII->getRegisterInfo());
|
||||
}
|
||||
|
||||
unsigned StackPtrReg = FuncInfo->getStackPtrOffsetReg();
|
||||
if (StackPtrReg == AMDGPU::NoRegister)
|
||||
return;
|
||||
|
@ -469,9 +491,6 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
|
|||
const MachineFrameInfo &MFI = MF.getFrameInfo();
|
||||
uint32_t NumBytes = MFI.getStackSize();
|
||||
|
||||
const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
|
||||
const SIInstrInfo *TII = ST.getInstrInfo();
|
||||
MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
|
||||
DebugLoc DL;
|
||||
|
||||
// FIXME: Clarify distinction between no set SP and SP. For callee functions,
|
||||
|
|
|
@ -237,6 +237,15 @@ unsigned SIMachineFunctionInfo::addImplicitBufferPtr(const SIRegisterInfo &TRI)
|
|||
return ImplicitBufferPtrUserSGPR;
|
||||
}
|
||||
|
||||
static bool isCalleeSavedReg(const MCPhysReg *CSRegs, MCPhysReg Reg) {
|
||||
for (unsigned I = 0; CSRegs[I]; ++I) {
|
||||
if (CSRegs[I] == Reg)
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
/// Reserve a slice of a VGPR to support spilling for FrameIndex \p FI.
|
||||
bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF,
|
||||
int FI) {
|
||||
|
@ -258,6 +267,8 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF,
|
|||
|
||||
int NumLanes = Size / 4;
|
||||
|
||||
const MCPhysReg *CSRegs = TRI->getCalleeSavedRegs(&MF);
|
||||
|
||||
// Make sure to handle the case where a wide SGPR spill may span between two
|
||||
// VGPRs.
|
||||
for (int I = 0; I < NumLanes; ++I, ++NumVGPRSpillLanes) {
|
||||
|
@ -274,14 +285,21 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF,
|
|||
return false;
|
||||
}
|
||||
|
||||
SpillVGPRs.push_back(LaneVGPR);
|
||||
Optional<int> CSRSpillFI;
|
||||
if (FrameInfo.hasCalls() && CSRegs && isCalleeSavedReg(CSRegs, LaneVGPR)) {
|
||||
// TODO: Should this be a CreateSpillStackObject? This is technically a
|
||||
// weird CSR spill.
|
||||
CSRSpillFI = FrameInfo.CreateStackObject(4, 4, false);
|
||||
}
|
||||
|
||||
SpillVGPRs.push_back(SGPRSpillVGPRCSR(LaneVGPR, CSRSpillFI));
|
||||
|
||||
// Add this register as live-in to all blocks to avoid machine verifier
|
||||
// complaining about use of an undefined physical register.
|
||||
for (MachineBasicBlock &BB : MF)
|
||||
BB.addLiveIn(LaneVGPR);
|
||||
} else {
|
||||
LaneVGPR = SpillVGPRs.back();
|
||||
LaneVGPR = SpillVGPRs.back().VGPR;
|
||||
}
|
||||
|
||||
SpillLanes.push_back(SpilledReg(LaneVGPR, VGPRIndex));
|
||||
|
|
|
@ -211,6 +211,19 @@ public:
|
|||
bool hasReg() { return VGPR != AMDGPU::NoRegister;}
|
||||
};
|
||||
|
||||
struct SGPRSpillVGPRCSR {
|
||||
// VGPR used for SGPR spills
|
||||
unsigned VGPR;
|
||||
|
||||
// If the VGPR is a CSR, the stack slot used to save/restore it in the
|
||||
// prolog/epilog.
|
||||
Optional<int> FI;
|
||||
|
||||
SGPRSpillVGPRCSR(unsigned V, Optional<int> F) :
|
||||
VGPR(V),
|
||||
FI(F) {}
|
||||
};
|
||||
|
||||
private:
|
||||
// SGPR->VGPR spilling support.
|
||||
typedef std::pair<unsigned, unsigned> SpillRegMask;
|
||||
|
@ -219,7 +232,7 @@ private:
|
|||
// frameindex key.
|
||||
DenseMap<int, std::vector<SpilledReg>> SGPRToVGPRSpills;
|
||||
unsigned NumVGPRSpillLanes = 0;
|
||||
SmallVector<unsigned, 2> SpillVGPRs;
|
||||
SmallVector<SGPRSpillVGPRCSR, 2> SpillVGPRs;
|
||||
|
||||
public:
|
||||
|
||||
|
@ -231,6 +244,10 @@ public:
|
|||
ArrayRef<SpilledReg>() : makeArrayRef(I->second);
|
||||
}
|
||||
|
||||
/// Return a view of the recorded SGPR-spill VGPR entries held in SpillVGPRs.
ArrayRef<SGPRSpillVGPRCSR> getSGPRSpillVGPRs() const {
  return ArrayRef<SGPRSpillVGPRCSR>(SpillVGPRs);
}
|
||||
|
||||
bool allocateSGPRSpillToVGPR(MachineFunction &MF, int FI);
|
||||
void removeSGPRToVGPRFrameIndices(MachineFrameInfo &MFI);
|
||||
|
||||
|
|
|
@ -30,10 +30,11 @@ entry:
|
|||
|
||||
; GCN-LABEL: {{^}}void_func_byval_struct_non_leaf:
|
||||
; GCN: s_mov_b32 s5, s32
|
||||
; GCN: buffer_store_dword v32
|
||||
; GCN-DAG: buffer_store_dword v32
|
||||
; GCN-DAG: buffer_store_dword v33
|
||||
; GCN: v_writelane_b32
|
||||
|
||||
; GCN-DAG: s_add_u32 s32, s32, 0xa00{{$}}
|
||||
; GCN-DAG: s_add_u32 s32, s32, 0xb00{{$}}
|
||||
|
||||
; GCN-DAG: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s5 offset:4{{$}}
|
||||
; GCN: v_add_i32_e32 [[ADD0:v[0-9]+]], vcc, 1, [[LOAD0]]
|
||||
|
@ -48,7 +49,8 @@ entry:
|
|||
|
||||
; GCN: v_readlane_b32
|
||||
; GCN: buffer_load_dword v32,
|
||||
; GCN: s_sub_u32 s32, s32, 0xa00{{$}}
|
||||
; GCN: buffer_load_dword v33,
|
||||
; GCN: s_sub_u32 s32, s32, 0xb00{{$}}
|
||||
; GCN: s_setpc_b64
|
||||
define void @void_func_byval_struct_non_leaf(%struct.ByValStruct* byval noalias nocapture align 4 %arg0, %struct.ByValStruct* byval noalias nocapture align 4 %arg1) #1 {
|
||||
entry:
|
||||
|
@ -67,7 +69,7 @@ entry:
|
|||
|
||||
; GCN-LABEL: {{^}}call_void_func_byval_struct_func:
|
||||
; GCN: s_mov_b32 s5, s32
|
||||
; GCN: s_add_u32 s32, s32, 0xa00{{$}}
|
||||
; GCN: s_add_u32 s32, s32, 0xc00{{$}}
|
||||
; GCN: v_writelane_b32
|
||||
|
||||
; GCN-DAG: s_add_u32 s32, s32, 0x800{{$}}
|
||||
|
@ -103,7 +105,7 @@ entry:
|
|||
|
||||
; GCN: v_readlane_b32
|
||||
|
||||
; GCN: s_sub_u32 s32, s32, 0xa00{{$}}
|
||||
; GCN: s_sub_u32 s32, s32, 0xc00{{$}}
|
||||
; GCN-NEXT: s_waitcnt
|
||||
; GCN-NEXT: s_setpc_b64
|
||||
define void @call_void_func_byval_struct_func() #0 {
|
||||
|
|
|
@ -146,7 +146,7 @@ define void @use_stack1() #1 {
|
|||
}
|
||||
|
||||
; GCN-LABEL: {{^}}indirect_use_stack:
|
||||
; GCN: ScratchSize: 2120
|
||||
; GCN: ScratchSize: 2124
|
||||
define void @indirect_use_stack() #1 {
|
||||
%alloca = alloca [16 x i32], align 4
|
||||
call void asm sideeffect "; use $0", "v"([16 x i32]* %alloca) #0
|
||||
|
@ -156,7 +156,7 @@ define void @indirect_use_stack() #1 {
|
|||
|
||||
; GCN-LABEL: {{^}}indirect_2_level_use_stack:
|
||||
; GCN: is_dynamic_callstack = 0
|
||||
; GCN: ScratchSize: 2120
|
||||
; GCN: ScratchSize: 2124
|
||||
define amdgpu_kernel void @indirect_2_level_use_stack() #0 {
|
||||
call void @indirect_use_stack()
|
||||
ret void
|
||||
|
@ -199,7 +199,7 @@ define amdgpu_kernel void @usage_external_recurse() #0 {
|
|||
}
|
||||
|
||||
; GCN-LABEL: {{^}}direct_recursion_use_stack:
|
||||
; GCN: ScratchSize: 2052
|
||||
; GCN: ScratchSize: 2056
|
||||
define void @direct_recursion_use_stack(i32 %val) #2 {
|
||||
%alloca = alloca [512 x i32], align 4
|
||||
call void asm sideeffect "; use $0", "v"([512 x i32]* %alloca) #0
|
||||
|
@ -218,7 +218,7 @@ ret:
|
|||
; GCN-LABEL: {{^}}usage_direct_recursion:
|
||||
; GCN: is_ptr64 = 1
|
||||
; GCN: is_dynamic_callstack = 1
|
||||
; GCN: workitem_private_segment_byte_size = 2052
|
||||
; GCN: workitem_private_segment_byte_size = 2056
|
||||
define amdgpu_kernel void @usage_direct_recursion(i32 %n) #0 {
|
||||
call void @direct_recursion_use_stack(i32 %n)
|
||||
ret void
|
||||
|
|
|
@ -36,14 +36,15 @@ define void @callee_with_stack() #0 {
|
|||
; GCN-LABEL: {{^}}callee_with_stack_and_call:
|
||||
; GCN: ; BB#0:
|
||||
; GCN-NEXT: s_waitcnt
|
||||
; GCN: s_mov_b32 s5, s32
|
||||
; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:8
|
||||
|
||||
; GCN-DAG: s_mov_b32 s5, s32
|
||||
; GCN-DAG: v_writelane_b32 v32, s33,
|
||||
; GCN-DAG: v_writelane_b32 v32, s34,
|
||||
; GCN-DAG: v_writelane_b32 v32, s35,
|
||||
; GCN-DAG: buffer_store_dword v0, off, s[0:3], s5 offset:4{{$}}
|
||||
; GCN-DAG: s_add_u32 s32, s32, 0x200{{$}}
|
||||
; GCN-DAG: s_add_u32 s32, s32, 0x300{{$}}
|
||||
; GCN-DAG: v_mov_b32_e32 v0, 0{{$}}
|
||||
; GCN: buffer_store_dword v0, off, s[0:3], s5 offset:4{{$}}
|
||||
; GCN-DAG: s_mov_b32 s33, s5
|
||||
|
||||
|
||||
|
@ -52,6 +53,7 @@ define void @callee_with_stack() #0 {
|
|||
; GCN-DAG: v_readlane_b32 s35,
|
||||
; GCN-DAG: v_readlane_b32 s34,
|
||||
; GCN-DAG: v_readlane_b32 s33,
|
||||
; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:8
|
||||
; GCN: s_waitcnt
|
||||
; GCN-NEXT: s_setpc_b64
|
||||
define void @callee_with_stack_and_call() #0 {
|
||||
|
@ -64,13 +66,24 @@ define void @callee_with_stack_and_call() #0 {
|
|||
; Should be able to copy incoming stack pointer directly to inner
|
||||
; call's stack pointer argument.
|
||||
|
||||
; There is stack usage only because of the need to evict a VGPR for
|
||||
; spilling CSR SGPRs.
|
||||
|
||||
; GCN-LABEL: {{^}}callee_no_stack_with_call:
|
||||
; GCN: s_waitcnt
|
||||
; GCN-NOT: s32
|
||||
; GCN: s_mov_b32 s5, s32
|
||||
; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:4
|
||||
; GCN-DAG: v_writelane_b32 v32, s33, 0
|
||||
; GCN-DAG: v_writelane_b32 v32, s34, 1
|
||||
; GCN: s_mov_b32 s33, s5
|
||||
; GCN: s_swappc_b64
|
||||
; GCN: s_mov_b32 s5, s33
|
||||
; GCN-NOT: s32
|
||||
|
||||
; GCN-DAG: v_readlane_b32 s34, v32, 1
|
||||
; GCN-DAG: v_readlane_b32 s33, v32, 0
|
||||
; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:4
|
||||
; GCN: s_sub_u32 s32, s32, 0x200
|
||||
|
||||
; GCN: s_setpc_b64
|
||||
define void @callee_no_stack_with_call() #0 {
|
||||
call void @external_void_func_void()
|
||||
|
|
|
@ -9,9 +9,21 @@ declare void @external_void_func_i32(i32) #0
|
|||
|
||||
; GCN-LABEL: {{^}}test_func_call_external_void_func_i32_imm:
|
||||
; GCN: s_waitcnt
|
||||
; GCN-NOT: s32
|
||||
; GCN: s_mov_b32 s5, s32
|
||||
; Spill CSR VGPR used for SGPR spilling
|
||||
; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:4
|
||||
; GCN-DAG: s_add_u32 s32, s32, 0x200
|
||||
; GCN-DAG: v_writelane_b32 v32, s33, 0
|
||||
; GCN-DAG: v_writelane_b32 v32, s34, 1
|
||||
; GCN-DAG: v_writelane_b32 v32, s35, 2
|
||||
|
||||
; GCN: s_swappc_b64
|
||||
; GCN-NOT: s32
|
||||
|
||||
; GCN: v_readlane_b32 s35, v32, 2
|
||||
; GCN: v_readlane_b32 s34, v32, 1
|
||||
; GCN: v_readlane_b32 s33, v32, 0
|
||||
; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:4
|
||||
; GCN: s_sub_u32 s32, s32, 0x200
|
||||
; GCN: s_setpc_b64
|
||||
define void @test_func_call_external_void_func_i32_imm() #0 {
|
||||
call void @external_void_func_i32(i32 42)
|
||||
|
@ -21,10 +33,10 @@ define void @test_func_call_external_void_func_i32_imm() #0 {
|
|||
; GCN-LABEL: {{^}}test_func_call_external_void_func_i32_imm_stack_use:
|
||||
; GCN: s_waitcnt
|
||||
; GCN: s_mov_b32 s5, s32
|
||||
; GCN: s_add_u32 s32, s32, 0x1100{{$}}
|
||||
; GCN: s_add_u32 s32, s32, 0x1200{{$}}
|
||||
; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s5 offset
|
||||
; GCN: s_swappc_b64
|
||||
; GCN: s_sub_u32 s32, s32, 0x1100{{$}}
|
||||
; GCN: s_sub_u32 s32, s32, 0x1200{{$}}
|
||||
; GCN: s_setpc_b64
|
||||
define void @test_func_call_external_void_func_i32_imm_stack_use() #0 {
|
||||
%alloca = alloca [16 x i32], align 4
|
||||
|
|
Loading…
Reference in New Issue