[AMDGPU] Enable copy between VGPR and AGPR classes during regalloc

Greedy register allocator prefers to move a constrained
live range into a larger allocatable class over spilling
them. This patch defines the necessary superclasses for
vector registers. For subtargets that support copy between
VGPRs and AGPRs, the vector register spills during regalloc
now become just copies.

Reviewed By: rampitec, arsenm

Differential Revision: https://reviews.llvm.org/D109301
This commit is contained in:
Christudasan Devadasan 2021-09-06 00:19:55 -04:00
parent f1d8345a2a
commit 5297cbf045
9 changed files with 528 additions and 102 deletions

View File

@ -1435,6 +1435,7 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
FrameInfo.getObjectAlign(FrameIndex));
unsigned SpillSize = TRI->getSpillSize(*RC);
MachineRegisterInfo &MRI = MF->getRegInfo();
if (RI.isSGPRClass(RC)) {
MFI->setHasSpilledSGPRs();
assert(SrcReg != AMDGPU::M0 && "m0 should not be spilled");
@ -1448,7 +1449,6 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
// The SGPR spill/restore instructions only work on number sgprs, so we need
// to make sure we are using the correct register class.
if (SrcReg.isVirtual() && SpillSize == 4) {
MachineRegisterInfo &MRI = MF->getRegInfo();
MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
}
@ -1467,6 +1467,17 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
: getVGPRSpillSaveOpcode(SpillSize);
MFI->setHasSpilledVGPRs();
if (RI.isVectorSuperClass(RC)) {
// Convert an AV spill into a VGPR spill. Introduce a copy from AV to an
// equivalent VGPR register beforehand. Regalloc might want to introduce
// AV spills only to be relevant until rewriter at which they become
// either spills of VGPRs or AGPRs.
Register TmpVReg = MRI.createVirtualRegister(RI.getEquivalentVGPRClass(RC));
BuildMI(MBB, MI, DL, get(TargetOpcode::COPY), TmpVReg)
.addReg(SrcReg, RegState::Kill);
SrcReg = TmpVReg;
}
BuildMI(MBB, MI, DL, get(Opcode))
.addReg(SrcReg, getKillRegState(isKill)) // data
.addFrameIndex(FrameIndex) // addr
@ -1600,11 +1611,24 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
unsigned Opcode = RI.isAGPRClass(RC) ? getAGPRSpillRestoreOpcode(SpillSize)
: getVGPRSpillRestoreOpcode(SpillSize);
bool IsVectorSuperClass = RI.isVectorSuperClass(RC);
Register TmpReg = DestReg;
if (IsVectorSuperClass) {
// For AV classes, insert the spill restore to a VGPR followed by a copy
// into an equivalent AV register.
MachineRegisterInfo &MRI = MF->getRegInfo();
DestReg = MRI.createVirtualRegister(RI.getEquivalentVGPRClass(RC));
}
BuildMI(MBB, MI, DL, get(Opcode), DestReg)
.addFrameIndex(FrameIndex) // vaddr
.addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
.addImm(0) // offset
.addMemOperand(MMO);
if (IsVectorSuperClass)
BuildMI(MBB, MI, DL, get(TargetOpcode::COPY), TmpReg)
.addReg(DestReg, RegState::Kill);
}
void SIInstrInfo::insertNoop(MachineBasicBlock &MBB,

View File

@ -402,6 +402,62 @@ const uint32_t *SIRegisterInfo::getNoPreservedMask() const {
return CSR_AMDGPU_NoRegs_RegMask;
}
const TargetRegisterClass *
SIRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC,
const MachineFunction &MF) const {
// FIXME: Should have a helper function like getEquivalentVGPRClass to get the
// equivalent AV class. If used one, the verifier will crash after
// RegBankSelect in the GISel flow. The aligned regclasses are not fully given
// until Instruction selection.
if (MF.getSubtarget<GCNSubtarget>().hasMAIInsts() &&
(isVGPRClass(RC) || isAGPRClass(RC))) {
if (RC == &AMDGPU::VGPR_32RegClass || RC == &AMDGPU::AGPR_32RegClass)
return &AMDGPU::AV_32RegClass;
if (RC == &AMDGPU::VReg_64RegClass || RC == &AMDGPU::AReg_64RegClass)
return &AMDGPU::AV_64RegClass;
if (RC == &AMDGPU::VReg_64_Align2RegClass ||
RC == &AMDGPU::AReg_64_Align2RegClass)
return &AMDGPU::AV_64_Align2RegClass;
if (RC == &AMDGPU::VReg_96RegClass || RC == &AMDGPU::AReg_96RegClass)
return &AMDGPU::AV_96RegClass;
if (RC == &AMDGPU::VReg_96_Align2RegClass ||
RC == &AMDGPU::AReg_96_Align2RegClass)
return &AMDGPU::AV_96_Align2RegClass;
if (RC == &AMDGPU::VReg_128RegClass || RC == &AMDGPU::AReg_128RegClass)
return &AMDGPU::AV_128RegClass;
if (RC == &AMDGPU::VReg_128_Align2RegClass ||
RC == &AMDGPU::AReg_128_Align2RegClass)
return &AMDGPU::AV_128_Align2RegClass;
if (RC == &AMDGPU::VReg_160RegClass || RC == &AMDGPU::AReg_160RegClass)
return &AMDGPU::AV_160RegClass;
if (RC == &AMDGPU::VReg_160_Align2RegClass ||
RC == &AMDGPU::AReg_160_Align2RegClass)
return &AMDGPU::AV_160_Align2RegClass;
if (RC == &AMDGPU::VReg_192RegClass || RC == &AMDGPU::AReg_192RegClass)
return &AMDGPU::AV_192RegClass;
if (RC == &AMDGPU::VReg_192_Align2RegClass ||
RC == &AMDGPU::AReg_192_Align2RegClass)
return &AMDGPU::AV_192_Align2RegClass;
if (RC == &AMDGPU::VReg_256RegClass || RC == &AMDGPU::AReg_256RegClass)
return &AMDGPU::AV_256RegClass;
if (RC == &AMDGPU::VReg_256_Align2RegClass ||
RC == &AMDGPU::AReg_256_Align2RegClass)
return &AMDGPU::AV_256_Align2RegClass;
if (RC == &AMDGPU::VReg_512RegClass || RC == &AMDGPU::AReg_512RegClass)
return &AMDGPU::AV_512RegClass;
if (RC == &AMDGPU::VReg_512_Align2RegClass ||
RC == &AMDGPU::AReg_512_Align2RegClass)
return &AMDGPU::AV_512_Align2RegClass;
if (RC == &AMDGPU::VReg_1024RegClass || RC == &AMDGPU::AReg_1024RegClass)
return &AMDGPU::AV_1024RegClass;
if (RC == &AMDGPU::VReg_1024_Align2RegClass ||
RC == &AMDGPU::AReg_1024_Align2RegClass)
return &AMDGPU::AV_1024_Align2RegClass;
}
return TargetRegisterInfo::getLargestLegalSuperClass(RC, MF);
}
Register SIRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
const SIFrameLowering *TFI =
MF.getSubtarget<GCNSubtarget>().getFrameLowering();
@ -994,10 +1050,22 @@ static MachineInstrBuilder spillVGPRtoAGPR(const GCNSubtarget &ST,
unsigned Dst = IsStore ? Reg : ValueReg;
unsigned Src = IsStore ? ValueReg : Reg;
unsigned Opc = (IsStore ^ TRI->isVGPR(MRI, Reg)) ? AMDGPU::V_ACCVGPR_WRITE_B32_e64
: AMDGPU::V_ACCVGPR_READ_B32_e64;
bool IsVGPR = TRI->isVGPR(MRI, Reg);
DebugLoc DL = MI->getDebugLoc();
if (IsVGPR == TRI->isVGPR(MRI, ValueReg)) {
// Spiller during regalloc may restore a spilled register to its superclass.
// It could result in AGPR spills restored to VGPRs or the other way around,
// making the src and dst with identical regclasses at this point. It just
// needs a copy in such cases.
auto CopyMIB = BuildMI(MBB, MI, DL, TII->get(AMDGPU::COPY), Dst)
.addReg(Src, getKillRegState(IsKill));
CopyMIB->setAsmPrinterFlag(MachineInstr::ReloadReuse);
return CopyMIB;
}
unsigned Opc = (IsStore ^ IsVGPR) ? AMDGPU::V_ACCVGPR_WRITE_B32_e64
: AMDGPU::V_ACCVGPR_READ_B32_e64;
auto MIB = BuildMI(MBB, MI, MI->getDebugLoc(), TII->get(Opc), Dst)
auto MIB = BuildMI(MBB, MI, DL, TII->get(Opc), Dst)
.addReg(Src, getKillRegState(IsKill));
MIB->setAsmPrinterFlag(MachineInstr::ReloadReuse);
return MIB;

View File

@ -77,6 +77,10 @@ public:
return 100;
}
const TargetRegisterClass *
getLargestLegalSuperClass(const TargetRegisterClass *RC,
const MachineFunction &MF) const override;
Register getFrameRegister(const MachineFunction &MF) const override;
bool hasBasePointer(const MachineFunction &MF) const;

View File

@ -20,7 +20,7 @@ body: |
; CHECK: successors: %bb.1(0x80000000)
; CHECK: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; CHECK: [[DEF1:%[0-9]+]]:vreg_1024_align2 = IMPLICIT_DEF
; CHECK: SI_SPILL_V1024_SAVE [[DEF1]], %stack.0, $sgpr32, 0, implicit $exec :: (store (s1024) into %stack.0, align 4, addrspace 5)
; CHECK: [[COPY:%[0-9]+]]:av_1024_align2 = COPY [[DEF1]]
; CHECK: bb.1:
; CHECK: successors: %bb.1(0x40000000), %bb.2(0x40000000)
; CHECK: S_NOP 0, implicit [[DEF1]]
@ -29,18 +29,17 @@ body: |
; CHECK: S_CBRANCH_VCCNZ %bb.1, implicit undef $vcc
; CHECK: bb.2:
; CHECK: successors: %bb.3(0x80000000)
; CHECK: [[SI_SPILL_V1024_RESTORE:%[0-9]+]]:vreg_1024_align2 = SI_SPILL_V1024_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s1024) from %stack.0, align 4, addrspace 5)
; CHECK: undef %5.sub1_sub2_sub3_sub4_sub5_sub6_sub7_sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15_sub16:vreg_1024_align2 = COPY [[SI_SPILL_V1024_RESTORE]].sub1_sub2_sub3_sub4_sub5_sub6_sub7_sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15_sub16 {
; CHECK: internal %5.sub16_sub17_sub18_sub19_sub20_sub21_sub22_sub23_sub24_sub25_sub26_sub27_sub28_sub29_sub30_sub31:vreg_1024_align2 = COPY [[SI_SPILL_V1024_RESTORE]].sub16_sub17_sub18_sub19_sub20_sub21_sub22_sub23_sub24_sub25_sub26_sub27_sub28_sub29_sub30_sub31
; CHECK: undef %5.sub1_sub2_sub3_sub4_sub5_sub6_sub7_sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15_sub16:av_1024_align2 = COPY [[COPY]].sub1_sub2_sub3_sub4_sub5_sub6_sub7_sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15_sub16 {
; CHECK: internal %5.sub16_sub17_sub18_sub19_sub20_sub21_sub22_sub23_sub24_sub25_sub26_sub27_sub28_sub29_sub30_sub31:av_1024_align2 = COPY [[COPY]].sub16_sub17_sub18_sub19_sub20_sub21_sub22_sub23_sub24_sub25_sub26_sub27_sub28_sub29_sub30_sub31
; CHECK: }
; CHECK: %5.sub0:vreg_1024_align2 = IMPLICIT_DEF
; CHECK: %5.sub0:av_1024_align2 = IMPLICIT_DEF
; CHECK: S_NOP 0, implicit %5.sub0
; CHECK: bb.3:
; CHECK: successors: %bb.4(0x80000000)
; CHECK: S_NOP 0, implicit %5
; CHECK: bb.4:
; CHECK: successors: %bb.3(0x40000000), %bb.5(0x40000000)
; CHECK: [[DEF2:%[0-9]+]]:vreg_1024_align2 = IMPLICIT_DEF
; CHECK: [[DEF2:%[0-9]+]]:av_1024_align2 = IMPLICIT_DEF
; CHECK: S_CBRANCH_VCCNZ %bb.3, implicit undef $vcc
; CHECK: bb.5:
; CHECK: undef %3.sub0:vreg_1024_align2 = COPY [[DEF]]

View File

@ -0,0 +1,62 @@
;RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 --stop-after=greedy,1 -verify-machineinstrs < %s | FileCheck -check-prefix=REGALLOC-GFX908 %s
;RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 --stop-after=prologepilog -verify-machineinstrs < %s | FileCheck -check-prefix=PEI-GFX908 %s
;RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a --stop-after=greedy,1 -verify-machineinstrs < %s | FileCheck -check-prefix=REGALLOC-GFX90A %s
;RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a --stop-after=prologepilog -verify-machineinstrs < %s | FileCheck -check-prefix=PEI-GFX90A %s
; Partial reg copy and spill missed during regalloc handled later at frame lowering.
define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 {
; REGALLOC-GFX908-LABEL: name: partial_copy
; REGALLOC-GFX908: bb.0 (%ir-block.0):
; REGALLOC-GFX908: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 2949130 /* regdef:VReg_64 */, def [[VREG_64:%[0-9]+]]
; REGALLOC-GFX908: SI_SPILL_V64_SAVE [[VREG_64]], %stack.0
; REGALLOC-GFX908: [[V_MFMA_I32_4X4X4I8_A128:%[0-9]+]]:areg_128 = V_MFMA_I32_4X4X4I8_e64
; REGALLOC-GFX908: [[SI_SPILL_V64_RESTORE:%[0-9]+]]:vreg_64 = SI_SPILL_V64_RESTORE %stack.0
; REGALLOC-GFX908: GLOBAL_STORE_DWORDX2 undef %{{[0-9]+}}:vreg_64, [[SI_SPILL_V64_RESTORE]]
; REGALLOC-GFX908: [[COPY_A128_TO_V128:%[0-9]+]]:vreg_128 = COPY [[V_MFMA_I32_4X4X4I8_A128]]
; REGALLOC-GFX908: GLOBAL_STORE_DWORDX4 undef %{{[0-9]+}}:vreg_64, [[COPY_A128_TO_V128]]
;
; PEI-GFX908-LABEL: name: partial_copy
; PEI-GFX908: bb.0 (%ir-block.0):
; PEI-GFX908: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 2949130 /* regdef:VReg_64 */, def renamable $vgpr0_vgpr1
; PEI-GFX908: BUFFER_STORE_DWORD_OFFSET killed $vgpr0
; PEI-GFX908: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1
; PEI-GFX908: renamable $agpr0_agpr1_agpr2_agpr3 = V_MFMA_I32_4X4X4I8_e64
; PEI-GFX908: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET
; PEI-GFX908: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr4
; PEI-GFX908: GLOBAL_STORE_DWORDX2 undef renamable ${{.*}}, killed renamable $vgpr0_vgpr1
; PEI-GFX908: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = COPY killed renamable $agpr0_agpr1_agpr2_agpr3, implicit $exec
; PEI-GFX908: GLOBAL_STORE_DWORDX4 undef renamable ${{.*}}, killed renamable $vgpr0_vgpr1_vgpr2_vgpr3
;
; REGALLOC-GFX90A-LABEL: name: partial_copy
; REGALLOC-GFX90A: bb.0 (%ir-block.0):
; REGALLOC-GFX90A: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3080202 /* regdef:VReg_64_Align2 */, def [[VREG_64:%[0-9]+]]
; REGALLOC-GFX90A: SI_SPILL_V64_SAVE [[VREG_64]], %stack.0
; REGALLOC-GFX90A: [[V_MFMA_I32_4X4X4I8_A128:%[0-9]+]]:areg_128_align2 = V_MFMA_I32_4X4X4I8_e64
; REGALLOC-GFX90A: [[SI_SPILL_V64_RESTORE:%[0-9]+]]:vreg_64_align2 = SI_SPILL_V64_RESTORE %stack.0
; REGALLOC-GFX90A: [[COPY_AV64:%[0-9]+]]:av_64_align2 = COPY [[SI_SPILL_V64_RESTORE]]
; REGALLOC-GFX90A: GLOBAL_STORE_DWORDX2 undef %15:vreg_64_align2, [[COPY_AV64]]
; REGALLOC-GFX90A-NOT: %{{[0-9]+}}:vreg_128 = COPY [[V_MFMA_I32_4X4X4I8_A128]]
; REGALLOC-GFX90A: GLOBAL_STORE_DWORDX4 undef %17:vreg_64_align2, [[V_MFMA_I32_4X4X4I8_A128]]
;
; PEI-GFX90A-LABEL: name: partial_copy
; PEI-GFX90A: bb.0 (%ir-block.0):
; PEI-GFX90A: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3080202 /* regdef:VReg_64_Align2 */, def renamable $vgpr0_vgpr1
; PEI-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr0
; PEI-GFX90A: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1
; PEI-GFX90A: renamable $agpr0_agpr1_agpr2_agpr3 = V_MFMA_I32_4X4X4I8_e64
; PEI-GFX90A: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET
; PEI-GFX90A: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr4
; PEI-GFX90A: GLOBAL_STORE_DWORDX2 undef renamable ${{.*}}, killed renamable $vgpr0_vgpr1
; PEI-GFX90A: GLOBAL_STORE_DWORDX4 undef renamable ${{.*}}, killed renamable $agpr0_agpr1_agpr2_agpr3
%v0 = call <4 x i32> asm sideeffect "; def $0", "=v" ()
%v1 = call <2 x i32> asm sideeffect "; def $0", "=v" ()
%mai = tail call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 1, i32 2, <4 x i32> %arg, i32 0, i32 0, i32 0)
store volatile <4 x i32> %v0, <4 x i32> addrspace(1)* undef
store volatile <2 x i32> %v1, <2 x i32> addrspace(1)* undef
store volatile <4 x i32> %mai, <4 x i32> addrspace(1)* undef
ret void
}
declare <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32, i32, <4 x i32>, i32, i32, i32)
attributes #0 = { nounwind "amdgpu-num-vgpr"="5" }

View File

@ -1,23 +1,17 @@
; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX908,A2V %s
; RUN: llc -march=amdgcn -mcpu=gfx908 -amdgpu-spill-vgpr-to-agpr=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX908,GFX908-A2M,A2M %s
; RUN: llc -march=amdgcn -mcpu=gfx90a -amdgpu-spill-vgpr-to-agpr=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX90A,GFX90A-A2M,A2M %s
; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX908 %s
; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX90A %s
; GCN-LABEL: {{^}}max_24regs_32a_used:
; A2M-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
; A2M-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
; GCN-DAG: v_mfma_f32_16x16x1f32
; GCN-DAG: v_mfma_f32_16x16x1f32
; A2V-NOT: SCRATCH_RSRC
; GFX908-A2M-DAG: v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a0 ; Reload Reuse
; A2V: v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a0 ; Reload Reuse
; GFX908-A2M: buffer_store_dword v[[VSPILL]], off, s[{{[0-9:]+}}], 0 offset:[[FI:[0-9]+]] ; 4-byte Folded Spill
; GFX908-A2M: buffer_load_dword v[[VSPILL:[0-9]+]], off, s[{{[0-9:]+}}], 0 offset:[[FI]] ; 4-byte Folded Reload
; GFX90A-NOT: v_accvgpr_read_b32
; GFX90A-A2M: buffer_store_dword a{{[0-9]+}}, off, s[{{[0-9:]+}}], 0 offset:[[FI:[0-9]+]] ; 4-byte Folded Spill
; GFX90A-A2M: buffer_load_dword a{{[0-9]+}}, off, s[{{[0-9:]+}}], 0 offset:[[FI]] ; 4-byte Folded Reload
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]] ; Reload Reuse
; GFX90A-NOT: v_accvgpr_write_b32
; A2V: ScratchSize: 0
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
; GCN-DAG: v_mfma_f32_16x16x1f32
; GCN-DAG: v_mfma_f32_16x16x1f32
; GCN-DAG: v_accvgpr_read_b32
; GCN-NOT: buffer_store_dword
; GCN-NOT: buffer_load_dword
; GFX908-NOT: v_accvgpr_write_b32
; GFX90A: v_accvgpr_write_b32
; GCN: ScratchSize: 0
define amdgpu_kernel void @max_24regs_32a_used(<16 x float> addrspace(1)* %arg, float addrspace(1)* %out) #0 {
bb:
%in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg
@ -39,16 +33,13 @@ bb:
}
; GCN-LABEL: {{^}}max_12regs_13a_used:
; A2M-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
; A2M-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
; A2V-NOT: SCRATCH_RSRC
; GFX908-DAG: v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a{{[0-9]+}} ; Reload Reuse
; GFX908-A2M: buffer_store_dword v[[VSPILL]], off, s[{{[0-9:]+}}], 0 offset:[[FI:[0-9]+]] ; 4-byte Folded Spill
; GFX908-A2M: buffer_load_dword v[[VSPILL:[0-9]+]], off, s[{{[0-9:]+}}], 0 offset:[[FI]] ; 4-byte Folded Reload
; GFX90A-A2M: buffer_store_dword a{{[0-9]+}}, off, s[{{[0-9:]+}}], 0 offset:[[FI:[0-9]+]] ; 4-byte Folded Spill
; GFX90A-A2M: buffer_load_dword a{{[0-9]+}}, off, s[{{[0-9:]+}}], 0 offset:[[FI]] ; 4-byte Folded Reload
; A2V: v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]] ; Reload Reuse
; A2V: ScratchSize: 0
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
; GCN: v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a{{[0-9]+}}
; GCN-NOT: buffer_store_dword
; GCN-NOT: buffer_load_dword
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]]
; GCN: ScratchSize: 0
define amdgpu_kernel void @max_12regs_13a_used(i32 %cond, <4 x float> addrspace(1)* %arg, <4 x float> addrspace(1)* %out) #2 {
bb:
%in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg
@ -70,17 +61,13 @@ st:
}
; GCN-LABEL: {{^}}max_10_vgprs_used_9a:
; GFX908-A2M-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
; GFX908-A2M-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
; A2V-NOT: SCRATCH_RSRC
; A2V: v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a{{[0-9]+}} ; Reload Reuse
; A2V: v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]] ; Reload Reuse
; A2V: ScratchSize: 0
; GFX908-A2M: buffer_store_dword v[[VSPILLSTORE:[0-9]+]], off, s[{{[0-9:]+}}], 0 offset:[[FI:[0-9]+]] ; 4-byte Folded Spill
; GFX908-A2M: buffer_load_dword v[[VSPILL_RELOAD:[0-9]+]], off, s[{{[0-9:]+}}], 0 offset:[[FI]] ; 4-byte Folded Reload
; GFX908-A2M: v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL_RELOAD]] ; Reload Reuse
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
; GCN: v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a{{[0-9]+}}
; GCN-NOT: buffer_store_dword
; GCN-NOT: buffer_load_dword
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]]
; GCN: ScratchSize: 0
define amdgpu_kernel void @max_10_vgprs_used_9a() #1 {
%a1 = call <4 x i32> asm sideeffect "", "=a"()
%a2 = call <4 x i32> asm sideeffect "", "=a"()
@ -92,17 +79,14 @@ define amdgpu_kernel void @max_10_vgprs_used_9a() #1 {
}
; GCN-LABEL: {{^}}max_32regs_mfma32:
; A2M-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
; A2M-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
; A2V-NOT: SCRATCH_RSRC
; GFX908-DAG: v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a0 ; Reload Reuse
; GFX908-A2M: buffer_store_dword v[[VSPILL]], off, s[{{[0-9:]+}}], 0 offset:[[FI:[0-9]+]] ; 4-byte Folded Spill
; GFX90A-NOT: v_accvgpr_read_b32
; GFX90A: v_mfma_f32_32x32x1f32
; GFX908-A2M: buffer_load_dword v[[VSPILL:[0-9]+]], off, s[{{[0-9:]+}}], 0 offset:[[FI]] ; 4-byte Folded Reload
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]] ; Reload Reuse
; GFX90A-NOT: v_accvgpr_write_b32
; A2V: ScratchSize: 0
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
; GCN-NOT: buffer_store_dword
; GCN: v_accvgpr_read_b32
; GCN: v_mfma_f32_32x32x1f32
; GCN-NOT: buffer_load_dword
; GCN: v_accvgpr_write_b32
; GCN: ScratchSize: 0
define amdgpu_kernel void @max_32regs_mfma32(float addrspace(1)* %arg) #3 {
bb:
%v = call i32 asm sideeffect "", "=a"()
@ -116,6 +100,47 @@ use:
ret void
}
; Should spill agprs to memory for both gfx908 and gfx90a.
; GCN-LABEL: {{^}}max_5regs_used_8a:
; GCN: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
; GCN: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
; GFX908-DAG: v_accvgpr_read_b32 v1, a0 ; Reload Reuse
; GFX908-DAG: buffer_store_dword v1, off, s[{{[0-9:]+}}], 0 offset:4 ; 4-byte Folded Spill
; GFX908-DAG: v_accvgpr_read_b32 v1, a1 ; Reload Reuse
; GFX908-DAG: buffer_store_dword v1, off, s[{{[0-9:]+}}], 0 offset:8 ; 4-byte Folded Spill
; GFX908-DAG: v_accvgpr_read_b32 v1, a2 ; Reload Reuse
; GFX908-DAG: buffer_store_dword v1, off, s[{{[0-9:]+}}], 0 offset:12 ; 4-byte Folded Spill
; GFX908-DAG: v_accvgpr_read_b32 v1, a3 ; Reload Reuse
; GFX908-DAG: buffer_store_dword v1, off, s[{{[0-9:]+}}], 0 offset:16 ; 4-byte Folded Spill
; GFX90A-DAG: buffer_store_dword a0, off, s[{{[0-9:]+}}], 0 offset:4 ; 4-byte Folded Spill
; GFX90A-DAG: buffer_store_dword a1, off, s[{{[0-9:]+}}], 0 offset:8 ; 4-byte Folded Spill
; GFX90A-DAG: buffer_store_dword a2, off, s[{{[0-9:]+}}], 0 offset:12 ; 4-byte Folded Spill
; GFX90A-DAG: buffer_store_dword a3, off, s[{{[0-9:]+}}], 0 offset:16 ; 4-byte Folded Spill
; GCN: v_mfma_f32_4x4x1f32 a[0:3], v{{[0-9]+}}, v{{[0-9]+}}, a[0:3]
; GCN-DAG: buffer_load_dword v0, off, s[{{[0-9:]+}}], 0 offset:4 ; 4-byte Folded Reload
; GCN-DAG: buffer_load_dword v1, off, s[{{[0-9:]+}}], 0 offset:8 ; 4-byte Folded Reload
; GCN-DAG: buffer_load_dword v2, off, s[{{[0-9:]+}}], 0 offset:12 ; 4-byte Folded Reload
; GCN-DAG: buffer_load_dword v3, off, s[{{[0-9:]+}}], 0 offset:16 ; 4-byte Folded Reload
; GCN: global_store_dwordx4 v[{{[0-9:]+}}], v[0:3], off
; GCN: ScratchSize: 20
define amdgpu_kernel void @max_5regs_used_8a(<4 x float> addrspace(1)* %arg) #4 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%v0 = call float asm sideeffect "; def $0", "=v"()
%a4 = call <4 x float> asm sideeffect "; def $0", "=a"()
%gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %arg, i32 %tid
%mai.in = load <4 x float>, <4 x float> addrspace(1)* %gep
%mai.out = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 1.0, <4 x float> %mai.in, i32 0, i32 0, i32 0)
store <4 x float> %mai.out, <4 x float> addrspace(1)* %gep
store volatile <4 x float> %a4, <4 x float> addrspace(1)* undef
call void asm sideeffect "; use $0", "v"(float %v0);
ret void
}
declare i32 @llvm.amdgcn.workitem.id.x()
declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float, float, <16 x float>, i32, i32, i32)
declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float, float, <4 x float>, i32, i32, i32)
@ -125,3 +150,4 @@ attributes #0 = { nounwind "amdgpu-num-vgpr"="24" }
attributes #1 = { nounwind "amdgpu-num-vgpr"="10" }
attributes #2 = { nounwind "amdgpu-num-vgpr"="12" }
attributes #3 = { nounwind "amdgpu-num-vgpr"="32" }
attributes #4 = { nounwind "amdgpu-num-vgpr"="5" }

View File

@ -0,0 +1,23 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -stop-after=greedy,1 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s
; Convert AV spills into VGPR spills by introducing appropriate copies in between.
define amdgpu_kernel void @test_spill_av_class(<4 x i32> %arg) #0 {
; GCN-LABEL: name: test_spill_av_class
; GCN: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def undef %21.sub0
; GCN-NEXT: undef %23.sub0:av_64 = COPY %21.sub0
; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY %23
; GCN-NEXT: SI_SPILL_V64_SAVE [[COPY1]], %stack.0, $sgpr32, 0, implicit $exec
; GCN: [[SI_SPILL_V64_RESTORE:%[0-9]+]]:vreg_64 = SI_SPILL_V64_RESTORE %stack.0, $sgpr32, 0, implicit $exec
; GCN-NEXT: [[COPY3:%[0-9]+]]:av_64 = COPY [[SI_SPILL_V64_RESTORE]]
; GCN-NEXT: undef %22.sub0:vreg_64 = COPY [[COPY3]].sub0
%v0 = call i32 asm sideeffect "; def $0", "=v"()
%tmp = insertelement <2 x i32> undef, i32 %v0, i32 0
%mai = tail call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 1, i32 2, <4 x i32> %arg, i32 0, i32 0, i32 0)
store volatile <4 x i32> %mai, <4 x i32> addrspace(1)* undef
call void asm sideeffect "; use $0", "v"(<2 x i32> %tmp);
ret void
}
declare <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32, i32, <4 x i32>, i32, i32, i32)
attributes #0 = { nounwind "amdgpu-num-vgpr"="5" }

View File

@ -5,15 +5,14 @@
; GFX900-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
; GFX900-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
; GFX908-NOT: SCRATCH_RSRC
; GFX908-DAG: v_accvgpr_write_b32 a0, v{{[0-9]}} ; Reload Reuse
; GFX908-DAG: v_accvgpr_write_b32 a1, v{{[0-9]}} ; Reload Reuse
; GFX908-DAG: v_accvgpr_write_b32 [[A_REG:a[0-9]+]], v{{[0-9]}}
; GFX900: buffer_store_dword v{{[0-9]}},
; GFX900: buffer_store_dword v{{[0-9]}},
; GFX900: buffer_load_dword v{{[0-9]}},
; GFX900: buffer_load_dword v{{[0-9]}},
; GFX908-NOT: buffer_
; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a0 ; Reload Reuse
; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a1 ; Reload Reuse
; GFX908-DAG: v_mov_b32_e32 v{{[0-9]}}, [[V_REG:v[0-9]+]]
; GFX908-DAG: v_accvgpr_read_b32 [[V_REG]], [[A_REG]]
; GCN: NumVgprs: 10
; GFX900: ScratchSize: 12
@ -57,19 +56,19 @@ define amdgpu_kernel void @max_10_vgprs(i32 addrspace(1)* %p) #0 {
}
; GCN-LABEL: {{^}}max_10_vgprs_used_9a:
; GFX908-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
; GFX908-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
; GFX908-DAG: v_accvgpr_write_b32 a9, v{{[0-9]}} ; Reload Reuse
; GFX908: buffer_store_dword v{{[0-9]}},
; GFX908-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
; GFX908-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
; GFX908-DAG: v_accvgpr_write_b32 [[A_REG:a[0-9]+]], v{{[0-9]}}
; GFX908-NOT: buffer_store_dword v{{[0-9]}},
; GFX908-NOT: buffer_
; GFX908: v_accvgpr_read_b32 v{{[0-9]}}, a9 ; Reload Reuse
; GFX908: buffer_load_dword v{{[0-9]}},
; GFX908: v_mov_b32_e32 v{{[0-9]}}, [[V_REG:v[0-9]+]]
; GFX908: v_accvgpr_read_b32 [[V_REG]], [[A_REG]]
; GFX908-NOT: buffer_
; GFX900: couldn't allocate input reg for constraint 'a'
; GFX908: NumVgprs: 10
; GFX908: ScratchSize: 8
; GFX908: ScratchSize: 0
; GFX908: VGPRBlocks: 2
; GFX908: NumVGPRsForWavesPerEU: 10
define amdgpu_kernel void @max_10_vgprs_used_9a(i32 addrspace(1)* %p) #0 {
@ -113,28 +112,28 @@ define amdgpu_kernel void @max_10_vgprs_used_9a(i32 addrspace(1)* %p) #0 {
; GCN-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
; GCN-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
; GFX908-DAG: v_accvgpr_write_b32 a0, 1
; GFX908-DAG: v_accvgpr_write_b32 a1, v{{[0-9]}} ; Reload Reuse
; GFX908-DAG: v_accvgpr_write_b32 a2, v{{[0-9]}} ; Reload Reuse
; GFX908-DAG: v_accvgpr_write_b32 a3, v{{[0-9]}} ; Reload Reuse
; GFX908-DAG: v_accvgpr_write_b32 a4, v{{[0-9]}} ; Reload Reuse
; GFX908-DAG: v_accvgpr_write_b32 a5, v{{[0-9]}} ; Reload Reuse
; GFX908-DAG: v_accvgpr_write_b32 a6, v{{[0-9]}} ; Reload Reuse
; GFX908-DAG: v_accvgpr_write_b32 a7, v{{[0-9]}} ; Reload Reuse
; GFX908-DAG: v_accvgpr_write_b32 a8, v{{[0-9]}} ; Reload Reuse
; GFX908-DAG: v_accvgpr_write_b32 a9, v{{[0-9]}} ; Reload Reuse
; GFX900: buffer_store_dword v{{[0-9]}},
; GCN-DAG: buffer_store_dword v{{[0-9]}},
; GFX900: buffer_load_dword v{{[0-9]}},
; GCN-DAG: buffer_store_dword v{{[0-9]}},
; GFX908-DAG: v_accvgpr_write_b32 a1, v{{[0-9]}}
; GFX908-DAG: v_accvgpr_write_b32 a2, v{{[0-9]}}
; GFX908-DAG: v_accvgpr_write_b32 a3, v{{[0-9]}}
; GFX908-DAG: v_accvgpr_write_b32 a4, v{{[0-9]}}
; GFX908-DAG: v_accvgpr_write_b32 a5, v{{[0-9]}}
; GFX908-DAG: v_accvgpr_write_b32 a6, v{{[0-9]}}
; GFX908-DAG: v_accvgpr_write_b32 a7, v{{[0-9]}}
; GFX908-DAG: v_accvgpr_write_b32 a8, v{{[0-9]}}
; GFX908-DAG: v_accvgpr_write_b32 a9, v{{[0-9]}}
; GCN-DAG: buffer_load_dword v{{[0-9]}},
; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a1 ; Reload Reuse
; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a2 ; Reload Reuse
; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a3 ; Reload Reuse
; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a4 ; Reload Reuse
; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a5 ; Reload Reuse
; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a6 ; Reload Reuse
; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a7 ; Reload Reuse
; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a8 ; Reload Reuse
; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a9 ; Reload Reuse
; GCN-DAG: buffer_load_dword v{{[0-9]}},
; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a1
; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a2
; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a3
; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a4
; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a5
; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a6
; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a7
; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a8
; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a9
; GCN: NumVgprs: 10
; GFX900: ScratchSize: 44
@ -166,10 +165,10 @@ define amdgpu_kernel void @max_10_vgprs_used_1a_partial_spill(i64 addrspace(1)*
; GCN-LABEL: {{^}}max_10_vgprs_spill_v32:
; GCN-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
; GCN-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
; GFX908-DAG: v_accvgpr_write_b32 a0, v{{[0-9]}} ; Reload Reuse
; GFX908-DAG: v_accvgpr_write_b32 a9, v{{[0-9]}} ; Reload Reuse
; GCN-NOT: a10
; GCN: buffer_store_dword v{{[0-9]}},
; GFX908-DAG: v_accvgpr_write_b32 a0, v{{[0-9]}}
; GFX908-DAG: v_accvgpr_write_b32 a9, v{{[0-9]}}
; GCN-NOT: a10
; GFX908: NumVgprs: 10
; GFX900: ScratchSize: 100
@ -231,23 +230,20 @@ define amdgpu_kernel void @max_256_vgprs_spill_9x32(<32 x float> addrspace(1)* %
ret void
}
; FIXME: adding an AReg_1024 register class for v32f32 and v32i32
; produces unnecessary copies and we still have some amount
; of conventional spilling.
; GCN-LABEL: {{^}}max_256_vgprs_spill_9x32_2bb:
; GFX900-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
; GFX900-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
; GFX908-FIXME-NOT: SCRATCH_RSRC
; GFX908-DAG: v_accvgpr_write_b32 a0, v
; GFX908-NOT: SCRATCH_RSRC
; GFX908: v_accvgpr_write_b32
; GFX908: global_load_
; GFX900: buffer_store_dword v
; GFX900: buffer_load_dword v
; GFX908-FIXME-NOT: buffer_
; GFX908-NOT: buffer_
; GFX908-DAG: v_accvgpr_read_b32
; GCN: NumVgprs: 256
; GFX900: ScratchSize: 2052
; GFX908-FIXME: ScratchSize: 0
; GFX908: ScratchSize: 0
; GCN: VGPRBlocks: 63
; GCN: NumVGPRsForWavesPerEU: 256
define amdgpu_kernel void @max_256_vgprs_spill_9x32_2bb(<32 x float> addrspace(1)* %p) #1 {

View File

@ -0,0 +1,224 @@
# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -amdgpu-enable-flat-scratch -verify-machineinstrs -run-pass=prologepilog -o - %s | FileCheck -check-prefix=GCN %s
# A spilled register can be restored to its superclass during regalloc.
# As a result, we might see AGPR spills restored to VGPRs or the other way around.
---
name: partial_spill_a128_restore_to_v128_1_of_4
tracksRegLiveness: true
stack:
- { id: 0, type: spill-slot, size: 16, alignment: 4 }
machineFunctionInfo:
hasSpilledVGPRs: true
stackPtrOffsetReg: '$sgpr32'
body: |
bb.0:
liveins: $vgpr52, $vgpr53, $vgpr54, $agpr0_agpr1_agpr2_agpr3, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
; GCN-LABEL: name: partial_spill_a128_restore_to_v128_1_of_4
; GCN: liveins: $vgpr52, $vgpr53, $vgpr54, $vgpr55, $agpr0_agpr1_agpr2_agpr3, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
; GCN: {{ $}}
; GCN: $vgpr55 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3, implicit $agpr0_agpr1_agpr2_agpr3
; GCN: SCRATCH_STORE_DWORDX3_SADDR killed $agpr0_agpr1_agpr2, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $agpr0_agpr1_agpr2_agpr3 :: (store (s96) into %stack.0, align 4, addrspace 5)
; GCN: $vgpr51 = COPY $vgpr55, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51
; GCN: $vgpr48_vgpr49_vgpr50 = SCRATCH_LOAD_DWORDX3_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51 :: (load (s96) from %stack.0, align 4, addrspace 5)
; GCN: S_ENDPGM 0, implicit $vgpr52, implicit $vgpr53, implicit $vgpr54, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
SI_SPILL_A128_SAVE killed $agpr0_agpr1_agpr2_agpr3, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, addrspace 5)
$vgpr48_vgpr49_vgpr50_vgpr51 = SI_SPILL_V128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5)
S_ENDPGM 0, implicit $vgpr52, implicit $vgpr53, implicit $vgpr54, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
...
---
name: partial_spill_a128_restore_to_v128_2_of_4
tracksRegLiveness: true
stack:
- { id: 0, type: spill-slot, size: 16, alignment: 4 }
machineFunctionInfo:
hasSpilledVGPRs: true
stackPtrOffsetReg: '$sgpr32'
body: |
bb.0:
liveins: $vgpr52, $vgpr53, $agpr0_agpr1_agpr2_agpr3, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
; GCN-LABEL: name: partial_spill_a128_restore_to_v128_2_of_4
; GCN: liveins: $vgpr52, $vgpr53, $vgpr54, $vgpr55, $agpr0_agpr1_agpr2_agpr3, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
; GCN: {{ $}}
; GCN: $vgpr54 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3, implicit $agpr0_agpr1_agpr2_agpr3
; GCN: $vgpr55 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
; GCN: SCRATCH_STORE_DWORDX2_SADDR killed $agpr0_agpr1, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $agpr0_agpr1_agpr2_agpr3 :: (store (s64) into %stack.0, align 4, addrspace 5)
; GCN: $vgpr51 = COPY $vgpr54, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51
; GCN: $vgpr50 = COPY $vgpr55, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51
; GCN: $vgpr48_vgpr49 = SCRATCH_LOAD_DWORDX2_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51 :: (load (s64) from %stack.0, align 4, addrspace 5)
; GCN: S_ENDPGM 0, implicit $vgpr52, implicit $vgpr53, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
SI_SPILL_A128_SAVE killed $agpr0_agpr1_agpr2_agpr3, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, addrspace 5)
$vgpr48_vgpr49_vgpr50_vgpr51 = SI_SPILL_V128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5)
S_ENDPGM 0, implicit $vgpr52, implicit $vgpr53, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
...
---
name: partial_spill_a128_restore_to_v128_3_of_4
tracksRegLiveness: true
stack:
- { id: 0, type: spill-slot, size: 16, alignment: 4 }
machineFunctionInfo:
hasSpilledVGPRs: true
stackPtrOffsetReg: '$sgpr32'
body: |
bb.0:
liveins: $vgpr52, $agpr0_agpr1_agpr2_agpr3, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
; GCN-LABEL: name: partial_spill_a128_restore_to_v128_3_of_4
; GCN: liveins: $vgpr52, $vgpr53, $vgpr54, $vgpr55, $agpr0_agpr1_agpr2_agpr3, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
; GCN: {{ $}}
; GCN: $vgpr53 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3, implicit $agpr0_agpr1_agpr2_agpr3
; GCN: $vgpr54 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
; GCN: $vgpr55 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
; GCN: SCRATCH_STORE_DWORD_SADDR killed $agpr0, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $agpr0_agpr1_agpr2_agpr3 :: (store (s32) into %stack.0, addrspace 5)
; GCN: $vgpr51 = COPY $vgpr53, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51
; GCN: $vgpr50 = COPY $vgpr54, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51
; GCN: $vgpr49 = COPY $vgpr55, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51
; GCN: $vgpr48 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51 :: (load (s32) from %stack.0, addrspace 5)
; GCN: S_ENDPGM 0, implicit $vgpr52, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
SI_SPILL_A128_SAVE killed $agpr0_agpr1_agpr2_agpr3, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, addrspace 5)
$vgpr48_vgpr49_vgpr50_vgpr51 = SI_SPILL_V128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5)
S_ENDPGM 0, implicit $vgpr52, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
...
---
name: full_spill_a128_restore_to_v128
tracksRegLiveness: true
stack:
- { id: 0, type: spill-slot, size: 16, alignment: 4 }
machineFunctionInfo:
hasSpilledVGPRs: true
stackPtrOffsetReg: '$sgpr32'
body: |
bb.0:
liveins: $agpr0_agpr1_agpr2_agpr3
; GCN-LABEL: name: full_spill_a128_restore_to_v128
; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $agpr0_agpr1_agpr2_agpr3
; GCN: {{ $}}
; GCN: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3, implicit $agpr0_agpr1_agpr2_agpr3
; GCN: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
; GCN: $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
; GCN: $vgpr3 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3
; GCN: $vgpr55 = COPY $vgpr0, implicit-def $vgpr52_vgpr53_vgpr54_vgpr55
; GCN: $vgpr54 = COPY $vgpr1, implicit-def $vgpr52_vgpr53_vgpr54_vgpr55
; GCN: $vgpr53 = COPY $vgpr2, implicit-def $vgpr52_vgpr53_vgpr54_vgpr55
; GCN: $vgpr52 = COPY $vgpr3, implicit-def $vgpr52_vgpr53_vgpr54_vgpr55
; GCN: S_ENDPGM 0
SI_SPILL_A128_SAVE killed $agpr0_agpr1_agpr2_agpr3, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, addrspace 5)
$vgpr52_vgpr53_vgpr54_vgpr55 = SI_SPILL_V128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5)
S_ENDPGM 0
...
---
name: partial_spill_v128_restore_to_a128_1_of_4
tracksRegLiveness: true
stack:
- { id: 0, type: spill-slot, size: 16, alignment: 4 }
machineFunctionInfo:
hasSpilledVGPRs: true
stackPtrOffsetReg: '$sgpr32'
body: |
bb.0:
liveins: $agpr31, $vgpr0_vgpr1_vgpr2_vgpr3, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23, $agpr24_agpr25
; GCN-LABEL: name: partial_spill_v128_restore_to_a128_1_of_4
; GCN: liveins: $agpr30, $agpr31, $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, $agpr24_agpr25, $vgpr0_vgpr1_vgpr2_vgpr3
; GCN: {{ $}}
; GCN: $agpr30 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr3, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1_vgpr2_vgpr3
; GCN: SCRATCH_STORE_DWORDX3_SADDR killed $vgpr0_vgpr1_vgpr2, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3 :: (store (s96) into %stack.0, align 4, addrspace 5)
; GCN: $agpr29 = COPY $agpr30, implicit-def $agpr26_agpr27_agpr28_agpr29
; GCN: $agpr26_agpr27_agpr28 = SCRATCH_LOAD_DWORDX3_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr26_agpr27_agpr28_agpr29 :: (load (s96) from %stack.0, align 4, addrspace 5)
; GCN: S_ENDPGM 0, implicit $agpr31, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23, implicit $agpr24_agpr25
SI_SPILL_V128_SAVE killed $vgpr0_vgpr1_vgpr2_vgpr3, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, addrspace 5)
$agpr26_agpr27_agpr28_agpr29 = SI_SPILL_A128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5)
S_ENDPGM 0, implicit $agpr31, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23, implicit $agpr24_agpr25
...
---
name: partial_spill_v128_restore_to_a128_2_of_4
tracksRegLiveness: true
stack:
- { id: 0, type: spill-slot, size: 16, alignment: 4 }
machineFunctionInfo:
hasSpilledVGPRs: true
stackPtrOffsetReg: '$sgpr32'
body: |
bb.0:
liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23, $agpr24_agpr25
; GCN-LABEL: name: partial_spill_v128_restore_to_a128_2_of_4
; GCN: liveins: $agpr30, $agpr31, $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, $agpr24_agpr25, $vgpr0_vgpr1_vgpr2_vgpr3
; GCN: {{ $}}
; GCN: $agpr30 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr3, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1_vgpr2_vgpr3
; GCN: $agpr31 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3
; GCN: SCRATCH_STORE_DWORDX2_SADDR killed $vgpr0_vgpr1, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3 :: (store (s64) into %stack.0, align 4, addrspace 5)
; GCN: $agpr29 = COPY $agpr30, implicit-def $agpr26_agpr27_agpr28_agpr29
; GCN: $agpr28 = COPY $agpr31, implicit-def $agpr26_agpr27_agpr28_agpr29
; GCN: $agpr26_agpr27 = SCRATCH_LOAD_DWORDX2_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr26_agpr27_agpr28_agpr29 :: (load (s64) from %stack.0, align 4, addrspace 5)
; GCN: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23, implicit $agpr24_agpr25
SI_SPILL_V128_SAVE killed $vgpr0_vgpr1_vgpr2_vgpr3, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, addrspace 5)
$agpr26_agpr27_agpr28_agpr29 = SI_SPILL_A128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5)
S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23, implicit $agpr24_agpr25
...
---
name: partial_spill_v128_restore_to_a128_3_of_4
tracksRegLiveness: true
stack:
- { id: 0, type: spill-slot, size: 16, alignment: 4 }
machineFunctionInfo:
hasSpilledVGPRs: true
stackPtrOffsetReg: '$sgpr32'
body: |
bb.0:
liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23, $agpr24
; GCN-LABEL: name: partial_spill_v128_restore_to_a128_3_of_4
; GCN: liveins: $agpr24, $agpr25, $agpr30, $agpr31, $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, $vgpr0_vgpr1_vgpr2_vgpr3
; GCN: {{ $}}
; GCN: $agpr25 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr3, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1_vgpr2_vgpr3
; GCN: $agpr30 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3
; GCN: $agpr31 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3
; GCN: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3 :: (store (s32) into %stack.0, addrspace 5)
; GCN: $agpr29 = COPY $agpr25, implicit-def $agpr26_agpr27_agpr28_agpr29
; GCN: $agpr28 = COPY $agpr30, implicit-def $agpr26_agpr27_agpr28_agpr29
; GCN: $agpr27 = COPY $agpr31, implicit-def $agpr26_agpr27_agpr28_agpr29
; GCN: $agpr26 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr26_agpr27_agpr28_agpr29 :: (load (s32) from %stack.0, addrspace 5)
; GCN: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23, implicit $agpr24
SI_SPILL_V128_SAVE killed $vgpr0_vgpr1_vgpr2_vgpr3, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, addrspace 5)
$agpr26_agpr27_agpr28_agpr29 = SI_SPILL_A128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5)
S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23, implicit $agpr24
...
---
name: full_spill_v128_restore_to_a128
tracksRegLiveness: true
stack:
- { id: 0, type: spill-slot, size: 16, alignment: 4 }
machineFunctionInfo:
hasSpilledVGPRs: true
stackPtrOffsetReg: '$sgpr32'
body: |
bb.0:
liveins: $vgpr0_vgpr1_vgpr2_vgpr3
; GCN-LABEL: name: full_spill_v128_restore_to_a128
; GCN: liveins: $agpr4, $agpr5, $agpr6, $agpr7, $vgpr0_vgpr1_vgpr2_vgpr3
; GCN: {{ $}}
; GCN: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr3, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1_vgpr2_vgpr3
; GCN: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3
; GCN: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3
; GCN: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3
; GCN: $agpr3 = COPY $agpr4, implicit-def $agpr0_agpr1_agpr2_agpr3
; GCN: $agpr2 = COPY $agpr5, implicit-def $agpr0_agpr1_agpr2_agpr3
; GCN: $agpr1 = COPY $agpr6, implicit-def $agpr0_agpr1_agpr2_agpr3
; GCN: $agpr0 = COPY $agpr7, implicit-def $agpr0_agpr1_agpr2_agpr3
; GCN: S_ENDPGM 0
SI_SPILL_V128_SAVE killed $vgpr0_vgpr1_vgpr2_vgpr3, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, addrspace 5)
$agpr0_agpr1_agpr2_agpr3 = SI_SPILL_A128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5)
S_ENDPGM 0
...