[AMDGPU] Do not allow register coalescer to create big superregs

Limit register coalescer by not allowing it to artificially increase
size of registers beyond dword. Such super-registers are in fact
register sequences and not distinct HW registers.

With more super-regs we would need to allocate adjacent registers
and constraint regalloc more than needed. Moreover, our super
registers are overlapping. For instance we have VGPR0_VGPR1_VGPR2,
VGPR1_VGPR2_VGPR3, VGPR2_VGPR3_VGPR4 etc, which complicates registers
allocation even more, resulting in excessive spilling.

Differential Revision: https://reviews.llvm.org/D28782

llvm-svn: 292413
This commit is contained in:
Stanislav Mekhanoshin 2017-01-18 17:30:05 +00:00
parent fde0104649
commit a4e63ead4b
4 changed files with 106 additions and 8 deletions

View File

@ -1474,3 +1474,23 @@ bool SIRegisterInfo::isVGPR(const MachineRegisterInfo &MRI,
unsigned Reg) const {
return hasVGPRs(getRegClassForReg(MRI, Reg));
}
bool SIRegisterInfo::shouldCoalesce(MachineInstr *MI,
const TargetRegisterClass *SrcRC,
unsigned SubReg,
const TargetRegisterClass *DstRC,
unsigned DstSubReg,
const TargetRegisterClass *NewRC) const {
unsigned SrcSize = SrcRC->getSize();
unsigned DstSize = DstRC->getSize();
unsigned NewSize = NewRC->getSize();
// Do not increase size of registers beyond dword, we would need to allocate
// adjacent registers and constraint regalloc more than needed.
// Always allow dword coalescing.
if (SrcSize <= 4 || DstSize <= 4)
return true;
return NewSize <= DstSize || NewSize <= SrcSize;
}

View File

@ -264,6 +264,13 @@ public:
ArrayRef<int16_t> getRegSplitParts(const TargetRegisterClass *RC,
unsigned EltSize) const;
bool shouldCoalesce(MachineInstr *MI,
const TargetRegisterClass *SrcRC,
unsigned SubReg,
const TargetRegisterClass *DstRC,
unsigned DstSubReg,
const TargetRegisterClass *NewRC) const override;
private:
void buildSpillLoadStore(MachineBasicBlock::iterator MI,
unsigned LoadStoreOp,

View File

@ -399,15 +399,15 @@ define void @global_extload_v2f16_to_v2f64(<2 x double> addrspace(1)* %out, <2 x
; XVI-NOT: v_cvt_f32_f16
; GCN: buffer_load_dwordx2 v{{\[}}[[IN_LO:[0-9]+]]:[[IN_HI:[0-9]+]]
; VI: v_lshrrev_b32_e32 [[Y16:v[0-9]+]], 16, v[[IN_LO]]
; GCN: v_cvt_f32_f16_e32 [[Z32:v[0-9]+]], v[[IN_HI]]
; GCN: v_cvt_f32_f16_e32 [[X32:v[0-9]+]], v[[IN_LO]]
; SI: v_lshrrev_b32_e32 [[Y16:v[0-9]+]], 16, v[[IN_LO]]
; GCN: v_cvt_f32_f16_e32 [[Y32:v[0-9]+]], [[Y16]]
; VI-DAG: v_lshrrev_b32_e32 [[Y16:v[0-9]+]], 16, v[[IN_LO]]
; GCN-DAG: v_cvt_f32_f16_e32 [[Z32:v[0-9]+]], v[[IN_HI]]
; GCN-DAG: v_cvt_f32_f16_e32 [[X32:v[0-9]+]], v[[IN_LO]]
; SI: v_lshrrev_b32_e32 [[Y16:v[0-9]+]], 16, v[[IN_LO]]
; GCN-DAG: v_cvt_f32_f16_e32 [[Y32:v[0-9]+]], [[Y16]]
; GCN: v_cvt_f64_f32_e32 [[Z:v\[[0-9]+:[0-9]+\]]], [[Z32]]
; GCN: v_cvt_f64_f32_e32 v{{\[}}[[XLO:[0-9]+]]:{{[0-9]+}}], [[X32]]
; GCN: v_cvt_f64_f32_e32 v[{{[0-9]+}}:[[YHI:[0-9]+]]{{\]}}, [[Y32]]
; GCN-DAG: v_cvt_f64_f32_e32 [[Z:v\[[0-9]+:[0-9]+\]]], [[Z32]]
; GCN-DAG: v_cvt_f64_f32_e32 v{{\[}}[[XLO:[0-9]+]]:{{[0-9]+}}], [[X32]]
; GCN-DAG: v_cvt_f64_f32_e32 v[{{[0-9]+}}:[[YHI:[0-9]+]]{{\]}}, [[Y32]]
; GCN-NOT: v_cvt_f64_f32_e32
; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[XLO]]:[[YHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}

View File

@ -0,0 +1,71 @@
# RUN: llc -march=amdgcn -run-pass simple-register-coalescing -o - %s | FileCheck %s
# Check that coalescer does not create wider register tuple than in source
# CHECK: - { id: 2, class: vreg_64 }
# CHECK: - { id: 3, class: vreg_64 }
# CHECK: - { id: 4, class: vreg_64 }
# CHECK: - { id: 5, class: vreg_96 }
# CHECK: - { id: 6, class: vreg_96 }
# CHECK: - { id: 7, class: vreg_128 }
# CHECK: - { id: 8, class: vreg_128 }
# No more registers shall be defined
# CHECK-NEXT: liveins:
# CHECK: FLAT_STORE_DWORDX2 %vgpr0_vgpr1, %4,
# CHECK: FLAT_STORE_DWORDX3 %vgpr0_vgpr1, %6,
---
name: main
alignment: 0
exposesReturnsTwice: false
legalized: false
regBankSelected: false
selected: false
tracksRegLiveness: true
registers:
- { id: 1, class: sreg_32_xm0, preferred-register: '%1' }
- { id: 2, class: vreg_64, preferred-register: '%2' }
- { id: 3, class: vreg_64 }
- { id: 4, class: vreg_64 }
- { id: 5, class: vreg_64 }
- { id: 6, class: vreg_96 }
- { id: 7, class: vreg_96 }
- { id: 8, class: vreg_128 }
- { id: 9, class: vreg_128 }
liveins:
- { reg: '%sgpr6', virtual-reg: '%1' }
frameInfo:
isFrameAddressTaken: false
isReturnAddressTaken: false
hasStackMap: false
hasPatchPoint: false
stackSize: 0
offsetAdjustment: 0
maxAlignment: 0
adjustsStack: false
hasCalls: false
maxCallFrameSize: 0
hasOpaqueSPAdjustment: false
hasVAStart: false
hasMustTailInVarArgFunc: false
body: |
bb.0.entry:
liveins: %sgpr0, %vgpr0_vgpr1
%3 = IMPLICIT_DEF
undef %4.sub0 = COPY %sgpr0
%4.sub1 = COPY %3.sub0
undef %5.sub0 = COPY %4.sub1
%5.sub1 = COPY %4.sub0
FLAT_STORE_DWORDX2 %vgpr0_vgpr1, killed %5, 0, 0, 0, implicit %exec, implicit %flat_scr
%6 = IMPLICIT_DEF
undef %7.sub0_sub1 = COPY %6
%7.sub2 = COPY %3.sub0
FLAT_STORE_DWORDX3 %vgpr0_vgpr1, killed %7, 0, 0, 0, implicit %exec, implicit %flat_scr
%8 = IMPLICIT_DEF
undef %9.sub0_sub1_sub2 = COPY %8
%9.sub3 = COPY %3.sub0
FLAT_STORE_DWORDX4 %vgpr0_vgpr1, killed %9, 0, 0, 0, implicit %exec, implicit %flat_scr
...