From a4e63ead4b43b0f6be70744b553a2dcb9bdbf605 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin
Date: Wed, 18 Jan 2017 17:30:05 +0000
Subject: [PATCH] [AMDGPU] Do not allow register coalescer to create big
 superregs

Limit the register coalescer by not allowing it to artificially increase
the size of registers beyond a dword. Such super-registers are in fact
register sequences, not distinct HW registers.

With more super-regs we would need to allocate adjacent registers and
constrain regalloc more than needed. Moreover, our super-registers
overlap: for instance, we have VGPR0_VGPR1_VGPR2, VGPR1_VGPR2_VGPR3,
VGPR2_VGPR3_VGPR4, etc., which complicates register allocation even
more, resulting in excessive spilling.

Differential Revision: https://reviews.llvm.org/D28782

llvm-svn: 292413
---
 llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp   | 20 ++++++
 llvm/lib/Target/AMDGPU/SIRegisterInfo.h     |  7 ++
 llvm/test/CodeGen/AMDGPU/half.ll            | 16 ++---
 llvm/test/CodeGen/AMDGPU/limit-coalesce.mir | 71 +++++++++++++++++++++
 4 files changed, 106 insertions(+), 8 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/limit-coalesce.mir

diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 612599b1283b..84eb246800a9 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -1474,3 +1474,23 @@ bool SIRegisterInfo::isVGPR(const MachineRegisterInfo &MRI,
                             unsigned Reg) const {
   return hasVGPRs(getRegClassForReg(MRI, Reg));
 }
+
+bool SIRegisterInfo::shouldCoalesce(MachineInstr *MI,
+                                    const TargetRegisterClass *SrcRC,
+                                    unsigned SubReg,
+                                    const TargetRegisterClass *DstRC,
+                                    unsigned DstSubReg,
+                                    const TargetRegisterClass *NewRC) const {
+  unsigned SrcSize = SrcRC->getSize();
+  unsigned DstSize = DstRC->getSize();
+  unsigned NewSize = NewRC->getSize();
+
+  // Do not increase the size of registers beyond a dword; we would need to
+  // allocate adjacent registers and constrain regalloc more than needed.
+
+  // Always allow dword coalescing.
+  if (SrcSize <= 4 || DstSize <= 4)
+    return true;
+
+  return NewSize <= DstSize || NewSize <= SrcSize;
+}
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
index 0bcae7d9840c..29c72b6a8f80 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -264,6 +264,13 @@ public:
   ArrayRef<int16_t> getRegSplitParts(const TargetRegisterClass *RC,
                                      unsigned EltSize) const;
 
+  bool shouldCoalesce(MachineInstr *MI,
+                      const TargetRegisterClass *SrcRC,
+                      unsigned SubReg,
+                      const TargetRegisterClass *DstRC,
+                      unsigned DstSubReg,
+                      const TargetRegisterClass *NewRC) const override;
+
 private:
   void buildSpillLoadStore(MachineBasicBlock::iterator MI,
                            unsigned LoadStoreOp,
diff --git a/llvm/test/CodeGen/AMDGPU/half.ll b/llvm/test/CodeGen/AMDGPU/half.ll
index f2bb3f9d110a..e823f736f1ed 100644
--- a/llvm/test/CodeGen/AMDGPU/half.ll
+++ b/llvm/test/CodeGen/AMDGPU/half.ll
@@ -399,15 +399,15 @@ define void @global_extload_v2f16_to_v2f64(<2 x double> addrspace(1)* %out, <2 x
 ; XVI-NOT: v_cvt_f32_f16
 
 ; GCN: buffer_load_dwordx2 v{{\[}}[[IN_LO:[0-9]+]]:[[IN_HI:[0-9]+]]
-; VI: v_lshrrev_b32_e32 [[Y16:v[0-9]+]], 16, v[[IN_LO]]
-; GCN: v_cvt_f32_f16_e32 [[Z32:v[0-9]+]], v[[IN_HI]]
-; GCN: v_cvt_f32_f16_e32 [[X32:v[0-9]+]], v[[IN_LO]]
-; SI: v_lshrrev_b32_e32 [[Y16:v[0-9]+]], 16, v[[IN_LO]]
-; GCN: v_cvt_f32_f16_e32 [[Y32:v[0-9]+]], [[Y16]]
+; VI-DAG: v_lshrrev_b32_e32 [[Y16:v[0-9]+]], 16, v[[IN_LO]]
+; GCN-DAG: v_cvt_f32_f16_e32 [[Z32:v[0-9]+]], v[[IN_HI]]
+; GCN-DAG: v_cvt_f32_f16_e32 [[X32:v[0-9]+]], v[[IN_LO]]
+; SI: v_lshrrev_b32_e32 [[Y16:v[0-9]+]], 16, v[[IN_LO]]
+; GCN-DAG: v_cvt_f32_f16_e32 [[Y32:v[0-9]+]], [[Y16]]
 
-; GCN: v_cvt_f64_f32_e32 [[Z:v\[[0-9]+:[0-9]+\]]], [[Z32]]
-; GCN: v_cvt_f64_f32_e32 v{{\[}}[[XLO:[0-9]+]]:{{[0-9]+}}], [[X32]]
-; GCN: v_cvt_f64_f32_e32 v[{{[0-9]+}}:[[YHI:[0-9]+]]{{\]}}, [[Y32]]
+; GCN-DAG: v_cvt_f64_f32_e32 [[Z:v\[[0-9]+:[0-9]+\]]], [[Z32]]
+; GCN-DAG: v_cvt_f64_f32_e32 v{{\[}}[[XLO:[0-9]+]]:{{[0-9]+}}], [[X32]]
+; GCN-DAG: v_cvt_f64_f32_e32 v[{{[0-9]+}}:[[YHI:[0-9]+]]{{\]}}, [[Y32]]
 ; GCN-NOT: v_cvt_f64_f32_e32
 
 ; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[XLO]]:[[YHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
diff --git a/llvm/test/CodeGen/AMDGPU/limit-coalesce.mir b/llvm/test/CodeGen/AMDGPU/limit-coalesce.mir
new file mode 100644
index 000000000000..106a96e32dc3
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/limit-coalesce.mir
@@ -0,0 +1,71 @@
+# RUN: llc -march=amdgcn -run-pass simple-register-coalescing -o - %s | FileCheck %s
+
+# Check that the coalescer does not create a register tuple wider than in the source
+
+# CHECK: - { id: 2, class: vreg_64 }
+# CHECK: - { id: 3, class: vreg_64 }
+# CHECK: - { id: 4, class: vreg_64 }
+# CHECK: - { id: 5, class: vreg_96 }
+# CHECK: - { id: 6, class: vreg_96 }
+# CHECK: - { id: 7, class: vreg_128 }
+# CHECK: - { id: 8, class: vreg_128 }
+# No more registers should be defined
+# CHECK-NEXT: liveins:
+# CHECK: FLAT_STORE_DWORDX2 %vgpr0_vgpr1, %4,
+# CHECK: FLAT_STORE_DWORDX3 %vgpr0_vgpr1, %6,
+
+---
+name: main
+alignment: 0
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+  - { id: 1, class: sreg_32_xm0, preferred-register: '%1' }
+  - { id: 2, class: vreg_64, preferred-register: '%2' }
+  - { id: 3, class: vreg_64 }
+  - { id: 4, class: vreg_64 }
+  - { id: 5, class: vreg_64 }
+  - { id: 6, class: vreg_96 }
+  - { id: 7, class: vreg_96 }
+  - { id: 8, class: vreg_128 }
+  - { id: 9, class: vreg_128 }
+liveins:
+ - { reg: '%sgpr6', virtual-reg: '%1' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0.entry: + liveins: %sgpr0, %vgpr0_vgpr1 + + %3 = IMPLICIT_DEF + undef %4.sub0 = COPY %sgpr0 + %4.sub1 = COPY %3.sub0 + undef %5.sub0 = COPY %4.sub1 + %5.sub1 = COPY %4.sub0 + FLAT_STORE_DWORDX2 %vgpr0_vgpr1, killed %5, 0, 0, 0, implicit %exec, implicit %flat_scr + + %6 = IMPLICIT_DEF + undef %7.sub0_sub1 = COPY %6 + %7.sub2 = COPY %3.sub0 + FLAT_STORE_DWORDX3 %vgpr0_vgpr1, killed %7, 0, 0, 0, implicit %exec, implicit %flat_scr + + %8 = IMPLICIT_DEF + undef %9.sub0_sub1_sub2 = COPY %8 + %9.sub3 = COPY %3.sub0 + FLAT_STORE_DWORDX4 %vgpr0_vgpr1, killed %9, 0, 0, 0, implicit %exec, implicit %flat_scr +...
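
For reference, the new hook boils down to a pure size comparison. Below is a
minimal standalone C++ sketch of that policy, runnable outside LLVM; the
helper name shouldCoalesceSketch and the byte sizes in main are illustrative
stand-ins for TargetRegisterClass::getSize() results, not values taken from
the AMDGPU register class definitions.

#include <cstdio>

// Mirrors the size check in SIRegisterInfo::shouldCoalesce: dword-sized
// (<= 4 byte) inputs may always coalesce; otherwise the merged class must
// be no wider than at least one of the two inputs.
static bool shouldCoalesceSketch(unsigned SrcSize, unsigned DstSize,
                                 unsigned NewSize) {
  // Always allow dword coalescing.
  if (SrcSize <= 4 || DstSize <= 4)
    return true;
  // Do not grow beyond both inputs; that would force the allocator to
  // find adjacent registers for an artificially wide tuple.
  return NewSize <= DstSize || NewSize <= SrcSize;
}

int main() {
  // A 4-byte VGPR copied into an 8-byte pair: allowed (dword case).
  printf("4 -> 8:   %s\n", shouldCoalesceSketch(4, 8, 8) ? "coalesce" : "keep copy");
  // Two 8-byte pairs that would merge into a 16-byte quad: rejected.
  printf("8 + 8:    %s\n", shouldCoalesceSketch(8, 8, 16) ? "coalesce" : "keep copy");
  // A 12-byte triple that stays 12 bytes wide: allowed.
  printf("12 -> 12: %s\n", shouldCoalesceSketch(12, 12, 12) ? "coalesce" : "keep copy");
  return 0;
}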