AMDGPU: Distribute SGPR->VGPR copies of REG_SEQUENCE

Make the REG_SEQUENCE be a VGPR, and do the register class
copy first.

llvm-svn: 251855
This commit is contained in:
Matt Arsenault 2015-11-02 23:15:42 +00:00
parent 7f4df5432e
commit 0de924b76d
7 changed files with 111 additions and 62 deletions

View File

@ -85,18 +85,6 @@ class SIFixSGPRCopies : public MachineFunctionPass {
private:
static char ID;
std::pair<const TargetRegisterClass *, const TargetRegisterClass *>
getCopyRegClasses(const MachineInstr &Copy,
const SIRegisterInfo &TRI,
const MachineRegisterInfo &MRI) const;
bool isVGPRToSGPRCopy(const TargetRegisterClass *SrcRC,
const TargetRegisterClass *DstRC,
const SIRegisterInfo &TRI) const;
bool isSGPRToVGPRCopy(const TargetRegisterClass *SrcRC,
const TargetRegisterClass *DstRC,
const SIRegisterInfo &TRI) const;
public:
SIFixSGPRCopies(TargetMachine &tm) : MachineFunctionPass(ID) { }
@ -134,10 +122,10 @@ static bool hasVGPROperands(const MachineInstr &MI, const SIRegisterInfo *TRI) {
return false;
}
std::pair<const TargetRegisterClass *, const TargetRegisterClass *>
SIFixSGPRCopies::getCopyRegClasses(const MachineInstr &Copy,
const SIRegisterInfo &TRI,
const MachineRegisterInfo &MRI) const {
static std::pair<const TargetRegisterClass *, const TargetRegisterClass *>
getCopyRegClasses(const MachineInstr &Copy,
const SIRegisterInfo &TRI,
const MachineRegisterInfo &MRI) {
unsigned DstReg = Copy.getOperand(0).getReg();
unsigned SrcReg = Copy.getOperand(1).getReg();
@ -157,18 +145,94 @@ SIFixSGPRCopies::getCopyRegClasses(const MachineInstr &Copy,
return std::make_pair(SrcRC, DstRC);
}
bool SIFixSGPRCopies::isVGPRToSGPRCopy(const TargetRegisterClass *SrcRC,
const TargetRegisterClass *DstRC,
const SIRegisterInfo &TRI) const {
static bool isVGPRToSGPRCopy(const TargetRegisterClass *SrcRC,
const TargetRegisterClass *DstRC,
const SIRegisterInfo &TRI) {
return TRI.isSGPRClass(DstRC) && TRI.hasVGPRs(SrcRC);
}
bool SIFixSGPRCopies::isSGPRToVGPRCopy(const TargetRegisterClass *SrcRC,
const TargetRegisterClass *DstRC,
const SIRegisterInfo &TRI) const {
static bool isSGPRToVGPRCopy(const TargetRegisterClass *SrcRC,
const TargetRegisterClass *DstRC,
const SIRegisterInfo &TRI) {
return TRI.isSGPRClass(SrcRC) && TRI.hasVGPRs(DstRC);
}
// Distribute an SGPR->VGPR copy of a REG_SEQUENCE into a VGPR REG_SEQUENCE.
//
// SGPRx = ...
// SGPRy = REG_SEQUENCE SGPRx, sub0 ...
// VGPRz = COPY SGPRy
//
// ==>
//
// VGPRx = COPY SGPRx
// VGPRz = REG_SEQUENCE VGPRx, sub0
//
// This exposes immediate folding opportunities when materializing 64-bit
// immediates.
static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI,
const SIRegisterInfo *TRI,
const SIInstrInfo *TII,
MachineRegisterInfo &MRI) {
assert(MI.isRegSequence());
unsigned DstReg = MI.getOperand(0).getReg();
if (!TRI->isSGPRClass(MRI.getRegClass(DstReg)))
return false;
if (!MRI.hasOneUse(DstReg))
return false;
MachineInstr &CopyUse = *MRI.use_instr_begin(DstReg);
if (!CopyUse.isCopy())
return false;
const TargetRegisterClass *SrcRC, *DstRC;
std::tie(SrcRC, DstRC) = getCopyRegClasses(CopyUse, *TRI, MRI);
if (!isSGPRToVGPRCopy(SrcRC, DstRC, *TRI))
return false;
// TODO: Could have multiple extracts?
unsigned SubReg = CopyUse.getOperand(1).getSubReg();
if (SubReg != AMDGPU::NoSubRegister)
return false;
MRI.setRegClass(DstReg, DstRC);
// SGPRx = ...
// SGPRy = REG_SEQUENCE SGPRx, sub0 ...
// VGPRz = COPY SGPRy
// =>
// VGPRx = COPY SGPRx
// VGPRz = REG_SEQUENCE VGPRx, sub0
MI.getOperand(0).setReg(CopyUse.getOperand(0).getReg());
for (unsigned I = 1, N = MI.getNumOperands(); I != N; I += 2) {
unsigned SrcReg = MI.getOperand(I).getReg();
unsigned SrcSubReg = MI.getOperand(I).getReg();
const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
assert(TRI->isSGPRClass(SrcRC) &&
"Expected SGPR REG_SEQUENCE to only have SGPR inputs");
SrcRC = TRI->getSubRegClass(SrcRC, SrcSubReg);
const TargetRegisterClass *NewSrcRC = TRI->getEquivalentVGPRClass(SrcRC);
unsigned TmpReg = MRI.createVirtualRegister(NewSrcRC);
BuildMI(*MI.getParent(), &MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY), TmpReg)
.addOperand(MI.getOperand(I));
MI.getOperand(I).setReg(TmpReg);
}
CopyUse.eraseFromParent();
return true;
}
bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
MachineRegisterInfo &MRI = MF.getRegInfo();
const SIRegisterInfo *TRI =
@ -273,8 +337,10 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
}
case AMDGPU::REG_SEQUENCE: {
if (TRI->hasVGPRs(TII->getOpRegClass(MI, 0)) ||
!hasVGPROperands(MI, TRI))
!hasVGPROperands(MI, TRI)) {
foldVGPRCopyIntoRegSequence(MI, TRI, TII, MRI);
continue;
}
DEBUG(dbgs() << "Fixing REG_SEQUENCE: " << MI);

View File

@ -3,8 +3,7 @@
; Use a 64-bit value with lo bits that can be represented as an inline constant
; CHECK-LABEL: {{^}}i64_imm_inline_lo:
; CHECK: s_mov_b32 [[LO:s[0-9]+]], 5
; CHECK: v_mov_b32_e32 v[[LO_VGPR:[0-9]+]], [[LO]]
; CHECK: v_mov_b32_e32 v[[LO_VGPR:[0-9]+]], 5
; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VGPR]]:
define void @i64_imm_inline_lo(i64 addrspace(1) *%out) {
entry:
@ -14,8 +13,7 @@ entry:
; Use a 64-bit value with hi bits that can be represented as an inline constant
; CHECK-LABEL: {{^}}i64_imm_inline_hi:
; CHECK: s_mov_b32 [[HI:s[0-9]+]], 5
; CHECK: v_mov_b32_e32 v[[HI_VGPR:[0-9]+]], [[HI]]
; CHECK: v_mov_b32_e32 v[[HI_VGPR:[0-9]+]], 5
; CHECK: buffer_store_dwordx2 v{{\[[0-9]+:}}[[HI_VGPR]]
define void @i64_imm_inline_hi(i64 addrspace(1) *%out) {
entry:
@ -24,10 +22,8 @@ entry:
}
; CHECK-LABEL: {{^}}store_imm_neg_0.0_i64:
; CHECK-DAG: s_mov_b32 s[[HI_SREG:[0-9]+]], 0x80000000
; CHECK-DAG: s_mov_b32 s[[LO_SREG:[0-9]+]], 0{{$}}
; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG]]
; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], s[[HI_SREG]]
; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}}
; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0x80000000
; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
define void @store_imm_neg_0.0_i64(i64 addrspace(1) *%out) {
store i64 -9223372036854775808, i64 addrspace(1) *%out
@ -523,10 +519,8 @@ define void @store_inline_imm_0.0_f64(double addrspace(1)* %out) {
; CHECK-LABEL: {{^}}store_literal_imm_neg_0.0_f64:
; CHECK-DAG: s_mov_b32 s[[HI_SREG:[0-9]+]], 0x80000000
; CHECK-DAG: s_mov_b32 s[[LO_SREG:[0-9]+]], 0{{$}}
; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG]]
; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], s[[HI_SREG]]
; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}}
; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0x80000000
; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
define void @store_literal_imm_neg_0.0_f64(double addrspace(1)* %out) {
store double -0.0, double addrspace(1)* %out
@ -606,10 +600,8 @@ define void @store_inline_imm_m_4.0_f64(double addrspace(1)* %out) {
}
; CHECK-LABEL: {{^}}store_literal_imm_f64:
; CHECK-DAG: s_mov_b32 s[[HI_SREG:[0-9]+]], 0x40b00000
; CHECK-DAG: s_mov_b32 s[[LO_SREG:[0-9]+]], 0{{$}}
; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG]]
; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], s[[HI_SREG]]
; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}}
; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0x40b00000
; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
define void @store_literal_imm_f64(double addrspace(1)* %out) {
store double 4096.0, double addrspace(1)* %out

View File

@ -68,10 +68,8 @@ define void @merge_global_store_2_constants_i16_natural_align(i16 addrspace(1)*
}
; GCN-LABEL: {{^}}merge_global_store_2_constants_i32:
; SI-DAG: s_movk_i32 [[SLO:s[0-9]+]], 0x1c8
; SI-DAG: s_movk_i32 [[SHI:s[0-9]+]], 0x7b
; SI-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[SLO]]
; SI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHI]]
; SI-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0x1c8
; SI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7b
; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
define void @merge_global_store_2_constants_i32(i32 addrspace(1)* %out) #0 {
%out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
@ -92,10 +90,8 @@ define void @merge_global_store_2_constants_i32_f32(i32 addrspace(1)* %out) #0 {
}
; GCN-LABEL: {{^}}merge_global_store_2_constants_f32_i32:
; SI-DAG: s_mov_b32 [[SLO:s[0-9]+]], 4.0
; SI-DAG: s_movk_i32 [[SHI:s[0-9]+]], 0x7b{{$}}
; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], [[SLO]]
; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], [[SHI]]
; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], 4.0
; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], 0x7b
; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
define void @merge_global_store_2_constants_f32_i32(float addrspace(1)* %out) #0 {
%out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1

View File

@ -185,8 +185,7 @@ define void @shl_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in
; Make sure load width gets reduced to i32 load.
; GCN-LABEL: {{^}}s_shl_32_i64:
; GCN-DAG: s_load_dword [[LO_A:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb{{$}}
; GCN-DAG: s_mov_b32 s[[SLO:[0-9]+]], 0{{$}}
; GCN-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SLO]]
; GCN-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], 0{{$}}
; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], [[LO_A]]
; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
define void @s_shl_32_i64(i64 addrspace(1)* %out, i64 %a) {

View File

@ -190,8 +190,7 @@ define void @lshr_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %i
; Make sure load width gets reduced to i32 load.
; GCN-LABEL: {{^}}s_lshr_32_i64:
; GCN-DAG: s_load_dword [[HI_A:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc{{$}}
; GCN-DAG: s_mov_b32 s[[SHI:[0-9]+]], 0{{$}}
; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]]
; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], 0{{$}}
; GCN-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], [[HI_A]]
; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
define void @s_lshr_32_i64(i64 addrspace(1)* %out, i64 %a) {

View File

@ -245,18 +245,16 @@ define void @test_s0_s1_k_f32(float addrspace(1)* %out, float %a, float %b) #0 {
; GCN-LABEL: {{^}}test_s0_s1_k_f64:
; GCN-DAG: s_load_dwordx2 [[SGPR0:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
; GCN-DAG: s_load_dwordx2 s{{\[}}[[SGPR1_SUB0:[0-9]+]]:[[SGPR1_SUB1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xd|0x34}}
; GCN-DAG: s_mov_b32 s[[SK0_SUB1:[0-9]+]], 0x40900000
; GCN-DAG: s_mov_b32 s[[SZERO:[0-9]+]], 0{{$}}
; GCN: v_mov_b32_e32 v[[VK0_SUB0:[0-9]+]], s[[SZERO]]
; GCN: v_mov_b32_e32 v[[VK0_SUB1:[0-9]+]], s[[SK0_SUB1]]
; GCN-DAG: v_mov_b32_e32 v[[VK0_SUB1:[0-9]+]], 0x40900000
; GCN-DAG: v_mov_b32_e32 v[[VZERO:[0-9]+]], 0{{$}}
; GCN-DAG: s_mov_b32 s[[SK1_SUB1:[0-9]+]], 0x40b00000{{$}}
; GCN-DAG: v_mov_b32_e32 v[[VS1_SUB0:[0-9]+]], s[[SGPR1_SUB0]]
; GCN-DAG: v_mov_b32_e32 v[[VS1_SUB1:[0-9]+]], s[[SGPR1_SUB1]]
; GCN-DAG: v_mov_b32_e32 v[[VK1_SUB0:[0-9]+]], s[[SZERO]]
; GCN-DAG: v_mov_b32_e32 v[[VK1_SUB1:[0-9]+]], s[[SK1_SUB1]]
; GCN-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[SGPR0]], v{{\[}}[[VS1_SUB0]]:[[VS1_SUB1]]{{\]}}, v{{\[}}[[VK0_SUB0]]:[[VK0_SUB1]]{{\]}}
; GCN-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[SGPR0]], v{{\[}}[[VS1_SUB0]]:[[VS1_SUB1]]{{\]}}, v{{\[}}[[VK1_SUB0]]:[[VK1_SUB1]]{{\]}}
; GCN: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[VS1_SUB0]]:[[VS1_SUB1]]{{\]}}, [[SGPR0]], v{{\[}}[[VZERO]]:[[VK0_SUB1]]{{\]}}
; Same zero component is re-used for half of each immediate.
; GCN: v_mov_b32_e32 v[[VK1_SUB1:[0-9]+]], 0x40b00000
; GCN: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[VS1_SUB0]]:[[VS1_SUB1]]{{\]}}, [[SGPR0]], v{{\[}}[[VZERO]]:[[VK1_SUB1]]{{\]}}
; GCN: buffer_store_dwordx2 [[RESULT0]]
; GCN: buffer_store_dwordx2 [[RESULT1]]

View File

@ -7,8 +7,7 @@
; R600: MEM_RAT_CACHELESS STORE_RAW
; SI: {{^}}test:
; SI: s_mov_b32 [[ZERO:s[0-9]]], 0{{$}}
; SI: v_mov_b32_e32 v[[V_ZERO:[0-9]]], [[ZERO]]
; SI: v_mov_b32_e32 v[[V_ZERO:[0-9]]], 0{{$}}
; SI: buffer_store_dwordx2 v[0:[[V_ZERO]]{{\]}}
define void @test(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
entry: