AMDGPU/LoadStoreOptimizer: combine MMOs when merging instructions
Summary: The LoadStoreOptimizer was creating instructions with 2 MachineMemOperands, which meant they were assumed to alias with all other instructions, because MachineInstr::mayAlias() returns true when an instruction has multiple MachineMemOperands. This was preventing these instructions from being merged again, and was giving the scheduler less freedom to reorder them. Reviewers: arsenm, nhaehnle Reviewed By: arsenm Subscribers: kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D65036 llvm-svn: 367237
This commit is contained in:
parent
899bdaa8c2
commit
cc0bc941d4
|
@ -307,6 +307,16 @@ static bool canMoveInstsAcrossMemOp(MachineInstr &MemOp,
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// This function assumes that \p A and \p B have are identical except for
|
||||||
|
// size and offset, and they referecne adjacent memory.
|
||||||
|
static MachineMemOperand *combineKnownAdjacentMMOs(MachineFunction &MF,
|
||||||
|
const MachineMemOperand *A,
|
||||||
|
const MachineMemOperand *B) {
|
||||||
|
unsigned MinOffset = std::min(A->getOffset(), B->getOffset());
|
||||||
|
unsigned Size = A->getSize() + B->getSize();
|
||||||
|
return MF.getMachineMemOperand(A, MinOffset, Size);
|
||||||
|
}
|
||||||
|
|
||||||
bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) {
|
bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) {
|
||||||
// XXX - Would the same offset be OK? Is there any reason this would happen or
|
// XXX - Would the same offset be OK? Is there any reason this would happen or
|
||||||
// be useful?
|
// be useful?
|
||||||
|
@ -858,12 +868,20 @@ SILoadStoreOptimizer::mergeSBufferLoadImmPair(CombineInfo &CI) {
|
||||||
unsigned DestReg = MRI->createVirtualRegister(SuperRC);
|
unsigned DestReg = MRI->createVirtualRegister(SuperRC);
|
||||||
unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);
|
unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);
|
||||||
|
|
||||||
|
// It shouldn't be possible to get this far if the two instructions
|
||||||
|
// don't have a single memoperand, because MachineInstr::mayAlias()
|
||||||
|
// will return true if this is the case.
|
||||||
|
assert(CI.I->hasOneMemOperand() && CI.Paired->hasOneMemOperand());
|
||||||
|
|
||||||
|
const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
|
||||||
|
const MachineMemOperand *MMOb = *CI.Paired->memoperands_begin();
|
||||||
|
|
||||||
BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg)
|
BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg)
|
||||||
.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase))
|
.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase))
|
||||||
.addImm(MergedOffset) // offset
|
.addImm(MergedOffset) // offset
|
||||||
.addImm(CI.GLC0) // glc
|
.addImm(CI.GLC0) // glc
|
||||||
.addImm(CI.DLC0) // dlc
|
.addImm(CI.DLC0) // dlc
|
||||||
.cloneMergedMemRefs({&*CI.I, &*CI.Paired});
|
.addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
|
||||||
|
|
||||||
std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
|
std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
|
||||||
const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
|
const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
|
||||||
|
@ -909,6 +927,14 @@ SILoadStoreOptimizer::mergeBufferLoadPair(CombineInfo &CI) {
|
||||||
if (Regs & VADDR)
|
if (Regs & VADDR)
|
||||||
MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
|
MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
|
||||||
|
|
||||||
|
// It shouldn't be possible to get this far if the two instructions
|
||||||
|
// don't have a single memoperand, because MachineInstr::mayAlias()
|
||||||
|
// will return true if this is the case.
|
||||||
|
assert(CI.I->hasOneMemOperand() && CI.Paired->hasOneMemOperand());
|
||||||
|
|
||||||
|
const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
|
||||||
|
const MachineMemOperand *MMOb = *CI.Paired->memoperands_begin();
|
||||||
|
|
||||||
MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
|
MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
|
||||||
.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
|
.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
|
||||||
.addImm(MergedOffset) // offset
|
.addImm(MergedOffset) // offset
|
||||||
|
@ -916,7 +942,7 @@ SILoadStoreOptimizer::mergeBufferLoadPair(CombineInfo &CI) {
|
||||||
.addImm(CI.SLC0) // slc
|
.addImm(CI.SLC0) // slc
|
||||||
.addImm(0) // tfe
|
.addImm(0) // tfe
|
||||||
.addImm(CI.DLC0) // dlc
|
.addImm(CI.DLC0) // dlc
|
||||||
.cloneMergedMemRefs({&*CI.I, &*CI.Paired});
|
.addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
|
||||||
|
|
||||||
std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
|
std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
|
||||||
const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
|
const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
|
||||||
|
@ -1092,6 +1118,15 @@ SILoadStoreOptimizer::mergeBufferStorePair(CombineInfo &CI) {
|
||||||
if (Regs & VADDR)
|
if (Regs & VADDR)
|
||||||
MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
|
MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
|
||||||
|
|
||||||
|
|
||||||
|
// It shouldn't be possible to get this far if the two instructions
|
||||||
|
// don't have a single memoperand, because MachineInstr::mayAlias()
|
||||||
|
// will return true if this is the case.
|
||||||
|
assert(CI.I->hasOneMemOperand() && CI.Paired->hasOneMemOperand());
|
||||||
|
|
||||||
|
const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
|
||||||
|
const MachineMemOperand *MMOb = *CI.Paired->memoperands_begin();
|
||||||
|
|
||||||
MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
|
MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
|
||||||
.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
|
.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
|
||||||
.addImm(std::min(CI.Offset0, CI.Offset1)) // offset
|
.addImm(std::min(CI.Offset0, CI.Offset1)) // offset
|
||||||
|
@ -1099,7 +1134,7 @@ SILoadStoreOptimizer::mergeBufferStorePair(CombineInfo &CI) {
|
||||||
.addImm(CI.SLC0) // slc
|
.addImm(CI.SLC0) // slc
|
||||||
.addImm(0) // tfe
|
.addImm(0) // tfe
|
||||||
.addImm(CI.DLC0) // dlc
|
.addImm(CI.DLC0) // dlc
|
||||||
.cloneMergedMemRefs({&*CI.I, &*CI.Paired});
|
.addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
|
||||||
|
|
||||||
moveInstsAfter(MIB, CI.InstsToMove);
|
moveInstsAfter(MIB, CI.InstsToMove);
|
||||||
|
|
||||||
|
|
|
@ -64,6 +64,8 @@
|
||||||
}
|
}
|
||||||
|
|
||||||
attributes #0 = { convergent nounwind }
|
attributes #0 = { convergent nounwind }
|
||||||
|
|
||||||
|
define amdgpu_kernel void @merge_mmos() { ret void }
|
||||||
...
|
...
|
||||||
---
|
---
|
||||||
name: mem_dependency
|
name: mem_dependency
|
||||||
|
@ -163,3 +165,24 @@ body: |
|
||||||
%18:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %9, 3, 0, 0 :: (dereferenceable invariant load 4)
|
%18:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %9, 3, 0, 0 :: (dereferenceable invariant load 4)
|
||||||
S_ENDPGM 0
|
S_ENDPGM 0
|
||||||
...
|
...
|
||||||
|
---
|
||||||
|
# CHECK-LABEL: merge_mmos
|
||||||
|
# CHECK: S_BUFFER_LOAD_DWORDX2_IMM %0, 0, 0, 0 :: (dereferenceable invariant load 8, align 4)
|
||||||
|
# CHECK: BUFFER_LOAD_DWORDX2_OFFSET %0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 4)
|
||||||
|
# CHECK: BUFFER_STORE_DWORDX2_OFFSET_exact killed %{{[0-9]+}}, %0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 4)
|
||||||
|
name: merge_mmos
|
||||||
|
tracksRegLiveness: true
|
||||||
|
body: |
|
||||||
|
bb.0:
|
||||||
|
liveins: $sgpr0_sgpr1_sgpr2_sgpr3
|
||||||
|
|
||||||
|
%0:sreg_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
|
||||||
|
%1:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0, 0, 0, 0 :: (dereferenceable invariant load 4)
|
||||||
|
%2:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0, 1, 0, 0 :: (dereferenceable invariant load 4)
|
||||||
|
%3:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4)
|
||||||
|
%4:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %0, 0, 4, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4)
|
||||||
|
BUFFER_STORE_DWORD_OFFSET_exact %3, %0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4)
|
||||||
|
BUFFER_STORE_DWORD_OFFSET_exact %4, %0, 0, 4, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4)
|
||||||
|
S_ENDPGM 0
|
||||||
|
|
||||||
|
...
|
||||||
|
|
|
@ -32,7 +32,7 @@
|
||||||
}
|
}
|
||||||
...
|
...
|
||||||
|
|
||||||
# CHECK: BUFFER_STORE_DWORDX2_OFFSET killed %{{[0-9]+}}, %{{[0-9]+}}, 0, 4, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1)
|
# CHECK: BUFFER_STORE_DWORDX2_OFFSET killed %{{[0-9]+}}, %{{[0-9]+}}, 0, 4, 0, 0, 0, 0, implicit $exec :: (store 8 into %ir.out.gep.1, align 4, addrspace 1)
|
||||||
---
|
---
|
||||||
name: test1
|
name: test1
|
||||||
liveins:
|
liveins:
|
||||||
|
@ -124,7 +124,7 @@ body: |
|
||||||
S_ENDPGM 0
|
S_ENDPGM 0
|
||||||
...
|
...
|
||||||
|
|
||||||
# CHECK: BUFFER_STORE_DWORDX2_OFFSET killed %{{[0-9]+}}, %{{[0-9]+}}, 0, 4, 0, 0, 0, 1, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1)
|
# CHECK: BUFFER_STORE_DWORDX2_OFFSET killed %{{[0-9]+}}, %{{[0-9]+}}, 0, 4, 0, 0, 0, 1, implicit $exec :: (store 8 into %ir.out.gep.1, align 4, addrspace 1)
|
||||||
---
|
---
|
||||||
name: test4
|
name: test4
|
||||||
liveins:
|
liveins:
|
||||||
|
|
Loading…
Reference in New Issue