[AMDGPU] Waitcnt pass: Modify the waitcnt pass to propagate info in the case of a single basic block loop. mergeInputScoreBrackets() does this for us; update it so that it processes the single bb's score bracket when processing the single bb's preds. It is, after all, a pred of itself, so its score bracket is needed.
Differential Revision: https://reviews.llvm.org/D44434 llvm-svn: 327583
This commit is contained in:
parent
dfeebdbed7
commit
c3c02bde73
|
@ -115,11 +115,11 @@ enum RegisterMapping {
|
|||
(w) = (enum WaitEventType)((w) + 1))
|
||||
|
||||
// This is a per-basic-block object that maintains current score brackets
|
||||
// of each wait-counter, and a per-register scoreboard for each wait-couner.
|
||||
// of each wait counter, and a per-register scoreboard for each wait counter.
|
||||
// We also maintain the latest score for every event type that can change the
|
||||
// waitcnt in order to know if there are multiple types of events within
|
||||
// the brackets. When multiple types of event happen in the bracket,
|
||||
// wait-count may get decreased out of order, therefore we need to put in
|
||||
// wait count may get decreased out of order, therefore we need to put in
|
||||
// "s_waitcnt 0" before use.
|
||||
class BlockWaitcntBrackets {
|
||||
public:
|
||||
|
@ -690,7 +690,7 @@ unsigned int BlockWaitcntBrackets::updateByWait(InstCounterType T,
|
|||
setScoreLB(T, getScoreUB(T));
|
||||
} else if (counterOutOfOrder(T)) {
|
||||
// Counter can get decremented out-of-order when there
|
||||
// are multiple types event in the brack. Also emit an s_wait counter
|
||||
// are multiple types event in the bracket. Also emit an s_wait counter
|
||||
// with a conservative value of 0 for the counter.
|
||||
NeedWait = CNT_MASK(T);
|
||||
setScoreLB(T, getScoreUB(T));
|
||||
|
@ -1301,27 +1301,37 @@ void SIInsertWaitcnts::updateEventWaitCntAfter(
|
|||
}
|
||||
}
|
||||
|
||||
// Merge the score brackets of the Block's predecessors;
|
||||
// this merged score bracket is used when adding waitcnts to the Block
|
||||
void SIInsertWaitcnts::mergeInputScoreBrackets(MachineBasicBlock &Block) {
|
||||
BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&Block].get();
|
||||
int32_t MaxPending[NUM_INST_CNTS] = {0};
|
||||
int32_t MaxFlat[NUM_INST_CNTS] = {0};
|
||||
bool MixedExpTypes = false;
|
||||
|
||||
// Clear the score bracket state.
|
||||
// For single basic block loops, we need to retain the Block's
|
||||
// score bracket to have accurate Pred info. So, make a copy of Block's
|
||||
// score bracket, clear() it (which retains several important bits of info),
|
||||
// populate, and then replace en masse. For non-single basic block loops,
|
||||
// just clear Block's current score bracket and repopulate in-place.
|
||||
bool IsSelfPred;
|
||||
std::unique_ptr<BlockWaitcntBrackets> S;
|
||||
|
||||
IsSelfPred = (std::find(Block.pred_begin(), Block.pred_end(), &Block))
|
||||
!= Block.pred_end();
|
||||
if (IsSelfPred) {
|
||||
S = llvm::make_unique<BlockWaitcntBrackets>(*ScoreBrackets);
|
||||
ScoreBrackets = S.get();
|
||||
}
|
||||
|
||||
ScoreBrackets->clear();
|
||||
|
||||
// Compute the number of pending elements on block entry.
|
||||
|
||||
// IMPORTANT NOTE: If iterative handling of loops is added, the code will
|
||||
// need to handle single BBs with backedges to themselves. This means that
|
||||
// they will need to retain and not clear their initial state.
|
||||
|
||||
// See if there are any uninitialized predecessors. If so, emit an
|
||||
// s_waitcnt 0 at the beginning of the block.
|
||||
for (MachineBasicBlock *pred : Block.predecessors()) {
|
||||
for (MachineBasicBlock *Pred : Block.predecessors()) {
|
||||
BlockWaitcntBrackets *PredScoreBrackets =
|
||||
BlockWaitcntBracketsMap[pred].get();
|
||||
bool Visited = BlockVisitedSet.count(pred);
|
||||
BlockWaitcntBracketsMap[Pred].get();
|
||||
bool Visited = BlockVisitedSet.count(Pred);
|
||||
if (!Visited || PredScoreBrackets->getWaitAtBeginning()) {
|
||||
continue;
|
||||
}
|
||||
|
@ -1550,6 +1560,12 @@ void SIInsertWaitcnts::mergeInputScoreBrackets(MachineBasicBlock &Block) {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
// if a single block loop, update the score brackets. Not needed for other
|
||||
// blocks, as we did this in-place
|
||||
if (IsSelfPred) {
|
||||
BlockWaitcntBracketsMap[&Block] = llvm::make_unique<BlockWaitcntBrackets>(*ScoreBrackets);
|
||||
}
|
||||
}
|
||||
|
||||
/// Return the "bottom" block of a loop. This differs from
|
||||
|
|
|
@ -0,0 +1,26 @@
|
|||
# RUN: llc -o - %s -march=amdgcn -run-pass=si-insert-waitcnts -verify-machineinstrs | FileCheck -check-prefix=GCN %s
|
||||
|
||||
# Check that the waitcnt pass propagates info in the case of a single basic block loop
|
||||
|
||||
# GCN-LABEL: waitcnt-loop-single-basic-block
|
||||
# GCN: bb.0
|
||||
# GCN: S_WAITCNT 3952
|
||||
# GCN-NEXT: GLOBAL_STORE_DWORD
|
||||
# GCN: S_WAITCNT 3953
|
||||
# GCN-NEXT: GLOBAL_STORE_DWORD
|
||||
|
||||
...
|
||||
name: waitcnt-loop-single-basic-block
|
||||
body: |
|
||||
bb.0:
|
||||
S_BRANCH %bb.1
|
||||
bb.1:
|
||||
GLOBAL_STORE_DWORD $vgpr7_vgpr8, $vgpr11, 0, 0, 0, implicit $exec
|
||||
$vgpr21 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, implicit $exec
|
||||
$vgpr10 = GLOBAL_LOAD_DWORD $vgpr10_vgpr11, 0, 0, 0, implicit $exec
|
||||
GLOBAL_STORE_DWORD $vgpr14_vgpr15, $vgpr21, 0, 0, 0, implicit $exec
|
||||
$vgpr11 = GLOBAL_LOAD_DWORD $vgpr11_vgpr12, 0, 0, 0, implicit $exec
|
||||
S_CBRANCH_SCC1 %bb.1, implicit $scc
|
||||
bb.2:
|
||||
S_ENDPGM
|
||||
...
|
Loading…
Reference in New Issue