AMDGPU: Add SIWholeQuadMode pass

Summary:
Whole quad mode is already enabled for pixel shaders that compute
derivatives, but it must be suspended for instructions that cause a
shader to have side effects (i.e. stores and atomics).

This pass addresses the issue by storing the real (initial) live mask
in a register, masking EXEC before instructions that require exact
execution and (re-)enabling WQM where required.
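
Schematically (these are the sequences documented in the new
SIWholeQuadMode.cpp; the annotations are just a sketch of the intent):

  S_MOV_B64 LiveMask, EXEC           ; prolog: save the real live mask
  S_WQM_B64 EXEC, EXEC               ; enter whole quad mode
  ...                                ; derivative computations etc.
  S_AND_SAVEEXEC_B64 Tmp, LiveMask   ; switch to exact execution
  ...                                ; stores, atomics
  S_MOV_B64 EXEC, Tmp                ; re-enter WQM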

This pass is run before register coalescing so that we can use
machine SSA for analysis.

The changes in this patch expose a problem with the second machine
scheduling pass: target-independent instructions like COPY implicitly
use EXEC when they operate on VGPRs, but this fact is not encoded in
the MIR. This can lead to miscompilation, because such instructions can
be moved across changes to EXEC.

This patch fixes the problem by treating instructions that modify EXEC
as scheduling boundaries (see SIInstrInfo::isSchedulingBoundary below),
so the scheduler can no longer move instructions past changes to EXEC.
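
As a made-up illustration of the hazard (the register names are
hypothetical), nothing previously stopped the scheduler from swapping a
pair such as

  %vgpr0 = COPY %vgpr1             ; implicitly uses EXEC, not encoded in MIR
  S_AND_B64 exec, exec, LiveMask   ; switch to exact execution

after which the COPY would write only the exactly-live lanes instead of
all lanes of the quads.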

Reviewers: arsenm, tstellarAMD, mareko

Subscribers: MatzeB, arsenm, llvm-commits

Differential Revision: http://reviews.llvm.org/D18162

llvm-svn: 263982
Nicolai Haehnle 2016-03-21 20:28:33 +00:00
parent b14f4fd0de
commit 213e87f2ee
10 changed files with 863 additions and 15 deletions

lib/Target/AMDGPU/AMDGPU.h

@@ -44,6 +44,7 @@ FunctionPass *createSIFoldOperandsPass();
 FunctionPass *createSILowerI1CopiesPass();
 FunctionPass *createSIShrinkInstructionsPass();
 FunctionPass *createSILoadStoreOptimizerPass(TargetMachine &tm);
+FunctionPass *createSIWholeQuadModePass();
 FunctionPass *createSILowerControlFlowPass();
 FunctionPass *createSIFixControlFlowLiveIntervalsPass();
 FunctionPass *createSIFixSGPRCopiesPass();
@@ -70,6 +71,9 @@ extern char &SILowerI1CopiesID;
 void initializeSILoadStoreOptimizerPass(PassRegistry &);
 extern char &SILoadStoreOptimizerID;
 
+void initializeSIWholeQuadModePass(PassRegistry &);
+extern char &SIWholeQuadModeID;
+
 void initializeSILowerControlFlowPass(PassRegistry &);
 extern char &SILowerControlFlowPassID;

lib/Target/AMDGPU/AMDGPUInstrInfo.h

@@ -62,7 +62,6 @@ public:
int64_t Offset1, int64_t Offset2,
unsigned NumLoads) const override;
/// \brief Return a target-specific opcode if Opcode is a pseudo instruction.
/// Return -1 if the target-specific opcode for the pseudo instruction does
/// not exist. If Opcode is not a pseudo instruction, this is identity.

lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

@@ -57,6 +57,7 @@ extern "C" void LLVMInitializeAMDGPUTarget() {
   initializeSIAnnotateControlFlowPass(*PR);
   initializeSIInsertNopsPass(*PR);
   initializeSIInsertWaitsPass(*PR);
+  initializeSIWholeQuadModePass(*PR);
   initializeSILowerControlFlowPass(*PR);
 }
@@ -346,6 +347,7 @@ void GCNPassConfig::addPreRegAlloc() {
     insertPass(&MachineSchedulerID, &RegisterCoalescerID);
   }
   addPass(createSIShrinkInstructionsPass(), false);
+  addPass(createSIWholeQuadModePass());
 }
 
 void GCNPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) {

lib/Target/AMDGPU/CMakeLists.txt

@@ -63,6 +63,7 @@ add_llvm_target(AMDGPUCodeGen
   SIRegisterInfo.cpp
   SIShrinkInstructions.cpp
   SITypeRewriter.cpp
+  SIWholeQuadMode.cpp
   )
 
 add_subdirectory(AsmParser)

lib/Target/AMDGPU/SIInstrInfo.cpp

@@ -1248,6 +1248,19 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB,
                  .addImm(0); // omod
 }
 
+bool SIInstrInfo::isSchedulingBoundary(const MachineInstr *MI,
+                                       const MachineBasicBlock *MBB,
+                                       const MachineFunction &MF) const {
+  // Target-independent instructions do not have an implicit-use of EXEC, even
+  // when they operate on VGPRs. Treating EXEC modifications as scheduling
+  // boundaries prevents incorrect movements of such instructions.
+  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+  if (MI->modifiesRegister(AMDGPU::EXEC, TRI))
+    return true;
+
+  return AMDGPUInstrInfo::isSchedulingBoundary(MI, MBB, MF);
+}
+
 bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
   int64_t SVal = Imm.getSExtValue();
   if (SVal >= -16 && SVal <= 64)

lib/Target/AMDGPU/SIInstrInfo.h

@@ -149,6 +149,10 @@ public:
                               MachineBasicBlock::iterator &MI,
                               LiveVariables *LV) const override;
 
+  bool isSchedulingBoundary(const MachineInstr *MI,
+                            const MachineBasicBlock *MBB,
+                            const MachineFunction &MF) const override;
+
   static bool isSALU(const MachineInstr &MI) {
     return MI.getDesc().TSFlags & SIInstrFlags::SALU;
   }

lib/Target/AMDGPU/SILowerControlFlow.cpp

@@ -78,7 +78,7 @@ private:
   void SkipIfDead(MachineInstr &MI);
 
   void If(MachineInstr &MI);
-  void Else(MachineInstr &MI);
+  void Else(MachineInstr &MI, bool ExecModified);
   void Break(MachineInstr &MI);
   void IfBreak(MachineInstr &MI);
   void ElseBreak(MachineInstr &MI);
@@ -215,7 +215,7 @@ void SILowerControlFlow::If(MachineInstr &MI) {
   MI.eraseFromParent();
 }
 
-void SILowerControlFlow::Else(MachineInstr &MI) {
+void SILowerControlFlow::Else(MachineInstr &MI, bool ExecModified) {
   MachineBasicBlock &MBB = *MI.getParent();
   DebugLoc DL = MI.getDebugLoc();
   unsigned Dst = MI.getOperand(0).getReg();
@@ -225,6 +225,15 @@ void SILowerControlFlow::Else(MachineInstr &MI) {
           TII->get(AMDGPU::S_OR_SAVEEXEC_B64), Dst)
           .addReg(Src); // Saved EXEC
 
+  if (ExecModified) {
+    // Adjust the saved exec to account for the modifications during the flow
+    // block that contains the ELSE. This can happen when WQM mode is switched
+    // off.
+    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_B64), Dst)
+            .addReg(AMDGPU::EXEC)
+            .addReg(Dst);
+  }
+
   BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
           .addReg(AMDGPU::EXEC)
           .addReg(Dst);
@@ -488,7 +497,6 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
 
   bool HaveKill = false;
-  bool NeedWQM = false;
   bool NeedFlat = false;
   unsigned Depth = 0;
@@ -498,17 +506,24 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
   MachineBasicBlock *EmptyMBBAtEnd = NULL;
 
     MachineBasicBlock &MBB = *BI;
     MachineBasicBlock::iterator I, Next;
+    bool ExecModified = false;
+
     for (I = MBB.begin(); I != MBB.end(); I = Next) {
       Next = std::next(I);
 
       MachineInstr &MI = *I;
-      if (TII->isWQM(MI) || TII->isDS(MI))
-        NeedWQM = true;
 
       // Flat uses m0 in case it needs to access LDS.
       if (TII->isFLAT(MI))
        NeedFlat = true;
 
+      for (const auto &Def : I->defs()) {
+        if (Def.isReg() && Def.isDef() && Def.getReg() == AMDGPU::EXEC) {
+          ExecModified = true;
+          break;
+        }
+      }
+
       switch (MI.getOpcode()) {
       default: break;
       case AMDGPU::SI_IF:
@@ -517,7 +532,7 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
         break;
 
       case AMDGPU::SI_ELSE:
-        Else(MI);
+        Else(MI, ExecModified);
         break;
 
       case AMDGPU::SI_BREAK:
@@ -599,12 +614,6 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
     }
   }
 
-  if (NeedWQM && MFI->getShaderType() == ShaderType::PIXEL) {
-    MachineBasicBlock &MBB = MF.front();
-    BuildMI(MBB, MBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
-            AMDGPU::EXEC).addReg(AMDGPU::EXEC);
-  }
-
   if (NeedFlat && MFI->IsKernel) {
     // TODO: What to use with function calls?
     // We will need to initialize the flat scratch register pair.

lib/Target/AMDGPU/SIRegisterInfo.h

@@ -72,9 +72,12 @@ public:
   }
 
   bool isSGPRReg(const MachineRegisterInfo &MRI, unsigned Reg) const {
+    const TargetRegisterClass *RC;
     if (TargetRegisterInfo::isVirtualRegister(Reg))
-      return isSGPRClass(MRI.getRegClass(Reg));
-    return getPhysRegClass(Reg);
+      RC = MRI.getRegClass(Reg);
+    else
+      RC = getPhysRegClass(Reg);
+    return isSGPRClass(RC);
   }
 
   /// \returns true if this class contains VGPR registers.

lib/Target/AMDGPU/SIWholeQuadMode.cpp (new file)

@@ -0,0 +1,465 @@
//===-- SIWholeQuadMode.cpp - enter and suspend whole quad mode -----------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief This pass adds instructions to enable whole quad mode for pixel
/// shaders.
///
/// Whole quad mode is required for derivative computations, but it interferes
/// with shader side effects (stores and atomics). This pass is run on the
/// scheduled machine IR but before register coalescing, so that machine SSA is
/// available for analysis. It ensures that WQM is enabled when necessary, but
/// disabled around stores and atomics.
///
/// When necessary, this pass creates a function prolog
///
/// S_MOV_B64 LiveMask, EXEC
/// S_WQM_B64 EXEC, EXEC
///
/// to enter WQM at the top of the function and surrounds blocks of Exact
/// instructions with
///
/// S_AND_SAVEEXEC_B64 Tmp, LiveMask
/// ...
/// S_MOV_B64 EXEC, Tmp
///
/// In order to avoid excessive switching during sequences of Exact
/// instructions, the pass first analyzes which instructions must be run in WQM
/// (i.e. which instructions produce values that lead to derivative
/// computations).
///
/// Basic blocks are always exited in WQM as long as some successor needs WQM.
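///
/// For example (a sketch; the wqm.ll test added in this patch exercises
/// concrete cases): if one branch of an if/else only stores (Exact) while
/// the join block samples a texture (WQM), the storing block has
/// OutNeeds = StateWQM and therefore restores the saved WQM exec mask
/// before branching to the join block.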
///
/// There is room for improvement given better control flow analysis:
///
/// (1) at the top level (outside of control flow statements, and as long as
/// kill hasn't been used), one SGPR can be saved by recovering WQM from
/// the LiveMask (this is implemented for the entry block).
///
/// (2) when entire regions (e.g. if-else blocks or entire loops) consist
/// only of Exact and don't-care instructions, the switch only has to
/// be done at the entry and exit points rather than potentially in each
/// block of the region.
///
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/MachineDominanceFrontier.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/Constants.h"
using namespace llvm;
#define DEBUG_TYPE "si-wqm"
namespace {
enum {
  StateWQM = 0x1,
  StateExact = 0x2,
};

struct InstrInfo {
  char Needs = 0;    // the state (StateWQM or StateExact) this instruction must run in
  char OutNeeds = 0; // states demanded by instructions reachable after this one
};

struct BlockInfo {
  char Needs = 0;    // union of the Needs of the instructions in the block
  char InNeeds = 0;  // states that must be available on entry to the block
  char OutNeeds = 0; // states that must still be available on exit
};
struct WorkItem {
const MachineBasicBlock *MBB = nullptr;
const MachineInstr *MI = nullptr;
WorkItem() {}
WorkItem(const MachineBasicBlock *MBB) : MBB(MBB) {}
WorkItem(const MachineInstr *MI) : MI(MI) {}
};
class SIWholeQuadMode : public MachineFunctionPass {
private:
const SIInstrInfo *TII;
const SIRegisterInfo *TRI;
MachineRegisterInfo *MRI;
DenseMap<const MachineInstr *, InstrInfo> Instructions;
DenseMap<const MachineBasicBlock *, BlockInfo> Blocks;
SmallVector<const MachineInstr *, 2> ExecExports;
char scanInstructions(const MachineFunction &MF, std::vector<WorkItem>& Worklist);
void propagateInstruction(const MachineInstr &MI, std::vector<WorkItem>& Worklist);
void propagateBlock(const MachineBasicBlock &MBB, std::vector<WorkItem>& Worklist);
char analyzeFunction(const MachineFunction &MF);
void toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
unsigned SaveWQM, unsigned LiveMaskReg);
void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
unsigned SavedWQM);
void processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, bool isEntry);
public:
static char ID;
SIWholeQuadMode() :
MachineFunctionPass(ID) { }
bool runOnMachineFunction(MachineFunction &MF) override;
const char *getPassName() const override {
return "SI Whole Quad Mode";
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
MachineFunctionPass::getAnalysisUsage(AU);
}
};
} // End anonymous namespace
char SIWholeQuadMode::ID = 0;
INITIALIZE_PASS_BEGIN(SIWholeQuadMode, DEBUG_TYPE,
"SI Whole Quad Mode", false, false)
INITIALIZE_PASS_END(SIWholeQuadMode, DEBUG_TYPE,
"SI Whole Quad Mode", false, false)
char &llvm::SIWholeQuadModeID = SIWholeQuadMode::ID;
FunctionPass *llvm::createSIWholeQuadModePass() {
return new SIWholeQuadMode;
}
// Scan instructions to determine which ones require an Exact execmask and
// which ones seed WQM requirements.
char SIWholeQuadMode::scanInstructions(const MachineFunction &MF,
std::vector<WorkItem> &Worklist) {
char GlobalFlags = 0;
for (auto BI = MF.begin(), BE = MF.end(); BI != BE; ++BI) {
const MachineBasicBlock &MBB = *BI;
for (auto II = MBB.begin(), IE = MBB.end(); II != IE; ++II) {
const MachineInstr &MI = *II;
unsigned Opcode = MI.getOpcode();
char Flags;
if (TII->isWQM(Opcode) || TII->isDS(Opcode)) {
Flags = StateWQM;
} else if (TII->get(Opcode).mayStore() &&
(MI.getDesc().TSFlags & SIInstrFlags::VM_CNT)) {
Flags = StateExact;
} else {
// Handle export instructions with the exec mask valid flag set
if (Opcode == AMDGPU::EXP && MI.getOperand(4).getImm() != 0)
ExecExports.push_back(&MI);
continue;
}
Instructions[&MI].Needs = Flags;
Worklist.push_back(&MI);
GlobalFlags |= Flags;
}
}
return GlobalFlags;
}
void SIWholeQuadMode::propagateInstruction(const MachineInstr &MI,
std::vector<WorkItem>& Worklist) {
const MachineBasicBlock &MBB = *MI.getParent();
InstrInfo &II = Instructions[&MI];
BlockInfo &BI = Blocks[&MBB];
// Control flow-type instructions that are followed by WQM computations
// must themselves be in WQM.
if ((II.OutNeeds & StateWQM) && !(II.Needs & StateWQM) &&
(MI.isBranch() || MI.isTerminator() || MI.getOpcode() == AMDGPU::SI_KILL))
II.Needs = StateWQM;
// Propagate to block level
BI.Needs |= II.Needs;
if ((BI.InNeeds | II.Needs) != BI.InNeeds) {
BI.InNeeds |= II.Needs;
Worklist.push_back(&MBB);
}
// Propagate backwards within block
if (const MachineInstr *PrevMI = MI.getPrevNode()) {
char InNeeds = II.Needs | II.OutNeeds;
if (!PrevMI->isPHI()) {
InstrInfo &PrevII = Instructions[PrevMI];
if ((PrevII.OutNeeds | InNeeds) != PrevII.OutNeeds) {
PrevII.OutNeeds |= InNeeds;
Worklist.push_back(PrevMI);
}
}
}
// Propagate WQM flag to instruction inputs
assert(II.Needs != (StateWQM | StateExact));
if (II.Needs != StateWQM)
return;
for (const MachineOperand &Use : MI.uses()) {
if (!Use.isReg() || !Use.isUse())
continue;
// At this point, physical registers appear as inputs or outputs
// and following them makes no sense (and would in fact be incorrect
// when the same VGPR is used as both an output and an input that leads
// to a NeedsWQM instruction).
//
// Note: VCC appears e.g. in 64-bit addition with carry. In theory we would
// have to trace it as well, but in practice it only shows up in 64-bit
// computations such as pointers, where both dwords are already followed
// anyway.
if (!TargetRegisterInfo::isVirtualRegister(Use.getReg()))
continue;
for (const MachineOperand &Def : MRI->def_operands(Use.getReg())) {
const MachineInstr *DefMI = Def.getParent();
InstrInfo &DefII = Instructions[DefMI];
// Obviously skip if DefMI is already flagged as NeedWQM.
//
// The instruction might also be flagged as NeedExact. This happens when
// the result of an atomic is used in a WQM computation. In this case,
// the atomic must not run for helper pixels and the WQM result is
// undefined.
if (DefII.Needs != 0)
continue;
DefII.Needs = StateWQM;
Worklist.push_back(DefMI);
}
}
}
void SIWholeQuadMode::propagateBlock(const MachineBasicBlock &MBB,
std::vector<WorkItem>& Worklist) {
BlockInfo &BI = Blocks[&MBB];
// Propagate through instructions
if (!MBB.empty()) {
const MachineInstr *LastMI = &*MBB.rbegin();
InstrInfo &LastII = Instructions[LastMI];
if ((LastII.OutNeeds | BI.OutNeeds) != LastII.OutNeeds) {
LastII.OutNeeds |= BI.OutNeeds;
Worklist.push_back(LastMI);
}
}
// Predecessor blocks must provide for our WQM/Exact needs.
for (const MachineBasicBlock *Pred : MBB.predecessors()) {
BlockInfo &PredBI = Blocks[Pred];
if ((PredBI.OutNeeds | BI.InNeeds) == PredBI.OutNeeds)
continue;
PredBI.OutNeeds |= BI.InNeeds;
PredBI.InNeeds |= BI.InNeeds;
Worklist.push_back(Pred);
}
// All successors must be prepared to accept the same set of WQM/Exact
// data.
for (const MachineBasicBlock *Succ : MBB.successors()) {
BlockInfo &SuccBI = Blocks[Succ];
if ((SuccBI.InNeeds | BI.OutNeeds) == SuccBI.InNeeds)
continue;
SuccBI.InNeeds |= BI.OutNeeds;
Worklist.push_back(Succ);
}
}
char SIWholeQuadMode::analyzeFunction(const MachineFunction &MF) {
std::vector<WorkItem> Worklist;
char GlobalFlags = scanInstructions(MF, Worklist);
while (!Worklist.empty()) {
WorkItem WI = Worklist.back();
Worklist.pop_back();
if (WI.MI)
propagateInstruction(*WI.MI, Worklist);
else
propagateBlock(*WI.MBB, Worklist);
}
return GlobalFlags;
}
void SIWholeQuadMode::toExact(MachineBasicBlock &MBB,
MachineBasicBlock::iterator Before,
unsigned SaveWQM, unsigned LiveMaskReg)
{
if (SaveWQM) {
BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_AND_SAVEEXEC_B64),
SaveWQM)
.addReg(LiveMaskReg);
} else {
BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_AND_B64),
AMDGPU::EXEC)
.addReg(AMDGPU::EXEC)
.addReg(LiveMaskReg);
}
}
void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB,
MachineBasicBlock::iterator Before,
unsigned SavedWQM)
{
if (SavedWQM) {
BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::EXEC)
.addReg(SavedWQM);
} else {
BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
AMDGPU::EXEC)
.addReg(AMDGPU::EXEC);
}
}
void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
bool isEntry) {
auto BII = Blocks.find(&MBB);
if (BII == Blocks.end())
return;
const BlockInfo &BI = BII->second;
if (!(BI.InNeeds & StateWQM))
return;
// This is a non-entry block that is WQM throughout, so no need to do
// anything.
if (!isEntry && !(BI.Needs & StateExact) && BI.OutNeeds != StateExact)
return;
unsigned SavedWQMReg = 0;
bool WQMFromExec = isEntry;
char State = isEntry ? StateExact : StateWQM;
auto II = MBB.getFirstNonPHI(), IE = MBB.end();
while (II != IE) {
MachineInstr &MI = *II;
++II;
// Skip instructions that are not affected by EXEC
if (MI.getDesc().TSFlags & (SIInstrFlags::SALU | SIInstrFlags::SMRD) &&
!MI.isBranch() && !MI.isTerminator())
continue;
// Generic instructions such as COPY will either disappear by register
// coalescing or be lowered to SALU or VALU instructions.
if (TargetInstrInfo::isGenericOpcode(MI.getOpcode())) {
if (MI.getNumExplicitOperands() >= 1) {
const MachineOperand &Op = MI.getOperand(0);
if (Op.isReg()) {
if (TRI->isSGPRReg(*MRI, Op.getReg())) {
// SGPR instructions are not affected by EXEC
continue;
}
}
}
}
char Needs = 0;
char OutNeeds = 0;
auto InstrInfoIt = Instructions.find(&MI);
if (InstrInfoIt != Instructions.end()) {
Needs = InstrInfoIt->second.Needs;
OutNeeds = InstrInfoIt->second.OutNeeds;
// Make sure to switch to Exact mode before the end of the block when
// Exact and only Exact is needed further downstream.
if (OutNeeds == StateExact && (MI.isBranch() || MI.isTerminator())) {
assert(Needs == 0);
Needs = StateExact;
}
}
// State switching
if (Needs && State != Needs) {
if (Needs == StateExact) {
assert(!SavedWQMReg);
if (!WQMFromExec && (OutNeeds & StateWQM))
SavedWQMReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
toExact(MBB, &MI, SavedWQMReg, LiveMaskReg);
} else {
assert(WQMFromExec == (SavedWQMReg == 0));
toWQM(MBB, &MI, SavedWQMReg);
SavedWQMReg = 0;
}
State = Needs;
}
if (MI.getOpcode() == AMDGPU::SI_KILL)
WQMFromExec = false;
}
if ((BI.OutNeeds & StateWQM) && State != StateWQM) {
assert(WQMFromExec == (SavedWQMReg == 0));
toWQM(MBB, MBB.end(), SavedWQMReg);
} else if (BI.OutNeeds == StateExact && State != StateExact) {
toExact(MBB, MBB.end(), 0, LiveMaskReg);
}
}
bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
if (MFI->getShaderType() != ShaderType::PIXEL)
return false;
Instructions.clear();
Blocks.clear();
ExecExports.clear();
TII = static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
TRI = static_cast<const SIRegisterInfo *>(MF.getSubtarget().getRegisterInfo());
MRI = &MF.getRegInfo();
char GlobalFlags = analyzeFunction(MF);
if (!(GlobalFlags & StateWQM))
return false;
MachineBasicBlock &Entry = MF.front();
MachineInstr *EntryMI = Entry.getFirstNonPHI();
if (GlobalFlags == StateWQM) {
// For a shader that needs only WQM, we can just set it once.
BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
AMDGPU::EXEC).addReg(AMDGPU::EXEC);
return true;
}
// Handle the general case
unsigned LiveMaskReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::COPY), LiveMaskReg)
.addReg(AMDGPU::EXEC);
for (const auto &BII : Blocks)
processBlock(const_cast<MachineBasicBlock &>(*BII.first), LiveMaskReg,
BII.first == &*MF.begin());
return true;
}

test/CodeGen/AMDGPU/wqm.ll (new file)

@@ -0,0 +1,348 @@
;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix=SI
;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix=VI
; Check that WQM isn't triggered by image load/store intrinsics.
;
;CHECK-LABEL: {{^}}test1:
;CHECK-NOT: s_wqm
define <4 x float> @test1(<8 x i32> inreg %rsrc, <4 x i32> %c) #0 {
main_body:
%tex = call <4 x float> @llvm.amdgcn.image.load.v4i32(<4 x i32> %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0)
call void @llvm.amdgcn.image.store.v4i32(<4 x float> %tex, <4 x i32> %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0)
ret <4 x float> %tex
}
; Check that WQM is triggered by image samples and left untouched for loads...
;
;CHECK-LABEL: {{^}}test2:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: image_sample
;CHECK-NOT: exec
;CHECK: _load_dword v0,
define float @test2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <4 x i32> %c) #0 {
main_body:
%c.1 = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%c.2 = bitcast <4 x float> %c.1 to <4 x i32>
%c.3 = extractelement <4 x i32> %c.2, i32 0
%gep = getelementptr float, float addrspace(1)* %ptr, i32 %c.3
%data = load float, float addrspace(1)* %gep
ret float %data
}
; ... but disabled for stores (and, in this simple case, not re-enabled).
;
;CHECK-LABEL: {{^}}test3:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: image_sample
;CHECK: s_and_b64 exec, exec, [[ORIG]]
;CHECK: store
;CHECK-NOT: exec
define <4 x float> @test3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <4 x i32> %c) #0 {
main_body:
%tex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%tex.1 = bitcast <4 x float> %tex to <4 x i32>
%tex.2 = extractelement <4 x i32> %tex.1, i32 0
%gep = getelementptr float, float addrspace(1)* %ptr, i32 %tex.2
%wr = extractelement <4 x float> %tex, i32 1
store float %wr, float addrspace(1)* %gep
ret <4 x float> %tex
}
; Check that WQM is re-enabled when required.
;
;CHECK-LABEL: {{^}}test4:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: v_mul_lo_i32 [[MUL:v[0-9]+]], v0, v1
;CHECK: s_and_b64 exec, exec, [[ORIG]]
;CHECK: store
;CHECK: s_wqm_b64 exec, exec
;CHECK: image_sample v[0:3], [[MUL]], s[0:7], s[8:11] dmask:0xf
define <4 x float> @test4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %c, i32 %d, float %data) #0 {
main_body:
%c.1 = mul i32 %c, %d
%gep = getelementptr float, float addrspace(1)* %ptr, i32 %c.1
store float %data, float addrspace(1)* %gep
%tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %c.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
ret <4 x float> %tex
}
; Check a case of one branch of an if-else requiring WQM, the other requiring
; exact.
;
; Note: In this particular case, the save-and-restore could be avoided if the
; analysis understood that the two branches of the if-else are mutually
; exclusive.
;
;CHECK-LABEL: {{^}}test_control_flow_0:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: %ELSE
;CHECK: s_and_saveexec_b64 [[SAVED:s\[[0-9]+:[0-9]+\]]], [[ORIG]]
;CHECK: store
;CHECK: s_mov_b64 exec, [[SAVED]]
;CHECK: %IF
;CHECK: image_sample
define float @test_control_flow_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %c, i32 %z, float %data) #0 {
main_body:
%cmp = icmp eq i32 %z, 0
br i1 %cmp, label %IF, label %ELSE
IF:
%tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%data.if = extractelement <4 x float> %tex, i32 0
br label %END
ELSE:
%gep = getelementptr float, float addrspace(1)* %ptr, i32 %c
store float %data, float addrspace(1)* %gep
br label %END
END:
%r = phi float [ %data.if, %IF ], [ %data, %ELSE ]
ret float %r
}
; Reverse branch order compared to the previous test.
;
;CHECK-LABEL: {{^}}test_control_flow_1:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: %IF
;CHECK: image_sample
;CHECK: %Flow
;CHECK-NEXT: s_or_saveexec_b64 [[SAVED:s\[[0-9]+:[0-9]+\]]],
;CHECK-NEXT: s_and_b64 exec, exec, [[ORIG]]
;CHECK-NEXT: s_and_b64 [[SAVED]], exec, [[SAVED]]
;CHECK-NEXT: s_xor_b64 exec, exec, [[SAVED]]
;CHECK-NEXT: %ELSE
;CHECK: store
;CHECK: %END
define float @test_control_flow_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %c, i32 %z, float %data) #0 {
main_body:
%cmp = icmp eq i32 %z, 0
br i1 %cmp, label %ELSE, label %IF
IF:
%tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%data.if = extractelement <4 x float> %tex, i32 0
br label %END
ELSE:
%gep = getelementptr float, float addrspace(1)* %ptr, i32 %c
store float %data, float addrspace(1)* %gep
br label %END
END:
%r = phi float [ %data.if, %IF ], [ %data, %ELSE ]
ret float %r
}
; Check that branch conditions are properly marked as needing WQM...
;
;CHECK-LABEL: {{^}}test_control_flow_2:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: s_and_b64 exec, exec, [[ORIG]]
;CHECK: store
;CHECK: s_wqm_b64 exec, exec
;CHECK: load
;CHECK: s_and_b64 exec, exec, [[ORIG]]
;CHECK: store
;CHECK: s_wqm_b64 exec, exec
;CHECK: v_cmp
define <4 x float> @test_control_flow_2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <3 x i32> %idx, <2 x float> %data, i32 %coord) #0 {
main_body:
%idx.1 = extractelement <3 x i32> %idx, i32 0
%gep.1 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.1
%data.1 = extractelement <2 x float> %data, i32 0
store float %data.1, float addrspace(1)* %gep.1
; The load that determines the branch (and should therefore be WQM) is
; surrounded by stores that require disabled WQM.
%idx.2 = extractelement <3 x i32> %idx, i32 1
%gep.2 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.2
%z = load float, float addrspace(1)* %gep.2
%idx.3 = extractelement <3 x i32> %idx, i32 2
%gep.3 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.3
%data.3 = extractelement <2 x float> %data, i32 1
store float %data.3, float addrspace(1)* %gep.3
%cc = fcmp ogt float %z, 0.0
br i1 %cc, label %IF, label %ELSE
IF:
%coord.IF = mul i32 %coord, 3
br label %END
ELSE:
%coord.ELSE = mul i32 %coord, 4
br label %END
END:
%coord.END = phi i32 [ %coord.IF, %IF ], [ %coord.ELSE, %ELSE ]
%tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord.END, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
ret <4 x float> %tex
}
; ... but only if they really do need it.
;
;CHECK-LABEL: {{^}}test_control_flow_3:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: image_sample
;CHECK: s_and_b64 exec, exec, [[ORIG]]
;CHECK: store
;CHECK: load
;CHECK: store
;CHECK: v_cmp
define float @test_control_flow_3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <3 x i32> %idx, <2 x float> %data, i32 %coord) #0 {
main_body:
%tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%tex.1 = extractelement <4 x float> %tex, i32 0
%idx.1 = extractelement <3 x i32> %idx, i32 0
%gep.1 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.1
%data.1 = extractelement <2 x float> %data, i32 0
store float %data.1, float addrspace(1)* %gep.1
%idx.2 = extractelement <3 x i32> %idx, i32 1
%gep.2 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.2
%z = load float, float addrspace(1)* %gep.2
%idx.3 = extractelement <3 x i32> %idx, i32 2
%gep.3 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.3
%data.3 = extractelement <2 x float> %data, i32 1
store float %data.3, float addrspace(1)* %gep.3
%cc = fcmp ogt float %z, 0.0
br i1 %cc, label %IF, label %ELSE
IF:
%tex.IF = fmul float %tex.1, 3.0
br label %END
ELSE:
%tex.ELSE = fmul float %tex.1, 4.0
br label %END
END:
%tex.END = phi float [ %tex.IF, %IF ], [ %tex.ELSE, %ELSE ]
ret float %tex.END
}
; Another test that failed at some point because of terminator handling.
;
;CHECK-LABEL: {{^}}test_control_flow_4:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: %IF
;CHECK: load
;CHECK: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[ORIG]]
;CHECK: store
;CHECK: s_mov_b64 exec, [[SAVE]]
;CHECK: %END
;CHECK: image_sample
define <4 x float> @test_control_flow_4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %coord, i32 %y, float %z) #0 {
main_body:
%cond = icmp eq i32 %y, 0
br i1 %cond, label %IF, label %END
IF:
%data = load float, float addrspace(1)* %ptr
%gep = getelementptr float, float addrspace(1)* %ptr, i32 1
store float %data, float addrspace(1)* %gep
br label %END
END:
%tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
ret <4 x float> %tex
}
; Kill is performed in WQM mode so that uniform kill behaves correctly ...
;
;CHECK-LABEL: {{^}}test_kill_0:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: image_sample
;CHECK: s_and_b64 exec, exec, [[ORIG]]
;SI: buffer_store_dword
;VI: flat_store_dword
;CHECK: s_wqm_b64 exec, exec
;CHECK: v_cmpx_
;CHECK: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[ORIG]]
;SI: buffer_store_dword
;VI: flat_store_dword
;CHECK: s_mov_b64 exec, [[SAVE]]
;CHECK: image_sample
define <4 x float> @test_kill_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <2 x i32> %idx, <2 x float> %data, i32 %coord, i32 %coord2, float %z) #0 {
main_body:
%tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%idx.0 = extractelement <2 x i32> %idx, i32 0
%gep.0 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.0
%data.0 = extractelement <2 x float> %data, i32 0
store float %data.0, float addrspace(1)* %gep.0
call void @llvm.AMDGPU.kill(float %z)
%idx.1 = extractelement <2 x i32> %idx, i32 1
%gep.1 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.1
%data.1 = extractelement <2 x float> %data, i32 1
store float %data.1, float addrspace(1)* %gep.1
%tex2 = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord2, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%out = fadd <4 x float> %tex, %tex2
ret <4 x float> %out
}
; ... but only if WQM is necessary.
;
;CHECK-LABEL: {{^}}test_kill_1:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: image_sample
;CHECK: s_and_b64 exec, exec, [[ORIG]]
;SI: buffer_store_dword
;VI: flat_store_dword
;CHECK-NOT: wqm
;CHECK: v_cmpx_
define <4 x float> @test_kill_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %idx, float %data, i32 %coord, i32 %coord2, float %z) #0 {
main_body:
%tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%gep = getelementptr float, float addrspace(1)* %ptr, i32 %idx
store float %data, float addrspace(1)* %gep
call void @llvm.AMDGPU.kill(float %z)
ret <4 x float> %tex
}
declare void @llvm.amdgcn.image.store.v4i32(<4 x float>, <4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1
declare <4 x float> @llvm.amdgcn.image.load.v4i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #2
declare <4 x float> @llvm.SI.image.sample.i32(i32, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #3
declare <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #3
declare void @llvm.AMDGPU.kill(float)
declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
attributes #0 = { "ShaderType"="0" }
attributes #1 = { nounwind }
attributes #2 = { nounwind readonly }
attributes #3 = { nounwind readnone }