AMDGPU: Add SIWholeQuadMode pass

Summary:
Whole quad mode is already enabled for pixel shaders that compute
derivatives, but it must be suspended for instructions that cause a
shader to have side effects (i.e. stores and atomics).

This pass addresses the issue by storing the real (initial) live mask
in a register, masking EXEC before instructions that require exact
execution and (re-)enabling WQM where required.
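
Schematically (these are the sequences documented in the new
SIWholeQuadMode.cpp; the annotations are just a sketch of the intent):

  S_MOV_B64 LiveMask, EXEC           ; prolog: save the real live mask
  S_WQM_B64 EXEC, EXEC               ; enter whole quad mode
  ...                                ; derivative computations etc.
  S_AND_SAVEEXEC_B64 Tmp, LiveMask   ; switch to exact execution
  ...                                ; stores, atomics
  S_MOV_B64 EXEC, Tmp                ; re-enter WQM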

This pass is run before register coalescing so that we can use
machine SSA for analysis.

The changes in this patch expose a problem with the second machine
scheduling pass: target-independent instructions like COPY implicitly
use EXEC when they operate on VGPRs, but this fact is not encoded in
the MIR. This can lead to miscompilation, because such instructions can
be moved across changes to EXEC.

This patch fixes the problem by treating instructions that modify EXEC
as scheduling boundaries (see SIInstrInfo::isSchedulingBoundary below),
so the scheduler can no longer move instructions past changes to EXEC.
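
As a made-up illustration of the hazard (the register names are
hypothetical), nothing previously stopped the scheduler from swapping a
pair such as

  %vgpr0 = COPY %vgpr1             ; implicitly uses EXEC, not encoded in MIR
  S_AND_B64 exec, exec, LiveMask   ; switch to exact execution

after which the COPY would write only the exactly-live lanes instead of
all lanes of the quads.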

Reviewers: arsenm, tstellarAMD, mareko

Subscribers: MatzeB, arsenm, llvm-commits

Differential Revision: http://reviews.llvm.org/D18162

llvm-svn: 263982
Nicolai Haehnle 2016-03-21 20:28:33 +00:00
parent b14f4fd0de
commit 213e87f2ee
10 changed files with 863 additions and 15 deletions

lib/Target/AMDGPU/AMDGPU.h

@@ -44,6 +44,7 @@ FunctionPass *createSIFoldOperandsPass();
 FunctionPass *createSILowerI1CopiesPass();
 FunctionPass *createSIShrinkInstructionsPass();
 FunctionPass *createSILoadStoreOptimizerPass(TargetMachine &tm);
+FunctionPass *createSIWholeQuadModePass();
 FunctionPass *createSILowerControlFlowPass();
 FunctionPass *createSIFixControlFlowLiveIntervalsPass();
 FunctionPass *createSIFixSGPRCopiesPass();
@@ -70,6 +71,9 @@ extern char &SILowerI1CopiesID;
 void initializeSILoadStoreOptimizerPass(PassRegistry &);
 extern char &SILoadStoreOptimizerID;
 
+void initializeSIWholeQuadModePass(PassRegistry &);
+extern char &SIWholeQuadModeID;
+
 void initializeSILowerControlFlowPass(PassRegistry &);
 extern char &SILowerControlFlowPassID;

lib/Target/AMDGPU/AMDGPUInstrInfo.h

@@ -62,7 +62,6 @@ public:
int64_t Offset1, int64_t Offset2,
unsigned NumLoads) const override;
/// \brief Return a target-specific opcode if Opcode is a pseudo instruction.
/// Return -1 if the target-specific opcode for the pseudo instruction does
/// not exist. If Opcode is not a pseudo instruction, this is identity.

lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

@@ -57,6 +57,7 @@ extern "C" void LLVMInitializeAMDGPUTarget() {
   initializeSIAnnotateControlFlowPass(*PR);
   initializeSIInsertNopsPass(*PR);
   initializeSIInsertWaitsPass(*PR);
+  initializeSIWholeQuadModePass(*PR);
   initializeSILowerControlFlowPass(*PR);
 }
@@ -346,6 +347,7 @@ void GCNPassConfig::addPreRegAlloc() {
     insertPass(&MachineSchedulerID, &RegisterCoalescerID);
   }
   addPass(createSIShrinkInstructionsPass(), false);
+  addPass(createSIWholeQuadModePass());
 }
 
 void GCNPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) {

lib/Target/AMDGPU/CMakeLists.txt

@@ -63,6 +63,7 @@ add_llvm_target(AMDGPUCodeGen
   SIRegisterInfo.cpp
   SIShrinkInstructions.cpp
   SITypeRewriter.cpp
+  SIWholeQuadMode.cpp
   )
 
 add_subdirectory(AsmParser)

lib/Target/AMDGPU/SIInstrInfo.cpp

@@ -1248,6 +1248,19 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB,
                  .addImm(0); // omod
 }
 
+bool SIInstrInfo::isSchedulingBoundary(const MachineInstr *MI,
+                                       const MachineBasicBlock *MBB,
+                                       const MachineFunction &MF) const {
+  // Target-independent instructions do not have an implicit-use of EXEC, even
+  // when they operate on VGPRs. Treating EXEC modifications as scheduling
+  // boundaries prevents incorrect movements of such instructions.
+  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+  if (MI->modifiesRegister(AMDGPU::EXEC, TRI))
+    return true;
+
+  return AMDGPUInstrInfo::isSchedulingBoundary(MI, MBB, MF);
+}
+
 bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
   int64_t SVal = Imm.getSExtValue();
   if (SVal >= -16 && SVal <= 64)

lib/Target/AMDGPU/SIInstrInfo.h

@@ -149,6 +149,10 @@ public:
                               MachineBasicBlock::iterator &MI,
                               LiveVariables *LV) const override;
 
+  bool isSchedulingBoundary(const MachineInstr *MI,
+                            const MachineBasicBlock *MBB,
+                            const MachineFunction &MF) const override;
+
   static bool isSALU(const MachineInstr &MI) {
     return MI.getDesc().TSFlags & SIInstrFlags::SALU;
   }

lib/Target/AMDGPU/SILowerControlFlow.cpp

@@ -78,7 +78,7 @@ private:
   void SkipIfDead(MachineInstr &MI);
 
   void If(MachineInstr &MI);
-  void Else(MachineInstr &MI);
+  void Else(MachineInstr &MI, bool ExecModified);
   void Break(MachineInstr &MI);
   void IfBreak(MachineInstr &MI);
   void ElseBreak(MachineInstr &MI);
@@ -215,7 +215,7 @@ void SILowerControlFlow::If(MachineInstr &MI) {
   MI.eraseFromParent();
 }
 
-void SILowerControlFlow::Else(MachineInstr &MI) {
+void SILowerControlFlow::Else(MachineInstr &MI, bool ExecModified) {
   MachineBasicBlock &MBB = *MI.getParent();
   DebugLoc DL = MI.getDebugLoc();
   unsigned Dst = MI.getOperand(0).getReg();
@@ -225,6 +225,15 @@ void SILowerControlFlow::Else(MachineInstr &MI) {
           TII->get(AMDGPU::S_OR_SAVEEXEC_B64), Dst)
           .addReg(Src); // Saved EXEC
 
+  if (ExecModified) {
+    // Adjust the saved exec to account for the modifications during the flow
+    // block that contains the ELSE. This can happen when WQM mode is switched
+    // off.
+    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_B64), Dst)
+            .addReg(AMDGPU::EXEC)
+            .addReg(Dst);
+  }
+
   BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
           .addReg(AMDGPU::EXEC)
           .addReg(Dst);
@@ -488,7 +497,6 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
 
   bool HaveKill = false;
-  bool NeedWQM = false;
   bool NeedFlat = false;
   unsigned Depth = 0;
@@ -498,17 +506,24 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
   MachineBasicBlock *EmptyMBBAtEnd = NULL;
 
     MachineBasicBlock &MBB = *BI;
     MachineBasicBlock::iterator I, Next;
+    bool ExecModified = false;
+
     for (I = MBB.begin(); I != MBB.end(); I = Next) {
       Next = std::next(I);
 
       MachineInstr &MI = *I;
-      if (TII->isWQM(MI) || TII->isDS(MI))
-        NeedWQM = true;
 
       // Flat uses m0 in case it needs to access LDS.
       if (TII->isFLAT(MI))
        NeedFlat = true;
 
+      for (const auto &Def : I->defs()) {
+        if (Def.isReg() && Def.isDef() && Def.getReg() == AMDGPU::EXEC) {
+          ExecModified = true;
+          break;
+        }
+      }
+
       switch (MI.getOpcode()) {
       default: break;
       case AMDGPU::SI_IF:
@@ -517,7 +532,7 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
         break;
 
       case AMDGPU::SI_ELSE:
-        Else(MI);
+        Else(MI, ExecModified);
         break;
 
       case AMDGPU::SI_BREAK:
@@ -599,12 +614,6 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
     }
   }
 
-  if (NeedWQM && MFI->getShaderType() == ShaderType::PIXEL) {
-    MachineBasicBlock &MBB = MF.front();
-    BuildMI(MBB, MBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
-            AMDGPU::EXEC).addReg(AMDGPU::EXEC);
-  }
-
   if (NeedFlat && MFI->IsKernel) {
     // TODO: What to use with function calls?
     // We will need to initialize the flat scratch register pair.

lib/Target/AMDGPU/SIRegisterInfo.h

@@ -72,9 +72,12 @@ public:
   }
 
   bool isSGPRReg(const MachineRegisterInfo &MRI, unsigned Reg) const {
+    const TargetRegisterClass *RC;
     if (TargetRegisterInfo::isVirtualRegister(Reg))
-      return isSGPRClass(MRI.getRegClass(Reg));
-    return getPhysRegClass(Reg);
+      RC = MRI.getRegClass(Reg);
+    else
+      RC = getPhysRegClass(Reg);
+    return isSGPRClass(RC);
   }
 
   /// \returns true if this class contains VGPR registers.

lib/Target/AMDGPU/SIWholeQuadMode.cpp (new file)

@@ -0,0 +1,465 @@
//===-- SIWholeQuadMode.cpp - enter and suspend whole quad mode -----------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief This pass adds instructions to enable whole quad mode for pixel
/// shaders.
///
/// Whole quad mode is required for derivative computations, but it interferes
/// with shader side effects (stores and atomics). This pass is run on the
/// scheduled machine IR but before register coalescing, so that machine SSA is
/// available for analysis. It ensures that WQM is enabled when necessary, but
/// disabled around stores and atomics.
///
/// When necessary, this pass creates a function prolog
///
/// S_MOV_B64 LiveMask, EXEC
/// S_WQM_B64 EXEC, EXEC
///
/// to enter WQM at the top of the function and surrounds blocks of Exact
/// instructions with
///
/// S_AND_SAVEEXEC_B64 Tmp, LiveMask
/// ...
/// S_MOV_B64 EXEC, Tmp
///
/// In order to avoid excessive switching during sequences of Exact
/// instructions, the pass first analyzes which instructions must be run in WQM
/// (i.e. which instructions produce values that lead to derivative
/// computations).
///
/// Basic blocks are always exited in WQM as long as some successor needs WQM.
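///
/// For example (a sketch; the wqm.ll test added in this patch exercises
/// concrete cases): if one branch of an if/else only stores (Exact) while
/// the join block samples a texture (WQM), the storing block has
/// OutNeeds = StateWQM and therefore restores the saved WQM exec mask
/// before branching to the join block.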
///
/// There is room for improvement given better control flow analysis:
///
/// (1) at the top level (outside of control flow statements, and as long as
/// kill hasn't been used), one SGPR can be saved by recovering WQM from
/// the LiveMask (this is implemented for the entry block).
///
/// (2) when entire regions (e.g. if-else blocks or entire loops) consist
/// only of Exact and don't-care instructions, the switch only has to
/// be done at the entry and exit points rather than potentially in each
/// block of the region.
///
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/MachineDominanceFrontier.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/Constants.h"
using namespace llvm;
#define DEBUG_TYPE "si-wqm"
namespace {
enum {
  StateWQM = 0x1,
  StateExact = 0x2,
};

struct InstrInfo {
  char Needs = 0;    // the state (StateWQM or StateExact) this instruction must run in
  char OutNeeds = 0; // states demanded by instructions reachable after this one
};

struct BlockInfo {
  char Needs = 0;    // union of the Needs of the instructions in the block
  char InNeeds = 0;  // states that must be available on entry to the block
  char OutNeeds = 0; // states that must still be available on exit
};
struct WorkItem {
const MachineBasicBlock *MBB = nullptr;
const MachineInstr *MI = nullptr;
WorkItem() {}
WorkItem(const MachineBasicBlock *MBB) : MBB(MBB) {}
WorkItem(const MachineInstr *MI) : MI(MI) {}
};
class SIWholeQuadMode : public MachineFunctionPass {
private:
const SIInstrInfo *TII;
const SIRegisterInfo *TRI;
MachineRegisterInfo *MRI;
DenseMap<const MachineInstr *, InstrInfo> Instructions;
DenseMap<const MachineBasicBlock *, BlockInfo> Blocks;
SmallVector<const MachineInstr *, 2> ExecExports;
char scanInstructions(const MachineFunction &MF, std::vector<WorkItem>& Worklist);
void propagateInstruction(const MachineInstr &MI, std::vector<WorkItem>& Worklist);
void propagateBlock(const MachineBasicBlock &MBB, std::vector<WorkItem>& Worklist);
char analyzeFunction(const MachineFunction &MF);
void toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
unsigned SaveWQM, unsigned LiveMaskReg);
void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
unsigned SavedWQM);
void processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, bool isEntry);
public:
static char ID;
SIWholeQuadMode() :
MachineFunctionPass(ID) { }
bool runOnMachineFunction(MachineFunction &MF) override;
const char *getPassName() const override {
return "SI Whole Quad Mode";
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
MachineFunctionPass::getAnalysisUsage(AU);
}
};
} // End anonymous namespace
char SIWholeQuadMode::ID = 0;
INITIALIZE_PASS_BEGIN(SIWholeQuadMode, DEBUG_TYPE,
"SI Whole Quad Mode", false, false)
INITIALIZE_PASS_END(SIWholeQuadMode, DEBUG_TYPE,
"SI Whole Quad Mode", false, false)
char &llvm::SIWholeQuadModeID = SIWholeQuadMode::ID;
FunctionPass *llvm::createSIWholeQuadModePass() {
return new SIWholeQuadMode;
}
// Scan instructions to determine which ones require an Exact execmask and
// which ones seed WQM requirements.
char SIWholeQuadMode::scanInstructions(const MachineFunction &MF,
std::vector<WorkItem> &Worklist) {
char GlobalFlags = 0;
for (auto BI = MF.begin(), BE = MF.end(); BI != BE; ++BI) {
const MachineBasicBlock &MBB = *BI;
for (auto II = MBB.begin(), IE = MBB.end(); II != IE; ++II) {
const MachineInstr &MI = *II;
unsigned Opcode = MI.getOpcode();
char Flags;
if (TII->isWQM(Opcode) || TII->isDS(Opcode)) {
Flags = StateWQM;
} else if (TII->get(Opcode).mayStore() &&
(MI.getDesc().TSFlags & SIInstrFlags::VM_CNT)) {
Flags = StateExact;
} else {
// Handle export instructions with the exec mask valid flag set
if (Opcode == AMDGPU::EXP && MI.getOperand(4).getImm() != 0)
ExecExports.push_back(&MI);
continue;
}
Instructions[&MI].Needs = Flags;
Worklist.push_back(&MI);
GlobalFlags |= Flags;
}
}
return GlobalFlags;
}
void SIWholeQuadMode::propagateInstruction(const MachineInstr &MI,
std::vector<WorkItem>& Worklist) {
const MachineBasicBlock &MBB = *MI.getParent();
InstrInfo &II = Instructions[&MI];
BlockInfo &BI = Blocks[&MBB];
// Control flow-type instructions that are followed by WQM computations
// must themselves be in WQM.
if ((II.OutNeeds & StateWQM) && !(II.Needs & StateWQM) &&
(MI.isBranch() || MI.isTerminator() || MI.getOpcode() == AMDGPU::SI_KILL))
II.Needs = StateWQM;
// Propagate to block level
BI.Needs |= II.Needs;
if ((BI.InNeeds | II.Needs) != BI.InNeeds) {
BI.InNeeds |= II.Needs;
Worklist.push_back(&MBB);
}
// Propagate backwards within block
if (const MachineInstr *PrevMI = MI.getPrevNode()) {
char InNeeds = II.Needs | II.OutNeeds;
if (!PrevMI->isPHI()) {
InstrInfo &PrevII = Instructions[PrevMI];
if ((PrevII.OutNeeds | InNeeds) != PrevII.OutNeeds) {
PrevII.OutNeeds |= InNeeds;
Worklist.push_back(PrevMI);
}
}
}
// Propagate WQM flag to instruction inputs
assert(II.Needs != (StateWQM | StateExact));
if (II.Needs != StateWQM)
return;
for (const MachineOperand &Use : MI.uses()) {
if (!Use.isReg() || !Use.isUse())
continue;
// At this point, physical registers appear as inputs or outputs
// and following them makes no sense (and would in fact be incorrect
// when the same VGPR is used as both an output and an input that leads
// to a NeedsWQM instruction).
//
// Note: VCC appears e.g. in 64-bit addition with carry. In theory we would
// have to trace it as well, but in practice it only shows up in 64-bit
// computations such as pointers, where both dwords are already followed
// anyway.
if (!TargetRegisterInfo::isVirtualRegister(Use.getReg()))
continue;
for (const MachineOperand &Def : MRI->def_operands(Use.getReg())) {
const MachineInstr *DefMI = Def.getParent();
InstrInfo &DefII = Instructions[DefMI];
// Obviously skip if DefMI is already flagged as NeedWQM.
//
// The instruction might also be flagged as NeedExact. This happens when
// the result of an atomic is used in a WQM computation. In this case,
// the atomic must not run for helper pixels and the WQM result is
// undefined.
if (DefII.Needs != 0)
continue;
DefII.Needs = StateWQM;
Worklist.push_back(DefMI);
}
}
}
void SIWholeQuadMode::propagateBlock(const MachineBasicBlock &MBB,
std::vector<WorkItem>& Worklist) {
BlockInfo &BI = Blocks[&MBB];
// Propagate through instructions
if (!MBB.empty()) {
const MachineInstr *LastMI = &*MBB.rbegin();
InstrInfo &LastII = Instructions[LastMI];
if ((LastII.OutNeeds | BI.OutNeeds) != LastII.OutNeeds) {
LastII.OutNeeds |= BI.OutNeeds;
Worklist.push_back(LastMI);
}
}
// Predecessor blocks must provide for our WQM/Exact needs.
for (const MachineBasicBlock *Pred : MBB.predecessors()) {
BlockInfo &PredBI = Blocks[Pred];
if ((PredBI.OutNeeds | BI.InNeeds) == PredBI.OutNeeds)
continue;
PredBI.OutNeeds |= BI.InNeeds;
PredBI.InNeeds |= BI.InNeeds;
Worklist.push_back(Pred);
}
// All successors must be prepared to accept the same set of WQM/Exact
// data.
for (const MachineBasicBlock *Succ : MBB.successors()) {
BlockInfo &SuccBI = Blocks[Succ];
if ((SuccBI.InNeeds | BI.OutNeeds) == SuccBI.InNeeds)
continue;
SuccBI.InNeeds |= BI.OutNeeds;
Worklist.push_back(Succ);
}
}
char SIWholeQuadMode::analyzeFunction(const MachineFunction &MF) {
std::vector<WorkItem> Worklist;
char GlobalFlags = scanInstructions(MF, Worklist);
while (!Worklist.empty()) {
WorkItem WI = Worklist.back();
Worklist.pop_back();
if (WI.MI)
propagateInstruction(*WI.MI, Worklist);
else
propagateBlock(*WI.MBB, Worklist);
}
return GlobalFlags;
}
void SIWholeQuadMode::toExact(MachineBasicBlock &MBB,
MachineBasicBlock::iterator Before,
unsigned SaveWQM, unsigned LiveMaskReg)
{
if (SaveWQM) {
BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_AND_SAVEEXEC_B64),
SaveWQM)
.addReg(LiveMaskReg);
} else {
BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_AND_B64),
AMDGPU::EXEC)
.addReg(AMDGPU::EXEC)
.addReg(LiveMaskReg);
}
}
void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB,
MachineBasicBlock::iterator Before,
unsigned SavedWQM)
{
if (SavedWQM) {
BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::EXEC)
.addReg(SavedWQM);
} else {
BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
AMDGPU::EXEC)
.addReg(AMDGPU::EXEC);
}
}
void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
bool isEntry) {
auto BII = Blocks.find(&MBB);
if (BII == Blocks.end())
return;
const BlockInfo &BI = BII->second;
if (!(BI.InNeeds & StateWQM))
return;
// This is a non-entry block that is WQM throughout, so no need to do
// anything.
if (!isEntry && !(BI.Needs & StateExact) && BI.OutNeeds != StateExact)
return;
unsigned SavedWQMReg = 0;
bool WQMFromExec = isEntry;
char State = isEntry ? StateExact : StateWQM;
auto II = MBB.getFirstNonPHI(), IE = MBB.end();
while (II != IE) {
MachineInstr &MI = *II;
++II;
// Skip instructions that are not affected by EXEC
if (MI.getDesc().TSFlags & (SIInstrFlags::SALU | SIInstrFlags::SMRD) &&
!MI.isBranch() && !MI.isTerminator())
continue;
// Generic instructions such as COPY will either disappear by register
// coalescing or be lowered to SALU or VALU instructions.
if (TargetInstrInfo::isGenericOpcode(MI.getOpcode())) {
if (MI.getNumExplicitOperands() >= 1) {
const MachineOperand &Op = MI.getOperand(0);
if (Op.isReg()) {
if (TRI->isSGPRReg(*MRI, Op.getReg())) {
// SGPR instructions are not affected by EXEC
continue;
}
}
}
}
char Needs = 0;
char OutNeeds = 0;
auto InstrInfoIt = Instructions.find(&MI);
if (InstrInfoIt != Instructions.end()) {
Needs = InstrInfoIt->second.Needs;
OutNeeds = InstrInfoIt->second.OutNeeds;
// Make sure to switch to Exact mode before the end of the block when
// Exact and only Exact is needed further downstream.
if (OutNeeds == StateExact && (MI.isBranch() || MI.isTerminator())) {
assert(Needs == 0);
Needs = StateExact;
}
}
// State switching
if (Needs && State != Needs) {
if (Needs == StateExact) {
assert(!SavedWQMReg);
if (!WQMFromExec && (OutNeeds & StateWQM))
SavedWQMReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
toExact(MBB, &MI, SavedWQMReg, LiveMaskReg);
} else {
assert(WQMFromExec == (SavedWQMReg == 0));
toWQM(MBB, &MI, SavedWQMReg);
SavedWQMReg = 0;
}
State = Needs;
}
if (MI.getOpcode() == AMDGPU::SI_KILL)
WQMFromExec = false;
}
if ((BI.OutNeeds & StateWQM) && State != StateWQM) {
assert(WQMFromExec == (SavedWQMReg == 0));
toWQM(MBB, MBB.end(), SavedWQMReg);
} else if (BI.OutNeeds == StateExact && State != StateExact) {
toExact(MBB, MBB.end(), 0, LiveMaskReg);
}
}
bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
if (MFI->getShaderType() != ShaderType::PIXEL)
return false;
Instructions.clear();
Blocks.clear();
ExecExports.clear();
TII = static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
TRI = static_cast<const SIRegisterInfo *>(MF.getSubtarget().getRegisterInfo());
MRI = &MF.getRegInfo();
char GlobalFlags = analyzeFunction(MF);
if (!(GlobalFlags & StateWQM))
return false;
MachineBasicBlock &Entry = MF.front();
MachineInstr *EntryMI = Entry.getFirstNonPHI();
if (GlobalFlags == StateWQM) {
// For a shader that needs only WQM, we can just set it once.
BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
AMDGPU::EXEC).addReg(AMDGPU::EXEC);
return true;
}
// Handle the general case
unsigned LiveMaskReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::COPY), LiveMaskReg)
.addReg(AMDGPU::EXEC);
for (const auto &BII : Blocks)
processBlock(const_cast<MachineBasicBlock &>(*BII.first), LiveMaskReg,
BII.first == &*MF.begin());
return true;
}

test/CodeGen/AMDGPU/wqm.ll (new file)

@@ -0,0 +1,348 @@
;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix=SI
;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix=VI
; Check that WQM isn't triggered by image load/store intrinsics.
;
;CHECK-LABEL: {{^}}test1:
;CHECK-NOT: s_wqm
define <4 x float> @test1(<8 x i32> inreg %rsrc, <4 x i32> %c) #0 {
main_body:
%tex = call <4 x float> @llvm.amdgcn.image.load.v4i32(<4 x i32> %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0)
call void @llvm.amdgcn.image.store.v4i32(<4 x float> %tex, <4 x i32> %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0)
ret <4 x float> %tex
}
; Check that WQM is triggered by image samples and left untouched for loads...
;
;CHECK-LABEL: {{^}}test2:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: image_sample
;CHECK-NOT: exec
;CHECK: _load_dword v0,
define float @test2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <4 x i32> %c) #0 {
main_body:
%c.1 = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%c.2 = bitcast <4 x float> %c.1 to <4 x i32>
%c.3 = extractelement <4 x i32> %c.2, i32 0
%gep = getelementptr float, float addrspace(1)* %ptr, i32 %c.3
%data = load float, float addrspace(1)* %gep
ret float %data
}
; ... but disabled for stores (and, in this simple case, not re-enabled).
;
;CHECK-LABEL: {{^}}test3:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: image_sample
;CHECK: s_and_b64 exec, exec, [[ORIG]]
;CHECK: store
;CHECK-NOT: exec
define <4 x float> @test3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <4 x i32> %c) #0 {
main_body:
%tex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%tex.1 = bitcast <4 x float> %tex to <4 x i32>
%tex.2 = extractelement <4 x i32> %tex.1, i32 0
%gep = getelementptr float, float addrspace(1)* %ptr, i32 %tex.2
%wr = extractelement <4 x float> %tex, i32 1
store float %wr, float addrspace(1)* %gep
ret <4 x float> %tex
}
; Check that WQM is re-enabled when required.
;
;CHECK-LABEL: {{^}}test4:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: v_mul_lo_i32 [[MUL:v[0-9]+]], v0, v1
;CHECK: s_and_b64 exec, exec, [[ORIG]]
;CHECK: store
;CHECK: s_wqm_b64 exec, exec
;CHECK: image_sample v[0:3], [[MUL]], s[0:7], s[8:11] dmask:0xf
define <4 x float> @test4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %c, i32 %d, float %data) #0 {
main_body:
%c.1 = mul i32 %c, %d
%gep = getelementptr float, float addrspace(1)* %ptr, i32 %c.1
store float %data, float addrspace(1)* %gep
%tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %c.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
ret <4 x float> %tex
}
; Check a case of one branch of an if-else requiring WQM, the other requiring
; exact.
;
; Note: In this particular case, the save-and-restore could be avoided if the
; analysis understood that the two branches of the if-else are mutually
; exclusive.
;
;CHECK-LABEL: {{^}}test_control_flow_0:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: %ELSE
;CHECK: s_and_saveexec_b64 [[SAVED:s\[[0-9]+:[0-9]+\]]], [[ORIG]]
;CHECK: store
;CHECK: s_mov_b64 exec, [[SAVED]]
;CHECK: %IF
;CHECK: image_sample
define float @test_control_flow_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %c, i32 %z, float %data) #0 {
main_body:
%cmp = icmp eq i32 %z, 0
br i1 %cmp, label %IF, label %ELSE
IF:
%tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%data.if = extractelement <4 x float> %tex, i32 0
br label %END
ELSE:
%gep = getelementptr float, float addrspace(1)* %ptr, i32 %c
store float %data, float addrspace(1)* %gep
br label %END
END:
%r = phi float [ %data.if, %IF ], [ %data, %ELSE ]
ret float %r
}
; Reverse branch order compared to the previous test.
;
;CHECK-LABEL: {{^}}test_control_flow_1:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: %IF
;CHECK: image_sample
;CHECK: %Flow
;CHECK-NEXT: s_or_saveexec_b64 [[SAVED:s\[[0-9]+:[0-9]+\]]],
;CHECK-NEXT: s_and_b64 exec, exec, [[ORIG]]
;CHECK-NEXT: s_and_b64 [[SAVED]], exec, [[SAVED]]
;CHECK-NEXT: s_xor_b64 exec, exec, [[SAVED]]
;CHECK-NEXT: %ELSE
;CHECK: store
;CHECK: %END
define float @test_control_flow_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %c, i32 %z, float %data) #0 {
main_body:
%cmp = icmp eq i32 %z, 0
br i1 %cmp, label %ELSE, label %IF
IF:
%tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%data.if = extractelement <4 x float> %tex, i32 0
br label %END
ELSE:
%gep = getelementptr float, float addrspace(1)* %ptr, i32 %c
store float %data, float addrspace(1)* %gep
br label %END
END:
%r = phi float [ %data.if, %IF ], [ %data, %ELSE ]
ret float %r
}
; Check that branch conditions are properly marked as needing WQM...
;
;CHECK-LABEL: {{^}}test_control_flow_2:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: s_and_b64 exec, exec, [[ORIG]]
;CHECK: store
;CHECK: s_wqm_b64 exec, exec
;CHECK: load
;CHECK: s_and_b64 exec, exec, [[ORIG]]
;CHECK: store
;CHECK: s_wqm_b64 exec, exec
;CHECK: v_cmp
define <4 x float> @test_control_flow_2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <3 x i32> %idx, <2 x float> %data, i32 %coord) #0 {
main_body:
%idx.1 = extractelement <3 x i32> %idx, i32 0
%gep.1 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.1
%data.1 = extractelement <2 x float> %data, i32 0
store float %data.1, float addrspace(1)* %gep.1
; The load that determines the branch (and should therefore be WQM) is
; surrounded by stores that require disabled WQM.
%idx.2 = extractelement <3 x i32> %idx, i32 1
%gep.2 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.2
%z = load float, float addrspace(1)* %gep.2
%idx.3 = extractelement <3 x i32> %idx, i32 2
%gep.3 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.3
%data.3 = extractelement <2 x float> %data, i32 1
store float %data.3, float addrspace(1)* %gep.3
%cc = fcmp ogt float %z, 0.0
br i1 %cc, label %IF, label %ELSE
IF:
%coord.IF = mul i32 %coord, 3
br label %END
ELSE:
%coord.ELSE = mul i32 %coord, 4
br label %END
END:
%coord.END = phi i32 [ %coord.IF, %IF ], [ %coord.ELSE, %ELSE ]
%tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord.END, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
ret <4 x float> %tex
}
; ... but only if they really do need it.
;
;CHECK-LABEL: {{^}}test_control_flow_3:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: image_sample
;CHECK: s_and_b64 exec, exec, [[ORIG]]
;CHECK: store
;CHECK: load
;CHECK: store
;CHECK: v_cmp
define float @test_control_flow_3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <3 x i32> %idx, <2 x float> %data, i32 %coord) #0 {
main_body:
%tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%tex.1 = extractelement <4 x float> %tex, i32 0
%idx.1 = extractelement <3 x i32> %idx, i32 0
%gep.1 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.1
%data.1 = extractelement <2 x float> %data, i32 0
store float %data.1, float addrspace(1)* %gep.1
%idx.2 = extractelement <3 x i32> %idx, i32 1
%gep.2 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.2
%z = load float, float addrspace(1)* %gep.2
%idx.3 = extractelement <3 x i32> %idx, i32 2
%gep.3 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.3
%data.3 = extractelement <2 x float> %data, i32 1
store float %data.3, float addrspace(1)* %gep.3
%cc = fcmp ogt float %z, 0.0
br i1 %cc, label %IF, label %ELSE
IF:
%tex.IF = fmul float %tex.1, 3.0
br label %END
ELSE:
%tex.ELSE = fmul float %tex.1, 4.0
br label %END
END:
%tex.END = phi float [ %tex.IF, %IF ], [ %tex.ELSE, %ELSE ]
ret float %tex.END
}
; Another test that failed at some point because of terminator handling.
;
;CHECK-LABEL: {{^}}test_control_flow_4:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: %IF
;CHECK: load
;CHECK: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[ORIG]]
;CHECK: store
;CHECK: s_mov_b64 exec, [[SAVE]]
;CHECK: %END
;CHECK: image_sample
define <4 x float> @test_control_flow_4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %coord, i32 %y, float %z) #0 {
main_body:
%cond = icmp eq i32 %y, 0
br i1 %cond, label %IF, label %END
IF:
%data = load float, float addrspace(1)* %ptr
%gep = getelementptr float, float addrspace(1)* %ptr, i32 1
store float %data, float addrspace(1)* %gep
br label %END
END:
%tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
ret <4 x float> %tex
}
; Kill is performed in WQM mode so that uniform kill behaves correctly ...
;
;CHECK-LABEL: {{^}}test_kill_0:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: image_sample
;CHECK: s_and_b64 exec, exec, [[ORIG]]
;SI: buffer_store_dword
;VI: flat_store_dword
;CHECK: s_wqm_b64 exec, exec
;CHECK: v_cmpx_
;CHECK: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[ORIG]]
;SI: buffer_store_dword
;VI: flat_store_dword
;CHECK: s_mov_b64 exec, [[SAVE]]
;CHECK: image_sample
define <4 x float> @test_kill_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <2 x i32> %idx, <2 x float> %data, i32 %coord, i32 %coord2, float %z) #0 {
main_body:
%tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%idx.0 = extractelement <2 x i32> %idx, i32 0
%gep.0 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.0
%data.0 = extractelement <2 x float> %data, i32 0
store float %data.0, float addrspace(1)* %gep.0
call void @llvm.AMDGPU.kill(float %z)
%idx.1 = extractelement <2 x i32> %idx, i32 1
%gep.1 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.1
%data.1 = extractelement <2 x float> %data, i32 1
store float %data.1, float addrspace(1)* %gep.1
%tex2 = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord2, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%out = fadd <4 x float> %tex, %tex2
ret <4 x float> %out
}
; ... but only if WQM is necessary.
;
;CHECK-LABEL: {{^}}test_kill_1:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: image_sample
;CHECK: s_and_b64 exec, exec, [[ORIG]]
;SI: buffer_store_dword
;VI: flat_store_dword
;CHECK-NOT: wqm
;CHECK: v_cmpx_
define <4 x float> @test_kill_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %idx, float %data, i32 %coord, i32 %coord2, float %z) #0 {
main_body:
%tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%gep = getelementptr float, float addrspace(1)* %ptr, i32 %idx
store float %data, float addrspace(1)* %gep
call void @llvm.AMDGPU.kill(float %z)
ret <4 x float> %tex
}
declare void @llvm.amdgcn.image.store.v4i32(<4 x float>, <4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1
declare <4 x float> @llvm.amdgcn.image.load.v4i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #2
declare <4 x float> @llvm.SI.image.sample.i32(i32, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #3
declare <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #3
declare void @llvm.AMDGPU.kill(float)
declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
attributes #0 = { "ShaderType"="0" }
attributes #1 = { nounwind }
attributes #2 = { nounwind readonly }
attributes #3 = { nounwind readnone }