AMDGPU: Merge initial gfx9 support

llvm-svn: 295554
Matt Arsenault 2017-02-18 18:29:53 +00:00
parent 6d5dddb85f
commit e823d92f7f
24 changed files with 578 additions and 60 deletions


@ -47,7 +47,7 @@ FunctionPass *createSIDebuggerInsertNopsPass();
FunctionPass *createSIInsertWaitsPass();
FunctionPass *createAMDGPUCodeGenPreparePass(const GCNTargetMachine *TM = nullptr);
ModulePass *createAMDGPUAnnotateKernelFeaturesPass();
ModulePass *createAMDGPUAnnotateKernelFeaturesPass(const TargetMachine *TM = nullptr);
void initializeAMDGPUAnnotateKernelFeaturesPass(PassRegistry &);
extern char &AMDGPUAnnotateKernelFeaturesID;


@ -79,6 +79,12 @@ def FeatureUnalignedScratchAccess : SubtargetFeature<"unaligned-scratch-access",
"Support unaligned scratch loads and stores"
>;
def FeatureApertureRegs : SubtargetFeature<"aperture-regs",
"HasApertureRegs",
"true",
"Has Memory Aperture Base and Size Registers"
>;
// XNACK is disabled if SH_MEM_CONFIG.ADDRESS_MODE = GPUVM on chips that support
// XNACK. The current default kernel driver setting is:
// - graphics ring: XNACK disabled
@ -377,6 +383,15 @@ def FeatureVolcanicIslands : SubtargetFeatureGeneration<"VOLCANIC_ISLANDS",
]
>;
def FeatureGFX9 : SubtargetFeatureGeneration<"GFX9",
[FeatureFP64, FeatureLocalMemorySize65536,
FeatureWavefrontSize64, FeatureFlatAddressSpace, FeatureGCN,
FeatureGCN3Encoding, FeatureCIInsts, Feature16BitInsts,
FeatureSMemRealTime, FeatureScalarStores, FeatureInv2PiInlineImm,
FeatureApertureRegs
]
>;
class SubtargetFeatureISAVersion <int Major, int Minor, int Stepping,
list<SubtargetFeature> Implies>
: SubtargetFeature <
@ -429,6 +444,9 @@ def FeatureISAVersion8_1_0 : SubtargetFeatureISAVersion <8,1,0,
FeatureLDSBankCount16,
FeatureXNACK]>;
def FeatureISAVersion9_0_0 : SubtargetFeatureISAVersion <9,0,0,[]>;
def FeatureISAVersion9_0_1 : SubtargetFeatureISAVersion <9,0,1,[]>;
//===----------------------------------------------------------------------===//
// Debugger related subtarget features.
//===----------------------------------------------------------------------===//
@ -534,10 +552,10 @@ def isVI : Predicate <
"Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS">,
AssemblerPredicate<"FeatureGCN3Encoding">;
// TODO: Either change the name or simply use isCI!
def isCIVI : Predicate <
"Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS || "
"Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS"
>, AssemblerPredicate<"FeatureCIInsts">;
"Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS">,
AssemblerPredicate<"FeatureCIInsts">;
def HasFlatAddressSpace : Predicate<"Subtarget->hasFlatAddressSpace()">;


@ -13,6 +13,7 @@
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "llvm/ADT/Triple.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Instructions.h"
@ -26,6 +27,7 @@ namespace {
class AMDGPUAnnotateKernelFeatures : public ModulePass {
private:
const TargetMachine *TM;
static bool hasAddrSpaceCast(const Function &F);
void addAttrToCallers(Function *Intrin, StringRef AttrName);
@ -34,7 +36,8 @@ private:
public:
static char ID;
AMDGPUAnnotateKernelFeatures() : ModulePass(ID) { }
AMDGPUAnnotateKernelFeatures(const TargetMachine *TM_ = nullptr) :
ModulePass(ID), TM(TM_) {}
bool runOnModule(Module &M) override;
StringRef getPassName() const override {
return "AMDGPU Annotate Kernel Features";
@ -211,7 +214,9 @@ bool AMDGPUAnnotateKernelFeatures::runOnModule(Module &M) {
if (F.hasFnAttribute("amdgpu-queue-ptr"))
continue;
if (hasAddrSpaceCast(F))
bool HasApertureRegs =
TM && TM->getSubtarget<AMDGPUSubtarget>(F).hasApertureRegs();
if (!HasApertureRegs && hasAddrSpaceCast(F))
F.addFnAttr("amdgpu-queue-ptr");
}
}
@ -219,6 +224,6 @@ bool AMDGPUAnnotateKernelFeatures::runOnModule(Module &M) {
return Changed;
}
ModulePass *llvm::createAMDGPUAnnotateKernelFeaturesPass() {
return new AMDGPUAnnotateKernelFeatures();
ModulePass *llvm::createAMDGPUAnnotateKernelFeaturesPass(const TargetMachine *TM) {
return new AMDGPUAnnotateKernelFeatures(TM);
}


@ -381,6 +381,10 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
case AMDGPU::EXEC_HI:
case AMDGPU::SCC:
case AMDGPU::M0:
case AMDGPU::SRC_SHARED_BASE:
case AMDGPU::SRC_SHARED_LIMIT:
case AMDGPU::SRC_PRIVATE_BASE:
case AMDGPU::SRC_PRIVATE_LIMIT:
continue;
case AMDGPU::VCC:


@ -86,6 +86,7 @@ static SIEncodingFamily subtargetEncodingFamily(const AMDGPUSubtarget &ST) {
case AMDGPUSubtarget::SEA_ISLANDS:
return SIEncodingFamily::SI;
case AMDGPUSubtarget::VOLCANIC_ISLANDS:
case AMDGPUSubtarget::GFX9:
return SIEncodingFamily::VI;
// FIXME: This should never be called for r600 GPUs.


@ -93,6 +93,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
UnalignedScratchAccess(false),
UnalignedBufferAccess(false),
HasApertureRegs(false),
EnableXNACK(false),
TrapHandler(false),
DebuggerInsertNops(false),


@ -51,6 +51,7 @@ public:
SOUTHERN_ISLANDS,
SEA_ISLANDS,
VOLCANIC_ISLANDS,
GFX9,
};
enum {
@ -64,6 +65,8 @@ public:
ISAVersion8_0_3,
ISAVersion8_0_4,
ISAVersion8_1_0,
ISAVersion9_0_0,
ISAVersion9_0_1
};
enum TrapHandlerAbi {
@ -103,6 +106,7 @@ protected:
bool FlatForGlobal;
bool UnalignedScratchAccess;
bool UnalignedBufferAccess;
bool HasApertureRegs;
bool EnableXNACK;
bool TrapHandler;
bool DebuggerInsertNops;
@ -330,6 +334,10 @@ public:
return UnalignedScratchAccess;
}
bool hasApertureRegs() const {
return HasApertureRegs;
}
bool isTrapHandlerEnabled() const {
return TrapHandler;
}
@ -645,6 +653,14 @@ public:
return getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS;
}
bool hasSMovFedHazard() const {
return getGeneration() >= AMDGPUSubtarget::GFX9;
}
bool hasReadM0Hazard() const {
return getGeneration() >= AMDGPUSubtarget::GFX9;
}
unsigned getKernArgSegmentSize(const MachineFunction &MF, unsigned ExplicitArgBytes) const;
/// Return the maximum number of waves per SIMD for kernels using \p SGPRs SGPRs
@ -656,7 +672,13 @@ public:
/// \returns True if a waitcnt instruction is needed before a barrier instruction,
/// false otherwise.
bool needWaitcntBeforeBarrier() const {
return true;
return getGeneration() < GFX9;
}
/// \returns true if the flat_scratch register should be initialized with the
/// pointer to the wave's scratch memory rather than a size and offset.
bool flatScratchIsPointer() const {
return getGeneration() >= GFX9;
}
/// \returns SGPR allocation granularity supported by the subtarget.


@ -598,7 +598,8 @@ bool GCNPassConfig::addPreISel() {
// FIXME: We need to run a pass to propagate the attributes when calls are
// supported.
addPass(&AMDGPUAnnotateKernelFeaturesID);
const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine();
addPass(createAMDGPUAnnotateKernelFeaturesPass(&TM));
addPass(createStructurizeCFGPass(true)); // true -> SkipUniformRegions
addPass(createSinkingPass());
addPass(createSITypeRewriter());


@ -71,6 +71,18 @@ static bool isRFE(unsigned Opcode) {
return Opcode == AMDGPU::S_RFE_B64;
}
static bool isSMovRel(unsigned Opcode) {
return Opcode == AMDGPU::S_MOVRELS_B32 || Opcode == AMDGPU::S_MOVRELS_B64 ||
Opcode == AMDGPU::S_MOVRELD_B32 || Opcode == AMDGPU::S_MOVRELD_B64;
}
static bool isVInterp(unsigned Opcode) {
return Opcode == AMDGPU::V_INTERP_P1_F32 ||
Opcode == AMDGPU::V_INTERP_P1_F32_16bank ||
Opcode == AMDGPU::V_INTERP_P2_F32 ||
Opcode == AMDGPU::V_INTERP_MOV_F32;
}
static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
const MachineOperand *RegOp = TII->getNamedOperand(RegInstr,
AMDGPU::OpName::simm16);
@ -108,6 +120,13 @@ GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0)
return NoopHazard;
if ((isVInterp(MI->getOpcode()) || isSMovRel(MI->getOpcode())) &&
checkReadM0Hazards(MI) > 0)
return NoopHazard;
if (checkAnyInstHazards(MI) > 0)
return NoopHazard;
return NoHazard;
}
@ -116,11 +135,13 @@ unsigned GCNHazardRecognizer::PreEmitNoops(SUnit *SU) {
}
unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
int WaitStates = std::max(0, checkAnyInstHazards(MI));
if (SIInstrInfo::isSMRD(*MI))
return std::max(0, checkSMRDHazards(MI));
return std::max(WaitStates, checkSMRDHazards(MI));
if (SIInstrInfo::isVALU(*MI)) {
int WaitStates = std::max(0, checkVALUHazards(MI));
WaitStates = std::max(WaitStates, checkVALUHazards(MI));
if (SIInstrInfo::isVMEM(*MI))
WaitStates = std::max(WaitStates, checkVMEMHazards(MI));
@ -134,19 +155,25 @@ unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
if (isRWLane(MI->getOpcode()))
WaitStates = std::max(WaitStates, checkRWLaneHazards(MI));
if (isVInterp(MI->getOpcode()))
WaitStates = std::max(WaitStates, checkReadM0Hazards(MI));
return WaitStates;
}
if (isSGetReg(MI->getOpcode()))
return std::max(0, checkGetRegHazards(MI));
return std::max(WaitStates, checkGetRegHazards(MI));
if (isSSetReg(MI->getOpcode()))
return std::max(0, checkSetRegHazards(MI));
return std::max(WaitStates, checkSetRegHazards(MI));
if (isRFE(MI->getOpcode()))
return std::max(0, checkRFEHazards(MI));
return std::max(WaitStates, checkRFEHazards(MI));
return 0;
if (isSMovRel(MI->getOpcode()))
return std::max(WaitStates, checkReadM0Hazards(MI));
return WaitStates;
}
void GCNHazardRecognizer::EmitNoop() {
@ -508,3 +535,42 @@ int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) {
int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn);
return RFEWaitStates - WaitStatesNeeded;
}
int GCNHazardRecognizer::checkAnyInstHazards(MachineInstr *MI) {
if (MI->isDebugValue())
return 0;
const SIRegisterInfo *TRI = ST.getRegisterInfo();
if (!ST.hasSMovFedHazard())
return 0;
// Check for any instruction reading an SGPR after a write from
// s_mov_fed_b32.
int MovFedWaitStates = 1;
int WaitStatesNeeded = 0;
for (const MachineOperand &Use : MI->uses()) {
if (!Use.isReg() || TRI->isVGPR(MF.getRegInfo(), Use.getReg()))
continue;
auto IsHazardFn = [] (MachineInstr *MI) {
return MI->getOpcode() == AMDGPU::S_MOV_FED_B32;
};
int WaitStatesNeededForUse =
MovFedWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardFn);
WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
}
return WaitStatesNeeded;
}
int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) {
if (!ST.hasReadM0Hazard())
return 0;
const SIInstrInfo *TII = ST.getInstrInfo();
int SMovRelWaitStates = 1;
auto IsHazardFn = [TII] (MachineInstr *MI) {
return TII->isSALU(*MI);
};
return SMovRelWaitStates - getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn);
}
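
For reference, both new checks reduce to the same wait-state arithmetic: each hazard knows how many wait states the hardware requires after the offending instruction (one, for both the s_mov_fed_b32 and read-M0 hazards), the recognizer counts how many have already elapsed since that instruction, and the caller pads any shortfall with s_nop. Below is a minimal standalone sketch of that pattern; the names are illustrative, not the real GCNHazardRecognizer API.

#include <algorithm>
#include <cstdio>
#include <string>
#include <vector>

// Walk backwards over already-emitted opcodes; each one counts as a single
// elapsed wait state, mirroring getWaitStatesSinceDef in spirit.
static int waitStatesSince(const std::vector<std::string> &Emitted,
                           const std::string &HazardOpc) {
  int States = 0;
  for (auto It = Emitted.rbegin(); It != Emitted.rend(); ++It, ++States)
    if (*It == HazardOpc)
      return States;
  return 1 << 30; // no hazard found: effectively infinitely far away
}

int main() {
  const int MovFedWaitStates = 1; // required gap after s_mov_fed_b32
  std::vector<std::string> Emitted = {"s_mov_fed_b32"};
  int Noops = std::max(
      0, MovFedWaitStates - waitStatesSince(Emitted, "s_mov_fed_b32"));
  std::printf("noops needed: %d\n", Noops); // prints 1
}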


@ -52,6 +52,8 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer {
int checkVALUHazards(MachineInstr *VALU);
int checkRWLaneHazards(MachineInstr *RWLane);
int checkRFEHazards(MachineInstr *RFE);
int checkAnyInstHazards(MachineInstr *MI);
int checkReadM0Hazards(MachineInstr *SMovRel);
public:
GCNHazardRecognizer(const MachineFunction &MF);
// We can only issue one instruction per cycle.


@ -187,3 +187,10 @@ def : ProcessorModel<"gfx810", SIQuarterSpeedModel,
[FeatureISAVersion8_1_0]
>;
def : ProcessorModel<"gfx900", SIQuarterSpeedModel,
[FeatureGFX9, FeatureISAVersion9_0_0, FeatureLDSBankCount32]
>;
def : ProcessorModel<"gfx901", SIQuarterSpeedModel,
[FeatureGFX9, FeatureXNACK, FeatureISAVersion9_0_1, FeatureLDSBankCount32]
>;


@ -33,10 +33,12 @@ static ArrayRef<MCPhysReg> getAllSGPRs(const SISubtarget &ST,
ST.getMaxNumSGPRs(MF));
}
void SIFrameLowering::emitFlatScratchInit(const SIInstrInfo *TII,
const SIRegisterInfo* TRI,
void SIFrameLowering::emitFlatScratchInit(const SISubtarget &ST,
MachineFunction &MF,
MachineBasicBlock &MBB) const {
const SIInstrInfo *TII = ST.getInstrInfo();
const SIRegisterInfo* TRI = &TII->getRegisterInfo();
// We don't need this if we only have spills since there is no user facing
// scratch.
@ -59,16 +61,28 @@ void SIFrameLowering::emitFlatScratchInit(const SIInstrInfo *TII,
MRI.addLiveIn(FlatScratchInitReg);
MBB.addLiveIn(FlatScratchInitReg);
// Copy the size in bytes.
unsigned FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1);
BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::FLAT_SCR_LO)
.addReg(FlatScrInitHi, RegState::Kill);
unsigned FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0);
unsigned FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1);
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg();
// Do a 64-bit pointer add.
if (ST.flatScratchIsPointer()) {
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), AMDGPU::FLAT_SCR_LO)
.addReg(FlatScrInitLo)
.addReg(ScratchWaveOffsetReg);
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), AMDGPU::FLAT_SCR_HI)
.addReg(FlatScrInitHi)
.addImm(0);
return;
}
// Copy the size in bytes.
BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::FLAT_SCR_LO)
.addReg(FlatScrInitHi, RegState::Kill);
// Add wave offset in bytes to private base offset.
// See comment in AMDKernelCodeT.h for enable_sgpr_flat_scratch_init.
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo)
@ -229,7 +243,7 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
// emitted after frame indices are eliminated.
if (MF.getFrameInfo().hasStackObjects() && MFI->hasFlatScratchInit())
emitFlatScratchInit(TII, TRI, MF, MBB);
emitFlatScratchInit(ST, MF, MBB);
// We need to insert initialization of the scratch resource descriptor.
unsigned PreloadedScratchWaveOffsetReg = TRI->getPreloadedValue(


@ -36,8 +36,7 @@ public:
RegScavenger *RS = nullptr) const override;
private:
void emitFlatScratchInit(const SIInstrInfo *TII,
const SIRegisterInfo* TRI,
void emitFlatScratchInit(const SISubtarget &ST,
MachineFunction &MF,
MachineBasicBlock &MBB) const;


@ -532,7 +532,7 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
// in 8-bits, it can use a smaller encoding.
if (!isUInt<32>(AM.BaseOffs / 4))
return false;
} else if (Subtarget->getGeneration() == SISubtarget::VOLCANIC_ISLANDS) {
} else if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
// On VI, these use the SMEM format and the offset is 20-bit in bytes.
if (!isUInt<20>(AM.BaseOffs))
return false;
@ -2233,6 +2233,13 @@ SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
SDValue SITargetLowering::getSegmentAperture(unsigned AS,
SelectionDAG &DAG) const {
if (Subtarget->hasApertureRegs()) { // Read from Aperture Registers directly.
unsigned RegNo = (AS == AMDGPUAS::LOCAL_ADDRESS) ? AMDGPU::SRC_SHARED_BASE :
AMDGPU::SRC_PRIVATE_BASE;
return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, RegNo, MVT::i32);
}
SDLoc SL;
MachineFunction &MF = DAG.getMachineFunction();
SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();


@ -133,6 +133,12 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
reserveRegisterTuples(Reserved, AMDGPU::EXEC);
reserveRegisterTuples(Reserved, AMDGPU::FLAT_SCR);
// Reserve the memory aperture registers.
reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_BASE);
reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_LIMIT);
reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_BASE);
reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_LIMIT);
// Reserve Trap Handler registers - support is not implemented in Codegen.
reserveRegisterTuples(Reserved, AMDGPU::TBA);
reserveRegisterTuples(Reserved, AMDGPU::TMA);


@ -44,6 +44,11 @@ def EXEC : RegisterWithSubRegs<"EXEC", [EXEC_LO, EXEC_HI]>,
def SCC : SIReg<"scc", 253>;
def M0 : SIReg <"m0", 124>;
def SRC_SHARED_BASE : SIReg<"src_shared_base", 235>;
def SRC_SHARED_LIMIT : SIReg<"src_shared_limit", 236>;
def SRC_PRIVATE_BASE : SIReg<"src_private_base", 237>;
def SRC_PRIVATE_LIMIT : SIReg<"src_private_limit", 238>;
// Trap handler registers
def TBA_LO : SIReg<"tba_lo", 108>;
def TBA_HI : SIReg<"tba_hi", 109>;
@ -260,7 +265,8 @@ def VGPR_512 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7,
// See comments in SIInstructions.td for more info.
def SReg_32_XM0_XEXEC : RegisterClass<"AMDGPU", [i32, f32, i16, f16], 32,
(add SGPR_32, VCC_LO, VCC_HI, FLAT_SCR_LO, FLAT_SCR_HI,
TTMP_32, TMA_LO, TMA_HI, TBA_LO, TBA_HI)> {
TTMP_32, TMA_LO, TMA_HI, TBA_LO, TBA_HI, SRC_SHARED_BASE, SRC_SHARED_LIMIT,
SRC_PRIVATE_BASE, SRC_PRIVATE_LIMIT)> {
let AllocationPriority = 7;
}


@ -72,11 +72,11 @@ unsigned unpackBits(unsigned Src, unsigned Shift, unsigned Width) {
return (Src & getBitMask(Shift, Width)) >> Shift;
}
/// \returns Vmcnt bit shift.
unsigned getVmcntBitShift() { return 0; }
/// \returns Vmcnt bit shift (lower bits).
unsigned getVmcntBitShiftLo() { return 0; }
/// \returns Vmcnt bit width.
unsigned getVmcntBitWidth() { return 4; }
/// \returns Vmcnt bit width (lower bits).
unsigned getVmcntBitWidthLo() { return 4; }
/// \returns Expcnt bit shift.
unsigned getExpcntBitShift() { return 4; }
@ -90,6 +90,12 @@ unsigned getLgkmcntBitShift() { return 8; }
/// \returns Lgkmcnt bit width.
unsigned getLgkmcntBitWidth() { return 4; }
/// \returns Vmcnt bit shift (higher bits).
unsigned getVmcntBitShiftHi() { return 14; }
/// \returns Vmcnt bit width (higher bits).
unsigned getVmcntBitWidthHi() { return 2; }
} // end namespace anonymous
namespace llvm {
@ -120,6 +126,12 @@ IsaVersion getIsaVersion(const FeatureBitset &Features) {
if (Features.test(FeatureISAVersion8_1_0))
return {8, 1, 0};
// GFX9.
if (Features.test(FeatureISAVersion9_0_0))
return {9, 0, 0};
if (Features.test(FeatureISAVersion9_0_1))
return {9, 0, 1};
if (!Features.test(FeatureGCN) || Features.test(FeatureSouthernIslands))
return {0, 0, 0};
return {7, 0, 0};
@ -399,7 +411,12 @@ std::pair<int, int> getIntegerPairAttribute(const Function &F,
}
unsigned getVmcntBitMask(const IsaInfo::IsaVersion &Version) {
return (1 << getVmcntBitWidth()) - 1;
unsigned VmcntLo = (1 << getVmcntBitWidthLo()) - 1;
if (Version.Major < 9)
return VmcntLo;
unsigned VmcntHi = ((1 << getVmcntBitWidthHi()) - 1) << getVmcntBitWidthLo();
return VmcntLo | VmcntHi;
}
unsigned getExpcntBitMask(const IsaInfo::IsaVersion &Version) {
@ -411,14 +428,27 @@ unsigned getLgkmcntBitMask(const IsaInfo::IsaVersion &Version) {
}
unsigned getWaitcntBitMask(const IsaInfo::IsaVersion &Version) {
unsigned Vmcnt = getBitMask(getVmcntBitShift(), getVmcntBitWidth());
unsigned VmcntLo = getBitMask(getVmcntBitShiftLo(), getVmcntBitWidthLo());
unsigned Expcnt = getBitMask(getExpcntBitShift(), getExpcntBitWidth());
unsigned Lgkmcnt = getBitMask(getLgkmcntBitShift(), getLgkmcntBitWidth());
return Vmcnt | Expcnt | Lgkmcnt;
unsigned Waitcnt = VmcntLo | Expcnt | Lgkmcnt;
if (Version.Major < 9)
return Waitcnt;
unsigned VmcntHi = getBitMask(getVmcntBitShiftHi(), getVmcntBitWidthHi());
return Waitcnt | VmcntHi;
}
unsigned decodeVmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt) {
return unpackBits(Waitcnt, getVmcntBitShift(), getVmcntBitWidth());
unsigned VmcntLo =
unpackBits(Waitcnt, getVmcntBitShiftLo(), getVmcntBitWidthLo());
if (Version.Major < 9)
return VmcntLo;
unsigned VmcntHi =
unpackBits(Waitcnt, getVmcntBitShiftHi(), getVmcntBitWidthHi());
VmcntHi <<= getVmcntBitWidthLo();
return VmcntLo | VmcntHi;
}
unsigned decodeExpcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt) {
@ -438,7 +468,13 @@ void decodeWaitcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt,
unsigned encodeVmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt,
unsigned Vmcnt) {
return packBits(Vmcnt, Waitcnt, getVmcntBitShift(), getVmcntBitWidth());
Waitcnt =
packBits(Vmcnt, Waitcnt, getVmcntBitShiftLo(), getVmcntBitWidthLo());
if (Version.Major < 9)
return Waitcnt;
Vmcnt >>= getVmcntBitWidthLo();
return packBits(Vmcnt, Waitcnt, getVmcntBitShiftHi(), getVmcntBitWidthHi());
}
unsigned encodeExpcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt,


@ -216,7 +216,8 @@ unsigned decodeLgkmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt);
/// \p Lgkmcnt respectively.
///
/// \details \p Vmcnt, \p Expcnt and \p Lgkmcnt are decoded as follows:
/// \p Vmcnt = \p Waitcnt[3:0]
/// \p Vmcnt = \p Waitcnt[3:0] (pre-gfx9 only)
/// \p Vmcnt = \p Waitcnt[3:0] | \p Waitcnt[15:14] (gfx9+ only)
/// \p Expcnt = \p Waitcnt[6:4]
/// \p Lgkmcnt = \p Waitcnt[11:8]
void decodeWaitcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt,
@ -238,9 +239,11 @@ unsigned encodeLgkmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt,
/// \p Version.
///
/// \details \p Vmcnt, \p Expcnt and \p Lgkmcnt are encoded as follows:
/// Waitcnt[3:0] = \p Vmcnt
/// Waitcnt[6:4] = \p Expcnt
/// Waitcnt[11:8] = \p Lgkmcnt
/// Waitcnt[3:0] = \p Vmcnt (pre-gfx9 only)
/// Waitcnt[3:0] = \p Vmcnt[3:0] (gfx9+ only)
/// Waitcnt[6:4] = \p Expcnt
/// Waitcnt[11:8] = \p Lgkmcnt
/// Waitcnt[15:14] = \p Vmcnt[5:4] (gfx9+ only)
///
/// \returns Waitcnt with encoded \p Vmcnt, \p Expcnt and \p Lgkmcnt for given
/// isa \p Version.
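
The arithmetic behind this split can be checked in isolation. The following is a standalone sketch of the gfx9 vmcnt packing described above (the helper names are mine, not the AMDGPUBaseInfo API); it reproduces the simm16 value 0xCF7E that the gfx9 assembler test later in this commit emits for s_waitcnt vmcnt(62).

#include <cassert>
#include <cstdio>

// Waitcnt[3:0] = Vmcnt[3:0]; Waitcnt[15:14] = Vmcnt[5:4] (gfx9+ layout).
static unsigned encodeVmcntGfx9(unsigned Waitcnt, unsigned Vmcnt) {
  Waitcnt = (Waitcnt & ~0xFu) | (Vmcnt & 0xFu);
  Waitcnt = (Waitcnt & ~0xC000u) | (((Vmcnt >> 4) & 0x3u) << 14);
  return Waitcnt;
}

static unsigned decodeVmcntGfx9(unsigned Waitcnt) {
  return (Waitcnt & 0xFu) | (((Waitcnt >> 14) & 0x3u) << 4);
}

int main() {
  // Leave the other counters at "no wait": expcnt = 7 in [6:4],
  // lgkmcnt = 15 in [11:8].
  unsigned Waitcnt = (7u << 4) | (15u << 8);
  Waitcnt = encodeVmcntGfx9(Waitcnt, 62);
  assert(Waitcnt == 0xCF7E && decodeVmcntGfx9(Waitcnt) == 62);
  std::printf("simm16 = 0x%X\n", Waitcnt); // simm16 = 0xCF7E
}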


@ -1,14 +1,19 @@
; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=HSA %s
; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=HSA -check-prefix=CI %s
; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=HSA -check-prefix=GFX9 %s
; HSA-LABEL: {{^}}use_group_to_flat_addrspacecast:
; HSA: enable_sgpr_private_segment_buffer = 1
; HSA: enable_sgpr_dispatch_ptr = 0
; HSA: enable_sgpr_queue_ptr = 1
; CI: enable_sgpr_queue_ptr = 1
; GFX9: enable_sgpr_queue_ptr = 0
; HSA-DAG: s_load_dword [[PTR:s[0-9]+]], s[6:7], 0x0{{$}}
; HSA-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x10{{$}}
; CI-DAG: s_load_dword [[PTR:s[0-9]+]], s[6:7], 0x0{{$}}
; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x10{{$}}
; CI-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[APERTURE]]
; GFX9-DAG: s_load_dword [[PTR:s[0-9]+]], s[4:5], 0x0{{$}}
; GFX9-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], src_shared_base
; HSA-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[APERTURE]]
; HSA-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
; HSA-DAG: v_cmp_ne_u32_e64 vcc, [[PTR]], -1
@ -17,6 +22,12 @@
; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, [[K]]
; At most 2 digits. Make sure src_shared_base is not counted as a
; high-numbered SGPR.
; CI: NumSgprs: {{[0-9][0-9]+}}
; GFX9: NumSgprs: {{[0-9]+}}
define void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #0 {
%stof = addrspacecast i32 addrspace(3)* %ptr to i32 addrspace(4)*
store volatile i32 7, i32 addrspace(4)* %stof
@ -26,12 +37,16 @@ define void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #0 {
; HSA-LABEL: {{^}}use_private_to_flat_addrspacecast:
; HSA: enable_sgpr_private_segment_buffer = 1
; HSA: enable_sgpr_dispatch_ptr = 0
; HSA: enable_sgpr_queue_ptr = 1
; CI: enable_sgpr_queue_ptr = 1
; GFX9: enable_sgpr_queue_ptr = 0
; HSA-DAG: s_load_dword [[PTR:s[0-9]+]], s[6:7], 0x0{{$}}
; HSA-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x11{{$}}
; CI-DAG: s_load_dword [[PTR:s[0-9]+]], s[6:7], 0x0{{$}}
; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x11{{$}}
; CI-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[APERTURE]]
; GFX9-DAG: s_load_dword [[PTR:s[0-9]+]], s[4:5], 0x0{{$}}
; GFX9-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], src_private_base
; HSA-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[APERTURE]]
; HSA-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
; HSA-DAG: v_cmp_ne_u32_e64 vcc, [[PTR]], -1
@ -40,6 +55,9 @@ define void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #0 {
; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, [[K]]
; CI: NumSgprs: {{[0-9][0-9]+}}
; GFX9: NumSgprs: {{[0-9]+}}
define void @use_private_to_flat_addrspacecast(i32* %ptr) #0 {
%stof = addrspacecast i32* %ptr to i32 addrspace(4)*
store volatile i32 7, i32 addrspace(4)* %stof
@ -133,8 +151,10 @@ define void @use_flat_to_constant_addrspacecast(i32 addrspace(4)* %ptr) #0 {
}
; HSA-LABEL: {{^}}cast_0_group_to_flat_addrspacecast:
; HSA: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x10
; HSA-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[APERTURE]]
; CI: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x10
; CI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[APERTURE]]
; GFX9-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], src_shared_base
; HSA-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}}
; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, v[[K]]
@ -176,8 +196,11 @@ define void @cast_neg1_flat_to_group_addrspacecast() #0 {
}
; HSA-LABEL: {{^}}cast_0_private_to_flat_addrspacecast:
; HSA: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x11
; HSA-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[APERTURE]]
; CI: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x11
; CI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[APERTURE]]
; GFX9-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], src_private_base
; HSA-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}}
; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, v[[K]]
@ -226,9 +249,13 @@ end:
; Check for prologue initializing special SGPRs pointing to scratch.
; HSA-LABEL: {{^}}store_flat_scratch:
; HSA-DAG: s_mov_b32 flat_scratch_lo, s9
; HSA-DAG: s_add_u32 [[ADD:s[0-9]+]], s8, s11
; HSA: s_lshr_b32 flat_scratch_hi, [[ADD]], 8
; CI-DAG: s_mov_b32 flat_scratch_lo, s9
; CI-DAG: s_add_u32 [[ADD:s[0-9]+]], s8, s11
; CI: s_lshr_b32 flat_scratch_hi, [[ADD]], 8
; GFX9: s_add_u32 flat_scratch_lo, s6, s9
; GFX9: s_addc_u32 flat_scratch_hi, s7, 0
; HSA: flat_store_dword
; HSA: s_barrier
; HSA: flat_load_dword


@ -1,4 +1,5 @@
; RUN: llc -O0 -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=+amdgpu-debugger-reserve-regs -verify-machineinstrs < %s | FileCheck %s
; RUN: llc -O0 -mtriple=amdgcn--amdhsa -mcpu=gfx901 -mattr=+amdgpu-debugger-reserve-regs -verify-machineinstrs < %s | FileCheck %s
; CHECK: reserved_vgpr_first = {{[0-9]+}}
; CHECK-NEXT: reserved_vgpr_count = 4
; CHECK: ReservedVGPRFirst: {{[0-9]+}}


@ -13,6 +13,8 @@
; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx803 | FileCheck --check-prefix=HSA --check-prefix=HSA-VI803 %s
; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx804 | FileCheck --check-prefix=HSA --check-prefix=HSA-VI804 %s
; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx810 | FileCheck --check-prefix=HSA --check-prefix=HSA-VI810 %s
; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx900 | FileCheck --check-prefix=HSA --check-prefix=HSA-GFX900 %s
; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx901 | FileCheck --check-prefix=HSA --check-prefix=HSA-GFX901 %s
; HSA: .hsa_code_object_version 2,1
; HSA-CI700: .hsa_code_object_isa 7,0,0,"AMD","AMDGPU"
@ -24,3 +26,5 @@
; HSA-VI803: .hsa_code_object_isa 8,0,3,"AMD","AMDGPU"
; HSA-VI804: .hsa_code_object_isa 8,0,4,"AMD","AMDGPU"
; HSA-VI810: .hsa_code_object_isa 8,1,0,"AMD","AMDGPU"
; HSA-GFX900: .hsa_code_object_isa 9,0,0,"AMD","AMDGPU"
; HSA-GFX901: .hsa_code_object_isa 9,0,1,"AMD","AMDGPU"


@ -1,6 +1,7 @@
# RUN: llc -march=amdgcn -mcpu=tahiti -run-pass post-RA-hazard-rec %s -o - | FileCheck %s -check-prefixes=GCN
# RUN: llc -march=amdgcn -mcpu=hawaii -run-pass post-RA-hazard-rec %s -o - | FileCheck %s -check-prefixes=GCN,CIVI
# RUN: llc -march=amdgcn -mcpu=fiji -run-pass post-RA-hazard-rec %s -o - | FileCheck %s -check-prefixes=GCN,CIVI,VI
# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass post-RA-hazard-rec %s -o - | FileCheck %s -check-prefixes=GCN,CIVI,VI,GFX9
--- |
define void @div_fmas() { ret void }
@ -9,6 +10,37 @@
define void @vmem_gt_8dw_store() { ret void }
define void @readwrite_lane() { ret void }
define void @rfe() { ret void }
define void @s_mov_fed_b32() { ret void }
define void @s_movrel() { ret void }
define void @v_interp() { ret void }
define void @mov_fed_hazard_crash_on_dbg_value(i32 addrspace(1)* %A) {
entry:
%A.addr = alloca i32 addrspace(1)*, align 4
store i32 addrspace(1)* %A, i32 addrspace(1)** %A.addr, align 4
call void @llvm.dbg.declare(metadata i32 addrspace(1)** %A.addr, metadata !5, metadata !11), !dbg !12
ret void
}
declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4}
!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.9.0 (trunk 268929)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
!1 = !DIFile(filename: "test01.cl", directory: "/dev/null")
!2 = !{}
!3 = !{i32 2, !"Dwarf Version", i32 2}
!4 = !{i32 2, !"Debug Info Version", i32 3}
!5 = !DILocalVariable(name: "A", arg: 1, scope: !6, file: !1, line: 1, type: !9)
!6 = distinct !DISubprogram(name: "test", scope: !1, file: !1, line: 1, type: !7, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: false, unit: !0, variables: !2)
!7 = !DISubroutineType(types: !8)
!8 = !{null, !9}
!9 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !10, size: 64, align: 32)
!10 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
!11 = !DIExpression()
!12 = !DILocation(line: 1, column: 30, scope: !6)
...
---
# GCN-LABEL: name: div_fmas
@ -331,3 +363,185 @@ body: |
S_ENDPGM
...
---
# GCN-LABEL: name: s_mov_fed_b32
# GCN-LABEL: bb.0:
# GCN: S_MOV_FED_B32
# GFX9: S_NOP
# GCN-NEXT: S_MOV_B32
# GCN-LABEL: bb.1:
# GCN: S_MOV_FED_B32
# GFX9: S_NOP
# GCN-NEXT: V_MOV_B32
name: s_mov_fed_b32
body: |
bb.0:
successors: %bb.1
%sgpr0 = S_MOV_FED_B32 %sgpr0
%sgpr0 = S_MOV_B32 %sgpr0
S_BRANCH %bb.1
bb.1:
%sgpr0 = S_MOV_FED_B32 %sgpr0
%vgpr0 = V_MOV_B32_e32 %sgpr0, implicit %exec
S_ENDPGM
...
---
# GCN-LABEL: name: s_movrel
# GCN-LABEL: bb.0:
# GCN: S_MOV_B32
# GFX9: S_NOP
# GCN-NEXT: S_MOVRELS_B32
# GCN-LABEL: bb.1:
# GCN: S_MOV_B32
# GFX9: S_NOP
# GCN-NEXT: S_MOVRELS_B64
# GCN-LABEL: bb.2:
# GCN: S_MOV_B32
# GFX9: S_NOP
# GCN-NEXT: S_MOVRELD_B32
# GCN-LABEL: bb.3:
# GCN: S_MOV_B32
# GFX9: S_NOP
# GCN-NEXT: S_MOVRELD_B64
name: s_movrel
body: |
bb.0:
successors: %bb.1
%m0 = S_MOV_B32 0
%sgpr0 = S_MOVRELS_B32 %sgpr0, implicit %m0
S_BRANCH %bb.1
bb.1:
successors: %bb.2
%m0 = S_MOV_B32 0
%sgpr0_sgpr1 = S_MOVRELS_B64 %sgpr0_sgpr1, implicit %m0
S_BRANCH %bb.2
bb.2:
successors: %bb.3
%m0 = S_MOV_B32 0
%sgpr0 = S_MOVRELD_B32 %sgpr0, implicit %m0
S_BRANCH %bb.3
bb.3:
%m0 = S_MOV_B32 0
%sgpr0_sgpr1 = S_MOVRELD_B64 %sgpr0_sgpr1, implicit %m0
S_ENDPGM
...
---
# GCN-LABEL: name: v_interp
# GCN-LABEL: bb.0:
# GCN: S_MOV_B32
# GFX9: S_NOP
# GCN-NEXT: V_INTERP_P1_F32
# GCN-LABEL: bb.1:
# GCN: S_MOV_B32
# GFX9: S_NOP
# GCN-NEXT: V_INTERP_P2_F32
# GCN-LABEL: bb.2:
# GCN: S_MOV_B32
# GFX9: S_NOP
# GCN-NEXT: V_INTERP_P1_F32_16bank
# GCN-LABEL: bb.3:
# GCN: S_MOV_B32
# GFX9: S_NOP
# GCN-NEXT: V_INTERP_MOV_F32
name: v_interp
body: |
bb.0:
successors: %bb.1
%m0 = S_MOV_B32 0
%vgpr0 = V_INTERP_P1_F32 %vgpr0, 0, 0, implicit %m0, implicit %exec
S_BRANCH %bb.1
bb.1:
successors: %bb.2
%m0 = S_MOV_B32 0
%vgpr0 = V_INTERP_P2_F32 %vgpr0, %vgpr1, 0, 0, implicit %m0, implicit %exec
S_BRANCH %bb.2
bb.2:
successors: %bb.3
%m0 = S_MOV_B32 0
%vgpr0 = V_INTERP_P1_F32_16bank %vgpr0, 0, 0, implicit %m0, implicit %exec
S_BRANCH %bb.3
bb.3:
%m0 = S_MOV_B32 0
%vgpr0 = V_INTERP_MOV_F32 0, 0, 0, implicit %m0, implicit %exec
S_ENDPGM
...
---
name: mov_fed_hazard_crash_on_dbg_value
alignment: 0
exposesReturnsTwice: false
legalized: false
regBankSelected: false
selected: false
tracksRegLiveness: true
liveins:
- { reg: '%sgpr4_sgpr5' }
- { reg: '%sgpr6_sgpr7' }
- { reg: '%sgpr9' }
- { reg: '%sgpr0_sgpr1_sgpr2_sgpr3' }
frameInfo:
isFrameAddressTaken: false
isReturnAddressTaken: false
hasStackMap: false
hasPatchPoint: false
stackSize: 16
offsetAdjustment: 0
maxAlignment: 8
adjustsStack: false
hasCalls: false
maxCallFrameSize: 0
hasOpaqueSPAdjustment: false
hasVAStart: false
hasMustTailInVarArgFunc: false
stack:
- { id: 0, name: A.addr, offset: 0, size: 8, alignment: 8, local-offset: 0 }
- { id: 1, offset: 8, size: 4, alignment: 4 }
body: |
bb.0.entry:
liveins: %sgpr4_sgpr5, %sgpr6_sgpr7, %sgpr9, %sgpr0_sgpr1_sgpr2_sgpr3
%flat_scr_lo = S_ADD_U32 %sgpr6, %sgpr9, implicit-def %scc
%flat_scr_hi = S_ADDC_U32 %sgpr7, 0, implicit-def %scc, implicit %scc
DBG_VALUE _, 2, !5, !11, debug-location !12
%sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed %sgpr4_sgpr5, 0, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
dead %sgpr6_sgpr7 = KILL %sgpr4_sgpr5
%sgpr8 = S_MOV_B32 %sgpr5
%vgpr0 = V_MOV_B32_e32 killed %sgpr8, implicit %exec
BUFFER_STORE_DWORD_OFFSET %vgpr0, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr9, 4, 0, 0, 0, implicit %exec :: (store 4 into %ir.A.addr + 4)
%sgpr8 = S_MOV_B32 %sgpr4, implicit killed %sgpr4_sgpr5
%vgpr0 = V_MOV_B32_e32 killed %sgpr8, implicit %exec
BUFFER_STORE_DWORD_OFFSET %vgpr0, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr9, 0, 0, 0, 0, implicit %exec :: (store 4 into %ir.A.addr)
S_ENDPGM
...


@ -1,8 +1,11 @@
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX8 %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
; GCN-LABEL: {{^}}test_barrier:
; GCN: buffer_store_dword
; GCN: s_waitcnt
; GFX8: buffer_store_dword
; GFX8: s_waitcnt
; GFX9: flat_store_dword
; GFX9-NOT: s_waitcnt
; GCN: s_barrier
define void @test_barrier(i32 addrspace(1)* %out) #0 {
entry:


@ -0,0 +1,71 @@
// RUN: llvm-mc -arch=amdgcn -mcpu=gfx900 -show-encoding %s | FileCheck --check-prefix=GFX9 %s
//===----------------------------------------------------------------------===//
// s_waitcnt
//===----------------------------------------------------------------------===//
s_waitcnt 0
// GFX9: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
s_waitcnt vmcnt(0) & expcnt(0) & lgkmcnt(0)
// GFX9: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
// GFX9: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
s_waitcnt vmcnt(0), expcnt(0), lgkmcnt(0)
// GFX9: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
s_waitcnt vmcnt(1)
// GFX9: s_waitcnt vmcnt(1) ; encoding: [0x71,0x0f,0x8c,0xbf]
s_waitcnt vmcnt(9)
// GFX9: s_waitcnt vmcnt(9) ; encoding: [0x79,0x0f,0x8c,0xbf]
s_waitcnt expcnt(2)
// GFX9: s_waitcnt expcnt(2) ; encoding: [0x2f,0xcf,0x8c,0xbf]
s_waitcnt lgkmcnt(3)
// GFX9: s_waitcnt lgkmcnt(3) ; encoding: [0x7f,0xc3,0x8c,0xbf]
s_waitcnt lgkmcnt(9)
// GFX9: s_waitcnt lgkmcnt(9) ; encoding: [0x7f,0xc9,0x8c,0xbf]
s_waitcnt vmcnt(0), expcnt(0)
// GFX9: s_waitcnt vmcnt(0) expcnt(0) ; encoding: [0x00,0x0f,0x8c,0xbf]
s_waitcnt vmcnt(15)
// GFX9: s_waitcnt vmcnt(15) ; encoding: [0x7f,0x0f,0x8c,0xbf]
s_waitcnt vmcnt(15) expcnt(6)
// GFX9: s_waitcnt vmcnt(15) expcnt(6) ; encoding: [0x6f,0x0f,0x8c,0xbf]
s_waitcnt vmcnt(15) lgkmcnt(14)
// GFX9: s_waitcnt vmcnt(15) lgkmcnt(14) ; encoding: [0x7f,0x0e,0x8c,0xbf]
s_waitcnt vmcnt(15) expcnt(6) lgkmcnt(14)
// GFX9: s_waitcnt vmcnt(15) expcnt(6) lgkmcnt(14) ; encoding: [0x6f,0x0e,0x8c,0xbf]
s_waitcnt vmcnt(31)
// GFX9: s_waitcnt vmcnt(31) ; encoding: [0x7f,0x4f,0x8c,0xbf]
s_waitcnt vmcnt(31) expcnt(6)
// GFX9: s_waitcnt vmcnt(31) expcnt(6) ; encoding: [0x6f,0x4f,0x8c,0xbf]
s_waitcnt vmcnt(31) lgkmcnt(14)
// GFX9: s_waitcnt vmcnt(31) lgkmcnt(14) ; encoding: [0x7f,0x4e,0x8c,0xbf]
s_waitcnt vmcnt(31) expcnt(6) lgkmcnt(14)
// GFX9: s_waitcnt vmcnt(31) expcnt(6) lgkmcnt(14) ; encoding: [0x6f,0x4e,0x8c,0xbf]
s_waitcnt vmcnt(62)
// GFX9: s_waitcnt vmcnt(62) ; encoding: [0x7e,0xcf,0x8c,0xbf]
s_waitcnt vmcnt(62) expcnt(6)
// GFX9: s_waitcnt vmcnt(62) expcnt(6) ; encoding: [0x6e,0xcf,0x8c,0xbf]
s_waitcnt vmcnt(62) lgkmcnt(14)
// GFX9: s_waitcnt vmcnt(62) lgkmcnt(14) ; encoding: [0x7e,0xce,0x8c,0xbf]
s_waitcnt vmcnt(62) expcnt(6) lgkmcnt(14)
// GFX9: s_waitcnt vmcnt(62) expcnt(6) lgkmcnt(14) ; encoding: [0x6e,0xce,0x8c,0xbf]
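
Cross-checking one of the wide-vmcnt lines above by hand (using the same illustrative bit layout as the earlier sketch, not the LLVM utilities): the bytes [0x7f,0x4f] from the vmcnt(31) case form simm16 = 0x4F7F, which decodes back to vmcnt = 31 with expcnt and lgkmcnt at their no-wait maxima.

#include <cassert>

int main() {
  unsigned Simm16 = 0x4F7F; // from "s_waitcnt vmcnt(31)", little-endian bytes
  unsigned Vmcnt = (Simm16 & 0xFu) | (((Simm16 >> 14) & 0x3u) << 4);
  unsigned Expcnt = (Simm16 >> 4) & 0x7u;
  unsigned Lgkmcnt = (Simm16 >> 8) & 0xFu;
  assert(Vmcnt == 31 && Expcnt == 7 && Lgkmcnt == 15);
}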