diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index c06c9f30a5fa..3e269fad71fa 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -47,7 +47,7 @@ FunctionPass *createSIDebuggerInsertNopsPass(); FunctionPass *createSIInsertWaitsPass(); FunctionPass *createAMDGPUCodeGenPreparePass(const GCNTargetMachine *TM = nullptr); -ModulePass *createAMDGPUAnnotateKernelFeaturesPass(); +ModulePass *createAMDGPUAnnotateKernelFeaturesPass(const TargetMachine *TM = nullptr); void initializeAMDGPUAnnotateKernelFeaturesPass(PassRegistry &); extern char &AMDGPUAnnotateKernelFeaturesID; diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index fbe760665125..d97c89df52ff 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -79,6 +79,12 @@ def FeatureUnalignedScratchAccess : SubtargetFeature<"unaligned-scratch-access", "Support unaligned scratch loads and stores" >; +def FeatureApertureRegs : SubtargetFeature<"aperture-regs", + "HasApertureRegs", + "true", + "Has Memory Aperture Base and Size Registers" +>; + // XNACK is disabled if SH_MEM_CONFIG.ADDRESS_MODE = GPUVM on chips that support // XNACK. 
The current default kernel driver setting is: // - graphics ring: XNACK disabled @@ -377,6 +383,15 @@ def FeatureVolcanicIslands : SubtargetFeatureGeneration<"VOLCANIC_ISLANDS", ] >; +def FeatureGFX9 : SubtargetFeatureGeneration<"GFX9", + [FeatureFP64, FeatureLocalMemorySize65536, + FeatureWavefrontSize64, FeatureFlatAddressSpace, FeatureGCN, + FeatureGCN3Encoding, FeatureCIInsts, Feature16BitInsts, + FeatureSMemRealTime, FeatureScalarStores, FeatureInv2PiInlineImm, + FeatureApertureRegs + ] +>; + class SubtargetFeatureISAVersion <int Major, int Minor, int Stepping, list<SubtargetFeature> Implies> : SubtargetFeature < @@ -429,6 +444,9 @@ def FeatureISAVersion8_1_0 : SubtargetFeatureISAVersion <8,1,0, FeatureLDSBankCount16, FeatureXNACK]>; +def FeatureISAVersion9_0_0 : SubtargetFeatureISAVersion <9,0,0,[]>; +def FeatureISAVersion9_0_1 : SubtargetFeatureISAVersion <9,0,1,[]>; + //===----------------------------------------------------------------------===// // Debugger related subtarget features. //===----------------------------------------------------------------------===// @@ -534,10 +552,10 @@ def isVI : Predicate < "Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS">, AssemblerPredicate<"FeatureGCN3Encoding">; +// TODO: Either the name to be changed or we simply use IsCI!
def isCIVI : Predicate < - "Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS || " - "Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS" ->, AssemblerPredicate<"FeatureCIInsts">; + "Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS">, + AssemblerPredicate<"FeatureCIInsts">; def HasFlatAddressSpace : Predicate<"Subtarget->hasFlatAddressSpace()">; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp index 4f2ed9fe6236..09d3ff716e6e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp @@ -13,6 +13,7 @@ //===----------------------------------------------------------------------===// #include "AMDGPU.h" +#include "AMDGPUSubtarget.h" #include "llvm/ADT/Triple.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Instructions.h" @@ -26,6 +27,7 @@ namespace { class AMDGPUAnnotateKernelFeatures : public ModulePass { private: + const TargetMachine *TM; static bool hasAddrSpaceCast(const Function &F); void addAttrToCallers(Function *Intrin, StringRef AttrName); @@ -34,7 +36,8 @@ private: public: static char ID; - AMDGPUAnnotateKernelFeatures() : ModulePass(ID) { } + AMDGPUAnnotateKernelFeatures(const TargetMachine *TM_ = nullptr) : + ModulePass(ID), TM(TM_) {} bool runOnModule(Module &M) override; StringRef getPassName() const override { return "AMDGPU Annotate Kernel Features"; @@ -211,7 +214,9 @@ bool AMDGPUAnnotateKernelFeatures::runOnModule(Module &M) { if (F.hasFnAttribute("amdgpu-queue-ptr")) continue; - if (hasAddrSpaceCast(F)) + bool HasApertureRegs = + TM && TM->getSubtarget<AMDGPUSubtarget>(F).hasApertureRegs(); + if (!HasApertureRegs && hasAddrSpaceCast(F)) F.addFnAttr("amdgpu-queue-ptr"); } } @@ -219,6 +224,6 @@ return Changed; } -ModulePass *llvm::createAMDGPUAnnotateKernelFeaturesPass() { - return new AMDGPUAnnotateKernelFeatures();
+ModulePass *llvm::createAMDGPUAnnotateKernelFeaturesPass(const TargetMachine *TM) { + return new AMDGPUAnnotateKernelFeatures(TM); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index e4ac295f99af..c3ff2e322546 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -381,6 +381,10 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, case AMDGPU::EXEC_HI: case AMDGPU::SCC: case AMDGPU::M0: + case AMDGPU::SRC_SHARED_BASE: + case AMDGPU::SRC_SHARED_LIMIT: + case AMDGPU::SRC_PRIVATE_BASE: + case AMDGPU::SRC_PRIVATE_LIMIT: continue; case AMDGPU::VCC: diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp index e4dc6599e156..a3abb96fb949 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp @@ -86,6 +86,7 @@ static SIEncodingFamily subtargetEncodingFamily(const AMDGPUSubtarget &ST) { case AMDGPUSubtarget::SEA_ISLANDS: return SIEncodingFamily::SI; case AMDGPUSubtarget::VOLCANIC_ISLANDS: + case AMDGPUSubtarget::GFX9: return SIEncodingFamily::VI; // FIXME: This should never be called for r600 GPUs. 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index ec5ba2e9d14a..a37f5a89eda7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -93,6 +93,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS, UnalignedScratchAccess(false), UnalignedBufferAccess(false), + HasApertureRegs(false), EnableXNACK(false), TrapHandler(false), DebuggerInsertNops(false), diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h index 068bf0c5824e..dfdf29e7a12f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -51,6 +51,7 @@ public: SOUTHERN_ISLANDS, SEA_ISLANDS, VOLCANIC_ISLANDS, + GFX9, }; enum { @@ -64,6 +65,8 @@ public: ISAVersion8_0_3, ISAVersion8_0_4, ISAVersion8_1_0, + ISAVersion9_0_0, + ISAVersion9_0_1 }; enum TrapHandlerAbi { @@ -103,6 +106,7 @@ protected: bool FlatForGlobal; bool UnalignedScratchAccess; bool UnalignedBufferAccess; + bool HasApertureRegs; bool EnableXNACK; bool TrapHandler; bool DebuggerInsertNops; @@ -330,6 +334,10 @@ public: return UnalignedScratchAccess; } + bool hasApertureRegs() const { + return HasApertureRegs; + } + bool isTrapHandlerEnabled() const { return TrapHandler; } @@ -645,6 +653,14 @@ public: return getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS; } + bool hasSMovFedHazard() const { + return getGeneration() >= AMDGPUSubtarget::GFX9; + } + + bool hasReadM0Hazard() const { + return getGeneration() >= AMDGPUSubtarget::GFX9; + } + unsigned getKernArgSegmentSize(const MachineFunction &MF, unsigned ExplictArgBytes) const; /// Return the maximum number of waves per SIMD for kernels using \p SGPRs SGPRs @@ -656,7 +672,13 @@ public: /// \returns True if waitcnt instruction is needed before barrier instruction, /// false otherwise. 
bool needWaitcntBeforeBarrier() const { - return true; + return getGeneration() < GFX9; + } + + /// \returns true if the flat_scratch register should be initialized with the + /// pointer to the wave's scratch memory rather than a size and offset. + bool flatScratchIsPointer() const { + return getGeneration() >= GFX9; } /// \returns SGPR allocation granularity supported by the subtarget. diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 497bf6b54add..5e633a3ca376 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -598,7 +598,8 @@ bool GCNPassConfig::addPreISel() { // FIXME: We need to run a pass to propagate the attributes when calls are // supported. - addPass(&AMDGPUAnnotateKernelFeaturesID); + const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine(); + addPass(createAMDGPUAnnotateKernelFeaturesPass(&TM)); addPass(createStructurizeCFGPass(true)); // true -> SkipUniformRegions addPass(createSinkingPass()); addPass(createSITypeRewriter()); diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index 16b0b788318e..fd61530b7181 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -71,6 +71,18 @@ static bool isRFE(unsigned Opcode) { return Opcode == AMDGPU::S_RFE_B64; } +static bool isSMovRel(unsigned Opcode) { + return Opcode == AMDGPU::S_MOVRELS_B32 || Opcode == AMDGPU::S_MOVRELS_B64 || + Opcode == AMDGPU::S_MOVRELD_B32 || Opcode == AMDGPU::S_MOVRELD_B64; +} + +static bool isVInterp(unsigned Opcode) { + return Opcode == AMDGPU::V_INTERP_P1_F32 || + Opcode == AMDGPU::V_INTERP_P1_F32_16bank || + Opcode == AMDGPU::V_INTERP_P2_F32 || + Opcode == AMDGPU::V_INTERP_MOV_F32; +} + static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) { const MachineOperand *RegOp = TII->getNamedOperand(RegInstr, AMDGPU::OpName::simm16); @@
-108,6 +120,13 @@ GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) { if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0) return NoopHazard; + if ((isVInterp(MI->getOpcode()) || isSMovRel(MI->getOpcode())) && + checkReadM0Hazards(MI) > 0) + return NoopHazard; + + if (checkAnyInstHazards(MI) > 0) + return NoopHazard; + return NoHazard; } @@ -116,11 +135,13 @@ unsigned GCNHazardRecognizer::PreEmitNoops(SUnit *SU) { } unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) { + int WaitStates = std::max(0, checkAnyInstHazards(MI)); + if (SIInstrInfo::isSMRD(*MI)) - return std::max(0, checkSMRDHazards(MI)); + return std::max(WaitStates, checkSMRDHazards(MI)); if (SIInstrInfo::isVALU(*MI)) { - int WaitStates = std::max(0, checkVALUHazards(MI)); + WaitStates = std::max(WaitStates, checkVALUHazards(MI)); if (SIInstrInfo::isVMEM(*MI)) WaitStates = std::max(WaitStates, checkVMEMHazards(MI)); @@ -134,19 +155,25 @@ unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) { if (isRWLane(MI->getOpcode())) WaitStates = std::max(WaitStates, checkRWLaneHazards(MI)); + if (isVInterp(MI->getOpcode())) + WaitStates = std::max(WaitStates, checkReadM0Hazards(MI)); + return WaitStates; } if (isSGetReg(MI->getOpcode())) - return std::max(0, checkGetRegHazards(MI)); + return std::max(WaitStates, checkGetRegHazards(MI)); if (isSSetReg(MI->getOpcode())) - return std::max(0, checkSetRegHazards(MI)); + return std::max(WaitStates, checkSetRegHazards(MI)); if (isRFE(MI->getOpcode())) - return std::max(0, checkRFEHazards(MI)); + return std::max(WaitStates, checkRFEHazards(MI)); - return 0; + if (isSMovRel(MI->getOpcode())) + return std::max(WaitStates, checkReadM0Hazards(MI)); + + return WaitStates; } void GCNHazardRecognizer::EmitNoop() { @@ -508,3 +535,42 @@ int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) { int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn); return RFEWaitStates - WaitStatesNeeded; } + +int 
GCNHazardRecognizer::checkAnyInstHazards(MachineInstr *MI) { + if (MI->isDebugValue()) + return 0; + + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + if (!ST.hasSMovFedHazard()) + return 0; + + // Check for any instruction reading an SGPR after a write from + // s_mov_fed_b32. + int MovFedWaitStates = 1; + int WaitStatesNeeded = 0; + + for (const MachineOperand &Use : MI->uses()) { + if (!Use.isReg() || TRI->isVGPR(MF.getRegInfo(), Use.getReg())) + continue; + auto IsHazardFn = [] (MachineInstr *MI) { + return MI->getOpcode() == AMDGPU::S_MOV_FED_B32; + }; + int WaitStatesNeededForUse = + MovFedWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardFn); + WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); + } + + return WaitStatesNeeded; +} + +int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) { + if (!ST.hasReadM0Hazard()) + return 0; + + const SIInstrInfo *TII = ST.getInstrInfo(); + int SMovRelWaitStates = 1; + auto IsHazardFn = [TII] (MachineInstr *MI) { + return TII->isSALU(*MI); + }; + return SMovRelWaitStates - getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn); +} diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h index 0ab82ff4635b..9980847f60e8 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h @@ -52,6 +52,8 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer { int checkVALUHazards(MachineInstr *VALU); int checkRWLaneHazards(MachineInstr *RWLane); int checkRFEHazards(MachineInstr *RFE); + int checkAnyInstHazards(MachineInstr *MI); + int checkReadM0Hazards(MachineInstr *SMovRel); public: GCNHazardRecognizer(const MachineFunction &MF); // We can only issue one instruction per cycle. 
diff --git a/llvm/lib/Target/AMDGPU/Processors.td b/llvm/lib/Target/AMDGPU/Processors.td index 3c07cc76b9a1..0e4eda982139 100644 --- a/llvm/lib/Target/AMDGPU/Processors.td +++ b/llvm/lib/Target/AMDGPU/Processors.td @@ -187,3 +187,10 @@ def : ProcessorModel<"gfx810", SIQuarterSpeedModel, [FeatureISAVersion8_1_0] >; +def : ProcessorModel<"gfx900", SIQuarterSpeedModel, + [FeatureGFX9, FeatureISAVersion9_0_0, FeatureLDSBankCount32] +>; + +def : ProcessorModel<"gfx901", SIQuarterSpeedModel, + [FeatureGFX9, FeatureXNACK, FeatureISAVersion9_0_1, FeatureLDSBankCount32] +>; diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp index 0dd3fd0e58a3..3cb9ba32628c 100644 --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -33,10 +33,12 @@ static ArrayRef<MCPhysReg> getAllSGPRs(const SISubtarget &ST, ST.getMaxNumSGPRs(MF)); } -void SIFrameLowering::emitFlatScratchInit(const SIInstrInfo *TII, - const SIRegisterInfo* TRI, +void SIFrameLowering::emitFlatScratchInit(const SISubtarget &ST, MachineFunction &MF, MachineBasicBlock &MBB) const { + const SIInstrInfo *TII = ST.getInstrInfo(); + const SIRegisterInfo* TRI = &TII->getRegisterInfo(); + // We don't need this if we only have spills since there is no user facing // scratch. @@ -59,16 +61,28 @@ MRI.addLiveIn(FlatScratchInitReg); MBB.addLiveIn(FlatScratchInitReg); - // Copy the size in bytes. - unsigned FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1); - BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::FLAT_SCR_LO) - .addReg(FlatScrInitHi, RegState::Kill); - unsigned FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0); + unsigned FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0); + unsigned FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1); const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg(); + // Do a 64-bit pointer add.
+ if (ST.flatScratchIsPointer()) { + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), AMDGPU::FLAT_SCR_LO) + .addReg(FlatScrInitLo) + .addReg(ScratchWaveOffsetReg); + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), AMDGPU::FLAT_SCR_HI) + .addReg(FlatScrInitHi) + .addImm(0); + + return; + } + + // Copy the size in bytes. + BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::FLAT_SCR_LO) + .addReg(FlatScrInitHi, RegState::Kill); + // Add wave offset in bytes to private base offset. // See comment in AMDKernelCodeT.h for enable_sgpr_flat_scratch_init. BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo) @@ -229,7 +243,7 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF, // emitted after frame indices are eliminated. if (MF.getFrameInfo().hasStackObjects() && MFI->hasFlatScratchInit()) - emitFlatScratchInit(TII, TRI, MF, MBB); + emitFlatScratchInit(ST, MF, MBB); // We need to insert initialization of the scratch resource descriptor. unsigned PreloadedScratchWaveOffsetReg = TRI->getPreloadedValue( diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.h b/llvm/lib/Target/AMDGPU/SIFrameLowering.h index 7657b4e03864..b52a0204d63a 100644 --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.h +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.h @@ -36,8 +36,7 @@ public: RegScavenger *RS = nullptr) const override; private: - void emitFlatScratchInit(const SIInstrInfo *TII, - const SIRegisterInfo* TRI, + void emitFlatScratchInit(const SISubtarget &ST, MachineFunction &MF, MachineBasicBlock &MBB) const; diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 6cfa3440e8ad..051f2153ccb7 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -532,7 +532,7 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL, // in 8-bits, it can use a smaller encoding. 
if (!isUInt<32>(AM.BaseOffs / 4)) return false; - } else if (Subtarget->getGeneration() == SISubtarget::VOLCANIC_ISLANDS) { + } else if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { // On VI, these use the SMEM format and the offset is 20-bit in bytes. if (!isUInt<20>(AM.BaseOffs)) return false; @@ -2233,6 +2233,13 @@ SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { SDValue SITargetLowering::getSegmentAperture(unsigned AS, SelectionDAG &DAG) const { + + if (Subtarget->hasApertureRegs()) { // Read from Aperture Registers directly. + unsigned RegNo = (AS == AMDGPUAS::LOCAL_ADDRESS) ? AMDGPU::SRC_SHARED_BASE : + AMDGPU::SRC_PRIVATE_BASE; + return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, RegNo, MVT::i32); + } + SDLoc SL; MachineFunction &MF = DAG.getMachineFunction(); SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index 00394623db43..a90fc28ced30 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -133,6 +133,12 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { reserveRegisterTuples(Reserved, AMDGPU::EXEC); reserveRegisterTuples(Reserved, AMDGPU::FLAT_SCR); + // Reserve the memory aperture registers. + reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_BASE); + reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_LIMIT); + reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_BASE); + reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_LIMIT); + // Reserve Trap Handler registers - support is not implemented in Codegen.
reserveRegisterTuples(Reserved, AMDGPU::TBA); reserveRegisterTuples(Reserved, AMDGPU::TMA); diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td index 31e714b9f6b9..fdf82a6d818b 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -44,6 +44,11 @@ def EXEC : RegisterWithSubRegs<"EXEC", [EXEC_LO, EXEC_HI]>, def SCC : SIReg<"scc", 253>; def M0 : SIReg <"m0", 124>; +def SRC_SHARED_BASE : SIReg<"src_shared_base", 235>; +def SRC_SHARED_LIMIT : SIReg<"src_shared_limit", 236>; +def SRC_PRIVATE_BASE : SIReg<"src_private_base", 237>; +def SRC_PRIVATE_LIMIT : SIReg<"src_private_limit", 238>; + // Trap handler registers def TBA_LO : SIReg<"tba_lo", 108>; def TBA_HI : SIReg<"tba_hi", 109>; @@ -260,7 +265,8 @@ def VGPR_512 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7, // See comments in SIInstructions.td for more info. def SReg_32_XM0_XEXEC : RegisterClass<"AMDGPU", [i32, f32, i16, f16], 32, (add SGPR_32, VCC_LO, VCC_HI, FLAT_SCR_LO, FLAT_SCR_HI, - TTMP_32, TMA_LO, TMA_HI, TBA_LO, TBA_HI)> { + TTMP_32, TMA_LO, TMA_HI, TBA_LO, TBA_HI, SRC_SHARED_BASE, SRC_SHARED_LIMIT, + SRC_PRIVATE_BASE, SRC_PRIVATE_LIMIT)> { let AllocationPriority = 7; } diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 0a1ab73d8dcf..cc112ba90372 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -72,11 +72,11 @@ unsigned unpackBits(unsigned Src, unsigned Shift, unsigned Width) { return (Src & getBitMask(Shift, Width)) >> Shift; } -/// \returns Vmcnt bit shift. -unsigned getVmcntBitShift() { return 0; } +/// \returns Vmcnt bit shift (lower bits). +unsigned getVmcntBitShiftLo() { return 0; } -/// \returns Vmcnt bit width. -unsigned getVmcntBitWidth() { return 4; } +/// \returns Vmcnt bit width (lower bits). 
+unsigned getVmcntBitWidthLo() { return 4; } /// \returns Expcnt bit shift. unsigned getExpcntBitShift() { return 4; } @@ -90,6 +90,12 @@ unsigned getLgkmcntBitShift() { return 8; } /// \returns Lgkmcnt bit width. unsigned getLgkmcntBitWidth() { return 4; } +/// \returns Vmcnt bit shift (higher bits). +unsigned getVmcntBitShiftHi() { return 14; } + +/// \returns Vmcnt bit width (higher bits). +unsigned getVmcntBitWidthHi() { return 2; } + } // end namespace anonymous namespace llvm { @@ -120,6 +126,12 @@ IsaVersion getIsaVersion(const FeatureBitset &Features) { if (Features.test(FeatureISAVersion8_1_0)) return {8, 1, 0}; + // GFX9. + if (Features.test(FeatureISAVersion9_0_0)) + return {9, 0, 0}; + if (Features.test(FeatureISAVersion9_0_1)) + return {9, 0, 1}; + if (!Features.test(FeatureGCN) || Features.test(FeatureSouthernIslands)) return {0, 0, 0}; return {7, 0, 0}; @@ -399,7 +411,12 @@ std::pair getIntegerPairAttribute(const Function &F, } unsigned getVmcntBitMask(const IsaInfo::IsaVersion &Version) { - return (1 << getVmcntBitWidth()) - 1; + unsigned VmcntLo = (1 << getVmcntBitWidthLo()) - 1; + if (Version.Major < 9) + return VmcntLo; + + unsigned VmcntHi = ((1 << getVmcntBitWidthHi()) - 1) << getVmcntBitWidthLo(); + return VmcntLo | VmcntHi; } unsigned getExpcntBitMask(const IsaInfo::IsaVersion &Version) { @@ -411,14 +428,27 @@ unsigned getLgkmcntBitMask(const IsaInfo::IsaVersion &Version) { } unsigned getWaitcntBitMask(const IsaInfo::IsaVersion &Version) { - unsigned Vmcnt = getBitMask(getVmcntBitShift(), getVmcntBitWidth()); + unsigned VmcntLo = getBitMask(getVmcntBitShiftLo(), getVmcntBitWidthLo()); unsigned Expcnt = getBitMask(getExpcntBitShift(), getExpcntBitWidth()); unsigned Lgkmcnt = getBitMask(getLgkmcntBitShift(), getLgkmcntBitWidth()); - return Vmcnt | Expcnt | Lgkmcnt; + unsigned Waitcnt = VmcntLo | Expcnt | Lgkmcnt; + if (Version.Major < 9) + return Waitcnt; + + unsigned VmcntHi = getBitMask(getVmcntBitShiftHi(), getVmcntBitWidthHi()); + return 
Waitcnt | VmcntHi; } unsigned decodeVmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt) { - return unpackBits(Waitcnt, getVmcntBitShift(), getVmcntBitWidth()); + unsigned VmcntLo = + unpackBits(Waitcnt, getVmcntBitShiftLo(), getVmcntBitWidthLo()); + if (Version.Major < 9) + return VmcntLo; + + unsigned VmcntHi = + unpackBits(Waitcnt, getVmcntBitShiftHi(), getVmcntBitWidthHi()); + VmcntHi <<= getVmcntBitWidthLo(); + return VmcntLo | VmcntHi; } unsigned decodeExpcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt) { @@ -438,7 +468,13 @@ void decodeWaitcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt, unsigned encodeVmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt, unsigned Vmcnt) { - return packBits(Vmcnt, Waitcnt, getVmcntBitShift(), getVmcntBitWidth()); + Waitcnt = + packBits(Vmcnt, Waitcnt, getVmcntBitShiftLo(), getVmcntBitWidthLo()); + if (Version.Major < 9) + return Waitcnt; + + Vmcnt >>= getVmcntBitWidthLo(); + return packBits(Vmcnt, Waitcnt, getVmcntBitShiftHi(), getVmcntBitWidthHi()); } unsigned encodeExpcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt, diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index a8d82ba45194..acc68afeaf22 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -216,7 +216,8 @@ unsigned decodeLgkmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt); /// \p Lgkmcnt respectively. /// /// \details \p Vmcnt, \p Expcnt and \p Lgkmcnt are decoded as follows: -/// \p Vmcnt = \p Waitcnt[3:0] +/// \p Vmcnt = \p Waitcnt[3:0] (pre-gfx9 only) +/// \p Vmcnt = \p Waitcnt[3:0] | \p Waitcnt[15:14] (gfx9+ only) /// \p Expcnt = \p Waitcnt[6:4] /// \p Lgkmcnt = \p Waitcnt[11:8] void decodeWaitcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt, @@ -238,9 +239,11 @@ unsigned encodeLgkmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt, /// \p Version. 
/// /// \details \p Vmcnt, \p Expcnt and \p Lgkmcnt are encoded as follows: -/// Waitcnt[3:0] = \p Vmcnt -/// Waitcnt[6:4] = \p Expcnt -/// Waitcnt[11:8] = \p Lgkmcnt +/// Waitcnt[3:0] = \p Vmcnt (pre-gfx9 only) +/// Waitcnt[3:0] = \p Vmcnt[3:0] (gfx9+ only) +/// Waitcnt[6:4] = \p Expcnt +/// Waitcnt[11:8] = \p Lgkmcnt +/// Waitcnt[15:14] = \p Vmcnt[5:4] (gfx9+ only) /// /// \returns Waitcnt with encoded \p Vmcnt, \p Expcnt and \p Lgkmcnt for given /// isa \p Version. diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll index b6ceee32c72c..12dcda959867 100644 --- a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll +++ b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll @@ -1,14 +1,19 @@ -; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=HSA %s +; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=HSA -check-prefix=CI %s +; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=HSA -check-prefix=GFX9 %s ; HSA-LABEL: {{^}}use_group_to_flat_addrspacecast: ; HSA: enable_sgpr_private_segment_buffer = 1 ; HSA: enable_sgpr_dispatch_ptr = 0 -; HSA: enable_sgpr_queue_ptr = 1 +; CI: enable_sgpr_queue_ptr = 1 +; GFX9: enable_sgpr_queue_ptr = 0 -; HSA-DAG: s_load_dword [[PTR:s[0-9]+]], s[6:7], 0x0{{$}} -; HSA-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x10{{$}} +; CI-DAG: s_load_dword [[PTR:s[0-9]+]], s[6:7], 0x0{{$}} +; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x10{{$}} +; CI-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[APERTURE]] + +; GFX9-DAG: s_load_dword [[PTR:s[0-9]+]], s[4:5], 0x0{{$}} +; GFX9-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], src_shared_base -; HSA-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[APERTURE]] ; HSA-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]] ; HSA-DAG: v_cmp_ne_u32_e64 vcc, 
[[PTR]], -1 @@ -17,6 +22,12 @@ ; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7 ; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, [[K]] + +; At most 2 digits. Make sure src_shared_base is not counted as a high +; number SGPR. + +; CI: NumSgprs: {{[0-9][0-9]+}} +; GFX9: NumSgprs: {{[0-9]+}} define void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #0 { %stof = addrspacecast i32 addrspace(3)* %ptr to i32 addrspace(4)* store volatile i32 7, i32 addrspace(4)* %stof @@ -26,12 +37,16 @@ define void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #0 { ; HSA-LABEL: {{^}}use_private_to_flat_addrspacecast: ; HSA: enable_sgpr_private_segment_buffer = 1 ; HSA: enable_sgpr_dispatch_ptr = 0 -; HSA: enable_sgpr_queue_ptr = 1 +; CI: enable_sgpr_queue_ptr = 1 +; GFX9: enable_sgpr_queue_ptr = 0 -; HSA-DAG: s_load_dword [[PTR:s[0-9]+]], s[6:7], 0x0{{$}} -; HSA-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x11{{$}} +; CI-DAG: s_load_dword [[PTR:s[0-9]+]], s[6:7], 0x0{{$}} +; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x11{{$}} +; CI-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[APERTURE]] + +; GFX9-DAG: s_load_dword [[PTR:s[0-9]+]], s[4:5], 0x0{{$}} +; GFX9-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], src_private_base -; HSA-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[APERTURE]] ; HSA-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]] ; HSA-DAG: v_cmp_ne_u32_e64 vcc, [[PTR]], -1 @@ -40,6 +55,9 @@ define void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #0 { ; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7 ; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, [[K]] + +; CI: NumSgprs: {{[0-9][0-9]+}} +; GFX9: NumSgprs: {{[0-9]+}} define void @use_private_to_flat_addrspacecast(i32* %ptr) #0 { %stof = addrspacecast i32* %ptr to i32 addrspace(4)* store volatile i32 7, i32 addrspace(4)* %stof @@ -133,8 +151,10 @@ define void @use_flat_to_constant_addrspacecast(i32 addrspace(4)* %ptr) #0 { } ; HSA-LABEL: {{^}}cast_0_group_to_flat_addrspacecast: -; HSA: s_load_dword 
[[APERTURE:s[0-9]+]], s[4:5], 0x10 -; HSA-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[APERTURE]] +; CI: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x10 +; CI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[APERTURE]] +; GFX9-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], src_shared_base + ; HSA-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} ; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}} ; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, v[[K]] @@ -176,8 +196,11 @@ define void @cast_neg1_flat_to_group_addrspacecast() #0 { } ; HSA-LABEL: {{^}}cast_0_private_to_flat_addrspacecast: -; HSA: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x11 -; HSA-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[APERTURE]] +; CI: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x11 +; CI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[APERTURE]] + +; GFX9-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], src_private_base + ; HSA-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} ; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}} ; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, v[[K]] @@ -226,9 +249,13 @@ end: ; Check for prologue initializing special SGPRs pointing to scratch. 
; HSA-LABEL: {{^}}store_flat_scratch: -; HSA-DAG: s_mov_b32 flat_scratch_lo, s9 -; HSA-DAG: s_add_u32 [[ADD:s[0-9]+]], s8, s11 -; HSA: s_lshr_b32 flat_scratch_hi, [[ADD]], 8 +; CI-DAG: s_mov_b32 flat_scratch_lo, s9 +; CI-DAG: s_add_u32 [[ADD:s[0-9]+]], s8, s11 +; CI: s_lshr_b32 flat_scratch_hi, [[ADD]], 8 + +; GFX9: s_add_u32 flat_scratch_lo, s6, s9 +; GFX9: s_addc_u32 flat_scratch_hi, s7, 0 + ; HSA: flat_store_dword ; HSA: s_barrier ; HSA: flat_load_dword diff --git a/llvm/test/CodeGen/AMDGPU/debugger-reserve-regs.ll b/llvm/test/CodeGen/AMDGPU/debugger-reserve-regs.ll index d30bb20bb03a..d63ba149ba64 100644 --- a/llvm/test/CodeGen/AMDGPU/debugger-reserve-regs.ll +++ b/llvm/test/CodeGen/AMDGPU/debugger-reserve-regs.ll @@ -1,4 +1,5 @@ ; RUN: llc -O0 -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=+amdgpu-debugger-reserve-regs -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -O0 -mtriple=amdgcn--amdhsa -mcpu=gfx901 -mattr=+amdgpu-debugger-reserve-regs -verify-machineinstrs < %s | FileCheck %s ; CHECK: reserved_vgpr_first = {{[0-9]+}} ; CHECK-NEXT: reserved_vgpr_count = 4 ; CHECK: ReservedVGPRFirst: {{[0-9]+}} diff --git a/llvm/test/CodeGen/AMDGPU/hsa-note-no-func.ll b/llvm/test/CodeGen/AMDGPU/hsa-note-no-func.ll index a4e599230b74..af63a4f8df76 100644 --- a/llvm/test/CodeGen/AMDGPU/hsa-note-no-func.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa-note-no-func.ll @@ -13,6 +13,8 @@ ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx803 | FileCheck --check-prefix=HSA --check-prefix=HSA-VI803 %s ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx804 | FileCheck --check-prefix=HSA --check-prefix=HSA-VI804 %s ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx810 | FileCheck --check-prefix=HSA --check-prefix=HSA-VI810 %s +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx900 | FileCheck --check-prefix=HSA --check-prefix=HSA-GFX900 %s +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx901 | FileCheck --check-prefix=HSA --check-prefix=HSA-GFX901 %s ; HSA: .hsa_code_object_version 2,1 ; 
HSA-CI700: .hsa_code_object_isa 7,0,0,"AMD","AMDGPU" @@ -24,3 +26,5 @@ ; HSA-VI803: .hsa_code_object_isa 8,0,3,"AMD","AMDGPU" ; HSA-VI804: .hsa_code_object_isa 8,0,4,"AMD","AMDGPU" ; HSA-VI810: .hsa_code_object_isa 8,1,0,"AMD","AMDGPU" +; HSA-GFX900: .hsa_code_object_isa 9,0,0,"AMD","AMDGPU" +; HSA-GFX901: .hsa_code_object_isa 9,0,1,"AMD","AMDGPU" diff --git a/llvm/test/CodeGen/AMDGPU/inserted-wait-states.mir b/llvm/test/CodeGen/AMDGPU/inserted-wait-states.mir index 85cd903a405d..e97cb1b5c34b 100644 --- a/llvm/test/CodeGen/AMDGPU/inserted-wait-states.mir +++ b/llvm/test/CodeGen/AMDGPU/inserted-wait-states.mir @@ -1,6 +1,7 @@ # RUN: llc -march=amdgcn -mcpu=tahiti -run-pass post-RA-hazard-rec %s -o - | FileCheck %s -check-prefixes=GCN # RUN: llc -march=amdgcn -mcpu=hawaii -run-pass post-RA-hazard-rec %s -o - | FileCheck %s -check-prefixes=GCN,CIVI # RUN: llc -march=amdgcn -mcpu=fiji -run-pass post-RA-hazard-rec %s -o - | FileCheck %s -check-prefixes=GCN,CIVI,VI +# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass post-RA-hazard-rec %s -o - | FileCheck %s -check-prefixes=GCN,CIVI,VI,GFX9 --- | define void @div_fmas() { ret void } @@ -9,6 +10,37 @@ define void @vmem_gt_8dw_store() { ret void } define void @readwrite_lane() { ret void } define void @rfe() { ret void } + define void @s_mov_fed_b32() { ret void } + define void @s_movrel() { ret void } + define void @v_interp() { ret void } + + define void @mov_fed_hazard_crash_on_dbg_value(i32 addrspace(1)* %A) { + entry: + %A.addr = alloca i32 addrspace(1)*, align 4 + store i32 addrspace(1)* %A, i32 addrspace(1)** %A.addr, align 4 + call void @llvm.dbg.declare(metadata i32 addrspace(1)** %A.addr, metadata !5, metadata !11), !dbg !12 + ret void + } + + declare void @llvm.dbg.declare(metadata, metadata, metadata) #1 + + !llvm.dbg.cu = !{!0} + !llvm.module.flags = !{!3, !4} + + !0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.9.0 (trunk 268929)", isOptimized: false, runtimeVersion: 0, 
emissionKind: FullDebug, enums: !2) + !1 = !DIFile(filename: "test01.cl", directory: "/dev/null") + !2 = !{} + !3 = !{i32 2, !"Dwarf Version", i32 2} + !4 = !{i32 2, !"Debug Info Version", i32 3} + !5 = !DILocalVariable(name: "A", arg: 1, scope: !6, file: !1, line: 1, type: !9) + !6 = distinct !DISubprogram(name: "test", scope: !1, file: !1, line: 1, type: !7, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: false, unit: !0, variables: !2) + !7 = !DISubroutineType(types: !8) + !8 = !{null, !9} + !9 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !10, size: 64, align: 32) + !10 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed) + !11 = !DIExpression() + !12 = !DILocation(line: 1, column: 30, scope: !6) + ... --- # GCN-LABEL: name: div_fmas @@ -331,3 +363,185 @@ body: | S_ENDPGM ... + +... +--- + +# GCN-LABEL: name: s_mov_fed_b32 + +# GCN-LABEL: bb.0: +# GCN: S_MOV_FED_B32 +# GFX9: S_NOP +# GCN-NEXT: S_MOV_B32 + +# GCN-LABEL: bb.1: +# GCN: S_MOV_FED_B32 +# GFX9: S_NOP +# GCN-NEXT: V_MOV_B32 +name: s_mov_fed_b32 + +body: | + bb.0: + successors: %bb.1 + %sgpr0 = S_MOV_FED_B32 %sgpr0 + %sgpr0 = S_MOV_B32 %sgpr0 + S_BRANCH %bb.1 + + bb.1: + %sgpr0 = S_MOV_FED_B32 %sgpr0 + %vgpr0 = V_MOV_B32_e32 %sgpr0, implicit %exec + S_ENDPGM + +... + +... 
+--- + +# GCN-LABEL: name: s_movrel + +# GCN-LABEL: bb.0: +# GCN: S_MOV_B32 +# GFX9: S_NOP +# GCN-NEXT: S_MOVRELS_B32 + +# GCN-LABEL: bb.1: +# GCN: S_MOV_B32 +# GFX9: S_NOP +# GCN-NEXT: S_MOVRELS_B64 + +# GCN-LABEL: bb.2: +# GCN: S_MOV_B32 +# GFX9: S_NOP +# GCN-NEXT: S_MOVRELD_B32 + +# GCN-LABEL: bb.3: +# GCN: S_MOV_B32 +# GFX9: S_NOP +# GCN-NEXT: S_MOVRELD_B64 + +name: s_movrel + +body: | + bb.0: + successors: %bb.1 + %m0 = S_MOV_B32 0 + %sgpr0 = S_MOVRELS_B32 %sgpr0, implicit %m0 + S_BRANCH %bb.1 + + bb.1: + successors: %bb.2 + %m0 = S_MOV_B32 0 + %sgpr0_sgpr1 = S_MOVRELS_B64 %sgpr0_sgpr1, implicit %m0 + S_BRANCH %bb.2 + + bb.2: + successors: %bb.3 + %m0 = S_MOV_B32 0 + %sgpr0 = S_MOVRELD_B32 %sgpr0, implicit %m0 + S_BRANCH %bb.3 + + bb.3: + %m0 = S_MOV_B32 0 + %sgpr0_sgpr1 = S_MOVRELD_B64 %sgpr0_sgpr1, implicit %m0 + S_ENDPGM +... + +... +--- + +# GCN-LABEL: name: v_interp + +# GCN-LABEL: bb.0: +# GCN: S_MOV_B32 +# GFX9: S_NOP +# GCN-NEXT: V_INTERP_P1_F32 + +# GCN-LABEL: bb.1: +# GCN: S_MOV_B32 +# GFX9: S_NOP +# GCN-NEXT: V_INTERP_P2_F32 + +# GCN-LABEL: bb.2: +# GCN: S_MOV_B32 +# GFX9: S_NOP +# GCN-NEXT: V_INTERP_P1_F32_16bank + +# GCN-LABEL: bb.3: +# GCN: S_MOV_B32 +# GFX9: S_NOP +# GCN-NEXT: V_INTERP_MOV_F32 + +name: v_interp + +body: | + bb.0: + successors: %bb.1 + %m0 = S_MOV_B32 0 + %vgpr0 = V_INTERP_P1_F32 %vgpr0, 0, 0, implicit %m0, implicit %exec + S_BRANCH %bb.1 + + bb.1: + successors: %bb.2 + %m0 = S_MOV_B32 0 + %vgpr0 = V_INTERP_P2_F32 %vgpr0, %vgpr1, 0, 0, implicit %m0, implicit %exec + S_BRANCH %bb.2 + + bb.2: + successors: %bb.3 + %m0 = S_MOV_B32 0 + %vgpr0 = V_INTERP_P1_F32_16bank %vgpr0, 0, 0, implicit %m0, implicit %exec + S_BRANCH %bb.3 + + bb.3: + %m0 = S_MOV_B32 0 + %vgpr0 = V_INTERP_MOV_F32 0, 0, 0, implicit %m0, implicit %exec + S_ENDPGM +... 
+--- +name: mov_fed_hazard_crash_on_dbg_value +alignment: 0 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%sgpr4_sgpr5' } + - { reg: '%sgpr6_sgpr7' } + - { reg: '%sgpr9' } + - { reg: '%sgpr0_sgpr1_sgpr2_sgpr3' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 16 + offsetAdjustment: 0 + maxAlignment: 8 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +stack: + - { id: 0, name: A.addr, offset: 0, size: 8, alignment: 8, local-offset: 0 } + - { id: 1, offset: 8, size: 4, alignment: 4 } +body: | + bb.0.entry: + liveins: %sgpr4_sgpr5, %sgpr6_sgpr7, %sgpr9, %sgpr0_sgpr1_sgpr2_sgpr3 + + %flat_scr_lo = S_ADD_U32 %sgpr6, %sgpr9, implicit-def %scc + %flat_scr_hi = S_ADDC_U32 %sgpr7, 0, implicit-def %scc, implicit %scc + DBG_VALUE _, 2, !5, !11, debug-location !12 + %sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed %sgpr4_sgpr5, 0, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) + dead %sgpr6_sgpr7 = KILL %sgpr4_sgpr5 + %sgpr8 = S_MOV_B32 %sgpr5 + %vgpr0 = V_MOV_B32_e32 killed %sgpr8, implicit %exec + BUFFER_STORE_DWORD_OFFSET %vgpr0, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr9, 4, 0, 0, 0, implicit %exec :: (store 4 into %ir.A.addr + 4) + %sgpr8 = S_MOV_B32 %sgpr4, implicit killed %sgpr4_sgpr5 + %vgpr0 = V_MOV_B32_e32 killed %sgpr8, implicit %exec + BUFFER_STORE_DWORD_OFFSET %vgpr0, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr9, 0, 0, 0, 0, implicit %exec :: (store 4 into %ir.A.addr) + S_ENDPGM + +... 
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll index 132e476d5e29..9559b5a84b1c 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll @@ -1,8 +1,11 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX8 %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s ; GCN-LABEL: {{^}}test_barrier: -; GCN: buffer_store_dword -; GCN: s_waitcnt +; GFX8: buffer_store_dword +; GFX8: s_waitcnt +; GFX9: flat_store_dword +; GFX9-NOT: s_waitcnt ; GCN: s_barrier define void @test_barrier(i32 addrspace(1)* %out) #0 { entry: diff --git a/llvm/test/MC/AMDGPU/sopp-gfx9.s b/llvm/test/MC/AMDGPU/sopp-gfx9.s new file mode 100644 index 000000000000..237bceb287f2 --- /dev/null +++ b/llvm/test/MC/AMDGPU/sopp-gfx9.s @@ -0,0 +1,71 @@ +// RUN: llvm-mc -arch=amdgcn -mcpu=gfx900 -show-encoding %s | FileCheck --check-prefix=GFX9 %s + +//===----------------------------------------------------------------------===// +// s_waitcnt +//===----------------------------------------------------------------------===// + +s_waitcnt 0 +// GFX9: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf] + +s_waitcnt vmcnt(0) & expcnt(0) & lgkmcnt(0) +// GFX9: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf] + +s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +// GFX9: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf] + +s_waitcnt vmcnt(0), expcnt(0), lgkmcnt(0) +// GFX9: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf] + +s_waitcnt vmcnt(1) +// GFX9: s_waitcnt vmcnt(1) ; encoding: [0x71,0x0f,0x8c,0xbf] + +s_waitcnt vmcnt(9) +// GFX9: s_waitcnt vmcnt(9) ; encoding: [0x79,0x0f,0x8c,0xbf] + +s_waitcnt 
expcnt(2) +// GFX9: s_waitcnt expcnt(2) ; encoding: [0x2f,0xcf,0x8c,0xbf] + +s_waitcnt lgkmcnt(3) +// GFX9: s_waitcnt lgkmcnt(3) ; encoding: [0x7f,0xc3,0x8c,0xbf] + +s_waitcnt lgkmcnt(9) +// GFX9: s_waitcnt lgkmcnt(9) ; encoding: [0x7f,0xc9,0x8c,0xbf] + +s_waitcnt vmcnt(0), expcnt(0) +// GFX9: s_waitcnt vmcnt(0) expcnt(0) ; encoding: [0x00,0x0f,0x8c,0xbf] + +s_waitcnt vmcnt(15) +// GFX9: s_waitcnt vmcnt(15) ; encoding: [0x7f,0x0f,0x8c,0xbf] + +s_waitcnt vmcnt(15) expcnt(6) +// GFX9: s_waitcnt vmcnt(15) expcnt(6) ; encoding: [0x6f,0x0f,0x8c,0xbf] + +s_waitcnt vmcnt(15) lgkmcnt(14) +// GFX9: s_waitcnt vmcnt(15) lgkmcnt(14) ; encoding: [0x7f,0x0e,0x8c,0xbf] + +s_waitcnt vmcnt(15) expcnt(6) lgkmcnt(14) +// GFX9: s_waitcnt vmcnt(15) expcnt(6) lgkmcnt(14) ; encoding: [0x6f,0x0e,0x8c,0xbf] + +s_waitcnt vmcnt(31) +// GFX9: s_waitcnt vmcnt(31) ; encoding: [0x7f,0x4f,0x8c,0xbf] + +s_waitcnt vmcnt(31) expcnt(6) +// GFX9: s_waitcnt vmcnt(31) expcnt(6) ; encoding: [0x6f,0x4f,0x8c,0xbf] + +s_waitcnt vmcnt(31) lgkmcnt(14) +// GFX9: s_waitcnt vmcnt(31) lgkmcnt(14) ; encoding: [0x7f,0x4e,0x8c,0xbf] + +s_waitcnt vmcnt(31) expcnt(6) lgkmcnt(14) +// GFX9: s_waitcnt vmcnt(31) expcnt(6) lgkmcnt(14) ; encoding: [0x6f,0x4e,0x8c,0xbf] + +s_waitcnt vmcnt(62) +// GFX9: s_waitcnt vmcnt(62) ; encoding: [0x7e,0xcf,0x8c,0xbf] + +s_waitcnt vmcnt(62) expcnt(6) +// GFX9: s_waitcnt vmcnt(62) expcnt(6) ; encoding: [0x6e,0xcf,0x8c,0xbf] + +s_waitcnt vmcnt(62) lgkmcnt(14) +// GFX9: s_waitcnt vmcnt(62) lgkmcnt(14) ; encoding: [0x7e,0xce,0x8c,0xbf] + +s_waitcnt vmcnt(62) expcnt(6) lgkmcnt(14) +// GFX9: s_waitcnt vmcnt(62) expcnt(6) lgkmcnt(14) ; encoding: [0x6e,0xce,0x8c,0xbf]