From 8623e8d864b151e8aa023f951d29ecb8aecbf078 Mon Sep 17 00:00:00 2001
From: Matt Arsenault
Date: Thu, 3 Aug 2017 23:00:29 +0000
Subject: [PATCH] AMDGPU: Pass special input registers to functions

llvm-svn: 309998
---
 llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp |   4 +-
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp |  43 ++
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h   |  21 +-
 llvm/lib/Target/AMDGPU/SIFrameLowering.cpp    |  12 +-
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     | 311 ++++++--
 llvm/lib/Target/AMDGPU/SIISelLowering.h       |  13 +
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp        |   9 +-
 .../Target/AMDGPU/SIMachineFunctionInfo.cpp   |  79 +--
 .../lib/Target/AMDGPU/SIMachineFunctionInfo.h |  96 +--
 llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp     |  55 --
 llvm/lib/Target/AMDGPU/SIRegisterInfo.h       |  25 -
 .../AMDGPU/callee-special-input-sgprs.ll      | 612 ++++++++++++++++
 .../AMDGPU/callee-special-input-vgprs.ll      | 671 ++++++++++++++++++
 13 files changed, 1714 insertions(+), 237 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index 21aa0e592569..6d6fccb10cb3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -41,7 +41,7 @@ unsigned AMDGPUCallLowering::lowerParameterPtr(MachineIRBuilder &MIRBuilder,
                                                unsigned Offset) const {
 
   MachineFunction &MF = MIRBuilder.getMF();
-  const SIRegisterInfo *TRI = MF.getSubtarget<SISubtarget>().getRegisterInfo();
+  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
   MachineRegisterInfo &MRI = MF.getRegInfo();
   const Function &F = *MF.getFunction();
   const DataLayout &DL = F.getParent()->getDataLayout();
@@ -49,7 +49,7 @@ unsigned AMDGPUCallLowering::lowerParameterPtr(MachineIRBuilder &MIRBuilder,
   LLT PtrType = getLLTForType(*PtrTy, DL);
   unsigned DstReg = MRI.createGenericVirtualRegister(PtrType);
   unsigned KernArgSegmentPtr =
-    TRI->getPreloadedValue(MF, SIRegisterInfo::KERNARG_SEGMENT_PTR);
+    MFI->getPreloadedReg(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
   unsigned KernArgSegmentVReg = MRI.getLiveInVirtReg(KernArgSegmentPtr);
 
   unsigned OffsetReg = MRI.createGenericVirtualRegister(LLT::scalar(64));
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 9aa02346d424..4f65b40ece50 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -3582,6 +3582,49 @@ SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
   return DAG.getCopyFromReg(DAG.getEntryNode(), SL, VReg, VT);
 }
 
+SDValue AMDGPUTargetLowering::loadStackInputValue(SelectionDAG &DAG,
+                                                  EVT VT,
+                                                  const SDLoc &SL,
+                                                  int64_t Offset) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+
+  int FI = MFI.CreateFixedObject(VT.getStoreSize(), Offset, true);
+  auto SrcPtrInfo = MachinePointerInfo::getStack(MF, Offset);
+  SDValue Ptr = DAG.getFrameIndex(FI, MVT::i32);
+
+  return DAG.getLoad(VT, SL, DAG.getEntryNode(), Ptr, SrcPtrInfo, 4,
+                     MachineMemOperand::MODereferenceable |
+                     MachineMemOperand::MOInvariant);
+}
+
+SDValue AMDGPUTargetLowering::storeStackInputValue(SelectionDAG &DAG,
+                                                   const SDLoc &SL,
+                                                   SDValue Chain,
+                                                   SDValue StackPtr,
+                                                   SDValue ArgVal,
+                                                   int64_t Offset) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachinePointerInfo DstInfo = MachinePointerInfo::getStack(MF, Offset);
+  SDValue PtrOffset = DAG.getConstant(Offset, SL, MVT::i32);
+  SDValue Ptr = DAG.getNode(ISD::ADD, SL, MVT::i32, StackPtr, PtrOffset);
+
+  SDValue Store = DAG.getStore(Chain, SL, ArgVal, Ptr, DstInfo, 4,
+                               MachineMemOperand::MODereferenceable);
+  return Store;
+}
+
+SDValue AMDGPUTargetLowering::loadInputValue(SelectionDAG &DAG,
+                                             const TargetRegisterClass *RC,
+                                             EVT VT, const SDLoc &SL,
+                                             const ArgDescriptor &Arg) const {
+  assert(Arg && "Attempting to load missing argument");
+
+  if (Arg.isRegister())
+    return CreateLiveInRegister(DAG, RC, Arg.getRegister(), VT, SL);
+  return loadStackInputValue(DAG, VT, SL, Arg.getStackOffset());
+}
+
 uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
     const AMDGPUMachineFunction *MFI, const ImplicitParameter Param) const {
   unsigned Alignment = Subtarget->getAlignmentForImplicitArgPtr();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index d85aada6053a..46c81f91d601 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -24,7 +24,7 @@ namespace llvm {
 
 class AMDGPUMachineFunction;
 class AMDGPUSubtarget;
-class MachineRegisterInfo;
+struct ArgDescriptor;
 
 class AMDGPUTargetLowering : public TargetLowering {
 private:
@@ -237,6 +237,25 @@ public:
     return CreateLiveInRegister(DAG, RC, Reg, VT,
                                 SDLoc(DAG.getEntryNode()), true);
   }
+
+  /// Similar to CreateLiveInRegister, except value maybe loaded from a stack
+  /// slot rather than passed in a register.
+  SDValue loadStackInputValue(SelectionDAG &DAG,
+                              EVT VT,
+                              const SDLoc &SL,
+                              int64_t Offset) const;
+
+  SDValue storeStackInputValue(SelectionDAG &DAG,
+                               const SDLoc &SL,
+                               SDValue Chain,
+                               SDValue StackPtr,
+                               SDValue ArgVal,
+                               int64_t Offset) const;
+
+  SDValue loadInputValue(SelectionDAG &DAG,
+                         const TargetRegisterClass *RC,
+                         EVT VT, const SDLoc &SL,
+                         const ArgDescriptor &Arg) const;
+
   enum ImplicitParameter {
     FIRST_IMPLICIT,
     GRID_DIM = FIRST_IMPLICIT,
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index f7e5cb03b3e4..2ecf32c6ffef 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -38,6 +38,7 @@ void SIFrameLowering::emitFlatScratchInit(const SISubtarget &ST,
                                           MachineBasicBlock &MBB) const {
   const SIInstrInfo *TII = ST.getInstrInfo();
   const SIRegisterInfo* TRI = &TII->getRegisterInfo();
+  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
 
   // We don't need this if we only have spills since there is no user facing
   // scratch.
@@ -55,7 +56,7 @@ void SIFrameLowering::emitFlatScratchInit(const SISubtarget &ST,
   MachineBasicBlock::iterator I = MBB.begin();
 
   unsigned FlatScratchInitReg
-    = TRI->getPreloadedValue(MF, SIRegisterInfo::FLAT_SCRATCH_INIT);
+    = MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT);
 
   MachineRegisterInfo &MRI = MF.getRegInfo();
   MRI.addLiveIn(FlatScratchInitReg);
@@ -64,7 +65,6 @@ void SIFrameLowering::emitFlatScratchInit(const SISubtarget &ST,
   unsigned FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0);
   unsigned FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1);
 
-  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
   unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg();
 
   // Do a 64-bit pointer add.
@@ -283,13 +283,13 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
   }
 
   // We need to insert initialization of the scratch resource descriptor.
-  unsigned PreloadedScratchWaveOffsetReg = TRI->getPreloadedValue(
-    MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
+  unsigned PreloadedScratchWaveOffsetReg = MFI->getPreloadedReg(
+    AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
 
   unsigned PreloadedPrivateBufferReg = AMDGPU::NoRegister;
   if (ST.isAmdCodeObjectV2(MF)) {
-    PreloadedPrivateBufferReg = TRI->getPreloadedValue(
-      MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER);
+    PreloadedPrivateBufferReg = MFI->getPreloadedReg(
+      AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
   }
 
   bool OffsetRegUsed = MRI.isPhysRegUsed(ScratchWaveOffsetReg);
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 47a5aa4b0cea..5a53d7914c0c 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -45,6 +45,7 @@
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/MachineValueType.h"
@@ -895,14 +896,19 @@ SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
                                                    uint64_t Offset) const {
   const DataLayout &DL = DAG.getDataLayout();
   MachineFunction &MF = DAG.getMachineFunction();
-  const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
-  unsigned InputPtrReg = TRI->getPreloadedValue(MF,
-                                 SIRegisterInfo::KERNARG_SEGMENT_PTR);
+  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+
+  const ArgDescriptor *InputPtrReg;
+  const TargetRegisterClass *RC;
+
+  std::tie(InputPtrReg, RC)
+    = Info->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
 
   MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
   MVT PtrVT = getPointerTy(DL, AMDGPUASI.CONSTANT_ADDRESS);
   SDValue BasePtr = DAG.getCopyFromReg(Chain, SL,
-    MRI.getLiveInVirtReg(InputPtrReg), PtrVT);
+    MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
+
   return DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr,
                      DAG.getConstant(Offset, SL, PtrVT));
 }
@@ -1005,6 +1011,17 @@ SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA
   return ArgValue;
 }
 
+SDValue SITargetLowering::getPreloadedValue(SelectionDAG &DAG,
+  const SIMachineFunctionInfo &MFI,
+  EVT VT,
+  AMDGPUFunctionArgInfo::PreloadedValue PVID) const {
+  const ArgDescriptor *Reg;
+  const TargetRegisterClass *RC;
+
+  std::tie(Reg, RC) = MFI.getPreloadedValue(PVID);
+  return CreateLiveInRegister(DAG, RC, Reg->getRegister(), VT);
+}
+
 static void processShaderInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
                                    CallingConv::ID CallConv,
                                    ArrayRef<ISD::InputArg> Ins,
@@ -1055,27 +1072,129 @@ static void processShaderInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
 }
 
 // Allocate special inputs passed in VGPRs.
+static void allocateSpecialEntryInputVGPRs(CCState &CCInfo, + MachineFunction &MF, + const SIRegisterInfo &TRI, + SIMachineFunctionInfo &Info) { + if (Info.hasWorkItemIDX()) { + unsigned Reg = AMDGPU::VGPR0; + MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass); + assert(Reg == AMDGPU::VGPR0); + + CCInfo.AllocateReg(Reg); + Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg)); + } + + if (Info.hasWorkItemIDY()) { + unsigned Reg = AMDGPU::VGPR1; + MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass); + + assert(Reg == AMDGPU::VGPR1); + CCInfo.AllocateReg(Reg); + Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg)); + } + + if (Info.hasWorkItemIDZ()) { + unsigned Reg = AMDGPU::VGPR2; + MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass); + + assert(Reg == AMDGPU::VGPR2); + CCInfo.AllocateReg(Reg); + Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg)); + } +} + +// Try to allocate a VGPR at the end of the argument list, or if no argument +// VGPRs are left allocating a stack slot. +static ArgDescriptor allocateVGPR32Input(CCState &CCInfo) { + ArrayRef ArgVGPRs + = makeArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32); + unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs); + if (RegIdx == ArgVGPRs.size()) { + // Spill to stack required. + int64_t Offset = CCInfo.AllocateStack(4, 4); + + return ArgDescriptor::createStack(Offset); + } + + unsigned Reg = ArgVGPRs[RegIdx]; + Reg = CCInfo.AllocateReg(Reg); + assert(Reg != AMDGPU::NoRegister); + + MachineFunction &MF = CCInfo.getMachineFunction(); + MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass); + return ArgDescriptor::createRegister(Reg); +} + +static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo, + const TargetRegisterClass *RC, + unsigned NumArgRegs) { + ArrayRef ArgSGPRs = makeArrayRef(RC->begin(), 32); + unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs); + if (RegIdx == ArgSGPRs.size()) + report_fatal_error("ran out of SGPRs for arguments"); + + unsigned Reg = ArgSGPRs[RegIdx]; + Reg = CCInfo.AllocateReg(Reg); + assert(Reg != AMDGPU::NoRegister); + + MachineFunction &MF = CCInfo.getMachineFunction(); + MF.addLiveIn(Reg, RC); + return ArgDescriptor::createRegister(Reg); +} + +static ArgDescriptor allocateSGPR32Input(CCState &CCInfo) { + return allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32); +} + +static ArgDescriptor allocateSGPR64Input(CCState &CCInfo) { + return allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16); +} + static void allocateSpecialInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) { - if (Info.hasWorkItemIDX()) { - unsigned Reg = TRI.getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_X); - MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass); - CCInfo.AllocateReg(Reg); - } + if (Info.hasWorkItemIDX()) + Info.setWorkItemIDX(allocateVGPR32Input(CCInfo)); - if (Info.hasWorkItemIDY()) { - unsigned Reg = TRI.getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Y); - MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass); - CCInfo.AllocateReg(Reg); - } + if (Info.hasWorkItemIDY()) + Info.setWorkItemIDY(allocateVGPR32Input(CCInfo)); - if (Info.hasWorkItemIDZ()) { - unsigned Reg = TRI.getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Z); - MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass); - CCInfo.AllocateReg(Reg); - } + if (Info.hasWorkItemIDZ()) + Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo)); +} + +static void allocateSpecialInputSGPRs(CCState &CCInfo, + MachineFunction &MF, + const SIRegisterInfo &TRI, + SIMachineFunctionInfo &Info) { + auto &ArgInfo = Info.getArgInfo(); + + // TODO: 
Unify handling with private memory pointers. + + if (Info.hasDispatchPtr()) + ArgInfo.DispatchPtr = allocateSGPR64Input(CCInfo); + + if (Info.hasQueuePtr()) + ArgInfo.QueuePtr = allocateSGPR64Input(CCInfo); + + if (Info.hasKernargSegmentPtr()) + ArgInfo.KernargSegmentPtr = allocateSGPR64Input(CCInfo); + + if (Info.hasDispatchID()) + ArgInfo.DispatchID = allocateSGPR64Input(CCInfo); + + // flat_scratch_init is not applicable for non-kernel functions. + + if (Info.hasWorkGroupIDX()) + ArgInfo.WorkGroupIDX = allocateSGPR32Input(CCInfo); + + if (Info.hasWorkGroupIDY()) + ArgInfo.WorkGroupIDY = allocateSGPR32Input(CCInfo); + + if (Info.hasWorkGroupIDZ()) + ArgInfo.WorkGroupIDZ = allocateSGPR32Input(CCInfo); } // Allocate special inputs passed in user SGPRs. @@ -1212,8 +1331,8 @@ static void reservePrivateMemoryRegs(const TargetMachine &TM, // resource. For the Code Object V2 ABI, this will be the first 4 user // SGPR inputs. We can reserve those and use them directly. - unsigned PrivateSegmentBufferReg = TRI.getPreloadedValue( - MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER); + unsigned PrivateSegmentBufferReg = Info.getPreloadedReg( + AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER); Info.setScratchRSrcReg(PrivateSegmentBufferReg); if (MFI.hasCalls()) { @@ -1229,8 +1348,8 @@ static void reservePrivateMemoryRegs(const TargetMachine &TM, = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF); Info.setScratchWaveOffsetReg(ReservedOffsetReg); } else { - unsigned PrivateSegmentWaveByteOffsetReg = TRI.getPreloadedValue( - MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); + unsigned PrivateSegmentWaveByteOffsetReg = Info.getPreloadedReg( + AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); Info.setScratchWaveOffsetReg(PrivateSegmentWaveByteOffsetReg); } } else { @@ -1256,8 +1375,8 @@ static void reservePrivateMemoryRegs(const TargetMachine &TM, Info.setScratchRSrcReg(ReservedBufferReg); if (HasStackObjects && !MFI.hasCalls()) { - unsigned ScratchWaveOffsetReg = TRI.getPreloadedValue( - MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); + unsigned ScratchWaveOffsetReg = Info.getPreloadedReg( + AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); Info.setScratchWaveOffsetReg(ScratchWaveOffsetReg); } else { unsigned ReservedOffsetReg @@ -1390,7 +1509,7 @@ SDValue SITargetLowering::LowerFormalArguments( } if (IsEntryFunc) { - allocateSpecialInputVGPRs(CCInfo, MF, *TRI, *Info); + allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info); allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info); } @@ -1509,6 +1628,11 @@ SDValue SITargetLowering::LowerFormalArguments( InVals.push_back(Val); } + if (!IsEntryFunc) { + // Special inputs come after user arguments. + allocateSpecialInputVGPRs(CCInfo, MF, *TRI, *Info); + } + // Start adding system SGPRs. if (IsEntryFunc) { allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsShader); @@ -1516,8 +1640,13 @@ SDValue SITargetLowering::LowerFormalArguments( CCInfo.AllocateReg(Info->getScratchRSrcReg()); CCInfo.AllocateReg(Info->getScratchWaveOffsetReg()); CCInfo.AllocateReg(Info->getFrameOffsetReg()); + allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info); } + auto &ArgUsageInfo = + DAG.getPass()->getAnalysis(); + ArgUsageInfo.setFuncArgInfo(*MF.getFunction(), Info->getArgInfo()); + return Chains.empty() ? 
Chain : DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); } @@ -1741,6 +1870,81 @@ SDValue SITargetLowering::LowerCallResult( return Chain; } +// Add code to pass special inputs required depending on used features separate +// from the explicit user arguments present in the IR. +void SITargetLowering::passSpecialInputs( + CallLoweringInfo &CLI, + const SIMachineFunctionInfo &Info, + SmallVectorImpl> &RegsToPass, + SmallVectorImpl &MemOpChains, + SDValue Chain, + SDValue StackPtr) const { + // If we don't have a call site, this was a call inserted by + // legalization. These can never use special inputs. + if (!CLI.CS) + return; + + const Function *CalleeFunc = CLI.CS.getCalledFunction(); + if (!CalleeFunc) + report_fatal_error("indirect calls not handled"); + + SelectionDAG &DAG = CLI.DAG; + const SDLoc &DL = CLI.DL; + + const SISubtarget *ST = getSubtarget(); + const SIRegisterInfo *TRI = ST->getRegisterInfo(); + + auto &ArgUsageInfo = + DAG.getPass()->getAnalysis(); + const AMDGPUFunctionArgInfo &CalleeArgInfo + = ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc); + + const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo(); + + // TODO: Unify with private memory register handling. This is complicated by + // the fact that at least in kernels, the input argument is not necessarily + // in the same location as the input. + AMDGPUFunctionArgInfo::PreloadedValue InputRegs[] = { + AMDGPUFunctionArgInfo::DISPATCH_PTR, + AMDGPUFunctionArgInfo::QUEUE_PTR, + AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR, + AMDGPUFunctionArgInfo::DISPATCH_ID, + AMDGPUFunctionArgInfo::WORKGROUP_ID_X, + AMDGPUFunctionArgInfo::WORKGROUP_ID_Y, + AMDGPUFunctionArgInfo::WORKGROUP_ID_Z, + AMDGPUFunctionArgInfo::WORKITEM_ID_X, + AMDGPUFunctionArgInfo::WORKITEM_ID_Y, + AMDGPUFunctionArgInfo::WORKITEM_ID_Z + }; + + for (auto InputID : InputRegs) { + const ArgDescriptor *OutgoingArg; + const TargetRegisterClass *ArgRC; + + std::tie(OutgoingArg, ArgRC) = CalleeArgInfo.getPreloadedValue(InputID); + if (!OutgoingArg) + continue; + + const ArgDescriptor *IncomingArg; + const TargetRegisterClass *IncomingArgRC; + std::tie(IncomingArg, IncomingArgRC) + = CallerArgInfo.getPreloadedValue(InputID); + assert(IncomingArgRC == ArgRC); + + // All special arguments are ints for now. + EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32; + SDValue InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg); + if (OutgoingArg->isRegister()) { + RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg); + } else { + SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, StackPtr, + InputReg, + OutgoingArg->getStackOffset()); + MemOpChains.push_back(ArgStore); + } + } +} + // The wave scratch offset register is used as the global base pointer. SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, SmallVectorImpl &InVals) const { @@ -1897,6 +2101,9 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, } } + // Copy special input registers after user input arguments. 
+ passSpecialInputs(CLI, *Info, RegsToPass, MemOpChains, Chain, StackPtr); + if (!MemOpChains.empty()) Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains); @@ -3424,7 +3631,6 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); auto MFI = MF.getInfo(); - const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo(); EVT VT = Op.getValueType(); SDLoc DL(Op); @@ -3436,10 +3642,8 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::amdgcn_implicit_buffer_ptr: { if (getSubtarget()->isAmdCodeObjectV2(MF)) return emitNonHSAIntrinsicError(DAG, DL, VT); - - unsigned Reg = TRI->getPreloadedValue(MF, - SIRegisterInfo::IMPLICIT_BUFFER_PTR); - return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, Reg, VT); + return getPreloadedValue(DAG, *MFI, VT, + AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); } case Intrinsic::amdgcn_dispatch_ptr: case Intrinsic::amdgcn_queue_ptr: { @@ -3451,10 +3655,9 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return DAG.getUNDEF(VT); } - auto Reg = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr ? - SIRegisterInfo::DISPATCH_PTR : SIRegisterInfo::QUEUE_PTR; - return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, - TRI->getPreloadedValue(MF, Reg), VT); + auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr ? + AMDGPUFunctionArgInfo::DISPATCH_PTR : AMDGPUFunctionArgInfo::QUEUE_PTR; + return getPreloadedValue(DAG, *MFI, VT, RegID); } case Intrinsic::amdgcn_implicitarg_ptr: { if (MFI->isEntryFunction()) @@ -3462,13 +3665,11 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, report_fatal_error("amdgcn.implicitarg.ptr not implemented for functions"); } case Intrinsic::amdgcn_kernarg_segment_ptr: { - unsigned Reg - = TRI->getPreloadedValue(MF, SIRegisterInfo::KERNARG_SEGMENT_PTR); - return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, Reg, VT); + return getPreloadedValue(DAG, *MFI, VT, + AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); } case Intrinsic::amdgcn_dispatch_id: { - unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::DISPATCH_ID); - return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, Reg, VT); + return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::DISPATCH_ID); } case Intrinsic::amdgcn_rcp: return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1)); @@ -3553,28 +3754,32 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SI::KernelInputOffsets::LOCAL_SIZE_Z); case Intrinsic::amdgcn_workgroup_id_x: case Intrinsic::r600_read_tgid_x: - return CreateLiveInRegister(DAG, &AMDGPU::SReg_32_XM0RegClass, - TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_X), VT); + return getPreloadedValue(DAG, *MFI, VT, + AMDGPUFunctionArgInfo::WORKGROUP_ID_X); case Intrinsic::amdgcn_workgroup_id_y: case Intrinsic::r600_read_tgid_y: - return CreateLiveInRegister(DAG, &AMDGPU::SReg_32_XM0RegClass, - TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_Y), VT); + return getPreloadedValue(DAG, *MFI, VT, + AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); case Intrinsic::amdgcn_workgroup_id_z: case Intrinsic::r600_read_tgid_z: - return CreateLiveInRegister(DAG, &AMDGPU::SReg_32_XM0RegClass, - TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_Z), VT); - case Intrinsic::amdgcn_workitem_id_x: + return getPreloadedValue(DAG, *MFI, VT, + AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); + case Intrinsic::amdgcn_workitem_id_x: { case Intrinsic::r600_read_tidig_x: - return CreateLiveInRegister(DAG, 
&AMDGPU::VGPR_32RegClass, - TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_X), VT); + return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32, + SDLoc(DAG.getEntryNode()), + MFI->getArgInfo().WorkItemIDX); + } case Intrinsic::amdgcn_workitem_id_y: case Intrinsic::r600_read_tidig_y: - return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass, - TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Y), VT); + return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32, + SDLoc(DAG.getEntryNode()), + MFI->getArgInfo().WorkItemIDY); case Intrinsic::amdgcn_workitem_id_z: case Intrinsic::r600_read_tidig_z: - return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass, - TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Z), VT); + return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32, + SDLoc(DAG.getEntryNode()), + MFI->getArgInfo().WorkItemIDZ); case AMDGPUIntrinsic::SI_load_const: { SDValue Ops[] = { Op.getOperand(1), diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h index dbe78876298d..9176e4a3004d 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -16,6 +16,7 @@ #define LLVM_LIB_TARGET_AMDGPU_SIISELLOWERING_H #include "AMDGPUISelLowering.h" +#include "AMDGPUArgumentUsageInfo.h" #include "SIInstrInfo.h" namespace llvm { @@ -32,6 +33,10 @@ class SITargetLowering final : public AMDGPUTargetLowering { SDValue lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA, const SDLoc &SL, SDValue Chain, const ISD::InputArg &Arg) const; + SDValue getPreloadedValue(SelectionDAG &DAG, + const SIMachineFunctionInfo &MFI, + EVT VT, + AMDGPUFunctionArgInfo::PreloadedValue) const; SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const override; @@ -205,6 +210,14 @@ public: const SmallVectorImpl &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override; + void passSpecialInputs( + CallLoweringInfo &CLI, + const SIMachineFunctionInfo &Info, + SmallVectorImpl> &RegsToPass, + SmallVectorImpl &MemOpChains, + SDValue Chain, + SDValue StackPtr) const; + SDValue LowerCallResult(SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Ins, diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 204eeff54d1f..1d884524bcd4 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -916,7 +916,6 @@ unsigned SIInstrInfo::calculateLDSSpillAddress( MachineFunction *MF = MBB.getParent(); SIMachineFunctionInfo *MFI = MF->getInfo(); const SISubtarget &ST = MF->getSubtarget(); - const SIRegisterInfo *TRI = ST.getRegisterInfo(); DebugLoc DL = MBB.findDebugLoc(MI); unsigned WorkGroupSize = MFI->getMaxFlatWorkGroupSize(); unsigned WavefrontSize = ST.getWavefrontSize(); @@ -936,13 +935,13 @@ unsigned SIInstrInfo::calculateLDSSpillAddress( WorkGroupSize > WavefrontSize) { unsigned TIDIGXReg - = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_X); + = MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_X); unsigned TIDIGYReg - = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_Y); + = MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); unsigned TIDIGZReg - = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_Z); + = MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); unsigned InputPtrReg = - TRI->getPreloadedValue(*MF, SIRegisterInfo::KERNARG_SEGMENT_PTR); + 
MFI->getPreloadedReg(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); for (unsigned Reg : {TIDIGXReg, TIDIGYReg, TIDIGZReg}) { if (!Entry.isLiveIn(Reg)) Entry.addLiveIn(Reg); diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index cfc9fe5fa515..c4405309e12f 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -27,24 +27,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) ScratchWaveOffsetReg(AMDGPU::SCRATCH_WAVE_OFFSET_REG), FrameOffsetReg(AMDGPU::FP_REG), StackPtrOffsetReg(AMDGPU::SP_REG), - PrivateSegmentBufferUserSGPR(AMDGPU::NoRegister), - DispatchPtrUserSGPR(AMDGPU::NoRegister), - QueuePtrUserSGPR(AMDGPU::NoRegister), - KernargSegmentPtrUserSGPR(AMDGPU::NoRegister), - DispatchIDUserSGPR(AMDGPU::NoRegister), - FlatScratchInitUserSGPR(AMDGPU::NoRegister), - PrivateSegmentSizeUserSGPR(AMDGPU::NoRegister), - GridWorkGroupCountXUserSGPR(AMDGPU::NoRegister), - GridWorkGroupCountYUserSGPR(AMDGPU::NoRegister), - GridWorkGroupCountZUserSGPR(AMDGPU::NoRegister), - WorkGroupIDXSystemSGPR(AMDGPU::NoRegister), - WorkGroupIDYSystemSGPR(AMDGPU::NoRegister), - WorkGroupIDZSystemSGPR(AMDGPU::NoRegister), - WorkGroupInfoSystemSGPR(AMDGPU::NoRegister), - PrivateSegmentWaveByteOffsetSystemSGPR(AMDGPU::NoRegister), - WorkItemIDXVGPR(AMDGPU::NoRegister), - WorkItemIDYVGPR(AMDGPU::NoRegister), - WorkItemIDZVGPR(AMDGPU::NoRegister), + ArgInfo(), PSInputAddr(0), PSInputEnable(0), ReturnsVoid(true), @@ -91,8 +74,11 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) FrameOffsetReg = AMDGPU::SGPR5; StackPtrOffsetReg = AMDGPU::SGPR32; - // FIXME: Not really a system SGPR. - PrivateSegmentWaveByteOffsetSystemSGPR = ScratchWaveOffsetReg; + ArgInfo.PrivateSegmentBuffer = + ArgDescriptor::createRegister(ScratchRSrcReg); + ArgInfo.PrivateSegmentWaveByteOffset = + ArgDescriptor::createRegister(ScratchWaveOffsetReg); + if (F->hasFnAttribute("amdgpu-implicitarg-ptr")) ImplicitArgPtr = true; } else { @@ -151,10 +137,11 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) if (HasStackObjects || MaySpill) { PrivateSegmentWaveByteOffset = true; - // HS and GS always have the scratch wave offset in SGPR5 on GFX9. - if (ST.getGeneration() >= AMDGPUSubtarget::GFX9 && - (CC == CallingConv::AMDGPU_HS || CC == CallingConv::AMDGPU_GS)) - PrivateSegmentWaveByteOffsetSystemSGPR = AMDGPU::SGPR5; + // HS and GS always have the scratch wave offset in SGPR5 on GFX9. 
+ if (ST.getGeneration() >= AMDGPUSubtarget::GFX9 && + (CC == CallingConv::AMDGPU_HS || CC == CallingConv::AMDGPU_GS)) + ArgInfo.PrivateSegmentWaveByteOffset + = ArgDescriptor::createRegister(AMDGPU::SGPR5); } } @@ -189,52 +176,54 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) unsigned SIMachineFunctionInfo::addPrivateSegmentBuffer( const SIRegisterInfo &TRI) { - PrivateSegmentBufferUserSGPR = TRI.getMatchingSuperReg( - getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_128RegClass); + ArgInfo.PrivateSegmentBuffer = + ArgDescriptor::createRegister(TRI.getMatchingSuperReg( + getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_128RegClass)); NumUserSGPRs += 4; - return PrivateSegmentBufferUserSGPR; + return ArgInfo.PrivateSegmentBuffer.getRegister(); } unsigned SIMachineFunctionInfo::addDispatchPtr(const SIRegisterInfo &TRI) { - DispatchPtrUserSGPR = TRI.getMatchingSuperReg( - getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass); + ArgInfo.DispatchPtr = ArgDescriptor::createRegister(TRI.getMatchingSuperReg( + getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass)); NumUserSGPRs += 2; - return DispatchPtrUserSGPR; + return ArgInfo.DispatchPtr.getRegister(); } unsigned SIMachineFunctionInfo::addQueuePtr(const SIRegisterInfo &TRI) { - QueuePtrUserSGPR = TRI.getMatchingSuperReg( - getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass); + ArgInfo.QueuePtr = ArgDescriptor::createRegister(TRI.getMatchingSuperReg( + getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass)); NumUserSGPRs += 2; - return QueuePtrUserSGPR; + return ArgInfo.QueuePtr.getRegister(); } unsigned SIMachineFunctionInfo::addKernargSegmentPtr(const SIRegisterInfo &TRI) { - KernargSegmentPtrUserSGPR = TRI.getMatchingSuperReg( - getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass); + ArgInfo.KernargSegmentPtr + = ArgDescriptor::createRegister(TRI.getMatchingSuperReg( + getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass)); NumUserSGPRs += 2; - return KernargSegmentPtrUserSGPR; + return ArgInfo.KernargSegmentPtr.getRegister(); } unsigned SIMachineFunctionInfo::addDispatchID(const SIRegisterInfo &TRI) { - DispatchIDUserSGPR = TRI.getMatchingSuperReg( - getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass); + ArgInfo.DispatchID = ArgDescriptor::createRegister(TRI.getMatchingSuperReg( + getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass)); NumUserSGPRs += 2; - return DispatchIDUserSGPR; + return ArgInfo.DispatchID.getRegister(); } unsigned SIMachineFunctionInfo::addFlatScratchInit(const SIRegisterInfo &TRI) { - FlatScratchInitUserSGPR = TRI.getMatchingSuperReg( - getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass); + ArgInfo.FlatScratchInit = ArgDescriptor::createRegister(TRI.getMatchingSuperReg( + getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass)); NumUserSGPRs += 2; - return FlatScratchInitUserSGPR; + return ArgInfo.FlatScratchInit.getRegister(); } unsigned SIMachineFunctionInfo::addImplicitBufferPtr(const SIRegisterInfo &TRI) { - ImplicitBufferPtrUserSGPR = TRI.getMatchingSuperReg( - getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass); + ArgInfo.ImplicitBufferPtr = ArgDescriptor::createRegister(TRI.getMatchingSuperReg( + getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass)); NumUserSGPRs += 2; - return ImplicitBufferPtrUserSGPR; + return ArgInfo.ImplicitBufferPtr.getRegister(); } static bool isCalleeSavedReg(const MCPhysReg *CSRegs, MCPhysReg Reg) { diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h 
b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h index 94145c46e10a..5581fe4c55e4 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -16,6 +16,7 @@ #include "AMDGPUMachineFunction.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "AMDGPUArgumentUsageInfo.h" #include "SIRegisterInfo.h" #include "llvm/CodeGen/PseudoSourceValue.h" #include "llvm/MC/MCRegisterInfo.h" @@ -96,33 +97,7 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction { // Top of the stack SGPR offset derived from the ScratchWaveOffsetReg. unsigned StackPtrOffsetReg; - // Input registers for non-HSA ABI - unsigned ImplicitBufferPtrUserSGPR; - - // Input registers setup for the HSA ABI. - // User SGPRs in allocation order. - unsigned PrivateSegmentBufferUserSGPR; - unsigned DispatchPtrUserSGPR; - unsigned QueuePtrUserSGPR; - unsigned KernargSegmentPtrUserSGPR; - unsigned DispatchIDUserSGPR; - unsigned FlatScratchInitUserSGPR; - unsigned PrivateSegmentSizeUserSGPR; - unsigned GridWorkGroupCountXUserSGPR; - unsigned GridWorkGroupCountYUserSGPR; - unsigned GridWorkGroupCountZUserSGPR; - - // System SGPRs in allocation order. - unsigned WorkGroupIDXSystemSGPR; - unsigned WorkGroupIDYSystemSGPR; - unsigned WorkGroupIDZSystemSGPR; - unsigned WorkGroupInfoSystemSGPR; - unsigned PrivateSegmentWaveByteOffsetSystemSGPR; - - // VGPR inputs. These are always v0, v1 and v2 for entry functions. - unsigned WorkItemIDXVGPR; - unsigned WorkItemIDYVGPR; - unsigned WorkItemIDZVGPR; + AMDGPUFunctionArgInfo ArgInfo; // Graphics info. unsigned PSInputAddr; @@ -235,7 +210,6 @@ private: SmallVector SpillVGPRs; public: - SIMachineFunctionInfo(const MachineFunction &MF); ArrayRef getSGPRToVGPRSpills(int FrameIndex) const { @@ -266,37 +240,52 @@ public: // Add system SGPRs. 
unsigned addWorkGroupIDX() { - WorkGroupIDXSystemSGPR = getNextSystemSGPR(); + ArgInfo.WorkGroupIDX = ArgDescriptor::createRegister(getNextSystemSGPR()); NumSystemSGPRs += 1; - return WorkGroupIDXSystemSGPR; + return ArgInfo.WorkGroupIDX.getRegister(); } unsigned addWorkGroupIDY() { - WorkGroupIDYSystemSGPR = getNextSystemSGPR(); + ArgInfo.WorkGroupIDY = ArgDescriptor::createRegister(getNextSystemSGPR()); NumSystemSGPRs += 1; - return WorkGroupIDYSystemSGPR; + return ArgInfo.WorkGroupIDY.getRegister(); } unsigned addWorkGroupIDZ() { - WorkGroupIDZSystemSGPR = getNextSystemSGPR(); + ArgInfo.WorkGroupIDZ = ArgDescriptor::createRegister(getNextSystemSGPR()); NumSystemSGPRs += 1; - return WorkGroupIDZSystemSGPR; + return ArgInfo.WorkGroupIDZ.getRegister(); } unsigned addWorkGroupInfo() { - WorkGroupInfoSystemSGPR = getNextSystemSGPR(); + ArgInfo.WorkGroupInfo = ArgDescriptor::createRegister(getNextSystemSGPR()); NumSystemSGPRs += 1; - return WorkGroupInfoSystemSGPR; + return ArgInfo.WorkGroupInfo.getRegister(); } + // Add special VGPR inputs + void setWorkItemIDX(ArgDescriptor Arg) { + ArgInfo.WorkItemIDX = Arg; + } + + void setWorkItemIDY(ArgDescriptor Arg) { + ArgInfo.WorkItemIDY = Arg; + } + + void setWorkItemIDZ(ArgDescriptor Arg) { + ArgInfo.WorkItemIDZ = Arg; + } + + unsigned addPrivateSegmentWaveByteOffset() { - PrivateSegmentWaveByteOffsetSystemSGPR = getNextSystemSGPR(); + ArgInfo.PrivateSegmentWaveByteOffset + = ArgDescriptor::createRegister(getNextSystemSGPR()); NumSystemSGPRs += 1; - return PrivateSegmentWaveByteOffsetSystemSGPR; + return ArgInfo.PrivateSegmentWaveByteOffset.getRegister(); } void setPrivateSegmentWaveByteOffset(unsigned Reg) { - PrivateSegmentWaveByteOffsetSystemSGPR = Reg; + ArgInfo.PrivateSegmentWaveByteOffset = ArgDescriptor::createRegister(Reg); } bool hasPrivateSegmentBuffer() const { @@ -375,6 +364,23 @@ public: return ImplicitBufferPtr; } + AMDGPUFunctionArgInfo &getArgInfo() { + return ArgInfo; + } + + const AMDGPUFunctionArgInfo &getArgInfo() const { + return ArgInfo; + } + + std::pair + getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const { + return ArgInfo.getPreloadedValue(Value); + } + + unsigned getPreloadedReg(AMDGPUFunctionArgInfo::PreloadedValue Value) const { + return ArgInfo.getPreloadedValue(Value).first->getRegister(); + } + unsigned getNumUserSGPRs() const { return NumUserSGPRs; } @@ -384,7 +390,7 @@ public: } unsigned getPrivateSegmentWaveByteOffsetSystemSGPR() const { - return PrivateSegmentWaveByteOffsetSystemSGPR; + return ArgInfo.PrivateSegmentWaveByteOffset.getRegister(); } /// \brief Returns the physical register reserved for use as the resource @@ -426,11 +432,11 @@ public: } unsigned getQueuePtrUserSGPR() const { - return QueuePtrUserSGPR; + return ArgInfo.QueuePtr.getRegister(); } unsigned getImplicitBufferPtrUserSGPR() const { - return ImplicitBufferPtrUserSGPR; + return ArgInfo.ImplicitBufferPtr.getRegister(); } bool hasSpilledSGPRs() const { @@ -562,13 +568,13 @@ public: switch (Dim) { case 0: assert(hasWorkGroupIDX()); - return WorkGroupIDXSystemSGPR; + return ArgInfo.WorkGroupIDX.getRegister(); case 1: assert(hasWorkGroupIDY()); - return WorkGroupIDYSystemSGPR; + return ArgInfo.WorkGroupIDY.getRegister(); case 2: assert(hasWorkGroupIDZ()); - return WorkGroupIDZSystemSGPR; + return ArgInfo.WorkGroupIDZ.getRegister(); } llvm_unreachable("unexpected dimension"); } diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index ea0bfb2ad9ff..7c73f92eed27 100644 --- 
a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -1338,61 +1338,6 @@ bool SIRegisterInfo::shouldRewriteCopySrc( return getCommonSubClass(DefRC, SrcRC) != nullptr; } -// FIXME: Most of these are flexible with HSA and we don't need to reserve them -// as input registers if unused. Whether the dispatch ptr is necessary should be -// easy to detect from used intrinsics. Scratch setup is harder to know. -unsigned SIRegisterInfo::getPreloadedValue(const MachineFunction &MF, - enum PreloadedValue Value) const { - - const SIMachineFunctionInfo *MFI = MF.getInfo(); - const SISubtarget &ST = MF.getSubtarget(); - (void)ST; - switch (Value) { - case SIRegisterInfo::WORKGROUP_ID_X: - assert(MFI->hasWorkGroupIDX()); - return MFI->WorkGroupIDXSystemSGPR; - case SIRegisterInfo::WORKGROUP_ID_Y: - assert(MFI->hasWorkGroupIDY()); - return MFI->WorkGroupIDYSystemSGPR; - case SIRegisterInfo::WORKGROUP_ID_Z: - assert(MFI->hasWorkGroupIDZ()); - return MFI->WorkGroupIDZSystemSGPR; - case SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET: - return MFI->PrivateSegmentWaveByteOffsetSystemSGPR; - case SIRegisterInfo::PRIVATE_SEGMENT_BUFFER: - assert(MFI->hasPrivateSegmentBuffer()); - return MFI->PrivateSegmentBufferUserSGPR; - case SIRegisterInfo::IMPLICIT_BUFFER_PTR: - assert(MFI->hasImplicitBufferPtr()); - return MFI->ImplicitBufferPtrUserSGPR; - case SIRegisterInfo::KERNARG_SEGMENT_PTR: - assert(MFI->hasKernargSegmentPtr()); - return MFI->KernargSegmentPtrUserSGPR; - case SIRegisterInfo::DISPATCH_ID: - assert(MFI->hasDispatchID()); - return MFI->DispatchIDUserSGPR; - case SIRegisterInfo::FLAT_SCRATCH_INIT: - assert(MFI->hasFlatScratchInit()); - return MFI->FlatScratchInitUserSGPR; - case SIRegisterInfo::DISPATCH_PTR: - assert(MFI->hasDispatchPtr()); - return MFI->DispatchPtrUserSGPR; - case SIRegisterInfo::QUEUE_PTR: - assert(MFI->hasQueuePtr()); - return MFI->QueuePtrUserSGPR; - case SIRegisterInfo::WORKITEM_ID_X: - assert(MFI->hasWorkItemIDX()); - return AMDGPU::VGPR0; - case SIRegisterInfo::WORKITEM_ID_Y: - assert(MFI->hasWorkItemIDY()); - return AMDGPU::VGPR1; - case SIRegisterInfo::WORKITEM_ID_Z: - assert(MFI->hasWorkItemIDZ()); - return AMDGPU::VGPR2; - } - llvm_unreachable("unexpected preloaded value type"); -} - /// \brief Returns a register that is not used at any point in the function. /// If all registers are used, then this function will return // AMDGPU::NoRegister. diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h index 3a8dea29df5b..65655b79c214 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h @@ -186,31 +186,6 @@ public: OpType <= AMDGPU::OPERAND_SRC_LAST; } - enum PreloadedValue { - // SGPRS: - PRIVATE_SEGMENT_BUFFER = 0, - DISPATCH_PTR = 1, - QUEUE_PTR = 2, - KERNARG_SEGMENT_PTR = 3, - DISPATCH_ID = 4, - FLAT_SCRATCH_INIT = 5, - WORKGROUP_ID_X = 10, - WORKGROUP_ID_Y = 11, - WORKGROUP_ID_Z = 12, - PRIVATE_SEGMENT_WAVE_BYTE_OFFSET = 14, - IMPLICIT_BUFFER_PTR = 15, - - // VGPRS: - FIRST_VGPR_VALUE = 16, - WORKITEM_ID_X = FIRST_VGPR_VALUE, - WORKITEM_ID_Y = 17, - WORKITEM_ID_Z = 18 - }; - - /// \brief Returns the physical register that \p Value is stored in. 
- unsigned getPreloadedValue(const MachineFunction &MF, - enum PreloadedValue Value) const; - unsigned findUnusedRegister(const MachineRegisterInfo &MRI, const TargetRegisterClass *RC, const MachineFunction &MF) const; diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll new file mode 100644 index 000000000000..eb8c4281706e --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll @@ -0,0 +1,612 @@ +; RUN: llc -amdgpu-function-calls -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI %s +; RUN: llc -amdgpu-function-calls -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s + +; GCN-LABEL: {{^}}use_dispatch_ptr: +; GCN: s_load_dword s{{[0-9]+}}, s[6:7], 0x0 +define void @use_dispatch_ptr() #1 { + %dispatch_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0 + %header_ptr = bitcast i8 addrspace(2)* %dispatch_ptr to i32 addrspace(2)* + %value = load volatile i32, i32 addrspace(2)* %header_ptr + ret void +} + +; GCN-LABEL: {{^}}kern_indirect_use_dispatch_ptr: +; GCN: enable_sgpr_dispatch_ptr = 1 +; GCN: s_mov_b64 s[6:7], s[4:5] +define amdgpu_kernel void @kern_indirect_use_dispatch_ptr(i32) #1 { + call void @use_dispatch_ptr() + ret void +} + +; GCN-LABEL: {{^}}use_queue_ptr: +; GCN: s_load_dword s{{[0-9]+}}, s[6:7], 0x0 +define void @use_queue_ptr() #1 { + %queue_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.queue.ptr() #0 + %header_ptr = bitcast i8 addrspace(2)* %queue_ptr to i32 addrspace(2)* + %value = load volatile i32, i32 addrspace(2)* %header_ptr + ret void +} + +; GCN-LABEL: {{^}}kern_indirect_use_queue_ptr: +; GCN: enable_sgpr_queue_ptr = 1 +; GCN: s_mov_b64 s[6:7], s[4:5] +; GCN: s_swappc_b64 +define amdgpu_kernel void @kern_indirect_use_queue_ptr(i32) #1 { + call void @use_queue_ptr() + ret void +} + +; GCN-LABEL: {{^}}use_queue_ptr_addrspacecast: +; CIVI: s_load_dword [[APERTURE_LOAD:s[0-9]+]], s[6:7], 0x10 +; GFX9: s_getreg_b32 [[APERTURE_LOAD:s[0-9]+]] + +; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], [[APERTURE_LOAD]] +; GCN: {{flat|global}}_store_dword v{{\[[0-9]+}}:[[HI]]{{\]}} +define void @use_queue_ptr_addrspacecast() #1 { + %asc = addrspacecast i32 addrspace(3)* inttoptr (i32 16 to i32 addrspace(3)*) to i32 addrspace(4)* + store volatile i32 0, i32 addrspace(4)* %asc + ret void +} + +; GCN-LABEL: {{^}}kern_indirect_use_queue_ptr_addrspacecast: +; CIVI: enable_sgpr_queue_ptr = 1 + +; CIVI: s_mov_b64 s[6:7], s[4:5] +; GFX9-NOT: s_mov_b64 +; GCN: s_swappc_b64 +define amdgpu_kernel void @kern_indirect_use_queue_ptr_addrspacecast(i32) #1 { + call void @use_queue_ptr_addrspacecast() + ret void +} + +; GCN-LABEL: {{^}}use_kernarg_segment_ptr: +; GCN: s_load_dword s{{[0-9]+}}, s[6:7], 0x0 +define void @use_kernarg_segment_ptr() #1 { + %kernarg_segment_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() #0 + %header_ptr = bitcast i8 addrspace(2)* %kernarg_segment_ptr to i32 addrspace(2)* + %value = load volatile i32, i32 addrspace(2)* %header_ptr + ret void +} + +; GCN-LABEL: {{^}}kern_indirect_use_kernarg_segment_ptr: +; GCN: enable_sgpr_kernarg_segment_ptr = 1 +; GCN: s_mov_b64 s[6:7], s[4:5] +; GCN: s_swappc_b64 +define amdgpu_kernel void @kern_indirect_use_kernarg_segment_ptr(i32) #1 { + call void @use_kernarg_segment_ptr() + ret void +} + +; GCN-LABEL: {{^}}use_dispatch_id: +; GCN: ; use s[6:7] +define void @use_dispatch_id() #1 { + 
%id = call i64 @llvm.amdgcn.dispatch.id() + call void asm sideeffect "; use $0", "s"(i64 %id) + ret void +} + +; No kernarg segment so that there is a mov to check. With kernarg +; pointer enabled, it happens to end up in the right place anyway. + +; GCN-LABEL: {{^}}kern_indirect_use_dispatch_id: +; GCN: enable_sgpr_dispatch_id = 1 + +; GCN: s_mov_b64 s[6:7], s[4:5] +define amdgpu_kernel void @kern_indirect_use_dispatch_id() #1 { + call void @use_dispatch_id() + ret void +} + +; GCN-LABEL: {{^}}use_workgroup_id_x: +; GCN: s_waitcnt +; GCN: ; use s6 +define void @use_workgroup_id_x() #1 { + %val = call i32 @llvm.amdgcn.workgroup.id.x() + call void asm sideeffect "; use $0", "s"(i32 %val) + ret void +} + +; GCN-LABEL: {{^}}use_stack_workgroup_id_x: +; GCN: s_waitcnt +; GCN: s_mov_b32 s5, s32 +; GCN: buffer_store_dword v0, off, s[0:3], s5 offset:4 +; GCN: ; use s6 +; GCN: s_setpc_b64 +define void @use_stack_workgroup_id_x() #1 { + %alloca = alloca i32 + store volatile i32 0, i32* %alloca + %val = call i32 @llvm.amdgcn.workgroup.id.x() + call void asm sideeffect "; use $0", "s"(i32 %val) + ret void +} + +; GCN-LABEL: {{^}}use_workgroup_id_y: +; GCN: s_waitcnt +; GCN: ; use s6 +define void @use_workgroup_id_y() #1 { + %val = call i32 @llvm.amdgcn.workgroup.id.y() + call void asm sideeffect "; use $0", "s"(i32 %val) + ret void +} + +; GCN-LABEL: {{^}}use_workgroup_id_z: +; GCN: s_waitcnt +; GCN: ; use s6 +define void @use_workgroup_id_z() #1 { + %val = call i32 @llvm.amdgcn.workgroup.id.z() + call void asm sideeffect "; use $0", "s"(i32 %val) + ret void +} + +; GCN-LABEL: {{^}}use_workgroup_id_xy: +; GCN: ; use s6 +; GCN: ; use s7 +define void @use_workgroup_id_xy() #1 { + %val0 = call i32 @llvm.amdgcn.workgroup.id.x() + %val1 = call i32 @llvm.amdgcn.workgroup.id.y() + call void asm sideeffect "; use $0", "s"(i32 %val0) + call void asm sideeffect "; use $0", "s"(i32 %val1) + ret void +} + +; GCN-LABEL: {{^}}use_workgroup_id_xyz: +; GCN: ; use s6 +; GCN: ; use s7 +; GCN: ; use s8 +define void @use_workgroup_id_xyz() #1 { + %val0 = call i32 @llvm.amdgcn.workgroup.id.x() + %val1 = call i32 @llvm.amdgcn.workgroup.id.y() + %val2 = call i32 @llvm.amdgcn.workgroup.id.z() + call void asm sideeffect "; use $0", "s"(i32 %val0) + call void asm sideeffect "; use $0", "s"(i32 %val1) + call void asm sideeffect "; use $0", "s"(i32 %val2) + ret void +} + +; GCN-LABEL: {{^}}use_workgroup_id_xz: +; GCN: ; use s6 +; GCN: ; use s7 +define void @use_workgroup_id_xz() #1 { + %val0 = call i32 @llvm.amdgcn.workgroup.id.x() + %val1 = call i32 @llvm.amdgcn.workgroup.id.z() + call void asm sideeffect "; use $0", "s"(i32 %val0) + call void asm sideeffect "; use $0", "s"(i32 %val1) + ret void +} + +; GCN-LABEL: {{^}}use_workgroup_id_yz: +; GCN: ; use s6 +; GCN: ; use s7 +define void @use_workgroup_id_yz() #1 { + %val0 = call i32 @llvm.amdgcn.workgroup.id.y() + %val1 = call i32 @llvm.amdgcn.workgroup.id.z() + call void asm sideeffect "; use $0", "s"(i32 %val0) + call void asm sideeffect "; use $0", "s"(i32 %val1) + ret void +} + +; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_x: +; GCN: enable_sgpr_workgroup_id_x = 1 +; GCN: enable_sgpr_workgroup_id_y = 0 +; GCN: enable_sgpr_workgroup_id_z = 0 + +; GCN-NOT: s6 +; GCN: s_mov_b32 s33, s7 +; GCN-NOT: s6 +; GCN: s_mov_b32 s4, s33 +; GCN-NOT: s6 +; GCN: s_mov_b32 s32, s33 +; GCN: s_swappc_b64 +define amdgpu_kernel void @kern_indirect_use_workgroup_id_x() #1 { + call void @use_workgroup_id_x() + ret void +} + +; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_y: +; GCN: 
enable_sgpr_workgroup_id_x = 1 +; GCN: enable_sgpr_workgroup_id_y = 1 +; GCN: enable_sgpr_workgroup_id_z = 0 + +; GCN: s_mov_b32 s33, s8 +; GCN: s_mov_b32 s4, s33 +; GCN: s_mov_b32 s6, s7 +; GCN: s_mov_b32 s32, s33 +; GCN: s_swappc_b64 +define amdgpu_kernel void @kern_indirect_use_workgroup_id_y() #1 { + call void @use_workgroup_id_y() + ret void +} + +; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_z: +; GCN: enable_sgpr_workgroup_id_x = 1 +; GCN: enable_sgpr_workgroup_id_y = 0 +; GCN: enable_sgpr_workgroup_id_z = 1 + +; GCN: s_mov_b32 s33, s8 +; GCN: s_mov_b32 s4, s33 +; GCN: s_mov_b32 s6, s7 +; GCN: s_swappc_b64 +define amdgpu_kernel void @kern_indirect_use_workgroup_id_z() #1 { + call void @use_workgroup_id_z() + ret void +} + +; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_xy: +; GCN: enable_sgpr_workgroup_id_x = 1 +; GCN: enable_sgpr_workgroup_id_y = 1 +; GCN: enable_sgpr_workgroup_id_z = 0 + +; GCN: s_mov_b32 s33, s8 +; GCN-NOT: s6 +; GCN-NOT: s7 +; GCN: s_mov_b32 s4, s33 +; GCN-NOT: s6 +; GCN-NOT: s7 +; GCN: s_mov_b32 s32, s33 +; GCN-NOT: s6 +; GCN-NOT: s7 +; GCN: s_swappc_b64 +define amdgpu_kernel void @kern_indirect_use_workgroup_id_xy() #1 { + call void @use_workgroup_id_xy() + ret void +} + +; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_xyz: +; GCN: enable_sgpr_workgroup_id_x = 1 +; GCN: enable_sgpr_workgroup_id_y = 1 +; GCN: enable_sgpr_workgroup_id_z = 1 + +; GCN: s_mov_b32 s33, s9 + +; GCN-NOT: s6 +; GCN-NOT: s7 +; GCN-NOT: s8 + +; GCN: s_mov_b32 s4, s33 + +; GCN-NOT: s6 +; GCN-NOT: s7 +; GCN-NOT: s8 + +; GCN: s_mov_b32 s32, s33 + +; GCN-NOT: s6 +; GCN-NOT: s7 +; GCN-NOT: s8 + +; GCN: s_swappc_b64 +define amdgpu_kernel void @kern_indirect_use_workgroup_id_xyz() #1 { + call void @use_workgroup_id_xyz() + ret void +} + +; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_xz: +; GCN: enable_sgpr_workgroup_id_x = 1 +; GCN: enable_sgpr_workgroup_id_y = 0 +; GCN: enable_sgpr_workgroup_id_z = 1 + +; GCN: s_mov_b32 s33, s8 +; GCN-NOT: s6 +; GCN-NOT: s7 + +; GCN: s_mov_b32 s4, s33 +; GCN-NOT: s6 +; GCN-NOT: s7 + +; GCN: s_mov_b32 s32, s33 +; GCN-NOT: s6 +; GCN-NOT: s7 + +; GCN: s_swappc_b64 +define amdgpu_kernel void @kern_indirect_use_workgroup_id_xz() #1 { + call void @use_workgroup_id_xz() + ret void +} + +; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_yz: +; GCN: enable_sgpr_workgroup_id_x = 1 +; GCN: enable_sgpr_workgroup_id_y = 1 +; GCN: enable_sgpr_workgroup_id_z = 1 + +; GCN: s_mov_b32 s33, s9 +; GCN: s_mov_b32 s6, s7 +; GCN: s_mov_b32 s4, s33 +; GCN: s_mov_b32 s7, s8 +; GCN: s_mov_b32 s32, s33 +; GCN: s_swappc_b64 +define amdgpu_kernel void @kern_indirect_use_workgroup_id_yz() #1 { + call void @use_workgroup_id_yz() + ret void +} + +; Argument is in right place already +; GCN-LABEL: {{^}}func_indirect_use_workgroup_id_x: +; GCN-NOT: s6 +define void @func_indirect_use_workgroup_id_x() #1 { + call void @use_workgroup_id_x() + ret void +} + +; GCN-LABEL: {{^}}func_indirect_use_workgroup_id_y: +; GCN-NOT: s6 +define void @func_indirect_use_workgroup_id_y() #1 { + call void @use_workgroup_id_y() + ret void +} + +; GCN-LABEL: {{^}}func_indirect_use_workgroup_id_z: +; GCN-NOT: s6 +define void @func_indirect_use_workgroup_id_z() #1 { + call void @use_workgroup_id_z() + ret void +} + +; GCN-LABEL: {{^}}other_arg_use_workgroup_id_x: +; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 +; GCN: ; use s6 +define void @other_arg_use_workgroup_id_x(i32 %arg0) #1 { + %val = call i32 @llvm.amdgcn.workgroup.id.x() + store volatile i32 %arg0, i32 addrspace(1)* undef + call void 
asm sideeffect "; use $0", "s"(i32 %val) + ret void +} + +; GCN-LABEL: {{^}}other_arg_use_workgroup_id_y: +; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 +; GCN: ; use s6 +define void @other_arg_use_workgroup_id_y(i32 %arg0) #1 { + %val = call i32 @llvm.amdgcn.workgroup.id.y() + store volatile i32 %arg0, i32 addrspace(1)* undef + call void asm sideeffect "; use $0", "s"(i32 %val) + ret void +} + +; GCN-LABEL: {{^}}other_arg_use_workgroup_id_z: +; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 +; GCN: ; use s6 +define void @other_arg_use_workgroup_id_z(i32 %arg0) #1 { + %val = call i32 @llvm.amdgcn.workgroup.id.z() + store volatile i32 %arg0, i32 addrspace(1)* undef + call void asm sideeffect "; use $0", "s"(i32 %val) + ret void +} + +; GCN-LABEL: {{^}}kern_indirect_other_arg_use_workgroup_id_x: +; GCN: enable_sgpr_workgroup_id_x = 1 +; GCN: enable_sgpr_workgroup_id_y = 0 +; GCN: enable_sgpr_workgroup_id_z = 0 + +; GCN-DAG: s_mov_b32 s33, s7 +; GCN-DAG: v_mov_b32_e32 v0, 0x22b + +; GCN-NOT: s6 +; GCN: s_mov_b32 s4, s33 +; GCN-NOT: s6 +; GCN-DAG: s_mov_b32 s32, s33 +; GCN: s_swappc_b64 +define amdgpu_kernel void @kern_indirect_other_arg_use_workgroup_id_x() #1 { + call void @other_arg_use_workgroup_id_x(i32 555) + ret void +} + +; GCN-LABEL: {{^}}kern_indirect_other_arg_use_workgroup_id_y: +; GCN: enable_sgpr_workgroup_id_x = 1 +; GCN: enable_sgpr_workgroup_id_y = 1 +; GCN: enable_sgpr_workgroup_id_z = 0 + +; GCN-DAG: s_mov_b32 s33, s8 +; GCN-DAG: v_mov_b32_e32 v0, 0x22b +; GCN: s_mov_b32 s4, s33 +; GCN-DAG: s_mov_b32 s6, s7 +; GCN-DAG: s_mov_b32 s32, s33 +; GCN: s_swappc_b64 +define amdgpu_kernel void @kern_indirect_other_arg_use_workgroup_id_y() #1 { + call void @other_arg_use_workgroup_id_y(i32 555) + ret void +} + +; GCN-LABEL: {{^}}kern_indirect_other_arg_use_workgroup_id_z: +; GCN: enable_sgpr_workgroup_id_x = 1 +; GCN: enable_sgpr_workgroup_id_y = 0 +; GCN: enable_sgpr_workgroup_id_z = 1 + +; GCN: s_mov_b32 s33, s8 +; GCN-DAG: v_mov_b32_e32 v0, 0x22b +; GCN: s_mov_b32 s4, s33 +; GCN-DAG: s_mov_b32 s6, s7 + +; GCN: s_mov_b32 s32, s33 +; GCN: s_swappc_b64 +define amdgpu_kernel void @kern_indirect_other_arg_use_workgroup_id_z() #1 { + call void @other_arg_use_workgroup_id_z(i32 555) + ret void +} + +; GCN-LABEL: {{^}}use_every_sgpr_input: +; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s5 offset:4 +; GCN: s_load_dword s{{[0-9]+}}, s[6:7], 0x0 +; GCN: s_load_dword s{{[0-9]+}}, s[8:9], 0x0 +; GCN: s_load_dword s{{[0-9]+}}, s[10:11], 0x0 +; GCN: ; use s[12:13] +; GCN: ; use s14 +; GCN: ; use s15 +; GCN: ; use s16 +define void @use_every_sgpr_input() #1 { + %alloca = alloca i32, align 4 + store volatile i32 0, i32* %alloca + + %dispatch_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0 + %dispatch_ptr.bc = bitcast i8 addrspace(2)* %dispatch_ptr to i32 addrspace(2)* + %val0 = load volatile i32, i32 addrspace(2)* %dispatch_ptr.bc + + %queue_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.queue.ptr() #0 + %queue_ptr.bc = bitcast i8 addrspace(2)* %queue_ptr to i32 addrspace(2)* + %val1 = load volatile i32, i32 addrspace(2)* %queue_ptr.bc + + %kernarg_segment_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() #0 + %kernarg_segment_ptr.bc = bitcast i8 addrspace(2)* %kernarg_segment_ptr to i32 addrspace(2)* + %val2 = load volatile i32, i32 addrspace(2)* %kernarg_segment_ptr.bc + + %val3 = call i64 @llvm.amdgcn.dispatch.id() + call void asm sideeffect "; use $0", "s"(i64 %val3) + + %val4 = call i32 @llvm.amdgcn.workgroup.id.x() + 
call void asm sideeffect "; use $0", "s"(i32 %val4) + + %val5 = call i32 @llvm.amdgcn.workgroup.id.y() + call void asm sideeffect "; use $0", "s"(i32 %val5) + + %val6 = call i32 @llvm.amdgcn.workgroup.id.z() + call void asm sideeffect "; use $0", "s"(i32 %val6) + + ret void +} + +; GCN-LABEL: {{^}}kern_indirect_use_every_sgpr_input: +; GCN: enable_sgpr_workgroup_id_x = 1 +; GCN: enable_sgpr_workgroup_id_y = 1 +; GCN: enable_sgpr_workgroup_id_z = 1 +; GCN: enable_sgpr_workgroup_info = 0 + +; GCN: enable_sgpr_private_segment_buffer = 1 +; GCN: enable_sgpr_dispatch_ptr = 1 +; GCN: enable_sgpr_queue_ptr = 1 +; GCN: enable_sgpr_kernarg_segment_ptr = 1 +; GCN: enable_sgpr_dispatch_id = 1 +; GCN: enable_sgpr_flat_scratch_init = 1 + +; GCN: s_mov_b32 s33, s17 +; GCN: s_mov_b64 s[12:13], s[10:11] +; GCN: s_mov_b64 s[10:11], s[8:9] +; GCN: s_mov_b64 s[8:9], s[6:7] +; GCN: s_mov_b64 s[6:7], s[4:5] +; GCN: s_mov_b32 s4, s33 +; GCN: s_mov_b32 s32, s33 +; GCN: s_swappc_b64 +define amdgpu_kernel void @kern_indirect_use_every_sgpr_input() #1 { + call void @use_every_sgpr_input() + ret void +} + +; GCN-LABEL: {{^}}func_indirect_use_every_sgpr_input: +; GCN-NOT: s6 +; GCN-NOT: s7 +; GCN-NOT: s8 +; GCN-NOT: s9 +; GCN-NOT: s10 +; GCN-NOT: s11 +; GCN-NOT: s12 +; GCN-NOT: s13 +; GCN-NOT: s[6:7] +; GCN-NOT: s[8:9] +; GCN-NOT: s[10:11] +; GCN-NOT: s[12:13] +define void @func_indirect_use_every_sgpr_input() #1 { + call void @use_every_sgpr_input() + ret void +} + +; GCN-LABEL: {{^}}func_use_every_sgpr_input_call_use_workgroup_id_xyz: +; GCN-DAG: s_mov_b32 s6, s14 +; GCN-DAG: s_mov_b32 s7, s15 +; GCN-DAG: s_mov_b32 s8, s16 +; GCN: s_swappc_b64 +define void @func_use_every_sgpr_input_call_use_workgroup_id_xyz() #1 { + %alloca = alloca i32, align 4 + store volatile i32 0, i32* %alloca + + %dispatch_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0 + %dispatch_ptr.bc = bitcast i8 addrspace(2)* %dispatch_ptr to i32 addrspace(2)* + %val0 = load volatile i32, i32 addrspace(2)* %dispatch_ptr.bc + + %queue_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.queue.ptr() #0 + %queue_ptr.bc = bitcast i8 addrspace(2)* %queue_ptr to i32 addrspace(2)* + %val1 = load volatile i32, i32 addrspace(2)* %queue_ptr.bc + + %kernarg_segment_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() #0 + %kernarg_segment_ptr.bc = bitcast i8 addrspace(2)* %kernarg_segment_ptr to i32 addrspace(2)* + %val2 = load volatile i32, i32 addrspace(2)* %kernarg_segment_ptr.bc + + %val3 = call i64 @llvm.amdgcn.dispatch.id() + call void asm sideeffect "; use $0", "s"(i64 %val3) + + %val4 = call i32 @llvm.amdgcn.workgroup.id.x() + call void asm sideeffect "; use $0", "s"(i32 %val4) + + %val5 = call i32 @llvm.amdgcn.workgroup.id.y() + call void asm sideeffect "; use $0", "s"(i32 %val5) + + %val6 = call i32 @llvm.amdgcn.workgroup.id.z() + call void asm sideeffect "; use $0", "s"(i32 %val6) + + call void @use_workgroup_id_xyz() + ret void +} + +; GCN-LABEL: {{^}}func_use_every_sgpr_input_call_use_workgroup_id_xyz_spill: +; GCN: s_mov_b32 s5, s32 +; GCN: s_add_u32 s32, s32, 0x300 + +; GCN-DAG: s_mov_b32 [[SAVE_X:s[0-9]+]], s14 +; GCN-DAG: s_mov_b32 [[SAVE_Y:s[0-9]+]], s15 +; GCN-DAG: s_mov_b32 [[SAVE_Z:s[0-9]+]], s16 +; GCN-DAG: s_mov_b64 {{s\[[0-9]+:[0-9]+\]}}, s[6:7] +; GCN-DAG: s_mov_b64 {{s\[[0-9]+:[0-9]+\]}}, s[8:9] +; GCN-DAG: s_mov_b64 {{s\[[0-9]+:[0-9]+\]}}, s[10:11] + +; GCN-DAG: s_mov_b32 s6, [[SAVE_X]] +; GCN-DAG: s_mov_b32 s7, [[SAVE_Y]] +; GCN-DAG: s_mov_b32 s8, [[SAVE_Z]] +; GCN: s_swappc_b64 + +; GCN: 
buffer_store_dword v{{[0-9]+}}, off, s[0:3], s5 offset:4 +; GCN: s_load_dword s{{[0-9]+}}, +; GCN: s_load_dword s{{[0-9]+}}, +; GCN: s_load_dword s{{[0-9]+}}, +; GCN: ; use +; GCN: ; use [[SAVE_X]] +; GCN: ; use [[SAVE_Y]] +; GCN: ; use [[SAVE_Z]] +define void @func_use_every_sgpr_input_call_use_workgroup_id_xyz_spill() #1 { + %alloca = alloca i32, align 4 + call void @use_workgroup_id_xyz() + + store volatile i32 0, i32* %alloca + + %dispatch_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0 + %dispatch_ptr.bc = bitcast i8 addrspace(2)* %dispatch_ptr to i32 addrspace(2)* + %val0 = load volatile i32, i32 addrspace(2)* %dispatch_ptr.bc + + %queue_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.queue.ptr() #0 + %queue_ptr.bc = bitcast i8 addrspace(2)* %queue_ptr to i32 addrspace(2)* + %val1 = load volatile i32, i32 addrspace(2)* %queue_ptr.bc + + %kernarg_segment_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() #0 + %kernarg_segment_ptr.bc = bitcast i8 addrspace(2)* %kernarg_segment_ptr to i32 addrspace(2)* + %val2 = load volatile i32, i32 addrspace(2)* %kernarg_segment_ptr.bc + + %val3 = call i64 @llvm.amdgcn.dispatch.id() + call void asm sideeffect "; use $0", "s"(i64 %val3) + + %val4 = call i32 @llvm.amdgcn.workgroup.id.x() + call void asm sideeffect "; use $0", "s"(i32 %val4) + + %val5 = call i32 @llvm.amdgcn.workgroup.id.y() + call void asm sideeffect "; use $0", "s"(i32 %val5) + + %val6 = call i32 @llvm.amdgcn.workgroup.id.z() + call void asm sideeffect "; use $0", "s"(i32 %val6) + + ret void +} + +declare i32 @llvm.amdgcn.workgroup.id.x() #0 +declare i32 @llvm.amdgcn.workgroup.id.y() #0 +declare i32 @llvm.amdgcn.workgroup.id.z() #0 +declare noalias i8 addrspace(2)* @llvm.amdgcn.queue.ptr() #0 +declare noalias i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() #0 +declare i64 @llvm.amdgcn.dispatch.id() #0 +declare noalias i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0 + +attributes #0 = { nounwind readnone speculatable } +attributes #1 = { nounwind noinline } diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll new file mode 100644 index 000000000000..fa8a2d2f1bb3 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll @@ -0,0 +1,671 @@ +; RUN: llc -amdgpu-function-calls -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s + +; GCN-LABEL: {{^}}use_workitem_id_x: +; GCN: s_waitcnt +; GCN-NEXT: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, v0 +; GCN-NEXT: s_waitcnt +; GCN-NEXT: s_setpc_b64 +define void @use_workitem_id_x() #1 { + %val = call i32 @llvm.amdgcn.workitem.id.x() + store volatile i32 %val, i32 addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}use_workitem_id_y: +; GCN: s_waitcnt +; GCN-NEXT: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, v0 +; GCN-NEXT: s_waitcnt +; GCN-NEXT: s_setpc_b64 +define void @use_workitem_id_y() #1 { + %val = call i32 @llvm.amdgcn.workitem.id.y() + store volatile i32 %val, i32 addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}use_workitem_id_z: +; GCN: s_waitcnt +; GCN-NEXT: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, v0 +; GCN-NEXT: s_waitcnt +; GCN-NEXT: s_setpc_b64 +define void @use_workitem_id_z() #1 { + %val = call i32 @llvm.amdgcn.workitem.id.z() + store volatile i32 %val, i32 addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}use_workitem_id_xy: +; GCN: s_waitcnt +; GCN-NEXT: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, v0 +; GCN-NEXT: flat_store_dword 
v{{\[[0-9]:[0-9]+\]}}, v1 +; GCN-NEXT: s_waitcnt +; GCN-NEXT: s_setpc_b64 +define void @use_workitem_id_xy() #1 { + %val0 = call i32 @llvm.amdgcn.workitem.id.x() + %val1 = call i32 @llvm.amdgcn.workitem.id.y() + store volatile i32 %val0, i32 addrspace(1)* undef + store volatile i32 %val1, i32 addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}use_workitem_id_xyz: +; GCN: s_waitcnt +; GCN-NEXT: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, v0 +; GCN-NEXT: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, v1 +; GCN-NEXT: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, v2 +; GCN-NEXT: s_waitcnt +; GCN-NEXT: s_setpc_b64 +define void @use_workitem_id_xyz() #1 { + %val0 = call i32 @llvm.amdgcn.workitem.id.x() + %val1 = call i32 @llvm.amdgcn.workitem.id.y() + %val2 = call i32 @llvm.amdgcn.workitem.id.z() + store volatile i32 %val0, i32 addrspace(1)* undef + store volatile i32 %val1, i32 addrspace(1)* undef + store volatile i32 %val2, i32 addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}use_workitem_id_xz: +; GCN: s_waitcnt +; GCN-NEXT: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, v0 +; GCN-NEXT: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, v1 +; GCN-NEXT: s_waitcnt +; GCN-NEXT: s_setpc_b64 +define void @use_workitem_id_xz() #1 { + %val0 = call i32 @llvm.amdgcn.workitem.id.x() + %val1 = call i32 @llvm.amdgcn.workitem.id.z() + store volatile i32 %val0, i32 addrspace(1)* undef + store volatile i32 %val1, i32 addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}use_workitem_id_yz: +; GCN: s_waitcnt +; GCN-NEXT: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, v0 +; GCN-NEXT: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, v1 +; GCN-NEXT: s_waitcnt +; GCN-NEXT: s_setpc_b64 +define void @use_workitem_id_yz() #1 { + %val0 = call i32 @llvm.amdgcn.workitem.id.y() + %val1 = call i32 @llvm.amdgcn.workitem.id.z() + store volatile i32 %val0, i32 addrspace(1)* undef + store volatile i32 %val1, i32 addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_x: +; GCN: enable_vgpr_workitem_id = 0 + +; GCN-NOT: v0 +; GCN: s_swappc_b64 +; GCN-NOT: v0 +define amdgpu_kernel void @kern_indirect_use_workitem_id_x() #1 { + call void @use_workitem_id_x() + ret void +} + +; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_y: +; GCN: enable_vgpr_workitem_id = 1 + +; GCN-NOT: v0 +; GCN-NOT: v1 +; GCN: v_mov_b32_e32 v0, v1 +; GCN-NOT: v0 +; GCN-NOT: v1 +; GCN: s_swappc_b64 +define amdgpu_kernel void @kern_indirect_use_workitem_id_y() #1 { + call void @use_workitem_id_y() + ret void +} + +; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_z: +; GCN: enable_vgpr_workitem_id = 2 + +; GCN-NOT: v0 +; GCN-NOT: v2 +; GCN: v_mov_b32_e32 v0, v2 +; GCN-NOT: v0 +; GCN-NOT: v2 +; GCN: s_swappc_b64 +define amdgpu_kernel void @kern_indirect_use_workitem_id_z() #1 { + call void @use_workitem_id_z() + ret void +} + +; GCN-LABEL: {{^}}func_indirect_use_workitem_id_x: +; GCN-NOT: v0 +; GCN: s_swappc_b64 +; GCN-NOT: v0 +define void @func_indirect_use_workitem_id_x() #1 { + call void @use_workitem_id_x() + ret void +} + +; GCN-LABEL: {{^}}func_indirect_use_workitem_id_y: +; GCN-NOT: v0 +; GCN: s_swappc_b64 +; GCN-NOT: v0 +define void @func_indirect_use_workitem_id_y() #1 { + call void @use_workitem_id_y() + ret void +} + +; GCN-LABEL: {{^}}func_indirect_use_workitem_id_z: +; GCN-NOT: v0 +; GCN: s_swappc_b64 +; GCN-NOT: v0 +define void @func_indirect_use_workitem_id_z() #1 { + call void @use_workitem_id_z() + ret void +} + +; GCN-LABEL: {{^}}other_arg_use_workitem_id_x: +; GCN: s_waitcnt +; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 +; GCN-NEXT: 
flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1 +define void @other_arg_use_workitem_id_x(i32 %arg0) #1 { + %val = call i32 @llvm.amdgcn.workitem.id.x() + store volatile i32 %arg0, i32 addrspace(1)* undef + store volatile i32 %val, i32 addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}other_arg_use_workitem_id_y: +; GCN: s_waitcnt +; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 +; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1 +define void @other_arg_use_workitem_id_y(i32 %arg0) #1 { + %val = call i32 @llvm.amdgcn.workitem.id.y() + store volatile i32 %arg0, i32 addrspace(1)* undef + store volatile i32 %val, i32 addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}other_arg_use_workitem_id_z: +; GCN: s_waitcnt +; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 +; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1 +define void @other_arg_use_workitem_id_z(i32 %arg0) #1 { + %val = call i32 @llvm.amdgcn.workitem.id.z() + store volatile i32 %arg0, i32 addrspace(1)* undef + store volatile i32 %val, i32 addrspace(1)* undef + ret void +} + + +; GCN-LABEL: {{^}}kern_indirect_other_arg_use_workitem_id_x: +; GCN: enable_vgpr_workitem_id = 0 + +; GCN: v_mov_b32_e32 v1, v0 +; GCN: v_mov_b32_e32 v0, 0x22b +; GCN: s_swappc_b64 +define amdgpu_kernel void @kern_indirect_other_arg_use_workitem_id_x() #1 { + call void @other_arg_use_workitem_id_x(i32 555) + ret void +} + + +; GCN-LABEL: {{^}}kern_indirect_other_arg_use_workitem_id_y: +; GCN: enable_vgpr_workitem_id = 1 + +; GCN-NOT: v1 +; GCN: v_mov_b32_e32 v0, 0x22b +; GCN-NOT: v1 +; GCN: s_swappc_b64 +; GCN-NOT: v0 +define amdgpu_kernel void @kern_indirect_other_arg_use_workitem_id_y() #1 { + call void @other_arg_use_workitem_id_y(i32 555) + ret void +} + +; GCN-LABEL: {{^}}kern_indirect_other_arg_use_workitem_id_z: +; GCN: enable_vgpr_workitem_id = 2 + +; GCN: v_mov_b32_e32 v0, 0x22b +; GCN: v_mov_b32_e32 v1, v2 +; GCN: s_swappc_b64 +; GCN-NOT: v0 +define amdgpu_kernel void @kern_indirect_other_arg_use_workitem_id_z() #1 { + call void @other_arg_use_workitem_id_z(i32 555) + ret void +} + +; GCN-LABEL: {{^}}too_many_args_use_workitem_id_x: +; GCN: s_mov_b32 s5, s32 +; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:8 ; 4-byte Folded Spill +; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:4{{$}} +; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v32 + +; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt +; GCN-NEXT: s_setpc_b64 +define void @too_many_args_use_workitem_id_x( + i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7, + i32 %arg8, i32 %arg9, i32 %arg10, i32 %arg11, i32 %arg12, i32 %arg13, i32 %arg14, i32 %arg15, + i32 %arg16, i32 %arg17, i32 %arg18, i32 %arg19, i32 %arg20, i32 %arg21, i32 %arg22, i32 %arg23, + i32 %arg24, i32 %arg25, i32 %arg26, i32 %arg27, i32 %arg28, i32 %arg29, i32 %arg30, i32 %arg31) #1 { + %val = call i32 @llvm.amdgcn.workitem.id.x() + store volatile i32 %val, i32 addrspace(1)* undef + + store volatile i32 %arg0, i32 addrspace(1)* undef + store volatile i32 %arg1, i32 addrspace(1)* undef + store volatile i32 %arg2, i32 addrspace(1)* undef + store volatile i32 %arg3, i32 addrspace(1)* undef + store volatile i32 %arg4, i32 addrspace(1)* undef + store volatile i32 %arg5, i32 addrspace(1)* undef + store volatile i32 %arg6, i32 addrspace(1)* undef + store volatile i32 %arg7, i32 addrspace(1)* undef + + store volatile i32 %arg8, i32 addrspace(1)* undef + store volatile i32 %arg9, i32 addrspace(1)* undef + store 
volatile i32 %arg10, i32 addrspace(1)* undef + store volatile i32 %arg11, i32 addrspace(1)* undef + store volatile i32 %arg12, i32 addrspace(1)* undef + store volatile i32 %arg13, i32 addrspace(1)* undef + store volatile i32 %arg14, i32 addrspace(1)* undef + store volatile i32 %arg15, i32 addrspace(1)* undef + + store volatile i32 %arg16, i32 addrspace(1)* undef + store volatile i32 %arg17, i32 addrspace(1)* undef + store volatile i32 %arg18, i32 addrspace(1)* undef + store volatile i32 %arg19, i32 addrspace(1)* undef + store volatile i32 %arg20, i32 addrspace(1)* undef + store volatile i32 %arg21, i32 addrspace(1)* undef + store volatile i32 %arg22, i32 addrspace(1)* undef + store volatile i32 %arg23, i32 addrspace(1)* undef + + store volatile i32 %arg24, i32 addrspace(1)* undef + store volatile i32 %arg25, i32 addrspace(1)* undef + store volatile i32 %arg26, i32 addrspace(1)* undef + store volatile i32 %arg27, i32 addrspace(1)* undef + store volatile i32 %arg28, i32 addrspace(1)* undef + store volatile i32 %arg29, i32 addrspace(1)* undef + store volatile i32 %arg30, i32 addrspace(1)* undef + store volatile i32 %arg31, i32 addrspace(1)* undef + + ret void +} + +; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_x: +; GCN: enable_vgpr_workitem_id = 0 + +; GCN: s_mov_b32 s33, s7 +; GCN: s_mov_b32 s32, s33 +; GCN: buffer_store_dword v0, off, s[0:3], s32 offset:8 +; GCN: s_mov_b32 s4, s33 +; GCN: s_swappc_b64 +define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x() #1 { + call void @too_many_args_use_workitem_id_x( + i32 10, i32 20, i32 30, i32 40, + i32 50, i32 60, i32 70, i32 80, + i32 90, i32 100, i32 110, i32 120, + i32 130, i32 140, i32 150, i32 160, + i32 170, i32 180, i32 190, i32 200, + i32 210, i32 220, i32 230, i32 240, + i32 250, i32 260, i32 270, i32 280, + i32 290, i32 300, i32 310, i32 320) + ret void +} + +; GCN-LABEL: {{^}}func_call_too_many_args_use_workitem_id_x: +; GCN: s_mov_b32 s5, s32 +; GCN: buffer_store_dword v1, off, s[0:3], s32 offset:8 +; GCN: s_swappc_b64 +define void @func_call_too_many_args_use_workitem_id_x(i32 %arg0) #1 { + store volatile i32 %arg0, i32 addrspace(1)* undef + call void @too_many_args_use_workitem_id_x( + i32 10, i32 20, i32 30, i32 40, + i32 50, i32 60, i32 70, i32 80, + i32 90, i32 100, i32 110, i32 120, + i32 130, i32 140, i32 150, i32 160, + i32 170, i32 180, i32 190, i32 200, + i32 210, i32 220, i32 230, i32 240, + i32 250, i32 260, i32 270, i32 280, + i32 290, i32 300, i32 310, i32 320) + ret void +} + +; Requires loading and storing to stack slot. 
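+; All 32 argument VGPRs (v0-v31) are consumed by the i32 arguments, so the
+; workitem ID X arrives in this function's incoming stack argument area; the
+; checks below expect it to be reloaded from the incoming area and stored to
+; the outgoing argument area before the nested call.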
+; GCN-LABEL: {{^}}too_many_args_call_too_many_args_use_workitem_id_x: +; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:8 ; 4-byte Folded Spill +; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:4 +; GCN: s_add_u32 s32, s32, 0x400{{$}} + +; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:8{{$}} + +; GCN: s_swappc_b64 + +; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:8 ; 4-byte Folded Reload +; GCN: s_sub_u32 s32, s32, 0x400{{$}} +; GCN: s_setpc_b64 +define void @too_many_args_call_too_many_args_use_workitem_id_x( + i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7, + i32 %arg8, i32 %arg9, i32 %arg10, i32 %arg11, i32 %arg12, i32 %arg13, i32 %arg14, i32 %arg15, + i32 %arg16, i32 %arg17, i32 %arg18, i32 %arg19, i32 %arg20, i32 %arg21, i32 %arg22, i32 %arg23, + i32 %arg24, i32 %arg25, i32 %arg26, i32 %arg27, i32 %arg28, i32 %arg29, i32 %arg30, i32 %arg31) #1 { + call void @too_many_args_use_workitem_id_x( + i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7, + i32 %arg8, i32 %arg9, i32 %arg10, i32 %arg11, i32 %arg12, i32 %arg13, i32 %arg14, i32 %arg15, + i32 %arg16, i32 %arg17, i32 %arg18, i32 %arg19, i32 %arg20, i32 %arg21, i32 %arg22, i32 %arg23, + i32 %arg24, i32 %arg25, i32 %arg26, i32 %arg27, i32 %arg28, i32 %arg29, i32 %arg30, i32 %arg31) + ret void +} + +; stack layout: +; frame[0] = emergency stack slot +; frame[1] = byval arg32 +; frame[2] = stack passed workitem ID x +; frame[3] = VGPR spill slot + +; GCN-LABEL: {{^}}too_many_args_use_workitem_id_x_byval: +; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:12 ; 4-byte Folded Spill +; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:8 +; GCN-NEXT: s_waitcnt +; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v32 +; GCN: buffer_load_dword v0, off, s[0:3], s5 offset:4 +; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:12 ; 4-byte Folded Reload +; GCN: s_setpc_b64 +define void @too_many_args_use_workitem_id_x_byval( + i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7, + i32 %arg8, i32 %arg9, i32 %arg10, i32 %arg11, i32 %arg12, i32 %arg13, i32 %arg14, i32 %arg15, + i32 %arg16, i32 %arg17, i32 %arg18, i32 %arg19, i32 %arg20, i32 %arg21, i32 %arg22, i32 %arg23, + i32 %arg24, i32 %arg25, i32 %arg26, i32 %arg27, i32 %arg28, i32 %arg29, i32 %arg30, i32 %arg31, i32* byval %arg32) #1 { + %val = call i32 @llvm.amdgcn.workitem.id.x() + store volatile i32 %val, i32 addrspace(1)* undef + + store volatile i32 %arg0, i32 addrspace(1)* undef + store volatile i32 %arg1, i32 addrspace(1)* undef + store volatile i32 %arg2, i32 addrspace(1)* undef + store volatile i32 %arg3, i32 addrspace(1)* undef + store volatile i32 %arg4, i32 addrspace(1)* undef + store volatile i32 %arg5, i32 addrspace(1)* undef + store volatile i32 %arg6, i32 addrspace(1)* undef + store volatile i32 %arg7, i32 addrspace(1)* undef + + store volatile i32 %arg8, i32 addrspace(1)* undef + store volatile i32 %arg9, i32 addrspace(1)* undef + store volatile i32 %arg10, i32 addrspace(1)* undef + store volatile i32 %arg11, i32 addrspace(1)* undef + store volatile i32 %arg12, i32 addrspace(1)* undef + store volatile i32 %arg13, i32 addrspace(1)* undef + store volatile i32 %arg14, i32 addrspace(1)* undef + store volatile i32 %arg15, i32 addrspace(1)* undef + + store volatile i32 %arg16, i32 addrspace(1)* undef + store volatile i32 %arg17, i32 addrspace(1)* undef + store volatile i32 %arg18, i32 addrspace(1)* undef + store volatile i32 %arg19, i32 
addrspace(1)* undef + store volatile i32 %arg20, i32 addrspace(1)* undef + store volatile i32 %arg21, i32 addrspace(1)* undef + store volatile i32 %arg22, i32 addrspace(1)* undef + store volatile i32 %arg23, i32 addrspace(1)* undef + + store volatile i32 %arg24, i32 addrspace(1)* undef + store volatile i32 %arg25, i32 addrspace(1)* undef + store volatile i32 %arg26, i32 addrspace(1)* undef + store volatile i32 %arg27, i32 addrspace(1)* undef + store volatile i32 %arg28, i32 addrspace(1)* undef + store volatile i32 %arg29, i32 addrspace(1)* undef + store volatile i32 %arg30, i32 addrspace(1)* undef + store volatile i32 %arg31, i32 addrspace(1)* undef + %private = load volatile i32, i32* %arg32 + ret void +} + +; frame[0] = emergency stack slot +; frame[1] = + +; sp[0] = callee emergency stack slot reservation +; sp[1] = byval +; sp[2] = ?? +; sp[3] = stack passed workitem ID x + +; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_x_byval: +; GCN: enable_vgpr_workitem_id = 0 + +; GCN: s_mov_b32 s33, s7 +; GCN: s_add_u32 s32, s33, 0x200{{$}} + +; GCN-DAG: s_add_u32 s32, s32, 0x100{{$}} +; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}} +; GCN: buffer_store_dword [[K]], off, s[0:3], s33 offset:4 +; GCN: buffer_store_dword v0, off, s[0:3], s32 offset:12 + +; GCN: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], s33 offset:4 +; GCN: buffer_store_dword [[RELOAD_BYVAL]], off, s[0:3], s32 offset:4{{$}} +; GCN: v_mov_b32_e32 [[RELOAD_BYVAL]], +; GCN: s_swappc_b64 +define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x_byval() #1 { + %alloca = alloca i32, align 4 + store volatile i32 999, i32* %alloca + call void @too_many_args_use_workitem_id_x_byval( + i32 10, i32 20, i32 30, i32 40, + i32 50, i32 60, i32 70, i32 80, + i32 90, i32 100, i32 110, i32 120, + i32 130, i32 140, i32 150, i32 160, + i32 170, i32 180, i32 190, i32 200, + i32 210, i32 220, i32 230, i32 240, + i32 250, i32 260, i32 270, i32 280, + i32 290, i32 300, i32 310, i32 320, + i32* %alloca) + ret void +} + +; GCN-LABEL: {{^}}func_call_too_many_args_use_workitem_id_x_byval: +; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}} +; GCN: buffer_store_dword [[K]], off, s[0:3], s5 offset:4 +; GCN: buffer_store_dword v0, off, s[0:3], s32 offset:12 + +; GCN: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], s5 offset:4 +; GCN: buffer_store_dword [[RELOAD_BYVAL]], off, s[0:3], s32 offset:4{{$}} +; GCN: v_mov_b32_e32 [[RELOAD_BYVAL]], +; GCN: s_swappc_b64 +define void @func_call_too_many_args_use_workitem_id_x_byval() #1 { + %alloca = alloca i32, align 4 + store volatile i32 999, i32* %alloca + call void @too_many_args_use_workitem_id_x_byval( + i32 10, i32 20, i32 30, i32 40, + i32 50, i32 60, i32 70, i32 80, + i32 90, i32 100, i32 110, i32 120, + i32 130, i32 140, i32 150, i32 160, + i32 170, i32 180, i32 190, i32 200, + i32 210, i32 220, i32 230, i32 240, + i32 250, i32 260, i32 270, i32 280, + i32 290, i32 300, i32 310, i32 320, + i32* %alloca) + ret void +} + +; GCN-LABEL: {{^}}too_many_args_use_workitem_id_xyz: +; GCN: s_mov_b32 s5, s32 +; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:16 ; 4-byte Folded Spill +; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:4{{$}} +; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v32 +; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:8{{$}} +; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v32 +; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:12{{$}} +; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v32 + +; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:16 
; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt +; GCN-NEXT: s_setpc_b64 +define void @too_many_args_use_workitem_id_xyz( + i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7, + i32 %arg8, i32 %arg9, i32 %arg10, i32 %arg11, i32 %arg12, i32 %arg13, i32 %arg14, i32 %arg15, + i32 %arg16, i32 %arg17, i32 %arg18, i32 %arg19, i32 %arg20, i32 %arg21, i32 %arg22, i32 %arg23, + i32 %arg24, i32 %arg25, i32 %arg26, i32 %arg27, i32 %arg28, i32 %arg29, i32 %arg30, i32 %arg31) #1 { + %val0 = call i32 @llvm.amdgcn.workitem.id.x() + store volatile i32 %val0, i32 addrspace(1)* undef + %val1 = call i32 @llvm.amdgcn.workitem.id.y() + store volatile i32 %val1, i32 addrspace(1)* undef + %val2 = call i32 @llvm.amdgcn.workitem.id.z() + store volatile i32 %val2, i32 addrspace(1)* undef + + store volatile i32 %arg0, i32 addrspace(1)* undef + store volatile i32 %arg1, i32 addrspace(1)* undef + store volatile i32 %arg2, i32 addrspace(1)* undef + store volatile i32 %arg3, i32 addrspace(1)* undef + store volatile i32 %arg4, i32 addrspace(1)* undef + store volatile i32 %arg5, i32 addrspace(1)* undef + store volatile i32 %arg6, i32 addrspace(1)* undef + store volatile i32 %arg7, i32 addrspace(1)* undef + + store volatile i32 %arg8, i32 addrspace(1)* undef + store volatile i32 %arg9, i32 addrspace(1)* undef + store volatile i32 %arg10, i32 addrspace(1)* undef + store volatile i32 %arg11, i32 addrspace(1)* undef + store volatile i32 %arg12, i32 addrspace(1)* undef + store volatile i32 %arg13, i32 addrspace(1)* undef + store volatile i32 %arg14, i32 addrspace(1)* undef + store volatile i32 %arg15, i32 addrspace(1)* undef + + store volatile i32 %arg16, i32 addrspace(1)* undef + store volatile i32 %arg17, i32 addrspace(1)* undef + store volatile i32 %arg18, i32 addrspace(1)* undef + store volatile i32 %arg19, i32 addrspace(1)* undef + store volatile i32 %arg20, i32 addrspace(1)* undef + store volatile i32 %arg21, i32 addrspace(1)* undef + store volatile i32 %arg22, i32 addrspace(1)* undef + store volatile i32 %arg23, i32 addrspace(1)* undef + + store volatile i32 %arg24, i32 addrspace(1)* undef + store volatile i32 %arg25, i32 addrspace(1)* undef + store volatile i32 %arg26, i32 addrspace(1)* undef + store volatile i32 %arg27, i32 addrspace(1)* undef + store volatile i32 %arg28, i32 addrspace(1)* undef + store volatile i32 %arg29, i32 addrspace(1)* undef + store volatile i32 %arg30, i32 addrspace(1)* undef + store volatile i32 %arg31, i32 addrspace(1)* undef + + ret void +} + +; frame[0] = kernel emergency stack slot +; frame[1] = callee emergency stack slot +; frame[2] = ID X +; frame[3] = ID Y +; frame[4] = ID Z + +; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_xyz: +; GCN: enable_vgpr_workitem_id = 2 + +; GCN: s_mov_b32 s33, s7 +; GCN: s_mov_b32 s32, s33 + +; GCN-DAG: buffer_store_dword v0, off, s[0:3], s32 offset:8 +; GCN-DAG: buffer_store_dword v1, off, s[0:3], s32 offset:12 +; GCN-DAG: buffer_store_dword v2, off, s[0:3], s32 offset:16 +; GCN: s_swappc_b64 +define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_xyz() #1 { + call void @too_many_args_use_workitem_id_xyz( + i32 10, i32 20, i32 30, i32 40, + i32 50, i32 60, i32 70, i32 80, + i32 90, i32 100, i32 110, i32 120, + i32 130, i32 140, i32 150, i32 160, + i32 170, i32 180, i32 190, i32 200, + i32 210, i32 220, i32 230, i32 240, + i32 250, i32 260, i32 270, i32 280, + i32 290, i32 300, i32 310, i32 320) + ret void +} + +; workitem ID X in register, yz on stack +; v31 = workitem ID X +; frame[0] = emergency 
slot +; frame[1] = workitem Y +; frame[2] = workitem Z + +; GCN-LABEL: {{^}}too_many_args_use_workitem_id_x_stack_yz: +; GCN: s_mov_b32 s5, s32 +; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v31 +; GCN: buffer_load_dword v31, off, s[0:3], s5 offset:4{{$}} +; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v31 +; GCN: buffer_load_dword v31, off, s[0:3], s5 offset:8{{$}} +; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v31 + +; GCN: s_waitcnt +; GCN-NEXT: s_setpc_b64 +; GCN: ScratchSize: 12 +define void @too_many_args_use_workitem_id_x_stack_yz( + i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7, + i32 %arg8, i32 %arg9, i32 %arg10, i32 %arg11, i32 %arg12, i32 %arg13, i32 %arg14, i32 %arg15, + i32 %arg16, i32 %arg17, i32 %arg18, i32 %arg19, i32 %arg20, i32 %arg21, i32 %arg22, i32 %arg23, + i32 %arg24, i32 %arg25, i32 %arg26, i32 %arg27, i32 %arg28, i32 %arg29, i32 %arg30) #1 { + %val0 = call i32 @llvm.amdgcn.workitem.id.x() + store volatile i32 %val0, i32 addrspace(1)* undef + %val1 = call i32 @llvm.amdgcn.workitem.id.y() + store volatile i32 %val1, i32 addrspace(1)* undef + %val2 = call i32 @llvm.amdgcn.workitem.id.z() + store volatile i32 %val2, i32 addrspace(1)* undef + + store volatile i32 %arg0, i32 addrspace(1)* undef + store volatile i32 %arg1, i32 addrspace(1)* undef + store volatile i32 %arg2, i32 addrspace(1)* undef + store volatile i32 %arg3, i32 addrspace(1)* undef + store volatile i32 %arg4, i32 addrspace(1)* undef + store volatile i32 %arg5, i32 addrspace(1)* undef + store volatile i32 %arg6, i32 addrspace(1)* undef + store volatile i32 %arg7, i32 addrspace(1)* undef + + store volatile i32 %arg8, i32 addrspace(1)* undef + store volatile i32 %arg9, i32 addrspace(1)* undef + store volatile i32 %arg10, i32 addrspace(1)* undef + store volatile i32 %arg11, i32 addrspace(1)* undef + store volatile i32 %arg12, i32 addrspace(1)* undef + store volatile i32 %arg13, i32 addrspace(1)* undef + store volatile i32 %arg14, i32 addrspace(1)* undef + store volatile i32 %arg15, i32 addrspace(1)* undef + + store volatile i32 %arg16, i32 addrspace(1)* undef + store volatile i32 %arg17, i32 addrspace(1)* undef + store volatile i32 %arg18, i32 addrspace(1)* undef + store volatile i32 %arg19, i32 addrspace(1)* undef + store volatile i32 %arg20, i32 addrspace(1)* undef + store volatile i32 %arg21, i32 addrspace(1)* undef + store volatile i32 %arg22, i32 addrspace(1)* undef + store volatile i32 %arg23, i32 addrspace(1)* undef + + store volatile i32 %arg24, i32 addrspace(1)* undef + store volatile i32 %arg25, i32 addrspace(1)* undef + store volatile i32 %arg26, i32 addrspace(1)* undef + store volatile i32 %arg27, i32 addrspace(1)* undef + store volatile i32 %arg28, i32 addrspace(1)* undef + store volatile i32 %arg29, i32 addrspace(1)* undef + store volatile i32 %arg30, i32 addrspace(1)* undef + + ret void +} + +; frame[0] = kernel emergency stack slot +; frame[1] = callee emergency stack slot +; frame[2] = ID Y +; frame[3] = ID Z + +; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_x_stack_yz: +; GCN: enable_vgpr_workitem_id = 2 + +; GCN: s_mov_b32 s33, s7 +; GCN: s_mov_b32 s32, s33 + +; GCN-DAG: v_mov_b32_e32 v31, v0 +; GCN-DAG: buffer_store_dword v1, off, s[0:3], s32 offset:8 +; GCN-DAG: buffer_store_dword v2, off, s[0:3], s32 offset:12 +; GCN: s_swappc_b64 +define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x_stack_yz() #1 { + call void @too_many_args_use_workitem_id_x_stack_yz( + i32 10, i32 20, i32 30, i32 40, + i32 50, i32 60, i32 70, i32 
80, + i32 90, i32 100, i32 110, i32 120, + i32 130, i32 140, i32 150, i32 160, + i32 170, i32 180, i32 190, i32 200, + i32 210, i32 220, i32 230, i32 240, + i32 250, i32 260, i32 270, i32 280, + i32 290, i32 300, i32 310) + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() #0 +declare i32 @llvm.amdgcn.workitem.id.y() #0 +declare i32 @llvm.amdgcn.workitem.id.z() #0 + +attributes #0 = { nounwind readnone speculatable } +attributes #1 = { nounwind noinline }