AMDGPU: Pass special input registers to functions

llvm-svn: 309998
Matt Arsenault 2017-08-03 23:00:29 +00:00
parent 52854dcd34
commit 8623e8d864
13 changed files with 1714 additions and 237 deletions

lib/Target/AMDGPU/AMDGPUCallLowering.cpp

@@ -41,7 +41,7 @@ unsigned AMDGPUCallLowering::lowerParameterPtr(MachineIRBuilder &MIRBuilder,
unsigned Offset) const {
MachineFunction &MF = MIRBuilder.getMF();
-const SIRegisterInfo *TRI = MF.getSubtarget<SISubtarget>().getRegisterInfo();
+const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
MachineRegisterInfo &MRI = MF.getRegInfo();
const Function &F = *MF.getFunction();
const DataLayout &DL = F.getParent()->getDataLayout();
@@ -49,7 +49,7 @@ unsigned AMDGPUCallLowering::lowerParameterPtr(MachineIRBuilder &MIRBuilder,
LLT PtrType = getLLTForType(*PtrTy, DL);
unsigned DstReg = MRI.createGenericVirtualRegister(PtrType);
unsigned KernArgSegmentPtr =
-TRI->getPreloadedValue(MF, SIRegisterInfo::KERNARG_SEGMENT_PTR);
+MFI->getPreloadedReg(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
unsigned KernArgSegmentVReg = MRI.getLiveInVirtReg(KernArgSegmentPtr);
unsigned OffsetReg = MRI.createGenericVirtualRegister(LLT::scalar(64));
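
Note: the new getPreloadedReg call and the ArgDescriptor type used throughout this commit come from AMDGPUArgumentUsageInfo.h, one of the 13 changed files not reproduced on this page. A minimal sketch of the type, reconstructed from the accessors this diff relies on (approximate, not the verbatim header):

// An input is either preloaded into a physical register or passed at a
// fixed stack offset; IsSet marks whether the input exists at all.
struct ArgDescriptor {
private:
  union {
    unsigned Register;
    unsigned StackOffset;
  };
  bool IsStack;
  bool IsSet;
public:
  static ArgDescriptor createRegister(unsigned Reg);
  static ArgDescriptor createStack(unsigned Offset);
  explicit operator bool() const { return IsSet; }
  bool isRegister() const { return !IsStack; }
  unsigned getRegister() const { assert(!IsStack); return Register; }
  unsigned getStackOffset() const { assert(IsStack); return StackOffset; }
};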

lib/Target/AMDGPU/AMDGPUISelLowering.cpp

@@ -3582,6 +3582,49 @@ SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
return DAG.getCopyFromReg(DAG.getEntryNode(), SL, VReg, VT);
}
SDValue AMDGPUTargetLowering::loadStackInputValue(SelectionDAG &DAG,
EVT VT,
const SDLoc &SL,
int64_t Offset) const {
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo &MFI = MF.getFrameInfo();
int FI = MFI.CreateFixedObject(VT.getStoreSize(), Offset, true);
auto SrcPtrInfo = MachinePointerInfo::getStack(MF, Offset);
SDValue Ptr = DAG.getFrameIndex(FI, MVT::i32);
return DAG.getLoad(VT, SL, DAG.getEntryNode(), Ptr, SrcPtrInfo, 4,
MachineMemOperand::MODereferenceable |
MachineMemOperand::MOInvariant);
}
SDValue AMDGPUTargetLowering::storeStackInputValue(SelectionDAG &DAG,
const SDLoc &SL,
SDValue Chain,
SDValue StackPtr,
SDValue ArgVal,
int64_t Offset) const {
MachineFunction &MF = DAG.getMachineFunction();
MachinePointerInfo DstInfo = MachinePointerInfo::getStack(MF, Offset);
SDValue PtrOffset = DAG.getConstant(Offset, SL, MVT::i32);
SDValue Ptr = DAG.getNode(ISD::ADD, SL, MVT::i32, StackPtr, PtrOffset);
SDValue Store = DAG.getStore(Chain, SL, ArgVal, Ptr, DstInfo, 4,
MachineMemOperand::MODereferenceable);
return Store;
}
SDValue AMDGPUTargetLowering::loadInputValue(SelectionDAG &DAG,
const TargetRegisterClass *RC,
EVT VT, const SDLoc &SL,
const ArgDescriptor &Arg) const {
assert(Arg && "Attempting to load missing argument");
if (Arg.isRegister())
return CreateLiveInRegister(DAG, RC, Arg.getRegister(), VT, SL);
return loadStackInputValue(DAG, VT, SL, Arg.getStackOffset());
}
uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
const AMDGPUMachineFunction *MFI, const ImplicitParameter Param) const {
unsigned Alignment = Subtarget->getAlignmentForImplicitArgPtr();
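
The loadInputValue helper added above is what lets non-entry functions treat a special input uniformly: if the ArgDescriptor names a register it emits a live-in copy, otherwise it reloads the value from its fixed stack slot. An illustrative call site, mirroring the workitem-ID lowering in SIISelLowering.cpp further down:

// Materializes workitem ID X whether it arrived in a VGPR or on the stack.
SDValue IDX = loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
                             SDLoc(DAG.getEntryNode()),
                             MFI->getArgInfo().WorkItemIDX);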

lib/Target/AMDGPU/AMDGPUISelLowering.h

@@ -24,7 +24,7 @@ namespace llvm {
class AMDGPUMachineFunction;
class AMDGPUSubtarget;
class MachineRegisterInfo;
struct ArgDescriptor;
class AMDGPUTargetLowering : public TargetLowering {
private:
@@ -237,6 +237,25 @@ public:
return CreateLiveInRegister(DAG, RC, Reg, VT, SDLoc(DAG.getEntryNode()), true);
}
/// Similar to CreateLiveInRegister, except the value may be loaded from a
/// stack slot rather than passed in a register.
SDValue loadStackInputValue(SelectionDAG &DAG,
EVT VT,
const SDLoc &SL,
int64_t Offset) const;
SDValue storeStackInputValue(SelectionDAG &DAG,
const SDLoc &SL,
SDValue Chain,
SDValue StackPtr,
SDValue ArgVal,
int64_t Offset) const;
SDValue loadInputValue(SelectionDAG &DAG,
const TargetRegisterClass *RC,
EVT VT, const SDLoc &SL,
const ArgDescriptor &Arg) const;
enum ImplicitParameter {
FIRST_IMPLICIT,
GRID_DIM = FIRST_IMPLICIT,

lib/Target/AMDGPU/SIFrameLowering.cpp

@@ -38,6 +38,7 @@ void SIFrameLowering::emitFlatScratchInit(const SISubtarget &ST,
MachineBasicBlock &MBB) const {
const SIInstrInfo *TII = ST.getInstrInfo();
const SIRegisterInfo* TRI = &TII->getRegisterInfo();
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
// We don't need this if we only have spills since there is no user facing
// scratch.
@@ -55,7 +56,7 @@ void SIFrameLowering::emitFlatScratchInit(const SISubtarget &ST,
MachineBasicBlock::iterator I = MBB.begin();
unsigned FlatScratchInitReg
-= TRI->getPreloadedValue(MF, SIRegisterInfo::FLAT_SCRATCH_INIT);
+= MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT);
MachineRegisterInfo &MRI = MF.getRegInfo();
MRI.addLiveIn(FlatScratchInitReg);
@@ -64,7 +65,6 @@ void SIFrameLowering::emitFlatScratchInit(const SISubtarget &ST,
unsigned FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0);
unsigned FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1);
-const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg();
// Do a 64-bit pointer add.
@@ -283,13 +283,13 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
}
// We need to insert initialization of the scratch resource descriptor.
-unsigned PreloadedScratchWaveOffsetReg = TRI->getPreloadedValue(
-MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
+unsigned PreloadedScratchWaveOffsetReg = MFI->getPreloadedReg(
+AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
unsigned PreloadedPrivateBufferReg = AMDGPU::NoRegister;
if (ST.isAmdCodeObjectV2(MF)) {
-PreloadedPrivateBufferReg = TRI->getPreloadedValue(
-MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER);
+PreloadedPrivateBufferReg = MFI->getPreloadedReg(
+AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
}
bool OffsetRegUsed = MRI.isPhysRegUsed(ScratchWaveOffsetReg);

lib/Target/AMDGPU/SIISelLowering.cpp

@@ -45,6 +45,7 @@
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/MachineValueType.h"
@@ -895,14 +896,19 @@ SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
uint64_t Offset) const {
const DataLayout &DL = DAG.getDataLayout();
MachineFunction &MF = DAG.getMachineFunction();
-const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
-unsigned InputPtrReg = TRI->getPreloadedValue(MF,
-SIRegisterInfo::KERNARG_SEGMENT_PTR);
+const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+const ArgDescriptor *InputPtrReg;
+const TargetRegisterClass *RC;
+std::tie(InputPtrReg, RC)
+= Info->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
MVT PtrVT = getPointerTy(DL, AMDGPUASI.CONSTANT_ADDRESS);
SDValue BasePtr = DAG.getCopyFromReg(Chain, SL,
-MRI.getLiveInVirtReg(InputPtrReg), PtrVT);
+MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
return DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr,
DAG.getConstant(Offset, SL, PtrVT));
}
@@ -1005,6 +1011,17 @@ SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA
return ArgValue;
}
SDValue SITargetLowering::getPreloadedValue(SelectionDAG &DAG,
const SIMachineFunctionInfo &MFI,
EVT VT,
AMDGPUFunctionArgInfo::PreloadedValue PVID) const {
const ArgDescriptor *Reg;
const TargetRegisterClass *RC;
std::tie(Reg, RC) = MFI.getPreloadedValue(PVID);
return CreateLiveInRegister(DAG, RC, Reg->getRegister(), VT);
}
static void processShaderInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
CallingConv::ID CallConv,
ArrayRef<ISD::InputArg> Ins,
@@ -1055,27 +1072,129 @@ static void processShaderInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
}
// Allocate special inputs passed in VGPRs.
static void allocateSpecialEntryInputVGPRs(CCState &CCInfo,
MachineFunction &MF,
const SIRegisterInfo &TRI,
SIMachineFunctionInfo &Info) {
if (Info.hasWorkItemIDX()) {
unsigned Reg = AMDGPU::VGPR0;
MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
assert(Reg == AMDGPU::VGPR0);
CCInfo.AllocateReg(Reg);
Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg));
}
if (Info.hasWorkItemIDY()) {
unsigned Reg = AMDGPU::VGPR1;
MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
assert(Reg == AMDGPU::VGPR1);
CCInfo.AllocateReg(Reg);
Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
}
if (Info.hasWorkItemIDZ()) {
unsigned Reg = AMDGPU::VGPR2;
MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
assert(Reg == AMDGPU::VGPR2);
CCInfo.AllocateReg(Reg);
Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
}
}
// Try to allocate a VGPR at the end of the argument list, or if no argument
// VGPRs are left, allocate a stack slot.
static ArgDescriptor allocateVGPR32Input(CCState &CCInfo) {
ArrayRef<MCPhysReg> ArgVGPRs
= makeArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
if (RegIdx == ArgVGPRs.size()) {
// Spill to stack required.
int64_t Offset = CCInfo.AllocateStack(4, 4);
return ArgDescriptor::createStack(Offset);
}
unsigned Reg = ArgVGPRs[RegIdx];
Reg = CCInfo.AllocateReg(Reg);
assert(Reg != AMDGPU::NoRegister);
MachineFunction &MF = CCInfo.getMachineFunction();
MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
return ArgDescriptor::createRegister(Reg);
}
static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo,
const TargetRegisterClass *RC,
unsigned NumArgRegs) {
ArrayRef<MCPhysReg> ArgSGPRs = makeArrayRef(RC->begin(), 32);
unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs);
if (RegIdx == ArgSGPRs.size())
report_fatal_error("ran out of SGPRs for arguments");
unsigned Reg = ArgSGPRs[RegIdx];
Reg = CCInfo.AllocateReg(Reg);
assert(Reg != AMDGPU::NoRegister);
MachineFunction &MF = CCInfo.getMachineFunction();
MF.addLiveIn(Reg, RC);
return ArgDescriptor::createRegister(Reg);
}
static ArgDescriptor allocateSGPR32Input(CCState &CCInfo) {
return allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
}
static ArgDescriptor allocateSGPR64Input(CCState &CCInfo) {
return allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
}
static void allocateSpecialInputVGPRs(CCState &CCInfo,
MachineFunction &MF,
const SIRegisterInfo &TRI,
SIMachineFunctionInfo &Info) {
-if (Info.hasWorkItemIDX()) {
-unsigned Reg = TRI.getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_X);
-MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
-CCInfo.AllocateReg(Reg);
-}
+if (Info.hasWorkItemIDX())
+Info.setWorkItemIDX(allocateVGPR32Input(CCInfo));
-if (Info.hasWorkItemIDY()) {
-unsigned Reg = TRI.getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Y);
-MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
-CCInfo.AllocateReg(Reg);
-}
+if (Info.hasWorkItemIDY())
+Info.setWorkItemIDY(allocateVGPR32Input(CCInfo));
-if (Info.hasWorkItemIDZ()) {
-unsigned Reg = TRI.getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Z);
-MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
-CCInfo.AllocateReg(Reg);
-}
+if (Info.hasWorkItemIDZ())
+Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo));
}
static void allocateSpecialInputSGPRs(CCState &CCInfo,
MachineFunction &MF,
const SIRegisterInfo &TRI,
SIMachineFunctionInfo &Info) {
auto &ArgInfo = Info.getArgInfo();
// TODO: Unify handling with private memory pointers.
if (Info.hasDispatchPtr())
ArgInfo.DispatchPtr = allocateSGPR64Input(CCInfo);
if (Info.hasQueuePtr())
ArgInfo.QueuePtr = allocateSGPR64Input(CCInfo);
if (Info.hasKernargSegmentPtr())
ArgInfo.KernargSegmentPtr = allocateSGPR64Input(CCInfo);
if (Info.hasDispatchID())
ArgInfo.DispatchID = allocateSGPR64Input(CCInfo);
// flat_scratch_init is not applicable for non-kernel functions.
if (Info.hasWorkGroupIDX())
ArgInfo.WorkGroupIDX = allocateSGPR32Input(CCInfo);
if (Info.hasWorkGroupIDY())
ArgInfo.WorkGroupIDY = allocateSGPR32Input(CCInfo);
if (Info.hasWorkGroupIDZ())
ArgInfo.WorkGroupIDZ = allocateSGPR32Input(CCInfo);
}
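
The ArgInfo object these allocators fill in is an AMDGPUFunctionArgInfo, also defined in the new AMDGPUArgumentUsageInfo.h (not shown here). A rough sketch inferred from the fields and enumerators this diff uses:

struct AMDGPUFunctionArgInfo {
  enum PreloadedValue {
    // SGPRs
    PRIVATE_SEGMENT_BUFFER, DISPATCH_PTR, QUEUE_PTR, KERNARG_SEGMENT_PTR,
    DISPATCH_ID, FLAT_SCRATCH_INIT, WORKGROUP_ID_X, WORKGROUP_ID_Y,
    WORKGROUP_ID_Z, PRIVATE_SEGMENT_WAVE_BYTE_OFFSET, IMPLICIT_BUFFER_PTR,
    // VGPRs
    WORKITEM_ID_X, WORKITEM_ID_Y, WORKITEM_ID_Z
  };

  // One descriptor per special input; left unset when the function does not
  // use that input.
  ArgDescriptor PrivateSegmentBuffer, DispatchPtr, QueuePtr,
      KernargSegmentPtr, DispatchID, FlatScratchInit,
      WorkGroupIDX, WorkGroupIDY, WorkGroupIDZ, WorkGroupInfo,
      PrivateSegmentWaveByteOffset, ImplicitBufferPtr,
      WorkItemIDX, WorkItemIDY, WorkItemIDZ;

  // Maps an enumerator to the matching descriptor plus the register class
  // the value is expected to occupy.
  std::pair<const ArgDescriptor *, const TargetRegisterClass *>
  getPreloadedValue(PreloadedValue Value) const;
};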
// Allocate special inputs passed in user SGPRs.
@@ -1212,8 +1331,8 @@ static void reservePrivateMemoryRegs(const TargetMachine &TM,
// resource. For the Code Object V2 ABI, this will be the first 4 user
// SGPR inputs. We can reserve those and use them directly.
-unsigned PrivateSegmentBufferReg = TRI.getPreloadedValue(
-MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER);
+unsigned PrivateSegmentBufferReg = Info.getPreloadedReg(
+AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
Info.setScratchRSrcReg(PrivateSegmentBufferReg);
if (MFI.hasCalls()) {
@@ -1229,8 +1348,8 @@
= TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
Info.setScratchWaveOffsetReg(ReservedOffsetReg);
} else {
-unsigned PrivateSegmentWaveByteOffsetReg = TRI.getPreloadedValue(
-MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
+unsigned PrivateSegmentWaveByteOffsetReg = Info.getPreloadedReg(
+AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
Info.setScratchWaveOffsetReg(PrivateSegmentWaveByteOffsetReg);
}
} else {
@@ -1256,8 +1375,8 @@
Info.setScratchRSrcReg(ReservedBufferReg);
if (HasStackObjects && !MFI.hasCalls()) {
-unsigned ScratchWaveOffsetReg = TRI.getPreloadedValue(
-MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
+unsigned ScratchWaveOffsetReg = Info.getPreloadedReg(
+AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
Info.setScratchWaveOffsetReg(ScratchWaveOffsetReg);
} else {
unsigned ReservedOffsetReg
@@ -1390,7 +1509,7 @@ SDValue SITargetLowering::LowerFormalArguments(
}
if (IsEntryFunc) {
-allocateSpecialInputVGPRs(CCInfo, MF, *TRI, *Info);
+allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
}
@@ -1509,6 +1628,11 @@ SDValue SITargetLowering::LowerFormalArguments(
InVals.push_back(Val);
}
if (!IsEntryFunc) {
// Special inputs come after user arguments.
allocateSpecialInputVGPRs(CCInfo, MF, *TRI, *Info);
}
// Start adding system SGPRs.
if (IsEntryFunc) {
allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsShader);
@@ -1516,8 +1640,13 @@
CCInfo.AllocateReg(Info->getScratchRSrcReg());
CCInfo.AllocateReg(Info->getScratchWaveOffsetReg());
CCInfo.AllocateReg(Info->getFrameOffsetReg());
allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
}
auto &ArgUsageInfo =
DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
ArgUsageInfo.setFuncArgInfo(*MF.getFunction(), Info->getArgInfo());
return Chains.empty() ? Chain :
DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
}
@@ -1741,6 +1870,81 @@ SDValue SITargetLowering::LowerCallResult(
return Chain;
}
// Add code to pass the special inputs required by the features in use,
// separate from the explicit user arguments present in the IR.
void SITargetLowering::passSpecialInputs(
CallLoweringInfo &CLI,
const SIMachineFunctionInfo &Info,
SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
SmallVectorImpl<SDValue> &MemOpChains,
SDValue Chain,
SDValue StackPtr) const {
// If we don't have a call site, this was a call inserted by
// legalization. These can never use special inputs.
if (!CLI.CS)
return;
const Function *CalleeFunc = CLI.CS.getCalledFunction();
if (!CalleeFunc)
report_fatal_error("indirect calls not handled");
SelectionDAG &DAG = CLI.DAG;
const SDLoc &DL = CLI.DL;
const SISubtarget *ST = getSubtarget();
const SIRegisterInfo *TRI = ST->getRegisterInfo();
auto &ArgUsageInfo =
DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
const AMDGPUFunctionArgInfo &CalleeArgInfo
= ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);
const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
// TODO: Unify with private memory register handling. This is complicated by
// the fact that at least in kernels, the input argument is not necessarily
// in the same location as the input.
AMDGPUFunctionArgInfo::PreloadedValue InputRegs[] = {
AMDGPUFunctionArgInfo::DISPATCH_PTR,
AMDGPUFunctionArgInfo::QUEUE_PTR,
AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR,
AMDGPUFunctionArgInfo::DISPATCH_ID,
AMDGPUFunctionArgInfo::WORKGROUP_ID_X,
AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,
AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,
AMDGPUFunctionArgInfo::WORKITEM_ID_X,
AMDGPUFunctionArgInfo::WORKITEM_ID_Y,
AMDGPUFunctionArgInfo::WORKITEM_ID_Z
};
for (auto InputID : InputRegs) {
const ArgDescriptor *OutgoingArg;
const TargetRegisterClass *ArgRC;
std::tie(OutgoingArg, ArgRC) = CalleeArgInfo.getPreloadedValue(InputID);
if (!OutgoingArg)
continue;
const ArgDescriptor *IncomingArg;
const TargetRegisterClass *IncomingArgRC;
std::tie(IncomingArg, IncomingArgRC)
= CallerArgInfo.getPreloadedValue(InputID);
assert(IncomingArgRC == ArgRC);
// All special arguments are ints for now.
EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
SDValue InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg);
if (OutgoingArg->isRegister()) {
RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
} else {
SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, StackPtr,
InputReg,
OutgoingArg->getStackOffset());
MemOpChains.push_back(ArgStore);
}
}
}
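
passSpecialInputs can match callee inputs to caller inputs only because LowerFormalArguments (above) records each lowered function's AMDGPUFunctionArgInfo in the new AMDGPUArgumentUsageInfo immutable pass. A sketch of the interface the setFuncArgInfo/lookupFuncArgInfo calls assume; the actual pass added by this commit may differ in detail:

class AMDGPUArgumentUsageInfo : public ImmutablePass {
  DenseMap<const Function *, AMDGPUFunctionArgInfo> ArgInfoMap;
public:
  // Called once per function after its inputs have been allocated.
  void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &Info) {
    ArgInfoMap[&F] = Info;
  }
  // Queried at call sites to learn where the callee expects each input.
  const AMDGPUFunctionArgInfo &lookupFuncArgInfo(const Function &F) const;
};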
// The wave scratch offset register is used as the global base pointer.
SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const {
@@ -1897,6 +2101,9 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
}
}
// Copy special input registers after user input arguments.
passSpecialInputs(CLI, *Info, RegsToPass, MemOpChains, Chain, StackPtr);
if (!MemOpChains.empty())
Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
@@ -3424,7 +3631,6 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
auto MFI = MF.getInfo<SIMachineFunctionInfo>();
-const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
EVT VT = Op.getValueType();
SDLoc DL(Op);
@@ -3436,10 +3642,8 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
case Intrinsic::amdgcn_implicit_buffer_ptr: {
if (getSubtarget()->isAmdCodeObjectV2(MF))
return emitNonHSAIntrinsicError(DAG, DL, VT);
-unsigned Reg = TRI->getPreloadedValue(MF,
-SIRegisterInfo::IMPLICIT_BUFFER_PTR);
-return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, Reg, VT);
+return getPreloadedValue(DAG, *MFI, VT,
+AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
}
case Intrinsic::amdgcn_dispatch_ptr:
case Intrinsic::amdgcn_queue_ptr: {
@@ -3451,10 +3655,9 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return DAG.getUNDEF(VT);
}
-auto Reg = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr ?
-SIRegisterInfo::DISPATCH_PTR : SIRegisterInfo::QUEUE_PTR;
-return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass,
-TRI->getPreloadedValue(MF, Reg), VT);
+auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr ?
+AMDGPUFunctionArgInfo::DISPATCH_PTR : AMDGPUFunctionArgInfo::QUEUE_PTR;
+return getPreloadedValue(DAG, *MFI, VT, RegID);
}
case Intrinsic::amdgcn_implicitarg_ptr: {
if (MFI->isEntryFunction())
@@ -3462,13 +3665,11 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
report_fatal_error("amdgcn.implicitarg.ptr not implemented for functions");
}
case Intrinsic::amdgcn_kernarg_segment_ptr: {
-unsigned Reg
-= TRI->getPreloadedValue(MF, SIRegisterInfo::KERNARG_SEGMENT_PTR);
-return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, Reg, VT);
+return getPreloadedValue(DAG, *MFI, VT,
+AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
}
case Intrinsic::amdgcn_dispatch_id: {
-unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::DISPATCH_ID);
-return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, Reg, VT);
+return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::DISPATCH_ID);
}
case Intrinsic::amdgcn_rcp:
return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
@@ -3553,28 +3754,32 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SI::KernelInputOffsets::LOCAL_SIZE_Z);
case Intrinsic::amdgcn_workgroup_id_x:
case Intrinsic::r600_read_tgid_x:
-return CreateLiveInRegister(DAG, &AMDGPU::SReg_32_XM0RegClass,
-TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_X), VT);
+return getPreloadedValue(DAG, *MFI, VT,
+AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
case Intrinsic::amdgcn_workgroup_id_y:
case Intrinsic::r600_read_tgid_y:
-return CreateLiveInRegister(DAG, &AMDGPU::SReg_32_XM0RegClass,
-TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_Y), VT);
+return getPreloadedValue(DAG, *MFI, VT,
+AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
case Intrinsic::amdgcn_workgroup_id_z:
case Intrinsic::r600_read_tgid_z:
-return CreateLiveInRegister(DAG, &AMDGPU::SReg_32_XM0RegClass,
-TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_Z), VT);
-case Intrinsic::amdgcn_workitem_id_x:
+return getPreloadedValue(DAG, *MFI, VT,
+AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
+case Intrinsic::amdgcn_workitem_id_x: {
case Intrinsic::r600_read_tidig_x:
-return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass,
-TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_X), VT);
+return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
+SDLoc(DAG.getEntryNode()),
+MFI->getArgInfo().WorkItemIDX);
+}
case Intrinsic::amdgcn_workitem_id_y:
case Intrinsic::r600_read_tidig_y:
-return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass,
-TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Y), VT);
+return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
+SDLoc(DAG.getEntryNode()),
+MFI->getArgInfo().WorkItemIDY);
case Intrinsic::amdgcn_workitem_id_z:
case Intrinsic::r600_read_tidig_z:
-return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass,
-TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Z), VT);
+return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
+SDLoc(DAG.getEntryNode()),
+MFI->getArgInfo().WorkItemIDZ);
case AMDGPUIntrinsic::SI_load_const: {
SDValue Ops[] = {
Op.getOperand(1),

lib/Target/AMDGPU/SIISelLowering.h

@@ -16,6 +16,7 @@
#define LLVM_LIB_TARGET_AMDGPU_SIISELLOWERING_H
#include "AMDGPUISelLowering.h"
#include "AMDGPUArgumentUsageInfo.h"
#include "SIInstrInfo.h"
namespace llvm {
@@ -32,6 +33,10 @@ class SITargetLowering final : public AMDGPUTargetLowering {
SDValue lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA,
const SDLoc &SL, SDValue Chain,
const ISD::InputArg &Arg) const;
SDValue getPreloadedValue(SelectionDAG &DAG,
const SIMachineFunctionInfo &MFI,
EVT VT,
AMDGPUFunctionArgInfo::PreloadedValue) const;
SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op,
SelectionDAG &DAG) const override;
@@ -205,6 +210,14 @@ public:
const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL,
SelectionDAG &DAG) const override;
void passSpecialInputs(
CallLoweringInfo &CLI,
const SIMachineFunctionInfo &Info,
SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
SmallVectorImpl<SDValue> &MemOpChains,
SDValue Chain,
SDValue StackPtr) const;
SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins,

lib/Target/AMDGPU/SIInstrInfo.cpp

@@ -916,7 +916,6 @@ unsigned SIInstrInfo::calculateLDSSpillAddress(
MachineFunction *MF = MBB.getParent();
SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
-const SIRegisterInfo *TRI = ST.getRegisterInfo();
DebugLoc DL = MBB.findDebugLoc(MI);
unsigned WorkGroupSize = MFI->getMaxFlatWorkGroupSize();
unsigned WavefrontSize = ST.getWavefrontSize();
@@ -936,13 +935,13 @@ unsigned SIInstrInfo::calculateLDSSpillAddress(
WorkGroupSize > WavefrontSize) {
unsigned TIDIGXReg
-= TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_X);
+= MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
unsigned TIDIGYReg
-= TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_Y);
+= MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
unsigned TIDIGZReg
-= TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_Z);
+= MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
unsigned InputPtrReg =
-TRI->getPreloadedValue(*MF, SIRegisterInfo::KERNARG_SEGMENT_PTR);
+MFI->getPreloadedReg(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
for (unsigned Reg : {TIDIGXReg, TIDIGYReg, TIDIGZReg}) {
if (!Entry.isLiveIn(Reg))
Entry.addLiveIn(Reg);

lib/Target/AMDGPU/SIMachineFunctionInfo.cpp

@@ -27,24 +27,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
ScratchWaveOffsetReg(AMDGPU::SCRATCH_WAVE_OFFSET_REG),
FrameOffsetReg(AMDGPU::FP_REG),
StackPtrOffsetReg(AMDGPU::SP_REG),
-PrivateSegmentBufferUserSGPR(AMDGPU::NoRegister),
-DispatchPtrUserSGPR(AMDGPU::NoRegister),
-QueuePtrUserSGPR(AMDGPU::NoRegister),
-KernargSegmentPtrUserSGPR(AMDGPU::NoRegister),
-DispatchIDUserSGPR(AMDGPU::NoRegister),
-FlatScratchInitUserSGPR(AMDGPU::NoRegister),
-PrivateSegmentSizeUserSGPR(AMDGPU::NoRegister),
-GridWorkGroupCountXUserSGPR(AMDGPU::NoRegister),
-GridWorkGroupCountYUserSGPR(AMDGPU::NoRegister),
-GridWorkGroupCountZUserSGPR(AMDGPU::NoRegister),
-WorkGroupIDXSystemSGPR(AMDGPU::NoRegister),
-WorkGroupIDYSystemSGPR(AMDGPU::NoRegister),
-WorkGroupIDZSystemSGPR(AMDGPU::NoRegister),
-WorkGroupInfoSystemSGPR(AMDGPU::NoRegister),
-PrivateSegmentWaveByteOffsetSystemSGPR(AMDGPU::NoRegister),
-WorkItemIDXVGPR(AMDGPU::NoRegister),
-WorkItemIDYVGPR(AMDGPU::NoRegister),
-WorkItemIDZVGPR(AMDGPU::NoRegister),
+ArgInfo(),
PSInputAddr(0),
PSInputEnable(0),
ReturnsVoid(true),
@@ -91,8 +74,11 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
FrameOffsetReg = AMDGPU::SGPR5;
StackPtrOffsetReg = AMDGPU::SGPR32;
// FIXME: Not really a system SGPR.
-PrivateSegmentWaveByteOffsetSystemSGPR = ScratchWaveOffsetReg;
+ArgInfo.PrivateSegmentBuffer =
+ArgDescriptor::createRegister(ScratchRSrcReg);
+ArgInfo.PrivateSegmentWaveByteOffset =
+ArgDescriptor::createRegister(ScratchWaveOffsetReg);
if (F->hasFnAttribute("amdgpu-implicitarg-ptr"))
ImplicitArgPtr = true;
} else {
@@ -151,10 +137,11 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
if (HasStackObjects || MaySpill) {
PrivateSegmentWaveByteOffset = true;
-// HS and GS always have the scratch wave offset in SGPR5 on GFX9.
-if (ST.getGeneration() >= AMDGPUSubtarget::GFX9 &&
-(CC == CallingConv::AMDGPU_HS || CC == CallingConv::AMDGPU_GS))
-PrivateSegmentWaveByteOffsetSystemSGPR = AMDGPU::SGPR5;
+// HS and GS always have the scratch wave offset in SGPR5 on GFX9.
+if (ST.getGeneration() >= AMDGPUSubtarget::GFX9 &&
+(CC == CallingConv::AMDGPU_HS || CC == CallingConv::AMDGPU_GS))
+ArgInfo.PrivateSegmentWaveByteOffset
+= ArgDescriptor::createRegister(AMDGPU::SGPR5);
}
}
@@ -189,52 +176,54 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
unsigned SIMachineFunctionInfo::addPrivateSegmentBuffer(
const SIRegisterInfo &TRI) {
-PrivateSegmentBufferUserSGPR = TRI.getMatchingSuperReg(
-getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_128RegClass);
+ArgInfo.PrivateSegmentBuffer =
+ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
+getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_128RegClass));
NumUserSGPRs += 4;
-return PrivateSegmentBufferUserSGPR;
+return ArgInfo.PrivateSegmentBuffer.getRegister();
}
unsigned SIMachineFunctionInfo::addDispatchPtr(const SIRegisterInfo &TRI) {
-DispatchPtrUserSGPR = TRI.getMatchingSuperReg(
-getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass);
+ArgInfo.DispatchPtr = ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
+getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
NumUserSGPRs += 2;
-return DispatchPtrUserSGPR;
+return ArgInfo.DispatchPtr.getRegister();
}
unsigned SIMachineFunctionInfo::addQueuePtr(const SIRegisterInfo &TRI) {
-QueuePtrUserSGPR = TRI.getMatchingSuperReg(
-getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass);
+ArgInfo.QueuePtr = ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
+getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
NumUserSGPRs += 2;
-return QueuePtrUserSGPR;
+return ArgInfo.QueuePtr.getRegister();
}
unsigned SIMachineFunctionInfo::addKernargSegmentPtr(const SIRegisterInfo &TRI) {
-KernargSegmentPtrUserSGPR = TRI.getMatchingSuperReg(
-getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass);
+ArgInfo.KernargSegmentPtr
+= ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
+getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
NumUserSGPRs += 2;
-return KernargSegmentPtrUserSGPR;
+return ArgInfo.KernargSegmentPtr.getRegister();
}
unsigned SIMachineFunctionInfo::addDispatchID(const SIRegisterInfo &TRI) {
-DispatchIDUserSGPR = TRI.getMatchingSuperReg(
-getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass);
+ArgInfo.DispatchID = ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
+getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
NumUserSGPRs += 2;
-return DispatchIDUserSGPR;
+return ArgInfo.DispatchID.getRegister();
}
unsigned SIMachineFunctionInfo::addFlatScratchInit(const SIRegisterInfo &TRI) {
-FlatScratchInitUserSGPR = TRI.getMatchingSuperReg(
-getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass);
+ArgInfo.FlatScratchInit = ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
+getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
NumUserSGPRs += 2;
-return FlatScratchInitUserSGPR;
+return ArgInfo.FlatScratchInit.getRegister();
}
unsigned SIMachineFunctionInfo::addImplicitBufferPtr(const SIRegisterInfo &TRI) {
-ImplicitBufferPtrUserSGPR = TRI.getMatchingSuperReg(
-getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass);
+ArgInfo.ImplicitBufferPtr = ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
+getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
NumUserSGPRs += 2;
-return ImplicitBufferPtrUserSGPR;
+return ArgInfo.ImplicitBufferPtr.getRegister();
}
static bool isCalleeSavedReg(const MCPhysReg *CSRegs, MCPhysReg Reg) {

lib/Target/AMDGPU/SIMachineFunctionInfo.h

@@ -16,6 +16,7 @@
#include "AMDGPUMachineFunction.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "AMDGPUArgumentUsageInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/CodeGen/PseudoSourceValue.h"
#include "llvm/MC/MCRegisterInfo.h"
@@ -96,33 +97,7 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction {
// Top of the stack SGPR offset derived from the ScratchWaveOffsetReg.
unsigned StackPtrOffsetReg;
-// Input registers for non-HSA ABI
-unsigned ImplicitBufferPtrUserSGPR;
-// Input registers setup for the HSA ABI.
-// User SGPRs in allocation order.
-unsigned PrivateSegmentBufferUserSGPR;
-unsigned DispatchPtrUserSGPR;
-unsigned QueuePtrUserSGPR;
-unsigned KernargSegmentPtrUserSGPR;
-unsigned DispatchIDUserSGPR;
-unsigned FlatScratchInitUserSGPR;
-unsigned PrivateSegmentSizeUserSGPR;
-unsigned GridWorkGroupCountXUserSGPR;
-unsigned GridWorkGroupCountYUserSGPR;
-unsigned GridWorkGroupCountZUserSGPR;
-// System SGPRs in allocation order.
-unsigned WorkGroupIDXSystemSGPR;
-unsigned WorkGroupIDYSystemSGPR;
-unsigned WorkGroupIDZSystemSGPR;
-unsigned WorkGroupInfoSystemSGPR;
-unsigned PrivateSegmentWaveByteOffsetSystemSGPR;
-// VGPR inputs. These are always v0, v1 and v2 for entry functions.
-unsigned WorkItemIDXVGPR;
-unsigned WorkItemIDYVGPR;
-unsigned WorkItemIDZVGPR;
+AMDGPUFunctionArgInfo ArgInfo;
// Graphics info.
unsigned PSInputAddr;
@@ -235,7 +210,6 @@ private:
SmallVector<SGPRSpillVGPRCSR, 2> SpillVGPRs;
public:
SIMachineFunctionInfo(const MachineFunction &MF);
ArrayRef<SpilledReg> getSGPRToVGPRSpills(int FrameIndex) const {
@@ -266,37 +240,52 @@ public:
// Add system SGPRs.
unsigned addWorkGroupIDX() {
-WorkGroupIDXSystemSGPR = getNextSystemSGPR();
+ArgInfo.WorkGroupIDX = ArgDescriptor::createRegister(getNextSystemSGPR());
NumSystemSGPRs += 1;
-return WorkGroupIDXSystemSGPR;
+return ArgInfo.WorkGroupIDX.getRegister();
}
unsigned addWorkGroupIDY() {
-WorkGroupIDYSystemSGPR = getNextSystemSGPR();
+ArgInfo.WorkGroupIDY = ArgDescriptor::createRegister(getNextSystemSGPR());
NumSystemSGPRs += 1;
-return WorkGroupIDYSystemSGPR;
+return ArgInfo.WorkGroupIDY.getRegister();
}
unsigned addWorkGroupIDZ() {
-WorkGroupIDZSystemSGPR = getNextSystemSGPR();
+ArgInfo.WorkGroupIDZ = ArgDescriptor::createRegister(getNextSystemSGPR());
NumSystemSGPRs += 1;
-return WorkGroupIDZSystemSGPR;
+return ArgInfo.WorkGroupIDZ.getRegister();
}
unsigned addWorkGroupInfo() {
-WorkGroupInfoSystemSGPR = getNextSystemSGPR();
+ArgInfo.WorkGroupInfo = ArgDescriptor::createRegister(getNextSystemSGPR());
NumSystemSGPRs += 1;
-return WorkGroupInfoSystemSGPR;
+return ArgInfo.WorkGroupInfo.getRegister();
}
// Add special VGPR inputs
void setWorkItemIDX(ArgDescriptor Arg) {
ArgInfo.WorkItemIDX = Arg;
}
void setWorkItemIDY(ArgDescriptor Arg) {
ArgInfo.WorkItemIDY = Arg;
}
void setWorkItemIDZ(ArgDescriptor Arg) {
ArgInfo.WorkItemIDZ = Arg;
}
unsigned addPrivateSegmentWaveByteOffset() {
-PrivateSegmentWaveByteOffsetSystemSGPR = getNextSystemSGPR();
+ArgInfo.PrivateSegmentWaveByteOffset
+= ArgDescriptor::createRegister(getNextSystemSGPR());
NumSystemSGPRs += 1;
-return PrivateSegmentWaveByteOffsetSystemSGPR;
+return ArgInfo.PrivateSegmentWaveByteOffset.getRegister();
}
void setPrivateSegmentWaveByteOffset(unsigned Reg) {
-PrivateSegmentWaveByteOffsetSystemSGPR = Reg;
+ArgInfo.PrivateSegmentWaveByteOffset = ArgDescriptor::createRegister(Reg);
}
bool hasPrivateSegmentBuffer() const {
@@ -375,6 +364,23 @@ public:
return ImplicitBufferPtr;
}
AMDGPUFunctionArgInfo &getArgInfo() {
return ArgInfo;
}
const AMDGPUFunctionArgInfo &getArgInfo() const {
return ArgInfo;
}
std::pair<const ArgDescriptor *, const TargetRegisterClass *>
getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const {
return ArgInfo.getPreloadedValue(Value);
}
unsigned getPreloadedReg(AMDGPUFunctionArgInfo::PreloadedValue Value) const {
return ArgInfo.getPreloadedValue(Value).first->getRegister();
}
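
Note that getPreloadedReg unconditionally reads the descriptor as a register, so it is only safe where the input cannot have been assigned a stack slot (kernel user/system SGPRs, for example). A hypothetical use:

// Fine in an entry function whose user SGPRs were just allocated; asserts
// if the input had instead been given a stack ArgDescriptor.
unsigned FlatScratchInitReg =
    MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT);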
unsigned getNumUserSGPRs() const {
return NumUserSGPRs;
}
@@ -384,7 +390,7 @@
}
unsigned getPrivateSegmentWaveByteOffsetSystemSGPR() const {
-return PrivateSegmentWaveByteOffsetSystemSGPR;
+return ArgInfo.PrivateSegmentWaveByteOffset.getRegister();
}
/// \brief Returns the physical register reserved for use as the resource
@@ -426,11 +432,11 @@
}
unsigned getQueuePtrUserSGPR() const {
-return QueuePtrUserSGPR;
+return ArgInfo.QueuePtr.getRegister();
}
unsigned getImplicitBufferPtrUserSGPR() const {
-return ImplicitBufferPtrUserSGPR;
+return ArgInfo.ImplicitBufferPtr.getRegister();
}
bool hasSpilledSGPRs() const {
@@ -562,13 +568,13 @@
switch (Dim) {
case 0:
assert(hasWorkGroupIDX());
-return WorkGroupIDXSystemSGPR;
+return ArgInfo.WorkGroupIDX.getRegister();
case 1:
assert(hasWorkGroupIDY());
-return WorkGroupIDYSystemSGPR;
+return ArgInfo.WorkGroupIDY.getRegister();
case 2:
assert(hasWorkGroupIDZ());
-return WorkGroupIDZSystemSGPR;
+return ArgInfo.WorkGroupIDZ.getRegister();
}
llvm_unreachable("unexpected dimension");
}

lib/Target/AMDGPU/SIRegisterInfo.cpp

@@ -1338,61 +1338,6 @@ bool SIRegisterInfo::shouldRewriteCopySrc(
return getCommonSubClass(DefRC, SrcRC) != nullptr;
}
-// FIXME: Most of these are flexible with HSA and we don't need to reserve them
-// as input registers if unused. Whether the dispatch ptr is necessary should be
-// easy to detect from used intrinsics. Scratch setup is harder to know.
-unsigned SIRegisterInfo::getPreloadedValue(const MachineFunction &MF,
-enum PreloadedValue Value) const {
-const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
-const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
-(void)ST;
-switch (Value) {
-case SIRegisterInfo::WORKGROUP_ID_X:
-assert(MFI->hasWorkGroupIDX());
-return MFI->WorkGroupIDXSystemSGPR;
-case SIRegisterInfo::WORKGROUP_ID_Y:
-assert(MFI->hasWorkGroupIDY());
-return MFI->WorkGroupIDYSystemSGPR;
-case SIRegisterInfo::WORKGROUP_ID_Z:
-assert(MFI->hasWorkGroupIDZ());
-return MFI->WorkGroupIDZSystemSGPR;
-case SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET:
-return MFI->PrivateSegmentWaveByteOffsetSystemSGPR;
-case SIRegisterInfo::PRIVATE_SEGMENT_BUFFER:
-assert(MFI->hasPrivateSegmentBuffer());
-return MFI->PrivateSegmentBufferUserSGPR;
-case SIRegisterInfo::IMPLICIT_BUFFER_PTR:
-assert(MFI->hasImplicitBufferPtr());
-return MFI->ImplicitBufferPtrUserSGPR;
-case SIRegisterInfo::KERNARG_SEGMENT_PTR:
-assert(MFI->hasKernargSegmentPtr());
-return MFI->KernargSegmentPtrUserSGPR;
-case SIRegisterInfo::DISPATCH_ID:
-assert(MFI->hasDispatchID());
-return MFI->DispatchIDUserSGPR;
-case SIRegisterInfo::FLAT_SCRATCH_INIT:
-assert(MFI->hasFlatScratchInit());
-return MFI->FlatScratchInitUserSGPR;
-case SIRegisterInfo::DISPATCH_PTR:
-assert(MFI->hasDispatchPtr());
-return MFI->DispatchPtrUserSGPR;
-case SIRegisterInfo::QUEUE_PTR:
-assert(MFI->hasQueuePtr());
-return MFI->QueuePtrUserSGPR;
-case SIRegisterInfo::WORKITEM_ID_X:
-assert(MFI->hasWorkItemIDX());
-return AMDGPU::VGPR0;
-case SIRegisterInfo::WORKITEM_ID_Y:
-assert(MFI->hasWorkItemIDY());
-return AMDGPU::VGPR1;
-case SIRegisterInfo::WORKITEM_ID_Z:
-assert(MFI->hasWorkItemIDZ());
-return AMDGPU::VGPR2;
-}
-llvm_unreachable("unexpected preloaded value type");
-}
/// \brief Returns a register that is not used at any point in the function.
/// If all registers are used, then this function will return
// AMDGPU::NoRegister.

lib/Target/AMDGPU/SIRegisterInfo.h

@@ -186,31 +186,6 @@ public:
OpType <= AMDGPU::OPERAND_SRC_LAST;
}
-enum PreloadedValue {
-// SGPRS:
-PRIVATE_SEGMENT_BUFFER = 0,
-DISPATCH_PTR = 1,
-QUEUE_PTR = 2,
-KERNARG_SEGMENT_PTR = 3,
-DISPATCH_ID = 4,
-FLAT_SCRATCH_INIT = 5,
-WORKGROUP_ID_X = 10,
-WORKGROUP_ID_Y = 11,
-WORKGROUP_ID_Z = 12,
-PRIVATE_SEGMENT_WAVE_BYTE_OFFSET = 14,
-IMPLICIT_BUFFER_PTR = 15,
-// VGPRS:
-FIRST_VGPR_VALUE = 16,
-WORKITEM_ID_X = FIRST_VGPR_VALUE,
-WORKITEM_ID_Y = 17,
-WORKITEM_ID_Z = 18
-};
-/// \brief Returns the physical register that \p Value is stored in.
-unsigned getPreloadedValue(const MachineFunction &MF,
-enum PreloadedValue Value) const;
unsigned findUnusedRegister(const MachineRegisterInfo &MRI,
const TargetRegisterClass *RC,
const MachineFunction &MF) const;
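
With the PreloadedValue enum and getPreloadedValue removed from SIRegisterInfo, every former register-info query becomes a function-info query. The recurring one-line rewrite applied throughout this commit:

// Before:
unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::DISPATCH_PTR);
// After:
unsigned Reg = MFI->getPreloadedReg(AMDGPUFunctionArgInfo::DISPATCH_PTR);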

test/CodeGen/AMDGPU/callee-special-input-sgprs.ll

@@ -0,0 +1,612 @@
; RUN: llc -amdgpu-function-calls -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI %s
; RUN: llc -amdgpu-function-calls -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
; GCN-LABEL: {{^}}use_dispatch_ptr:
; GCN: s_load_dword s{{[0-9]+}}, s[6:7], 0x0
define void @use_dispatch_ptr() #1 {
%dispatch_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0
%header_ptr = bitcast i8 addrspace(2)* %dispatch_ptr to i32 addrspace(2)*
%value = load volatile i32, i32 addrspace(2)* %header_ptr
ret void
}
; GCN-LABEL: {{^}}kern_indirect_use_dispatch_ptr:
; GCN: enable_sgpr_dispatch_ptr = 1
; GCN: s_mov_b64 s[6:7], s[4:5]
define amdgpu_kernel void @kern_indirect_use_dispatch_ptr(i32) #1 {
call void @use_dispatch_ptr()
ret void
}
; GCN-LABEL: {{^}}use_queue_ptr:
; GCN: s_load_dword s{{[0-9]+}}, s[6:7], 0x0
define void @use_queue_ptr() #1 {
%queue_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.queue.ptr() #0
%header_ptr = bitcast i8 addrspace(2)* %queue_ptr to i32 addrspace(2)*
%value = load volatile i32, i32 addrspace(2)* %header_ptr
ret void
}
; GCN-LABEL: {{^}}kern_indirect_use_queue_ptr:
; GCN: enable_sgpr_queue_ptr = 1
; GCN: s_mov_b64 s[6:7], s[4:5]
; GCN: s_swappc_b64
define amdgpu_kernel void @kern_indirect_use_queue_ptr(i32) #1 {
call void @use_queue_ptr()
ret void
}
; GCN-LABEL: {{^}}use_queue_ptr_addrspacecast:
; CIVI: s_load_dword [[APERTURE_LOAD:s[0-9]+]], s[6:7], 0x10
; GFX9: s_getreg_b32 [[APERTURE_LOAD:s[0-9]+]]
; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], [[APERTURE_LOAD]]
; GCN: {{flat|global}}_store_dword v{{\[[0-9]+}}:[[HI]]{{\]}}
define void @use_queue_ptr_addrspacecast() #1 {
%asc = addrspacecast i32 addrspace(3)* inttoptr (i32 16 to i32 addrspace(3)*) to i32 addrspace(4)*
store volatile i32 0, i32 addrspace(4)* %asc
ret void
}
; GCN-LABEL: {{^}}kern_indirect_use_queue_ptr_addrspacecast:
; CIVI: enable_sgpr_queue_ptr = 1
; CIVI: s_mov_b64 s[6:7], s[4:5]
; GFX9-NOT: s_mov_b64
; GCN: s_swappc_b64
define amdgpu_kernel void @kern_indirect_use_queue_ptr_addrspacecast(i32) #1 {
call void @use_queue_ptr_addrspacecast()
ret void
}
; GCN-LABEL: {{^}}use_kernarg_segment_ptr:
; GCN: s_load_dword s{{[0-9]+}}, s[6:7], 0x0
define void @use_kernarg_segment_ptr() #1 {
%kernarg_segment_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() #0
%header_ptr = bitcast i8 addrspace(2)* %kernarg_segment_ptr to i32 addrspace(2)*
%value = load volatile i32, i32 addrspace(2)* %header_ptr
ret void
}
; GCN-LABEL: {{^}}kern_indirect_use_kernarg_segment_ptr:
; GCN: enable_sgpr_kernarg_segment_ptr = 1
; GCN: s_mov_b64 s[6:7], s[4:5]
; GCN: s_swappc_b64
define amdgpu_kernel void @kern_indirect_use_kernarg_segment_ptr(i32) #1 {
call void @use_kernarg_segment_ptr()
ret void
}
; GCN-LABEL: {{^}}use_dispatch_id:
; GCN: ; use s[6:7]
define void @use_dispatch_id() #1 {
%id = call i64 @llvm.amdgcn.dispatch.id()
call void asm sideeffect "; use $0", "s"(i64 %id)
ret void
}
; No kernarg segment so that there is a mov to check. With kernarg
; pointer enabled, it happens to end up in the right place anyway.
; GCN-LABEL: {{^}}kern_indirect_use_dispatch_id:
; GCN: enable_sgpr_dispatch_id = 1
; GCN: s_mov_b64 s[6:7], s[4:5]
define amdgpu_kernel void @kern_indirect_use_dispatch_id() #1 {
call void @use_dispatch_id()
ret void
}
; GCN-LABEL: {{^}}use_workgroup_id_x:
; GCN: s_waitcnt
; GCN: ; use s6
define void @use_workgroup_id_x() #1 {
%val = call i32 @llvm.amdgcn.workgroup.id.x()
call void asm sideeffect "; use $0", "s"(i32 %val)
ret void
}
; GCN-LABEL: {{^}}use_stack_workgroup_id_x:
; GCN: s_waitcnt
; GCN: s_mov_b32 s5, s32
; GCN: buffer_store_dword v0, off, s[0:3], s5 offset:4
; GCN: ; use s6
; GCN: s_setpc_b64
define void @use_stack_workgroup_id_x() #1 {
%alloca = alloca i32
store volatile i32 0, i32* %alloca
%val = call i32 @llvm.amdgcn.workgroup.id.x()
call void asm sideeffect "; use $0", "s"(i32 %val)
ret void
}
; GCN-LABEL: {{^}}use_workgroup_id_y:
; GCN: s_waitcnt
; GCN: ; use s6
define void @use_workgroup_id_y() #1 {
%val = call i32 @llvm.amdgcn.workgroup.id.y()
call void asm sideeffect "; use $0", "s"(i32 %val)
ret void
}
; GCN-LABEL: {{^}}use_workgroup_id_z:
; GCN: s_waitcnt
; GCN: ; use s6
define void @use_workgroup_id_z() #1 {
%val = call i32 @llvm.amdgcn.workgroup.id.z()
call void asm sideeffect "; use $0", "s"(i32 %val)
ret void
}
; GCN-LABEL: {{^}}use_workgroup_id_xy:
; GCN: ; use s6
; GCN: ; use s7
define void @use_workgroup_id_xy() #1 {
%val0 = call i32 @llvm.amdgcn.workgroup.id.x()
%val1 = call i32 @llvm.amdgcn.workgroup.id.y()
call void asm sideeffect "; use $0", "s"(i32 %val0)
call void asm sideeffect "; use $0", "s"(i32 %val1)
ret void
}
; GCN-LABEL: {{^}}use_workgroup_id_xyz:
; GCN: ; use s6
; GCN: ; use s7
; GCN: ; use s8
define void @use_workgroup_id_xyz() #1 {
%val0 = call i32 @llvm.amdgcn.workgroup.id.x()
%val1 = call i32 @llvm.amdgcn.workgroup.id.y()
%val2 = call i32 @llvm.amdgcn.workgroup.id.z()
call void asm sideeffect "; use $0", "s"(i32 %val0)
call void asm sideeffect "; use $0", "s"(i32 %val1)
call void asm sideeffect "; use $0", "s"(i32 %val2)
ret void
}
; GCN-LABEL: {{^}}use_workgroup_id_xz:
; GCN: ; use s6
; GCN: ; use s7
define void @use_workgroup_id_xz() #1 {
%val0 = call i32 @llvm.amdgcn.workgroup.id.x()
%val1 = call i32 @llvm.amdgcn.workgroup.id.z()
call void asm sideeffect "; use $0", "s"(i32 %val0)
call void asm sideeffect "; use $0", "s"(i32 %val1)
ret void
}
; GCN-LABEL: {{^}}use_workgroup_id_yz:
; GCN: ; use s6
; GCN: ; use s7
define void @use_workgroup_id_yz() #1 {
%val0 = call i32 @llvm.amdgcn.workgroup.id.y()
%val1 = call i32 @llvm.amdgcn.workgroup.id.z()
call void asm sideeffect "; use $0", "s"(i32 %val0)
call void asm sideeffect "; use $0", "s"(i32 %val1)
ret void
}
; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_x:
; GCN: enable_sgpr_workgroup_id_x = 1
; GCN: enable_sgpr_workgroup_id_y = 0
; GCN: enable_sgpr_workgroup_id_z = 0
; GCN-NOT: s6
; GCN: s_mov_b32 s33, s7
; GCN-NOT: s6
; GCN: s_mov_b32 s4, s33
; GCN-NOT: s6
; GCN: s_mov_b32 s32, s33
; GCN: s_swappc_b64
define amdgpu_kernel void @kern_indirect_use_workgroup_id_x() #1 {
call void @use_workgroup_id_x()
ret void
}
; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_y:
; GCN: enable_sgpr_workgroup_id_x = 1
; GCN: enable_sgpr_workgroup_id_y = 1
; GCN: enable_sgpr_workgroup_id_z = 0
; GCN: s_mov_b32 s33, s8
; GCN: s_mov_b32 s4, s33
; GCN: s_mov_b32 s6, s7
; GCN: s_mov_b32 s32, s33
; GCN: s_swappc_b64
define amdgpu_kernel void @kern_indirect_use_workgroup_id_y() #1 {
call void @use_workgroup_id_y()
ret void
}
; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_z:
; GCN: enable_sgpr_workgroup_id_x = 1
; GCN: enable_sgpr_workgroup_id_y = 0
; GCN: enable_sgpr_workgroup_id_z = 1
; GCN: s_mov_b32 s33, s8
; GCN: s_mov_b32 s4, s33
; GCN: s_mov_b32 s6, s7
; GCN: s_swappc_b64
define amdgpu_kernel void @kern_indirect_use_workgroup_id_z() #1 {
call void @use_workgroup_id_z()
ret void
}
; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_xy:
; GCN: enable_sgpr_workgroup_id_x = 1
; GCN: enable_sgpr_workgroup_id_y = 1
; GCN: enable_sgpr_workgroup_id_z = 0
; GCN: s_mov_b32 s33, s8
; GCN-NOT: s6
; GCN-NOT: s7
; GCN: s_mov_b32 s4, s33
; GCN-NOT: s6
; GCN-NOT: s7
; GCN: s_mov_b32 s32, s33
; GCN-NOT: s6
; GCN-NOT: s7
; GCN: s_swappc_b64
define amdgpu_kernel void @kern_indirect_use_workgroup_id_xy() #1 {
call void @use_workgroup_id_xy()
ret void
}
; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_xyz:
; GCN: enable_sgpr_workgroup_id_x = 1
; GCN: enable_sgpr_workgroup_id_y = 1
; GCN: enable_sgpr_workgroup_id_z = 1
; GCN: s_mov_b32 s33, s9
; GCN-NOT: s6
; GCN-NOT: s7
; GCN-NOT: s8
; GCN: s_mov_b32 s4, s33
; GCN-NOT: s6
; GCN-NOT: s7
; GCN-NOT: s8
; GCN: s_mov_b32 s32, s33
; GCN-NOT: s6
; GCN-NOT: s7
; GCN-NOT: s8
; GCN: s_swappc_b64
define amdgpu_kernel void @kern_indirect_use_workgroup_id_xyz() #1 {
call void @use_workgroup_id_xyz()
ret void
}
; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_xz:
; GCN: enable_sgpr_workgroup_id_x = 1
; GCN: enable_sgpr_workgroup_id_y = 0
; GCN: enable_sgpr_workgroup_id_z = 1
; GCN: s_mov_b32 s33, s8
; GCN-NOT: s6
; GCN-NOT: s7
; GCN: s_mov_b32 s4, s33
; GCN-NOT: s6
; GCN-NOT: s7
; GCN: s_mov_b32 s32, s33
; GCN-NOT: s6
; GCN-NOT: s7
; GCN: s_swappc_b64
define amdgpu_kernel void @kern_indirect_use_workgroup_id_xz() #1 {
call void @use_workgroup_id_xz()
ret void
}
; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_yz:
; GCN: enable_sgpr_workgroup_id_x = 1
; GCN: enable_sgpr_workgroup_id_y = 1
; GCN: enable_sgpr_workgroup_id_z = 1
; GCN: s_mov_b32 s33, s9
; GCN: s_mov_b32 s6, s7
; GCN: s_mov_b32 s4, s33
; GCN: s_mov_b32 s7, s8
; GCN: s_mov_b32 s32, s33
; GCN: s_swappc_b64
define amdgpu_kernel void @kern_indirect_use_workgroup_id_yz() #1 {
call void @use_workgroup_id_yz()
ret void
}
; Argument is in right place already
; GCN-LABEL: {{^}}func_indirect_use_workgroup_id_x:
; GCN-NOT: s6
define void @func_indirect_use_workgroup_id_x() #1 {
call void @use_workgroup_id_x()
ret void
}
; GCN-LABEL: {{^}}func_indirect_use_workgroup_id_y:
; GCN-NOT: s6
define void @func_indirect_use_workgroup_id_y() #1 {
call void @use_workgroup_id_y()
ret void
}
; GCN-LABEL: {{^}}func_indirect_use_workgroup_id_z:
; GCN-NOT: s6
define void @func_indirect_use_workgroup_id_z() #1 {
call void @use_workgroup_id_z()
ret void
}
; GCN-LABEL: {{^}}other_arg_use_workgroup_id_x:
; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
; GCN: ; use s6
define void @other_arg_use_workgroup_id_x(i32 %arg0) #1 {
%val = call i32 @llvm.amdgcn.workgroup.id.x()
store volatile i32 %arg0, i32 addrspace(1)* undef
call void asm sideeffect "; use $0", "s"(i32 %val)
ret void
}
; GCN-LABEL: {{^}}other_arg_use_workgroup_id_y:
; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
; GCN: ; use s6
define void @other_arg_use_workgroup_id_y(i32 %arg0) #1 {
%val = call i32 @llvm.amdgcn.workgroup.id.y()
store volatile i32 %arg0, i32 addrspace(1)* undef
call void asm sideeffect "; use $0", "s"(i32 %val)
ret void
}
; GCN-LABEL: {{^}}other_arg_use_workgroup_id_z:
; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
; GCN: ; use s6
define void @other_arg_use_workgroup_id_z(i32 %arg0) #1 {
%val = call i32 @llvm.amdgcn.workgroup.id.z()
store volatile i32 %arg0, i32 addrspace(1)* undef
call void asm sideeffect "; use $0", "s"(i32 %val)
ret void
}
; GCN-LABEL: {{^}}kern_indirect_other_arg_use_workgroup_id_x:
; GCN: enable_sgpr_workgroup_id_x = 1
; GCN: enable_sgpr_workgroup_id_y = 0
; GCN: enable_sgpr_workgroup_id_z = 0
; GCN-DAG: s_mov_b32 s33, s7
; GCN-DAG: v_mov_b32_e32 v0, 0x22b
; GCN-NOT: s6
; GCN: s_mov_b32 s4, s33
; GCN-NOT: s6
; GCN-DAG: s_mov_b32 s32, s33
; GCN: s_swappc_b64
define amdgpu_kernel void @kern_indirect_other_arg_use_workgroup_id_x() #1 {
call void @other_arg_use_workgroup_id_x(i32 555)
ret void
}
; GCN-LABEL: {{^}}kern_indirect_other_arg_use_workgroup_id_y:
; GCN: enable_sgpr_workgroup_id_x = 1
; GCN: enable_sgpr_workgroup_id_y = 1
; GCN: enable_sgpr_workgroup_id_z = 0
; GCN-DAG: s_mov_b32 s33, s8
; GCN-DAG: v_mov_b32_e32 v0, 0x22b
; GCN: s_mov_b32 s4, s33
; GCN-DAG: s_mov_b32 s6, s7
; GCN-DAG: s_mov_b32 s32, s33
; GCN: s_swappc_b64
define amdgpu_kernel void @kern_indirect_other_arg_use_workgroup_id_y() #1 {
call void @other_arg_use_workgroup_id_y(i32 555)
ret void
}
; GCN-LABEL: {{^}}kern_indirect_other_arg_use_workgroup_id_z:
; GCN: enable_sgpr_workgroup_id_x = 1
; GCN: enable_sgpr_workgroup_id_y = 0
; GCN: enable_sgpr_workgroup_id_z = 1
; GCN: s_mov_b32 s33, s8
; GCN-DAG: v_mov_b32_e32 v0, 0x22b
; GCN: s_mov_b32 s4, s33
; GCN-DAG: s_mov_b32 s6, s7
; GCN: s_mov_b32 s32, s33
; GCN: s_swappc_b64
define amdgpu_kernel void @kern_indirect_other_arg_use_workgroup_id_z() #1 {
call void @other_arg_use_workgroup_id_z(i32 555)
ret void
}
; GCN-LABEL: {{^}}use_every_sgpr_input:
; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s5 offset:4
; GCN: s_load_dword s{{[0-9]+}}, s[6:7], 0x0
; GCN: s_load_dword s{{[0-9]+}}, s[8:9], 0x0
; GCN: s_load_dword s{{[0-9]+}}, s[10:11], 0x0
; GCN: ; use s[12:13]
; GCN: ; use s14
; GCN: ; use s15
; GCN: ; use s16
define void @use_every_sgpr_input() #1 {
%alloca = alloca i32, align 4
store volatile i32 0, i32* %alloca
%dispatch_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0
%dispatch_ptr.bc = bitcast i8 addrspace(2)* %dispatch_ptr to i32 addrspace(2)*
%val0 = load volatile i32, i32 addrspace(2)* %dispatch_ptr.bc
%queue_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.queue.ptr() #0
%queue_ptr.bc = bitcast i8 addrspace(2)* %queue_ptr to i32 addrspace(2)*
%val1 = load volatile i32, i32 addrspace(2)* %queue_ptr.bc
%kernarg_segment_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() #0
%kernarg_segment_ptr.bc = bitcast i8 addrspace(2)* %kernarg_segment_ptr to i32 addrspace(2)*
%val2 = load volatile i32, i32 addrspace(2)* %kernarg_segment_ptr.bc
%val3 = call i64 @llvm.amdgcn.dispatch.id()
call void asm sideeffect "; use $0", "s"(i64 %val3)
%val4 = call i32 @llvm.amdgcn.workgroup.id.x()
call void asm sideeffect "; use $0", "s"(i32 %val4)
%val5 = call i32 @llvm.amdgcn.workgroup.id.y()
call void asm sideeffect "; use $0", "s"(i32 %val5)
%val6 = call i32 @llvm.amdgcn.workgroup.id.z()
call void asm sideeffect "; use $0", "s"(i32 %val6)
ret void
}
; GCN-LABEL: {{^}}kern_indirect_use_every_sgpr_input:
; GCN: enable_sgpr_workgroup_id_x = 1
; GCN: enable_sgpr_workgroup_id_y = 1
; GCN: enable_sgpr_workgroup_id_z = 1
; GCN: enable_sgpr_workgroup_info = 0
; GCN: enable_sgpr_private_segment_buffer = 1
; GCN: enable_sgpr_dispatch_ptr = 1
; GCN: enable_sgpr_queue_ptr = 1
; GCN: enable_sgpr_kernarg_segment_ptr = 1
; GCN: enable_sgpr_dispatch_id = 1
; GCN: enable_sgpr_flat_scratch_init = 1
; GCN: s_mov_b32 s33, s17
; GCN: s_mov_b64 s[12:13], s[10:11]
; GCN: s_mov_b64 s[10:11], s[8:9]
; GCN: s_mov_b64 s[8:9], s[6:7]
; GCN: s_mov_b64 s[6:7], s[4:5]
; GCN: s_mov_b32 s4, s33
; GCN: s_mov_b32 s32, s33
; GCN: s_swappc_b64
define amdgpu_kernel void @kern_indirect_use_every_sgpr_input() #1 {
call void @use_every_sgpr_input()
ret void
}
; GCN-LABEL: {{^}}func_indirect_use_every_sgpr_input:
; GCN-NOT: s6
; GCN-NOT: s7
; GCN-NOT: s8
; GCN-NOT: s9
; GCN-NOT: s10
; GCN-NOT: s11
; GCN-NOT: s12
; GCN-NOT: s13
; GCN-NOT: s[6:7]
; GCN-NOT: s[8:9]
; GCN-NOT: s[10:11]
; GCN-NOT: s[12:13]
define void @func_indirect_use_every_sgpr_input() #1 {
call void @use_every_sgpr_input()
ret void
}
; GCN-LABEL: {{^}}func_use_every_sgpr_input_call_use_workgroup_id_xyz:
; GCN-DAG: s_mov_b32 s6, s14
; GCN-DAG: s_mov_b32 s7, s15
; GCN-DAG: s_mov_b32 s8, s16
; GCN: s_swappc_b64
define void @func_use_every_sgpr_input_call_use_workgroup_id_xyz() #1 {
%alloca = alloca i32, align 4
store volatile i32 0, i32* %alloca
%dispatch_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0
%dispatch_ptr.bc = bitcast i8 addrspace(2)* %dispatch_ptr to i32 addrspace(2)*
%val0 = load volatile i32, i32 addrspace(2)* %dispatch_ptr.bc
%queue_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.queue.ptr() #0
%queue_ptr.bc = bitcast i8 addrspace(2)* %queue_ptr to i32 addrspace(2)*
%val1 = load volatile i32, i32 addrspace(2)* %queue_ptr.bc
%kernarg_segment_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() #0
%kernarg_segment_ptr.bc = bitcast i8 addrspace(2)* %kernarg_segment_ptr to i32 addrspace(2)*
%val2 = load volatile i32, i32 addrspace(2)* %kernarg_segment_ptr.bc
%val3 = call i64 @llvm.amdgcn.dispatch.id()
call void asm sideeffect "; use $0", "s"(i64 %val3)
%val4 = call i32 @llvm.amdgcn.workgroup.id.x()
call void asm sideeffect "; use $0", "s"(i32 %val4)
%val5 = call i32 @llvm.amdgcn.workgroup.id.y()
call void asm sideeffect "; use $0", "s"(i32 %val5)
%val6 = call i32 @llvm.amdgcn.workgroup.id.z()
call void asm sideeffect "; use $0", "s"(i32 %val6)
call void @use_workgroup_id_xyz()
ret void
}
; GCN-LABEL: {{^}}func_use_every_sgpr_input_call_use_workgroup_id_xyz_spill:
; GCN: s_mov_b32 s5, s32
; GCN: s_add_u32 s32, s32, 0x300
; GCN-DAG: s_mov_b32 [[SAVE_X:s[0-9]+]], s14
; GCN-DAG: s_mov_b32 [[SAVE_Y:s[0-9]+]], s15
; GCN-DAG: s_mov_b32 [[SAVE_Z:s[0-9]+]], s16
; GCN-DAG: s_mov_b64 {{s\[[0-9]+:[0-9]+\]}}, s[6:7]
; GCN-DAG: s_mov_b64 {{s\[[0-9]+:[0-9]+\]}}, s[8:9]
; GCN-DAG: s_mov_b64 {{s\[[0-9]+:[0-9]+\]}}, s[10:11]
; GCN-DAG: s_mov_b32 s6, [[SAVE_X]]
; GCN-DAG: s_mov_b32 s7, [[SAVE_Y]]
; GCN-DAG: s_mov_b32 s8, [[SAVE_Z]]
; GCN: s_swappc_b64
; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s5 offset:4
; GCN: s_load_dword s{{[0-9]+}},
; GCN: s_load_dword s{{[0-9]+}},
; GCN: s_load_dword s{{[0-9]+}},
; GCN: ; use
; GCN: ; use [[SAVE_X]]
; GCN: ; use [[SAVE_Y]]
; GCN: ; use [[SAVE_Z]]
define void @func_use_every_sgpr_input_call_use_workgroup_id_xyz_spill() #1 {
%alloca = alloca i32, align 4
call void @use_workgroup_id_xyz()
store volatile i32 0, i32* %alloca
%dispatch_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0
%dispatch_ptr.bc = bitcast i8 addrspace(2)* %dispatch_ptr to i32 addrspace(2)*
%val0 = load volatile i32, i32 addrspace(2)* %dispatch_ptr.bc
%queue_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.queue.ptr() #0
%queue_ptr.bc = bitcast i8 addrspace(2)* %queue_ptr to i32 addrspace(2)*
%val1 = load volatile i32, i32 addrspace(2)* %queue_ptr.bc
%kernarg_segment_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() #0
%kernarg_segment_ptr.bc = bitcast i8 addrspace(2)* %kernarg_segment_ptr to i32 addrspace(2)*
%val2 = load volatile i32, i32 addrspace(2)* %kernarg_segment_ptr.bc
%val3 = call i64 @llvm.amdgcn.dispatch.id()
call void asm sideeffect "; use $0", "s"(i64 %val3)
%val4 = call i32 @llvm.amdgcn.workgroup.id.x()
call void asm sideeffect "; use $0", "s"(i32 %val4)
%val5 = call i32 @llvm.amdgcn.workgroup.id.y()
call void asm sideeffect "; use $0", "s"(i32 %val5)
%val6 = call i32 @llvm.amdgcn.workgroup.id.z()
call void asm sideeffect "; use $0", "s"(i32 %val6)
ret void
}
declare i32 @llvm.amdgcn.workgroup.id.x() #0
declare i32 @llvm.amdgcn.workgroup.id.y() #0
declare i32 @llvm.amdgcn.workgroup.id.z() #0
declare noalias i8 addrspace(2)* @llvm.amdgcn.queue.ptr() #0
declare noalias i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() #0
declare i64 @llvm.amdgcn.dispatch.id() #0
declare noalias i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0
attributes #0 = { nounwind readnone speculatable }
attributes #1 = { nounwind noinline }

test/CodeGen/AMDGPU/callee-special-input-vgprs.ll

@@ -0,0 +1,671 @@
; RUN: llc -amdgpu-function-calls -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
; GCN-LABEL: {{^}}use_workitem_id_x:
; GCN: s_waitcnt
; GCN-NEXT: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, v0
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @use_workitem_id_x() #1 {
%val = call i32 @llvm.amdgcn.workitem.id.x()
store volatile i32 %val, i32 addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}use_workitem_id_y:
; GCN: s_waitcnt
; GCN-NEXT: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, v0
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @use_workitem_id_y() #1 {
%val = call i32 @llvm.amdgcn.workitem.id.y()
store volatile i32 %val, i32 addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}use_workitem_id_z:
; GCN: s_waitcnt
; GCN-NEXT: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, v0
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @use_workitem_id_z() #1 {
%val = call i32 @llvm.amdgcn.workitem.id.z()
store volatile i32 %val, i32 addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}use_workitem_id_xy:
; GCN: s_waitcnt
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @use_workitem_id_xy() #1 {
%val0 = call i32 @llvm.amdgcn.workitem.id.x()
%val1 = call i32 @llvm.amdgcn.workitem.id.y()
store volatile i32 %val0, i32 addrspace(1)* undef
store volatile i32 %val1, i32 addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}use_workitem_id_xyz:
; GCN: s_waitcnt
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v2
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @use_workitem_id_xyz() #1 {
%val0 = call i32 @llvm.amdgcn.workitem.id.x()
%val1 = call i32 @llvm.amdgcn.workitem.id.y()
%val2 = call i32 @llvm.amdgcn.workitem.id.z()
store volatile i32 %val0, i32 addrspace(1)* undef
store volatile i32 %val1, i32 addrspace(1)* undef
store volatile i32 %val2, i32 addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}use_workitem_id_xz:
; GCN: s_waitcnt
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @use_workitem_id_xz() #1 {
%val0 = call i32 @llvm.amdgcn.workitem.id.x()
%val1 = call i32 @llvm.amdgcn.workitem.id.z()
store volatile i32 %val0, i32 addrspace(1)* undef
store volatile i32 %val1, i32 addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}use_workitem_id_yz:
; GCN: s_waitcnt
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @use_workitem_id_yz() #1 {
%val0 = call i32 @llvm.amdgcn.workitem.id.y()
%val1 = call i32 @llvm.amdgcn.workitem.id.z()
store volatile i32 %val0, i32 addrspace(1)* undef
store volatile i32 %val1, i32 addrspace(1)* undef
ret void
}
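; enable_vgpr_workitem_id encodes how many work-item ID VGPRs the hardware
; initializes for the kernel: 0 = X only (v0), 1 = X and Y (v0-v1),
; 2 = X, Y and Z (v0-v2). The kernels below should request just enough
; for the IDs their callees use.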
; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_x:
; GCN: enable_vgpr_workitem_id = 0
; GCN-NOT: v0
; GCN: s_swappc_b64
; GCN-NOT: v0
define amdgpu_kernel void @kern_indirect_use_workitem_id_x() #1 {
call void @use_workitem_id_x()
ret void
}
; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_y:
; GCN: enable_vgpr_workitem_id = 1
; GCN-NOT: v0
; GCN-NOT: v1
; GCN: v_mov_b32_e32 v0, v1
; GCN-NOT: v0
; GCN-NOT: v1
; GCN: s_swappc_b64
define amdgpu_kernel void @kern_indirect_use_workitem_id_y() #1 {
call void @use_workitem_id_y()
ret void
}
; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_z:
; GCN: enable_vgpr_workitem_id = 2
; GCN-NOT: v0
; GCN-NOT: v2
; GCN: v_mov_b32_e32 v0, v2
; GCN-NOT: v0
; GCN-NOT: v2
; GCN: s_swappc_b64
define amdgpu_kernel void @kern_indirect_use_workitem_id_z() #1 {
call void @use_workitem_id_z()
ret void
}
; GCN-LABEL: {{^}}func_indirect_use_workitem_id_x:
; GCN-NOT: v0
; GCN: s_swappc_b64
; GCN-NOT: v0
define void @func_indirect_use_workitem_id_x() #1 {
call void @use_workitem_id_x()
ret void
}
; GCN-LABEL: {{^}}func_indirect_use_workitem_id_y:
; GCN-NOT: v0
; GCN: s_swappc_b64
; GCN-NOT: v0
define void @func_indirect_use_workitem_id_y() #1 {
call void @use_workitem_id_y()
ret void
}
; GCN-LABEL: {{^}}func_indirect_use_workitem_id_z:
; GCN-NOT: v0
; GCN: s_swappc_b64
; GCN-NOT: v0
define void @func_indirect_use_workitem_id_z() #1 {
call void @use_workitem_id_z()
ret void
}
; GCN-LABEL: {{^}}other_arg_use_workitem_id_x:
; GCN: s_waitcnt
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
define void @other_arg_use_workitem_id_x(i32 %arg0) #1 {
%val = call i32 @llvm.amdgcn.workitem.id.x()
store volatile i32 %arg0, i32 addrspace(1)* undef
store volatile i32 %val, i32 addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}other_arg_use_workitem_id_y:
; GCN: s_waitcnt
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
define void @other_arg_use_workitem_id_y(i32 %arg0) #1 {
%val = call i32 @llvm.amdgcn.workitem.id.y()
store volatile i32 %arg0, i32 addrspace(1)* undef
store volatile i32 %val, i32 addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}other_arg_use_workitem_id_z:
; GCN: s_waitcnt
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
define void @other_arg_use_workitem_id_z(i32 %arg0) #1 {
%val = call i32 @llvm.amdgcn.workitem.id.z()
store volatile i32 %arg0, i32 addrspace(1)* undef
store volatile i32 %val, i32 addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}kern_indirect_other_arg_use_workitem_id_x:
; GCN: enable_vgpr_workitem_id = 0
; GCN: v_mov_b32_e32 v1, v0
; GCN: v_mov_b32_e32 v0, 0x22b
; GCN: s_swappc_b64
define amdgpu_kernel void @kern_indirect_other_arg_use_workitem_id_x() #1 {
call void @other_arg_use_workitem_id_x(i32 555)
ret void
}
; GCN-LABEL: {{^}}kern_indirect_other_arg_use_workitem_id_y:
; GCN: enable_vgpr_workitem_id = 1
; GCN-NOT: v1
; GCN: v_mov_b32_e32 v0, 0x22b
; GCN-NOT: v1
; GCN: s_swappc_b64
; GCN-NOT: v0
define amdgpu_kernel void @kern_indirect_other_arg_use_workitem_id_y() #1 {
call void @other_arg_use_workitem_id_y(i32 555)
ret void
}
; GCN-LABEL: {{^}}kern_indirect_other_arg_use_workitem_id_z:
; GCN: enable_vgpr_workitem_id = 2
; GCN: v_mov_b32_e32 v0, 0x22b
; GCN: v_mov_b32_e32 v1, v2
; GCN: s_swappc_b64
; GCN-NOT: v0
define amdgpu_kernel void @kern_indirect_other_arg_use_workitem_id_z() #1 {
call void @other_arg_use_workitem_id_z(i32 555)
ret void
}
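; All 32 argument VGPRs (v0-v31) are taken by the i32 arguments, so the
; work-item ID x appears to be passed through a stack slot instead: the
; callee reloads it from offset 4 (offset 0 being the emergency spill slot).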
; GCN-LABEL: {{^}}too_many_args_use_workitem_id_x:
; GCN: s_mov_b32 s5, s32
; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:8 ; 4-byte Folded Spill
; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:4{{$}}
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v32
; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:8 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @too_many_args_use_workitem_id_x(
i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7,
i32 %arg8, i32 %arg9, i32 %arg10, i32 %arg11, i32 %arg12, i32 %arg13, i32 %arg14, i32 %arg15,
i32 %arg16, i32 %arg17, i32 %arg18, i32 %arg19, i32 %arg20, i32 %arg21, i32 %arg22, i32 %arg23,
i32 %arg24, i32 %arg25, i32 %arg26, i32 %arg27, i32 %arg28, i32 %arg29, i32 %arg30, i32 %arg31) #1 {
%val = call i32 @llvm.amdgcn.workitem.id.x()
store volatile i32 %val, i32 addrspace(1)* undef
store volatile i32 %arg0, i32 addrspace(1)* undef
store volatile i32 %arg1, i32 addrspace(1)* undef
store volatile i32 %arg2, i32 addrspace(1)* undef
store volatile i32 %arg3, i32 addrspace(1)* undef
store volatile i32 %arg4, i32 addrspace(1)* undef
store volatile i32 %arg5, i32 addrspace(1)* undef
store volatile i32 %arg6, i32 addrspace(1)* undef
store volatile i32 %arg7, i32 addrspace(1)* undef
store volatile i32 %arg8, i32 addrspace(1)* undef
store volatile i32 %arg9, i32 addrspace(1)* undef
store volatile i32 %arg10, i32 addrspace(1)* undef
store volatile i32 %arg11, i32 addrspace(1)* undef
store volatile i32 %arg12, i32 addrspace(1)* undef
store volatile i32 %arg13, i32 addrspace(1)* undef
store volatile i32 %arg14, i32 addrspace(1)* undef
store volatile i32 %arg15, i32 addrspace(1)* undef
store volatile i32 %arg16, i32 addrspace(1)* undef
store volatile i32 %arg17, i32 addrspace(1)* undef
store volatile i32 %arg18, i32 addrspace(1)* undef
store volatile i32 %arg19, i32 addrspace(1)* undef
store volatile i32 %arg20, i32 addrspace(1)* undef
store volatile i32 %arg21, i32 addrspace(1)* undef
store volatile i32 %arg22, i32 addrspace(1)* undef
store volatile i32 %arg23, i32 addrspace(1)* undef
store volatile i32 %arg24, i32 addrspace(1)* undef
store volatile i32 %arg25, i32 addrspace(1)* undef
store volatile i32 %arg26, i32 addrspace(1)* undef
store volatile i32 %arg27, i32 addrspace(1)* undef
store volatile i32 %arg28, i32 addrspace(1)* undef
store volatile i32 %arg29, i32 addrspace(1)* undef
store volatile i32 %arg30, i32 addrspace(1)* undef
store volatile i32 %arg31, i32 addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_x:
; GCN: enable_vgpr_workitem_id = 0
; GCN: s_mov_b32 s33, s7
; GCN: s_mov_b32 s32, s33
; GCN: buffer_store_dword v0, off, s[0:3], s32 offset:8
; GCN: s_mov_b32 s4, s33
; GCN: s_swappc_b64
define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x() #1 {
call void @too_many_args_use_workitem_id_x(
i32 10, i32 20, i32 30, i32 40,
i32 50, i32 60, i32 70, i32 80,
i32 90, i32 100, i32 110, i32 120,
i32 130, i32 140, i32 150, i32 160,
i32 170, i32 180, i32 190, i32 200,
i32 210, i32 220, i32 230, i32 240,
i32 250, i32 260, i32 270, i32 280,
i32 290, i32 300, i32 310, i32 320)
ret void
}
; GCN-LABEL: {{^}}func_call_too_many_args_use_workitem_id_x:
; GCN: s_mov_b32 s5, s32
; GCN: buffer_store_dword v1, off, s[0:3], s32 offset:8
; GCN: s_swappc_b64
define void @func_call_too_many_args_use_workitem_id_x(i32 %arg0) #1 {
store volatile i32 %arg0, i32 addrspace(1)* undef
call void @too_many_args_use_workitem_id_x(
i32 10, i32 20, i32 30, i32 40,
i32 50, i32 60, i32 70, i32 80,
i32 90, i32 100, i32 110, i32 120,
i32 130, i32 140, i32 150, i32 160,
i32 170, i32 180, i32 190, i32 200,
i32 210, i32 220, i32 230, i32 240,
i32 250, i32 260, i32 270, i32 280,
i32 290, i32 300, i32 310, i32 320)
ret void
}
; Requires loading and storing to stack slot.
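; Concretely, per the checks below, the incoming ID x is reloaded from the
; caller-assigned slot (s5 offset:4) and re-stored to the outgoing argument
; slot (s32 offset:8) before the call.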
; GCN-LABEL: {{^}}too_many_args_call_too_many_args_use_workitem_id_x:
; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:8 ; 4-byte Folded Spill
; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:4
; GCN: s_add_u32 s32, s32, 0x400{{$}}
; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:8{{$}}
; GCN: s_swappc_b64
; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:8 ; 4-byte Folded Reload
; GCN: s_sub_u32 s32, s32, 0x400{{$}}
; GCN: s_setpc_b64
define void @too_many_args_call_too_many_args_use_workitem_id_x(
i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7,
i32 %arg8, i32 %arg9, i32 %arg10, i32 %arg11, i32 %arg12, i32 %arg13, i32 %arg14, i32 %arg15,
i32 %arg16, i32 %arg17, i32 %arg18, i32 %arg19, i32 %arg20, i32 %arg21, i32 %arg22, i32 %arg23,
i32 %arg24, i32 %arg25, i32 %arg26, i32 %arg27, i32 %arg28, i32 %arg29, i32 %arg30, i32 %arg31) #1 {
call void @too_many_args_use_workitem_id_x(
i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7,
i32 %arg8, i32 %arg9, i32 %arg10, i32 %arg11, i32 %arg12, i32 %arg13, i32 %arg14, i32 %arg15,
i32 %arg16, i32 %arg17, i32 %arg18, i32 %arg19, i32 %arg20, i32 %arg21, i32 %arg22, i32 %arg23,
i32 %arg24, i32 %arg25, i32 %arg26, i32 %arg27, i32 %arg28, i32 %arg29, i32 %arg30, i32 %arg31)
ret void
}
; stack layout:
; frame[0] = emergency stack slot
; frame[1] = byval arg32
; frame[2] = stack passed workitem ID x
; frame[3] = VGPR spill slot
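; With 4-byte slots this should correspond to the byte offsets checked
; below: byval at 4, stack-passed ID x at 8, VGPR spill at 12.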
; GCN-LABEL: {{^}}too_many_args_use_workitem_id_x_byval:
; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:12 ; 4-byte Folded Spill
; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:8
; GCN-NEXT: s_waitcnt
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v32
; GCN: buffer_load_dword v0, off, s[0:3], s5 offset:4
; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:12 ; 4-byte Folded Reload
; GCN: s_setpc_b64
define void @too_many_args_use_workitem_id_x_byval(
i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7,
i32 %arg8, i32 %arg9, i32 %arg10, i32 %arg11, i32 %arg12, i32 %arg13, i32 %arg14, i32 %arg15,
i32 %arg16, i32 %arg17, i32 %arg18, i32 %arg19, i32 %arg20, i32 %arg21, i32 %arg22, i32 %arg23,
i32 %arg24, i32 %arg25, i32 %arg26, i32 %arg27, i32 %arg28, i32 %arg29, i32 %arg30, i32 %arg31, i32* byval %arg32) #1 {
%val = call i32 @llvm.amdgcn.workitem.id.x()
store volatile i32 %val, i32 addrspace(1)* undef
store volatile i32 %arg0, i32 addrspace(1)* undef
store volatile i32 %arg1, i32 addrspace(1)* undef
store volatile i32 %arg2, i32 addrspace(1)* undef
store volatile i32 %arg3, i32 addrspace(1)* undef
store volatile i32 %arg4, i32 addrspace(1)* undef
store volatile i32 %arg5, i32 addrspace(1)* undef
store volatile i32 %arg6, i32 addrspace(1)* undef
store volatile i32 %arg7, i32 addrspace(1)* undef
store volatile i32 %arg8, i32 addrspace(1)* undef
store volatile i32 %arg9, i32 addrspace(1)* undef
store volatile i32 %arg10, i32 addrspace(1)* undef
store volatile i32 %arg11, i32 addrspace(1)* undef
store volatile i32 %arg12, i32 addrspace(1)* undef
store volatile i32 %arg13, i32 addrspace(1)* undef
store volatile i32 %arg14, i32 addrspace(1)* undef
store volatile i32 %arg15, i32 addrspace(1)* undef
store volatile i32 %arg16, i32 addrspace(1)* undef
store volatile i32 %arg17, i32 addrspace(1)* undef
store volatile i32 %arg18, i32 addrspace(1)* undef
store volatile i32 %arg19, i32 addrspace(1)* undef
store volatile i32 %arg20, i32 addrspace(1)* undef
store volatile i32 %arg21, i32 addrspace(1)* undef
store volatile i32 %arg22, i32 addrspace(1)* undef
store volatile i32 %arg23, i32 addrspace(1)* undef
store volatile i32 %arg24, i32 addrspace(1)* undef
store volatile i32 %arg25, i32 addrspace(1)* undef
store volatile i32 %arg26, i32 addrspace(1)* undef
store volatile i32 %arg27, i32 addrspace(1)* undef
store volatile i32 %arg28, i32 addrspace(1)* undef
store volatile i32 %arg29, i32 addrspace(1)* undef
store volatile i32 %arg30, i32 addrspace(1)* undef
store volatile i32 %arg31, i32 addrspace(1)* undef
%private = load volatile i32, i32* %arg32
ret void
}
; frame[0] = emergency stack slot
; frame[1] = byval source alloca (holds 0x3e7)
; sp[0] = callee emergency stack slot reservation
; sp[1] = byval
; sp[2] = ??
; sp[3] = stack passed workitem ID x
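; Assuming 4-byte slots: the kernel frame (s33) holds the 0x3e7 alloca at
; offset 4, while the outgoing frame (s32) holds the byval copy at offset 4
; and the stack-passed ID x at offset 12, matching the stores below.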
; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_x_byval:
; GCN: enable_vgpr_workitem_id = 0
; GCN: s_mov_b32 s33, s7
; GCN: s_add_u32 s32, s33, 0x200{{$}}
; GCN-DAG: s_add_u32 s32, s32, 0x100{{$}}
; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}}
; GCN: buffer_store_dword [[K]], off, s[0:3], s33 offset:4
; GCN: buffer_store_dword v0, off, s[0:3], s32 offset:12
; GCN: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], s33 offset:4
; GCN: buffer_store_dword [[RELOAD_BYVAL]], off, s[0:3], s32 offset:4{{$}}
; GCN: v_mov_b32_e32 [[RELOAD_BYVAL]],
; GCN: s_swappc_b64
define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x_byval() #1 {
%alloca = alloca i32, align 4
store volatile i32 999, i32* %alloca
call void @too_many_args_use_workitem_id_x_byval(
i32 10, i32 20, i32 30, i32 40,
i32 50, i32 60, i32 70, i32 80,
i32 90, i32 100, i32 110, i32 120,
i32 130, i32 140, i32 150, i32 160,
i32 170, i32 180, i32 190, i32 200,
i32 210, i32 220, i32 230, i32 240,
i32 250, i32 260, i32 270, i32 280,
i32 290, i32 300, i32 310, i32 320,
i32* %alloca)
ret void
}
; GCN-LABEL: {{^}}func_call_too_many_args_use_workitem_id_x_byval:
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}}
; GCN: buffer_store_dword [[K]], off, s[0:3], s5 offset:4
; GCN: buffer_store_dword v0, off, s[0:3], s32 offset:12
; GCN: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], s5 offset:4
; GCN: buffer_store_dword [[RELOAD_BYVAL]], off, s[0:3], s32 offset:4{{$}}
; GCN: v_mov_b32_e32 [[RELOAD_BYVAL]],
; GCN: s_swappc_b64
define void @func_call_too_many_args_use_workitem_id_x_byval() #1 {
%alloca = alloca i32, align 4
store volatile i32 999, i32* %alloca
call void @too_many_args_use_workitem_id_x_byval(
i32 10, i32 20, i32 30, i32 40,
i32 50, i32 60, i32 70, i32 80,
i32 90, i32 100, i32 110, i32 120,
i32 130, i32 140, i32 150, i32 160,
i32 170, i32 180, i32 190, i32 200,
i32 210, i32 220, i32 230, i32 240,
i32 250, i32 260, i32 270, i32 280,
i32 290, i32 300, i32 310, i32 320,
i32* %alloca)
ret void
}
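; With all 32 argument VGPRs used, the x, y and z work-item IDs should all
; arrive through the stack, at offsets 4, 8 and 12 of the incoming frame.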
; GCN-LABEL: {{^}}too_many_args_use_workitem_id_xyz:
; GCN: s_mov_b32 s5, s32
; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:16 ; 4-byte Folded Spill
; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:4{{$}}
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v32
; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:8{{$}}
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v32
; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:12{{$}}
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v32
; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:16 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @too_many_args_use_workitem_id_xyz(
i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7,
i32 %arg8, i32 %arg9, i32 %arg10, i32 %arg11, i32 %arg12, i32 %arg13, i32 %arg14, i32 %arg15,
i32 %arg16, i32 %arg17, i32 %arg18, i32 %arg19, i32 %arg20, i32 %arg21, i32 %arg22, i32 %arg23,
i32 %arg24, i32 %arg25, i32 %arg26, i32 %arg27, i32 %arg28, i32 %arg29, i32 %arg30, i32 %arg31) #1 {
%val0 = call i32 @llvm.amdgcn.workitem.id.x()
store volatile i32 %val0, i32 addrspace(1)* undef
%val1 = call i32 @llvm.amdgcn.workitem.id.y()
store volatile i32 %val1, i32 addrspace(1)* undef
%val2 = call i32 @llvm.amdgcn.workitem.id.z()
store volatile i32 %val2, i32 addrspace(1)* undef
store volatile i32 %arg0, i32 addrspace(1)* undef
store volatile i32 %arg1, i32 addrspace(1)* undef
store volatile i32 %arg2, i32 addrspace(1)* undef
store volatile i32 %arg3, i32 addrspace(1)* undef
store volatile i32 %arg4, i32 addrspace(1)* undef
store volatile i32 %arg5, i32 addrspace(1)* undef
store volatile i32 %arg6, i32 addrspace(1)* undef
store volatile i32 %arg7, i32 addrspace(1)* undef
store volatile i32 %arg8, i32 addrspace(1)* undef
store volatile i32 %arg9, i32 addrspace(1)* undef
store volatile i32 %arg10, i32 addrspace(1)* undef
store volatile i32 %arg11, i32 addrspace(1)* undef
store volatile i32 %arg12, i32 addrspace(1)* undef
store volatile i32 %arg13, i32 addrspace(1)* undef
store volatile i32 %arg14, i32 addrspace(1)* undef
store volatile i32 %arg15, i32 addrspace(1)* undef
store volatile i32 %arg16, i32 addrspace(1)* undef
store volatile i32 %arg17, i32 addrspace(1)* undef
store volatile i32 %arg18, i32 addrspace(1)* undef
store volatile i32 %arg19, i32 addrspace(1)* undef
store volatile i32 %arg20, i32 addrspace(1)* undef
store volatile i32 %arg21, i32 addrspace(1)* undef
store volatile i32 %arg22, i32 addrspace(1)* undef
store volatile i32 %arg23, i32 addrspace(1)* undef
store volatile i32 %arg24, i32 addrspace(1)* undef
store volatile i32 %arg25, i32 addrspace(1)* undef
store volatile i32 %arg26, i32 addrspace(1)* undef
store volatile i32 %arg27, i32 addrspace(1)* undef
store volatile i32 %arg28, i32 addrspace(1)* undef
store volatile i32 %arg29, i32 addrspace(1)* undef
store volatile i32 %arg30, i32 addrspace(1)* undef
store volatile i32 %arg31, i32 addrspace(1)* undef
ret void
}
; frame[0] = kernel emergency stack slot
; frame[1] = callee emergency stack slot
; frame[2] = ID X
; frame[3] = ID Y
; frame[4] = ID Z
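; i.e. at byte offsets 8, 12 and 16 of the kernel's outgoing frame (s32),
; as stored below; offsets 0 and 4 are the two emergency slots.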
; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_xyz:
; GCN: enable_vgpr_workitem_id = 2
; GCN: s_mov_b32 s33, s7
; GCN: s_mov_b32 s32, s33
; GCN-DAG: buffer_store_dword v0, off, s[0:3], s32 offset:8
; GCN-DAG: buffer_store_dword v1, off, s[0:3], s32 offset:12
; GCN-DAG: buffer_store_dword v2, off, s[0:3], s32 offset:16
; GCN: s_swappc_b64
define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_xyz() #1 {
call void @too_many_args_use_workitem_id_xyz(
i32 10, i32 20, i32 30, i32 40,
i32 50, i32 60, i32 70, i32 80,
i32 90, i32 100, i32 110, i32 120,
i32 130, i32 140, i32 150, i32 160,
i32 170, i32 180, i32 190, i32 200,
i32 210, i32 220, i32 230, i32 240,
i32 250, i32 260, i32 270, i32 280,
i32 290, i32 300, i32 310, i32 320)
ret void
}
; workitem ID X in register, yz on stack
; v31 = workitem ID X
; frame[0] = emergency slot
; frame[1] = workitem Y
; frame[2] = workitem Z
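; With one fewer i32 argument (31), one VGPR remains, so ID x should stay
; in v31 while y and z come in at offsets 4 and 8; three 4-byte slots give
; the ScratchSize of 12 checked below.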
; GCN-LABEL: {{^}}too_many_args_use_workitem_id_x_stack_yz:
; GCN: s_mov_b32 s5, s32
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v31
; GCN: buffer_load_dword v31, off, s[0:3], s5 offset:4{{$}}
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v31
; GCN: buffer_load_dword v31, off, s[0:3], s5 offset:8{{$}}
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v31
; GCN: s_waitcnt
; GCN-NEXT: s_setpc_b64
; GCN: ScratchSize: 12
define void @too_many_args_use_workitem_id_x_stack_yz(
i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7,
i32 %arg8, i32 %arg9, i32 %arg10, i32 %arg11, i32 %arg12, i32 %arg13, i32 %arg14, i32 %arg15,
i32 %arg16, i32 %arg17, i32 %arg18, i32 %arg19, i32 %arg20, i32 %arg21, i32 %arg22, i32 %arg23,
i32 %arg24, i32 %arg25, i32 %arg26, i32 %arg27, i32 %arg28, i32 %arg29, i32 %arg30) #1 {
%val0 = call i32 @llvm.amdgcn.workitem.id.x()
store volatile i32 %val0, i32 addrspace(1)* undef
%val1 = call i32 @llvm.amdgcn.workitem.id.y()
store volatile i32 %val1, i32 addrspace(1)* undef
%val2 = call i32 @llvm.amdgcn.workitem.id.z()
store volatile i32 %val2, i32 addrspace(1)* undef
store volatile i32 %arg0, i32 addrspace(1)* undef
store volatile i32 %arg1, i32 addrspace(1)* undef
store volatile i32 %arg2, i32 addrspace(1)* undef
store volatile i32 %arg3, i32 addrspace(1)* undef
store volatile i32 %arg4, i32 addrspace(1)* undef
store volatile i32 %arg5, i32 addrspace(1)* undef
store volatile i32 %arg6, i32 addrspace(1)* undef
store volatile i32 %arg7, i32 addrspace(1)* undef
store volatile i32 %arg8, i32 addrspace(1)* undef
store volatile i32 %arg9, i32 addrspace(1)* undef
store volatile i32 %arg10, i32 addrspace(1)* undef
store volatile i32 %arg11, i32 addrspace(1)* undef
store volatile i32 %arg12, i32 addrspace(1)* undef
store volatile i32 %arg13, i32 addrspace(1)* undef
store volatile i32 %arg14, i32 addrspace(1)* undef
store volatile i32 %arg15, i32 addrspace(1)* undef
store volatile i32 %arg16, i32 addrspace(1)* undef
store volatile i32 %arg17, i32 addrspace(1)* undef
store volatile i32 %arg18, i32 addrspace(1)* undef
store volatile i32 %arg19, i32 addrspace(1)* undef
store volatile i32 %arg20, i32 addrspace(1)* undef
store volatile i32 %arg21, i32 addrspace(1)* undef
store volatile i32 %arg22, i32 addrspace(1)* undef
store volatile i32 %arg23, i32 addrspace(1)* undef
store volatile i32 %arg24, i32 addrspace(1)* undef
store volatile i32 %arg25, i32 addrspace(1)* undef
store volatile i32 %arg26, i32 addrspace(1)* undef
store volatile i32 %arg27, i32 addrspace(1)* undef
store volatile i32 %arg28, i32 addrspace(1)* undef
store volatile i32 %arg29, i32 addrspace(1)* undef
store volatile i32 %arg30, i32 addrspace(1)* undef
ret void
}
; frame[0] = kernel emergency stack slot
; frame[1] = callee emergency stack slot
; frame[2] = ID Y
; frame[3] = ID Z
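; i.e. ID y at byte offset 8 and ID z at offset 12 of the kernel's outgoing
; frame, after the two emergency slots at offsets 0 and 4.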
; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_x_stack_yz:
; GCN: enable_vgpr_workitem_id = 2
; GCN: s_mov_b32 s33, s7
; GCN: s_mov_b32 s32, s33
; GCN-DAG: v_mov_b32_e32 v31, v0
; GCN-DAG: buffer_store_dword v1, off, s[0:3], s32 offset:8
; GCN-DAG: buffer_store_dword v2, off, s[0:3], s32 offset:12
; GCN: s_swappc_b64
define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x_stack_yz() #1 {
call void @too_many_args_use_workitem_id_x_stack_yz(
i32 10, i32 20, i32 30, i32 40,
i32 50, i32 60, i32 70, i32 80,
i32 90, i32 100, i32 110, i32 120,
i32 130, i32 140, i32 150, i32 160,
i32 170, i32 180, i32 190, i32 200,
i32 210, i32 220, i32 230, i32 240,
i32 250, i32 260, i32 270, i32 280,
i32 290, i32 300, i32 310)
ret void
}
declare i32 @llvm.amdgcn.workitem.id.x() #0
declare i32 @llvm.amdgcn.workitem.id.y() #0
declare i32 @llvm.amdgcn.workitem.id.z() #0
attributes #0 = { nounwind readnone speculatable }
attributes #1 = { nounwind noinline }