AMDGPU: Initial implementation of calls

Includes a hack to fix the type selected for the GlobalAddress of
the called function; this will be properly fixed by changing the
default datalayout to use generic (64-bit) pointers for address
space 0.

llvm-svn: 309732
Author: Matt Arsenault
Date: 2017-08-01 19:54:18 +00:00
Parent: 253be33610
Commit: b62a4eb524
25 changed files with 1953 additions and 15 deletions
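
As a quick illustration of what this enables: the new tests further down feed llc IR like the sketch below, opting in through the hidden -amdgpu-function-calls flag this patch adds (call support stays off by default). The sketch is lifted from one of the new tests rather than anything beyond the patch.

; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -amdgpu-function-calls -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s

define void @void_func_void() #0 {
  ret void
}

; GCN-LABEL: {{^}}test_call_void_func_void:
define amdgpu_kernel void @test_call_void_func_void() {
  call void @void_func_void()
  ret void
}

attributes #0 = { nounwind noinline }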


@ -162,6 +162,10 @@ def CC_AMDGPU : CallingConv<[
"(State.getMachineFunction().getSubtarget()).getGeneration() >= "
"AMDGPUSubtarget::SOUTHERN_ISLANDS",
CCDelegateTo<CC_SI>>,
CCIf<"static_cast<const AMDGPUSubtarget&>"
"(State.getMachineFunction().getSubtarget()).getGeneration() >= "
"AMDGPUSubtarget::SOUTHERN_ISLANDS && State.getCallingConv() == CallingConv::C",
CCDelegateTo<CC_AMDGPU_Func>>,
CCIf<"static_cast<const AMDGPUSubtarget&>"
"(State.getMachineFunction().getSubtarget()).getGeneration() < "
"AMDGPUSubtarget::SOUTHERN_ISLANDS",


@ -33,10 +33,6 @@ public:
/// \returns The number of 32-bit sub-registers that are used when storing
/// values to the stack.
unsigned getStackWidth(const MachineFunction &MF) const;
bool hasFP(const MachineFunction &MF) const override {
return false;
}
};
} // end namespace llvm


@ -20,6 +20,7 @@
#include "AMDGPUIntrinsicInfo.h"
#include "AMDGPURegisterInfo.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "R600MachineFunctionInfo.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"


@ -30,7 +30,9 @@ using namespace llvm;
void AMDGPUInstrInfo::anchor() {}
AMDGPUInstrInfo::AMDGPUInstrInfo(const AMDGPUSubtarget &ST)
- : AMDGPUGenInstrInfo(-1, -1), ST(ST), AMDGPUASI(ST.getAMDGPUAS()) {}
+ : AMDGPUGenInstrInfo(AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN),
+ ST(ST),
+ AMDGPUASI(ST.getAMDGPUAS()) {}
// FIXME: This behaves strangely. If, for example, you have 32 load + stores,
// the first 16 loads will be interleaved with the stores, and the next 16 will


@ -82,6 +82,22 @@ def AMDGPUif : SDNode<"AMDGPUISD::IF", AMDGPUIfOp, [SDNPHasChain]>;
def AMDGPUelse : SDNode<"AMDGPUISD::ELSE", AMDGPUElseOp, [SDNPHasChain]>;
def AMDGPUloop : SDNode<"AMDGPUISD::LOOP", AMDGPULoopOp, [SDNPHasChain]>;
def callseq_start : SDNode<"ISD::CALLSEQ_START",
SDCallSeqStart<[ SDTCisVT<0, i32>, SDTCisVT<1, i32> ]>,
[SDNPHasChain, SDNPOutGlue]
>;
def callseq_end : SDNode<"ISD::CALLSEQ_END",
SDCallSeqEnd<[ SDTCisVT<0, i32>, SDTCisVT<1, i32> ]>,
[SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]
>;
def AMDGPUcall : SDNode<"AMDGPUISD::CALL",
SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>,
[SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
SDNPVariadic]
>;
def AMDGPUtrap : SDNode<"AMDGPUISD::TRAP",
SDTypeProfile<0, -1, [SDTCisVT<0, i16>]>,
[SDNPHasChain, SDNPVariadic, SDNPSideEffect, SDNPInGlue]


@ -121,6 +121,9 @@ bool AMDGPUMCInstLower::lowerOperand(const MachineOperand &MO,
MCOp = MCOperand::createExpr(Expr);
return true;
}
case MachineOperand::MO_RegisterMask:
// Regmasks are like implicit defs.
return false;
}
}


@ -56,6 +56,20 @@ const MCPhysReg *SIRegisterInfo::getCalleeSavedRegs(
}
}
const MCPhysReg *
SIRegisterInfo::getCalleeSavedRegsViaCopy(const MachineFunction *MF) const {
// FIXME
static MCPhysReg Regs[2];
const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
assert(!MFI->isEntryFunction());
Regs[0] = MFI->getFrameOffsetReg();
Regs[1] = AMDGPU::NoRegister;
return Regs;
}
const uint32_t *SIRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
CallingConv::ID CC) const {
switch (CC) {


@ -123,6 +123,12 @@ static cl::opt<bool> LateCFGStructurize(
cl::init(false),
cl::Hidden);
static cl::opt<bool> EnableAMDGPUFunctionCalls(
"amdgpu-function-calls",
cl::Hidden,
cl::desc("Enable AMDGPU function call support"),
cl::init(false));
extern "C" void LLVMInitializeAMDGPUTarget() {
// Register the target
RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget());
@ -269,6 +275,11 @@ AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
AMDGPUTargetMachine::~AMDGPUTargetMachine() = default;
bool AMDGPUTargetMachine::enableFunctionCalls() const {
return EnableAMDGPUFunctionCalls &&
getTargetTriple().getArch() == Triple::amdgcn;
}
StringRef AMDGPUTargetMachine::getGPUName(const Function &F) const {
Attribute GPUAttr = F.getFnAttribute("target-cpu");
return GPUAttr.hasAttribute(Attribute::None) ?


@ -69,6 +69,9 @@ public:
return -1;
return 0;
}
LLVM_READONLY
bool enableFunctionCalls() const;
};
//===----------------------------------------------------------------------===//


@ -27,6 +27,10 @@ public:
MachineBasicBlock &MBB) const override {}
int getFrameIndexReference(const MachineFunction &MF, int FI,
unsigned &FrameReg) const override;
bool hasFP(const MachineFunction &MF) const override {
return false;
}
};
} // end namespace llvm


@ -575,6 +575,41 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized(
}
}
MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr(
MachineFunction &MF,
MachineBasicBlock &MBB,
MachineBasicBlock::iterator I) const {
int64_t Amount = I->getOperand(0).getImm();
if (Amount == 0)
return MBB.erase(I);
const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
const DebugLoc &DL = I->getDebugLoc();
unsigned Opc = I->getOpcode();
bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode();
uint64_t CalleePopAmount = IsDestroy ? I->getOperand(1).getImm() : 0;
const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
if (!TFI->hasReservedCallFrame(MF)) {
unsigned Align = getStackAlignment();
Amount = alignTo(Amount, Align);
assert(isUInt<32>(Amount) && "exceeded stack address space size");
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
unsigned SPReg = MFI->getStackPtrOffsetReg();
unsigned Op = IsDestroy ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
BuildMI(MBB, I, DL, TII->get(Op), SPReg)
.addReg(SPReg)
.addImm(Amount * ST.getWavefrontSize());
} else if (CalleePopAmount != 0) {
llvm_unreachable("is this used?");
}
return MBB.erase(I);
}
void SIFrameLowering::emitDebuggerPrologue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
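
For intuition on the wavefront-size scaling in eliminateCallFramePseudoInstr above (a sketch assuming a 64-lane wave): the byte amount carried by the call-frame pseudo is multiplied by the wavefront size before being applied to the SGPR stack pointer, so a 32-byte outgoing argument area becomes 32 * 64 = 0x800, exactly the sequence the new byval call tests below check for:

; GCN: s_add_u32 s32, s32, 0x800
; GCN: s_swappc_b64
; GCN-NEXT: s_sub_u32 s32, s32, 0x800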


@ -39,6 +39,11 @@ public:
MachineFunction &MF,
RegScavenger *RS = nullptr) const override;
MachineBasicBlock::iterator
eliminateCallFramePseudoInstr(MachineFunction &MF,
MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI) const override;
private:
void emitFlatScratchInit(const SISubtarget &ST,
MachineFunction &MF,


@ -1201,9 +1201,13 @@ static void reservePrivateMemoryRegs(const TargetMachine &TM,
if (TM.getOptLevel() == CodeGenOpt::None)
HasStackObjects = true;
// For now assume stack access is needed in any callee functions, so we need
// the scratch registers to pass in.
bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
if (ST.isAmdCodeObjectV2(MF)) {
- if (HasStackObjects) {
+ if (RequiresStackAccess) {
// If we have stack objects, we unquestionably need the private buffer
// resource. For the Code Object V2 ABI, this will be the first 4 user
// SGPR inputs. We can reserve those and use them directly.
@ -1212,9 +1216,23 @@ static void reservePrivateMemoryRegs(const TargetMachine &TM,
MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER);
Info.setScratchRSrcReg(PrivateSegmentBufferReg);
unsigned PrivateSegmentWaveByteOffsetReg = TRI.getPreloadedValue(
MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
Info.setScratchWaveOffsetReg(PrivateSegmentWaveByteOffsetReg);
if (MFI.hasCalls()) {
// If we have calls, we need to keep the frame register in a register
// that won't be clobbered by a call, so ensure it is copied somewhere.
// This is not a problem for the scratch wave offset, because the same
// registers are reserved in all functions.
// FIXME: Nothing is really ensuring this is a call preserved register,
// it's just selected from the end so it happens to be.
unsigned ReservedOffsetReg
= TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
Info.setScratchWaveOffsetReg(ReservedOffsetReg);
} else {
unsigned PrivateSegmentWaveByteOffsetReg = TRI.getPreloadedValue(
MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
Info.setScratchWaveOffsetReg(PrivateSegmentWaveByteOffsetReg);
}
} else {
unsigned ReservedBufferReg
= TRI.reservedPrivateSegmentBufferReg(MF);
@ -1237,7 +1255,7 @@ static void reservePrivateMemoryRegs(const TargetMachine &TM,
// offset is still in an input SGPR.
Info.setScratchRSrcReg(ReservedBufferReg);
- if (HasStackObjects) {
+ if (HasStackObjects && !MFI.hasCalls()) {
unsigned ScratchWaveOffsetReg = TRI.getPreloadedValue(
MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
Info.setScratchWaveOffsetReg(ScratchWaveOffsetReg);
@ -1249,6 +1267,50 @@ static void reservePrivateMemoryRegs(const TargetMachine &TM,
}
}
bool SITargetLowering::supportSplitCSR(MachineFunction *MF) const {
const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
return !Info->isEntryFunction();
}
void SITargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
}
void SITargetLowering::insertCopiesSplitCSR(
MachineBasicBlock *Entry,
const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
if (!IStart)
return;
const TargetInstrInfo *TII = Subtarget->getInstrInfo();
MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
MachineBasicBlock::iterator MBBI = Entry->begin();
for (const MCPhysReg *I = IStart; *I; ++I) {
const TargetRegisterClass *RC = nullptr;
if (AMDGPU::SReg_64RegClass.contains(*I))
RC = &AMDGPU::SGPR_64RegClass;
else if (AMDGPU::SReg_32RegClass.contains(*I))
RC = &AMDGPU::SGPR_32RegClass;
else
llvm_unreachable("Unexpected register class in CSRsViaCopy!");
unsigned NewVR = MRI->createVirtualRegister(RC);
// Create copy from CSR to a virtual register.
Entry->addLiveIn(*I);
BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
.addReg(*I);
// Insert the copy-back instructions right before the terminator.
for (auto *Exit : Exits)
BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
TII->get(TargetOpcode::COPY), *I)
.addReg(NewVR);
}
}
SDValue SITargetLowering::LowerFormalArguments(
SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
@ -1589,6 +1651,22 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
}
// FIXME: Does sret work properly?
if (!Info->isEntryFunction()) {
const SIRegisterInfo *TRI
= static_cast<const SISubtarget *>(Subtarget)->getRegisterInfo();
const MCPhysReg *I =
TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
if (I) {
for (; *I; ++I) {
if (AMDGPU::SReg_64RegClass.contains(*I))
RetOps.push_back(DAG.getRegister(*I, MVT::i64));
else if (AMDGPU::SReg_32RegClass.contains(*I))
RetOps.push_back(DAG.getRegister(*I, MVT::i32));
else
llvm_unreachable("Unexpected register class in CSRsViaCopy!");
}
}
}
// Update chain and glue.
RetOps[0] = Chain;
@ -1601,6 +1679,296 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
return DAG.getNode(Opc, DL, MVT::Other, RetOps);
}
SDValue SITargetLowering::LowerCallResult(
SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool IsVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
SDValue ThisVal) const {
CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg);
// Assign locations to each value returned by this call.
SmallVector<CCValAssign, 16> RVLocs;
CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
*DAG.getContext());
CCInfo.AnalyzeCallResult(Ins, RetCC);
// Copy all of the result registers out of their specified physreg.
for (unsigned i = 0; i != RVLocs.size(); ++i) {
CCValAssign VA = RVLocs[i];
SDValue Val;
if (VA.isRegLoc()) {
Val = DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag);
Chain = Val.getValue(1);
InFlag = Val.getValue(2);
} else if (VA.isMemLoc()) {
report_fatal_error("TODO: return values in memory");
} else
llvm_unreachable("unknown argument location type");
switch (VA.getLocInfo()) {
case CCValAssign::Full:
break;
case CCValAssign::BCvt:
Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
break;
case CCValAssign::ZExt:
Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val,
DAG.getValueType(VA.getValVT()));
Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
break;
case CCValAssign::SExt:
Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val,
DAG.getValueType(VA.getValVT()));
Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
break;
case CCValAssign::AExt:
Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
break;
default:
llvm_unreachable("Unknown loc info!");
}
InVals.push_back(Val);
}
return Chain;
}
// The wave scratch offset register is used as the global base pointer.
SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const {
const AMDGPUTargetMachine &TM =
static_cast<const AMDGPUTargetMachine &>(getTargetMachine());
if (!TM.enableFunctionCalls())
return AMDGPUTargetLowering::LowerCall(CLI, InVals);
SelectionDAG &DAG = CLI.DAG;
const SDLoc &DL = CLI.DL;
SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
SDValue Chain = CLI.Chain;
SDValue Callee = CLI.Callee;
bool &IsTailCall = CLI.IsTailCall;
CallingConv::ID CallConv = CLI.CallConv;
bool IsVarArg = CLI.IsVarArg;
bool IsSibCall = false;
bool IsThisReturn = false;
MachineFunction &MF = DAG.getMachineFunction();
// TODO: Implement tail calls.
IsTailCall = false;
if (IsVarArg || MF.getTarget().Options.GuaranteedTailCallOpt) {
report_fatal_error("varargs and tail calls not implemented");
}
if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Callee)) {
// FIXME: Remove this hack for function pointer types.
const GlobalValue *GV = GA->getGlobal();
assert(Callee.getValueType() == MVT::i32);
Callee = DAG.getGlobalAddress(GV, DL, MVT::i64, GA->getOffset(),
false, GA->getTargetFlags());
}
const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
// Analyze operands of the call, assigning locations to each operand.
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
CCInfo.AnalyzeCallOperands(Outs, AssignFn);
// Get a count of how many bytes are to be pushed on the stack.
unsigned NumBytes = CCInfo.getNextStackOffset();
if (IsSibCall) {
// Since we're not changing the ABI to make this a tail call, the memory
// operands are already available in the caller's incoming argument space.
NumBytes = 0;
}
// FPDiff is the byte offset of the call's argument area from the callee's.
// Stores to callee stack arguments will be placed in FixedStackSlots offset
// by this amount for a tail call. In a sibling call it must be 0 because the
// caller will deallocate the entire stack and the callee still expects its
// arguments to begin at SP+0. Completely unused for non-tail calls.
int FPDiff = 0;
SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
// Adjust the stack pointer for the new arguments...
// These operations are automatically eliminated by the prolog/epilog pass
if (!IsSibCall) {
Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, DL);
unsigned OffsetReg = Info->getScratchWaveOffsetReg();
// In the HSA case, this should be an identity copy.
SDValue ScratchRSrcReg
= DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
RegsToPass.emplace_back(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
// TODO: Don't hardcode these registers and get from the callee function.
SDValue ScratchWaveOffsetReg
= DAG.getCopyFromReg(Chain, DL, OffsetReg, MVT::i32);
RegsToPass.emplace_back(AMDGPU::SGPR4, ScratchWaveOffsetReg);
}
// Stack pointer relative accesses are done by changing the offset SGPR. This
// is just the VGPR offset component.
SDValue StackPtr = DAG.getConstant(0, DL, MVT::i32);
SmallVector<SDValue, 8> MemOpChains;
MVT PtrVT = MVT::i32;
// Walk the register/memloc assignments, inserting copies/loads.
for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e;
++i, ++realArgIdx) {
CCValAssign &VA = ArgLocs[i];
SDValue Arg = OutVals[realArgIdx];
// Promote the value if needed.
switch (VA.getLocInfo()) {
case CCValAssign::Full:
break;
case CCValAssign::BCvt:
Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
break;
case CCValAssign::ZExt:
Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
break;
case CCValAssign::SExt:
Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
break;
case CCValAssign::AExt:
Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
break;
case CCValAssign::FPExt:
Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
break;
default:
llvm_unreachable("Unknown loc info!");
}
if (VA.isRegLoc()) {
RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
} else {
assert(VA.isMemLoc());
SDValue DstAddr;
MachinePointerInfo DstInfo;
unsigned LocMemOffset = VA.getLocMemOffset();
int32_t Offset = LocMemOffset;
SDValue PtrOff = DAG.getConstant(Offset, DL, MVT::i32);
PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
if (!IsTailCall) {
SDValue PtrOff = DAG.getTargetConstant(Offset, DL, MVT::i32);
DstAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
}
if (Outs[i].Flags.isByVal()) {
SDValue SizeNode =
DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
SDValue Cpy = DAG.getMemcpy(
Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(),
/*isVol = */ false, /*AlwaysInline = */ true,
/*isTailCall = */ false,
DstInfo, MachinePointerInfo());
MemOpChains.push_back(Cpy);
} else {
SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo);
MemOpChains.push_back(Store);
}
}
}
if (!MemOpChains.empty())
Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
// Build a sequence of copy-to-reg nodes chained together with token chain
// and flag operands which copy the outgoing args into the appropriate regs.
SDValue InFlag;
for (auto &RegToPass : RegsToPass) {
Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
RegToPass.second, InFlag);
InFlag = Chain.getValue(1);
}
// We don't usually want to end the call-sequence here because we would tidy
// the frame up *after* the call, however in the ABI-changing tail-call case
// we've carefully laid out the parameters so that when sp is reset they'll be
// in the correct location.
if (IsTailCall && !IsSibCall) {
Chain = DAG.getCALLSEQ_END(Chain,
DAG.getTargetConstant(NumBytes, DL, MVT::i32),
DAG.getTargetConstant(0, DL, MVT::i32),
InFlag, DL);
InFlag = Chain.getValue(1);
}
std::vector<SDValue> Ops;
Ops.push_back(Chain);
Ops.push_back(Callee);
if (IsTailCall) {
// Each tail call may have to adjust the stack by a different amount, so
// this information must travel along with the operation for eventual
// consumption by emitEpilogue.
Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
}
// Add argument registers to the end of the list so that they are known live
// into the call.
for (auto &RegToPass : RegsToPass) {
Ops.push_back(DAG.getRegister(RegToPass.first,
RegToPass.second.getValueType()));
}
// Add a register mask operand representing the call-preserved registers.
const AMDGPURegisterInfo *TRI = Subtarget->getRegisterInfo();
const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
assert(Mask && "Missing call preserved mask for calling convention");
Ops.push_back(DAG.getRegisterMask(Mask));
if (InFlag.getNode())
Ops.push_back(InFlag);
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
// If we're doing a tail call, use a TC_RETURN here rather than an
// actual call instruction.
if (IsTailCall) {
MF.getFrameInfo().setHasTailCall();
llvm_unreachable("not implemented");
}
// Returns a chain and a flag for retval copy to use.
SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, NodeTys, Ops);
Chain = Call.getValue(0);
InFlag = Call.getValue(1);
uint64_t CalleePopBytes = 0;
Chain = DAG.getCALLSEQ_END(Chain, DAG.getTargetConstant(NumBytes, DL, MVT::i32),
DAG.getTargetConstant(CalleePopBytes, DL, MVT::i32),
InFlag, DL);
if (!Ins.empty())
InFlag = Chain.getValue(1);
// Handle result values, copying them out of physregs into vregs that we
// return.
return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG,
InVals, IsThisReturn,
IsThisReturn ? OutVals[0] : SDValue());
}
unsigned SITargetLowering::getRegisterByName(const char* RegName, EVT VT,
SelectionDAG &DAG) const {
unsigned Reg = StringSwitch<unsigned>(RegName)
@ -2266,6 +2634,27 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
MI.eraseFromParent();
return BB;
}
case AMDGPU::ADJCALLSTACKUP:
case AMDGPU::ADJCALLSTACKDOWN: {
const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
MachineInstrBuilder MIB(*MF, &MI);
MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine)
.addReg(Info->getStackPtrOffsetReg(), RegState::Implicit);
return BB;
}
case AMDGPU::SI_CALL: {
const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
const DebugLoc &DL = MI.getDebugLoc();
unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
MachineInstrBuilder MIB =
BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_SWAPPC_B64), ReturnAddrReg);
for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
MIB.add(MI.getOperand(I));
MIB.setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
MI.eraseFromParent();
return BB;
}
default:
return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
}
@ -2931,13 +3320,16 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
SDValue Op,
SelectionDAG &DAG) const {
GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
+ const GlobalValue *GV = GSD->getGlobal();
if (GSD->getAddressSpace() != AMDGPUASI.CONSTANT_ADDRESS &&
- GSD->getAddressSpace() != AMDGPUASI.GLOBAL_ADDRESS)
+ GSD->getAddressSpace() != AMDGPUASI.GLOBAL_ADDRESS &&
+ // FIXME: It isn't correct to rely on the type of the pointer. This should
+ // be removed when address space 0 is 64-bit.
+ !GV->getType()->getElementType()->isFunctionTy())
return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
SDLoc DL(GSD);
- const GlobalValue *GV = GSD->getGlobal();
EVT PtrVT = Op.getValueType();
if (shouldEmitFixup(GV))


@ -183,6 +183,12 @@ public:
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override;
bool supportSplitCSR(MachineFunction *MF) const override;
void initializeSplitCSR(MachineBasicBlock *Entry) const override;
void insertCopiesSplitCSR(
MachineBasicBlock *Entry,
const SmallVectorImpl<MachineBasicBlock *> &Exits) const override;
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv,
bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins,
@ -199,6 +205,15 @@ public:
const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL,
SelectionDAG &DAG) const override;
SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins,
const SDLoc &DL, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
SDValue ThisVal) const;
SDValue LowerCall(CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const override;
unsigned getRegisterByName(const char* RegName, EVT VT,
SelectionDAG &DAG) const override;


@ -317,6 +317,45 @@ def SI_RETURN_TO_EPILOG : SPseudoInstSI <
let DisableWQM = 1;
}
// Return for returning function calls.
def SI_RETURN : SPseudoInstSI <
(outs), (ins), [],
"; return"> {
let isTerminator = 1;
let isBarrier = 1;
let isReturn = 1;
let SchedRW = [WriteBranch];
}
// Function call pseudo; expanded to s_swappc_b64 by the custom inserter.
def SI_CALL : SPseudoInstSI <
(outs), (ins SSrc_b64:$src0), [(AMDGPUcall i64:$src0)],
"; call $src0"> {
let Size = 4;
let isCall = 1;
let SchedRW = [WriteBranch];
let usesCustomInserter = 1;
}
def ADJCALLSTACKUP : SPseudoInstSI<
(outs), (ins i32imm:$amt0, i32imm:$amt1),
[(callseq_start timm:$amt0, timm:$amt1)],
"; adjcallstackup $amt0 $amt1"> {
let Size = 8; // Worst case. (s_add_u32 + constant)
let FixedSize = 1;
let hasSideEffects = 1;
let usesCustomInserter = 1;
}
def ADJCALLSTACKDOWN : SPseudoInstSI<
(outs), (ins i32imm:$amt1, i32imm:$amt2),
[(callseq_end timm:$amt1, timm:$amt2)],
"; adjcallstackdown $amt1"> {
let Size = 8; // Worst case. (s_add_u32 + constant)
let hasSideEffects = 1;
let usesCustomInserter = 1;
}
let Defs = [M0, EXEC],
UseNamedOperandTable = 1 in {


@ -236,8 +236,15 @@ bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const
return true;
}
- bool SIRegisterInfo::requiresFrameIndexScavenging(const MachineFunction &MF) const {
- return MF.getFrameInfo().hasStackObjects();
+ bool SIRegisterInfo::requiresFrameIndexScavenging(
+ const MachineFunction &MF) const {
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ if (MFI.hasStackObjects())
+ return true;
+ // May need to deal with callee saved registers.
+ const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+ return !Info->isEntryFunction();
}
bool SIRegisterInfo::requiresFrameIndexReplacementScavenging(


@ -63,6 +63,7 @@ public:
BitVector getReservedRegs(const MachineFunction &MF) const override;
const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override;
const MCPhysReg *getCalleeSavedRegsViaCopy(const MachineFunction *MF) const;
const uint32_t *getCallPreservedMask(const MachineFunction &MF,
CallingConv::ID) const override;


@ -269,6 +269,18 @@ def VGPR_512 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7,
// Register classes used as source and destination
//===----------------------------------------------------------------------===//
def Pseudo_SReg_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
(add FP_REG, SP_REG, SCRATCH_WAVE_OFFSET_REG)> {
let isAllocatable = 0;
let CopyCost = -1;
}
def Pseudo_SReg_128 : RegisterClass<"AMDGPU", [v4i32, v2i64], 32,
(add PRIVATE_RSRC_REG)> {
let isAllocatable = 0;
let CopyCost = -1;
}
// Subset of SReg_32 without M0 for SMRD instructions and alike.
// See comments in SIInstructions.td for more info.
def SReg_32_XM0_XEXEC : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,


@ -0,0 +1,27 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -amdgpu-function-calls -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -amdgpu-function-calls -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-function-calls -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
define void @void_func_void() #2 {
ret void
}
; GCN-LABEL: {{^}}test_call_void_func_void:
define amdgpu_kernel void @test_call_void_func_void() {
call void @void_func_void()
ret void
}
define void @void_func_void_clobber_s40_s41() #2 {
call void asm sideeffect "", "~{SGPR40_SGPR41}"() #0
ret void
}
define amdgpu_kernel void @test_call_void_func_void_clobber_s40_s41() {
call void @void_func_void_clobber_s40_s41()
ret void
}
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind noinline }


@ -0,0 +1,235 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -amdgpu-function-calls -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -amdgpu-function-calls -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s
%struct.ByValStruct = type { [4 x i32] }
; GCN-LABEL: {{^}}void_func_byval_struct:
; GCN: s_mov_b32 s5, s32
; GCN: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s5{{$}}
; GCN-NOT: s32
; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s5{{$}}
; GCN-NOT: s32
; GCN: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s5 offset:16{{$}}
; GCN-NOT: s32
; GCN: buffer_store_dword [[LOAD1]], off, s[0:3], s5 offset:16{{$}}
; GCN-NOT: s32
define void @void_func_byval_struct(%struct.ByValStruct* byval noalias nocapture align 4 %arg0, %struct.ByValStruct* byval noalias nocapture align 4 %arg1) #1 {
entry:
%arrayidx = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct* %arg0, i32 0, i32 0, i32 0
%tmp = load volatile i32, i32* %arrayidx, align 4
%add = add nsw i32 %tmp, 1
store volatile i32 %add, i32* %arrayidx, align 4
%arrayidx2 = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct* %arg1, i32 0, i32 0, i32 0
%tmp1 = load volatile i32, i32* %arrayidx2, align 4
%add3 = add nsw i32 %tmp1, 2
store volatile i32 %add3, i32* %arrayidx2, align 4
store volatile i32 9, i32 addrspace(1)* null, align 4
ret void
}
; GCN-LABEL: {{^}}void_func_byval_struct_non_leaf:
; GCN: s_mov_b32 s5, s32
; GCN: buffer_store_dword v32
; GCN: v_writelane_b32
; GCN-DAG: s_add_u32 s32, s32, 0x900{{$}}
; GCN-DAG: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s5{{$}}
; GCN: v_add_i32_e32 [[ADD0:v[0-9]+]], vcc, 1, [[LOAD0]]
; GCN: buffer_store_dword [[ADD0]], off, s[0:3], s5{{$}}
; GCN: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s5 offset:16{{$}}
; GCN: v_add_i32_e32 [[ADD1:v[0-9]+]], vcc, 2, [[LOAD1]]
; GCN: s_swappc_b64
; GCN: buffer_store_dword [[ADD1]], off, s[0:3], s5 offset:16{{$}}
; GCN: v_readlane_b32
; GCN: buffer_load_dword v32,
; GCN: s_sub_u32 s32, s32, 0x900{{$}}
; GCN: s_setpc_b64
define void @void_func_byval_struct_non_leaf(%struct.ByValStruct* byval noalias nocapture align 4 %arg0, %struct.ByValStruct* byval noalias nocapture align 4 %arg1) #1 {
entry:
%arrayidx = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct* %arg0, i32 0, i32 0, i32 0
%tmp = load volatile i32, i32* %arrayidx, align 4
%add = add nsw i32 %tmp, 1
store volatile i32 %add, i32* %arrayidx, align 4
%arrayidx2 = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct* %arg1, i32 0, i32 0, i32 0
%tmp1 = load volatile i32, i32* %arrayidx2, align 4
%add3 = add nsw i32 %tmp1, 2
call void @external_void_func_void()
store volatile i32 %add3, i32* %arrayidx2, align 4
store volatile i32 9, i32 addrspace(1)* null, align 4
ret void
}
; GCN-LABEL: {{^}}call_void_func_byval_struct_func:
; GCN: s_mov_b32 s5, s32
; GCN: s_add_u32 s32, s32, 0xa00{{$}}
; GCN: v_writelane_b32
; GCN-DAG: s_add_u32 s32, s32, 0x800{{$}}
; GCN-DAG: v_mov_b32_e32 [[NINE:v[0-9]+]], 9
; GCN-DAG: v_mov_b32_e32 [[THIRTEEN:v[0-9]+]], 13
; VI-DAG: v_lshrrev_b32_e64 v{{[0-9]+}}, 6
; CI-DAG: v_lshr_b32_e64 v{{[0-9]+}}, s{{[0-9]+}}, 6
; GCN-DAG: v_add_i32_e64 [[FI_ADD0:v[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 8,
; GCN-DAG: v_or_b32_e32 [[FI_OR0:v[0-9]+]], 4, [[FI_ADD0]]
; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s5 offset:8
; GCN-DAG: buffer_store_dword [[THIRTEEN]], off, s[0:3], s5 offset:24
; GCN-DAG: buffer_load_dword [[LOAD0:v[0-9]+]], [[FI_OR0]], s[0:3], s4 offen offset:4
; GCN-DAG: buffer_load_dword [[LOAD1:v[0-9]+]], [[FI_OR0]], s[0:3], s4 offen offset:8
; FIXME: or fails to combine with add, so FI doesn't fold and scratch wave offset is used
; VI-DAG: v_lshrrev_b32_e64 v{{[0-9]+}}, 6
; CI-DAG: v_lshr_b32_e64 v{{[0-9]+}}, s{{[0-9]+}}, 6
; GCN-DAG: v_add_i32_e64 [[FI_ADD1:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 24,
; GCN-DAG: v_or_b32_e32 [[FI_OR1:v[0-9]+]], 4, [[FI_ADD1]]
; GCN-DAG: buffer_store_dword [[LOAD0]], off, s[0:3], s32 offset:8
; GCN-DAG: buffer_store_dword [[LOAD1]], off, s[0:3], s32 offset:12
; GCN-DAG: buffer_load_dword [[LOAD2:v[0-9]+]], off, s[0:3], s5 offset:8
; GCN-DAG: buffer_load_dword [[LOAD3:v[0-9]+]], off, s[0:3], s5 offset:12
; GCN-DAG: buffer_store_dword [[LOAD2]], off, s[0:3], s32{{$}}
; GCN-DAG: buffer_store_dword [[LOAD3]], off, s[0:3], s32 offset:4
; GCN: buffer_load_dword [[LOAD4:v[0-9]+]], [[FI_OR1]], s[0:3], s4 offen offset:4
; GCN: buffer_load_dword [[LOAD5:v[0-9]+]], [[FI_OR1]], s[0:3], s4 offen offset:8
; GCN: buffer_store_dword [[LOAD5]], off, s[0:3], s32 offset:28
; GCN: buffer_store_dword [[LOAD4]], off, s[0:3], s32 offset:24
; GCN-DAG: buffer_load_dword [[LOAD6:v[0-9]+]], off, s[0:3], s5 offset:24
; GCN-DAG: buffer_load_dword [[LOAD7:v[0-9]+]], off, s[0:3], s5 offset:28
; GCN-DAG: buffer_store_dword [[LOAD6]], off, s[0:3], s32 offset:16
; GCN-DAG: buffer_store_dword [[LOAD7]], off, s[0:3], s32 offset:20
; GCN: s_swappc_b64
; GCN-NEXT: s_sub_u32 s32, s32, 0x800{{$}}
; GCN: v_readlane_b32
; GCN: s_sub_u32 s32, s32, 0xa00{{$}}
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @call_void_func_byval_struct_func() #0 {
entry:
%arg0 = alloca %struct.ByValStruct, align 4
%arg1 = alloca %struct.ByValStruct, align 4
%tmp = bitcast %struct.ByValStruct* %arg0 to i8*
call void @llvm.lifetime.start.p0i8(i64 32, i8* %tmp)
%tmp1 = bitcast %struct.ByValStruct* %arg1 to i8*
call void @llvm.lifetime.start.p0i8(i64 32, i8* %tmp1)
%arrayidx = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct* %arg0, i32 0, i32 0, i32 0
store volatile i32 9, i32* %arrayidx, align 4
%arrayidx2 = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct* %arg1, i32 0, i32 0, i32 0
store volatile i32 13, i32* %arrayidx2, align 4
call void @void_func_byval_struct(%struct.ByValStruct* byval nonnull align 4 %arg0, %struct.ByValStruct* byval nonnull align 4 %arg1)
call void @llvm.lifetime.end.p0i8(i64 32, i8* %tmp1)
call void @llvm.lifetime.end.p0i8(i64 32, i8* %tmp)
ret void
}
; GCN-LABEL: {{^}}call_void_func_byval_struct_kernel:
; GCN: s_mov_b32 s33, s7
; GCN: s_add_u32 s32, s33, 0xa00{{$}}
; GCN-DAG: v_mov_b32_e32 [[NINE:v[0-9]+]], 9
; GCN-DAG: v_mov_b32_e32 [[THIRTEEN:v[0-9]+]], 13
; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s33 offset:8
; GCN: buffer_store_dword [[THIRTEEN]], off, s[0:3], s33 offset:24
; GCN-DAG: s_add_u32 s32, s32, 0x800{{$}}
; FIXME: Fold offset
; GCN-DAG: v_or_b32_e32 [[OR_FI0:v[0-9]+]], 4,
; GCN: buffer_load_dword [[LOAD0:v[0-9]+]], [[OR_FI0]], s[0:3], s33 offen offset:4
; GCN: buffer_load_dword [[LOAD1:v[0-9]+]], [[OR_FI0]], s[0:3], s33 offen offset:8
; FIXME: Fold offset
; GCN-DAG: v_or_b32_e32 [[OR_FI1:v[0-9]+]], 4,
; GCN: buffer_store_dword [[LOAD1]], off, s[0:3], s32 offset:12
; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s32 offset:8
; GCN-DAG: buffer_load_dword [[LOAD2:v[0-9]+]], off, s[0:3], s33 offset:8
; GCN-DAG: buffer_load_dword [[LOAD3:v[0-9]+]], off, s[0:3], s33 offset:12
; GCN: buffer_store_dword [[LOAD3]], off, s[0:3], s32 offset:4
; GCN: buffer_store_dword [[LOAD2]], off, s[0:3], s32{{$}}
; GCN: buffer_load_dword [[LOAD4:v[0-9]+]], [[OR_FI1]], s[0:3], s33 offen offset:4
; GCN: buffer_load_dword [[LOAD5:v[0-9]+]], [[OR_FI1]], s[0:3], s33 offen offset:8
; GCN: buffer_store_dword [[LOAD5]], off, s[0:3], s32 offset:28
; GCN: buffer_store_dword [[LOAD4]], off, s[0:3], s32 offset:24
; GCN-DAG: buffer_load_dword [[LOAD6:v[0-9]+]], off, s[0:3], s33 offset:24
; GCN-DAG: buffer_load_dword [[LOAD7:v[0-9]+]], off, s[0:3], s33 offset:28
; GCN-DAG: buffer_store_dword [[LOAD6]], off, s[0:3], s32 offset:16
; GCN-DAG: buffer_store_dword [[LOAD7]], off, s[0:3], s32 offset:20
; GCN: s_swappc_b64
; FIXME: Dead SP modification
; GCN-NEXT: s_sub_u32 s32, s32, 0x800{{$}}
; GCN-NEXT: s_endpgm
define amdgpu_kernel void @call_void_func_byval_struct_kernel() #0 {
entry:
%arg0 = alloca %struct.ByValStruct, align 4
%arg1 = alloca %struct.ByValStruct, align 4
%tmp = bitcast %struct.ByValStruct* %arg0 to i8*
call void @llvm.lifetime.start.p0i8(i64 32, i8* %tmp)
%tmp1 = bitcast %struct.ByValStruct* %arg1 to i8*
call void @llvm.lifetime.start.p0i8(i64 32, i8* %tmp1)
%arrayidx = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct* %arg0, i32 0, i32 0, i32 0
store volatile i32 9, i32* %arrayidx, align 4
%arrayidx2 = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct* %arg1, i32 0, i32 0, i32 0
store volatile i32 13, i32* %arrayidx2, align 4
call void @void_func_byval_struct(%struct.ByValStruct* byval nonnull align 4 %arg0, %struct.ByValStruct* byval nonnull align 4 %arg1)
call void @llvm.lifetime.end.p0i8(i64 32, i8* %tmp1)
call void @llvm.lifetime.end.p0i8(i64 32, i8* %tmp)
ret void
}
; GCN-LABEL: {{^}}call_void_func_byval_struct_kernel_no_frame_pointer_elim:
define amdgpu_kernel void @call_void_func_byval_struct_kernel_no_frame_pointer_elim() #2 {
entry:
%arg0 = alloca %struct.ByValStruct, align 4
%arg1 = alloca %struct.ByValStruct, align 4
%tmp = bitcast %struct.ByValStruct* %arg0 to i8*
call void @llvm.lifetime.start.p0i8(i64 32, i8* %tmp)
%tmp1 = bitcast %struct.ByValStruct* %arg1 to i8*
call void @llvm.lifetime.start.p0i8(i64 32, i8* %tmp1)
%arrayidx = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct* %arg0, i32 0, i32 0, i32 0
store volatile i32 9, i32* %arrayidx, align 4
%arrayidx2 = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct* %arg1, i32 0, i32 0, i32 0
store volatile i32 13, i32* %arrayidx2, align 4
call void @void_func_byval_struct(%struct.ByValStruct* byval nonnull align 4 %arg0, %struct.ByValStruct* byval nonnull align 4 %arg1)
call void @llvm.lifetime.end.p0i8(i64 32, i8* %tmp1)
call void @llvm.lifetime.end.p0i8(i64 32, i8* %tmp)
ret void
}
declare void @external_void_func_void() #0
declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #3
declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #3
attributes #0 = { nounwind }
attributes #1 = { noinline norecurse nounwind }
attributes #2 = { nounwind norecurse "no-frame-pointer-elim"="true" }


@ -0,0 +1,527 @@
; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -amdgpu-function-calls -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,MESA %s
; RUN: llc -march=amdgcn -mcpu=hawaii -amdgpu-function-calls -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI,MESA %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-function-calls -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,VI,MESA %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -mattr=-flat-for-global -amdgpu-function-calls -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,HSA %s
declare void @external_void_func_i1(i1) #0
declare void @external_void_func_i1_signext(i1 signext) #0
declare void @external_void_func_i1_zeroext(i1 zeroext) #0
declare void @external_void_func_i8(i8) #0
declare void @external_void_func_i8_signext(i8 signext) #0
declare void @external_void_func_i8_zeroext(i8 zeroext) #0
declare void @external_void_func_i16(i16) #0
declare void @external_void_func_i16_signext(i16 signext) #0
declare void @external_void_func_i16_zeroext(i16 zeroext) #0
declare void @external_void_func_i32(i32) #0
declare void @external_void_func_i64(i64) #0
declare void @external_void_func_f16(half) #0
declare void @external_void_func_f32(float) #0
declare void @external_void_func_f64(double) #0
declare void @external_void_func_v2i16(<2 x i16>) #0
declare void @external_void_func_v2f16(<2 x half>) #0
declare void @external_void_func_v2i32(<2 x i32>) #0
declare void @external_void_func_v3i32(<3 x i32>) #0
declare void @external_void_func_v4i32(<4 x i32>) #0
declare void @external_void_func_v8i32(<8 x i32>) #0
declare void @external_void_func_v16i32(<16 x i32>) #0
declare void @external_void_func_v32i32(<32 x i32>) #0
declare void @external_void_func_v32i32_i32(<32 x i32>, i32) #0
; return value and argument
declare i32 @external_i32_func_i32(i32) #0
; Structs
declare void @external_void_func_struct_i8_i32({ i8, i32 }) #0
declare void @external_void_func_byval_struct_i8_i32({ i8, i32 }* byval) #0
declare void @external_void_func_sret_struct_i8_i32_byval_struct_i8_i32({ i8, i32 }* sret, { i8, i32 }* byval) #0
declare void @external_void_func_v16i8(<16 x i8>) #0
; FIXME: Should be passing -1
; GCN-LABEL: {{^}}test_call_external_void_func_i1_imm:
; MESA: s_mov_b32 s36, SCRATCH_RSRC_DWORD
; MESA-DAG: s_mov_b64 s[0:1], s[36:37]
; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i1@rel32@lo+4
; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i1@rel32@hi+4
; GCN-DAG: v_mov_b32_e32 v0, 1{{$}}
; MESA-DAG: s_mov_b64 s[2:3], s[38:39]
; GCN: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
; GCN-NEXT: s_endpgm
define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 {
call void @external_void_func_i1(i1 true)
ret void
}
; GCN-LABEL: {{^}}test_call_external_void_func_i1_signext:
; MESA: s_mov_b32 s33, s3{{$}}
; HSA: s_mov_b32 s33, s9{{$}}
; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i1_signext@rel32@lo+4
; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i1_signext@rel32@hi+4
; GCN-NEXT: buffer_load_ubyte [[VAR:v[0-9]+]]
; HSA-NEXT: s_mov_b32 s4, s33
; HSA-NEXT: s_mov_b32 s32, s33
; MESA-DAG: s_mov_b32 s4, s33{{$}}
; MESA-DAG: s_mov_b32 s32, s33{{$}}
; GCN: s_waitcnt vmcnt(0)
; GCN-NEXT: v_bfe_i32 v0, v0, 0, 1
; GCN-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
; GCN-NEXT: s_endpgm
define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 {
%var = load volatile i1, i1 addrspace(1)* undef
call void @external_void_func_i1_signext(i1 %var)
ret void
}
; FIXME: load should be scheduled before getpc
; GCN-LABEL: {{^}}test_call_external_void_func_i1_zeroext:
; MESA: s_mov_b32 s33, s3{{$}}
; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i1_zeroext@rel32@lo+4
; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i1_zeroext@rel32@hi+4
; GCN-NEXT: buffer_load_ubyte v0
; GCN-DAG: s_mov_b32 s4, s33{{$}}
; GCN-DAG: s_mov_b32 s32, s33{{$}}
; GCN: s_waitcnt vmcnt(0)
; GCN-NEXT: v_and_b32_e32 v0, 1, v0
; GCN-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
; GCN-NEXT: s_endpgm
define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 {
%var = load volatile i1, i1 addrspace(1)* undef
call void @external_void_func_i1_zeroext(i1 %var)
ret void
}
; GCN-LABEL: {{^}}test_call_external_void_func_i8_imm:
; MESA-DAG: s_mov_b32 s33, s3{{$}}
; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i8@rel32@lo+4
; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i8@rel32@hi+4
; GCN-NEXT: v_mov_b32_e32 v0, 0x7b
; HSA-DAG: s_mov_b32 s4, s33{{$}}
; GCN-DAG: s_mov_b32 s32, s33{{$}}
; GCN: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
; GCN-NEXT: s_endpgm
define amdgpu_kernel void @test_call_external_void_func_i8_imm(i32) #0 {
call void @external_void_func_i8(i8 123)
ret void
}
; FIXME: don't wait before call
; GCN-LABEL: {{^}}test_call_external_void_func_i8_signext:
; HSA-DAG: s_mov_b32 s33, s9{{$}}
; MESA-DAG: s_mov_b32 s33, s3{{$}}
; GCN-DAG: buffer_load_sbyte v0
; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i8_signext@rel32@lo+4
; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i8_signext@rel32@hi+4
; GCN-DAG: s_mov_b32 s4, s33
; GCN-DAG: s_mov_b32 s32, s3
; GCN: s_waitcnt vmcnt(0)
; GCN-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
; GCN-NEXT: s_endpgm
define amdgpu_kernel void @test_call_external_void_func_i8_signext(i32) #0 {
%var = load volatile i8, i8 addrspace(1)* undef
call void @external_void_func_i8_signext(i8 %var)
ret void
}
; GCN-LABEL: {{^}}test_call_external_void_func_i8_zeroext:
; MESA-DAG: s_mov_b32 s33, s3{{$}}
; HSA-DAG: s_mov_b32 s33, s9{{$}}
; GCN-DAG: buffer_load_ubyte v0
; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i8_zeroext@rel32@lo+4
; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i8_zeroext@rel32@hi+4
; GCN-DAG: s_mov_b32 s4, s33
; GCN-DAG: s_mov_b32 s32, s33
; GCN: s_waitcnt vmcnt(0)
; GCN-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
; GCN-NEXT: s_endpgm
define amdgpu_kernel void @test_call_external_void_func_i8_zeroext(i32) #0 {
%var = load volatile i8, i8 addrspace(1)* undef
call void @external_void_func_i8_zeroext(i8 %var)
ret void
}
; GCN-LABEL: {{^}}test_call_external_void_func_i16_imm:
; GCN-DAG: v_mov_b32_e32 v0, 0x7b{{$}}
; GCN-DAG: s_mov_b32 s4, s33
; GCN-DAG: s_mov_b32 s32, s33
; GCN: s_swappc_b64
define amdgpu_kernel void @test_call_external_void_func_i16_imm() #0 {
call void @external_void_func_i16(i16 123)
ret void
}
; GCN-LABEL: {{^}}test_call_external_void_func_i16_signext:
; MESA-DAG: s_mov_b32 s33, s3{{$}}
; GCN-DAG: buffer_load_sshort v0
; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i16_signext@rel32@lo+4
; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i16_signext@rel32@hi+4
; GCN-DAG: s_mov_b32 s4, s33
; GCN-DAG: s_mov_b32 s32, s33
; GCN: s_waitcnt vmcnt(0)
; GCN-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
; GCN-NEXT: s_endpgm
define amdgpu_kernel void @test_call_external_void_func_i16_signext(i32) #0 {
%var = load volatile i16, i16 addrspace(1)* undef
call void @external_void_func_i16_signext(i16 %var)
ret void
}
; GCN-LABEL: {{^}}test_call_external_void_func_i16_zeroext:
; MESA-DAG: s_mov_b32 s33, s3{{$}}
; GCN-DAG: buffer_load_ushort v0
; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i16_zeroext@rel32@lo+4
; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i16_zeroext@rel32@hi+4
; GCN-DAG: s_mov_b32 s4, s33
; GCN-DAG: s_mov_b32 s32, s33
; GCN: s_waitcnt vmcnt(0)
; GCN-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
; GCN-NEXT: s_endpgm
define amdgpu_kernel void @test_call_external_void_func_i16_zeroext(i32) #0 {
%var = load volatile i16, i16 addrspace(1)* undef
call void @external_void_func_i16_zeroext(i16 %var)
ret void
}
; GCN-LABEL: {{^}}test_call_external_void_func_i32_imm:
; MESA-DAG: s_mov_b32 s33, s3{{$}}
; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i32@rel32@lo+4
; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i32@rel32@hi+4
; GCN: v_mov_b32_e32 v0, 42
; GCN-DAG: s_mov_b32 s4, s33
; GCN-DAG: s_mov_b32 s32, s33
; GCN: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
; GCN-NEXT: s_endpgm
define amdgpu_kernel void @test_call_external_void_func_i32_imm(i32) #0 {
call void @external_void_func_i32(i32 42)
ret void
}
; GCN-LABEL: {{^}}test_call_external_void_func_i64_imm:
; GCN-DAG: s_movk_i32 [[K0:s[0-9]+]], 0x7b{{$}}
; GCN-DAG: s_mov_b32 [[K1:s[0-9]+]], 0{{$}}
; GCN-DAG: v_mov_b32_e32 v0, [[K0]]
; GCN-DAG: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
; GCN-DAG: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i64@rel32@lo+4
; GCN-DAG: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i64@rel32@hi+4
; GCN-DAG: v_mov_b32_e32 v1, [[K1]]
; GCN: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
; GCN-NEXT: s_endpgm
define amdgpu_kernel void @test_call_external_void_func_i64_imm() #0 {
call void @external_void_func_i64(i64 123)
ret void
}
; GCN-LABEL: {{^}}test_call_external_void_func_f16_imm:
; VI: v_mov_b32_e32 v0, 0x4400
; CI: v_mov_b32_e32 v0, 4.0
; GCN-NOT: v0
; GCN: s_swappc_b64
define amdgpu_kernel void @test_call_external_void_func_f16_imm() #0 {
call void @external_void_func_f16(half 4.0)
ret void
}
; GCN-LABEL: {{^}}test_call_external_void_func_f32_imm:
; GCN: v_mov_b32_e32 v0, 4.0
; GCN-NOT: v0
; GCN: s_swappc_b64
define amdgpu_kernel void @test_call_external_void_func_f32_imm() #0 {
call void @external_void_func_f32(float 4.0)
ret void
}
; GCN-LABEL: {{^}}test_call_external_void_func_f64_imm:
; GCN: v_mov_b32_e32 v0, 0{{$}}
; GCN: v_mov_b32_e32 v1, 0x40100000
; GCN: s_swappc_b64
define amdgpu_kernel void @test_call_external_void_func_f64_imm() #0 {
call void @external_void_func_f64(double 4.0)
ret void
}
; GCN-LABEL: {{^}}test_call_external_void_func_v2i16:
; GFX9: buffer_load_dword v0
; GFX9-NOT: v0
; GFX9: s_swappc_b64
define amdgpu_kernel void @test_call_external_void_func_v2i16() #0 {
%val = load <2 x i16>, <2 x i16> addrspace(1)* undef
call void @external_void_func_v2i16(<2 x i16> %val)
ret void
}
; GCN-LABEL: {{^}}test_call_external_void_func_v2f16:
; GFX9: buffer_load_dword v0
; GFX9-NOT: v0
; GFX9: s_swappc_b64
define amdgpu_kernel void @test_call_external_void_func_v2f16() #0 {
%val = load <2 x half>, <2 x half> addrspace(1)* undef
call void @external_void_func_v2f16(<2 x half> %val)
ret void
}
; GCN-LABEL: {{^}}test_call_external_void_func_v2i32:
; GCN: buffer_load_dwordx2 v[0:1]
; GCN: s_waitcnt
; GCN-NEXT: s_swappc_b64
define amdgpu_kernel void @test_call_external_void_func_v2i32() #0 {
%val = load <2 x i32>, <2 x i32> addrspace(1)* undef
call void @external_void_func_v2i32(<2 x i32> %val)
ret void
}
; FIXME: Passing 4th
; GCN-LABEL: {{^}}test_call_external_void_func_v3i32_imm:
; HSA-DAG: s_mov_b32 s33, s9
; MESA-DAG: s_mov_b32 s33, s3{{$}}
; GCN-DAG: v_mov_b32_e32 v0
; GCN-DAG: v_mov_b32_e32 v1
; GCN-DAG: v_mov_b32_e32 v2
; GCN-DAG: v_mov_b32_e32 v3
; GCN: s_swappc_b64
define amdgpu_kernel void @test_call_external_void_func_v3i32_imm(i32) #0 {
call void @external_void_func_v3i32(<3 x i32> <i32 3, i32 4, i32 5>)
ret void
}
; GCN-LABEL: {{^}}test_call_external_void_func_v4i32:
; GCN: buffer_load_dwordx4 v[0:3]
; GCN: s_waitcnt
; GCN-NEXT: s_swappc_b64
define amdgpu_kernel void @test_call_external_void_func_v4i32() #0 {
%val = load <4 x i32>, <4 x i32> addrspace(1)* undef
call void @external_void_func_v4i32(<4 x i32> %val)
ret void
}
; GCN-LABEL: {{^}}test_call_external_void_func_v8i32:
; GCN-DAG: buffer_load_dwordx4 v[0:3], off
; GCN-DAG: buffer_load_dwordx4 v[4:7], off
; GCN: s_waitcnt
; GCN-NEXT: s_swappc_b64
define amdgpu_kernel void @test_call_external_void_func_v8i32() #0 {
%ptr = load <8 x i32> addrspace(1)*, <8 x i32> addrspace(1)* addrspace(2)* undef
%val = load <8 x i32>, <8 x i32> addrspace(1)* %ptr
call void @external_void_func_v8i32(<8 x i32> %val)
ret void
}
; GCN-LABEL: {{^}}test_call_external_void_func_v16i32:
; GCN-DAG: buffer_load_dwordx4 v[0:3], off
; GCN-DAG: buffer_load_dwordx4 v[4:7], off
; GCN-DAG: buffer_load_dwordx4 v[8:11], off
; GCN-DAG: buffer_load_dwordx4 v[12:15], off
; GCN: s_waitcnt
; GCN-NEXT: s_swappc_b64
define amdgpu_kernel void @test_call_external_void_func_v16i32() #0 {
%ptr = load <16 x i32> addrspace(1)*, <16 x i32> addrspace(1)* addrspace(2)* undef
%val = load <16 x i32>, <16 x i32> addrspace(1)* %ptr
call void @external_void_func_v16i32(<16 x i32> %val)
ret void
}
; GCN-LABEL: {{^}}test_call_external_void_func_v32i32:
; GCN-DAG: buffer_load_dwordx4 v[0:3], off
; GCN-DAG: buffer_load_dwordx4 v[4:7], off
; GCN-DAG: buffer_load_dwordx4 v[8:11], off
; GCN-DAG: buffer_load_dwordx4 v[12:15], off
; GCN-DAG: buffer_load_dwordx4 v[16:19], off
; GCN-DAG: buffer_load_dwordx4 v[20:23], off
; GCN-DAG: buffer_load_dwordx4 v[24:27], off
; GCN-DAG: buffer_load_dwordx4 v[28:31], off
; GCN: s_waitcnt
; GCN-NEXT: s_swappc_b64
define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 {
%ptr = load <32 x i32> addrspace(1)*, <32 x i32> addrspace(1)* addrspace(2)* undef
%val = load <32 x i32>, <32 x i32> addrspace(1)* %ptr
call void @external_void_func_v32i32(<32 x i32> %val)
ret void
}
; GCN-LABEL: {{^}}test_call_external_void_func_v32i32_i32:
; HSA-DAG: s_mov_b32 s33, s9
; HSA-DAG: s_add_u32 [[SP_REG:s[0-9]+]], s33, 0x100{{$}}
; MESA-DAG: s_mov_b32 s33, s3{{$}}
; MESA-DAG: s_add_u32 [[SP_REG:s[0-9]+]], s33, 0x100{{$}}
; GCN-DAG: buffer_load_dword [[VAL1:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
; GCN-DAG: buffer_load_dwordx4 v[0:3], off
; GCN-DAG: buffer_load_dwordx4 v[4:7], off
; GCN-DAG: buffer_load_dwordx4 v[8:11], off
; GCN-DAG: buffer_load_dwordx4 v[12:15], off
; GCN-DAG: buffer_load_dwordx4 v[16:19], off
; GCN-DAG: buffer_load_dwordx4 v[20:23], off
; GCN-DAG: buffer_load_dwordx4 v[24:27], off
; GCN-DAG: buffer_load_dwordx4 v[28:31], off
; GCN: buffer_store_dword [[VAL1]], off, s[{{[0-9]+}}:{{[0-9]+}}], [[SP_REG]]{{$}}
; GCN: s_waitcnt
; GCN-NEXT: s_swappc_b64
; GCN-NEXT: s_endpgm
define amdgpu_kernel void @test_call_external_void_func_v32i32_i32(i32) #0 {
%ptr0 = load <32 x i32> addrspace(1)*, <32 x i32> addrspace(1)* addrspace(2)* undef
%val0 = load <32 x i32>, <32 x i32> addrspace(1)* %ptr0
%val1 = load i32, i32 addrspace(1)* undef
call void @external_void_func_v32i32_i32(<32 x i32> %val0, i32 %val1)
ret void
}
; FIXME: No wait after call
; GCN-LABEL: {{^}}test_call_external_i32_func_i32_imm:
; GCN: v_mov_b32_e32 v0, 42
; GCN: s_swappc_b64 s[30:31],
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[36:39], 0
define amdgpu_kernel void @test_call_external_i32_func_i32_imm(i32 addrspace(1)* %out) #0 {
%val = call i32 @external_i32_func_i32(i32 42)
store volatile i32 %val, i32 addrspace(1)* %out
ret void
}
; GCN-LABEL: {{^}}test_call_external_void_func_struct_i8_i32:
; GCN: buffer_load_ubyte v0, off
; GCN: buffer_load_dword v1, off
; GCN: s_waitcnt vmcnt(0)
; GCN-NEXT: s_swappc_b64
define amdgpu_kernel void @test_call_external_void_func_struct_i8_i32() #0 {
%ptr0 = load { i8, i32 } addrspace(1)*, { i8, i32 } addrspace(1)* addrspace(2)* undef
%val = load { i8, i32 }, { i8, i32 } addrspace(1)* %ptr0
call void @external_void_func_struct_i8_i32({ i8, i32 } %val)
ret void
}
; GCN-LABEL: {{^}}test_call_external_void_func_byval_struct_i8_i32:
; GCN-DAG: s_add_u32 [[SP:s[0-9]+]], s33, 0x400{{$}}
; GCN-DAG: v_mov_b32_e32 [[VAL0:v[0-9]+]], 3
; GCN-DAG: v_mov_b32_e32 [[VAL1:v[0-9]+]], 8
; MESA-DAG: buffer_store_byte [[VAL0]], off, s[36:39], s33 offset:8
; MESA-DAG: buffer_store_dword [[VAL1]], off, s[36:39], s33 offset:12
; HSA-DAG: buffer_store_byte [[VAL0]], off, s[0:3], s33 offset:8
; HSA-DAG: buffer_store_dword [[VAL1]], off, s[0:3], s33 offset:12
; GCN: s_add_u32 [[SP]], [[SP]], 0x200
; HSA: buffer_load_dword [[RELOAD_VAL0:v[0-9]+]], off, s[0:3], s33 offset:8
; HSA: buffer_load_dword [[RELOAD_VAL1:v[0-9]+]], off, s[0:3], s33 offset:12
; HSA: buffer_store_dword [[RELOAD_VAL1]], off, s[0:3], [[SP]] offset:4
; HSA: buffer_store_dword [[RELOAD_VAL0]], off, s[0:3], [[SP]]{{$}}
; MESA: buffer_load_dword [[RELOAD_VAL0:v[0-9]+]], off, s[36:39], s33 offset:8
; MESA: buffer_load_dword [[RELOAD_VAL1:v[0-9]+]], off, s[36:39], s33 offset:12
; MESA: buffer_store_dword [[RELOAD_VAL1]], off, s[36:39], [[SP]] offset:4
; MESA: buffer_store_dword [[RELOAD_VAL0]], off, s[36:39], [[SP]]{{$}}
; GCN-NEXT: s_swappc_b64
; GCN-NEXT: s_sub_u32 [[SP]], [[SP]], 0x200
define amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0 {
%val = alloca { i8, i32 }, align 4
%gep0 = getelementptr inbounds { i8, i32 }, { i8, i32 }* %val, i32 0, i32 0
%gep1 = getelementptr inbounds { i8, i32 }, { i8, i32 }* %val, i32 0, i32 1
store i8 3, i8* %gep0
store i32 8, i32* %gep1
call void @external_void_func_byval_struct_i8_i32({ i8, i32 }* %val)
ret void
}
; GCN-LABEL: {{^}}test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32:
; MESA-DAG: s_add_u32 [[SP:s[0-9]+]], [[FP_REG:s[0-9]+]], 0x600{{$}}
; HSA-DAG: s_add_u32 [[SP:s[0-9]+]], [[FP_REG:s[0-9]+]], 0x600{{$}}
; GCN-DAG: v_mov_b32_e32 [[VAL0:v[0-9]+]], 3
; GCN-DAG: v_mov_b32_e32 [[VAL1:v[0-9]+]], 8
; GCN-DAG: buffer_store_byte [[VAL0]], off, s{{\[[0-9]+:[0-9]+\]}}, [[FP_REG]] offset:8
; GCN-DAG: buffer_store_dword [[VAL1]], off, s{{\[[0-9]+:[0-9]+\]}}, [[FP_REG]] offset:12
; GCN-DAG: buffer_load_dword [[RELOAD_VAL0:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, [[FP_REG]] offset:8
; GCN-DAG: buffer_load_dword [[RELOAD_VAL1:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, [[FP_REG]] offset:12
; GCN-DAG: s_add_u32 [[SP]], [[SP]], 0x200
; GCN: buffer_store_dword [[RELOAD_VAL1]], off, s{{\[[0-9]+:[0-9]+\]}}, [[SP]] offset:4
; GCN: buffer_store_dword [[RELOAD_VAL0]], off, s{{\[[0-9]+:[0-9]+\]}}, [[SP]]{{$}}
; GCN-NEXT: s_swappc_b64
; GCN-DAG: buffer_load_ubyte [[LOAD_OUT_VAL0:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, [[FP_REG]] offset:16
; GCN-DAG: buffer_load_dword [[LOAD_OUT_VAL1:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, [[FP_REG]] offset:20
; GCN: s_sub_u32 [[SP]], [[SP]], 0x200
; GCN: buffer_store_byte [[LOAD_OUT_VAL0]], off
; GCN: buffer_store_dword [[LOAD_OUT_VAL1]], off
define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32(i32) #0 {
%in.val = alloca { i8, i32 }, align 4
%out.val = alloca { i8, i32 }, align 4
%in.gep0 = getelementptr inbounds { i8, i32 }, { i8, i32 }* %in.val, i32 0, i32 0
%in.gep1 = getelementptr inbounds { i8, i32 }, { i8, i32 }* %in.val, i32 0, i32 1
store i8 3, i8* %in.gep0
store i32 8, i32* %in.gep1
call void @external_void_func_sret_struct_i8_i32_byval_struct_i8_i32({ i8, i32 }* %out.val, { i8, i32 }* %in.val)
%out.gep0 = getelementptr inbounds { i8, i32 }, { i8, i32 }* %out.val, i32 0, i32 0
%out.gep1 = getelementptr inbounds { i8, i32 }, { i8, i32 }* %out.val, i32 0, i32 1
%out.val0 = load i8, i8* %out.gep0
%out.val1 = load i32, i32* %out.gep1
store volatile i8 %out.val0, i8 addrspace(1)* undef
store volatile i32 %out.val1, i32 addrspace(1)* undef
ret void
}
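; The first argument acts as an sret-style out-pointer: the callee writes its
; { i8, i32 } result through it into the caller's %out.val slot (the frame
; loads at offsets 16 and 20 above), which the caller then stores out.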
; GCN-LABEL: {{^}}test_call_external_void_func_v16i8:
define amdgpu_kernel void @test_call_external_void_func_v16i8() #0 {
%ptr = load <16 x i8> addrspace(1)*, <16 x i8> addrspace(1)* addrspace(2)* undef
%val = load <16 x i8>, <16 x i8> addrspace(1)* %ptr
call void @external_void_func_v16i8(<16 x i8> %val)
ret void
}
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind noinline }



@ -0,0 +1,251 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -amdgpu-function-calls -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -amdgpu-function-calls -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-function-calls -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
declare void @external_void_func_void() #0
; GCN-LABEL: {{^}}test_kernel_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void:
; GCN: s_mov_b32 s33, s7
; GCN: s_getpc_b64 s[34:35]
; GCN-NEXT: s_add_u32 s34, s34,
; GCN-NEXT: s_addc_u32 s35, s35,
; GCN-NEXT: s_mov_b32 s4, s33
; GCN-NEXT: s_mov_b32 s32, s33
; GCN: s_swappc_b64 s[30:31], s[34:35]
; GCN-NEXT: s_mov_b32 s4, s33
; GCN-NEXT: #ASMSTART
; GCN-NEXT: #ASMEND
; GCN-NEXT: s_swappc_b64 s[30:31], s[34:35]
define amdgpu_kernel void @test_kernel_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void() #0 {
call void @external_void_func_void()
call void asm sideeffect "", ""() #0
call void @external_void_func_void()
ret void
}
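; The callee address is materialized PC-relatively once and reused for both
; calls. A sketch of the expected sequence (the relocation operands are
; spelled out in later tests):
;   s_getpc_b64 s[34:35]                               ; PC of the next instruction
;   s_add_u32   s34, s34, external_void_func_void@rel32@lo+4
;   s_addc_u32  s35, s35, external_void_func_void@rel32@hi+4
;   s_swappc_b64 s[30:31], s[34:35]                    ; call; return address lands in s[30:31]
; The +4 compensates for the distance between the getpc result and the
; relocated literal operand.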
; GCN-LABEL: {{^}}test_func_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void:
; GCN: v_writelane_b32 v32, s33, 0
; GCN: v_writelane_b32 v32, s34, 1
; GCN: v_writelane_b32 v32, s35, 2
; GCN: v_writelane_b32 v32, s36, 3
; GCN: v_writelane_b32 v32, s37, 4
; GCN: s_mov_b32 s33, s5
; GCN: s_swappc_b64
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: s_swappc_b64
; GCN: s_mov_b32 s5, s33
; GCN: v_readlane_b32 s37, v32, 4
; GCN: v_readlane_b32 s36, v32, 3
; GCN: v_readlane_b32 s35, v32, 2
; GCN: v_readlane_b32 s34, v32, 1
; GCN: v_readlane_b32 s33, v32, 0
; GCN: s_setpc_b64
define void @test_func_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void() #0 {
call void @external_void_func_void()
call void asm sideeffect "", ""() #0
call void @external_void_func_void()
ret void
}
; GCN-LABEL: {{^}}void_func_void_clobber_s30_s31:
; GCN: s_waitcnt
; GCN-NEXT: s_mov_b64 [[SAVEPC:s\[[0-9]+:[0-9]+\]]], s[30:31]
; GCN-NEXT: #ASMSTART
; GCN: ; clobber
; GCN-NEXT: #ASMEND
; GCN-NEXT: s_mov_b64 s[30:31], [[SAVEPC]]
; GCN-NEXT: s_setpc_b64 s[30:31]
define void @void_func_void_clobber_s30_s31() #2 {
call void asm sideeffect "; clobber", "~{s[30:31]}"() #0
ret void
}
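; s[30:31] holds the return address consumed by s_setpc_b64, so a function
; clobbering it must save and restore the pair around the clobber, which the
; s_mov_b64 pair above checks for.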
; GCN-LABEL: {{^}}void_func_void_clobber_vcc:
; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: s_setpc_b64 s[30:31]
define void @void_func_void_clobber_vcc() #2 {
call void asm sideeffect "", "~{VCC}"() #0
ret void
}
; GCN-LABEL: {{^}}test_call_void_func_void_clobber_vcc:
; GCN: s_getpc_b64
; GCN-NEXT: s_add_u32
; GCN-NEXT: s_addc_u32
; GCN: s_mov_b64 s[34:35], vcc
; GCN-NEXT: s_swappc_b64
; GCN: s_mov_b64 vcc, s[34:35]
define amdgpu_kernel void @test_call_void_func_void_clobber_vcc(i32 addrspace(1)* %out) #0 {
%vcc = call i64 asm sideeffect "; def $0", "={vcc}"()
call void @void_func_void_clobber_vcc()
%val0 = load volatile i32, i32 addrspace(1)* undef
%val1 = load volatile i32, i32 addrspace(1)* undef
call void asm sideeffect "; use $0", "{vcc}"(i64 %vcc)
ret void
}
; GCN-LABEL: {{^}}test_call_void_func_void_mayclobber_s31:
; GCN: s_mov_b32 s33, s31
; GCN-NEXT: s_swappc_b64
; GCN-NEXT: s_mov_b32 s31, s33
define amdgpu_kernel void @test_call_void_func_void_mayclobber_s31(i32 addrspace(1)* %out) #0 {
%s31 = call i32 asm sideeffect "; def $0", "={s31}"()
call void @external_void_func_void()
call void asm sideeffect "; use $0", "{s31}"(i32 %s31)
ret void
}
; GCN-LABEL: {{^}}test_call_void_func_void_mayclobber_v31:
; GCN: v_mov_b32_e32 v32, v31
; GCN-NEXT: s_swappc_b64
; GCN-NEXT: v_mov_b32_e32 v31, v32
define amdgpu_kernel void @test_call_void_func_void_mayclobber_v31(i32 addrspace(1)* %out) #0 {
%v31 = call i32 asm sideeffect "; def $0", "={v31}"()
call void @external_void_func_void()
call void asm sideeffect "; use $0", "{v31}"(i32 %v31)
ret void
}
; GCN-LABEL: {{^}}test_call_void_func_void_preserves_s33:
; GCN: s_mov_b32 s34, s9
; GCN: ; def s33
; GCN-NEXT: #ASMEND
; GCN: s_getpc_b64 s[6:7]
; GCN-NEXT: s_add_u32 s6, s6, external_void_func_void@rel32@lo+4
; GCN-NEXT: s_addc_u32 s7, s7, external_void_func_void@rel32@hi+4
; GCN-NEXT: s_mov_b32 s4, s34
; GCN-NEXT: s_mov_b32 s32, s34
; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7]
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ; use s33
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: s_endpgm
define amdgpu_kernel void @test_call_void_func_void_preserves_s33(i32 addrspace(1)* %out) #0 {
%s33 = call i32 asm sideeffect "; def $0", "={s33}"()
call void @external_void_func_void()
call void asm sideeffect "; use $0", "{s33}"(i32 %s33)
ret void
}
; GCN-LABEL: {{^}}test_call_void_func_void_preserves_v32:
; GCN: s_mov_b32 s33, s9
; GCN: ; def v32
; GCN-NEXT: #ASMEND
; GCN: s_getpc_b64 s[6:7]
; GCN-NEXT: s_add_u32 s6, s6, external_void_func_void@rel32@lo+4
; GCN-NEXT: s_addc_u32 s7, s7, external_void_func_void@rel32@hi+4
; GCN-NEXT: s_mov_b32 s4, s33
; GCN-NEXT: s_mov_b32 s32, s33
; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7]
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ; use v32
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: s_endpgm
define amdgpu_kernel void @test_call_void_func_void_preserves_v32(i32 addrspace(1)* %out) #0 {
%v32 = call i32 asm sideeffect "; def $0", "={v32}"()
call void @external_void_func_void()
call void asm sideeffect "; use $0", "{v32}"(i32 %v32)
ret void
}
; GCN-LABEL: {{^}}void_func_void_clobber_s33:
; GCN: v_writelane_b32 v0, s33, 0
; GCN-NEXT: #ASMSTART
; GCN-NEXT: ; clobber
; GCN-NEXT: #ASMEND
; GCN-NEXT: v_readlane_b32 s33, v0, 0
; GCN-NEXT: s_setpc_b64
define void @void_func_void_clobber_s33() #2 {
call void asm sideeffect "; clobber", "~{s33}"() #0
ret void
}
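; Callee-saved SGPRs are saved into VGPR lanes rather than to memory. A sketch
; of the pattern the checks above expect:
;   v_writelane_b32 v0, s33, 0    ; save s33 into lane 0 of v0
;   ; ... clobbering code ...
;   v_readlane_b32  s33, v0, 0    ; restore s33 before s_setpc_b64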
; GCN-LABEL: {{^}}test_call_void_func_void_clobber_s33:
; GCN: s_mov_b32 s33, s7
; GCN: s_getpc_b64
; GCN-NEXT: s_add_u32
; GCN-NEXT: s_addc_u32
; GCN-NEXT: s_mov_b32 s4, s33
; GCN-NEXT: s_mov_b32 s32, s33
; GCN: s_swappc_b64
; GCN-NEXT: s_endpgm
define amdgpu_kernel void @test_call_void_func_void_clobber_s33() #0 {
call void @void_func_void_clobber_s33()
ret void
}
; GCN-LABEL: {{^}}callee_saved_sgpr_func:
; GCN-NOT: s40
; GCN: v_writelane_b32 v32, s40
; GCN: s_swappc_b64
; GCN-NOT: s40
; GCN: ; use s40
; GCN-NOT: s40
; GCN: v_readlane_b32 s40, v32
; GCN-NOT: s40
define void @callee_saved_sgpr_func() #2 {
%s40 = call i32 asm sideeffect "; def s40", "={s40}"() #0
call void @external_void_func_void()
call void asm sideeffect "; use $0", "s"(i32 %s40) #0
ret void
}
; GCN-LABEL: {{^}}callee_saved_sgpr_kernel:
; GCN-NOT: s40
; GCN: ; def s40
; GCN-NOT: s40
; GCN: s_swappc_b64
; GCN-NOT: s40
; GCN: ; use s40
; GCN-NOT: s40
define amdgpu_kernel void @callee_saved_sgpr_kernel() #2 {
%s40 = call i32 asm sideeffect "; def s40", "={s40}"() #0
call void @external_void_func_void()
call void asm sideeffect "; use $0", "s"(i32 %s40) #0
ret void
}
; The first call-preserved VGPR is already used, so it can't be used for SGPR spills.
; GCN-LABEL: {{^}}callee_saved_sgpr_vgpr_func:
; GCN-NOT: s40
; GCN: v_writelane_b32 v33, s40
; GCN: s_swappc_b64
; GCN-NOT: s40
; GCN: ; use s40
; GCN-NOT: s40
; GCN: v_readlane_b32 s40, v33
; GCN-NOT: s40
define void @callee_saved_sgpr_vgpr_func() #2 {
%s40 = call i32 asm sideeffect "; def s40", "={s40}"() #0
%v32 = call i32 asm sideeffect "; def v32", "={v32}"() #0
call void @external_void_func_void()
call void asm sideeffect "; use $0", "s"(i32 %s40) #0
call void asm sideeffect "; use $0", "v"(i32 %v32) #0
ret void
}
; GCN-LABEL: {{^}}callee_saved_sgpr_vgpr_kernel:
; GCN-NOT: s40
; GCN: ; def s40
; GCN-NOT: s40
; GCN: s_swappc_b64
; GCN-NOT: s40
; GCN: ; use s40
; GCN-NOT: s40
define amdgpu_kernel void @callee_saved_sgpr_vgpr_kernel() #2 {
%s40 = call i32 asm sideeffect "; def s40", "={s40}"() #0
%v32 = call i32 asm sideeffect "; def v32", "={v32}"() #0
call void @external_void_func_void()
call void asm sideeffect "; use $0", "s"(i32 %s40) #0
call void asm sideeffect "; use $0", "v"(i32 %v32) #0
ret void
}
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind noinline }


@ -0,0 +1,241 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -mattr=-flat-for-global -amdgpu-function-calls -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -amdgpu-function-calls -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-function-calls -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
declare void @external_void_func_void() #0
declare i1 @external_i1_func_void() #0
declare zeroext i1 @external_i1_zeroext_func_void() #0
declare signext i1 @external_i1_signext_func_void() #0
declare i8 @external_i8_func_void() #0
declare zeroext i8 @external_i8_zeroext_func_void() #0
declare signext i8 @external_i8_signext_func_void() #0
declare i16 @external_i16_func_void() #0
declare zeroext i16 @external_i16_zeroext_func_void() #0
declare signext i16 @external_i16_signext_func_void() #0
declare i32 @external_i32_func_void() #0
declare i64 @external_i64_func_void() #0
declare half @external_f16_func_void() #0
declare float @external_f32_func_void() #0
declare double @external_f64_func_void() #0
declare <2 x i32> @external_v2i32_func_void() #0
declare <3 x i32> @external_v3i32_func_void() #0
declare <4 x i32> @external_v4i32_func_void() #0
declare <5 x i32> @external_v5i32_func_void() #0
declare <8 x i32> @external_v8i32_func_void() #0
declare <16 x i32> @external_v16i32_func_void() #0
declare <32 x i32> @external_v32i32_func_void() #0
declare { <32 x i32>, i32 } @external_v32i32_i32_func_void() #0
declare <2 x i16> @external_v2i16_func_void() #0
declare <2 x half> @external_v2f16_func_void() #0
declare { i32, i64 } @external_i32_i64_func_void() #0
; GCN-LABEL: {{^}}test_call_external_void_func_void:
define amdgpu_kernel void @test_call_external_void_func_void() #0 {
call void @external_void_func_void()
ret void
}
; GCN-LABEL: {{^}}test_call_external_void_func_void_x2:
define amdgpu_kernel void @test_call_external_void_func_void_x2() #0 {
call void @external_void_func_void()
call void @external_void_func_void()
ret void
}
; GCN-LABEL: {{^}}test_call_external_i1_func_void:
define amdgpu_kernel void @test_call_external_i1_func_void() #0 {
%val = call i1 @external_i1_func_void()
store volatile i1 %val, i1 addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}test_call_external_i1_zeroext_func_void:
define amdgpu_kernel void @test_call_external_i1_zeroext_func_void() #0 {
%val = call i1 @external_i1_zeroext_func_void()
%val.ext = zext i1 %val to i32
store volatile i32 %val.ext, i32 addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}test_call_external_i1_signext_func_void:
define amdgpu_kernel void @test_call_external_i1_signext_func_void() #0 {
%val = call i1 @external_i1_signext_func_void()
%val.ext = zext i1 %val to i32
store volatile i32 %val.ext, i32 addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}test_call_external_i8_func_void:
define amdgpu_kernel void @test_call_external_i8_func_void() #0 {
%val = call i8 @external_i8_func_void()
store volatile i8 %val, i8 addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}test_call_external_i8_zeroext_func_void:
define amdgpu_kernel void @test_call_external_i8_zeroext_func_void() #0 {
%val = call i8 @external_i8_zeroext_func_void()
%val.ext = zext i8 %val to i32
store volatile i32 %val.ext, i32 addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}test_call_external_i8_signext_func_void:
define amdgpu_kernel void @test_call_external_i8_signext_func_void() #0 {
%val = call i8 @external_i8_signext_func_void()
%val.ext = zext i8 %val to i32
store volatile i32 %val.ext, i32 addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}test_call_external_i16_func_void:
define amdgpu_kernel void @test_call_external_i16_func_void() #0 {
%val = call i16 @external_i16_func_void()
store volatile i16 %val, i16 addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}test_call_external_i16_zeroext_func_void:
define amdgpu_kernel void @test_call_external_i16_zeroext_func_void() #0 {
%val = call i16 @external_i16_zeroext_func_void()
%val.ext = zext i16 %val to i32
store volatile i32 %val.ext, i32 addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}test_call_external_i16_signext_func_void:
define amdgpu_kernel void @test_call_external_i16_signext_func_void() #0 {
%val = call i16 @external_i16_signext_func_void()
%val.ext = zext i16 %val to i32
store volatile i32 %val.ext, i32 addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}test_call_external_i32_func_void:
define amdgpu_kernel void @test_call_external_i32_func_void() #0 {
%val = call i32 @external_i32_func_void()
store volatile i32 %val, i32 addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}test_call_external_i64_func_void:
define amdgpu_kernel void @test_call_external_i64_func_void() #0 {
%val = call i64 @external_i64_func_void()
store volatile i64 %val, i64 addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}test_call_external_f16_func_void:
define amdgpu_kernel void @test_call_external_f16_func_void() #0 {
%val = call half @external_f16_func_void()
store volatile half %val, half addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}test_call_external_f32_func_void:
define amdgpu_kernel void @test_call_external_f32_func_void() #0 {
%val = call float @external_f32_func_void()
store volatile float %val, float addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}test_call_external_f64_func_void:
define amdgpu_kernel void @test_call_external_f64_func_void() #0 {
%val = call double @external_f64_func_void()
store volatile double %val, double addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}test_call_external_v2i32_func_void:
define amdgpu_kernel void @test_call_external_v2i32_func_void() #0 {
%val = call <2 x i32> @external_v2i32_func_void()
store volatile <2 x i32> %val, <2 x i32> addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}test_call_external_v3i32_func_void:
define amdgpu_kernel void @test_call_external_v3i32_func_void() #0 {
%val = call <3 x i32> @external_v3i32_func_void()
store volatile <3 x i32> %val, <3 x i32> addrspace(1)* undef, align 8
ret void
}
; GCN-LABEL: {{^}}test_call_external_v4i32_func_void:
define amdgpu_kernel void @test_call_external_v4i32_func_void() #0 {
%val = call <4 x i32> @external_v4i32_func_void()
store volatile <4 x i32> %val, <4 x i32> addrspace(1)* undef, align 8
ret void
}
; GCN-LABEL: {{^}}test_call_external_v5i32_func_void:
define amdgpu_kernel void @test_call_external_v5i32_func_void() #0 {
%val = call <5 x i32> @external_v5i32_func_void()
store volatile <5 x i32> %val, <5 x i32> addrspace(1)* undef, align 8
ret void
}
; GCN-LABEL: {{^}}test_call_external_v8i32_func_void:
define amdgpu_kernel void @test_call_external_v8i32_func_void() #0 {
%val = call <8 x i32> @external_v8i32_func_void()
store volatile <8 x i32> %val, <8 x i32> addrspace(1)* undef, align 8
ret void
}
; GCN-LABEL: {{^}}test_call_external_v16i32_func_void:
define amdgpu_kernel void @test_call_external_v16i32_func_void() #0 {
%val = call <16 x i32> @external_v16i32_func_void()
store volatile <16 x i32> %val, <16 x i32> addrspace(1)* undef, align 8
ret void
}
; GCN-LABEL: {{^}}test_call_external_v32i32_func_void:
define amdgpu_kernel void @test_call_external_v32i32_func_void() #0 {
%val = call <32 x i32> @external_v32i32_func_void()
store volatile <32 x i32> %val, <32 x i32> addrspace(1)* undef, align 8
ret void
}
; GCN-LABEL: {{^}}test_call_external_v2i16_func_void:
define amdgpu_kernel void @test_call_external_v2i16_func_void() #0 {
%val = call <2 x i16> @external_v2i16_func_void()
store volatile <2 x i16> %val, <2 x i16> addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}test_call_external_v2f16_func_void:
define amdgpu_kernel void @test_call_external_v2f16_func_void() #0 {
%val = call <2 x half> @external_v2f16_func_void()
store volatile <2 x half> %val, <2 x half> addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}test_call_external_i32_i64_func_void:
define amdgpu_kernel void @test_call_external_i32_i64_func_void() #0 {
%val = call { i32, i64 } @external_i32_i64_func_void()
%val.0 = extractvalue { i32, i64 } %val, 0
%val.1 = extractvalue { i32, i64 } %val, 1
store volatile i32 %val.0, i32 addrspace(1)* undef
store volatile i64 %val.1, i64 addrspace(1)* undef
ret void
}
; The return value is too large for the return registers, so results must be written to the stack.
; GCN-LABEL: {{^}}test_call_external_v32i32_i32_func_void:
define amdgpu_kernel void @test_call_external_v32i32_i32_func_void() #0 {
%val = call { <32 x i32>, i32 } @external_v32i32_i32_func_void()
%val0 = extractvalue { <32 x i32>, i32 } %val, 0
%val1 = extractvalue { <32 x i32>, i32 } %val, 1
store volatile <32 x i32> %val0, <32 x i32> addrspace(1)* undef, align 8
store volatile i32 %val1, i32 addrspace(1)* undef
ret void
}
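; { <32 x i32>, i32 } does not fit in the return registers, so the return is
; demoted to a hidden stack slot. A rough sketch of the generic lowering (not
; necessarily the exact ABI used here):
;   %slot = alloca { <32 x i32>, i32 }
;   call void @external_v32i32_i32_func_void({ <32 x i32>, i32 }* sret %slot)
;   ; ...caller reloads %val0 and %val1 from %slot after the call...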
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind noinline }


@ -1,4 +1,5 @@
; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=CI %s
; RUN: llc -amdgpu-function-calls -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=CI %s
; RUN: llc -amdgpu-function-calls -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX9 %s
; GCN-LABEL: {{^}}callee_no_stack:
; GCN: ; BB#0:
@ -8,6 +9,14 @@ define void @callee_no_stack() #0 {
ret void
}
; GCN-LABEL: {{^}}callee_no_stack_no_fp_elim:
; GCN: ; BB#0:
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @callee_no_stack_no_fp_elim() #1 {
ret void
}
; Requires a frame pointer to access the local stack object.
; GCN-LABEL: {{^}}callee_with_stack:
@ -24,4 +33,51 @@ define void @callee_with_stack() #0 {
ret void
}
; GCN-LABEL: {{^}}callee_with_stack_and_call:
; GCN: ; BB#0:
; GCN-NEXT: s_waitcnt
; GCN-DAG: s_mov_b32 s5, s32
; GCN-DAG: v_writelane_b32 v32, s33,
; GCN-DAG: v_writelane_b32 v32, s34,
; GCN-DAG: v_writelane_b32 v32, s35,
; GCN-DAG: buffer_store_dword v0, off, s[0:3], s5 offset:4{{$}}
; GCN-DAG: s_add_u32 s32, s32, 0x200{{$}}
; GCN-DAG: v_mov_b32_e32 v0, 0{{$}}
; GCN-DAG: s_mov_b32 s33, s5
; GCN: s_swappc_b64
; GCN: s_mov_b32 s5, s33
; GCN-DAG: v_readlane_b32 s35,
; GCN-DAG: v_readlane_b32 s34,
; GCN-DAG: v_readlane_b32 s33,
; GCN: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @callee_with_stack_and_call() #0 {
%alloca = alloca i32
store volatile i32 0, i32* %alloca
call void @external_void_func_void()
ret void
}
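; The checks above describe the expected frame setup: the incoming stack
; pointer s32 is copied into s5 (the base used for the offset:4 spill), s32 is
; bumped by 0x200 for the call, the clobbered CSRs s33-s35 are saved into
; lanes of v32, and everything is restored before s_setpc_b64.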
; Should be able to copy incoming stack pointer directly to inner
; call's stack pointer argument.
; GCN-LABEL: {{^}}callee_no_stack_with_call:
; GCN: s_waitcnt
; GCN-NOT: s32
; GCN: s_mov_b32 s33, s5
; GCN: s_swappc_b64
; GCN: s_mov_b32 s5, s33
; GCN-NOT: s32
; GCN: s_setpc_b64
define void @callee_no_stack_with_call() #0 {
call void @external_void_func_void()
ret void
}
declare void @external_void_func_void() #0
attributes #0 = { nounwind }
attributes #1 = { nounwind "no-frame-pointer-elim"="true" }


@ -0,0 +1,41 @@
; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -amdgpu-sroa=0 -amdgpu-function-calls -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
; RUN: llc -march=amdgcn -mcpu=hawaii -amdgpu-function-calls -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=CI %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-sroa=0 -amdgpu-function-calls -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX9 -check-prefix=VI %s
; Test calls made from other callable functions rather than from kernels.
declare void @external_void_func_i32(i32) #0
; GCN-LABEL: {{^}}test_func_call_external_void_func_i32_imm:
; GCN: s_waitcnt
; GCN-NOT: s32
; GCN: s_swappc_b64
; GCN-NOT: s32
; GCN: s_setpc_b64
define void @test_func_call_external_void_func_i32_imm() #0 {
call void @external_void_func_i32(i32 42)
ret void
}
; GCN-LABEL: {{^}}test_func_call_external_void_func_i32_imm_stack_use:
; GCN: s_waitcnt
; GCN: s_mov_b32 s5, s32
; GCN: s_add_u32 s32, s32, 0x1100{{$}}
; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s5 offset
; GCN: s_swappc_b64
; GCN: s_sub_u32 s32, s32, 0x1100{{$}}
; GCN: s_setpc_b64
define void @test_func_call_external_void_func_i32_imm_stack_use() #0 {
%alloca = alloca [16 x i32], align 4
%gep0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 0
%gep15 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 15
store volatile i32 0, i32* %gep0
store volatile i32 0, i32* %gep15
call void @external_void_func_i32(i32 42)
ret void
}
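; Note the 0x1100 adjustment: scratch stack offsets appear to be kept in
; wave-scaled units (byte offset times the 64-lane wavefront), so the 64-byte
; [16 x i32] alloca plus a little extra frame space shows up as 68 * 64 =
; 0x1100.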
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind noinline }