AMDGPU: Initial implementation of calls

Includes a hack to fix the type selected for the GlobalAddress of
the called function; this will be properly fixed by changing the
default datalayout to use generic (64-bit) pointers for address
space 0.

llvm-svn: 309732
Author: Matt Arsenault
Date: 2017-08-01 19:54:18 +00:00
Parent: 253be33610
Commit: b62a4eb524
25 changed files with 1953 additions and 15 deletions
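
As a quick illustration of what this enables: the new tests further down feed llc IR like the sketch below, opting in through the hidden -amdgpu-function-calls flag this patch adds (call support stays off by default). The sketch is lifted from one of the new tests rather than anything beyond the patch.

; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -amdgpu-function-calls -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s

define void @void_func_void() #0 {
  ret void
}

; GCN-LABEL: {{^}}test_call_void_func_void:
define amdgpu_kernel void @test_call_void_func_void() {
  call void @void_func_void()
  ret void
}

attributes #0 = { nounwind noinline }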


@ -162,6 +162,10 @@ def CC_AMDGPU : CallingConv<[
"(State.getMachineFunction().getSubtarget()).getGeneration() >= "
"AMDGPUSubtarget::SOUTHERN_ISLANDS",
CCDelegateTo<CC_SI>>,
CCIf<"static_cast<const AMDGPUSubtarget&>"
"(State.getMachineFunction().getSubtarget()).getGeneration() >= "
"AMDGPUSubtarget::SOUTHERN_ISLANDS && State.getCallingConv() == CallingConv::C",
CCDelegateTo<CC_AMDGPU_Func>>,
CCIf<"static_cast<const AMDGPUSubtarget&>"
"(State.getMachineFunction().getSubtarget()).getGeneration() < "
"AMDGPUSubtarget::SOUTHERN_ISLANDS",


@ -33,10 +33,6 @@ public:
/// \returns The number of 32-bit sub-registers that are used when storing
/// values to the stack.
unsigned getStackWidth(const MachineFunction &MF) const;
bool hasFP(const MachineFunction &MF) const override {
return false;
}
};
} // end namespace llvm


@ -20,6 +20,7 @@
#include "AMDGPUIntrinsicInfo.h"
#include "AMDGPURegisterInfo.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "R600MachineFunctionInfo.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"


@ -30,7 +30,9 @@ using namespace llvm;
void AMDGPUInstrInfo::anchor() {}
AMDGPUInstrInfo::AMDGPUInstrInfo(const AMDGPUSubtarget &ST)
- : AMDGPUGenInstrInfo(-1, -1), ST(ST), AMDGPUASI(ST.getAMDGPUAS()) {}
+ : AMDGPUGenInstrInfo(AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN),
+ ST(ST),
+ AMDGPUASI(ST.getAMDGPUAS()) {}
// FIXME: This behaves strangely. If, for example, you have 32 load + stores,
// the first 16 loads will be interleaved with the stores, and the next 16 will


@ -82,6 +82,22 @@ def AMDGPUif : SDNode<"AMDGPUISD::IF", AMDGPUIfOp, [SDNPHasChain]>;
def AMDGPUelse : SDNode<"AMDGPUISD::ELSE", AMDGPUElseOp, [SDNPHasChain]>;
def AMDGPUloop : SDNode<"AMDGPUISD::LOOP", AMDGPULoopOp, [SDNPHasChain]>;
def callseq_start : SDNode<"ISD::CALLSEQ_START",
SDCallSeqStart<[ SDTCisVT<0, i32>, SDTCisVT<1, i32> ]>,
[SDNPHasChain, SDNPOutGlue]
>;
def callseq_end : SDNode<"ISD::CALLSEQ_END",
SDCallSeqEnd<[ SDTCisVT<0, i32>, SDTCisVT<1, i32> ]>,
[SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]
>;
def AMDGPUcall : SDNode<"AMDGPUISD::CALL",
SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>,
[SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
SDNPVariadic]
>;
def AMDGPUtrap : SDNode<"AMDGPUISD::TRAP",
SDTypeProfile<0, -1, [SDTCisVT<0, i16>]>,
[SDNPHasChain, SDNPVariadic, SDNPSideEffect, SDNPInGlue]


@ -121,6 +121,9 @@ bool AMDGPUMCInstLower::lowerOperand(const MachineOperand &MO,
MCOp = MCOperand::createExpr(Expr);
return true;
}
case MachineOperand::MO_RegisterMask:
// Regmasks are like implicit defs.
return false;
}
}


@ -56,6 +56,20 @@ const MCPhysReg *SIRegisterInfo::getCalleeSavedRegs(
}
}
const MCPhysReg *
SIRegisterInfo::getCalleeSavedRegsViaCopy(const MachineFunction *MF) const {
// FIXME
static MCPhysReg Regs[2];
const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
assert(!MFI->isEntryFunction());
Regs[0] = MFI->getFrameOffsetReg();
Regs[1] = AMDGPU::NoRegister;
return Regs;
}
const uint32_t *SIRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
CallingConv::ID CC) const {
switch (CC) {


@ -123,6 +123,12 @@ static cl::opt<bool> LateCFGStructurize(
cl::init(false),
cl::Hidden);
static cl::opt<bool> EnableAMDGPUFunctionCalls(
"amdgpu-function-calls",
cl::Hidden,
cl::desc("Enable AMDGPU function call support"),
cl::init(false));
extern "C" void LLVMInitializeAMDGPUTarget() {
// Register the target
RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget());
@ -269,6 +275,11 @@ AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
AMDGPUTargetMachine::~AMDGPUTargetMachine() = default;
bool AMDGPUTargetMachine::enableFunctionCalls() const {
return EnableAMDGPUFunctionCalls &&
getTargetTriple().getArch() == Triple::amdgcn;
}
StringRef AMDGPUTargetMachine::getGPUName(const Function &F) const {
Attribute GPUAttr = F.getFnAttribute("target-cpu");
return GPUAttr.hasAttribute(Attribute::None) ?


@ -69,6 +69,9 @@ public:
return -1;
return 0;
}
LLVM_READONLY
bool enableFunctionCalls() const;
};
//===----------------------------------------------------------------------===//


@ -27,6 +27,10 @@ public:
MachineBasicBlock &MBB) const override {}
int getFrameIndexReference(const MachineFunction &MF, int FI,
unsigned &FrameReg) const override;
bool hasFP(const MachineFunction &MF) const override {
return false;
}
};
} // end namespace llvm


@ -575,6 +575,41 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized(
}
}
MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr(
MachineFunction &MF,
MachineBasicBlock &MBB,
MachineBasicBlock::iterator I) const {
int64_t Amount = I->getOperand(0).getImm();
if (Amount == 0)
return MBB.erase(I);
const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
const DebugLoc &DL = I->getDebugLoc();
unsigned Opc = I->getOpcode();
bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode();
uint64_t CalleePopAmount = IsDestroy ? I->getOperand(1).getImm() : 0;
const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
if (!TFI->hasReservedCallFrame(MF)) {
unsigned Align = getStackAlignment();
Amount = alignTo(Amount, Align);
assert(isUInt<32>(Amount) && "exceeded stack address space size");
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
unsigned SPReg = MFI->getStackPtrOffsetReg();
unsigned Op = IsDestroy ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
BuildMI(MBB, I, DL, TII->get(Op), SPReg)
.addReg(SPReg)
.addImm(Amount * ST.getWavefrontSize());
} else if (CalleePopAmount != 0) {
llvm_unreachable("is this used?");
}
return MBB.erase(I);
}
void SIFrameLowering::emitDebuggerPrologue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
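
For intuition on the wavefront-size scaling in eliminateCallFramePseudoInstr above (a sketch assuming a 64-lane wave): the byte amount carried by the call-frame pseudo is multiplied by the wavefront size before being applied to the SGPR stack pointer, so a 32-byte outgoing argument area becomes 32 * 64 = 0x800, exactly the sequence the new byval call tests below check for:

; GCN: s_add_u32 s32, s32, 0x800
; GCN: s_swappc_b64
; GCN-NEXT: s_sub_u32 s32, s32, 0x800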


@ -39,6 +39,11 @@ public:
MachineFunction &MF,
RegScavenger *RS = nullptr) const override;
MachineBasicBlock::iterator
eliminateCallFramePseudoInstr(MachineFunction &MF,
MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI) const override;
private:
void emitFlatScratchInit(const SISubtarget &ST,
MachineFunction &MF,


@ -1201,9 +1201,13 @@ static void reservePrivateMemoryRegs(const TargetMachine &TM,
if (TM.getOptLevel() == CodeGenOpt::None)
HasStackObjects = true;
// For now assume stack access is needed in any callee functions, so we need
// the scratch registers to pass in.
bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
if (ST.isAmdCodeObjectV2(MF)) {
- if (HasStackObjects) {
+ if (RequiresStackAccess) {
// If we have stack objects, we unquestionably need the private buffer
// resource. For the Code Object V2 ABI, this will be the first 4 user
// SGPR inputs. We can reserve those and use them directly.
@ -1212,9 +1216,23 @@ static void reservePrivateMemoryRegs(const TargetMachine &TM,
MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER);
Info.setScratchRSrcReg(PrivateSegmentBufferReg);
unsigned PrivateSegmentWaveByteOffsetReg = TRI.getPreloadedValue(
MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
Info.setScratchWaveOffsetReg(PrivateSegmentWaveByteOffsetReg);
if (MFI.hasCalls()) {
// If we have calls, we need to keep the frame register in a register
// that won't be clobbered by a call, so ensure it is copied somewhere.
// This is not a problem for the scratch wave offset, because the same
// registers are reserved in all functions.
// FIXME: Nothing is really ensuring this is a call preserved register,
// it's just selected from the end so it happens to be.
unsigned ReservedOffsetReg
= TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
Info.setScratchWaveOffsetReg(ReservedOffsetReg);
} else {
unsigned PrivateSegmentWaveByteOffsetReg = TRI.getPreloadedValue(
MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
Info.setScratchWaveOffsetReg(PrivateSegmentWaveByteOffsetReg);
}
} else {
unsigned ReservedBufferReg
= TRI.reservedPrivateSegmentBufferReg(MF);
@ -1237,7 +1255,7 @@ static void reservePrivateMemoryRegs(const TargetMachine &TM,
// offset is still in an input SGPR.
Info.setScratchRSrcReg(ReservedBufferReg);
- if (HasStackObjects) {
+ if (HasStackObjects && !MFI.hasCalls()) {
unsigned ScratchWaveOffsetReg = TRI.getPreloadedValue(
MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
Info.setScratchWaveOffsetReg(ScratchWaveOffsetReg);
@ -1249,6 +1267,50 @@ static void reservePrivateMemoryRegs(const TargetMachine &TM,
}
}
bool SITargetLowering::supportSplitCSR(MachineFunction *MF) const {
const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
return !Info->isEntryFunction();
}
void SITargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
}
void SITargetLowering::insertCopiesSplitCSR(
MachineBasicBlock *Entry,
const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
if (!IStart)
return;
const TargetInstrInfo *TII = Subtarget->getInstrInfo();
MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
MachineBasicBlock::iterator MBBI = Entry->begin();
for (const MCPhysReg *I = IStart; *I; ++I) {
const TargetRegisterClass *RC = nullptr;
if (AMDGPU::SReg_64RegClass.contains(*I))
RC = &AMDGPU::SGPR_64RegClass;
else if (AMDGPU::SReg_32RegClass.contains(*I))
RC = &AMDGPU::SGPR_32RegClass;
else
llvm_unreachable("Unexpected register class in CSRsViaCopy!");
unsigned NewVR = MRI->createVirtualRegister(RC);
// Create copy from CSR to a virtual register.
Entry->addLiveIn(*I);
BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
.addReg(*I);
// Insert the copy-back instructions right before the terminator.
for (auto *Exit : Exits)
BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
TII->get(TargetOpcode::COPY), *I)
.addReg(NewVR);
}
}
SDValue SITargetLowering::LowerFormalArguments(
SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
@ -1589,6 +1651,22 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
}
// FIXME: Does sret work properly?
if (!Info->isEntryFunction()) {
const SIRegisterInfo *TRI
= static_cast<const SISubtarget *>(Subtarget)->getRegisterInfo();
const MCPhysReg *I =
TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
if (I) {
for (; *I; ++I) {
if (AMDGPU::SReg_64RegClass.contains(*I))
RetOps.push_back(DAG.getRegister(*I, MVT::i64));
else if (AMDGPU::SReg_32RegClass.contains(*I))
RetOps.push_back(DAG.getRegister(*I, MVT::i32));
else
llvm_unreachable("Unexpected register class in CSRsViaCopy!");
}
}
}
// Update chain and glue.
RetOps[0] = Chain;
@ -1601,6 +1679,296 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
return DAG.getNode(Opc, DL, MVT::Other, RetOps);
}
SDValue SITargetLowering::LowerCallResult(
SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool IsVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
SDValue ThisVal) const {
CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg);
// Assign locations to each value returned by this call.
SmallVector<CCValAssign, 16> RVLocs;
CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
*DAG.getContext());
CCInfo.AnalyzeCallResult(Ins, RetCC);
// Copy all of the result registers out of their specified physreg.
for (unsigned i = 0; i != RVLocs.size(); ++i) {
CCValAssign VA = RVLocs[i];
SDValue Val;
if (VA.isRegLoc()) {
Val = DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag);
Chain = Val.getValue(1);
InFlag = Val.getValue(2);
} else if (VA.isMemLoc()) {
report_fatal_error("TODO: return values in memory");
} else
llvm_unreachable("unknown argument location type");
switch (VA.getLocInfo()) {
case CCValAssign::Full:
break;
case CCValAssign::BCvt:
Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
break;
case CCValAssign::ZExt:
Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val,
DAG.getValueType(VA.getValVT()));
Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
break;
case CCValAssign::SExt:
Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val,
DAG.getValueType(VA.getValVT()));
Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
break;
case CCValAssign::AExt:
Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
break;
default:
llvm_unreachable("Unknown loc info!");
}
InVals.push_back(Val);
}
return Chain;
}
// The wave scratch offset register is used as the global base pointer.
SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const {
const AMDGPUTargetMachine &TM =
static_cast<const AMDGPUTargetMachine &>(getTargetMachine());
if (!TM.enableFunctionCalls())
return AMDGPUTargetLowering::LowerCall(CLI, InVals);
SelectionDAG &DAG = CLI.DAG;
const SDLoc &DL = CLI.DL;
SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
SDValue Chain = CLI.Chain;
SDValue Callee = CLI.Callee;
bool &IsTailCall = CLI.IsTailCall;
CallingConv::ID CallConv = CLI.CallConv;
bool IsVarArg = CLI.IsVarArg;
bool IsSibCall = false;
bool IsThisReturn = false;
MachineFunction &MF = DAG.getMachineFunction();
// TODO: Implement tail calls.
IsTailCall = false;
if (IsVarArg || MF.getTarget().Options.GuaranteedTailCallOpt) {
report_fatal_error("varargs and tail calls not implemented");
}
if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Callee)) {
// FIXME: Remove this hack for function pointer types.
const GlobalValue *GV = GA->getGlobal();
assert(Callee.getValueType() == MVT::i32);
Callee = DAG.getGlobalAddress(GV, DL, MVT::i64, GA->getOffset(),
false, GA->getTargetFlags());
}
const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
// Analyze operands of the call, assigning locations to each operand.
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
CCInfo.AnalyzeCallOperands(Outs, AssignFn);
// Get a count of how many bytes are to be pushed on the stack.
unsigned NumBytes = CCInfo.getNextStackOffset();
if (IsSibCall) {
// Since we're not changing the ABI to make this a tail call, the memory
// operands are already available in the caller's incoming argument space.
NumBytes = 0;
}
// FPDiff is the byte offset of the call's argument area from the callee's.
// Stores to callee stack arguments will be placed in FixedStackSlots offset
// by this amount for a tail call. In a sibling call it must be 0 because the
// caller will deallocate the entire stack and the callee still expects its
// arguments to begin at SP+0. Completely unused for non-tail calls.
int FPDiff = 0;
SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
// Adjust the stack pointer for the new arguments...
// These operations are automatically eliminated by the prolog/epilog pass
if (!IsSibCall) {
Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, DL);
unsigned OffsetReg = Info->getScratchWaveOffsetReg();
// In the HSA case, this should be an identity copy.
SDValue ScratchRSrcReg
= DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
RegsToPass.emplace_back(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
// TODO: Don't hardcode these registers and get from the callee function.
SDValue ScratchWaveOffsetReg
= DAG.getCopyFromReg(Chain, DL, OffsetReg, MVT::i32);
RegsToPass.emplace_back(AMDGPU::SGPR4, ScratchWaveOffsetReg);
}
// Stack pointer relative accesses are done by changing the offset SGPR. This
// is just the VGPR offset component.
SDValue StackPtr = DAG.getConstant(0, DL, MVT::i32);
SmallVector<SDValue, 8> MemOpChains;
MVT PtrVT = MVT::i32;
// Walk the register/memloc assignments, inserting copies/loads.
for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e;
++i, ++realArgIdx) {
CCValAssign &VA = ArgLocs[i];
SDValue Arg = OutVals[realArgIdx];
// Promote the value if needed.
switch (VA.getLocInfo()) {
case CCValAssign::Full:
break;
case CCValAssign::BCvt:
Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
break;
case CCValAssign::ZExt:
Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
break;
case CCValAssign::SExt:
Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
break;
case CCValAssign::AExt:
Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
break;
case CCValAssign::FPExt:
Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
break;
default:
llvm_unreachable("Unknown loc info!");
}
if (VA.isRegLoc()) {
RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
} else {
assert(VA.isMemLoc());
SDValue DstAddr;
MachinePointerInfo DstInfo;
unsigned LocMemOffset = VA.getLocMemOffset();
int32_t Offset = LocMemOffset;
SDValue PtrOff = DAG.getConstant(Offset, DL, MVT::i32);
PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
if (!IsTailCall) {
SDValue PtrOff = DAG.getTargetConstant(Offset, DL, MVT::i32);
DstAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
}
if (Outs[i].Flags.isByVal()) {
SDValue SizeNode =
DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
SDValue Cpy = DAG.getMemcpy(
Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(),
/*isVol = */ false, /*AlwaysInline = */ true,
/*isTailCall = */ false,
DstInfo, MachinePointerInfo());
MemOpChains.push_back(Cpy);
} else {
SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo);
MemOpChains.push_back(Store);
}
}
}
if (!MemOpChains.empty())
Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
// Build a sequence of copy-to-reg nodes chained together with token chain
// and flag operands which copy the outgoing args into the appropriate regs.
SDValue InFlag;
for (auto &RegToPass : RegsToPass) {
Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
RegToPass.second, InFlag);
InFlag = Chain.getValue(1);
}
// We don't usually want to end the call-sequence here because we would tidy
// the frame up *after* the call, however in the ABI-changing tail-call case
// we've carefully laid out the parameters so that when sp is reset they'll be
// in the correct location.
if (IsTailCall && !IsSibCall) {
Chain = DAG.getCALLSEQ_END(Chain,
DAG.getTargetConstant(NumBytes, DL, MVT::i32),
DAG.getTargetConstant(0, DL, MVT::i32),
InFlag, DL);
InFlag = Chain.getValue(1);
}
std::vector<SDValue> Ops;
Ops.push_back(Chain);
Ops.push_back(Callee);
if (IsTailCall) {
// Each tail call may have to adjust the stack by a different amount, so
// this information must travel along with the operation for eventual
// consumption by emitEpilogue.
Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
}
// Add argument registers to the end of the list so that they are known live
// into the call.
for (auto &RegToPass : RegsToPass) {
Ops.push_back(DAG.getRegister(RegToPass.first,
RegToPass.second.getValueType()));
}
// Add a register mask operand representing the call-preserved registers.
const AMDGPURegisterInfo *TRI = Subtarget->getRegisterInfo();
const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
assert(Mask && "Missing call preserved mask for calling convention");
Ops.push_back(DAG.getRegisterMask(Mask));
if (InFlag.getNode())
Ops.push_back(InFlag);
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
// If we're doing a tail call, use a TC_RETURN here rather than an
// actual call instruction.
if (IsTailCall) {
MF.getFrameInfo().setHasTailCall();
llvm_unreachable("not implemented");
}
// Returns a chain and a flag for retval copy to use.
SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, NodeTys, Ops);
Chain = Call.getValue(0);
InFlag = Call.getValue(1);
uint64_t CalleePopBytes = 0;
Chain = DAG.getCALLSEQ_END(Chain, DAG.getTargetConstant(NumBytes, DL, MVT::i32),
DAG.getTargetConstant(CalleePopBytes, DL, MVT::i32),
InFlag, DL);
if (!Ins.empty())
InFlag = Chain.getValue(1);
// Handle result values, copying them out of physregs into vregs that we
// return.
return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG,
InVals, IsThisReturn,
IsThisReturn ? OutVals[0] : SDValue());
}
unsigned SITargetLowering::getRegisterByName(const char* RegName, EVT VT,
SelectionDAG &DAG) const {
unsigned Reg = StringSwitch<unsigned>(RegName)
@ -2266,6 +2634,27 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
MI.eraseFromParent();
return BB;
}
case AMDGPU::ADJCALLSTACKUP:
case AMDGPU::ADJCALLSTACKDOWN: {
const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
MachineInstrBuilder MIB(*MF, &MI);
MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine)
.addReg(Info->getStackPtrOffsetReg(), RegState::Implicit);
return BB;
}
case AMDGPU::SI_CALL: {
const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
const DebugLoc &DL = MI.getDebugLoc();
unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
MachineInstrBuilder MIB =
BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_SWAPPC_B64), ReturnAddrReg);
for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
MIB.add(MI.getOperand(I));
MIB.setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
MI.eraseFromParent();
return BB;
}
default:
return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
}
@ -2931,13 +3320,16 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
SDValue Op,
SelectionDAG &DAG) const {
GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
+ const GlobalValue *GV = GSD->getGlobal();
if (GSD->getAddressSpace() != AMDGPUASI.CONSTANT_ADDRESS &&
- GSD->getAddressSpace() != AMDGPUASI.GLOBAL_ADDRESS)
+ GSD->getAddressSpace() != AMDGPUASI.GLOBAL_ADDRESS &&
+ // FIXME: It isn't correct to rely on the type of the pointer. This should
+ // be removed when address space 0 is 64-bit.
+ !GV->getType()->getElementType()->isFunctionTy())
return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
SDLoc DL(GSD);
- const GlobalValue *GV = GSD->getGlobal();
EVT PtrVT = Op.getValueType();
if (shouldEmitFixup(GV))


@ -183,6 +183,12 @@ public:
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override;
bool supportSplitCSR(MachineFunction *MF) const override;
void initializeSplitCSR(MachineBasicBlock *Entry) const override;
void insertCopiesSplitCSR(
MachineBasicBlock *Entry,
const SmallVectorImpl<MachineBasicBlock *> &Exits) const override;
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv,
bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins,
@ -199,6 +205,15 @@ public:
const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL,
SelectionDAG &DAG) const override;
SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins,
const SDLoc &DL, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
SDValue ThisVal) const;
SDValue LowerCall(CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const override;
unsigned getRegisterByName(const char* RegName, EVT VT,
SelectionDAG &DAG) const override;


@ -317,6 +317,45 @@ def SI_RETURN_TO_EPILOG : SPseudoInstSI <
let DisableWQM = 1;
}
// Return for returning function calls.
def SI_RETURN : SPseudoInstSI <
(outs), (ins), [],
"; return"> {
let isTerminator = 1;
let isBarrier = 1;
let isReturn = 1;
let SchedRW = [WriteBranch];
}
// Function call pseudo; expanded to s_swappc_b64 by the custom inserter.
def SI_CALL : SPseudoInstSI <
(outs), (ins SSrc_b64:$src0), [(AMDGPUcall i64:$src0)],
"; call $src0"> {
let Size = 4;
let isCall = 1;
let SchedRW = [WriteBranch];
let usesCustomInserter = 1;
}
def ADJCALLSTACKUP : SPseudoInstSI<
(outs), (ins i32imm:$amt0, i32imm:$amt1),
[(callseq_start timm:$amt0, timm:$amt1)],
"; adjcallstackup $amt0 $amt1"> {
let Size = 8; // Worst case. (s_add_u32 + constant)
let FixedSize = 1;
let hasSideEffects = 1;
let usesCustomInserter = 1;
}
def ADJCALLSTACKDOWN : SPseudoInstSI<
(outs), (ins i32imm:$amt1, i32imm:$amt2),
[(callseq_end timm:$amt1, timm:$amt2)],
"; adjcallstackdown $amt1"> {
let Size = 8; // Worst case. (s_add_u32 + constant)
let hasSideEffects = 1;
let usesCustomInserter = 1;
}
let Defs = [M0, EXEC],
UseNamedOperandTable = 1 in {


@ -236,8 +236,15 @@ bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const
return true;
}
- bool SIRegisterInfo::requiresFrameIndexScavenging(const MachineFunction &MF) const {
- return MF.getFrameInfo().hasStackObjects();
+ bool SIRegisterInfo::requiresFrameIndexScavenging(
+ const MachineFunction &MF) const {
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ if (MFI.hasStackObjects())
+ return true;
+ // May need to deal with callee saved registers.
+ const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+ return !Info->isEntryFunction();
}
bool SIRegisterInfo::requiresFrameIndexReplacementScavenging(


@ -63,6 +63,7 @@ public:
BitVector getReservedRegs(const MachineFunction &MF) const override;
const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override;
const MCPhysReg *getCalleeSavedRegsViaCopy(const MachineFunction *MF) const;
const uint32_t *getCallPreservedMask(const MachineFunction &MF,
CallingConv::ID) const override;


@ -269,6 +269,18 @@ def VGPR_512 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7,
// Register classes used as source and destination
//===----------------------------------------------------------------------===//
def Pseudo_SReg_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
(add FP_REG, SP_REG, SCRATCH_WAVE_OFFSET_REG)> {
let isAllocatable = 0;
let CopyCost = -1;
}
def Pseudo_SReg_128 : RegisterClass<"AMDGPU", [v4i32, v2i64], 32,
(add PRIVATE_RSRC_REG)> {
let isAllocatable = 0;
let CopyCost = -1;
}
// Subset of SReg_32 without M0 for SMRD instructions and alike.
// See comments in SIInstructions.td for more info.
def SReg_32_XM0_XEXEC : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,


@ -0,0 +1,27 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -amdgpu-function-calls -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -amdgpu-function-calls -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-function-calls -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
define void @void_func_void() #2 {
ret void
}
; GCN-LABEL: {{^}}test_call_void_func_void:
define amdgpu_kernel void @test_call_void_func_void() {
call void @void_func_void()
ret void
}
define void @void_func_void_clobber_s40_s41() #2 {
call void asm sideeffect "", "~{SGPR40_SGPR41}"() #0
ret void
}
define amdgpu_kernel void @test_call_void_func_void_clobber_s40_s41() {
call void @void_func_void_clobber_s40_s41()
ret void
}
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind noinline }


@ -0,0 +1,235 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -amdgpu-function-calls -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -amdgpu-function-calls -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s
%struct.ByValStruct = type { [4 x i32] }
; GCN-LABEL: {{^}}void_func_byval_struct:
; GCN: s_mov_b32 s5, s32
; GCN: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s5{{$}}
; GCN-NOT: s32
; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s5{{$}}
; GCN-NOT: s32
; GCN: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s5 offset:16{{$}}
; GCN-NOT: s32
; GCN: buffer_store_dword [[LOAD1]], off, s[0:3], s5 offset:16{{$}}
; GCN-NOT: s32
define void @void_func_byval_struct(%struct.ByValStruct* byval noalias nocapture align 4 %arg0, %struct.ByValStruct* byval noalias nocapture align 4 %arg1) #1 {
entry:
%arrayidx = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct* %arg0, i32 0, i32 0, i32 0
%tmp = load volatile i32, i32* %arrayidx, align 4
%add = add nsw i32 %tmp, 1
store volatile i32 %add, i32* %arrayidx, align 4
%arrayidx2 = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct* %arg1, i32 0, i32 0, i32 0
%tmp1 = load volatile i32, i32* %arrayidx2, align 4
%add3 = add nsw i32 %tmp1, 2
store volatile i32 %add3, i32* %arrayidx2, align 4
store volatile i32 9, i32 addrspace(1)* null, align 4
ret void
}
; GCN-LABEL: {{^}}void_func_byval_struct_non_leaf:
; GCN: s_mov_b32 s5, s32
; GCN: buffer_store_dword v32
; GCN: v_writelane_b32
; GCN-DAG: s_add_u32 s32, s32, 0x900{{$}}
; GCN-DAG: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s5{{$}}
; GCN: v_add_i32_e32 [[ADD0:v[0-9]+]], vcc, 1, [[LOAD0]]
; GCN: buffer_store_dword [[ADD0]], off, s[0:3], s5{{$}}
; GCN: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s5 offset:16{{$}}
; GCN: v_add_i32_e32 [[ADD1:v[0-9]+]], vcc, 2, [[LOAD1]]
; GCN: s_swappc_b64
; GCN: buffer_store_dword [[ADD1]], off, s[0:3], s5 offset:16{{$}}
; GCN: v_readlane_b32
; GCN: buffer_load_dword v32,
; GCN: s_sub_u32 s32, s32, 0x900{{$}}
; GCN: s_setpc_b64
define void @void_func_byval_struct_non_leaf(%struct.ByValStruct* byval noalias nocapture align 4 %arg0, %struct.ByValStruct* byval noalias nocapture align 4 %arg1) #1 {
entry:
%arrayidx = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct* %arg0, i32 0, i32 0, i32 0
%tmp = load volatile i32, i32* %arrayidx, align 4
%add = add nsw i32 %tmp, 1
store volatile i32 %add, i32* %arrayidx, align 4
%arrayidx2 = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct* %arg1, i32 0, i32 0, i32 0
%tmp1 = load volatile i32, i32* %arrayidx2, align 4
%add3 = add nsw i32 %tmp1, 2
call void @external_void_func_void()
store volatile i32 %add3, i32* %arrayidx2, align 4
store volatile i32 9, i32 addrspace(1)* null, align 4
ret void
}
; GCN-LABEL: {{^}}call_void_func_byval_struct_func:
; GCN: s_mov_b32 s5, s32
; GCN: s_add_u32 s32, s32, 0xa00{{$}}
; GCN: v_writelane_b32
; GCN-DAG: s_add_u32 s32, s32, 0x800{{$}}
; GCN-DAG: v_mov_b32_e32 [[NINE:v[0-9]+]], 9
; GCN-DAG: v_mov_b32_e32 [[THIRTEEN:v[0-9]+]], 13
; VI-DAG: v_lshrrev_b32_e64 v{{[0-9]+}}, 6
; CI-DAG: v_lshr_b32_e64 v{{[0-9]+}}, s{{[0-9]+}}, 6
; GCN-DAG: v_add_i32_e64 [[FI_ADD0:v[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 8,
; GCN-DAG: v_or_b32_e32 [[FI_OR0:v[0-9]+]], 4, [[FI_ADD0]]
; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s5 offset:8
; GCN-DAG: buffer_store_dword [[THIRTEEN]], off, s[0:3], s5 offset:24
; GCN-DAG: buffer_load_dword [[LOAD0:v[0-9]+]], [[FI_OR0]], s[0:3], s4 offen offset:4
; GCN-DAG: buffer_load_dword [[LOAD1:v[0-9]+]], [[FI_OR0]], s[0:3], s4 offen offset:8
; FIXME: or fails to combine with add, so FI doesn't fold and scratch wave offset is used
; VI-DAG: v_lshrrev_b32_e64 v{{[0-9]+}}, 6
; CI-DAG: v_lshr_b32_e64 v{{[0-9]+}}, s{{[0-9]+}}, 6
; GCN-DAG: v_add_i32_e64 [[FI_ADD1:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 24,
; GCN-DAG: v_or_b32_e32 [[FI_OR1:v[0-9]+]], 4, [[FI_ADD1]]
; GCN-DAG: buffer_store_dword [[LOAD0]], off, s[0:3], s32 offset:8
; GCN-DAG: buffer_store_dword [[LOAD1]], off, s[0:3], s32 offset:12
; GCN-DAG: buffer_load_dword [[LOAD2:v[0-9]+]], off, s[0:3], s5 offset:8
; GCN-DAG: buffer_load_dword [[LOAD3:v[0-9]+]], off, s[0:3], s5 offset:12
; GCN-DAG: buffer_store_dword [[LOAD2]], off, s[0:3], s32{{$}}
; GCN-DAG: buffer_store_dword [[LOAD3]], off, s[0:3], s32 offset:4
; GCN: buffer_load_dword [[LOAD4:v[0-9]+]], [[FI_OR1]], s[0:3], s4 offen offset:4
; GCN: buffer_load_dword [[LOAD5:v[0-9]+]], [[FI_OR1]], s[0:3], s4 offen offset:8
; GCN: buffer_store_dword [[LOAD5]], off, s[0:3], s32 offset:28
; GCN: buffer_store_dword [[LOAD4]], off, s[0:3], s32 offset:24
; GCN-DAG: buffer_load_dword [[LOAD6:v[0-9]+]], off, s[0:3], s5 offset:24
; GCN-DAG: buffer_load_dword [[LOAD7:v[0-9]+]], off, s[0:3], s5 offset:28
; GCN-DAG: buffer_store_dword [[LOAD6]], off, s[0:3], s32 offset:16
; GCN-DAG: buffer_store_dword [[LOAD7]], off, s[0:3], s32 offset:20
; GCN: s_swappc_b64
; GCN-NEXT: s_sub_u32 s32, s32, 0x800{{$}}
; GCN: v_readlane_b32
; GCN: s_sub_u32 s32, s32, 0xa00{{$}}
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @call_void_func_byval_struct_func() #0 {
entry:
%arg0 = alloca %struct.ByValStruct, align 4
%arg1 = alloca %struct.ByValStruct, align 4
%tmp = bitcast %struct.ByValStruct* %arg0 to i8*
call void @llvm.lifetime.start.p0i8(i64 32, i8* %tmp)
%tmp1 = bitcast %struct.ByValStruct* %arg1 to i8*
call void @llvm.lifetime.start.p0i8(i64 32, i8* %tmp1)
%arrayidx = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct* %arg0, i32 0, i32 0, i32 0
store volatile i32 9, i32* %arrayidx, align 4
%arrayidx2 = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct* %arg1, i32 0, i32 0, i32 0
store volatile i32 13, i32* %arrayidx2, align 4
call void @void_func_byval_struct(%struct.ByValStruct* byval nonnull align 4 %arg0, %struct.ByValStruct* byval nonnull align 4 %arg1)
call void @llvm.lifetime.end.p0i8(i64 32, i8* %tmp1)
call void @llvm.lifetime.end.p0i8(i64 32, i8* %tmp)
ret void
}
; GCN-LABEL: {{^}}call_void_func_byval_struct_kernel:
; GCN: s_mov_b32 s33, s7
; GCN: s_add_u32 s32, s33, 0xa00{{$}}
; GCN-DAG: v_mov_b32_e32 [[NINE:v[0-9]+]], 9
; GCN-DAG: v_mov_b32_e32 [[THIRTEEN:v[0-9]+]], 13
; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s33 offset:8
; GCN: buffer_store_dword [[THIRTEEN]], off, s[0:3], s33 offset:24
; GCN-DAG: s_add_u32 s32, s32, 0x800{{$}}
; FIXME: Fold offset
; GCN-DAG: v_or_b32_e32 [[OR_FI0:v[0-9]+]], 4,
; GCN: buffer_load_dword [[LOAD0:v[0-9]+]], [[OR_FI0]], s[0:3], s33 offen offset:4
; GCN: buffer_load_dword [[LOAD1:v[0-9]+]], [[OR_FI0]], s[0:3], s33 offen offset:8
; FIXME: Fold offset
; GCN-DAG: v_or_b32_e32 [[OR_FI1:v[0-9]+]], 4,
; GCN: buffer_store_dword [[LOAD1]], off, s[0:3], s32 offset:12
; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s32 offset:8
; GCN-DAG: buffer_load_dword [[LOAD2:v[0-9]+]], off, s[0:3], s33 offset:8
; GCN-DAG: buffer_load_dword [[LOAD3:v[0-9]+]], off, s[0:3], s33 offset:12
; GCN: buffer_store_dword [[LOAD3]], off, s[0:3], s32 offset:4
; GCN: buffer_store_dword [[LOAD2]], off, s[0:3], s32{{$}}
; GCN: buffer_load_dword [[LOAD4:v[0-9]+]], [[OR_FI1]], s[0:3], s33 offen offset:4
; GCN: buffer_load_dword [[LOAD5:v[0-9]+]], [[OR_FI1]], s[0:3], s33 offen offset:8
; GCN: buffer_store_dword [[LOAD5]], off, s[0:3], s32 offset:28
; GCN: buffer_store_dword [[LOAD4]], off, s[0:3], s32 offset:24
; GCN-DAG: buffer_load_dword [[LOAD6:v[0-9]+]], off, s[0:3], s33 offset:24
; GCN-DAG: buffer_load_dword [[LOAD7:v[0-9]+]], off, s[0:3], s33 offset:28
; GCN-DAG: buffer_store_dword [[LOAD6]], off, s[0:3], s32 offset:16
; GCN-DAG: buffer_store_dword [[LOAD7]], off, s[0:3], s32 offset:20
; GCN: s_swappc_b64
; FIXME: Dead SP modification
; GCN-NEXT: s_sub_u32 s32, s32, 0x800{{$}}
; GCN-NEXT: s_endpgm
define amdgpu_kernel void @call_void_func_byval_struct_kernel() #0 {
entry:
%arg0 = alloca %struct.ByValStruct, align 4
%arg1 = alloca %struct.ByValStruct, align 4
%tmp = bitcast %struct.ByValStruct* %arg0 to i8*
call void @llvm.lifetime.start.p0i8(i64 32, i8* %tmp)
%tmp1 = bitcast %struct.ByValStruct* %arg1 to i8*
call void @llvm.lifetime.start.p0i8(i64 32, i8* %tmp1)
%arrayidx = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct* %arg0, i32 0, i32 0, i32 0
store volatile i32 9, i32* %arrayidx, align 4
%arrayidx2 = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct* %arg1, i32 0, i32 0, i32 0
store volatile i32 13, i32* %arrayidx2, align 4
call void @void_func_byval_struct(%struct.ByValStruct* byval nonnull align 4 %arg0, %struct.ByValStruct* byval nonnull align 4 %arg1)
call void @llvm.lifetime.end.p0i8(i64 32, i8* %tmp1)
call void @llvm.lifetime.end.p0i8(i64 32, i8* %tmp)
ret void
}
; GCN-LABEL: {{^}}call_void_func_byval_struct_kernel_no_frame_pointer_elim:
define amdgpu_kernel void @call_void_func_byval_struct_kernel_no_frame_pointer_elim() #2 {
entry:
%arg0 = alloca %struct.ByValStruct, align 4
%arg1 = alloca %struct.ByValStruct, align 4
%tmp = bitcast %struct.ByValStruct* %arg0 to i8*
call void @llvm.lifetime.start.p0i8(i64 32, i8* %tmp)
%tmp1 = bitcast %struct.ByValStruct* %arg1 to i8*
call void @llvm.lifetime.start.p0i8(i64 32, i8* %tmp1)
%arrayidx = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct* %arg0, i32 0, i32 0, i32 0
store volatile i32 9, i32* %arrayidx, align 4
%arrayidx2 = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct* %arg1, i32 0, i32 0, i32 0
store volatile i32 13, i32* %arrayidx2, align 4
call void @void_func_byval_struct(%struct.ByValStruct* byval nonnull align 4 %arg0, %struct.ByValStruct* byval nonnull align 4 %arg1)
call void @llvm.lifetime.end.p0i8(i64 32, i8* %tmp1)
call void @llvm.lifetime.end.p0i8(i64 32, i8* %tmp)
ret void
}
declare void @external_void_func_void() #0
declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #3
declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #3
attributes #0 = { nounwind }
attributes #1 = { noinline norecurse nounwind }
attributes #2 = { nounwind norecurse "no-frame-pointer-elim"="true" }


@ -0,0 +1,527 @@
; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -amdgpu-function-calls -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,MESA %s
; RUN: llc -march=amdgcn -mcpu=hawaii -amdgpu-function-calls -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI,MESA %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-function-calls -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,VI,MESA %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -mattr=-flat-for-global -amdgpu-function-calls -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,HSA %s
declare void @external_void_func_i1(i1) #0
declare void @external_void_func_i1_signext(i1 signext) #0
declare void @external_void_func_i1_zeroext(i1 zeroext) #0
declare void @external_void_func_i8(i8) #0
declare void @external_void_func_i8_signext(i8 signext) #0
declare void @external_void_func_i8_zeroext(i8 zeroext) #0
declare void @external_void_func_i16(i16) #0
declare void @external_void_func_i16_signext(i16 signext) #0
declare void @external_void_func_i16_zeroext(i16 zeroext) #0
declare void @external_void_func_i32(i32) #0
declare void @external_void_func_i64(i64) #0
declare void @external_void_func_f16(half) #0
declare void @external_void_func_f32(float) #0
declare void @external_void_func_f64(double) #0
declare void @external_void_func_v2i16(<2 x i16>) #0
declare void @external_void_func_v2f16(<2 x half>) #0
declare void @external_void_func_v2i32(<2 x i32>) #0
declare void @external_void_func_v3i32(<3 x i32>) #0
declare void @external_void_func_v4i32(<4 x i32>) #0
declare void @external_void_func_v8i32(<8 x i32>) #0
declare void @external_void_func_v16i32(<16 x i32>) #0
declare void @external_void_func_v32i32(<32 x i32>) #0
declare void @external_void_func_v32i32_i32(<32 x i32>, i32) #0
; return value and argument
declare i32 @external_i32_func_i32(i32) #0
; Structs
declare void @external_void_func_struct_i8_i32({ i8, i32 }) #0
declare void @external_void_func_byval_struct_i8_i32({ i8, i32 }* byval) #0
declare void @external_void_func_sret_struct_i8_i32_byval_struct_i8_i32({ i8, i32 }* sret, { i8, i32 }* byval) #0
declare void @external_void_func_v16i8(<16 x i8>) #0
; FIXME: Should be passing -1
; GCN-LABEL: {{^}}test_call_external_void_func_i1_imm:
; MESA: s_mov_b32 s36, SCRATCH_RSRC_DWORD
; MESA-DAG: s_mov_b64 s[0:1], s[36:37]
; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i1@rel32@lo+4
; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i1@rel32@hi+4
; GCN-DAG: v_mov_b32_e32 v0, 1{{$}}
; MESA-DAG: s_mov_b64 s[2:3], s[38:39]
; GCN: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
; GCN-NEXT: s_endpgm
define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 {
call void @external_void_func_i1(i1 true)
ret void
}
; GCN-LABEL: {{^}}test_call_external_void_func_i1_signext:
; MESA: s_mov_b32 s33, s3{{$}}
; HSA: s_mov_b32 s33, s9{{$}}
; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i1_signext@rel32@lo+4
; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i1_signext@rel32@hi+4
; GCN-NEXT: buffer_load_ubyte [[VAR:v[0-9]+]]
; HSA-NEXT: s_mov_b32 s4, s33
; HSA-NEXT: s_mov_b32 s32, s33
; MESA-DAG: s_mov_b32 s4, s33{{$}}
; MESA-DAG: s_mov_b32 s32, s33{{$}}
; GCN: s_waitcnt vmcnt(0)
; GCN-NEXT: v_bfe_i32 v0, v0, 0, 1
; GCN-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
; GCN-NEXT: s_endpgm
define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 {
%var = load volatile i1, i1 addrspace(1)* undef
call void @external_void_func_i1_signext(i1 %var)
ret void
}
; FIXME: load should be scheduled before getpc
; GCN-LABEL: {{^}}test_call_external_void_func_i1_zeroext:
; MESA: s_mov_b32 s33, s3{{$}}
; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i1_zeroext@rel32@lo+4
; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i1_zeroext@rel32@hi+4
; GCN-NEXT: buffer_load_ubyte v0
; GCN-DAG: s_mov_b32 s4, s33{{$}}
; GCN-DAG: s_mov_b32 s32, s33{{$}}
; GCN: s_waitcnt vmcnt(0)
; GCN-NEXT: v_and_b32_e32 v0, 1, v0
; GCN-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
; GCN-NEXT: s_endpgm
define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 {
%var = load volatile i1, i1 addrspace(1)* undef
call void @external_void_func_i1_zeroext(i1 %var)
ret void
}
; GCN-LABEL: {{^}}test_call_external_void_func_i8_imm:
; MESA-DAG: s_mov_b32 s33, s3{{$}}
; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i8@rel32@lo+4
; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i8@rel32@hi+4
; GCN-NEXT: v_mov_b32_e32 v0, 0x7b
; HSA-DAG: s_mov_b32 s4, s33{{$}}
; GCN-DAG: s_mov_b32 s32, s33{{$}}
; GCN: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
; GCN-NEXT: s_endpgm
define amdgpu_kernel void @test_call_external_void_func_i8_imm(i32) #0 {
call void @external_void_func_i8(i8 123)
ret void
}
; FIXME: don't wait before call
; GCN-LABEL: {{^}}test_call_external_void_func_i8_signext:
; HSA-DAG: s_mov_b32 s33, s9{{$}}
; MESA-DAG: s_mov_b32 s33, s3{{$}}
; GCN-DAG: buffer_load_sbyte v0
; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i8_signext@rel32@lo+4
; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i8_signext@rel32@hi+4
; GCN-DAG: s_mov_b32 s4, s33
; GCN-DAG: s_mov_b32 s32, s3
; GCN: s_waitcnt vmcnt(0)
; GCN-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
; GCN-NEXT: s_endpgm
define amdgpu_kernel void @test_call_external_void_func_i8_signext(i32) #0 {
%var = load volatile i8, i8 addrspace(1)* undef
call void @external_void_func_i8_signext(i8 %var)
ret void
}
; GCN-LABEL: {{^}}test_call_external_void_func_i8_zeroext:
; MESA-DAG: s_mov_b32 s33, s3{{$}}
; HSA-DAG: s_mov_b32 s33, s9{{$}}
; GCN-DAG: buffer_load_ubyte v0
; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i8_zeroext@rel32@lo+4
; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i8_zeroext@rel32@hi+4
; GCN-DAG: s_mov_b32 s4, s33
; GCN-DAG: s_mov_b32 s32, s33
; GCN: s_waitcnt vmcnt(0)
; GCN-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
; GCN-NEXT: s_endpgm
define amdgpu_kernel void @test_call_external_void_func_i8_zeroext(i32) #0 {
%var = load volatile i8, i8 addrspace(1)* undef
call void @external_void_func_i8_zeroext(i8 %var)
ret void
}
; GCN-LABEL: {{^}}test_call_external_void_func_i16_imm:
; GCN-DAG: v_mov_b32_e32 v0, 0x7b{{$}}
; GCN-DAG: s_mov_b32 s4, s33
; GCN-DAG: s_mov_b32 s32, s33
; GCN: s_swappc_b64
define amdgpu_kernel void @test_call_external_void_func_i16_imm() #0 {
call void @external_void_func_i16(i16 123)
ret void
}
; GCN-LABEL: {{^}}test_call_external_void_func_i16_signext:
; MESA-DAG: s_mov_b32 s33, s3{{$}}
; GCN-DAG: buffer_load_sshort v0
; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i16_signext@rel32@lo+4
; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i16_signext@rel32@hi+4
; GCN-DAG: s_mov_b32 s4, s33
; GCN-DAG: s_mov_b32 s32, s33
; GCN: s_waitcnt vmcnt(0)
; GCN-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
; GCN-NEXT: s_endpgm
define amdgpu_kernel void @test_call_external_void_func_i16_signext(i32) #0 {
%var = load volatile i16, i16 addrspace(1)* undef
call void @external_void_func_i16_signext(i16 %var)
ret void
}
; GCN-LABEL: {{^}}test_call_external_void_func_i16_zeroext:
; MESA-DAG: s_mov_b32 s33, s3{{$}}
; GCN-DAG: buffer_load_ushort v0
; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i16_zeroext@rel32@lo+4
; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i16_zeroext@rel32@hi+4
; GCN-DAG: s_mov_b32 s4, s33
; GCN-DAG: s_mov_b32 s32, s33
; GCN: s_waitcnt vmcnt(0)
; GCN-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
; GCN-NEXT: s_endpgm
define amdgpu_kernel void @test_call_external_void_func_i16_zeroext(i32) #0 {
%var = load volatile i16, i16 addrspace(1)* undef
call void @external_void_func_i16_zeroext(i16 %var)
ret void
}
; GCN-LABEL: {{^}}test_call_external_void_func_i32_imm:
; MESA-DAG: s_mov_b32 s33, s3{{$}}
; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i32@rel32@lo+4
; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i32@rel32@hi+4
; GCN: v_mov_b32_e32 v0, 42
; GCN-DAG: s_mov_b32 s4, s33
; GCN-DAG: s_mov_b32 s32, s33
; GCN: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
; GCN-NEXT: s_endpgm
define amdgpu_kernel void @test_call_external_void_func_i32_imm(i32) #0 {
call void @external_void_func_i32(i32 42)
ret void
}
; GCN-LABEL: {{^}}test_call_external_void_func_i64_imm:
; GCN-DAG: s_movk_i32 [[K0:s[0-9]+]], 0x7b{{$}}
; GCN-DAG: s_mov_b32 [[K1:s[0-9]+]], 0{{$}}
; GCN-DAG: v_mov_b32_e32 v0, [[K0]]
; GCN-DAG: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
; GCN-DAG: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i64@rel32@lo+4
; GCN-DAG: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i64@rel32@hi+4
; GCN-DAG: v_mov_b32_e32 v1, [[K1]]
; GCN: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
; GCN-NEXT: s_endpgm
define amdgpu_kernel void @test_call_external_void_func_i64_imm() #0 {
call void @external_void_func_i64(i64 123)
ret void
}
; GCN-LABEL: {{^}}test_call_external_void_func_f16_imm:
; VI: v_mov_b32_e32 v0, 0x4400
; CI: v_mov_b32_e32 v0, 4.0
; GCN-NOT: v0
; GCN: s_swappc_b64
define amdgpu_kernel void @test_call_external_void_func_f16_imm() #0 {
call void @external_void_func_f16(half 4.0)
ret void
}
; GCN-LABEL: {{^}}test_call_external_void_func_f32_imm:
; GCN: v_mov_b32_e32 v0, 4.0
; GCN-NOT: v0
; GCN: s_swappc_b64
define amdgpu_kernel void @test_call_external_void_func_f32_imm() #0 {
call void @external_void_func_f32(float 4.0)
ret void
}
; GCN-LABEL: {{^}}test_call_external_void_func_f64_imm:
; GCN: v_mov_b32_e32 v0, 0{{$}}
; GCN: v_mov_b32_e32 v1, 0x40100000
; GCN: s_swappc_b64
define amdgpu_kernel void @test_call_external_void_func_f64_imm() #0 {
call void @external_void_func_f64(double 4.0)
ret void
}
; GCN-LABEL: {{^}}test_call_external_void_func_v2i16:
; GFX9: buffer_load_dword v0
; GFX9-NOT: v0
; GFX9: s_swappc_b64
define amdgpu_kernel void @test_call_external_void_func_v2i16() #0 {
%val = load <2 x i16>, <2 x i16> addrspace(1)* undef
call void @external_void_func_v2i16(<2 x i16> %val)
ret void
}
; GCN-LABEL: {{^}}test_call_external_void_func_v2f16:
; GFX9: buffer_load_dword v0
; GFX9-NOT: v0
; GFX9: s_swappc_b64
define amdgpu_kernel void @test_call_external_void_func_v2f16() #0 {
%val = load <2 x half>, <2 x half> addrspace(1)* undef
call void @external_void_func_v2f16(<2 x half> %val)
ret void
}
; GCN-LABEL: {{^}}test_call_external_void_func_v2i32:
; GCN: buffer_load_dwordx2 v[0:1]
; GCN: s_waitcnt
; GCN-NEXT: s_swappc_b64
define amdgpu_kernel void @test_call_external_void_func_v2i32() #0 {
%val = load <2 x i32>, <2 x i32> addrspace(1)* undef
call void @external_void_func_v2i32(<2 x i32> %val)
ret void
}
; FIXME: Passing 4th
; GCN-LABEL: {{^}}test_call_external_void_func_v3i32_imm:
; HSA-DAG: s_mov_b32 s33, s9
; MESA-DAG: s_mov_b32 s33, s3{{$}}
; GCN-DAG: v_mov_b32_e32 v0
; GCN-DAG: v_mov_b32_e32 v1
; GCN-DAG: v_mov_b32_e32 v2
; GCN-DAG: v_mov_b32_e32 v3
; GCN: s_swappc_b64
define amdgpu_kernel void @test_call_external_void_func_v3i32_imm(i32) #0 {
call void @external_void_func_v3i32(<3 x i32> <i32 3, i32 4, i32 5>)
ret void
}
; GCN-LABEL: {{^}}test_call_external_void_func_v4i32:
; GCN: buffer_load_dwordx4 v[0:3]
; GCN: s_waitcnt
; GCN-NEXT: s_swappc_b64
define amdgpu_kernel void @test_call_external_void_func_v4i32() #0 {
%val = load <4 x i32>, <4 x i32> addrspace(1)* undef
call void @external_void_func_v4i32(<4 x i32> %val)
ret void
}
; GCN-LABEL: {{^}}test_call_external_void_func_v8i32:
; GCN-DAG: buffer_load_dwordx4 v[0:3], off
; GCN-DAG: buffer_load_dwordx4 v[4:7], off
; GCN: s_waitcnt
; GCN-NEXT: s_swappc_b64
define amdgpu_kernel void @test_call_external_void_func_v8i32() #0 {
%ptr = load <8 x i32> addrspace(1)*, <8 x i32> addrspace(1)* addrspace(2)* undef
%val = load <8 x i32>, <8 x i32> addrspace(1)* %ptr
call void @external_void_func_v8i32(<8 x i32> %val)
ret void
}
; GCN-LABEL: {{^}}test_call_external_void_func_v16i32:
; GCN-DAG: buffer_load_dwordx4 v[0:3], off
; GCN-DAG: buffer_load_dwordx4 v[4:7], off
; GCN-DAG: buffer_load_dwordx4 v[8:11], off
; GCN-DAG: buffer_load_dwordx4 v[12:15], off
; GCN: s_waitcnt
; GCN-NEXT: s_swappc_b64
define amdgpu_kernel void @test_call_external_void_func_v16i32() #0 {
%ptr = load <16 x i32> addrspace(1)*, <16 x i32> addrspace(1)* addrspace(2)* undef
%val = load <16 x i32>, <16 x i32> addrspace(1)* %ptr
call void @external_void_func_v16i32(<16 x i32> %val)
ret void
}
; GCN-LABEL: {{^}}test_call_external_void_func_v32i32:
; GCN-DAG: buffer_load_dwordx4 v[0:3], off
; GCN-DAG: buffer_load_dwordx4 v[4:7], off
; GCN-DAG: buffer_load_dwordx4 v[8:11], off
; GCN-DAG: buffer_load_dwordx4 v[12:15], off
; GCN-DAG: buffer_load_dwordx4 v[16:19], off
; GCN-DAG: buffer_load_dwordx4 v[20:23], off
; GCN-DAG: buffer_load_dwordx4 v[24:27], off
; GCN-DAG: buffer_load_dwordx4 v[28:31], off
; GCN: s_waitcnt
; GCN-NEXT: s_swappc_b64
define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 {
%ptr = load <32 x i32> addrspace(1)*, <32 x i32> addrspace(1)* addrspace(2)* undef
%val = load <32 x i32>, <32 x i32> addrspace(1)* %ptr
call void @external_void_func_v32i32(<32 x i32> %val)
ret void
}
; GCN-LABEL: {{^}}test_call_external_void_func_v32i32_i32:
; HSA-DAG: s_mov_b32 s33, s9
; HSA-DAG: s_add_u32 [[SP_REG:s[0-9]+]], s33, 0x100{{$}}
; MESA-DAG: s_mov_b32 s33, s3{{$}}
; MESA-DAG: s_add_u32 [[SP_REG:s[0-9]+]], s33, 0x100{{$}}
; GCN-DAG: buffer_load_dword [[VAL1:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
; GCN-DAG: buffer_load_dwordx4 v[0:3], off
; GCN-DAG: buffer_load_dwordx4 v[4:7], off
; GCN-DAG: buffer_load_dwordx4 v[8:11], off
; GCN-DAG: buffer_load_dwordx4 v[12:15], off
; GCN-DAG: buffer_load_dwordx4 v[16:19], off
; GCN-DAG: buffer_load_dwordx4 v[20:23], off
; GCN-DAG: buffer_load_dwordx4 v[24:27], off
; GCN-DAG: buffer_load_dwordx4 v[28:31], off
; GCN: buffer_store_dword [[VAL1]], off, s[{{[0-9]+}}:{{[0-9]+}}], [[SP_REG]]{{$}}
; GCN: s_waitcnt
; GCN-NEXT: s_swappc_b64
; GCN-NEXT: s_endpgm
define amdgpu_kernel void @test_call_external_void_func_v32i32_i32(i32) #0 {
%ptr0 = load <32 x i32> addrspace(1)*, <32 x i32> addrspace(1)* addrspace(2)* undef
%val0 = load <32 x i32>, <32 x i32> addrspace(1)* %ptr0
%val1 = load i32, i32 addrspace(1)* undef
call void @external_void_func_v32i32_i32(<32 x i32> %val0, i32 %val1)
ret void
}
; FIXME: No wait after call
; GCN-LABEL: {{^}}test_call_external_i32_func_i32_imm:
; GCN: v_mov_b32_e32 v0, 42
; GCN: s_swappc_b64 s[30:31],
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[36:39], 0
define amdgpu_kernel void @test_call_external_i32_func_i32_imm(i32 addrspace(1)* %out) #0 {
%val = call i32 @external_i32_func_i32(i32 42)
store volatile i32 %val, i32 addrspace(1)* %out
ret void
}
; GCN-LABEL: {{^}}test_call_external_void_func_struct_i8_i32:
; GCN: buffer_load_ubyte v0, off
; GCN: buffer_load_dword v1, off
; GCN: s_waitcnt vmcnt(0)
; GCN-NEXT: s_swappc_b64
define amdgpu_kernel void @test_call_external_void_func_struct_i8_i32() #0 {
%ptr0 = load { i8, i32 } addrspace(1)*, { i8, i32 } addrspace(1)* addrspace(2)* undef
%val = load { i8, i32 }, { i8, i32 } addrspace(1)* %ptr0
call void @external_void_func_struct_i8_i32({ i8, i32 } %val)
ret void
}
; GCN-LABEL: {{^}}test_call_external_void_func_byval_struct_i8_i32:
; GCN-DAG: s_add_u32 [[SP:s[0-9]+]], s33, 0x400{{$}}
; GCN-DAG: v_mov_b32_e32 [[VAL0:v[0-9]+]], 3
; GCN-DAG: v_mov_b32_e32 [[VAL1:v[0-9]+]], 8
; MESA-DAG: buffer_store_byte [[VAL0]], off, s[36:39], s33 offset:8
; MESA-DAG: buffer_store_dword [[VAL1]], off, s[36:39], s33 offset:12
; HSA-DAG: buffer_store_byte [[VAL0]], off, s[0:3], s33 offset:8
; HSA-DAG: buffer_store_dword [[VAL1]], off, s[0:3], s33 offset:12
; GCN: s_add_u32 [[SP]], [[SP]], 0x200
; HSA: buffer_load_dword [[RELOAD_VAL0:v[0-9]+]], off, s[0:3], s33 offset:8
; HSA: buffer_load_dword [[RELOAD_VAL1:v[0-9]+]], off, s[0:3], s33 offset:12
; HSA: buffer_store_dword [[RELOAD_VAL1]], off, s[0:3], [[SP]] offset:4
; HSA: buffer_store_dword [[RELOAD_VAL0]], off, s[0:3], [[SP]]{{$}}
; MESA: buffer_load_dword [[RELOAD_VAL0:v[0-9]+]], off, s[36:39], s33 offset:8
; MESA: buffer_load_dword [[RELOAD_VAL1:v[0-9]+]], off, s[36:39], s33 offset:12
; MESA: buffer_store_dword [[RELOAD_VAL1]], off, s[36:39], [[SP]] offset:4
; MESA: buffer_store_dword [[RELOAD_VAL0]], off, s[36:39], [[SP]]{{$}}
; GCN-NEXT: s_swappc_b64
; GCN-NEXT: s_sub_u32 [[SP]], [[SP]], 0x200
define amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0 {
%val = alloca { i8, i32 }, align 4
%gep0 = getelementptr inbounds { i8, i32 }, { i8, i32 }* %val, i32 0, i32 0
%gep1 = getelementptr inbounds { i8, i32 }, { i8, i32 }* %val, i32 0, i32 1
store i8 3, i8* %gep0
store i32 8, i32* %gep1
call void @external_void_func_byval_struct_i8_i32({ i8, i32 }* %val)
ret void
}
; GCN-LABEL: {{^}}test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32:
; MESA-DAG: s_add_u32 [[SP:s[0-9]+]], [[FP_REG:s[0-9]+]], 0x600{{$}}
; HSA-DAG: s_add_u32 [[SP:s[0-9]+]], [[FP_REG:s[0-9]+]], 0x600{{$}}
; GCN-DAG: v_mov_b32_e32 [[VAL0:v[0-9]+]], 3
; GCN-DAG: v_mov_b32_e32 [[VAL1:v[0-9]+]], 8
; GCN-DAG: buffer_store_byte [[VAL0]], off, s{{\[[0-9]+:[0-9]+\]}}, [[FP_REG]] offset:8
; GCN-DAG: buffer_store_dword [[VAL1]], off, s{{\[[0-9]+:[0-9]+\]}}, [[FP_REG]] offset:12
; GCN-DAG: buffer_load_dword [[RELOAD_VAL0:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, [[FP_REG]] offset:8
; GCN-DAG: buffer_load_dword [[RELOAD_VAL1:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, [[FP_REG]] offset:12
; GCN-DAG: s_add_u32 [[SP]], [[SP]], 0x200
; GCN: buffer_store_dword [[RELOAD_VAL1]], off, s{{\[[0-9]+:[0-9]+\]}}, [[SP]] offset:4
; GCN: buffer_store_dword [[RELOAD_VAL0]], off, s{{\[[0-9]+:[0-9]+\]}}, [[SP]]{{$}}
; GCN-NEXT: s_swappc_b64
; GCN-DAG: buffer_load_ubyte [[LOAD_OUT_VAL0:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, [[FP_REG]] offset:16
; GCN-DAG: buffer_load_dword [[LOAD_OUT_VAL1:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, [[FP_REG]] offset:20
; GCN: s_sub_u32 [[SP]], [[SP]], 0x200
; GCN: buffer_store_byte [[LOAD_OUT_VAL0]], off
; GCN: buffer_store_dword [[LOAD_OUT_VAL1]], off
define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32(i32) #0 {
%in.val = alloca { i8, i32 }, align 4
%out.val = alloca { i8, i32 }, align 4
%in.gep0 = getelementptr inbounds { i8, i32 }, { i8, i32 }* %in.val, i32 0, i32 0
%in.gep1 = getelementptr inbounds { i8, i32 }, { i8, i32 }* %in.val, i32 0, i32 1
store i8 3, i8* %in.gep0
store i32 8, i32* %in.gep1
call void @external_void_func_sret_struct_i8_i32_byval_struct_i8_i32({ i8, i32 }* %out.val, { i8, i32 }* %in.val)
%out.gep0 = getelementptr inbounds { i8, i32 }, { i8, i32 }* %out.val, i32 0, i32 0
%out.gep1 = getelementptr inbounds { i8, i32 }, { i8, i32 }* %out.val, i32 0, i32 1
%out.val0 = load i8, i8* %out.gep0
%out.val1 = load i32, i32* %out.gep1
store volatile i8 %out.val0, i8 addrspace(1)* undef
store volatile i32 %out.val1, i32 addrspace(1)* undef
ret void
}
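; The first argument acts as an sret-style out-pointer: the callee writes its
; { i8, i32 } result through it into the caller's %out.val slot (the frame
; loads at offsets 16 and 20 above), which the caller then stores out.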
; GCN-LABEL: {{^}}test_call_external_void_func_v16i8:
define amdgpu_kernel void @test_call_external_void_func_v16i8() #0 {
%ptr = load <16 x i8> addrspace(1)*, <16 x i8> addrspace(1)* addrspace(2)* undef
%val = load <16 x i8>, <16 x i8> addrspace(1)* %ptr
call void @external_void_func_v16i8(<16 x i8> %val)
ret void
}
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind noinline }



@ -0,0 +1,251 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -amdgpu-function-calls -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -amdgpu-function-calls -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-function-calls -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
declare void @external_void_func_void() #0
; GCN-LABEL: {{^}}test_kernel_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void:
; GCN: s_mov_b32 s33, s7
; GCN: s_getpc_b64 s[34:35]
; GCN-NEXT: s_add_u32 s34, s34,
; GCN-NEXT: s_addc_u32 s35, s35,
; GCN-NEXT: s_mov_b32 s4, s33
; GCN-NEXT: s_mov_b32 s32, s33
; GCN: s_swappc_b64 s[30:31], s[34:35]
; GCN-NEXT: s_mov_b32 s4, s33
; GCN-NEXT: #ASMSTART
; GCN-NEXT: #ASMEND
; GCN-NEXT: s_swappc_b64 s[30:31], s[34:35]
define amdgpu_kernel void @test_kernel_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void() #0 {
call void @external_void_func_void()
call void asm sideeffect "", ""() #0
call void @external_void_func_void()
ret void
}
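; The callee address is materialized PC-relatively once and reused for both
; calls. A sketch of the expected sequence (the relocation operands are
; spelled out in later tests):
;   s_getpc_b64 s[34:35]                               ; PC of the next instruction
;   s_add_u32   s34, s34, external_void_func_void@rel32@lo+4
;   s_addc_u32  s35, s35, external_void_func_void@rel32@hi+4
;   s_swappc_b64 s[30:31], s[34:35]                    ; call; return address lands in s[30:31]
; The +4 compensates for the distance between the getpc result and the
; relocated literal operand.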
; GCN-LABEL: {{^}}test_func_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void:
; GCN: v_writelane_b32 v32, s33, 0
; GCN: v_writelane_b32 v32, s34, 1
; GCN: v_writelane_b32 v32, s35, 2
; GCN: v_writelane_b32 v32, s36, 3
; GCN: v_writelane_b32 v32, s37, 4
; GCN: s_mov_b32 s33, s5
; GCN: s_swappc_b64
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: s_swappc_b64
; GCN: s_mov_b32 s5, s33
; GCN: v_readlane_b32 s37, v32, 4
; GCN: v_readlane_b32 s36, v32, 3
; GCN: v_readlane_b32 s35, v32, 2
; GCN: v_readlane_b32 s34, v32, 1
; GCN: v_readlane_b32 s33, v32, 0
; GCN: s_setpc_b64
define void @test_func_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void() #0 {
call void @external_void_func_void()
call void asm sideeffect "", ""() #0
call void @external_void_func_void()
ret void
}
; GCN-LABEL: {{^}}void_func_void_clobber_s30_s31:
; GCN: s_waitcnt
; GCN-NEXT: s_mov_b64 [[SAVEPC:s\[[0-9]+:[0-9]+\]]], s[30:31]
; GCN-NEXT: #ASMSTART
; GCN: ; clobber
; GCN-NEXT: #ASMEND
; GCN-NEXT: s_mov_b64 s[30:31], [[SAVEPC]]
; GCN-NEXT: s_setpc_b64 s[30:31]
define void @void_func_void_clobber_s30_s31() #2 {
call void asm sideeffect "; clobber", "~{s[30:31]}"() #0
ret void
}
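; s[30:31] holds the return address consumed by s_setpc_b64, so a function
; clobbering it must save and restore the pair around the clobber, which the
; s_mov_b64 pair above checks for.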
; GCN-LABEL: {{^}}void_func_void_clobber_vcc:
; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: s_setpc_b64 s[30:31]
define void @void_func_void_clobber_vcc() #2 {
call void asm sideeffect "", "~{VCC}"() #0
ret void
}
; GCN-LABEL: {{^}}test_call_void_func_void_clobber_vcc:
; GCN: s_getpc_b64
; GCN-NEXT: s_add_u32
; GCN-NEXT: s_addc_u32
; GCN: s_mov_b64 s[34:35], vcc
; GCN-NEXT: s_swappc_b64
; GCN: s_mov_b64 vcc, s[34:35]
define amdgpu_kernel void @test_call_void_func_void_clobber_vcc(i32 addrspace(1)* %out) #0 {
%vcc = call i64 asm sideeffect "; def $0", "={vcc}"()
call void @void_func_void_clobber_vcc()
%val0 = load volatile i32, i32 addrspace(1)* undef
%val1 = load volatile i32, i32 addrspace(1)* undef
call void asm sideeffect "; use $0", "{vcc}"(i64 %vcc)
ret void
}
; GCN-LABEL: {{^}}test_call_void_func_void_mayclobber_s31:
; GCN: s_mov_b32 s33, s31
; GCN-NEXT: s_swappc_b64
; GCN-NEXT: s_mov_b32 s31, s33
define amdgpu_kernel void @test_call_void_func_void_mayclobber_s31(i32 addrspace(1)* %out) #0 {
%s31 = call i32 asm sideeffect "; def $0", "={s31}"()
call void @external_void_func_void()
call void asm sideeffect "; use $0", "{s31}"(i32 %s31)
ret void
}
; GCN-LABEL: {{^}}test_call_void_func_void_mayclobber_v31:
; GCN: v_mov_b32_e32 v32, v31
; GCN-NEXT: s_swappc_b64
; GCN-NEXT: v_mov_b32_e32 v31, v32
define amdgpu_kernel void @test_call_void_func_void_mayclobber_v31(i32 addrspace(1)* %out) #0 {
%v31 = call i32 asm sideeffect "; def $0", "={v31}"()
call void @external_void_func_void()
call void asm sideeffect "; use $0", "{v31}"(i32 %v31)
ret void
}
; GCN-LABEL: {{^}}test_call_void_func_void_preserves_s33:
; GCN: s_mov_b32 s34, s9
; GCN: ; def s33
; GCN-NEXT: #ASMEND
; GCN: s_getpc_b64 s[6:7]
; GCN-NEXT: s_add_u32 s6, s6, external_void_func_void@rel32@lo+4
; GCN-NEXT: s_addc_u32 s7, s7, external_void_func_void@rel32@hi+4
; GCN-NEXT: s_mov_b32 s4, s34
; GCN-NEXT: s_mov_b32 s32, s34
; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7]
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ; use s33
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: s_endpgm
define amdgpu_kernel void @test_call_void_func_void_preserves_s33(i32 addrspace(1)* %out) #0 {
%s33 = call i32 asm sideeffect "; def $0", "={s33}"()
call void @external_void_func_void()
call void asm sideeffect "; use $0", "{s33}"(i32 %s33)
ret void
}
; GCN-LABEL: {{^}}test_call_void_func_void_preserves_v32:
; GCN: s_mov_b32 s33, s9
; GCN: ; def v32
; GCN-NEXT: #ASMEND
; GCN: s_getpc_b64 s[6:7]
; GCN-NEXT: s_add_u32 s6, s6, external_void_func_void@rel32@lo+4
; GCN-NEXT: s_addc_u32 s7, s7, external_void_func_void@rel32@hi+4
; GCN-NEXT: s_mov_b32 s4, s33
; GCN-NEXT: s_mov_b32 s32, s33
; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7]
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ; use v32
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: s_endpgm
define amdgpu_kernel void @test_call_void_func_void_preserves_v32(i32 addrspace(1)* %out) #0 {
%v32 = call i32 asm sideeffect "; def $0", "={v32}"()
call void @external_void_func_void()
call void asm sideeffect "; use $0", "{v32}"(i32 %v32)
ret void
}
; GCN-LABEL: {{^}}void_func_void_clobber_s33:
; GCN: v_writelane_b32 v0, s33, 0
; GCN-NEXT: #ASMSTART
; GCN-NEXT: ; clobber
; GCN-NEXT: #ASMEND
; GCN-NEXT: v_readlane_b32 s33, v0, 0
; GCN-NEXT: s_setpc_b64
define void @void_func_void_clobber_s33() #2 {
call void asm sideeffect "; clobber", "~{s33}"() #0
ret void
}
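; Callee-saved SGPRs are saved into VGPR lanes rather than to memory. A sketch
; of the pattern the checks above expect:
;   v_writelane_b32 v0, s33, 0    ; save s33 into lane 0 of v0
;   ; ... clobbering code ...
;   v_readlane_b32  s33, v0, 0    ; restore s33 before s_setpc_b64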
; GCN-LABEL: {{^}}test_call_void_func_void_clobber_s33:
; GCN: s_mov_b32 s33, s7
; GCN: s_getpc_b64
; GCN-NEXT: s_add_u32
; GCN-NEXT: s_addc_u32
; GCN-NEXT: s_mov_b32 s4, s33
; GCN-NEXT: s_mov_b32 s32, s33
; GCN: s_swappc_b64
; GCN-NEXT: s_endpgm
define amdgpu_kernel void @test_call_void_func_void_clobber_s33() #0 {
call void @void_func_void_clobber_s33()
ret void
}
; GCN-LABEL: {{^}}callee_saved_sgpr_func:
; GCN-NOT: s40
; GCN: v_writelane_b32 v32, s40
; GCN: s_swappc_b64
; GCN-NOT: s40
; GCN: ; use s40
; GCN-NOT: s40
; GCN: v_readlane_b32 s40, v32
; GCN-NOT: s40
define void @callee_saved_sgpr_func() #2 {
%s40 = call i32 asm sideeffect "; def s40", "={s40}"() #0
call void @external_void_func_void()
call void asm sideeffect "; use $0", "s"(i32 %s40) #0
ret void
}
; GCN-LABEL: {{^}}callee_saved_sgpr_kernel:
; GCN-NOT: s40
; GCN: ; def s40
; GCN-NOT: s40
; GCN: s_swappc_b64
; GCN-NOT: s40
; GCN: ; use s40
; GCN-NOT: s40
define amdgpu_kernel void @callee_saved_sgpr_kernel() #2 {
%s40 = call i32 asm sideeffect "; def s40", "={s40}"() #0
call void @external_void_func_void()
call void asm sideeffect "; use $0", "s"(i32 %s40) #0
ret void
}
; The first call-preserved VGPR is already used, so it can't be used for SGPR spills.
; GCN-LABEL: {{^}}callee_saved_sgpr_vgpr_func:
; GCN-NOT: s40
; GCN: v_writelane_b32 v33, s40
; GCN: s_swappc_b64
; GCN-NOT: s40
; GCN: ; use s40
; GCN-NOT: s40
; GCN: v_readlane_b32 s40, v33
; GCN-NOT: s40
define void @callee_saved_sgpr_vgpr_func() #2 {
%s40 = call i32 asm sideeffect "; def s40", "={s40}"() #0
%v32 = call i32 asm sideeffect "; def v32", "={v32}"() #0
call void @external_void_func_void()
call void asm sideeffect "; use $0", "s"(i32 %s40) #0
call void asm sideeffect "; use $0", "v"(i32 %v32) #0
ret void
}
; GCN-LABEL: {{^}}callee_saved_sgpr_vgpr_kernel:
; GCN-NOT: s40
; GCN: ; def s40
; GCN-NOT: s40
; GCN: s_swappc_b64
; GCN-NOT: s40
; GCN: ; use s40
; GCN-NOT: s40
define amdgpu_kernel void @callee_saved_sgpr_vgpr_kernel() #2 {
%s40 = call i32 asm sideeffect "; def s40", "={s40}"() #0
%v32 = call i32 asm sideeffect "; def v32", "={v32}"() #0
call void @external_void_func_void()
call void asm sideeffect "; use $0", "s"(i32 %s40) #0
call void asm sideeffect "; use $0", "v"(i32 %v32) #0
ret void
}
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind noinline }


@ -0,0 +1,241 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -mattr=-flat-for-global -amdgpu-function-calls -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -amdgpu-function-calls -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-function-calls -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
declare void @external_void_func_void() #0
declare i1 @external_i1_func_void() #0
declare zeroext i1 @external_i1_zeroext_func_void() #0
declare signext i1 @external_i1_signext_func_void() #0
declare i8 @external_i8_func_void() #0
declare zeroext i8 @external_i8_zeroext_func_void() #0
declare signext i8 @external_i8_signext_func_void() #0
declare i16 @external_i16_func_void() #0
declare zeroext i16 @external_i16_zeroext_func_void() #0
declare signext i16 @external_i16_signext_func_void() #0
declare i32 @external_i32_func_void() #0
declare i64 @external_i64_func_void() #0
declare half @external_f16_func_void() #0
declare float @external_f32_func_void() #0
declare double @external_f64_func_void() #0
declare <2 x i32> @external_v2i32_func_void() #0
declare <3 x i32> @external_v3i32_func_void() #0
declare <4 x i32> @external_v4i32_func_void() #0
declare <5 x i32> @external_v5i32_func_void() #0
declare <8 x i32> @external_v8i32_func_void() #0
declare <16 x i32> @external_v16i32_func_void() #0
declare <32 x i32> @external_v32i32_func_void() #0
declare { <32 x i32>, i32 } @external_v32i32_i32_func_void() #0
declare <2 x i16> @external_v2i16_func_void() #0
declare <2 x half> @external_v2f16_func_void() #0
declare { i32, i64 } @external_i32_i64_func_void() #0
; GCN-LABEL: {{^}}test_call_external_void_func_void:
define amdgpu_kernel void @test_call_external_void_func_void() #0 {
call void @external_void_func_void()
ret void
}
; GCN-LABEL: {{^}}test_call_external_void_func_void_x2:
define amdgpu_kernel void @test_call_external_void_func_void_x2() #0 {
call void @external_void_func_void()
call void @external_void_func_void()
ret void
}
; GCN-LABEL: {{^}}test_call_external_i1_func_void:
define amdgpu_kernel void @test_call_external_i1_func_void() #0 {
%val = call i1 @external_i1_func_void()
store volatile i1 %val, i1 addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}test_call_external_i1_zeroext_func_void:
define amdgpu_kernel void @test_call_external_i1_zeroext_func_void() #0 {
%val = call i1 @external_i1_zeroext_func_void()
%val.ext = zext i1 %val to i32
store volatile i32 %val.ext, i32 addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}test_call_external_i1_signext_func_void:
define amdgpu_kernel void @test_call_external_i1_signext_func_void() #0 {
%val = call i1 @external_i1_signext_func_void()
%val.ext = zext i1 %val to i32
store volatile i32 %val.ext, i32 addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}test_call_external_i8_func_void:
define amdgpu_kernel void @test_call_external_i8_func_void() #0 {
%val = call i8 @external_i8_func_void()
store volatile i8 %val, i8 addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}test_call_external_i8_zeroext_func_void:
define amdgpu_kernel void @test_call_external_i8_zeroext_func_void() #0 {
%val = call i8 @external_i8_zeroext_func_void()
%val.ext = zext i8 %val to i32
store volatile i32 %val.ext, i32 addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}test_call_external_i8_signext_func_void:
define amdgpu_kernel void @test_call_external_i8_signext_func_void() #0 {
%val = call i8 @external_i8_signext_func_void()
%val.ext = zext i8 %val to i32
store volatile i32 %val.ext, i32 addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}test_call_external_i16_func_void:
define amdgpu_kernel void @test_call_external_i16_func_void() #0 {
%val = call i16 @external_i16_func_void()
store volatile i16 %val, i16 addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}test_call_external_i16_zeroext_func_void:
define amdgpu_kernel void @test_call_external_i16_zeroext_func_void() #0 {
%val = call i16 @external_i16_zeroext_func_void()
%val.ext = zext i16 %val to i32
store volatile i32 %val.ext, i32 addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}test_call_external_i16_signext_func_void:
define amdgpu_kernel void @test_call_external_i16_signext_func_void() #0 {
%val = call i16 @external_i16_signext_func_void()
%val.ext = zext i16 %val to i32
store volatile i32 %val.ext, i32 addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}test_call_external_i32_func_void:
define amdgpu_kernel void @test_call_external_i32_func_void() #0 {
%val = call i32 @external_i32_func_void()
store volatile i32 %val, i32 addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}test_call_external_i64_func_void:
define amdgpu_kernel void @test_call_external_i64_func_void() #0 {
%val = call i64 @external_i64_func_void()
store volatile i64 %val, i64 addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}test_call_external_f16_func_void:
define amdgpu_kernel void @test_call_external_f16_func_void() #0 {
%val = call half @external_f16_func_void()
store volatile half %val, half addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}test_call_external_f32_func_void:
define amdgpu_kernel void @test_call_external_f32_func_void() #0 {
%val = call float @external_f32_func_void()
store volatile float %val, float addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}test_call_external_f64_func_void:
define amdgpu_kernel void @test_call_external_f64_func_void() #0 {
%val = call double @external_f64_func_void()
store volatile double %val, double addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}test_call_external_v2i32_func_void:
define amdgpu_kernel void @test_call_external_v2i32_func_void() #0 {
%val = call <2 x i32> @external_v2i32_func_void()
store volatile <2 x i32> %val, <2 x i32> addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}test_call_external_v3i32_func_void:
define amdgpu_kernel void @test_call_external_v3i32_func_void() #0 {
%val = call <3 x i32> @external_v3i32_func_void()
store volatile <3 x i32> %val, <3 x i32> addrspace(1)* undef, align 8
ret void
}
; GCN-LABEL: {{^}}test_call_external_v4i32_func_void:
define amdgpu_kernel void @test_call_external_v4i32_func_void() #0 {
%val = call <4 x i32> @external_v4i32_func_void()
store volatile <4 x i32> %val, <4 x i32> addrspace(1)* undef, align 8
ret void
}
; GCN-LABEL: {{^}}test_call_external_v5i32_func_void:
define amdgpu_kernel void @test_call_external_v5i32_func_void() #0 {
%val = call <5 x i32> @external_v5i32_func_void()
store volatile <5 x i32> %val, <5 x i32> addrspace(1)* undef, align 8
ret void
}
; GCN-LABEL: {{^}}test_call_external_v8i32_func_void:
define amdgpu_kernel void @test_call_external_v8i32_func_void() #0 {
%val = call <8 x i32> @external_v8i32_func_void()
store volatile <8 x i32> %val, <8 x i32> addrspace(1)* undef, align 8
ret void
}
; GCN-LABEL: {{^}}test_call_external_v16i32_func_void:
define amdgpu_kernel void @test_call_external_v16i32_func_void() #0 {
%val = call <16 x i32> @external_v16i32_func_void()
store volatile <16 x i32> %val, <16 x i32> addrspace(1)* undef, align 8
ret void
}
; GCN-LABEL: {{^}}test_call_external_v32i32_func_void:
define amdgpu_kernel void @test_call_external_v32i32_func_void() #0 {
%val = call <32 x i32> @external_v32i32_func_void()
store volatile <32 x i32> %val, <32 x i32> addrspace(1)* undef, align 8
ret void
}
; GCN-LABEL: {{^}}test_call_external_v2i16_func_void:
define amdgpu_kernel void @test_call_external_v2i16_func_void() #0 {
%val = call <2 x i16> @external_v2i16_func_void()
store volatile <2 x i16> %val, <2 x i16> addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}test_call_external_v2f16_func_void:
define amdgpu_kernel void @test_call_external_v2f16_func_void() #0 {
%val = call <2 x half> @external_v2f16_func_void()
store volatile <2 x half> %val, <2 x half> addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}test_call_external_i32_i64_func_void:
define amdgpu_kernel void @test_call_external_i32_i64_func_void() #0 {
%val = call { i32, i64 } @external_i32_i64_func_void()
%val.0 = extractvalue { i32, i64 } %val, 0
%val.1 = extractvalue { i32, i64 } %val, 1
store volatile i32 %val.0, i32 addrspace(1)* undef
store volatile i64 %val.1, i64 addrspace(1)* undef
ret void
}
; The return value is too large for the return registers, so results must be written to the stack.
; GCN-LABEL: {{^}}test_call_external_v32i32_i32_func_void:
define amdgpu_kernel void @test_call_external_v32i32_i32_func_void() #0 {
%val = call { <32 x i32>, i32 } @external_v32i32_i32_func_void()
%val0 = extractvalue { <32 x i32>, i32 } %val, 0
%val1 = extractvalue { <32 x i32>, i32 } %val, 1
store volatile <32 x i32> %val0, <32 x i32> addrspace(1)* undef, align 8
store volatile i32 %val1, i32 addrspace(1)* undef
ret void
}
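; { <32 x i32>, i32 } does not fit in the return registers, so the return is
; demoted to a hidden stack slot. A rough sketch of the generic lowering (not
; necessarily the exact ABI used here):
;   %slot = alloca { <32 x i32>, i32 }
;   call void @external_v32i32_i32_func_void({ <32 x i32>, i32 }* sret %slot)
;   ; ...caller reloads %val0 and %val1 from %slot after the call...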
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind noinline }


@ -1,4 +1,5 @@
; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=CI %s
; RUN: llc -amdgpu-function-calls -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=CI %s
; RUN: llc -amdgpu-function-calls -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX9 %s
; GCN-LABEL: {{^}}callee_no_stack:
; GCN: ; BB#0:
@ -8,6 +9,14 @@ define void @callee_no_stack() #0 {
ret void
}
; GCN-LABEL: {{^}}callee_no_stack_no_fp_elim:
; GCN: ; BB#0:
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @callee_no_stack_no_fp_elim() #1 {
ret void
}
; Requires a frame pointer to access the local stack object.
; GCN-LABEL: {{^}}callee_with_stack:
@ -24,4 +33,51 @@ define void @callee_with_stack() #0 {
ret void
}
; GCN-LABEL: {{^}}callee_with_stack_and_call:
; GCN: ; BB#0:
; GCN-NEXT: s_waitcnt
; GCN-DAG: s_mov_b32 s5, s32
; GCN-DAG: v_writelane_b32 v32, s33,
; GCN-DAG: v_writelane_b32 v32, s34,
; GCN-DAG: v_writelane_b32 v32, s35,
; GCN-DAG: buffer_store_dword v0, off, s[0:3], s5 offset:4{{$}}
; GCN-DAG: s_add_u32 s32, s32, 0x200{{$}}
; GCN-DAG: v_mov_b32_e32 v0, 0{{$}}
; GCN-DAG: s_mov_b32 s33, s5
; GCN: s_swappc_b64
; GCN: s_mov_b32 s5, s33
; GCN-DAG: v_readlane_b32 s35,
; GCN-DAG: v_readlane_b32 s34,
; GCN-DAG: v_readlane_b32 s33,
; GCN: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @callee_with_stack_and_call() #0 {
%alloca = alloca i32
store volatile i32 0, i32* %alloca
call void @external_void_func_void()
ret void
}
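; The checks above describe the expected frame setup: the incoming stack
; pointer s32 is copied into s5 (the base used for the offset:4 spill), s32 is
; bumped by 0x200 for the call, the clobbered CSRs s33-s35 are saved into
; lanes of v32, and everything is restored before s_setpc_b64.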
; Should be able to copy incoming stack pointer directly to inner
; call's stack pointer argument.
; GCN-LABEL: {{^}}callee_no_stack_with_call:
; GCN: s_waitcnt
; GCN-NOT: s32
; GCN: s_mov_b32 s33, s5
; GCN: s_swappc_b64
; GCN: s_mov_b32 s5, s33
; GCN-NOT: s32
; GCN: s_setpc_b64
define void @callee_no_stack_with_call() #0 {
call void @external_void_func_void()
ret void
}
declare void @external_void_func_void() #0
attributes #0 = { nounwind }
attributes #1 = { nounwind "no-frame-pointer-elim"="true" }


@ -0,0 +1,41 @@
; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -amdgpu-sroa=0 -amdgpu-function-calls -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
; RUN: llc -march=amdgcn -mcpu=hawaii -amdgpu-function-calls -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=CI %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-sroa=0 -amdgpu-function-calls -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX9 -check-prefix=VI %s
; Test calls made from other callable functions rather than from kernels.
declare void @external_void_func_i32(i32) #0
; GCN-LABEL: {{^}}test_func_call_external_void_func_i32_imm:
; GCN: s_waitcnt
; GCN-NOT: s32
; GCN: s_swappc_b64
; GCN-NOT: s32
; GCN: s_setpc_b64
define void @test_func_call_external_void_func_i32_imm() #0 {
call void @external_void_func_i32(i32 42)
ret void
}
; GCN-LABEL: {{^}}test_func_call_external_void_func_i32_imm_stack_use:
; GCN: s_waitcnt
; GCN: s_mov_b32 s5, s32
; GCN: s_add_u32 s32, s32, 0x1100{{$}}
; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s5 offset
; GCN: s_swappc_b64
; GCN: s_sub_u32 s32, s32, 0x1100{{$}}
; GCN: s_setpc_b64
define void @test_func_call_external_void_func_i32_imm_stack_use() #0 {
%alloca = alloca [16 x i32], align 4
%gep0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 0
%gep15 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 15
store volatile i32 0, i32* %gep0
store volatile i32 0, i32* %gep15
call void @external_void_func_i32(i32 42)
ret void
}
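; Note the 0x1100 adjustment: scratch stack offsets appear to be kept in
; wave-scaled units (byte offset times the 64-lane wavefront), so the 64-byte
; [16 x i32] alloca plus a little extra frame space shows up as 68 * 64 =
; 0x1100.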
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind noinline }