AMDGPU: Add 32-bit constant address space

Note: This is a candidate for LLVM 6.0, because it was planned to be
      in that release but was delayed due to a long review period.

Merge conflict in release_60 - resolution:
    Add "-p6:32:32" into the second (non-amdgiz) string.

Only scalar loads support 32-bit pointers. An address in a VGPR will
fail to compile. That's OK because the results of loads will only be used
in places where VGPRs are forbidden.

Updated AMDGPUAliasAnalysis and used SReg_64_XEXEC.
The tests cover all use cases we need for Mesa.

Reviewers: arsenm, nhaehnle

Subscribers: kzhuravl, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits

Differential Revision: https://reviews.llvm.org/D41651

llvm-svn: 324487
This commit is contained in:
Marek Olsak 2018-02-07 16:01:00 +00:00
parent b2cc77985b
commit 871c30e540
14 changed files with 375 additions and 19 deletions

View File

@ -285,6 +285,7 @@ LLVM Address Space number is used throughout LLVM (for example, in LLVM IR).
3 Local (group/LDS) Local (group/LDS) Local (group/LDS) Local (group/LDS)
4 Generic (Flat) Region (GDS) Region (GDS) Constant
5 Region (GDS) Private (Scratch) Private (Scratch) Private (Scratch)
6 Constant 32-bit Constant 32-bit Constant 32-bit Constant 32-bit
================== ================= ================= ================= =================
Current Default

View File

@ -224,6 +224,9 @@ struct AMDGPUAS {
GLOBAL_ADDRESS = 1, ///< Address space for global memory (RAT0, VTX0).
CONSTANT_ADDRESS = 2, ///< Address space for constant memory (VTX2)
LOCAL_ADDRESS = 3, ///< Address space for local memory.
CONSTANT_ADDRESS_32BIT = 6, ///< Address space for 32-bit constant memory
/// Address space for direct addressable parameter memory (CONST0)
PARAM_D_ADDRESS = 6,
/// Address space for indirect addressable parameter memory (VTX1)

View File

@ -115,7 +115,8 @@ bool AMDGPUAAResult::pointsToConstantMemory(const MemoryLocation &Loc,
bool OrLocal) {
const Value *Base = GetUnderlyingObject(Loc.Ptr, DL);
if (Base->getType()->getPointerAddressSpace() == AS.CONSTANT_ADDRESS) {
if (Base->getType()->getPointerAddressSpace() == AS.CONSTANT_ADDRESS ||
Base->getType()->getPointerAddressSpace() == AS.CONSTANT_ADDRESS_32BIT) {
return true;
}

View File

@ -466,7 +466,8 @@ bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) {
}
bool AMDGPUCodeGenPrepare::visitLoadInst(LoadInst &I) {
if (I.getPointerAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS &&
if ((I.getPointerAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS ||
I.getPointerAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT) &&
canWidenScalarExtLoad(I)) {
IRBuilder<> Builder(&I);
Builder.SetCurrentDebugLocation(I.getDebugLoc());

View File

@ -162,6 +162,7 @@ private:
bool SelectSMRDOffset(SDValue ByteOffsetNode, SDValue &Offset,
bool &Imm) const;
SDValue Expand32BitAddress(SDValue Addr) const;
bool SelectSMRD(SDValue Addr, SDValue &SBase, SDValue &Offset,
bool &Imm) const;
bool SelectSMRDImm(SDValue Addr, SDValue &SBase, SDValue &Offset) const;
@ -636,7 +637,8 @@ bool AMDGPUDAGToDAGISel::isConstantLoad(const MemSDNode *N, int CbId) const {
if (!N->readMem())
return false;
if (CbId == -1)
return N->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS;
return N->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS ||
N->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT;
return N->getAddressSpace() == AMDGPUASI.CONSTANT_BUFFER_0 + CbId;
}
@ -1438,19 +1440,45 @@ bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode,
return true;
}
/// Widen a 32-bit address to the 64-bit form that SelectSMRD hands out as
/// SBase.
///
/// \p Addr is returned untouched when its value type is not MVT::i32.
/// Otherwise the result is a 64-bit REG_SEQUENCE whose low half is \p Addr and
/// whose high half is the value reported by
/// SIMachineFunctionInfo::get32BitAddressHighBits() for the current function.
SDValue AMDGPUDAGToDAGISel::Expand32BitAddress(SDValue Addr) const {
if (Addr.getValueType() != MVT::i32)
return Addr;
// Extend the 32-bit address to 64 bits. The upper 32 bits are not hard-coded
// to zero; they are whatever get32BitAddressHighBits() returns.
SDLoc SL(Addr);
const MachineFunction &MF = CurDAG->getMachineFunction();
const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
unsigned AddrHiVal = Info->get32BitAddressHighBits();
SDValue AddrHi = CurDAG->getTargetConstant(AddrHiVal, SL, MVT::i32);
// REG_SEQUENCE operands: the register class ID first, then one
// (value, subregister index) pair per 32-bit half.
const SDValue Ops[] = {
CurDAG->getTargetConstant(AMDGPU::SReg_64_XEXECRegClassID, SL, MVT::i32),
// Low half: the incoming 32-bit address, placed in sub0.
Addr,
CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
// High half: an S_MOV_B32 of the high-bits constant, placed in sub1.
SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, AddrHi),
0),
CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32),
};
return SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, SL, MVT::i64,
Ops), 0);
}
bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase,
SDValue &Offset, bool &Imm) const {
SDLoc SL(Addr);
if (CurDAG->isBaseWithConstantOffset(Addr)) {
SDValue N0 = Addr.getOperand(0);
SDValue N1 = Addr.getOperand(1);
if (SelectSMRDOffset(N1, Offset, Imm)) {
SBase = N0;
SBase = Expand32BitAddress(N0);
return true;
}
}
SBase = Addr;
SBase = Expand32BitAddress(Addr);
Offset = CurDAG->getTargetConstant(0, SL, MVT::i32);
Imm = true;
return true;

View File

@ -229,6 +229,9 @@ static bool isInstrUniform(const MachineInstr &MI) {
isa<Constant>(Ptr) || isa<GlobalValue>(Ptr))
return true;
if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
return true;
const Instruction *I = dyn_cast<Instruction>(Ptr);
return I && I->getMetadata("amdgpu.uniform");
}
@ -293,7 +296,8 @@ bool AMDGPUInstructionSelector::selectSMRD(MachineInstr &I,
if (!I.hasOneMemOperand())
return false;
if ((*I.memoperands_begin())->getAddrSpace() != AMDGPUASI.CONSTANT_ADDRESS)
if ((*I.memoperands_begin())->getAddrSpace() != AMDGPUASI.CONSTANT_ADDRESS &&
(*I.memoperands_begin())->getAddrSpace() != AMDGPUASI.CONSTANT_ADDRESS_32BIT)
return false;
if (!isInstrUniform(I))

View File

@ -266,7 +266,7 @@ static StringRef computeDataLayout(const Triple &TT) {
// 32-bit private, local, and region pointers. 64-bit global, constant and
// flat.
return "e-p:64:64-p1:64:64-p2:64:64-p3:32:32-p4:32:32-p5:32:32"
return "e-p:64:64-p1:64:64-p2:64:64-p3:32:32-p4:32:32-p5:32:32-p6:32:32"
"-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
"-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-A5";
}

View File

@ -237,6 +237,7 @@ unsigned AMDGPUTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
AMDGPUAS AS = ST->getAMDGPUAS();
if (AddrSpace == AS.GLOBAL_ADDRESS ||
AddrSpace == AS.CONSTANT_ADDRESS ||
AddrSpace == AS.CONSTANT_ADDRESS_32BIT ||
AddrSpace == AS.FLAT_ADDRESS)
return 128;
if (AddrSpace == AS.LOCAL_ADDRESS ||

View File

@ -900,7 +900,8 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
if (AS == AMDGPUASI.GLOBAL_ADDRESS)
return isLegalGlobalAddressingMode(AM);
if (AS == AMDGPUASI.CONSTANT_ADDRESS) {
if (AS == AMDGPUASI.CONSTANT_ADDRESS ||
AS == AMDGPUASI.CONSTANT_ADDRESS_32BIT) {
// If the offset isn't a multiple of 4, it probably isn't going to be
// correctly aligned.
// FIXME: Can we get the real alignment here?
@ -1023,7 +1024,8 @@ bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
// If we have an uniform constant load, it still requires using a slow
// buffer instruction if unaligned.
if (IsFast) {
*IsFast = (AddrSpace == AMDGPUASI.CONSTANT_ADDRESS) ?
*IsFast = (AddrSpace == AMDGPUASI.CONSTANT_ADDRESS ||
AddrSpace == AMDGPUASI.CONSTANT_ADDRESS_32BIT) ?
(Align % 4 == 0) : true;
}
@ -1066,7 +1068,8 @@ EVT SITargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
static bool isFlatGlobalAddrSpace(unsigned AS, AMDGPUAS AMDGPUASI) {
return AS == AMDGPUASI.GLOBAL_ADDRESS ||
AS == AMDGPUASI.FLAT_ADDRESS ||
AS == AMDGPUASI.CONSTANT_ADDRESS;
AS == AMDGPUASI.CONSTANT_ADDRESS ||
AS == AMDGPUASI.CONSTANT_ADDRESS_32BIT;
}
bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
@ -4008,13 +4011,15 @@ void SITargetLowering::createDebuggerPrologueStackObjects(
bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const {
const Triple &TT = getTargetMachine().getTargetTriple();
return GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS &&
return (GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS ||
GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT) &&
AMDGPU::shouldEmitConstantsToTextSection(TT);
}
bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const {
return (GV->getType()->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS ||
GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS) &&
GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS ||
GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT) &&
!shouldEmitFixup(GV) &&
!getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV);
}
@ -4391,7 +4396,8 @@ bool
SITargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
// We can fold offsets for anything that doesn't require a GOT relocation.
return (GA->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS ||
GA->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS) &&
GA->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS ||
GA->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT) &&
!shouldEmitGOTReloc(GA->getGlobal());
}
@ -4444,6 +4450,7 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
const GlobalValue *GV = GSD->getGlobal();
if (GSD->getAddressSpace() != AMDGPUASI.CONSTANT_ADDRESS &&
GSD->getAddressSpace() != AMDGPUASI.CONSTANT_ADDRESS_32BIT &&
GSD->getAddressSpace() != AMDGPUASI.GLOBAL_ADDRESS &&
// FIXME: It isn't correct to rely on the type of the pointer. This should
// be removed when address space 0 is 64-bit.
@ -5378,7 +5385,8 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
AMDGPUASI.PRIVATE_ADDRESS : AMDGPUASI.GLOBAL_ADDRESS;
unsigned NumElements = MemVT.getVectorNumElements();
if (AS == AMDGPUASI.CONSTANT_ADDRESS) {
if (AS == AMDGPUASI.CONSTANT_ADDRESS ||
AS == AMDGPUASI.CONSTANT_ADDRESS_32BIT) {
if (isMemOpUniform(Load))
return SDValue();
// Non-uniform loads will be selected to MUBUF instructions, so they
@ -5386,7 +5394,9 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
// loads.
//
}
if (AS == AMDGPUASI.CONSTANT_ADDRESS || AS == AMDGPUASI.GLOBAL_ADDRESS) {
if (AS == AMDGPUASI.CONSTANT_ADDRESS ||
AS == AMDGPUASI.CONSTANT_ADDRESS_32BIT ||
AS == AMDGPUASI.GLOBAL_ADDRESS) {
if (Subtarget->getScalarizeGlobalBehavior() && isMemOpUniform(Load) &&
!Load->isVolatile() && isMemOpHasNoClobberedMemOperand(Load))
return SDValue();
@ -5395,7 +5405,9 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
// loads.
//
}
if (AS == AMDGPUASI.CONSTANT_ADDRESS || AS == AMDGPUASI.GLOBAL_ADDRESS ||
if (AS == AMDGPUASI.CONSTANT_ADDRESS ||
AS == AMDGPUASI.CONSTANT_ADDRESS_32BIT ||
AS == AMDGPUASI.GLOBAL_ADDRESS ||
AS == AMDGPUASI.FLAT_ADDRESS) {
if (NumElements > 4)
return SplitVectorLoad(Op, DAG);

View File

@ -47,7 +47,8 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
WorkItemIDZ(false),
ImplicitBufferPtr(false),
ImplicitArgPtr(false),
GITPtrHigh(0xffffffff) {
GITPtrHigh(0xffffffff),
HighBitsOf32BitAddress(0) {
const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
const Function &F = MF.getFunction();
FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(F);
@ -164,6 +165,11 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
StringRef S = A.getValueAsString();
if (!S.empty())
S.consumeInteger(0, GITPtrHigh);
A = F.getFnAttribute("amdgpu-32bit-address-high-bits");
S = A.getValueAsString();
if (!S.empty())
S.consumeInteger(0, HighBitsOf32BitAddress);
}
unsigned SIMachineFunctionInfo::addPrivateSegmentBuffer(

View File

@ -186,6 +186,8 @@ private:
// current hardware only allows a 16 bit value.
unsigned GITPtrHigh;
unsigned HighBitsOf32BitAddress;
MCPhysReg getNextUserSGPR() const {
assert(NumSystemSGPRs == 0 && "System SGPRs must be added after user SGPRs");
return AMDGPU::SGPR0 + NumUserSGPRs;
@ -411,6 +413,10 @@ public:
return GITPtrHigh;
}
/// Returns the stored HighBitsOf32BitAddress value. The constructor
/// initializes it to 0 and overwrites it from the
/// "amdgpu-32bit-address-high-bits" function attribute when that attribute's
/// string is non-empty.
unsigned get32BitAddressHighBits() const {
return HighBitsOf32BitAddress;
}
unsigned getNumUserSGPRs() const {
return NumUserSGPRs;
}

View File

@ -223,7 +223,8 @@ def S_MEMREALTIME : SM_Time_Pseudo <"s_memrealtime", int_amdgcn_s_memrealtime>
def smrd_load : PatFrag <(ops node:$ptr), (load node:$ptr), [{
auto Ld = cast<LoadSDNode>(N);
return Ld->getAlignment() >= 4 &&
((Ld->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS &&
(((Ld->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS ||
Ld->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT) &&
static_cast<const SITargetLowering *>(getTargetLowering())->isMemOpUniform(N)) ||
(Subtarget->getScalarizeGlobalBehavior() && Ld->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS &&
!Ld->isVolatile() &&

View File

@ -447,7 +447,8 @@ bool isGlobalSegment(const GlobalValue *GV) {
}
bool isReadOnlySegment(const GlobalValue *GV) {
return GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS;
return GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT;
}
bool shouldEmitConstantsToTextSection(const Triple &TT) {
@ -916,6 +917,9 @@ bool isUniformMMO(const MachineMemOperand *MMO) {
isa<Constant>(Ptr) || isa<GlobalValue>(Ptr))
return true;
if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
return true;
if (const Argument *Arg = dyn_cast<Argument>(Ptr))
return isArgPassedInSGPR(Arg);

View File

@ -0,0 +1,288 @@
; RUN: llc -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,SICI,SI %s
; RUN: llc -march=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefixes=GCN,SICI %s
; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GCN,VIGFX9 %s
; RUN: llc -march=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,VIGFX9 %s
; GCN-LABEL: {{^}}load_i32:
; GCN-DAG: s_mov_b32 s3, 0
; GCN-DAG: s_mov_b32 s2, s1
; GCN-DAG: s_mov_b32 s1, s3
; SICI-DAG: s_load_dword s{{[0-9]}}, s[0:1], 0x0
; SICI-DAG: s_load_dword s{{[0-9]}}, s[2:3], 0x2
; VIGFX9-DAG: s_load_dword s{{[0-9]}}, s[0:1], 0x0
; VIGFX9-DAG: s_load_dword s{{[0-9]}}, s[2:3], 0x8
; Loads one i32 through %p0 and one through %p1 advanced by two elements (both
; inreg addrspace(6) pointers), adds them and returns the sum bitcast to float.
define amdgpu_vs float @load_i32(i32 addrspace(6)* inreg %p0, i32 addrspace(6)* inreg %p1) #0 {
%gep1 = getelementptr i32, i32 addrspace(6)* %p1, i64 2
%r0 = load i32, i32 addrspace(6)* %p0
%r1 = load i32, i32 addrspace(6)* %gep1
%r = add i32 %r0, %r1
%r2 = bitcast i32 %r to float
ret float %r2
}
; GCN-LABEL: {{^}}load_v2i32:
; GCN-DAG: s_mov_b32 s3, 0
; GCN-DAG: s_mov_b32 s2, s1
; GCN-DAG: s_mov_b32 s1, s3
; SICI-DAG: s_load_dwordx2 s[{{.*}}], s[0:1], 0x0
; SICI-DAG: s_load_dwordx2 s[{{.*}}], s[2:3], 0x4
; VIGFX9-DAG: s_load_dwordx2 s[{{.*}}], s[0:1], 0x0
; VIGFX9-DAG: s_load_dwordx2 s[{{.*}}], s[2:3], 0x10
; Two-element integer vector variant: loads through %p0 and through %p1
; advanced by two elements, adds the vectors and returns the sum bitcast to
; a float vector.
define amdgpu_vs <2 x float> @load_v2i32(<2 x i32> addrspace(6)* inreg %p0, <2 x i32> addrspace(6)* inreg %p1) #0 {
%gep1 = getelementptr <2 x i32>, <2 x i32> addrspace(6)* %p1, i64 2
%r0 = load <2 x i32>, <2 x i32> addrspace(6)* %p0
%r1 = load <2 x i32>, <2 x i32> addrspace(6)* %gep1
%r = add <2 x i32> %r0, %r1
%r2 = bitcast <2 x i32> %r to <2 x float>
ret <2 x float> %r2
}
; GCN-LABEL: {{^}}load_v4i32:
; GCN-DAG: s_mov_b32 s3, 0
; GCN-DAG: s_mov_b32 s2, s1
; GCN-DAG: s_mov_b32 s1, s3
; SICI-DAG: s_load_dwordx4 s[{{.*}}], s[0:1], 0x0
; SICI-DAG: s_load_dwordx4 s[{{.*}}], s[2:3], 0x8
; VIGFX9-DAG: s_load_dwordx4 s[{{.*}}], s[0:1], 0x0
; VIGFX9-DAG: s_load_dwordx4 s[{{.*}}], s[2:3], 0x20
; Four-element integer vector variant: loads through %p0 and through %p1
; advanced by two elements, adds the vectors and returns the sum bitcast to
; a float vector.
define amdgpu_vs <4 x float> @load_v4i32(<4 x i32> addrspace(6)* inreg %p0, <4 x i32> addrspace(6)* inreg %p1) #0 {
%gep1 = getelementptr <4 x i32>, <4 x i32> addrspace(6)* %p1, i64 2
%r0 = load <4 x i32>, <4 x i32> addrspace(6)* %p0
%r1 = load <4 x i32>, <4 x i32> addrspace(6)* %gep1
%r = add <4 x i32> %r0, %r1
%r2 = bitcast <4 x i32> %r to <4 x float>
ret <4 x float> %r2
}
; GCN-LABEL: {{^}}load_v8i32:
; GCN-DAG: s_mov_b32 s3, 0
; GCN-DAG: s_mov_b32 s2, s1
; GCN-DAG: s_mov_b32 s1, s3
; SICI-DAG: s_load_dwordx8 s[{{.*}}], s[0:1], 0x0
; SICI-DAG: s_load_dwordx8 s[{{.*}}], s[2:3], 0x10
; VIGFX9-DAG: s_load_dwordx8 s[{{.*}}], s[0:1], 0x0
; VIGFX9-DAG: s_load_dwordx8 s[{{.*}}], s[2:3], 0x40
; Eight-element integer vector variant: loads through %p0 and through %p1
; advanced by two elements, adds the vectors and returns the sum bitcast to
; a float vector.
define amdgpu_vs <8 x float> @load_v8i32(<8 x i32> addrspace(6)* inreg %p0, <8 x i32> addrspace(6)* inreg %p1) #0 {
%gep1 = getelementptr <8 x i32>, <8 x i32> addrspace(6)* %p1, i64 2
%r0 = load <8 x i32>, <8 x i32> addrspace(6)* %p0
%r1 = load <8 x i32>, <8 x i32> addrspace(6)* %gep1
%r = add <8 x i32> %r0, %r1
%r2 = bitcast <8 x i32> %r to <8 x float>
ret <8 x float> %r2
}
; GCN-LABEL: {{^}}load_v16i32:
; GCN-DAG: s_mov_b32 s3, 0
; GCN-DAG: s_mov_b32 s2, s1
; GCN-DAG: s_mov_b32 s1, s3
; SICI-DAG: s_load_dwordx16 s[{{.*}}], s[0:1], 0x0
; SICI-DAG: s_load_dwordx16 s[{{.*}}], s[2:3], 0x20
; VIGFX9-DAG: s_load_dwordx16 s[{{.*}}], s[0:1], 0x0
; VIGFX9-DAG: s_load_dwordx16 s[{{.*}}], s[2:3], 0x80
; Sixteen-element integer vector variant: loads through %p0 and through %p1
; advanced by two elements, adds the vectors and returns the sum bitcast to
; a float vector.
define amdgpu_vs <16 x float> @load_v16i32(<16 x i32> addrspace(6)* inreg %p0, <16 x i32> addrspace(6)* inreg %p1) #0 {
%gep1 = getelementptr <16 x i32>, <16 x i32> addrspace(6)* %p1, i64 2
%r0 = load <16 x i32>, <16 x i32> addrspace(6)* %p0
%r1 = load <16 x i32>, <16 x i32> addrspace(6)* %gep1
%r = add <16 x i32> %r0, %r1
%r2 = bitcast <16 x i32> %r to <16 x float>
ret <16 x float> %r2
}
; GCN-LABEL: {{^}}load_float:
; GCN-DAG: s_mov_b32 s3, 0
; GCN-DAG: s_mov_b32 s2, s1
; GCN-DAG: s_mov_b32 s1, s3
; SICI-DAG: s_load_dword s{{[0-9]}}, s[0:1], 0x0
; SICI-DAG: s_load_dword s{{[0-9]}}, s[2:3], 0x2
; VIGFX9-DAG: s_load_dword s{{[0-9]}}, s[0:1], 0x0
; VIGFX9-DAG: s_load_dword s{{[0-9]}}, s[2:3], 0x8
; Loads one float through %p0 and one through %p1 advanced by two elements
; (both inreg addrspace(6) pointers) and returns their fadd.
define amdgpu_vs float @load_float(float addrspace(6)* inreg %p0, float addrspace(6)* inreg %p1) #0 {
%gep1 = getelementptr float, float addrspace(6)* %p1, i64 2
%r0 = load float, float addrspace(6)* %p0
%r1 = load float, float addrspace(6)* %gep1
%r = fadd float %r0, %r1
ret float %r
}
; GCN-LABEL: {{^}}load_v2float:
; GCN-DAG: s_mov_b32 s3, 0
; GCN-DAG: s_mov_b32 s2, s1
; GCN-DAG: s_mov_b32 s1, s3
; SICI-DAG: s_load_dwordx2 s[{{.*}}], s[0:1], 0x0
; SICI-DAG: s_load_dwordx2 s[{{.*}}], s[2:3], 0x4
; VIGFX9-DAG: s_load_dwordx2 s[{{.*}}], s[0:1], 0x0
; VIGFX9-DAG: s_load_dwordx2 s[{{.*}}], s[2:3], 0x10
; Two-element float vector variant: loads through %p0 and through %p1 advanced
; by two elements and returns the fadd of the two vectors.
define amdgpu_vs <2 x float> @load_v2float(<2 x float> addrspace(6)* inreg %p0, <2 x float> addrspace(6)* inreg %p1) #0 {
%gep1 = getelementptr <2 x float>, <2 x float> addrspace(6)* %p1, i64 2
%r0 = load <2 x float>, <2 x float> addrspace(6)* %p0
%r1 = load <2 x float>, <2 x float> addrspace(6)* %gep1
%r = fadd <2 x float> %r0, %r1
ret <2 x float> %r
}
; GCN-LABEL: {{^}}load_v4float:
; GCN-DAG: s_mov_b32 s3, 0
; GCN-DAG: s_mov_b32 s2, s1
; GCN-DAG: s_mov_b32 s1, s3
; SICI-DAG: s_load_dwordx4 s[{{.*}}], s[0:1], 0x0
; SICI-DAG: s_load_dwordx4 s[{{.*}}], s[2:3], 0x8
; VIGFX9-DAG: s_load_dwordx4 s[{{.*}}], s[0:1], 0x0
; VIGFX9-DAG: s_load_dwordx4 s[{{.*}}], s[2:3], 0x20
; Four-element float vector variant: loads through %p0 and through %p1
; advanced by two elements and returns the fadd of the two vectors.
define amdgpu_vs <4 x float> @load_v4float(<4 x float> addrspace(6)* inreg %p0, <4 x float> addrspace(6)* inreg %p1) #0 {
%gep1 = getelementptr <4 x float>, <4 x float> addrspace(6)* %p1, i64 2
%r0 = load <4 x float>, <4 x float> addrspace(6)* %p0
%r1 = load <4 x float>, <4 x float> addrspace(6)* %gep1
%r = fadd <4 x float> %r0, %r1
ret <4 x float> %r
}
; GCN-LABEL: {{^}}load_v8float:
; GCN-DAG: s_mov_b32 s3, 0
; GCN-DAG: s_mov_b32 s2, s1
; GCN-DAG: s_mov_b32 s1, s3
; SICI-DAG: s_load_dwordx8 s[{{.*}}], s[0:1], 0x0
; SICI-DAG: s_load_dwordx8 s[{{.*}}], s[2:3], 0x10
; VIGFX9-DAG: s_load_dwordx8 s[{{.*}}], s[0:1], 0x0
; VIGFX9-DAG: s_load_dwordx8 s[{{.*}}], s[2:3], 0x40
; Eight-element float vector variant: loads through %p0 and through %p1
; advanced by two elements and returns the fadd of the two vectors.
define amdgpu_vs <8 x float> @load_v8float(<8 x float> addrspace(6)* inreg %p0, <8 x float> addrspace(6)* inreg %p1) #0 {
%gep1 = getelementptr <8 x float>, <8 x float> addrspace(6)* %p1, i64 2
%r0 = load <8 x float>, <8 x float> addrspace(6)* %p0
%r1 = load <8 x float>, <8 x float> addrspace(6)* %gep1
%r = fadd <8 x float> %r0, %r1
ret <8 x float> %r
}
; GCN-LABEL: {{^}}load_v16float:
; GCN-DAG: s_mov_b32 s3, 0
; GCN-DAG: s_mov_b32 s2, s1
; GCN-DAG: s_mov_b32 s1, s3
; SICI-DAG: s_load_dwordx16 s[{{.*}}], s[0:1], 0x0
; SICI-DAG: s_load_dwordx16 s[{{.*}}], s[2:3], 0x20
; VIGFX9-DAG: s_load_dwordx16 s[{{.*}}], s[0:1], 0x0
; VIGFX9-DAG: s_load_dwordx16 s[{{.*}}], s[2:3], 0x80
; Sixteen-element float vector variant: loads through %p0 and through %p1
; advanced by two elements and returns the fadd of the two vectors.
define amdgpu_vs <16 x float> @load_v16float(<16 x float> addrspace(6)* inreg %p0, <16 x float> addrspace(6)* inreg %p1) #0 {
%gep1 = getelementptr <16 x float>, <16 x float> addrspace(6)* %p1, i64 2
%r0 = load <16 x float>, <16 x float> addrspace(6)* %p0
%r1 = load <16 x float>, <16 x float> addrspace(6)* %gep1
%r = fadd <16 x float> %r0, %r1
ret <16 x float> %r
}
; GCN-LABEL: {{^}}load_i32_hi0:
; GCN: s_mov_b32 s1, 0
; GCN-NEXT: s_load_dword s0, s[0:1], 0x0
; Single i32 load through an inreg addrspace(6) pointer. Attribute group #1
; sets amdgpu-32bit-address-high-bits to 0 for this function.
define amdgpu_vs i32 @load_i32_hi0(i32 addrspace(6)* inreg %p) #1 {
%r0 = load i32, i32 addrspace(6)* %p
ret i32 %r0
}
; GCN-LABEL: {{^}}load_i32_hi1:
; GCN: s_mov_b32 s1, 1
; GCN-NEXT: s_load_dword s0, s[0:1], 0x0
; Single i32 load through an inreg addrspace(6) pointer. Attribute group #2
; sets amdgpu-32bit-address-high-bits to 1 for this function.
define amdgpu_vs i32 @load_i32_hi1(i32 addrspace(6)* inreg %p) #2 {
%r0 = load i32, i32 addrspace(6)* %p
ret i32 %r0
}
; GCN-LABEL: {{^}}load_i32_hiffff8000:
; GCN: s_movk_i32 s1, 0x8000
; GCN-NEXT: s_load_dword s0, s[0:1], 0x0
; Single i32 load through an inreg addrspace(6) pointer. Attribute group #3
; sets amdgpu-32bit-address-high-bits to 0xffff8000 for this function.
define amdgpu_vs i32 @load_i32_hiffff8000(i32 addrspace(6)* inreg %p) #3 {
%r0 = load i32, i32 addrspace(6)* %p
ret i32 %r0
}
; GCN-LABEL: {{^}}load_i32_hifffffff0:
; GCN: s_mov_b32 s1, -16
; GCN-NEXT: s_load_dword s0, s[0:1], 0x0
; Single i32 load through an inreg addrspace(6) pointer. Attribute group #4
; sets amdgpu-32bit-address-high-bits to 0xfffffff0 for this function.
define amdgpu_vs i32 @load_i32_hifffffff0(i32 addrspace(6)* inreg %p) #4 {
%r0 = load i32, i32 addrspace(6)* %p
ret i32 %r0
}
; GCN-LABEL: {{^}}load_sampler
; GCN: v_readfirstlane_b32
; GCN-NEXT: v_readfirstlane_b32
; SI: s_nop
; GCN-NEXT: s_load_dwordx8
; GCN-NEXT: s_load_dwordx4
; GCN: image_sample
; Pixel-shader case. An index is derived from the llvm.amdgcn.interp.mov
; result; an <8 x i32> and a <4 x i32> are then loaded through addrspace(6)
; GEPs off argument %1 (both GEPs tagged !amdgpu.uniform) and passed to
; llvm.amdgcn.image.sample. The four sampled components go to result slots
; 5-8, %4 (bitcast to i32) to slot 4 and %20 to slot 19.
define amdgpu_ps <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @load_sampler([0 x <4 x i32>] addrspace(6)* inreg noalias dereferenceable(18446744073709551615), [0 x <8 x i32>] addrspace(6)* inreg noalias dereferenceable(18446744073709551615), [0 x <4 x i32>] addrspace(6)* inreg noalias dereferenceable(18446744073709551615), [0 x <8 x i32>] addrspace(6)* inreg noalias dereferenceable(18446744073709551615), float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, i32, i32, float, i32) #5 {
main_body:
%22 = call nsz float @llvm.amdgcn.interp.mov(i32 2, i32 0, i32 0, i32 %5) #8
%23 = bitcast float %22 to i32
%24 = shl i32 %23, 1
%25 = getelementptr [0 x <8 x i32>], [0 x <8 x i32>] addrspace(6)* %1, i32 0, i32 %24, !amdgpu.uniform !0
%26 = load <8 x i32>, <8 x i32> addrspace(6)* %25, align 32, !invariant.load !0
%27 = shl i32 %23, 2
%28 = or i32 %27, 3
%29 = bitcast [0 x <8 x i32>] addrspace(6)* %1 to [0 x <4 x i32>] addrspace(6)*
%30 = getelementptr [0 x <4 x i32>], [0 x <4 x i32>] addrspace(6)* %29, i32 0, i32 %28, !amdgpu.uniform !0
%31 = load <4 x i32>, <4 x i32> addrspace(6)* %30, align 16, !invariant.load !0
%32 = call nsz <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> zeroinitializer, <8 x i32> %26, <4 x i32> %31, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #8
%33 = extractelement <4 x float> %32, i32 0
%34 = extractelement <4 x float> %32, i32 1
%35 = extractelement <4 x float> %32, i32 2
%36 = extractelement <4 x float> %32, i32 3
%37 = bitcast float %4 to i32
%38 = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> undef, i32 %37, 4
%39 = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %38, float %33, 5
%40 = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %39, float %34, 6
%41 = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %40, float %35, 7
%42 = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %41, float %36, 8
%43 = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %42, float %20, 19
ret <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %43
}
; GCN-LABEL: {{^}}load_sampler_nouniform
; GCN: v_readfirstlane_b32
; GCN-NEXT: v_readfirstlane_b32
; SI: s_nop
; GCN-NEXT: s_load_dwordx8
; GCN-NEXT: s_load_dwordx4
; GCN: image_sample
; Same instruction sequence as load_sampler, except that the two
; getelementptr instructions feeding the addrspace(6) loads carry no
; !amdgpu.uniform metadata.
define amdgpu_ps <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @load_sampler_nouniform([0 x <4 x i32>] addrspace(6)* inreg noalias dereferenceable(18446744073709551615), [0 x <8 x i32>] addrspace(6)* inreg noalias dereferenceable(18446744073709551615), [0 x <4 x i32>] addrspace(6)* inreg noalias dereferenceable(18446744073709551615), [0 x <8 x i32>] addrspace(6)* inreg noalias dereferenceable(18446744073709551615), float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, i32, i32, float, i32) #5 {
main_body:
%22 = call nsz float @llvm.amdgcn.interp.mov(i32 2, i32 0, i32 0, i32 %5) #8
%23 = bitcast float %22 to i32
%24 = shl i32 %23, 1
%25 = getelementptr [0 x <8 x i32>], [0 x <8 x i32>] addrspace(6)* %1, i32 0, i32 %24
%26 = load <8 x i32>, <8 x i32> addrspace(6)* %25, align 32, !invariant.load !0
%27 = shl i32 %23, 2
%28 = or i32 %27, 3
%29 = bitcast [0 x <8 x i32>] addrspace(6)* %1 to [0 x <4 x i32>] addrspace(6)*
%30 = getelementptr [0 x <4 x i32>], [0 x <4 x i32>] addrspace(6)* %29, i32 0, i32 %28
%31 = load <4 x i32>, <4 x i32> addrspace(6)* %30, align 16, !invariant.load !0
%32 = call nsz <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> zeroinitializer, <8 x i32> %26, <4 x i32> %31, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #8
%33 = extractelement <4 x float> %32, i32 0
%34 = extractelement <4 x float> %32, i32 1
%35 = extractelement <4 x float> %32, i32 2
%36 = extractelement <4 x float> %32, i32 3
%37 = bitcast float %4 to i32
%38 = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> undef, i32 %37, 4
%39 = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %38, float %33, 5
%40 = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %39, float %34, 6
%41 = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %40, float %35, 7
%42 = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %41, float %36, 8
%43 = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %42, float %20, 19
ret <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %43
}
; Function Attrs: nounwind readnone speculatable
declare float @llvm.amdgcn.interp.mov(i32, i32, i32, i32) #6
; Function Attrs: nounwind readonly
declare <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #7
!0 = !{}
attributes #0 = { nounwind }
attributes #1 = { nounwind "amdgpu-32bit-address-high-bits"="0" }
attributes #2 = { nounwind "amdgpu-32bit-address-high-bits"="1" }
attributes #3 = { nounwind "amdgpu-32bit-address-high-bits"="0xffff8000" }
attributes #4 = { nounwind "amdgpu-32bit-address-high-bits"="0xfffffff0" }
attributes #5 = { "InitialPSInputAddr"="45175" }
attributes #6 = { nounwind readnone speculatable }
attributes #7 = { nounwind readonly }
attributes #8 = { nounwind readnone }