R600: Use LDS and vectors for private memory
llvm-svn: 211110
This commit is contained in:
parent
85ad429f1f
commit
880a80ad07
|
@ -17,6 +17,7 @@
|
||||||
namespace llvm {
|
namespace llvm {
|
||||||
|
|
||||||
class AMDGPUInstrPrinter;
|
class AMDGPUInstrPrinter;
|
||||||
|
class AMDGPUSubtarget;
|
||||||
class AMDGPUTargetMachine;
|
class AMDGPUTargetMachine;
|
||||||
class FunctionPass;
|
class FunctionPass;
|
||||||
class MCAsmInfo;
|
class MCAsmInfo;
|
||||||
|
@ -47,6 +48,7 @@ void initializeSILowerI1CopiesPass(PassRegistry &);
|
||||||
extern char &SILowerI1CopiesID;
|
extern char &SILowerI1CopiesID;
|
||||||
|
|
||||||
// Passes common to R600 and SI
|
// Passes common to R600 and SI
|
||||||
|
FunctionPass *createAMDGPUPromoteAlloca(const AMDGPUSubtarget &ST);
|
||||||
Pass *createAMDGPUStructurizeCFGPass();
|
Pass *createAMDGPUStructurizeCFGPass();
|
||||||
FunctionPass *createAMDGPUISelDag(TargetMachine &tm);
|
FunctionPass *createAMDGPUISelDag(TargetMachine &tm);
|
||||||
|
|
||||||
|
|
|
@ -86,28 +86,40 @@ def FeatureWavefrontSize16 : SubtargetFeatureWavefrontSize<16>;
|
||||||
def FeatureWavefrontSize32 : SubtargetFeatureWavefrontSize<32>;
|
def FeatureWavefrontSize32 : SubtargetFeatureWavefrontSize<32>;
|
||||||
def FeatureWavefrontSize64 : SubtargetFeatureWavefrontSize<64>;
|
def FeatureWavefrontSize64 : SubtargetFeatureWavefrontSize<64>;
|
||||||
|
|
||||||
|
// Subtarget feature template parameterised on a local memory size in bytes.
// Instantiating it with Value yields a feature named "localmemorysize<Value>"
// whose "LocalMemorySize" field is given Value rendered as a string.
// NOTE(review): presumably this is what sets AMDGPUSubtarget::LocalMemorySize
// during ParseSubtargetFeatures -- confirm against the generated code.
class SubtargetFeatureLocalMemorySize <int Value> : SubtargetFeature<
|
||||||
|
"localmemorysize"#Value,
|
||||||
|
"LocalMemorySize",
|
||||||
|
!cast<string>(Value),
|
||||||
|
"The size of local memory in bytes">;
|
||||||
|
|
||||||
class SubtargetFeatureGeneration <string Value,
|
class SubtargetFeatureGeneration <string Value,
|
||||||
list<SubtargetFeature> Implies> :
|
list<SubtargetFeature> Implies> :
|
||||||
SubtargetFeature <Value, "Gen", "AMDGPUSubtarget::"#Value,
|
SubtargetFeature <Value, "Gen", "AMDGPUSubtarget::"#Value,
|
||||||
Value#" GPU generation", Implies>;
|
Value#" GPU generation", Implies>;
|
||||||
|
|
||||||
|
def FeatureLocalMemorySize0 : SubtargetFeatureLocalMemorySize<0>;
|
||||||
|
def FeatureLocalMemorySize32768 : SubtargetFeatureLocalMemorySize<32768>;
|
||||||
|
def FeatureLocalMemorySize65536 : SubtargetFeatureLocalMemorySize<65536>;
|
||||||
|
|
||||||
def FeatureR600 : SubtargetFeatureGeneration<"R600",
|
def FeatureR600 : SubtargetFeatureGeneration<"R600",
|
||||||
[FeatureR600ALUInst, FeatureFetchLimit8]>;
|
[FeatureR600ALUInst, FeatureFetchLimit8, FeatureLocalMemorySize0]>;
|
||||||
|
|
||||||
def FeatureR700 : SubtargetFeatureGeneration<"R700",
|
def FeatureR700 : SubtargetFeatureGeneration<"R700",
|
||||||
[FeatureFetchLimit16]>;
|
[FeatureFetchLimit16, FeatureLocalMemorySize0]>;
|
||||||
|
|
||||||
def FeatureEvergreen : SubtargetFeatureGeneration<"EVERGREEN",
|
def FeatureEvergreen : SubtargetFeatureGeneration<"EVERGREEN",
|
||||||
[FeatureFetchLimit16]>;
|
[FeatureFetchLimit16, FeatureLocalMemorySize32768]>;
|
||||||
|
|
||||||
def FeatureNorthernIslands : SubtargetFeatureGeneration<"NORTHERN_ISLANDS",
|
def FeatureNorthernIslands : SubtargetFeatureGeneration<"NORTHERN_ISLANDS",
|
||||||
[FeatureFetchLimit16, FeatureWavefrontSize64]>;
|
[FeatureFetchLimit16, FeatureWavefrontSize64,
|
||||||
|
FeatureLocalMemorySize32768]
|
||||||
|
>;
|
||||||
|
|
||||||
def FeatureSouthernIslands : SubtargetFeatureGeneration<"SOUTHERN_ISLANDS",
|
def FeatureSouthernIslands : SubtargetFeatureGeneration<"SOUTHERN_ISLANDS",
|
||||||
[Feature64BitPtr, FeatureFP64]>;
|
[Feature64BitPtr, FeatureFP64, FeatureLocalMemorySize32768]>;
|
||||||
|
|
||||||
def FeatureSeaIslands : SubtargetFeatureGeneration<"SEA_ISLANDS",
|
def FeatureSeaIslands : SubtargetFeatureGeneration<"SEA_ISLANDS",
|
||||||
[Feature64BitPtr, FeatureFP64]>;
|
[Feature64BitPtr, FeatureFP64, FeatureLocalMemorySize65536]>;
|
||||||
//===----------------------------------------------------------------------===//
|
//===----------------------------------------------------------------------===//
|
||||||
|
|
||||||
def AMDGPUInstrInfo : InstrInfo {
|
def AMDGPUInstrInfo : InstrInfo {
|
||||||
|
|
|
@ -258,6 +258,7 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
|
||||||
return CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, MVT::i64, Args);
|
return CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, MVT::i64, Args);
|
||||||
}
|
}
|
||||||
case ISD::SCALAR_TO_VECTOR:
|
case ISD::SCALAR_TO_VECTOR:
|
||||||
|
case AMDGPUISD::BUILD_VERTICAL_VECTOR:
|
||||||
case ISD::BUILD_VECTOR: {
|
case ISD::BUILD_VECTOR: {
|
||||||
unsigned RegClassID;
|
unsigned RegClassID;
|
||||||
const AMDGPURegisterInfo *TRI =
|
const AMDGPURegisterInfo *TRI =
|
||||||
|
@ -308,7 +309,12 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
|
||||||
// can't be bundled by our scheduler.
|
// can't be bundled by our scheduler.
|
||||||
switch(NumVectorElts) {
|
switch(NumVectorElts) {
|
||||||
case 2: RegClassID = AMDGPU::R600_Reg64RegClassID; break;
|
case 2: RegClassID = AMDGPU::R600_Reg64RegClassID; break;
|
||||||
case 4: RegClassID = AMDGPU::R600_Reg128RegClassID; break;
|
case 4:
|
||||||
|
if (Opc == AMDGPUISD::BUILD_VERTICAL_VECTOR)
|
||||||
|
RegClassID = AMDGPU::R600_Reg128VerticalRegClassID;
|
||||||
|
else
|
||||||
|
RegClassID = AMDGPU::R600_Reg128RegClassID;
|
||||||
|
break;
|
||||||
default: llvm_unreachable("Do not know how to lower this BUILD_VECTOR");
|
default: llvm_unreachable("Do not know how to lower this BUILD_VECTOR");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1911,6 +1911,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
|
||||||
NODE_NAME_CASE(CVT_F32_UBYTE1)
|
NODE_NAME_CASE(CVT_F32_UBYTE1)
|
||||||
NODE_NAME_CASE(CVT_F32_UBYTE2)
|
NODE_NAME_CASE(CVT_F32_UBYTE2)
|
||||||
NODE_NAME_CASE(CVT_F32_UBYTE3)
|
NODE_NAME_CASE(CVT_F32_UBYTE3)
|
||||||
|
NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
|
||||||
NODE_NAME_CASE(STORE_MSKOR)
|
NODE_NAME_CASE(STORE_MSKOR)
|
||||||
NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
|
NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
|
||||||
}
|
}
|
||||||
|
|
|
@ -203,6 +203,15 @@ enum {
|
||||||
CVT_F32_UBYTE1,
|
CVT_F32_UBYTE1,
|
||||||
CVT_F32_UBYTE2,
|
CVT_F32_UBYTE2,
|
||||||
CVT_F32_UBYTE3,
|
CVT_F32_UBYTE3,
|
||||||
|
/// This node is for VLIW targets and it is used to represent a vector
|
||||||
|
/// that is stored in consecutive registers with the same channel.
|
||||||
|
/// For example:
|
||||||
|
/// |X |Y|Z|W|
|
||||||
|
/// T0|v.x| | | |
|
||||||
|
/// T1|v.y| | | |
|
||||||
|
/// T2|v.z| | | |
|
||||||
|
/// T3|v.w| | | |
|
||||||
|
BUILD_VERTICAL_VECTOR,
|
||||||
FIRST_MEM_OPCODE_NUMBER = ISD::FIRST_TARGET_MEMORY_OPCODE,
|
FIRST_MEM_OPCODE_NUMBER = ISD::FIRST_TARGET_MEMORY_OPCODE,
|
||||||
STORE_MSKOR,
|
STORE_MSKOR,
|
||||||
LOAD_CONSTANT,
|
LOAD_CONSTANT,
|
||||||
|
|
|
@ -0,0 +1,365 @@
|
||||||
|
//===-- AMDGPUPromoteAlloca.cpp - Promote Allocas -------------------------===//
|
||||||
|
//
|
||||||
|
// The LLVM Compiler Infrastructure
|
||||||
|
//
|
||||||
|
// This file is distributed under the University of Illinois Open Source
|
||||||
|
// License. See LICENSE.TXT for details.
|
||||||
|
//
|
||||||
|
//===----------------------------------------------------------------------===//
|
||||||
|
//
|
||||||
|
// This pass eliminates allocas by either converting them into vectors or
|
||||||
|
// by migrating them to local address space.
|
||||||
|
//
|
||||||
|
//===----------------------------------------------------------------------===//
|
||||||
|
|
||||||
|
#include "AMDGPU.h"
|
||||||
|
#include "AMDGPUSubtarget.h"
|
||||||
|
#include "llvm/Analysis/ValueTracking.h"
|
||||||
|
#include "llvm/IR/IRBuilder.h"
|
||||||
|
#include "llvm/IR/InstVisitor.h"
|
||||||
|
#include "llvm/Support/Debug.h"
|
||||||
|
|
||||||
|
#define DEBUG_TYPE "amdgpu-promote-alloca"
|
||||||
|
|
||||||
|
using namespace llvm;
|
||||||
|
|
||||||
|
namespace {
|
||||||
|
|
||||||
|
class AMDGPUPromoteAlloca : public FunctionPass,
|
||||||
|
public InstVisitor<AMDGPUPromoteAlloca> {
|
||||||
|
|
||||||
|
static char ID;
|
||||||
|
Module *Mod;
|
||||||
|
const AMDGPUSubtarget &ST;
|
||||||
|
int LocalMemAvailable;
|
||||||
|
|
||||||
|
public:
|
||||||
|
AMDGPUPromoteAlloca(const AMDGPUSubtarget &st) : FunctionPass(ID), ST(st),
|
||||||
|
LocalMemAvailable(0) { }
|
||||||
|
virtual bool doInitialization(Module &M);
|
||||||
|
virtual bool runOnFunction(Function &F);
|
||||||
|
virtual const char *getPassName() const {
|
||||||
|
return "AMDGPU Promote Alloca";
|
||||||
|
}
|
||||||
|
void visitAlloca(AllocaInst &I);
|
||||||
|
};
|
||||||
|
|
||||||
|
} // End anonymous namespace
|
||||||
|
|
||||||
|
char AMDGPUPromoteAlloca::ID = 0;
|
||||||
|
|
||||||
|
bool AMDGPUPromoteAlloca::doInitialization(Module &M) {
|
||||||
|
Mod = &M;
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Work out how much local memory is still free for \p F and then visit every
/// alloca in the function, promoting those that qualify.
///
/// The budget starts at the subtarget's local memory size, drops to zero if
/// the function takes any pointer argument in the local address space, and is
/// otherwise reduced by the size of each local-address-space global that is
/// used by an instruction of \p F.
///
/// \returns false.
/// NOTE(review): visitAlloca() can rewrite IR, so returning false here looks
/// wrong for a pass that may have modified the function -- confirm and
/// consider propagating a "changed" flag.
bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {

  const FunctionType *FTy = F.getFunctionType();

  LocalMemAvailable = ST.getLocalMemorySize();

  // If the function has any arguments in the local address space, then it's
  // possible these arguments require the entire local memory space, so
  // we cannot use local memory in the pass.
  for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i) {
    const Type *ParamTy = FTy->getParamType(i);
    if (ParamTy->isPointerTy() &&
        ParamTy->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
      LocalMemAvailable = 0;
      DEBUG(dbgs() << "Function has local memory argument.  Promoting to "
                      "local memory disabled.\n");
      break;
    }
  }

  if (LocalMemAvailable > 0) {
    // Check how much local memory is being used by global objects
    for (Module::global_iterator I = Mod->global_begin(),
                                 E = Mod->global_end(); I != E; ++I) {
      GlobalVariable *GV = I;
      PointerType *GVTy = GV->getType();
      if (GVTy->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
        continue;

      // Walk the *users* of the global.  Dereferencing a use_iterator yields
      // the Use, which casts to the used value (the global itself), so an
      // Instruction cast on it can never succeed.
      for (User *U : GV->users()) {
        Instruction *UserInst = dyn_cast<Instruction>(U);
        if (!UserInst)
          continue;

        if (UserInst->getParent()->getParent() == &F) {
          LocalMemAvailable -=
            Mod->getDataLayout()->getTypeAllocSize(GVTy->getElementType());
          // The global occupies its storage once no matter how many
          // instructions of F touch it, so stop after the first hit.
          break;
        }
      }
    }
  }

  LocalMemAvailable = std::max(0, LocalMemAvailable);
  DEBUG(dbgs() << LocalMemAvailable << " bytes free in local memory.\n");

  visit(F);

  return false;
}
|
||||||
|
|
||||||
|
/// Build the vector type that has the same element type and element count as
/// the array type \p ArrayTy.
static VectorType *arrayTypeToVecType(const Type *ArrayTy) {
  Type *ElementTy = ArrayTy->getArrayElementType();
  const uint64_t ElementCount = ArrayTy->getArrayNumElements();
  return VectorType::get(ElementTy, ElementCount);
}
|
||||||
|
|
||||||
|
/// Return the vector element index addressed by \p Ptr.
///
/// \param Ptr     Either the alloca itself (index 0) or a GEP of it.
/// \param GEPIdx  Vector index previously computed for each GEP.  Taken by
///                const reference so the map is not copied on every call.
/// \returns the index value, or null if \p Ptr is a GEP with no entry in
///          \p GEPIdx.
static Value *
calculateVectorIndex(Value *Ptr,
                     const std::map<GetElementPtrInst *, Value *> &GEPIdx) {
  if (isa<AllocaInst>(Ptr))
    return Constant::getNullValue(Type::getInt32Ty(Ptr->getContext()));

  GetElementPtrInst *GEP = cast<GetElementPtrInst>(Ptr);

  // find() rather than operator[]: the lookup must not insert into the map.
  std::map<GetElementPtrInst *, Value *>::const_iterator Entry =
      GEPIdx.find(GEP);
  return Entry == GEPIdx.end() ? nullptr : Entry->second;
}
|
||||||
|
|
||||||
|
/// Translate \p GEP into a vector element index.
///
/// Only GEPs with exactly three operands whose first index is the constant
/// zero are accepted; for those the second index operand is returned.
/// Anything else yields NULL.
static Value* GEPToVectorIndex(GetElementPtrInst *GEP) {
  // FIXME we only support simple cases
  if (GEP->getNumOperands() == 3) {
    ConstantInt *LeadingIdx = dyn_cast<ConstantInt>(GEP->getOperand(1));
    if (LeadingIdx && LeadingIdx->isZero())
      return GEP->getOperand(2);
  }

  return NULL;
}
|
||||||
|
|
||||||
|
static bool tryPromoteAllocaToVector(AllocaInst *Alloca) {
|
||||||
|
Type *AllocaTy = Alloca->getAllocatedType();
|
||||||
|
|
||||||
|
DEBUG(dbgs() << "Alloca Candidate for vectorization \n");
|
||||||
|
|
||||||
|
// FIXME: There is no reason why we can't support larger arrays, we
|
||||||
|
// are just being conservative for now.
|
||||||
|
if (!AllocaTy->isArrayTy() ||
|
||||||
|
AllocaTy->getArrayElementType()->isVectorTy() ||
|
||||||
|
AllocaTy->getArrayNumElements() > 4) {
|
||||||
|
|
||||||
|
DEBUG(dbgs() << " Cannot convert type to vector");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::map<GetElementPtrInst*, Value*> GEPVectorIdx;
|
||||||
|
std::vector<Value*> WorkList;
|
||||||
|
for (User *AllocaUser : Alloca->users()) {
|
||||||
|
GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(AllocaUser);
|
||||||
|
if (!GEP) {
|
||||||
|
WorkList.push_back(AllocaUser);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
Value *Index = GEPToVectorIndex(GEP);
|
||||||
|
|
||||||
|
// If we can't compute a vector index from this GEP, then we can't
|
||||||
|
// promote this alloca to vector.
|
||||||
|
if (!Index) {
|
||||||
|
DEBUG(dbgs() << " Cannot compute vector index for GEP " << *GEP << "\n");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
GEPVectorIdx[GEP] = Index;
|
||||||
|
for (User *GEPUser : AllocaUser->users()) {
|
||||||
|
WorkList.push_back(GEPUser);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
VectorType *VectorTy = arrayTypeToVecType(AllocaTy);
|
||||||
|
|
||||||
|
DEBUG(dbgs() << " Converting alloca to vector "; AllocaTy->dump();
|
||||||
|
dbgs() << " -> "; VectorTy->dump(); dbgs() << "\n");
|
||||||
|
|
||||||
|
for (std::vector<Value*>::iterator I = WorkList.begin(),
|
||||||
|
E = WorkList.end(); I != E; ++I) {
|
||||||
|
Instruction *Inst = cast<Instruction>(*I);
|
||||||
|
IRBuilder<> Builder(Inst);
|
||||||
|
switch (Inst->getOpcode()) {
|
||||||
|
case Instruction::Load: {
|
||||||
|
Value *Ptr = Inst->getOperand(0);
|
||||||
|
Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
|
||||||
|
Value *BitCast = Builder.CreateBitCast(Alloca, VectorTy->getPointerTo(0));
|
||||||
|
Value *VecValue = Builder.CreateLoad(BitCast);
|
||||||
|
Value *ExtractElement = Builder.CreateExtractElement(VecValue, Index);
|
||||||
|
Inst->replaceAllUsesWith(ExtractElement);
|
||||||
|
Inst->eraseFromParent();
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
case Instruction::Store: {
|
||||||
|
Value *Ptr = Inst->getOperand(1);
|
||||||
|
Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
|
||||||
|
Value *BitCast = Builder.CreateBitCast(Alloca, VectorTy->getPointerTo(0));
|
||||||
|
Value *VecValue = Builder.CreateLoad(BitCast);
|
||||||
|
Value *NewVecValue = Builder.CreateInsertElement(VecValue,
|
||||||
|
Inst->getOperand(0),
|
||||||
|
Index);
|
||||||
|
Builder.CreateStore(NewVecValue, BitCast);
|
||||||
|
Inst->eraseFromParent();
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
case Instruction::BitCast:
|
||||||
|
break;
|
||||||
|
|
||||||
|
default:
|
||||||
|
Inst->dump();
|
||||||
|
llvm_unreachable("Do not know how to replace this instruction "
|
||||||
|
"with vector op");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Append to \p WorkList the users of \p Val that are either call
/// instructions or produce a pointer-typed value.
///
/// Pointer-typed non-call users are followed recursively; calls are recorded
/// but not followed.  A user already present in \p WorkList is skipped.
static void collectUsesWithPtrTypes(Value *Val, std::vector<Value*> &WorkList) {
  for (User *U : Val->users()) {
    const bool AlreadyQueued =
        std::find(WorkList.begin(), WorkList.end(), U) != WorkList.end();
    if (AlreadyQueued)
      continue;

    const bool IsCall = isa<CallInst>(U);
    if (!IsCall && !U->getType()->isPointerTy())
      continue;

    WorkList.push_back(U);

    // Only pointer-producing, non-call users are traversed further.
    if (!IsCall)
      collectUsesWithPtrTypes(U, WorkList);
  }
}
|
||||||
|
|
||||||
|
/// Promote a single alloca.
///
/// The alloca is first offered to tryPromoteAllocaToVector().  If that fails
/// and enough local memory is left, the alloca is replaced by a slot in a new
/// local-address-space global array with one element per work item, indexed
/// by the linearised work-item id.  All pointer-typed values derived from the
/// alloca are then retyped to the local address space, and calls/intrinsics
/// that consumed the pointer are patched up.
void AMDGPUPromoteAlloca::visitAlloca(AllocaInst &I) {
  IRBuilder<> Builder(&I);

  // First try to replace the alloca with a vector
  Type *AllocaTy = I.getAllocatedType();

  DEBUG(dbgs() << "Trying to promote " << I);

  if (tryPromoteAllocaToVector(&I))
    return;

  DEBUG(dbgs() << " alloca is not a candidate for vectorization.\n");

  // FIXME: This is the maximum work group size.  We should try to get
  // value from the reqd_work_group_size function attribute if it is
  // available.
  const unsigned WorkGroupSize = 256;

  // Computed in 64 bits: narrowing the product to int could wrap for a large
  // alloca and make it look as if it fitted.
  const uint64_t AllocaSize = static_cast<uint64_t>(WorkGroupSize) *
      Mod->getDataLayout()->getTypeAllocSize(AllocaTy);

  if (LocalMemAvailable < 0 ||
      AllocaSize > static_cast<uint64_t>(LocalMemAvailable)) {
    DEBUG(dbgs() << " Not enough local memory to promote alloca.\n");
    return;
  }

  DEBUG(dbgs() << "Promoting alloca to local memory\n");
  // Safe narrowing: AllocaSize <= LocalMemAvailable was just established.
  LocalMemAvailable -= static_cast<int>(AllocaSize);

  // One element per work item; the length must match the size charged above.
  GlobalVariable *GV = new GlobalVariable(
      *Mod, ArrayType::get(AllocaTy, WorkGroupSize), false,
      GlobalValue::ExternalLinkage, 0, I.getName(), 0,
      GlobalVariable::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS);

  FunctionType *FTy = FunctionType::get(
      Type::getInt32Ty(Mod->getContext()), false);

  // AttributeSet is immutable: addAttribute() returns the new set, so the
  // result has to be kept.  ReadNone is a function attribute, hence
  // FunctionIndex rather than the return-value slot 0.
  AttributeSet AttrSet;
  AttrSet = AttrSet.addAttribute(Mod->getContext(),
                                 AttributeSet::FunctionIndex,
                                 Attribute::ReadNone);

  Value *ReadLocalSizeY = Mod->getOrInsertFunction(
      "llvm.r600.read.local.size.y", FTy, AttrSet);
  Value *ReadLocalSizeZ = Mod->getOrInsertFunction(
      "llvm.r600.read.local.size.z", FTy, AttrSet);
  Value *ReadTIDIGX = Mod->getOrInsertFunction(
      "llvm.r600.read.tidig.x", FTy, AttrSet);
  Value *ReadTIDIGY = Mod->getOrInsertFunction(
      "llvm.r600.read.tidig.y", FTy, AttrSet);
  Value *ReadTIDIGZ = Mod->getOrInsertFunction(
      "llvm.r600.read.tidig.z", FTy, AttrSet);

  Value *TCntY = Builder.CreateCall(ReadLocalSizeY);
  Value *TCntZ = Builder.CreateCall(ReadLocalSizeZ);
  Value *TIdX = Builder.CreateCall(ReadTIDIGX);
  Value *TIdY = Builder.CreateCall(ReadTIDIGY);
  Value *TIdZ = Builder.CreateCall(ReadTIDIGZ);

  // Linear work-item id: TIdX * (TCntY * TCntZ) + TIdY * TCntZ + TIdZ.
  Value *Tmp0 = Builder.CreateMul(TCntY, TCntZ);
  Tmp0 = Builder.CreateMul(Tmp0, TIdX);
  Value *Tmp1 = Builder.CreateMul(TIdY, TCntZ);
  Value *TID = Builder.CreateAdd(Tmp0, Tmp1);
  TID = Builder.CreateAdd(TID, TIdZ);

  std::vector<Value*> Indices;
  Indices.push_back(Constant::getNullValue(Type::getInt32Ty(Mod->getContext())));
  Indices.push_back(TID);

  // The alloca is retyped before RAUW so its type matches the replacement
  // pointer, which lives in the local address space.
  Value *Offset = Builder.CreateGEP(GV, Indices);
  I.mutateType(Offset->getType());
  I.replaceAllUsesWith(Offset);
  I.eraseFromParent();

  std::vector<Value*> WorkList;

  collectUsesWithPtrTypes(Offset, WorkList);

  for (std::vector<Value*>::iterator i = WorkList.begin(),
                                     e = WorkList.end(); i != e; ++i) {
    Value *V = *i;
    CallInst *Call = dyn_cast<CallInst>(V);
    if (!Call) {
      // Pointer-typed value derived from the alloca: move it to the local
      // address space, keeping the pointee type.
      Type *EltTy = V->getType()->getPointerElementType();
      PointerType *NewTy = PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS);
      V->mutateType(NewTy);
      continue;
    }

    IntrinsicInst *Intr = dyn_cast<IntrinsicInst>(Call);
    if (!Intr) {
      // Ordinary call: redirect it to "<callee>.local", declared with the
      // call's current (already retyped) argument types.
      std::vector<Type*> ArgTypes;
      for (unsigned ArgIdx = 0, ArgEnd = Call->getNumArgOperands();
                                ArgIdx != ArgEnd; ++ArgIdx) {
        ArgTypes.push_back(Call->getArgOperand(ArgIdx)->getType());
      }
      // NOTE(review): getCalledFunction() is dereferenced unchecked; confirm
      // that indirect calls cannot reach this point.
      Function *F = Call->getCalledFunction();
      FunctionType *NewType = FunctionType::get(Call->getType(), ArgTypes,
                                                F->isVarArg());
      Constant *C = Mod->getOrInsertFunction(StringRef(F->getName().str() + ".local"), NewType,
                                             F->getAttributes());
      Function *NewF = cast<Function>(C);
      Call->setCalledFunction(NewF);
      continue;
    }

    Builder.SetInsertPoint(Intr);
    switch (Intr->getIntrinsicID()) {
    case Intrinsic::lifetime_start:
    case Intrinsic::lifetime_end:
      // These intrinsics are for address space 0 only
      Intr->eraseFromParent();
      continue;
    case Intrinsic::memcpy: {
      // Re-emit through the builder and drop the old call.
      // NOTE(review): presumably so the intrinsic declaration matches the new
      // pointer address spaces -- confirm.
      MemCpyInst *MemCpy = cast<MemCpyInst>(Intr);
      Builder.CreateMemCpy(MemCpy->getRawDest(), MemCpy->getRawSource(),
                           MemCpy->getLength(), MemCpy->getAlignment(),
                           MemCpy->isVolatile());
      Intr->eraseFromParent();
      continue;
    }
    case Intrinsic::memset: {
      MemSetInst *MemSet = cast<MemSetInst>(Intr);
      Builder.CreateMemSet(MemSet->getRawDest(), MemSet->getValue(),
                           MemSet->getLength(), MemSet->getAlignment(),
                           MemSet->isVolatile());
      Intr->eraseFromParent();
      continue;
    }
    default:
      Intr->dump();
      llvm_unreachable("Don't know how to promote alloca intrinsic use.");
    }
  }
}
|
||||||
|
|
||||||
|
/// Factory for the alloca promotion pass.
///
/// \param ST Subtarget the new pass keeps a reference to.
/// \returns a newly allocated pass instance.
FunctionPass *llvm::createAMDGPUPromoteAlloca(const AMDGPUSubtarget &ST) {
  FunctionPass *PromotePass = new AMDGPUPromoteAlloca(ST);
  return PromotePass;
}
|
|
@ -41,6 +41,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(StringRef TT, StringRef CPU, StringRef FS) :
|
||||||
EnableIfCvt = true;
|
EnableIfCvt = true;
|
||||||
WavefrontSize = 0;
|
WavefrontSize = 0;
|
||||||
CFALUBug = false;
|
CFALUBug = false;
|
||||||
|
LocalMemorySize = 0;
|
||||||
ParseSubtargetFeatures(GPU, FS);
|
ParseSubtargetFeatures(GPU, FS);
|
||||||
DevName = GPU;
|
DevName = GPU;
|
||||||
|
|
||||||
|
@ -109,6 +110,10 @@ AMDGPUSubtarget::hasCFAluBug() const {
|
||||||
assert(getGeneration() <= NORTHERN_ISLANDS);
|
assert(getGeneration() <= NORTHERN_ISLANDS);
|
||||||
return CFALUBug;
|
return CFALUBug;
|
||||||
}
|
}
|
||||||
|
int
|
||||||
|
AMDGPUSubtarget::getLocalMemorySize() const {
|
||||||
|
return LocalMemorySize;
|
||||||
|
}
|
||||||
bool
|
bool
|
||||||
AMDGPUSubtarget::isTargetELF() const {
|
AMDGPUSubtarget::isTargetELF() const {
|
||||||
return false;
|
return false;
|
||||||
|
|
|
@ -56,6 +56,7 @@ private:
|
||||||
bool EnableIfCvt;
|
bool EnableIfCvt;
|
||||||
unsigned WavefrontSize;
|
unsigned WavefrontSize;
|
||||||
bool CFALUBug;
|
bool CFALUBug;
|
||||||
|
int LocalMemorySize;
|
||||||
|
|
||||||
InstrItineraryData InstrItins;
|
InstrItineraryData InstrItins;
|
||||||
|
|
||||||
|
@ -109,6 +110,7 @@ public:
|
||||||
unsigned getWavefrontSize() const;
|
unsigned getWavefrontSize() const;
|
||||||
unsigned getStackEntrySize() const;
|
unsigned getStackEntrySize() const;
|
||||||
bool hasCFAluBug() const;
|
bool hasCFAluBug() const;
|
||||||
|
int getLocalMemorySize() const;
|
||||||
|
|
||||||
bool enableMachineScheduler() const override {
|
bool enableMachineScheduler() const override {
|
||||||
return getGeneration() <= NORTHERN_ISLANDS;
|
return getGeneration() <= NORTHERN_ISLANDS;
|
||||||
|
|
|
@ -109,6 +109,7 @@ public:
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
virtual void addCodeGenPrepare();
|
||||||
bool addPreISel() override;
|
bool addPreISel() override;
|
||||||
bool addInstSelector() override;
|
bool addInstSelector() override;
|
||||||
bool addPreRegAlloc() override;
|
bool addPreRegAlloc() override;
|
||||||
|
@ -134,6 +135,13 @@ void AMDGPUTargetMachine::addAnalysisPasses(PassManagerBase &PM) {
|
||||||
PM.add(createAMDGPUTargetTransformInfoPass(this));
|
PM.add(createAMDGPUTargetTransformInfoPass(this));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void AMDGPUPassConfig::addCodeGenPrepare() {
|
||||||
|
const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>();
|
||||||
|
addPass(createAMDGPUPromoteAlloca(ST));
|
||||||
|
addPass(createSROAPass());
|
||||||
|
TargetPassConfig::addCodeGenPrepare();
|
||||||
|
}
|
||||||
|
|
||||||
bool
|
bool
|
||||||
AMDGPUPassConfig::addPreISel() {
|
AMDGPUPassConfig::addPreISel() {
|
||||||
const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>();
|
const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>();
|
||||||
|
|
|
@ -25,6 +25,7 @@ add_llvm_target(R600CodeGen
|
||||||
AMDGPUTargetTransformInfo.cpp
|
AMDGPUTargetTransformInfo.cpp
|
||||||
AMDGPUISelLowering.cpp
|
AMDGPUISelLowering.cpp
|
||||||
AMDGPUInstrInfo.cpp
|
AMDGPUInstrInfo.cpp
|
||||||
|
AMDGPUPromoteAlloca.cpp
|
||||||
AMDGPURegisterInfo.cpp
|
AMDGPURegisterInfo.cpp
|
||||||
R600ClauseMergePass.cpp
|
R600ClauseMergePass.cpp
|
||||||
R600ControlFlowFinalizer.cpp
|
R600ControlFlowFinalizer.cpp
|
||||||
|
|
|
@ -136,6 +136,16 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
|
||||||
setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
|
setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
|
||||||
setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
|
setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
|
||||||
|
|
||||||
|
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i32, Custom);
|
||||||
|
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f32, Custom);
|
||||||
|
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
|
||||||
|
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
|
||||||
|
|
||||||
|
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i32, Custom);
|
||||||
|
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f32, Custom);
|
||||||
|
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
|
||||||
|
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
|
||||||
|
|
||||||
setTargetDAGCombine(ISD::FP_ROUND);
|
setTargetDAGCombine(ISD::FP_ROUND);
|
||||||
setTargetDAGCombine(ISD::FP_TO_SINT);
|
setTargetDAGCombine(ISD::FP_TO_SINT);
|
||||||
setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
|
setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
|
||||||
|
@ -540,6 +550,8 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
|
||||||
R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
|
R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
|
||||||
switch (Op.getOpcode()) {
|
switch (Op.getOpcode()) {
|
||||||
default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
|
default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
|
||||||
|
case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
|
||||||
|
case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
|
||||||
case ISD::FCOS:
|
case ISD::FCOS:
|
||||||
case ISD::FSIN: return LowerTrig(Op, DAG);
|
case ISD::FSIN: return LowerTrig(Op, DAG);
|
||||||
case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
|
case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
|
||||||
|
@ -812,6 +824,56 @@ void R600TargetLowering::ReplaceNodeResults(SDNode *N,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Rebuild \p Vector as a BUILD_VERTICAL_VECTOR node of the same type by
/// extracting each element with a constant index and passing the extracted
/// elements as the operands of the new node.
SDValue R600TargetLowering::vectorToVerticalVector(SelectionDAG &DAG,
                                                   SDValue Vector) const {
  SDLoc DL(Vector);
  const EVT VecVT = Vector.getValueType();
  const EVT EltVT = VecVT.getVectorElementType();
  const unsigned NumElts = VecVT.getVectorNumElements();

  SmallVector<SDValue, 8> Elements;
  for (unsigned Lane = 0; Lane != NumElts; ++Lane) {
    SDValue LaneIdx = DAG.getConstant(Lane, getVectorIdxTy());
    SDValue Element =
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Vector, LaneIdx);
    Elements.push_back(Element);
  }

  return DAG.getNode(AMDGPUISD::BUILD_VERTICAL_VECTOR, DL, VecVT, Elements);
}
|
||||||
|
|
||||||
|
/// Custom lowering for EXTRACT_VECTOR_ELT.
///
/// The node is returned unchanged when the index is a constant or the source
/// is already a BUILD_VERTICAL_VECTOR.  Otherwise the source vector is
/// converted with vectorToVerticalVector() and the extract is re-created on
/// top of the converted vector.
SDValue R600TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
                                                    SelectionDAG &DAG) const {
  SDLoc DL(Op);
  SDValue SrcVec = Op.getOperand(0);
  SDValue EltIdx = Op.getOperand(1);

  const bool HasConstantIndex = isa<ConstantSDNode>(EltIdx);
  if (HasConstantIndex ||
      SrcVec.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR)
    return Op;

  SDValue Vertical = vectorToVerticalVector(DAG, SrcVec);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getValueType(),
                     Vertical, EltIdx);
}
|
||||||
|
|
||||||
|
/// Custom lowering for INSERT_VECTOR_ELT.
///
/// The node is returned unchanged when the index is a constant or the
/// destination vector is already a BUILD_VERTICAL_VECTOR.  Otherwise the
/// vector is converted with vectorToVerticalVector(), the insert is
/// re-created on the converted vector, and the result is converted once more.
SDValue R600TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
                                                   SelectionDAG &DAG) const {
  SDLoc DL(Op);
  SDValue DstVec = Op.getOperand(0);
  SDValue NewElt = Op.getOperand(1);
  SDValue EltIdx = Op.getOperand(2);

  const bool HasConstantIndex = isa<ConstantSDNode>(EltIdx);
  if (HasConstantIndex ||
      DstVec.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR)
    return Op;

  SDValue Vertical = vectorToVerticalVector(DAG, DstVec);
  SDValue Inserted = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL,
                                 Op.getValueType(), Vertical, NewElt, EltIdx);
  return vectorToVerticalVector(DAG, Inserted);
}
|
||||||
|
|
||||||
SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
|
SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
|
||||||
// On hw >= R700, COS/SIN input must be between -1. and 1.
|
// On hw >= R700, COS/SIN input must be between -1. and 1.
|
||||||
// Thus we lower them to TRIG ( FRACT ( x / 2Pi + 0.5) - 0.5)
|
// Thus we lower them to TRIG ( FRACT ( x / 2Pi + 0.5) - 0.5)
|
||||||
|
|
|
@ -51,7 +51,10 @@ private:
|
||||||
void lowerImplicitParameter(MachineInstr *MI, MachineBasicBlock &BB,
|
void lowerImplicitParameter(MachineInstr *MI, MachineBasicBlock &BB,
|
||||||
MachineRegisterInfo & MRI, unsigned dword_offset) const;
|
MachineRegisterInfo & MRI, unsigned dword_offset) const;
|
||||||
SDValue OptimizeSwizzle(SDValue BuildVector, SDValue Swz[], SelectionDAG &DAG) const;
|
SDValue OptimizeSwizzle(SDValue BuildVector, SDValue Swz[], SelectionDAG &DAG) const;
|
||||||
|
SDValue vectorToVerticalVector(SelectionDAG &DAG, SDValue Vector) const;
|
||||||
|
|
||||||
|
SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
|
||||||
|
SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
|
||||||
SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
|
SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
|
||||||
SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
|
SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
|
||||||
SDValue LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const;
|
SDValue LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const;
|
||||||
|
|
|
@ -51,11 +51,15 @@ R600InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
|
||||||
unsigned DestReg, unsigned SrcReg,
|
unsigned DestReg, unsigned SrcReg,
|
||||||
bool KillSrc) const {
|
bool KillSrc) const {
|
||||||
unsigned VectorComponents = 0;
|
unsigned VectorComponents = 0;
|
||||||
if (AMDGPU::R600_Reg128RegClass.contains(DestReg) &&
|
if ((AMDGPU::R600_Reg128RegClass.contains(DestReg) ||
|
||||||
AMDGPU::R600_Reg128RegClass.contains(SrcReg)) {
|
AMDGPU::R600_Reg128VerticalRegClass.contains(DestReg)) &&
|
||||||
|
(AMDGPU::R600_Reg128RegClass.contains(SrcReg) ||
|
||||||
|
AMDGPU::R600_Reg128VerticalRegClass.contains(SrcReg))) {
|
||||||
VectorComponents = 4;
|
VectorComponents = 4;
|
||||||
} else if(AMDGPU::R600_Reg64RegClass.contains(DestReg) &&
|
} else if((AMDGPU::R600_Reg64RegClass.contains(DestReg) ||
|
||||||
AMDGPU::R600_Reg64RegClass.contains(SrcReg)) {
|
AMDGPU::R600_Reg64VerticalRegClass.contains(DestReg)) &&
|
||||||
|
(AMDGPU::R600_Reg64RegClass.contains(SrcReg) ||
|
||||||
|
AMDGPU::R600_Reg64VerticalRegClass.contains(SrcReg))) {
|
||||||
VectorComponents = 2;
|
VectorComponents = 2;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1053,6 +1057,29 @@ unsigned int R600InstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
|
||||||
return 2;
|
return 2;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool R600InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
|
||||||
|
|
||||||
|
switch(MI->getOpcode()) {
|
||||||
|
default: return AMDGPUInstrInfo::expandPostRAPseudo(MI);
|
||||||
|
case AMDGPU::R600_EXTRACT_ELT_V2:
|
||||||
|
case AMDGPU::R600_EXTRACT_ELT_V4:
|
||||||
|
buildIndirectRead(MI->getParent(), MI, MI->getOperand(0).getReg(),
|
||||||
|
RI.getHWRegIndex(MI->getOperand(1).getReg()), // Address
|
||||||
|
MI->getOperand(2).getReg(),
|
||||||
|
RI.getHWRegChan(MI->getOperand(1).getReg()));
|
||||||
|
break;
|
||||||
|
case AMDGPU::R600_INSERT_ELT_V2:
|
||||||
|
case AMDGPU::R600_INSERT_ELT_V4:
|
||||||
|
buildIndirectWrite(MI->getParent(), MI, MI->getOperand(2).getReg(), // Value
|
||||||
|
RI.getHWRegIndex(MI->getOperand(1).getReg()), // Address
|
||||||
|
MI->getOperand(3).getReg(), // Offset
|
||||||
|
RI.getHWRegChan(MI->getOperand(1).getReg())); // Channel
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
MI->eraseFromParent();
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
void R600InstrInfo::reserveIndirectRegisters(BitVector &Reserved,
|
void R600InstrInfo::reserveIndirectRegisters(BitVector &Reserved,
|
||||||
const MachineFunction &MF) const {
|
const MachineFunction &MF) const {
|
||||||
const AMDGPUFrameLowering *TFL =
|
const AMDGPUFrameLowering *TFL =
|
||||||
|
@ -1090,7 +1117,22 @@ MachineInstrBuilder R600InstrInfo::buildIndirectWrite(MachineBasicBlock *MBB,
|
||||||
MachineBasicBlock::iterator I,
|
MachineBasicBlock::iterator I,
|
||||||
unsigned ValueReg, unsigned Address,
|
unsigned ValueReg, unsigned Address,
|
||||||
unsigned OffsetReg) const {
|
unsigned OffsetReg) const {
|
||||||
unsigned AddrReg = AMDGPU::R600_AddrRegClass.getRegister(Address);
|
return buildIndirectWrite(MBB, I, ValueReg, Address, OffsetReg, 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
MachineInstrBuilder R600InstrInfo::buildIndirectWrite(MachineBasicBlock *MBB,
|
||||||
|
MachineBasicBlock::iterator I,
|
||||||
|
unsigned ValueReg, unsigned Address,
|
||||||
|
unsigned OffsetReg,
|
||||||
|
unsigned AddrChan) const {
|
||||||
|
unsigned AddrReg;
|
||||||
|
switch (AddrChan) {
|
||||||
|
default: llvm_unreachable("Invalid Channel");
|
||||||
|
case 0: AddrReg = AMDGPU::R600_AddrRegClass.getRegister(Address); break;
|
||||||
|
case 1: AddrReg = AMDGPU::R600_Addr_YRegClass.getRegister(Address); break;
|
||||||
|
case 2: AddrReg = AMDGPU::R600_Addr_ZRegClass.getRegister(Address); break;
|
||||||
|
case 3: AddrReg = AMDGPU::R600_Addr_WRegClass.getRegister(Address); break;
|
||||||
|
}
|
||||||
MachineInstr *MOVA = buildDefaultInstruction(*MBB, I, AMDGPU::MOVA_INT_eg,
|
MachineInstr *MOVA = buildDefaultInstruction(*MBB, I, AMDGPU::MOVA_INT_eg,
|
||||||
AMDGPU::AR_X, OffsetReg);
|
AMDGPU::AR_X, OffsetReg);
|
||||||
setImmOperand(MOVA, AMDGPU::OpName::write, 0);
|
setImmOperand(MOVA, AMDGPU::OpName::write, 0);
|
||||||
|
@ -1107,7 +1149,22 @@ MachineInstrBuilder R600InstrInfo::buildIndirectRead(MachineBasicBlock *MBB,
|
||||||
MachineBasicBlock::iterator I,
|
MachineBasicBlock::iterator I,
|
||||||
unsigned ValueReg, unsigned Address,
|
unsigned ValueReg, unsigned Address,
|
||||||
unsigned OffsetReg) const {
|
unsigned OffsetReg) const {
|
||||||
unsigned AddrReg = AMDGPU::R600_AddrRegClass.getRegister(Address);
|
return buildIndirectRead(MBB, I, ValueReg, Address, OffsetReg, 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
MachineInstrBuilder R600InstrInfo::buildIndirectRead(MachineBasicBlock *MBB,
|
||||||
|
MachineBasicBlock::iterator I,
|
||||||
|
unsigned ValueReg, unsigned Address,
|
||||||
|
unsigned OffsetReg,
|
||||||
|
unsigned AddrChan) const {
|
||||||
|
unsigned AddrReg;
|
||||||
|
switch (AddrChan) {
|
||||||
|
default: llvm_unreachable("Invalid Channel");
|
||||||
|
case 0: AddrReg = AMDGPU::R600_AddrRegClass.getRegister(Address); break;
|
||||||
|
case 1: AddrReg = AMDGPU::R600_Addr_YRegClass.getRegister(Address); break;
|
||||||
|
case 2: AddrReg = AMDGPU::R600_Addr_ZRegClass.getRegister(Address); break;
|
||||||
|
case 3: AddrReg = AMDGPU::R600_Addr_WRegClass.getRegister(Address); break;
|
||||||
|
}
|
||||||
MachineInstr *MOVA = buildDefaultInstruction(*MBB, I, AMDGPU::MOVA_INT_eg,
|
MachineInstr *MOVA = buildDefaultInstruction(*MBB, I, AMDGPU::MOVA_INT_eg,
|
||||||
AMDGPU::AR_X,
|
AMDGPU::AR_X,
|
||||||
OffsetReg);
|
OffsetReg);
|
||||||
|
|
|
@ -36,6 +36,18 @@ namespace llvm {
|
||||||
std::vector<std::pair<int, unsigned> >
|
std::vector<std::pair<int, unsigned> >
|
||||||
ExtractSrcs(MachineInstr *MI, const DenseMap<unsigned, unsigned> &PV, unsigned &ConstCount) const;
|
ExtractSrcs(MachineInstr *MI, const DenseMap<unsigned, unsigned> &PV, unsigned &ConstCount) const;
|
||||||
|
|
||||||
|
|
||||||
|
MachineInstrBuilder buildIndirectRead(MachineBasicBlock *MBB,
|
||||||
|
MachineBasicBlock::iterator I,
|
||||||
|
unsigned ValueReg, unsigned Address,
|
||||||
|
unsigned OffsetReg,
|
||||||
|
unsigned AddrChan) const;
|
||||||
|
|
||||||
|
MachineInstrBuilder buildIndirectWrite(MachineBasicBlock *MBB,
|
||||||
|
MachineBasicBlock::iterator I,
|
||||||
|
unsigned ValueReg, unsigned Address,
|
||||||
|
unsigned OffsetReg,
|
||||||
|
unsigned AddrChan) const;
|
||||||
public:
|
public:
|
||||||
enum BankSwizzle {
|
enum BankSwizzle {
|
||||||
ALU_VEC_012_SCL_210 = 0,
|
ALU_VEC_012_SCL_210 = 0,
|
||||||
|
@ -195,6 +207,8 @@ namespace llvm {
|
||||||
int getInstrLatency(const InstrItineraryData *ItinData,
|
int getInstrLatency(const InstrItineraryData *ItinData,
|
||||||
SDNode *Node) const override { return 1;}
|
SDNode *Node) const override { return 1;}
|
||||||
|
|
||||||
|
virtual bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const;
|
||||||
|
|
||||||
/// \brief Reserve the registers that may be accessed using indirect addressing.
|
/// \brief Reserve the registers that may be accessed using indirect addressing.
|
||||||
void reserveIndirectRegisters(BitVector &Reserved,
|
void reserveIndirectRegisters(BitVector &Reserved,
|
||||||
const MachineFunction &MF) const;
|
const MachineFunction &MF) const;
|
||||||
|
|
|
@ -1581,6 +1581,60 @@ let isTerminator=1 in {
|
||||||
defm CONTINUEC : BranchInstr2<"CONTINUEC">;
|
defm CONTINUEC : BranchInstr2<"CONTINUEC">;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//===----------------------------------------------------------------------===//
|
||||||
|
// Indirect addressing pseudo instructions
|
||||||
|
//===----------------------------------------------------------------------===//
|
||||||
|
|
||||||
|
let isPseudo = 1 in {
|
||||||
|
|
||||||
|
class ExtractVertical <RegisterClass vec_rc> : InstR600 <
|
||||||
|
(outs R600_Reg32:$dst),
|
||||||
|
(ins vec_rc:$vec, R600_Reg32:$index), "",
|
||||||
|
[],
|
||||||
|
AnyALU
|
||||||
|
>;
|
||||||
|
|
||||||
|
let Constraints = "$dst = $vec" in {
|
||||||
|
|
||||||
|
class InsertVertical <RegisterClass vec_rc> : InstR600 <
|
||||||
|
(outs vec_rc:$dst),
|
||||||
|
(ins vec_rc:$vec, R600_Reg32:$value, R600_Reg32:$index), "",
|
||||||
|
[],
|
||||||
|
AnyALU
|
||||||
|
>;
|
||||||
|
|
||||||
|
} // End Constraints = "$dst = $vec"
|
||||||
|
|
||||||
|
} // End isPseudo = 1
|
||||||
|
|
||||||
|
def R600_EXTRACT_ELT_V2 : ExtractVertical <R600_Reg64Vertical>;
|
||||||
|
def R600_EXTRACT_ELT_V4 : ExtractVertical <R600_Reg128Vertical>;
|
||||||
|
|
||||||
|
def R600_INSERT_ELT_V2 : InsertVertical <R600_Reg64Vertical>;
|
||||||
|
def R600_INSERT_ELT_V4 : InsertVertical <R600_Reg128Vertical>;
|
||||||
|
|
||||||
|
class ExtractVerticalPat <Instruction inst, ValueType vec_ty,
|
||||||
|
ValueType scalar_ty> : Pat <
|
||||||
|
(scalar_ty (extractelt vec_ty:$vec, i32:$index)),
|
||||||
|
(inst $vec, $index)
|
||||||
|
>;
|
||||||
|
|
||||||
|
def : ExtractVerticalPat <R600_EXTRACT_ELT_V2, v2i32, i32>;
|
||||||
|
def : ExtractVerticalPat <R600_EXTRACT_ELT_V2, v2f32, f32>;
|
||||||
|
def : ExtractVerticalPat <R600_EXTRACT_ELT_V4, v4i32, i32>;
|
||||||
|
def : ExtractVerticalPat <R600_EXTRACT_ELT_V4, v4f32, f32>;
|
||||||
|
|
||||||
|
class InsertVerticalPat <Instruction inst, ValueType vec_ty,
|
||||||
|
ValueType scalar_ty> : Pat <
|
||||||
|
(vec_ty (insertelt vec_ty:$vec, scalar_ty:$value, i32:$index)),
|
||||||
|
(inst $vec, $value, $index)
|
||||||
|
>;
|
||||||
|
|
||||||
|
def : InsertVerticalPat <R600_INSERT_ELT_V2, v2i32, i32>;
|
||||||
|
def : InsertVerticalPat <R600_INSERT_ELT_V2, v2f32, f32>;
|
||||||
|
def : InsertVerticalPat <R600_INSERT_ELT_V4, v4i32, i32>;
|
||||||
|
def : InsertVerticalPat <R600_INSERT_ELT_V4, v4f32, f32>;
|
||||||
|
|
||||||
//===----------------------------------------------------------------------===//
|
//===----------------------------------------------------------------------===//
|
||||||
// ISel Patterns
|
// ISel Patterns
|
||||||
//===----------------------------------------------------------------------===//
|
//===----------------------------------------------------------------------===//
|
||||||
|
|
|
@ -18,18 +18,28 @@ class R600RegWithChan <string name, bits<9> sel, string chan> :
|
||||||
|
|
||||||
class R600Reg_128<string n, list<Register> subregs, bits<16> encoding> :
|
class R600Reg_128<string n, list<Register> subregs, bits<16> encoding> :
|
||||||
RegisterWithSubRegs<n, subregs> {
|
RegisterWithSubRegs<n, subregs> {
|
||||||
|
field bits<2> chan_encoding = 0;
|
||||||
let Namespace = "AMDGPU";
|
let Namespace = "AMDGPU";
|
||||||
let SubRegIndices = [sub0, sub1, sub2, sub3];
|
let SubRegIndices = [sub0, sub1, sub2, sub3];
|
||||||
let HWEncoding = encoding;
|
let HWEncoding{8-0} = encoding{8-0};
|
||||||
|
let HWEncoding{10-9} = chan_encoding;
|
||||||
}
|
}
|
||||||
|
|
||||||
class R600Reg_64<string n, list<Register> subregs, bits<16> encoding> :
|
class R600Reg_64<string n, list<Register> subregs, bits<16> encoding> :
|
||||||
RegisterWithSubRegs<n, subregs> {
|
RegisterWithSubRegs<n, subregs> {
|
||||||
|
field bits<2> chan_encoding = 0;
|
||||||
let Namespace = "AMDGPU";
|
let Namespace = "AMDGPU";
|
||||||
let SubRegIndices = [sub0, sub1];
|
let SubRegIndices = [sub0, sub1];
|
||||||
let HWEncoding = encoding;
|
let HWEncoding = encoding;
|
||||||
|
let HWEncoding{8-0} = encoding{8-0};
|
||||||
|
let HWEncoding{10-9} = chan_encoding;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
class R600Reg_64Vertical<int lo, int hi, string chan> : R600Reg_64 <
|
||||||
|
"V"#lo#hi#"_"#chan,
|
||||||
|
[!cast<Register>("T"#lo#"_"#chan), !cast<Register>("T"#hi#"_"#chan)],
|
||||||
|
lo
|
||||||
|
>;
|
||||||
|
|
||||||
foreach Index = 0-127 in {
|
foreach Index = 0-127 in {
|
||||||
foreach Chan = [ "X", "Y", "Z", "W" ] in {
|
foreach Chan = [ "X", "Y", "Z", "W" ] in {
|
||||||
|
@ -54,6 +64,24 @@ foreach Index = 0-127 in {
|
||||||
Index>;
|
Index>;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
foreach Chan = [ "X", "Y", "Z", "W"] in {
|
||||||
|
|
||||||
|
let chan_encoding = !if(!eq(Chan, "X"), 0,
|
||||||
|
!if(!eq(Chan, "Y"), 1,
|
||||||
|
!if(!eq(Chan, "Z"), 2,
|
||||||
|
!if(!eq(Chan, "W"), 3, 0)))) in {
|
||||||
|
def V0123_#Chan : R600Reg_128 <"V0123_"#Chan,
|
||||||
|
[!cast<Register>("T0_"#Chan),
|
||||||
|
!cast<Register>("T1_"#Chan),
|
||||||
|
!cast<Register>("T2_"#Chan),
|
||||||
|
!cast<Register>("T3_"#Chan)],
|
||||||
|
0>;
|
||||||
|
def V01_#Chan : R600Reg_64Vertical<0, 1, Chan>;
|
||||||
|
def V23_#Chan : R600Reg_64Vertical<2, 3, Chan>;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
// KCACHE_BANK0
|
// KCACHE_BANK0
|
||||||
foreach Index = 159-128 in {
|
foreach Index = 159-128 in {
|
||||||
foreach Chan = [ "X", "Y", "Z", "W" ] in {
|
foreach Chan = [ "X", "Y", "Z", "W" ] in {
|
||||||
|
@ -130,8 +158,14 @@ def ALU_PARAM : R600Reg<"Param", 0>;
|
||||||
|
|
||||||
let isAllocatable = 0 in {
|
let isAllocatable = 0 in {
|
||||||
|
|
||||||
// XXX: Only use the X channel, until we support wider stack widths
|
def R600_Addr : RegisterClass <"AMDGPU", [i32], 32, (add (sequence "Addr%u_X", 0, 127))>;
|
||||||
def R600_Addr : RegisterClass <"AMDGPU", [i32], 127, (add (sequence "Addr%u_X", 0, 127))>;
|
|
||||||
|
// We only use Addr_[YZW] for vertical vectors.
|
||||||
|
// FIXME if we add more vertical vector registers we will need to add more
|
||||||
|
// registers to these classes.
|
||||||
|
def R600_Addr_Y : RegisterClass <"AMDGPU", [i32], 32, (add Addr0_Y)>;
|
||||||
|
def R600_Addr_Z : RegisterClass <"AMDGPU", [i32], 32, (add Addr0_Z)>;
|
||||||
|
def R600_Addr_W : RegisterClass <"AMDGPU", [i32], 32, (add Addr0_W)>;
|
||||||
|
|
||||||
def R600_LDS_SRC_REG : RegisterClass<"AMDGPU", [i32], 32,
|
def R600_LDS_SRC_REG : RegisterClass<"AMDGPU", [i32], 32,
|
||||||
(add OQA, OQB, OQAP, OQBP, LDS_DIRECT_A, LDS_DIRECT_B)>;
|
(add OQA, OQB, OQAP, OQBP, LDS_DIRECT_A, LDS_DIRECT_B)>;
|
||||||
|
@ -206,5 +240,13 @@ def R600_Reg128 : RegisterClass<"AMDGPU", [v4f32, v4i32], 128,
|
||||||
let CopyCost = -1;
|
let CopyCost = -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
def R600_Reg128Vertical : RegisterClass<"AMDGPU", [v4f32, v4i32], 128,
|
||||||
|
(add V0123_W, V0123_Z, V0123_Y, V0123_X)
|
||||||
|
>;
|
||||||
|
|
||||||
def R600_Reg64 : RegisterClass<"AMDGPU", [v2f32, v2i32], 64,
|
def R600_Reg64 : RegisterClass<"AMDGPU", [v2f32, v2i32], 64,
|
||||||
(add (sequence "T%u_XY", 0, 63))>;
|
(add (sequence "T%u_XY", 0, 63))>;
|
||||||
|
|
||||||
|
def R600_Reg64Vertical : RegisterClass<"AMDGPU", [v2f32, v2i32], 64,
|
||||||
|
(add V01_X, V01_Y, V01_Z, V01_W,
|
||||||
|
V23_X, V23_Y, V23_Z, V23_W)>;
|
||||||
|
|
|
@ -2560,13 +2560,13 @@ multiclass SI_INDIRECT_Pattern <ValueType vt, ValueType eltvt, SI_INDIRECT_DST I
|
||||||
// 1. Extract with offset
|
// 1. Extract with offset
|
||||||
def : Pat<
|
def : Pat<
|
||||||
(vector_extract vt:$vec, (add i32:$idx, imm:$off)),
|
(vector_extract vt:$vec, (add i32:$idx, imm:$off)),
|
||||||
(f32 (SI_INDIRECT_SRC (IMPLICIT_DEF), $vec, $idx, imm:$off))
|
(eltvt (SI_INDIRECT_SRC (IMPLICIT_DEF), $vec, $idx, imm:$off))
|
||||||
>;
|
>;
|
||||||
|
|
||||||
// 2. Extract without offset
|
// 2. Extract without offset
|
||||||
def : Pat<
|
def : Pat<
|
||||||
(vector_extract vt:$vec, i32:$idx),
|
(vector_extract vt:$vec, i32:$idx),
|
||||||
(f32 (SI_INDIRECT_SRC (IMPLICIT_DEF), $vec, $idx, 0))
|
(eltvt (SI_INDIRECT_SRC (IMPLICIT_DEF), $vec, $idx, 0))
|
||||||
>;
|
>;
|
||||||
|
|
||||||
// 3. Insert with offset
|
// 3. Insert with offset
|
||||||
|
|
|
@ -10,7 +10,12 @@ declare void @llvm.AMDGPU.barrier.local() nounwind noduplicate
|
||||||
|
|
||||||
; SI-LABEL: @test_private_array_ptr_calc:
|
; SI-LABEL: @test_private_array_ptr_calc:
|
||||||
; SI: V_ADD_I32_e32 [[PTRREG:v[0-9]+]]
|
; SI: V_ADD_I32_e32 [[PTRREG:v[0-9]+]]
|
||||||
; SI: V_MOVRELD_B32_e32 {{v[0-9]+}}, [[PTRREG]]
|
;
|
||||||
|
; FIXME: The AMDGPUPromoteAlloca pass should be able to convert this
|
||||||
|
; alloca to a vector. It currently fails because it does not know how
|
||||||
|
; to interpret:
|
||||||
|
; getelementptr [4 x i32]* %alloca, i32 1, i32 %b
|
||||||
|
; SI: DS_WRITE_B32 {{v[0-9]+}}, [[PTRREG]]
|
||||||
define void @test_private_array_ptr_calc(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %inA, i32 addrspace(1)* noalias %inB) {
|
define void @test_private_array_ptr_calc(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %inA, i32 addrspace(1)* noalias %inB) {
|
||||||
%alloca = alloca [4 x i32], i32 4, align 16
|
%alloca = alloca [4 x i32], i32 4, align 16
|
||||||
%tid = call i32 @llvm.SI.tid() readnone
|
%tid = call i32 @llvm.SI.tid() readnone
|
||||||
|
|
|
@ -3,10 +3,8 @@
|
||||||
declare void @llvm.AMDGPU.barrier.local() noduplicate nounwind
|
declare void @llvm.AMDGPU.barrier.local() noduplicate nounwind
|
||||||
|
|
||||||
; SI-LABEL: @private_access_f64_alloca:
|
; SI-LABEL: @private_access_f64_alloca:
|
||||||
; SI: V_MOVRELD_B32_e32
|
; SI: DS_WRITE_B64
|
||||||
; SI: V_MOVRELD_B32_e32
|
; SI: DS_READ_B64
|
||||||
; SI: V_MOVRELS_B32_e32
|
|
||||||
; SI: V_MOVRELS_B32_e32
|
|
||||||
define void @private_access_f64_alloca(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in, i32 %b) nounwind {
|
define void @private_access_f64_alloca(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in, i32 %b) nounwind {
|
||||||
%val = load double addrspace(1)* %in, align 8
|
%val = load double addrspace(1)* %in, align 8
|
||||||
%array = alloca double, i32 16, align 8
|
%array = alloca double, i32 16, align 8
|
||||||
|
@ -19,14 +17,10 @@ define void @private_access_f64_alloca(double addrspace(1)* noalias %out, double
|
||||||
}
|
}
|
||||||
|
|
||||||
; SI-LABEL: @private_access_v2f64_alloca:
|
; SI-LABEL: @private_access_v2f64_alloca:
|
||||||
; SI: V_MOVRELD_B32_e32
|
; SI: DS_WRITE_B64
|
||||||
; SI: V_MOVRELD_B32_e32
|
; SI: DS_WRITE_B64
|
||||||
; SI: V_MOVRELD_B32_e32
|
; SI: DS_READ_B64
|
||||||
; SI: V_MOVRELD_B32_e32
|
; SI: DS_READ_B64
|
||||||
; SI: V_MOVRELS_B32_e32
|
|
||||||
; SI: V_MOVRELS_B32_e32
|
|
||||||
; SI: V_MOVRELS_B32_e32
|
|
||||||
; SI: V_MOVRELS_B32_e32
|
|
||||||
define void @private_access_v2f64_alloca(<2 x double> addrspace(1)* noalias %out, <2 x double> addrspace(1)* noalias %in, i32 %b) nounwind {
|
define void @private_access_v2f64_alloca(<2 x double> addrspace(1)* noalias %out, <2 x double> addrspace(1)* noalias %in, i32 %b) nounwind {
|
||||||
%val = load <2 x double> addrspace(1)* %in, align 16
|
%val = load <2 x double> addrspace(1)* %in, align 16
|
||||||
%array = alloca <2 x double>, i32 16, align 16
|
%array = alloca <2 x double>, i32 16, align 16
|
||||||
|
@ -39,10 +33,8 @@ define void @private_access_v2f64_alloca(<2 x double> addrspace(1)* noalias %out
|
||||||
}
|
}
|
||||||
|
|
||||||
; SI-LABEL: @private_access_i64_alloca:
|
; SI-LABEL: @private_access_i64_alloca:
|
||||||
; SI: V_MOVRELD_B32_e32
|
; SI: DS_WRITE_B64
|
||||||
; SI: V_MOVRELD_B32_e32
|
; SI: DS_READ_B64
|
||||||
; SI: V_MOVRELS_B32_e32
|
|
||||||
; SI: V_MOVRELS_B32_e32
|
|
||||||
define void @private_access_i64_alloca(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i32 %b) nounwind {
|
define void @private_access_i64_alloca(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i32 %b) nounwind {
|
||||||
%val = load i64 addrspace(1)* %in, align 8
|
%val = load i64 addrspace(1)* %in, align 8
|
||||||
%array = alloca i64, i32 16, align 8
|
%array = alloca i64, i32 16, align 8
|
||||||
|
@ -55,14 +47,10 @@ define void @private_access_i64_alloca(i64 addrspace(1)* noalias %out, i64 addrs
|
||||||
}
|
}
|
||||||
|
|
||||||
; SI-LABEL: @private_access_v2i64_alloca:
|
; SI-LABEL: @private_access_v2i64_alloca:
|
||||||
; SI: V_MOVRELD_B32_e32
|
; SI: DS_WRITE_B64
|
||||||
; SI: V_MOVRELD_B32_e32
|
; SI: DS_WRITE_B64
|
||||||
; SI: V_MOVRELD_B32_e32
|
; SI: DS_READ_B64
|
||||||
; SI: V_MOVRELD_B32_e32
|
; SI: DS_READ_B64
|
||||||
; SI: V_MOVRELS_B32_e32
|
|
||||||
; SI: V_MOVRELS_B32_e32
|
|
||||||
; SI: V_MOVRELS_B32_e32
|
|
||||||
; SI: V_MOVRELS_B32_e32
|
|
||||||
define void @private_access_v2i64_alloca(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %in, i32 %b) nounwind {
|
define void @private_access_v2i64_alloca(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %in, i32 %b) nounwind {
|
||||||
%val = load <2 x i64> addrspace(1)* %in, align 16
|
%val = load <2 x i64> addrspace(1)* %in, align 16
|
||||||
%array = alloca <2 x i64>, i32 16, align 16
|
%array = alloca <2 x i64>, i32 16, align 16
|
||||||
|
|
|
@ -2,10 +2,13 @@
|
||||||
; REQUIRES: asserts
|
; REQUIRES: asserts
|
||||||
; RUN: llc -march=r600 -mcpu=SI < %s
|
; RUN: llc -march=r600 -mcpu=SI < %s
|
||||||
|
|
||||||
define void @large_alloca(i32 addrspace(1)* %out, i32 %x) nounwind {
|
define void @large_alloca(i32 addrspace(1)* %out, i32 %x, i32 %y) nounwind {
|
||||||
%large = alloca [256 x i32], align 4
|
%large = alloca [8192 x i32], align 4
|
||||||
%gep = getelementptr [256 x i32]* %large, i32 0, i32 255
|
%gep = getelementptr [8192 x i32]* %large, i32 0, i32 8191
|
||||||
store i32 %x, i32* %gep
|
store i32 %x, i32* %gep
|
||||||
|
%gep1 = getelementptr [8192 x i32]* %large, i32 0, i32 %y
|
||||||
|
%0 = load i32* %gep1
|
||||||
|
store i32 %0, i32 addrspace(1)* %out
|
||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -7,6 +7,12 @@
|
||||||
; CHECK: AND_INT
|
; CHECK: AND_INT
|
||||||
; CHECK-NEXT: AND_INT
|
; CHECK-NEXT: AND_INT
|
||||||
; CHECK-NEXT: OR_INT
|
; CHECK-NEXT: OR_INT
|
||||||
|
|
||||||
|
; FIXME: For some reason having the allocas here allowed the flatten cfg pass
|
||||||
|
; to do its transformation, however now that we are using local memory for
|
||||||
|
; allocas, the transformation isn't happening.
|
||||||
|
; XFAIL: *
|
||||||
|
|
||||||
define void @_Z9chk1D_512v() #0 {
|
define void @_Z9chk1D_512v() #0 {
|
||||||
entry:
|
entry:
|
||||||
%a0 = alloca i32, align 4
|
%a0 = alloca i32, align 4
|
||||||
|
|
|
@ -3,6 +3,11 @@
|
||||||
;
|
;
|
||||||
; CFG flattening should use parallel-or to generate branch conditions and
|
; CFG flattening should use parallel-or to generate branch conditions and
|
||||||
; then merge if-regions with the same bodies.
|
; then merge if-regions with the same bodies.
|
||||||
|
|
||||||
|
; FIXME: For some reason having the allocas here allowed the flatten cfg pass
|
||||||
|
; to do its transformation, however now that we are using local memory for
|
||||||
|
; allocas, the transformation isn't happening.
|
||||||
|
; XFAIL: *
|
||||||
;
|
;
|
||||||
; CHECK: OR_INT
|
; CHECK: OR_INT
|
||||||
; CHECK-NEXT: OR_INT
|
; CHECK-NEXT: OR_INT
|
||||||
|
|
|
@ -1,24 +1,17 @@
|
||||||
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s --check-prefix=R600-CHECK --check-prefix=FUNC
|
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s --check-prefix=R600-CHECK --check-prefix=FUNC
|
||||||
; RUN: llc -verify-machineinstrs -march=r600 -mcpu=SI < %s | FileCheck %s --check-prefix=SI-CHECK --check-prefix=FUNC
|
; RUN: llc -verify-machineinstrs -march=r600 -mcpu=SI < %s | FileCheck %s --check-prefix=SI-CHECK --check-prefix=FUNC
|
||||||
|
|
||||||
; This test checks that uses and defs of the AR register happen in the same
|
|
||||||
; instruction clause.
|
|
||||||
|
|
||||||
; FUNC-LABEL: @mova_same_clause
|
; FUNC-LABEL: @mova_same_clause
|
||||||
|
|
||||||
; R600-CHECK: MOVA_INT
|
; R600-CHECK: LDS_WRITE
|
||||||
; R600-CHECK-NOT: ALU clause
|
; R600-CHECK: LDS_WRITE
|
||||||
; R600-CHECK: 0 + AR.x
|
; R600-CHECK: LDS_READ
|
||||||
; R600-CHECK: MOVA_INT
|
; R600-CHECK: LDS_READ
|
||||||
; R600-CHECK-NOT: ALU clause
|
|
||||||
; R600-CHECK: 0 + AR.x
|
|
||||||
|
|
||||||
; SI-CHECK: V_READFIRSTLANE_B32 vcc_lo
|
; SI-CHECK: DS_WRITE_B32
|
||||||
; SI-CHECK: V_MOVRELD
|
; SI-CHECK: DS_WRITE_B32
|
||||||
; SI-CHECK: S_CBRANCH
|
; SI-CHECK: DS_READ_B32
|
||||||
; SI-CHECK: V_READFIRSTLANE_B32 vcc_lo
|
; SI-CHECK: DS_READ_B32
|
||||||
; SI-CHECK: V_MOVRELD
|
|
||||||
; SI-CHECK: S_CBRANCH
|
|
||||||
define void @mova_same_clause(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) {
|
define void @mova_same_clause(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) {
|
||||||
entry:
|
entry:
|
||||||
%stack = alloca [5 x i32], align 4
|
%stack = alloca [5 x i32], align 4
|
||||||
|
@ -114,12 +107,8 @@ for.end:
|
||||||
|
|
||||||
; FUNC-LABEL: @short_array
|
; FUNC-LABEL: @short_array
|
||||||
|
|
||||||
; R600-CHECK: MOV {{\** *}}T{{[0-9]\.[XYZW]}}, literal
|
|
||||||
; R600-CHECK: 65536
|
|
||||||
; R600-CHECK: *
|
|
||||||
; R600-CHECK: MOVA_INT
|
; R600-CHECK: MOVA_INT
|
||||||
|
|
||||||
; SI-CHECK: V_MOV_B32_e32 v{{[0-9]}}, 0x10000
|
|
||||||
; SI-CHECK: V_MOVRELS_B32_e32
|
; SI-CHECK: V_MOVRELS_B32_e32
|
||||||
define void @short_array(i32 addrspace(1)* %out, i32 %index) {
|
define void @short_array(i32 addrspace(1)* %out, i32 %index) {
|
||||||
entry:
|
entry:
|
||||||
|
@ -137,10 +126,7 @@ entry:
|
||||||
|
|
||||||
; FUNC-LABEL: @char_array
|
; FUNC-LABEL: @char_array
|
||||||
|
|
||||||
; R600-CHECK: OR_INT {{\** *}}T{{[0-9]\.[XYZW]}}, {{[PVT0-9]+\.[XYZW]}}, literal
|
; R600-CHECK: MOVA_INT
|
||||||
; R600-CHECK: 256
|
|
||||||
; R600-CHECK: *
|
|
||||||
; R600-CHECK-NEXT: MOVA_INT
|
|
||||||
|
|
||||||
; SI-CHECK: V_OR_B32_e32 v{{[0-9]}}, 0x100
|
; SI-CHECK: V_OR_B32_e32 v{{[0-9]}}, 0x100
|
||||||
; SI-CHECK: V_MOVRELS_B32_e32
|
; SI-CHECK: V_MOVRELS_B32_e32
|
||||||
|
@ -185,7 +171,9 @@ entry:
|
||||||
; Test that two stack objects are not stored in the same register
|
; Test that two stack objects are not stored in the same register
|
||||||
; The second stack object should be in T3.X
|
; The second stack object should be in T3.X
|
||||||
; FUNC-LABEL: @no_overlap
|
; FUNC-LABEL: @no_overlap
|
||||||
; R600-CHECK: MOV {{\** *}}T3.X
|
; R600_CHECK: MOV
|
||||||
|
; R600_CHECK: [[CHAN:[XYZW]]]+
|
||||||
|
; R600-CHECK-NOT: [[CHAN]]+
|
||||||
; SI-CHECK: V_MOV_B32_e32 v3
|
; SI-CHECK: V_MOV_B32_e32 v3
|
||||||
define void @no_overlap(i32 addrspace(1)* %out, i32 %in) {
|
define void @no_overlap(i32 addrspace(1)* %out, i32 %in) {
|
||||||
entry:
|
entry:
|
||||||
|
|
|
@ -1,5 +1,7 @@
|
||||||
; RUN: llc -verify-machineinstrs -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI %s
|
; RUN: llc -verify-machineinstrs -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI %s
|
||||||
|
|
||||||
|
; XFAIL: *
|
||||||
|
|
||||||
; 64-bit select was originally lowered with a build_pair, and this
|
; 64-bit select was originally lowered with a build_pair, and this
|
||||||
; could be simplified to 1 cndmask instead of 2, but that broke when
|
; could be simplified to 1 cndmask instead of 2, but that broke when
|
||||||
; it started being implemented with a v2i32 build_vector and
|
; it started being implemented with a v2i32 build_vector and
|
||||||
|
@ -12,9 +14,10 @@ define void @trunc_select_i64(i32 addrspace(1)* %out, i64 %a, i64 %b, i32 %c) {
|
||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
|
||||||
|
; FIXME: Fix truncating store for local memory
|
||||||
; SI-LABEL: @trunc_load_alloca_i64:
|
; SI-LABEL: @trunc_load_alloca_i64:
|
||||||
; SI: V_MOVRELS_B32
|
; SI: DS_READ_B32
|
||||||
; SI-NOT: V_MOVRELS_B32
|
; SI-NOT: DS_READ_B64
|
||||||
; SI: S_ENDPGM
|
; SI: S_ENDPGM
|
||||||
define void @trunc_load_alloca_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) {
|
define void @trunc_load_alloca_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) {
|
||||||
%idx = add i32 %a, %b
|
%idx = add i32 %a, %b
|
||||||
|
|
|
@ -0,0 +1,74 @@
|
||||||
|
; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG --check-prefix=FUNC %s
|
||||||
|
; RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s
|
||||||
|
|
||||||
|
; FUNC-LABEL: @vector_read
|
||||||
|
; EG: MOV
|
||||||
|
; EG: MOV
|
||||||
|
; EG: MOV
|
||||||
|
; EG: MOV
|
||||||
|
; EG: MOVA_INT
|
||||||
|
define void @vector_read(i32 addrspace(1)* %out, i32 %index) {
|
||||||
|
entry:
|
||||||
|
%0 = alloca [4 x i32]
|
||||||
|
%x = getelementptr [4 x i32]* %0, i32 0, i32 0
|
||||||
|
%y = getelementptr [4 x i32]* %0, i32 0, i32 1
|
||||||
|
%z = getelementptr [4 x i32]* %0, i32 0, i32 2
|
||||||
|
%w = getelementptr [4 x i32]* %0, i32 0, i32 3
|
||||||
|
store i32 0, i32* %x
|
||||||
|
store i32 1, i32* %y
|
||||||
|
store i32 2, i32* %z
|
||||||
|
store i32 3, i32* %w
|
||||||
|
%1 = getelementptr [4 x i32]* %0, i32 0, i32 %index
|
||||||
|
%2 = load i32* %1
|
||||||
|
store i32 %2, i32 addrspace(1)* %out
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
; FUNC-LABEL: @vector_write
|
||||||
|
; EG: MOV
|
||||||
|
; EG: MOV
|
||||||
|
; EG: MOV
|
||||||
|
; EG: MOV
|
||||||
|
; EG: MOVA_INT
|
||||||
|
; EG: MOVA_INT
|
||||||
|
define void @vector_write(i32 addrspace(1)* %out, i32 %w_index, i32 %r_index) {
|
||||||
|
entry:
|
||||||
|
%0 = alloca [4 x i32]
|
||||||
|
%x = getelementptr [4 x i32]* %0, i32 0, i32 0
|
||||||
|
%y = getelementptr [4 x i32]* %0, i32 0, i32 1
|
||||||
|
%z = getelementptr [4 x i32]* %0, i32 0, i32 2
|
||||||
|
%w = getelementptr [4 x i32]* %0, i32 0, i32 3
|
||||||
|
store i32 0, i32* %x
|
||||||
|
store i32 0, i32* %y
|
||||||
|
store i32 0, i32* %z
|
||||||
|
store i32 0, i32* %w
|
||||||
|
%1 = getelementptr [4 x i32]* %0, i32 0, i32 %w_index
|
||||||
|
store i32 1, i32* %1
|
||||||
|
%2 = getelementptr [4 x i32]* %0, i32 0, i32 %r_index
|
||||||
|
%3 = load i32* %2
|
||||||
|
store i32 %3, i32 addrspace(1)* %out
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
; This test should be optimized to:
|
||||||
|
; store i32 0, i32 addrspace(1)* %out
|
||||||
|
; FUNC-LABEL: @bitcast_gep
|
||||||
|
; CHECK: STORE_RAW
|
||||||
|
define void @bitcast_gep(i32 addrspace(1)* %out, i32 %w_index, i32 %r_index) {
|
||||||
|
entry:
|
||||||
|
%0 = alloca [4 x i32]
|
||||||
|
%x = getelementptr [4 x i32]* %0, i32 0, i32 0
|
||||||
|
%y = getelementptr [4 x i32]* %0, i32 0, i32 1
|
||||||
|
%z = getelementptr [4 x i32]* %0, i32 0, i32 2
|
||||||
|
%w = getelementptr [4 x i32]* %0, i32 0, i32 3
|
||||||
|
store i32 0, i32* %x
|
||||||
|
store i32 0, i32* %y
|
||||||
|
store i32 0, i32* %z
|
||||||
|
store i32 0, i32* %w
|
||||||
|
%1 = getelementptr [4 x i32]* %0, i32 0, i32 1
|
||||||
|
%2 = bitcast i32* %1 to [4 x i32]*
|
||||||
|
%3 = getelementptr [4 x i32]* %2, i32 0, i32 0
|
||||||
|
%4 = load i32* %3
|
||||||
|
store i32 %4, i32 addrspace(1)* %out
|
||||||
|
ret void
|
||||||
|
}
|
Loading…
Reference in New Issue