AMDGPU/R600: Serialize vector trunc stores to private AS

Add DUMMY_CHAIN SDNode to denote stores of interest

Bugzilla: https://llvm.org/bugs/show_bug.cgi?id=28915
Bugzilla: https://llvm.org/bugs/show_bug.cgi?id=30411

Differential Revision: https://reviews.llvm.org/D27964

llvm-svn: 292651
This commit is contained in:
Jan Vesely 2017-01-20 21:24:26 +00:00
parent 74694b19e0
commit f170504c41
6 changed files with 59 additions and 18 deletions

View File

@ -3278,6 +3278,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(CONST_DATA_PTR)
NODE_NAME_CASE(PC_ADD_REL_OFFSET)
NODE_NAME_CASE(KILL)
NODE_NAME_CASE(DUMMY_CHAIN)
case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break;
NODE_NAME_CASE(SENDMSG)
NODE_NAME_CASE(SENDMSGHALT)

View File

@ -330,6 +330,7 @@ enum NodeType : unsigned {
INTERP_P2,
PC_ADD_REL_OFFSET,
KILL,
DUMMY_CHAIN,
FIRST_MEM_OPCODE_NUMBER = ISD::FIRST_TARGET_MEMORY_OPCODE,
STORE_MSKOR,
LOAD_CONSTANT,

View File

@ -54,6 +54,9 @@ def AMDGPUconstdata_ptr : SDNode<
// The argument to this node is a dword address.
def AMDGPUdwordaddr : SDNode<"AMDGPUISD::DWORDADDR", SDTIntUnaryOp>;
// Marker node used to force dependencies between the stores produced when a
// vector truncating store is expanded. It is declared with the SDTNone type
// profile and the SDNPHasChain property, so the chain is what it carries.
def R600dummy_chain : SDNode<"AMDGPUISD::DUMMY_CHAIN", SDTNone, [SDNPHasChain]>;
def AMDGPUcos : SDNode<"AMDGPUISD::COS_HW", SDTFPUnaryOp>;
def AMDGPUsin : SDNode<"AMDGPUISD::SIN_HW", SDTFPUnaryOp>;

View File

@ -1120,7 +1120,10 @@ SDValue R600TargetLowering::lowerPrivateTruncStore(StoreSDNode *Store,
llvm_unreachable("Unsupported private trunc store");
}
SDValue Chain = Store->getChain();
SDValue OldChain = Store->getChain();
bool VectorTrunc = (OldChain.getOpcode() == AMDGPUISD::DUMMY_CHAIN);
// Skip the DUMMY_CHAIN node and chain to its operand instead
SDValue Chain = VectorTrunc ? OldChain->getOperand(0) : OldChain;
SDValue BasePtr = Store->getBasePtr();
SDValue Offset = Store->getOffset();
EVT MemVT = Store->getMemoryVT();
@ -1176,7 +1179,15 @@ SDValue R600TargetLowering::lowerPrivateTruncStore(StoreSDNode *Store,
// Store dword
// TODO: Can we be smarter about MachinePointerInfo?
return DAG.getStore(Chain, DL, Value, Ptr, MachinePointerInfo());
SDValue NewStore = DAG.getStore(Chain, DL, Value, Ptr, MachinePointerInfo());
// If this store is part of an expanded vector, make its neighbors depend on it
if (VectorTrunc) {
// Make all other vector elements depend on this store
Chain = DAG.getNode(AMDGPUISD::DUMMY_CHAIN, DL, MVT::Other, NewStore);
DAG.ReplaceAllUsesOfValueWith(OldChain, Chain);
}
return NewStore;
}
SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
@ -1196,6 +1207,17 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
// Neither LOCAL nor PRIVATE can do vectors at the moment
if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS) &&
VT.isVector()) {
if ((AS == AMDGPUAS::PRIVATE_ADDRESS) && StoreNode->isTruncatingStore()) {
// Add an extra level of chain to isolate this vector
SDValue NewChain = DAG.getNode(AMDGPUISD::DUMMY_CHAIN, DL, MVT::Other, Chain);
// TODO: can the chain be replaced without creating a new store?
SDValue NewStore = DAG.getTruncStore(
NewChain, DL, Value, Ptr, StoreNode->getPointerInfo(),
MemVT, StoreNode->getAlignment(),
StoreNode->getMemOperand()->getFlags(), StoreNode->getAAInfo());
StoreNode = cast<StoreSDNode>(NewStore);
}
return scalarizeVectorStore(StoreNode, DAG);
}
@ -1230,7 +1252,7 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
// Put the mask in correct place
SDValue Mask = DAG.getNode(ISD::SHL, DL, VT, MaskConstant, BitShift);
// Put the mask in correct place
// Put the value bits in correct place
SDValue TruncValue = DAG.getNode(ISD::AND, DL, VT, Value, MaskConstant);
SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, VT, TruncValue, BitShift);

View File

@ -727,6 +727,20 @@ def FLOOR : R600_1OP_Helper <0x14, "FLOOR", ffloor>;
def MOV : R600_1OP <0x19, "MOV", []>;
// Pseudo instruction that gives any remaining DUMMY_CHAIN node something to
// be selected to. This is a hack: most DUMMY_CHAIN nodes should be eliminated
// during legalization, but undef values can let some of them survive until
// instruction selection.
let isPseudo = 1, isCodeGenOnly = 1 in {
def DUMMY_CHAIN : AMDGPUInst <
// No result operands.
(outs),
// No input operands.
(ins),
"DUMMY_CHAIN",
// Selection pattern: matches a bare R600dummy_chain node.
[(R600dummy_chain)]
>;
} // end let isPseudo = 1, isCodeGenOnly = 1
let isPseudo = 1, isCodeGenOnly = 1, usesCustomInserter = 1 in {
class MOV_IMM <ValueType vt, Operand immType> : AMDGPUInst <

View File

@ -708,10 +708,11 @@ define void @local_zextload_v4i8_to_v4i16(<4 x i16> addrspace(3)* %out, <4 x i8>
; FUNC-LABEL: {{^}}local_sextload_v4i8_to_v4i16:
; EG: LDS_READ_RET
; TODO: these do LSHR + BFE_INT, instead of just BFE_INT/ASHR
; EG-DAG: BFE_INT
; EG-DAG: BFE_INT
; EG-DAG: BFE_INT
; EG-DAG: BFE_INT
; EG-DAG: ASHR
; EG: LDS_WRITE
; EG: LDS_WRITE
define void @local_sextload_v4i8_to_v4i16(<4 x i16> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 {
@ -740,14 +741,15 @@ define void @local_zextload_v8i8_to_v8i16(<8 x i16> addrspace(3)* %out, <8 x i8>
; EG: LDS_READ_RET
; EG: LDS_READ_RET
; TODO: these do LSHR + BFE_INT, instead of just BFE_INT/ASHR
; EG-DAG: BFE_INT
; EG-DAG: BFE_INT
; EG-DAG: BFE_INT
; EG-DAG: BFE_INT
; EG-DAG: BFE_INT
; EG-DAG: BFE_INT
; EG-DAG: BFE_INT
; EG-DAG: BFE_INT
; EG-DAG: ASHR
; EG-DAG: ASHR
; EG: LDS_WRITE
; EG: LDS_WRITE
; EG: LDS_WRITE
@ -786,6 +788,11 @@ define void @local_zextload_v16i8_to_v16i16(<16 x i16> addrspace(3)* %out, <16 x
; EG: LDS_READ_RET
; EG: LDS_READ_RET
; EG: LDS_READ_RET
; TODO: these do LSHR + BFE_INT, instead of just BFE_INT/ASHR
; EG-DAG: BFE_INT
; EG-DAG: BFE_INT
; EG-DAG: BFE_INT
; EG-DAG: BFE_INT
; EG-DAG: BFE_INT
; EG-DAG: BFE_INT
; EG-DAG: BFE_INT
@ -798,10 +805,6 @@ define void @local_zextload_v16i8_to_v16i16(<16 x i16> addrspace(3)* %out, <16 x
; EG-DAG: BFE_INT
; EG-DAG: BFE_INT
; EG-DAG: BFE_INT
; EG-DAG: ASHR
; EG-DAG: ASHR
; EG-DAG: ASHR
; EG-DAG: ASHR
; EG: LDS_WRITE
; EG: LDS_WRITE
; EG: LDS_WRITE
@ -860,6 +863,11 @@ define void @local_zextload_v32i8_to_v32i16(<32 x i16> addrspace(3)* %out, <32 x
; EG: LDS_READ_RET
; EG: LDS_READ_RET
; EG: LDS_READ_RET
; TODO: these do LSHR + BFE_INT, instead of just BFE_INT/ASHR
; EG-DAG: BFE_INT
; EG-DAG: BFE_INT
; EG-DAG: BFE_INT
; EG-DAG: BFE_INT
; EG-DAG: BFE_INT
; EG-DAG: BFE_INT
; EG-DAG: BFE_INT
@ -884,14 +892,6 @@ define void @local_zextload_v32i8_to_v32i16(<32 x i16> addrspace(3)* %out, <32 x
; EG-DAG: BFE_INT
; EG-DAG: BFE_INT
; EG-DAG: BFE_INT
; EG-DAG: ASHR
; EG-DAG: ASHR
; EG-DAG: ASHR
; EG-DAG: ASHR
; EG-DAG: ASHR
; EG-DAG: ASHR
; EG-DAG: ASHR
; EG-DAG: ASHR
; EG: LDS_WRITE
; EG: LDS_WRITE
; EG: LDS_WRITE