AMDGPU: Add patterns for i32/i64 local atomic load/store
It is not clear why the 32/64 split is needed in the atomic_load/atomic_store hierarchies. The regular PatFrags do this, but the existing handling for global does not. llvm-svn: 335325
This commit is contained in:
parent
ea19c9473c
commit
3f8e7a3dbc
|
@ -564,7 +564,9 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
|
|||
return;
|
||||
}
|
||||
case ISD::LOAD:
|
||||
case ISD::STORE: {
|
||||
case ISD::STORE:
|
||||
case ISD::ATOMIC_LOAD:
|
||||
case ISD::ATOMIC_STORE: {
|
||||
N = glueCopyToM0(N);
|
||||
break;
|
||||
}
|
||||
|
|
|
@ -365,12 +365,15 @@ def az_extloadi8_local : LocalLoad <az_extloadi8>;
|
|||
def sextloadi8_local : LocalLoad <sextloadi8>;
|
||||
def az_extloadi16_local : LocalLoad <az_extloadi16>;
|
||||
def sextloadi16_local : LocalLoad <sextloadi16>;
|
||||
def atomic_load_32_local : LocalLoad<atomic_load_32>;
|
||||
def atomic_load_64_local : LocalLoad<atomic_load_64>;
|
||||
|
||||
def store_local : LocalStore <store>;
|
||||
def truncstorei8_local : LocalStore <truncstorei8>;
|
||||
def truncstorei16_local : LocalStore <truncstorei16>;
|
||||
def store_local_hi16 : StoreHi16 <truncstorei16>, LocalAddress;
|
||||
def truncstorei8_local_hi16 : StoreHi16<truncstorei8>, LocalAddress;
|
||||
def atomic_store_local : LocalStore <atomic_store>;
|
||||
|
||||
def load_align8_local : Aligned8Bytes <
|
||||
(ops node:$ptr), (load_local node:$ptr)
|
||||
|
|
|
@ -647,6 +647,8 @@ defm : DSReadPat_mc <DS_READ_I16, i32, "sextloadi16_local">;
|
|||
defm : DSReadPat_mc <DS_READ_U16, i32, "az_extloadi16_local">;
|
||||
defm : DSReadPat_mc <DS_READ_U16, i16, "load_local">;
|
||||
defm : DSReadPat_mc <DS_READ_B32, i32, "load_local">;
|
||||
defm : DSReadPat_mc <DS_READ_B32, i32, "atomic_load_32_local">;
|
||||
defm : DSReadPat_mc <DS_READ_B64, i64, "atomic_load_64_local">;
|
||||
|
||||
let AddedComplexity = 100 in {
|
||||
|
||||
|
@ -683,11 +685,30 @@ multiclass DSWritePat_mc <DS_Pseudo inst, ValueType vt, string frag> {
|
|||
}
|
||||
}
|
||||
|
||||
// Irritatingly, atomic_store reverses the order of operands from a
|
||||
// normal store.
|
||||
// Selection pattern for a DS atomic store.
//   inst - DS pseudo instruction to emit.
//   vt   - value type of the stored data.
//   frag - PatFrag to match; its operands are given in (address, value)
//          order, i.e. the address comes first.
// The address is matched through DS1Addr1Offset into a base pointer and an
// offset; the offset is passed to the instruction via as_i16imm.
// NOTE(review): the trailing (i1 0) operand is presumably the GDS bit --
// confirm against the DS_Pseudo operand list.
class DSAtomicWritePat <DS_Pseudo inst, ValueType vt, PatFrag frag> : GCNPat <
|
||||
(frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$value),
|
||||
(inst $ptr, $value, (as_i16imm $offset), (i1 0))
|
||||
>;
|
||||
|
||||
// Instantiates DSAtomicWritePat twice for the fragment named by `frag`:
//   * under LDSRequiresM0Init: uses `inst` itself with the "<frag>_m0"
//     variant of the fragment;
//   * under NotLDSRequiresM0Init: uses the "<inst>_gfx9" variant of the
//     instruction with the plain fragment.
// `frag` is passed as a string so that both fragment names can be built by
// concatenation and resolved with !cast.
multiclass DSAtomicWritePat_mc <DS_Pseudo inst, ValueType vt, string frag> {
|
||||
let OtherPredicates = [LDSRequiresM0Init] in {
|
||||
def : DSAtomicWritePat<inst, vt, !cast<PatFrag>(frag#"_m0")>;
|
||||
}
|
||||
|
||||
let OtherPredicates = [NotLDSRequiresM0Init] in {
|
||||
def : DSAtomicWritePat<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt, !cast<PatFrag>(frag)>;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
defm : DSWritePat_mc <DS_WRITE_B8, i32, "truncstorei8_local">;
|
||||
defm : DSWritePat_mc <DS_WRITE_B16, i32, "truncstorei16_local">;
|
||||
defm : DSWritePat_mc <DS_WRITE_B8, i16, "truncstorei8_local">;
|
||||
defm : DSWritePat_mc <DS_WRITE_B16, i16, "store_local">;
|
||||
defm : DSWritePat_mc <DS_WRITE_B32, i32, "store_local">;
|
||||
defm : DSAtomicWritePat_mc <DS_WRITE_B32, i32, "atomic_store_local">;
|
||||
defm : DSAtomicWritePat_mc <DS_WRITE_B64, i64, "atomic_store_local">;
|
||||
|
||||
let OtherPredicates = [D16PreservesUnusedBits] in {
|
||||
def : DSWritePat <DS_WRITE_B16_D16_HI, i32, store_local_hi16>;
|
||||
|
|
|
@ -233,6 +233,10 @@ def AMDGPUld_glue : SDNode <"ISD::LOAD", SDTLoad,
|
|||
[SDNPHasChain, SDNPMayLoad, SDNPMemOperand, SDNPInGlue]
|
||||
>;
|
||||
|
||||
def AMDGPUatomic_ld_glue : SDNode <"ISD::ATOMIC_LOAD", SDTAtomicLoad,
|
||||
[SDNPHasChain, SDNPMayLoad, SDNPMemOperand, SDNPInGlue]
|
||||
>;
|
||||
|
||||
def unindexedload_glue : PatFrag <(ops node:$ptr), (AMDGPUld_glue node:$ptr), [{
|
||||
return cast<LoadSDNode>(N)->getAddressingMode() == ISD::UNINDEXED;
|
||||
}]>;
|
||||
|
@ -241,6 +245,18 @@ def load_glue : PatFrag <(ops node:$ptr), (unindexedload_glue node:$ptr), [{
|
|||
return cast<LoadSDNode>(N)->getExtensionType() == ISD::NON_EXTLOAD;
|
||||
}]>;
|
||||
|
||||
// Fragment over AMDGPUatomic_ld_glue (single pointer operand) that is marked
// atomic and restricted to an i32 memory type.
def atomic_load_32_glue : PatFrag<(ops node:$ptr),
|
||||
(AMDGPUatomic_ld_glue node:$ptr)> {
|
||||
let IsAtomic = 1;
|
||||
let MemoryVT = i32;
|
||||
}
|
||||
|
||||
// Fragment over AMDGPUatomic_ld_glue (single pointer operand) that is marked
// atomic and restricted to an i64 memory type.
def atomic_load_64_glue : PatFrag<(ops node:$ptr),
|
||||
(AMDGPUatomic_ld_glue node:$ptr)> {
|
||||
let IsAtomic = 1;
|
||||
let MemoryVT = i64;
|
||||
}
|
||||
|
||||
def extload_glue : PatFrag<(ops node:$ptr), (load_glue node:$ptr), [{
|
||||
return cast<LoadSDNode>(N)->getExtensionType() == ISD::EXTLOAD;
|
||||
}]>;
|
||||
|
@ -286,12 +302,22 @@ def az_extloadi8_local_m0 : LoadFrag<az_extloadi8_glue>, LocalAddress;
|
|||
def az_extloadi16_local_m0 : LoadFrag<az_extloadi16_glue>, LocalAddress;
|
||||
def load_align8_local_m0 : LoadFrag <load_glue_align8>, LocalAddress;
|
||||
def load_align16_local_m0 : LoadFrag <load_glue_align16>, LocalAddress;
|
||||
def atomic_load_32_local_m0 : LoadFrag<atomic_load_32_glue>, LocalAddress;
|
||||
def atomic_load_64_local_m0 : LoadFrag<atomic_load_64_glue>, LocalAddress;
|
||||
|
||||
|
||||
def AMDGPUst_glue : SDNode <"ISD::STORE", SDTStore,
|
||||
[SDNPHasChain, SDNPMayStore, SDNPMemOperand, SDNPInGlue]
|
||||
>;
|
||||
|
||||
def AMDGPUatomic_st_glue : SDNode <"ISD::ATOMIC_STORE", SDTAtomicStore,
|
||||
[SDNPHasChain, SDNPMayStore, SDNPMemOperand, SDNPInGlue]
|
||||
>;
|
||||
|
||||
// Fragment wrapping AMDGPUatomic_st_glue. Operands are in (ptr, val) order,
// and the empty body adds no further predicates or field overrides.
def atomic_store_glue : PatFrag<(ops node:$ptr, node:$val),
|
||||
(AMDGPUatomic_st_glue node:$ptr, node:$val)> {
|
||||
}
|
||||
|
||||
def unindexedstore_glue : PatFrag<(ops node:$val, node:$ptr),
|
||||
(AMDGPUst_glue node:$val, node:$ptr), [{
|
||||
return cast<StoreSDNode>(N)->getAddressingMode() == ISD::UNINDEXED;
|
||||
|
@ -328,6 +354,7 @@ def store_glue_align16 : Aligned16Bytes <
|
|||
def store_local_m0 : StoreFrag<store_glue>, LocalAddress;
|
||||
def truncstorei8_local_m0 : StoreFrag<truncstorei8_glue>, LocalAddress;
|
||||
def truncstorei16_local_m0 : StoreFrag<truncstorei16_glue>, LocalAddress;
|
||||
def atomic_store_local_m0 : StoreFrag<AMDGPUatomic_st_glue>, LocalAddress;
|
||||
|
||||
def store_align8_local_m0 : StoreFrag<store_glue_align8>, LocalAddress;
|
||||
def store_align16_local_m0 : StoreFrag<store_glue_align16>, LocalAddress;
|
||||
|
|
|
@ -0,0 +1,52 @@
|
|||
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CI %s
|
||||
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
|
||||
|
||||
; GCN-LABEL: {{^}}atomic_load_monotonic_i32:
|
||||
; GCN: s_waitcnt
|
||||
; GFX9-NOT: s_mov_b32 m0
|
||||
; CI-NEXT: s_mov_b32 m0
|
||||
; GCN-NEXT: ds_read_b32 v0, v0{{$}}
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: s_setpc_b64
|
||||
define i32 @atomic_load_monotonic_i32(i32 addrspace(3)* %ptr) {
|
||||
%load = load atomic i32, i32 addrspace(3)* %ptr monotonic, align 4
|
||||
ret i32 %load
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}atomic_load_monotonic_i32_offset:
|
||||
; GCN: s_waitcnt
|
||||
; GFX9-NOT: s_mov_b32 m0
|
||||
; CI-NEXT: s_mov_b32 m0
|
||||
; GCN-NEXT: ds_read_b32 v0, v0 offset:64{{$}}
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: s_setpc_b64
|
||||
define i32 @atomic_load_monotonic_i32_offset(i32 addrspace(3)* %ptr) {
|
||||
%gep = getelementptr inbounds i32, i32 addrspace(3)* %ptr, i32 16
|
||||
%load = load atomic i32, i32 addrspace(3)* %gep monotonic, align 4
|
||||
ret i32 %load
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}atomic_load_monotonic_i64:
|
||||
; GCN: s_waitcnt
|
||||
; GFX9-NOT: s_mov_b32 m0
|
||||
; CI-NEXT: s_mov_b32 m0
|
||||
; GCN-NEXT: ds_read_b64 v[0:1], v0{{$}}
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: s_setpc_b64
|
||||
define i64 @atomic_load_monotonic_i64(i64 addrspace(3)* %ptr) {
|
||||
%load = load atomic i64, i64 addrspace(3)* %ptr monotonic, align 8
|
||||
ret i64 %load
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}atomic_load_monotonic_i64_offset:
|
||||
; GCN: s_waitcnt
|
||||
; GFX9-NOT: s_mov_b32 m0
|
||||
; CI-NEXT: s_mov_b32 m0
|
||||
; GCN-NEXT: ds_read_b64 v[0:1], v0 offset:128{{$}}
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: s_setpc_b64
|
||||
define i64 @atomic_load_monotonic_i64_offset(i64 addrspace(3)* %ptr) {
|
||||
%gep = getelementptr inbounds i64, i64 addrspace(3)* %ptr, i64 16
|
||||
%load = load atomic i64, i64 addrspace(3)* %gep monotonic, align 8
|
||||
ret i64 %load
|
||||
}
|
|
@ -0,0 +1,53 @@
|
|||
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CI %s
|
||||
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
|
||||
|
||||
; GCN-LABEL: {{^}}atomic_store_monotonic_i32:
|
||||
; GCN: s_waitcnt
|
||||
; GFX9-NOT: s_mov_b32 m0
|
||||
; CI-NEXT: s_mov_b32 m0
|
||||
; GCN-NEXT: ds_write_b32 v0, v1{{$}}
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: s_setpc_b64
|
||||
define void @atomic_store_monotonic_i32(i32 addrspace(3)* %ptr, i32 %val) {
|
||||
store atomic i32 %val, i32 addrspace(3)* %ptr monotonic, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}atomic_store_monotonic_offset_i32:
|
||||
; GCN: s_waitcnt
|
||||
; GFX9-NOT: s_mov_b32 m0
|
||||
; CI-NEXT: s_mov_b32 m0
|
||||
; GCN-NEXT: ds_write_b32 v0, v1 offset:64{{$}}
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: s_setpc_b64
|
||||
define void @atomic_store_monotonic_offset_i32(i32 addrspace(3)* %ptr, i32 %val) {
|
||||
%gep = getelementptr inbounds i32, i32 addrspace(3)* %ptr, i32 16
|
||||
store atomic i32 %val, i32 addrspace(3)* %gep monotonic, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}atomic_store_monotonic_i64:
|
||||
; GCN: s_waitcnt
|
||||
; GFX9-NOT: s_mov_b32 m0
|
||||
; CI-NEXT: s_mov_b32 m0
|
||||
; GCN-NEXT: ds_write_b64 v0, v[1:2]{{$}}
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: s_setpc_b64
|
||||
define void @atomic_store_monotonic_i64(i64 addrspace(3)* %ptr, i64 %val) {
|
||||
store atomic i64 %val, i64 addrspace(3)* %ptr monotonic, align 8
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}atomic_store_monotonic_offset_i64:
|
||||
; GCN: s_waitcnt
|
||||
; GFX9-NOT: s_mov_b32 m0
|
||||
; CI-NEXT: s_mov_b32 m0
|
||||
; GCN-NEXT: ds_write_b64 v0, v[1:2] offset:128{{$}}
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: s_setpc_b64
|
||||
define void @atomic_store_monotonic_offset_i64(i64 addrspace(3)* %ptr, i64 %val) {
|
||||
%gep = getelementptr inbounds i64, i64 addrspace(3)* %ptr, i64 16
|
||||
store atomic i64 %val, i64 addrspace(3)* %gep monotonic, align 8
|
||||
ret void
|
||||
}
|
||||
|
Loading…
Reference in New Issue