From 3f8e7a3dbcb7dc7c28ee294b6fdaa0bd1f8abeec Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Fri, 22 Jun 2018 08:39:52 +0000 Subject: [PATCH] AMDGPU: Add patterns for i32/i64 local atomic load/store Not sure why the 32/64 split is needed in the atomic_load store hierarchies. The regular PatFrags do this, but we don't do it for the existing handling for global. llvm-svn: 335325 --- llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 4 +- llvm/lib/Target/AMDGPU/AMDGPUInstructions.td | 3 ++ llvm/lib/Target/AMDGPU/DSInstructions.td | 21 ++++++++ llvm/lib/Target/AMDGPU/SIInstrInfo.td | 27 ++++++++++ llvm/test/CodeGen/AMDGPU/atomic_load_local.ll | 52 ++++++++++++++++++ .../test/CodeGen/AMDGPU/atomic_store_local.ll | 53 +++++++++++++++++++ 6 files changed, 159 insertions(+), 1 deletion(-) create mode 100644 llvm/test/CodeGen/AMDGPU/atomic_load_local.ll create mode 100644 llvm/test/CodeGen/AMDGPU/atomic_store_local.ll diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index 1ad10b36aed2..db6a837c2bb8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -564,7 +564,9 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) { return; } case ISD::LOAD: - case ISD::STORE: { + case ISD::STORE: + case ISD::ATOMIC_LOAD: + case ISD::ATOMIC_STORE: { N = glueCopyToM0(N); break; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td index d7acb6bf12d8..c474a85595b1 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td @@ -365,12 +365,15 @@ def az_extloadi8_local : LocalLoad ; def sextloadi8_local : LocalLoad ; def az_extloadi16_local : LocalLoad ; def sextloadi16_local : LocalLoad ; +def atomic_load_32_local : LocalLoad; +def atomic_load_64_local : LocalLoad; def store_local : LocalStore ; def truncstorei8_local : LocalStore ; def truncstorei16_local : LocalStore ; def store_local_hi16 : StoreHi16 , LocalAddress; def truncstorei8_local_hi16 : StoreHi16, LocalAddress; +def atomic_store_local : LocalStore ; def load_align8_local : Aligned8Bytes < (ops node:$ptr), (load_local node:$ptr) diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td index 28887ea4a497..cdc6ab9412e6 100644 --- a/llvm/lib/Target/AMDGPU/DSInstructions.td +++ b/llvm/lib/Target/AMDGPU/DSInstructions.td @@ -647,6 +647,8 @@ defm : DSReadPat_mc ; defm : DSReadPat_mc ; defm : DSReadPat_mc ; defm : DSReadPat_mc ; +defm : DSReadPat_mc ; +defm : DSReadPat_mc ; let AddedComplexity = 100 in { @@ -683,11 +685,30 @@ multiclass DSWritePat_mc { } } +// Irritatingly, atomic_store reverses the order of operands from a +// normal store. 
+class DSAtomicWritePat : GCNPat < + (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$value), + (inst $ptr, $value, (as_i16imm $offset), (i1 0)) +>; + +multiclass DSAtomicWritePat_mc { + let OtherPredicates = [LDSRequiresM0Init] in { + def : DSAtomicWritePat(frag#"_m0")>; + } + + let OtherPredicates = [NotLDSRequiresM0Init] in { + def : DSAtomicWritePat(!cast(inst)#"_gfx9"), vt, !cast(frag)>; + } +} + defm : DSWritePat_mc ; defm : DSWritePat_mc ; defm : DSWritePat_mc ; defm : DSWritePat_mc ; defm : DSWritePat_mc ; +defm : DSAtomicWritePat_mc ; +defm : DSAtomicWritePat_mc ; let OtherPredicates = [D16PreservesUnusedBits] in { def : DSWritePat ; diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index d8ed8eb08491..a5fe25627f0f 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -233,6 +233,10 @@ def AMDGPUld_glue : SDNode <"ISD::LOAD", SDTLoad, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand, SDNPInGlue] >; +def AMDGPUatomic_ld_glue : SDNode <"ISD::ATOMIC_LOAD", SDTAtomicLoad, + [SDNPHasChain, SDNPMayLoad, SDNPMemOperand, SDNPInGlue] +>; + def unindexedload_glue : PatFrag <(ops node:$ptr), (AMDGPUld_glue node:$ptr), [{ return cast(N)->getAddressingMode() == ISD::UNINDEXED; }]>; @@ -241,6 +245,18 @@ def load_glue : PatFrag <(ops node:$ptr), (unindexedload_glue node:$ptr), [{ return cast(N)->getExtensionType() == ISD::NON_EXTLOAD; }]>; +def atomic_load_32_glue : PatFrag<(ops node:$ptr), + (AMDGPUatomic_ld_glue node:$ptr)> { + let IsAtomic = 1; + let MemoryVT = i32; +} + +def atomic_load_64_glue : PatFrag<(ops node:$ptr), + (AMDGPUatomic_ld_glue node:$ptr)> { + let IsAtomic = 1; + let MemoryVT = i64; +} + def extload_glue : PatFrag<(ops node:$ptr), (load_glue node:$ptr), [{ return cast(N)->getExtensionType() == ISD::EXTLOAD; }]>; @@ -286,12 +302,22 @@ def az_extloadi8_local_m0 : LoadFrag, LocalAddress; def az_extloadi16_local_m0 : LoadFrag, LocalAddress; def load_align8_local_m0 : LoadFrag , LocalAddress; def load_align16_local_m0 : LoadFrag , LocalAddress; +def atomic_load_32_local_m0 : LoadFrag, LocalAddress; +def atomic_load_64_local_m0 : LoadFrag, LocalAddress; def AMDGPUst_glue : SDNode <"ISD::STORE", SDTStore, [SDNPHasChain, SDNPMayStore, SDNPMemOperand, SDNPInGlue] >; +def AMDGPUatomic_st_glue : SDNode <"ISD::ATOMIC_STORE", SDTAtomicStore, + [SDNPHasChain, SDNPMayStore, SDNPMemOperand, SDNPInGlue] +>; + +def atomic_store_glue : PatFrag<(ops node:$ptr, node:$val), + (AMDGPUatomic_st_glue node:$ptr, node:$val)> { +} + def unindexedstore_glue : PatFrag<(ops node:$val, node:$ptr), (AMDGPUst_glue node:$val, node:$ptr), [{ return cast(N)->getAddressingMode() == ISD::UNINDEXED; @@ -328,6 +354,7 @@ def store_glue_align16 : Aligned16Bytes < def store_local_m0 : StoreFrag, LocalAddress; def truncstorei8_local_m0 : StoreFrag, LocalAddress; def truncstorei16_local_m0 : StoreFrag, LocalAddress; +def atomic_store_local_m0 : StoreFrag, LocalAddress; def store_align8_local_m0 : StoreFrag, LocalAddress; def store_align16_local_m0 : StoreFrag, LocalAddress; diff --git a/llvm/test/CodeGen/AMDGPU/atomic_load_local.ll b/llvm/test/CodeGen/AMDGPU/atomic_load_local.ll new file mode 100644 index 000000000000..edca16871ac6 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/atomic_load_local.ll @@ -0,0 +1,52 @@ +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CI %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 
%s + +; GCN-LABEL: {{^}}atomic_load_monotonic_i32: +; GCN: s_waitcnt +; GFX9-NOT: s_mov_b32 m0 +; CI-NEXT: s_mov_b32 m0 +; GCN-NEXT: ds_read_b32 v0, v0{{$}} +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_setpc_b64 +define i32 @atomic_load_monotonic_i32(i32 addrspace(3)* %ptr) { + %load = load atomic i32, i32 addrspace(3)* %ptr monotonic, align 4 + ret i32 %load +} + +; GCN-LABEL: {{^}}atomic_load_monotonic_i32_offset: +; GCN: s_waitcnt +; GFX9-NOT: s_mov_b32 m0 +; CI-NEXT: s_mov_b32 m0 +; GCN-NEXT: ds_read_b32 v0, v0 offset:64{{$}} +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_setpc_b64 +define i32 @atomic_load_monotonic_i32_offset(i32 addrspace(3)* %ptr) { + %gep = getelementptr inbounds i32, i32 addrspace(3)* %ptr, i32 16 + %load = load atomic i32, i32 addrspace(3)* %gep monotonic, align 4 + ret i32 %load +} + +; GCN-LABEL: {{^}}atomic_load_monotonic_i64: +; GCN: s_waitcnt +; GFX9-NOT: s_mov_b32 m0 +; CI-NEXT: s_mov_b32 m0 +; GCN-NEXT: ds_read_b64 v[0:1], v0{{$}} +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_setpc_b64 +define i64 @atomic_load_monotonic_i64(i64 addrspace(3)* %ptr) { + %load = load atomic i64, i64 addrspace(3)* %ptr monotonic, align 8 + ret i64 %load +} + +; GCN-LABEL: {{^}}atomic_load_monotonic_i64_offset: +; GCN: s_waitcnt +; GFX9-NOT: s_mov_b32 m0 +; CI-NEXT: s_mov_b32 m0 +; GCN-NEXT: ds_read_b64 v[0:1], v0 offset:128{{$}} +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_setpc_b64 +define i64 @atomic_load_monotonic_i64_offset(i64 addrspace(3)* %ptr) { + %gep = getelementptr inbounds i64, i64 addrspace(3)* %ptr, i64 16 + %load = load atomic i64, i64 addrspace(3)* %gep monotonic, align 8 + ret i64 %load +} diff --git a/llvm/test/CodeGen/AMDGPU/atomic_store_local.ll b/llvm/test/CodeGen/AMDGPU/atomic_store_local.ll new file mode 100644 index 000000000000..3b69070f3eae --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/atomic_store_local.ll @@ -0,0 +1,53 @@ +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CI %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s + +; GCN-LABEL: {{^}}atomic_store_monotonic_i32: +; GCN: s_waitcnt +; GFX9-NOT: s_mov_b32 m0 +; CI-NEXT: s_mov_b32 m0 +; GCN-NEXT: ds_write_b32 v0, v1{{$}} +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_setpc_b64 +define void @atomic_store_monotonic_i32(i32 addrspace(3)* %ptr, i32 %val) { + store atomic i32 %val, i32 addrspace(3)* %ptr monotonic, align 4 + ret void +} + +; GCN-LABEL: {{^}}atomic_store_monotonic_offset_i32: +; GCN: s_waitcnt +; GFX9-NOT: s_mov_b32 m0 +; CI-NEXT: s_mov_b32 m0 +; GCN-NEXT: ds_write_b32 v0, v1 offset:64{{$}} +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_setpc_b64 +define void @atomic_store_monotonic_offset_i32(i32 addrspace(3)* %ptr, i32 %val) { + %gep = getelementptr inbounds i32, i32 addrspace(3)* %ptr, i32 16 + store atomic i32 %val, i32 addrspace(3)* %gep monotonic, align 4 + ret void +} + +; GCN-LABEL: {{^}}atomic_store_monotonic_i64: +; GCN: s_waitcnt +; GFX9-NOT: s_mov_b32 m0 +; CI-NEXT: s_mov_b32 m0 +; GCN-NEXT: ds_write_b64 v0, v[1:2]{{$}} +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_setpc_b64 +define void @atomic_store_monotonic_i64(i64 addrspace(3)* %ptr, i64 %val) { + store atomic i64 %val, i64 addrspace(3)* %ptr monotonic, align 8 + ret void +} + +; GCN-LABEL: {{^}}atomic_store_monotonic_offset_i64: +; GCN: s_waitcnt +; GFX9-NOT: s_mov_b32 m0 +; CI-NEXT: s_mov_b32 m0 +; GCN-NEXT: ds_write_b64 v0, v[1:2] offset:128{{$}} +; GCN-NEXT: 
s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_setpc_b64 +define void @atomic_store_monotonic_offset_i64(i64 addrspace(3)* %ptr, i64 %val) { + %gep = getelementptr inbounds i64, i64 addrspace(3)* %ptr, i64 16 + store atomic i64 %val, i64 addrspace(3)* %gep monotonic, align 8 + ret void +} +
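
As a usage sketch (illustrative only, not one of the committed tests): with the patterns added above, a monotonic atomic load and a monotonic atomic store through LDS (addrspace(3)) pointers should each select to a plain ds_read_b32 / ds_write_b32, with the s_mov_b32 m0 initialization emitted only on targets that require it (pre-gfx9, per LDSRequiresM0Init), as the checks in atomic_load_local.ll and atomic_store_local.ll show. The function and value names below are made up for the example.

; Illustrative sketch in the style of the tests above; not part of this patch.
define void @atomic_copy_local_i32(i32 addrspace(3)* %src, i32 addrspace(3)* %dst) {
  ; Expected to select to ds_read_b32 via the atomic_load_32_local(_m0) patterns.
  %val = load atomic i32, i32 addrspace(3)* %src monotonic, align 4
  ; Expected to select to ds_write_b32 via the atomic_store_local(_m0) patterns.
  store atomic i32 %val, i32 addrspace(3)* %dst monotonic, align 4
  ret void
}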