From bb5ee41ab41bf0db99c2cea4c16c7eaeb0339965 Mon Sep 17 00:00:00 2001 From: Tim Renouf Date: Tue, 21 Aug 2018 11:08:12 +0000 Subject: [PATCH] [AMDGPU] Allow int types for MUBUF vdata Summary: Previously the new llvm.amdgcn.raw/struct.buffer.load/store intrinsics only allowed float types for the data to be loaded or stored, which sometimes meant the frontend needed to generate a bitcast. In this, the new intrinsics copied the old buffer intrinsics. This commit extends the new intrinsics to allow int types as well. Subscribers: arsenm, kzhuravl, wdng, nhaehnle, yaxunl, dstuttard, t-tye, llvm-commits Differential Revision: https://reviews.llvm.org/D50315 Change-Id: I8202af2d036455553681dcbb3d7d32ae273f8f85 llvm-svn: 340270 --- llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 8 +++---- llvm/lib/Target/AMDGPU/BUFInstructions.td | 20 +++++++++++++++++ .../AMDGPU/llvm.amdgcn.raw.buffer.load.ll | 22 +++++++++++++++++++ .../AMDGPU/llvm.amdgcn.raw.buffer.store.ll | 16 ++++++++++++++ .../AMDGPU/llvm.amdgcn.struct.buffer.load.ll | 22 +++++++++++++++++++ .../AMDGPU/llvm.amdgcn.struct.buffer.store.ll | 16 ++++++++++++++ 6 files changed, 100 insertions(+), 4 deletions(-) diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 1c9a6bbdac2d..af884d446a88 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -823,7 +823,7 @@ def int_amdgcn_buffer_store : AMDGPUBufferStore; // These new instrinsics also keep the offset and soffset arguments separate as // they behave differently in bounds checking and swizzling. class AMDGPURawBufferLoad : Intrinsic < - [llvm_anyfloat_ty], + [llvm_any_ty], [llvm_v4i32_ty, // rsrc(SGPR) llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) @@ -834,7 +834,7 @@ def int_amdgcn_raw_buffer_load_format : AMDGPURawBufferLoad; def int_amdgcn_raw_buffer_load : AMDGPURawBufferLoad; class AMDGPUStructBufferLoad : Intrinsic < - [llvm_anyfloat_ty], + [llvm_any_ty], [llvm_v4i32_ty, // rsrc(SGPR) llvm_i32_ty, // vindex(VGPR) llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) @@ -847,7 +847,7 @@ def int_amdgcn_struct_buffer_load : AMDGPUStructBufferLoad; class AMDGPURawBufferStore : Intrinsic < [], - [llvm_anyfloat_ty, // vdata(VGPR) -- can currently only select f32, v2f32, v4f32 + [llvm_any_ty, // vdata(VGPR) llvm_v4i32_ty, // rsrc(SGPR) llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) @@ -859,7 +859,7 @@ def int_amdgcn_raw_buffer_store : AMDGPURawBufferStore; class AMDGPUStructBufferStore : Intrinsic < [], - [llvm_anyfloat_ty, // vdata(VGPR) -- can currently only select f32, v2f32, v4f32 + [llvm_any_ty, // vdata(VGPR) llvm_v4i32_ty, // rsrc(SGPR) llvm_i32_ty, // vindex(VGPR) llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td index 1f425ac1361b..d4c54dacfe57 100644 --- a/llvm/lib/Target/AMDGPU/BUFInstructions.td +++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td @@ -1074,24 +1074,34 @@ multiclass MUBUF_LoadIntrinsicPat; +defm : MUBUF_LoadIntrinsicPat; defm : MUBUF_LoadIntrinsicPat; +defm : MUBUF_LoadIntrinsicPat; defm : MUBUF_LoadIntrinsicPat; +defm : MUBUF_LoadIntrinsicPat; let SubtargetPredicate = HasUnpackedD16VMem in { defm : MUBUF_LoadIntrinsicPat; + defm : MUBUF_LoadIntrinsicPat; defm : MUBUF_LoadIntrinsicPat; defm : MUBUF_LoadIntrinsicPat; } // End HasUnpackedD16VMem. let SubtargetPredicate = HasPackedD16VMem in { defm : MUBUF_LoadIntrinsicPat; + defm : MUBUF_LoadIntrinsicPat; defm : MUBUF_LoadIntrinsicPat; + defm : MUBUF_LoadIntrinsicPat; defm : MUBUF_LoadIntrinsicPat; + defm : MUBUF_LoadIntrinsicPat; } // End HasPackedD16VMem. defm : MUBUF_LoadIntrinsicPat; +defm : MUBUF_LoadIntrinsicPat; defm : MUBUF_LoadIntrinsicPat; +defm : MUBUF_LoadIntrinsicPat; defm : MUBUF_LoadIntrinsicPat; +defm : MUBUF_LoadIntrinsicPat; multiclass MUBUF_StoreIntrinsicPat { @@ -1128,24 +1138,34 @@ multiclass MUBUF_StoreIntrinsicPat; +defm : MUBUF_StoreIntrinsicPat; defm : MUBUF_StoreIntrinsicPat; +defm : MUBUF_StoreIntrinsicPat; defm : MUBUF_StoreIntrinsicPat; +defm : MUBUF_StoreIntrinsicPat; let SubtargetPredicate = HasUnpackedD16VMem in { defm : MUBUF_StoreIntrinsicPat; + defm : MUBUF_StoreIntrinsicPat; defm : MUBUF_StoreIntrinsicPat; defm : MUBUF_StoreIntrinsicPat; } // End HasUnpackedD16VMem. let SubtargetPredicate = HasPackedD16VMem in { defm : MUBUF_StoreIntrinsicPat; + defm : MUBUF_StoreIntrinsicPat; defm : MUBUF_StoreIntrinsicPat; + defm : MUBUF_StoreIntrinsicPat; defm : MUBUF_StoreIntrinsicPat; + defm : MUBUF_StoreIntrinsicPat; } // End HasPackedD16VMem. defm : MUBUF_StoreIntrinsicPat; +defm : MUBUF_StoreIntrinsicPat; defm : MUBUF_StoreIntrinsicPat; +defm : MUBUF_StoreIntrinsicPat; defm : MUBUF_StoreIntrinsicPat; +defm : MUBUF_StoreIntrinsicPat; //===----------------------------------------------------------------------===// // buffer_atomic patterns diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll index 797700e5b4fc..76695077f004 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll @@ -198,9 +198,31 @@ main_body: ret void } +;CHECK-LABEL: {{^}}buffer_load_int: +;CHECK: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 +;CHECK: buffer_load_dwordx2 v[4:5], off, s[0:3], 0 glc +;CHECK: buffer_load_dword v6, off, s[0:3], 0 slc +;CHECK: s_waitcnt +define amdgpu_ps {<4 x float>, <2 x float>, float} @buffer_load_int(<4 x i32> inreg) { +main_body: + %data = call <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 0) + %data_glc = call <2 x i32> @llvm.amdgcn.raw.buffer.load.v2i32(<4 x i32> %0, i32 0, i32 0, i32 1) + %data_slc = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %0, i32 0, i32 0, i32 2) + %fdata = bitcast <4 x i32> %data to <4 x float> + %fdata_glc = bitcast <2 x i32> %data_glc to <2 x float> + %fdata_slc = bitcast i32 %data_slc to float + %r0 = insertvalue {<4 x float>, <2 x float>, float} undef, <4 x float> %fdata, 0 + %r1 = insertvalue {<4 x float>, <2 x float>, float} %r0, <2 x float> %fdata_glc, 1 + %r2 = insertvalue {<4 x float>, <2 x float>, float} %r1, float %fdata_slc, 2 + ret {<4 x float>, <2 x float>, float} %r2 +} + declare float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32>, i32, i32, i32) #0 declare <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32>, i32, i32, i32) #0 declare <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32>, i32, i32, i32) #0 +declare i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32>, i32, i32, i32) #0 +declare <2 x i32> @llvm.amdgcn.raw.buffer.load.v2i32(<4 x i32>, i32, i32, i32) #0 +declare <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(<4 x i32>, i32, i32, i32) #0 declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0 attributes #0 = { nounwind readonly } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll index 648651159660..afb2ef803f01 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll @@ -142,9 +142,25 @@ define amdgpu_ps void @buffer_store_x2_offset_merged(<4 x i32> inreg %rsrc, <2 x ret void } +;CHECK-LABEL: {{^}}buffer_store_int: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +;CHECK: buffer_store_dwordx2 v[4:5], off, s[0:3], 0 glc +;CHECK: buffer_store_dword v6, off, s[0:3], 0 slc +define amdgpu_ps void @buffer_store_int(<4 x i32> inreg, <4 x i32>, <2 x i32>, i32) { +main_body: + call void @llvm.amdgcn.raw.buffer.store.v4i32(<4 x i32> %1, <4 x i32> %0, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.v2i32(<2 x i32> %2, <4 x i32> %0, i32 0, i32 0, i32 1) + call void @llvm.amdgcn.raw.buffer.store.i32(i32 %3, <4 x i32> %0, i32 0, i32 0, i32 2) + ret void +} + declare void @llvm.amdgcn.raw.buffer.store.f32(float, <4 x i32>, i32, i32, i32) #0 declare void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float>, <4 x i32>, i32, i32, i32) #0 declare void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32) #0 +declare void @llvm.amdgcn.raw.buffer.store.i32(i32, <4 x i32>, i32, i32, i32) #0 +declare void @llvm.amdgcn.raw.buffer.store.v2i32(<2 x i32>, <4 x i32>, i32, i32, i32) #0 +declare void @llvm.amdgcn.raw.buffer.store.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32) #0 declare <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32>, i32, i32, i32) #1 attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.ll index cd373e0796be..71fbdaa71b2b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.ll @@ -125,9 +125,31 @@ entry: ret float %val } +;CHECK-LABEL: {{^}}buffer_load_int: +;CHECK: buffer_load_dwordx4 v[0:3], {{v[0-9]+}}, s[0:3], 0 idxen +;CHECK: buffer_load_dwordx2 v[4:5], {{v[0-9]+}}, s[0:3], 0 idxen glc +;CHECK: buffer_load_dword v6, {{v[0-9]+}}, s[0:3], 0 idxen slc +;CHECK: s_waitcnt +define amdgpu_ps {<4 x float>, <2 x float>, float} @buffer_load_int(<4 x i32> inreg) { +main_body: + %data = call <4 x i32> @llvm.amdgcn.struct.buffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 0) + %data_glc = call <2 x i32> @llvm.amdgcn.struct.buffer.load.v2i32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 1) + %data_slc = call i32 @llvm.amdgcn.struct.buffer.load.i32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 2) + %fdata = bitcast <4 x i32> %data to <4 x float> + %fdata_glc = bitcast <2 x i32> %data_glc to <2 x float> + %fdata_slc = bitcast i32 %data_slc to float + %r0 = insertvalue {<4 x float>, <2 x float>, float} undef, <4 x float> %fdata, 0 + %r1 = insertvalue {<4 x float>, <2 x float>, float} %r0, <2 x float> %fdata_glc, 1 + %r2 = insertvalue {<4 x float>, <2 x float>, float} %r1, float %fdata_slc, 2 + ret {<4 x float>, <2 x float>, float} %r2 +} + declare float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32>, i32, i32, i32, i32) #0 declare <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32>, i32, i32, i32, i32) #0 declare <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32>, i32, i32, i32, i32) #0 +declare i32 @llvm.amdgcn.struct.buffer.load.i32(<4 x i32>, i32, i32, i32, i32) #0 +declare <2 x i32> @llvm.amdgcn.struct.buffer.load.v2i32(<4 x i32>, i32, i32, i32, i32) #0 +declare <4 x i32> @llvm.amdgcn.struct.buffer.load.v4i32(<4 x i32>, i32, i32, i32, i32) #0 declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0 attributes #0 = { nounwind readonly } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.ll index 8e05c43889aa..738bb16e7d6b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.ll @@ -95,9 +95,25 @@ main_body: ret void } +;CHECK-LABEL: {{^}}buffer_store_int: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_store_dwordx4 v[0:3], {{v[0-9]+}}, s[0:3], 0 idxen +;CHECK: buffer_store_dwordx2 v[4:5], {{v[0-9]+}}, s[0:3], 0 idxen glc +;CHECK: buffer_store_dword v6, {{v[0-9]+}}, s[0:3], 0 idxen slc +define amdgpu_ps void @buffer_store_int(<4 x i32> inreg, <4 x i32>, <2 x i32>, i32) { +main_body: + call void @llvm.amdgcn.struct.buffer.store.v4i32(<4 x i32> %1, <4 x i32> %0, i32 0, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.store.v2i32(<2 x i32> %2, <4 x i32> %0, i32 0, i32 0, i32 0, i32 1) + call void @llvm.amdgcn.struct.buffer.store.i32(i32 %3, <4 x i32> %0, i32 0, i32 0, i32 0, i32 2) + ret void +} + declare void @llvm.amdgcn.struct.buffer.store.f32(float, <4 x i32>, i32, i32, i32, i32) #0 declare void @llvm.amdgcn.struct.buffer.store.v2f32(<2 x float>, <4 x i32>, i32, i32, i32, i32) #0 declare void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32) #0 +declare void @llvm.amdgcn.struct.buffer.store.i32(i32, <4 x i32>, i32, i32, i32, i32) #0 +declare void @llvm.amdgcn.struct.buffer.store.v2i32(<2 x i32>, <4 x i32>, i32, i32, i32, i32) #0 +declare void @llvm.amdgcn.struct.buffer.store.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32, i32) #0 declare <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32>, i32, i32, i32, i32) #1 attributes #0 = { nounwind }