AMDGPU: Split x8 and x16 vector loads instead of scalarizing

The one regression in the builtin tests is in the read2 test, which now
(again) has many extra copies, but this should be solved once the pass
is replaced with a DAG combine.

llvm-svn: 253974
Matt Arsenault 2015-11-24 12:05:03 +00:00
parent 9d0f44bf8a
commit 4d801cd357
10 changed files with 171 additions and 290 deletions
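As a concrete illustration of the change (an LLVM IR sketch with a hypothetical function name; the instruction counts mirror the FileCheck updates in this commit rather than verbatim compiler output), a <8 x i32> global load that SI previously scalarized into eight buffer_load_dword is now split into two <4 x i32> halves that select to two buffer_load_dwordx4:

define void @load_v8i32_sketch(<8 x i32> addrspace(1)* %out,
                               <8 x i32> addrspace(1)* %in) {
entry:
  ; Before this commit: eight buffer_load_dword. After: two buffer_load_dwordx4.
  %val = load <8 x i32>, <8 x i32> addrspace(1)* %in
  store <8 x i32> %val, <8 x i32> addrspace(1)* %out
  ret void
}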


@@ -394,6 +394,16 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM,
setFsqrtIsCheap(true);
// We want to find all load dependencies for long chains of stores to enable
// merging into very wide vectors. The problem is with vectors with > 4
// elements. MergeConsecutiveStores will attempt to merge these because x8/x16
// vectors are a legal type, even though we usually have to split the loads.
// When we can more precisely specify load legality per address
// space, we should be able to make FindBetterChain/MergeConsecutiveStores
// smarter so that they can figure out what to do in 2 iterations without all
// N > 4 stores on the same chain.
GatherAllAliasesMaxDepth = 16;
// FIXME: Need to really handle these.
MaxStoresPerMemcpy = 4096;
MaxStoresPerMemmove = 4096;
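To make the interaction described in the comment concrete, here is a sketch (hypothetical function name, arbitrary constants, modeled on the merge_global_store_8_constants_i32 test updated later in this commit) of the kind of store chain MergeConsecutiveStores is expected to handle: eight adjacent i32 stores that should merge into two dwordx4 stores once the alias search is deep enough.

define void @merge_8_i32_stores_sketch(i32 addrspace(1)* %out) {
  ; Eight consecutive i32 stores on one chain; the DAG combiner should be able
  ; to merge them into two 16-byte stores (buffer_store_dwordx4 on SI).
  store i32 1, i32 addrspace(1)* %out, align 4
  %p1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  store i32 2, i32 addrspace(1)* %p1, align 4
  %p2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
  store i32 3, i32 addrspace(1)* %p2, align 4
  %p3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
  store i32 4, i32 addrspace(1)* %p3, align 4
  %p4 = getelementptr i32, i32 addrspace(1)* %out, i32 4
  store i32 5, i32 addrspace(1)* %p4, align 4
  %p5 = getelementptr i32, i32 addrspace(1)* %out, i32 5
  store i32 6, i32 addrspace(1)* %p5, align 4
  %p6 = getelementptr i32, i32 addrspace(1)* %out, i32 6
  store i32 7, i32 addrspace(1)* %p6, align 4
  %p7 = getelementptr i32, i32 addrspace(1)* %out, i32 7
  store i32 8, i32 addrspace(1)* %p7, align 4
  ret void
}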


@@ -1178,10 +1178,14 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
"Custom lowering for non-i32 vectors hasn't been implemented.");
unsigned NumElements = Op.getValueType().getVectorNumElements();
assert(NumElements != 2 && "v2 loads are supported for all address spaces.");
switch (Load->getAddressSpace()) {
default: break;
case AMDGPUAS::GLOBAL_ADDRESS:
case AMDGPUAS::PRIVATE_ADDRESS:
if (NumElements >= 8)
return SplitVectorLoad(Op, DAG);
// v4 loads are supported for private and global memory.
if (NumElements <= 4)
break;
@@ -1409,7 +1413,7 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
return Ret;
if (VT.isVector() && VT.getVectorNumElements() >= 8)
return ScalarizeVectorStore(Op, DAG);
return SplitVectorStore(Op, DAG);
if (VT == MVT::i1)
return DAG.getTruncStore(Store->getChain(), DL,
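On the store side, the analogous change above replaces ScalarizeVectorStore with SplitVectorStore for vectors of 8 or more elements. A minimal IR sketch (hypothetical function name; the expectation of two buffer_store_dwordx4 matches the test updates below):

define void @store_v8i32_sketch(<8 x i32> addrspace(1)* %out, <8 x i32> %val) {
  ; Previously scalarized into eight buffer_store_dword; now split into two
  ; <4 x i32> stores that select to two buffer_store_dwordx4.
  store <8 x i32> %val, <8 x i32> addrspace(1)* %out, align 32
  ret void
}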


@@ -137,14 +137,8 @@ define void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias %out, <7 x i8>
; SI-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}}, v[[HILOAD]]
; SI-NOT: bfe
; SI-NOT: lshr
; SI: buffer_store_dword
; SI: buffer_store_dword
; SI: buffer_store_dword
; SI: buffer_store_dword
; SI: buffer_store_dword
; SI: buffer_store_dword
; SI: buffer_store_dword
; SI: buffer_store_dword
; SI: buffer_store_dwordx4
; SI: buffer_store_dwordx4
define void @load_v8i8_to_v8f32(<8 x float> addrspace(1)* noalias %out, <8 x i8> addrspace(1)* noalias %in) nounwind {
%load = load <8 x i8>, <8 x i8> addrspace(1)* %in, align 8
%cvt = uitofp <8 x i8> %load to <8 x float>


@@ -116,19 +116,18 @@ define void @simple_read2_v4f32_superreg(<4 x float> addrspace(1)* %out) #0 {
ret void
}
; FIXME: Extra moves shuffling superregister
; CI-LABEL: {{^}}simple_read2_v8f32_superreg:
; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT7:[0-9]+]]:[[REG_ELT6:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:7 offset1:6{{$}}
; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT5:[0-9]+]]:[[REG_ELT4:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:5 offset1:4{{$}}
; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT3:[0-9]+]]:[[REG_ELT2:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:3 offset1:2{{$}}
; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT1:[0-9]+]]:[[REG_ELT0:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1{{$}}
; CI: buffer_store_dword
; CI: buffer_store_dword
; CI: buffer_store_dword
; CI: buffer_store_dword
; CI: buffer_store_dword
; CI: buffer_store_dword
; CI: buffer_store_dword
; CI: buffer_store_dword
; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT3:[0-9]+]]:[[REG_ELT7:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:3 offset1:7{{$}}
; CI: v_mov_b32
; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT6:[0-9]+]]:[[REG_ELT5:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:2 offset1:1{{$}}
; CI: v_mov_b32
; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT4:[0-9]+]]:[[REG_ELT2:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:6 offset1:5{{$}}
; CI: v_mov_b32
; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT1:[0-9]+]]:[[REG_ELT0:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:4{{$}}
; CI: v_mov_b32
; CI: buffer_store_dwordx4
; CI: buffer_store_dwordx4
; CI: s_endpgm
define void @simple_read2_v8f32_superreg(<8 x float> addrspace(1)* %out) #0 {
%x.i = tail call i32 @llvm.r600.read.tidig.x() #1
@@ -139,41 +138,30 @@ define void @simple_read2_v8f32_superreg(<8 x float> addrspace(1)* %out) #0 {
ret void
}
; FIXME: Extra moves shuffling superregister
; CI-LABEL: {{^}}simple_read2_v16f32_superreg:
; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT7:[0-9]+]]:[[REG_ELT6:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:15 offset1:14{{$}}
; CI-NOT: v_mov_b32
; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT7:[0-9]+]]:[[REG_ELT6:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:13 offset1:12{{$}}
; CI-NOT: v_mov_b32
; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT7:[0-9]+]]:[[REG_ELT6:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:11 offset1:10{{$}}
; CI-NOT: v_mov_b32
; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT7:[0-9]+]]:[[REG_ELT6:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:9 offset1:8{{$}}
; CI-NOT: v_mov_b32
; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT7:[0-9]+]]:[[REG_ELT6:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:7 offset1:6{{$}}
; CI-NOT: v_mov_b32
; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT5:[0-9]+]]:[[REG_ELT4:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:5 offset1:4{{$}}
; CI-NOT: v_mov_b32
; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT3:[0-9]+]]:[[REG_ELT2:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:3 offset1:2{{$}}
; CI-NOT: v_mov_b32
; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT1:[0-9]+]]:[[REG_ELT0:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1{{$}}
; CI-NOT: v_mov_b32
; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT11:[0-9]+]]:[[REG_ELT15:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:3 offset1:7{{$}}
; CI: v_mov_b32
; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT14:[0-9]+]]:[[REG_ELT13:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:11 offset1:15{{$}}
; CI: v_mov_b32
; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT12:[0-9]+]]:[[REG_ELT10:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:2 offset1:1{{$}}
; CI: v_mov_b32
; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT9:[0-9]+]]:[[REG_ELT8:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:6 offset1:5{{$}}
; CI: v_mov_b32
; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT7:[0-9]+]]:[[REG_ELT6:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:10 offset1:9{{$}}
; CI: v_mov_b32
; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT5:[0-9]+]]:[[REG_ELT4:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:14 offset1:13{{$}}
; CI: v_mov_b32
; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT3:[0-9]+]]:[[REG_ELT2:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:12 offset1:8{{$}}
; CI: v_mov_b32
; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT1:[0-9]+]]:[[REG_ELT0:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:4{{$}}
; CI: v_mov_b32
; CI: s_waitcnt lgkmcnt(0)
; CI: buffer_store_dword
; CI: buffer_store_dword
; CI: buffer_store_dword
; CI: buffer_store_dword
; CI: buffer_store_dword
; CI: buffer_store_dword
; CI: buffer_store_dword
; CI: buffer_store_dword
; CI: buffer_store_dword
; CI: buffer_store_dword
; CI: buffer_store_dword
; CI: buffer_store_dword
; CI: buffer_store_dword
; CI: buffer_store_dword
; CI: buffer_store_dword
; CI: buffer_store_dword
; CI: buffer_store_dwordx4
; CI: buffer_store_dwordx4
; CI: buffer_store_dwordx4
; CI: buffer_store_dwordx4
; CI: s_endpgm
define void @simple_read2_v16f32_superreg(<16 x float> addrspace(1)* %out) #0 {
%x.i = tail call i32 @llvm.r600.read.tidig.x() #1


@@ -106,14 +106,8 @@ define void @sextload_global_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i
}
; FUNC-LABEL: {{^}}zextload_global_v8i32_to_v8i64:
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dwordx4
; SI: buffer_load_dwordx4
; SI-DAG: buffer_store_dwordx2
; SI-DAG: buffer_store_dwordx2
; SI-DAG: buffer_store_dwordx2
@@ -131,14 +125,8 @@ define void @zextload_global_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i
}
; FUNC-LABEL: {{^}}sextload_global_v8i32_to_v8i64:
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dwordx4
; SI: buffer_load_dwordx4
; SI-DAG: v_ashrrev_i32
; SI-DAG: v_ashrrev_i32
@@ -166,22 +154,10 @@ define void @sextload_global_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i
}
; FUNC-LABEL: {{^}}sextload_global_v16i32_to_v16i64:
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dwordx4
; SI: buffer_load_dwordx4
; SI: buffer_load_dwordx4
; SI: buffer_load_dwordx4
; SI-DAG: v_ashrrev_i32
; SI-DAG: v_ashrrev_i32
@@ -219,22 +195,10 @@ define void @sextload_global_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16
}
; FUNC-LABEL: {{^}}zextload_global_v16i32_to_v16i64
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dwordx4
; SI: buffer_load_dwordx4
; SI: buffer_load_dwordx4
; SI: buffer_load_dwordx4
; SI: buffer_store_dwordx2
; SI: buffer_store_dwordx2
@@ -262,41 +226,15 @@ define void @zextload_global_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16
}
; FUNC-LABEL: {{^}}sextload_global_v32i32_to_v32i64:
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dwordx4
; SI: buffer_load_dwordx4
; SI: buffer_load_dwordx4
; SI: buffer_load_dwordx4
; SI: buffer_load_dwordx4
; SI: buffer_load_dwordx4
; SI: buffer_load_dwordx4
; SI: buffer_load_dwordx4
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI-DAG: v_ashrrev_i32
; SI-DAG: v_ashrrev_i32
@@ -376,41 +314,14 @@ define void @sextload_global_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32
}
; FUNC-LABEL: {{^}}zextload_global_v32i32_to_v32i64:
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dwordx4
; SI: buffer_load_dwordx4
; SI: buffer_load_dwordx4
; SI: buffer_load_dwordx4
; SI: buffer_load_dwordx4
; SI: buffer_load_dwordx4
; SI: buffer_load_dwordx4
; SI: buffer_load_dwordx4
; SI-DAG: buffer_store_dwordx2
; SI-DAG: buffer_store_dwordx2


@@ -105,6 +105,26 @@ define void @extload_v4f16_to_v4f32_arg(<4 x float> addrspace(1)* %out, <4 x hal
}
; GCN-LABEL: {{^}}extload_v8f16_to_v8f32_arg:
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4
define void @extload_v8f16_to_v8f32_arg(<8 x float> addrspace(1)* %out, <8 x half> %arg) #0 {
%ext = fpext <8 x half> %arg to <8 x float>
store <8 x float> %ext, <8 x float> addrspace(1)* %out
@@ -298,6 +318,46 @@ define void @global_extload_v8f16_to_v8f32(<8 x float> addrspace(1)* %out, <8 x
}
; GCN-LABEL: {{^}}global_extload_v16f16_to_v16f32:
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4
; GCN: s_endpgm
define void @global_extload_v16f16_to_v16f32(<16 x float> addrspace(1)* %out, <16 x half> addrspace(1)* %in) #0 {
%val = load <16 x half>, <16 x half> addrspace(1)* %in
%cvt = fpext <16 x half> %val to <16 x float>
@@ -426,14 +486,8 @@ define void @global_truncstore_v4f32_to_v4f16(<4 x half> addrspace(1)* %out, <4
}
; GCN-LABEL: {{^}}global_truncstore_v8f32_to_v8f16:
; GCN: buffer_load_dword
; GCN: buffer_load_dword
; GCN: buffer_load_dword
; GCN: buffer_load_dword
; GCN: buffer_load_dword
; GCN: buffer_load_dword
; GCN: buffer_load_dword
; GCN: buffer_load_dword
; GCN: buffer_load_dwordx4
; GCN: buffer_load_dwordx4
; GCN: v_cvt_f16_f32_e32
; GCN: v_cvt_f16_f32_e32
; GCN: v_cvt_f16_f32_e32
@@ -459,22 +513,10 @@ define void @global_truncstore_v8f32_to_v8f16(<8 x half> addrspace(1)* %out, <8
}
; GCN-LABEL: {{^}}global_truncstore_v16f32_to_v16f16:
; GCN: buffer_load_dword
; GCN: buffer_load_dword
; GCN: buffer_load_dword
; GCN: buffer_load_dword
; GCN: buffer_load_dword
; GCN: buffer_load_dword
; GCN: buffer_load_dword
; GCN: buffer_load_dword
; GCN: buffer_load_dword
; GCN: buffer_load_dword
; GCN: buffer_load_dword
; GCN: buffer_load_dword
; GCN: buffer_load_dword
; GCN: buffer_load_dword
; GCN: buffer_load_dword
; GCN: buffer_load_dword
; GCN: buffer_load_dwordx4
; GCN: buffer_load_dwordx4
; GCN: buffer_load_dwordx4
; GCN: buffer_load_dwordx4
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32


@@ -277,15 +277,9 @@ entry:
; FUNC-LABEL: {{^}}load_v8i32:
; R600: VTX_READ_128
; R600: VTX_READ_128
; XXX: We should be using DWORDX4 instructions on SI.
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dwordx4
; SI: buffer_load_dwordx4
define void @load_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %in) {
entry:
%0 = load <8 x i32>, <8 x i32> addrspace(1)* %in
@@ -298,23 +292,11 @@ entry:
; R600: VTX_READ_128
; R600: VTX_READ_128
; R600: VTX_READ_128
; XXX: We should be using DWORDX4 instructions on SI.
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dwordx4
; SI: buffer_load_dwordx4
; SI: buffer_load_dwordx4
; SI: buffer_load_dwordx4
define void @load_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(1)* %in) {
entry:
%0 = load <16 x i32>, <16 x i32> addrspace(1)* %in


@@ -613,22 +613,9 @@ define void @merge_global_store_7_constants_i32(i32 addrspace(1)* %out) {
ret void
}
; FIXME: This should do 2 dwordx4 loads
; GCN-LABEL: {{^}}merge_global_store_8_constants_i32:
; GCN-NOAA: buffer_store_dword v
; GCN-NOAA: buffer_store_dword v
; GCN-NOAA: buffer_store_dword v
; GCN-NOAA: buffer_store_dword v
; GCN-NOAA: buffer_store_dword v
; GCN-NOAA: buffer_store_dword v
; GCN-NOAA: buffer_store_dword v
; GCN-NOAA: buffer_store_dword v
; GCN-AA: buffer_store_dwordx4
; GCN-AA: buffer_store_dwordx2
; GCN-AA: buffer_store_dwordx2
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4
; GCN: s_endpgm
define void @merge_global_store_8_constants_i32(i32 addrspace(1)* %out) {
store i32 34, i32 addrspace(1)* %out, align 4


@@ -34,46 +34,16 @@ define void @no_reorder_scalarized_v2f64_local_load_store(<2 x double> addrspace
}
; SI-LABEL: {{^}}no_reorder_split_v8i32_global_load_store:
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dword
; SI: buffer_load_dwordx4
; SI: buffer_load_dwordx4
; SI: buffer_load_dwordx4
; SI: buffer_load_dwordx4
; SI: buffer_store_dword
; SI: buffer_store_dword
; SI: buffer_store_dword
; SI: buffer_store_dword
; SI: buffer_store_dword
; SI: buffer_store_dword
; SI: buffer_store_dword
; SI: buffer_store_dword
; SI: buffer_store_dword
; SI: buffer_store_dword
; SI: buffer_store_dword
; SI: buffer_store_dword
; SI: buffer_store_dword
; SI: buffer_store_dword
; SI: buffer_store_dword
; SI: buffer_store_dword
; SI: buffer_store_dwordx4
; SI: buffer_store_dwordx4
; SI: buffer_store_dwordx4
; SI: buffer_store_dwordx4
; SI: s_endpgm
define void @no_reorder_split_v8i32_global_load_store(<8 x i32> addrspace(1)* nocapture %x, <8 x i32> addrspace(1)* nocapture %y) nounwind {
%tmp1 = load <8 x i32>, <8 x i32> addrspace(1)* %x, align 32


@@ -162,14 +162,8 @@ entry:
; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
; GCN: buffer_store_dword
; GCN: buffer_store_dword
; GCN: buffer_store_dword
; GCN: buffer_store_dword
; GCN: buffer_store_dword
; GCN: buffer_store_dword
; GCN: buffer_store_dword
; GCN: buffer_store_dword
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4
define void @smrd_valu_ci_offset_x8(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(2)* %in, <8 x i32> %c) #1 {
entry:
%tmp = call i32 @llvm.r600.read.tidig.x() #0
@@ -184,7 +178,7 @@ entry:
; FIXME: should use immediate offset instead of using s_add_i32 for adding to constant.
; GCN-LABEL: {{^}}smrd_valu_ci_offset_x16:
; GCN: s_mov_b32 s[[OFFSET0:[0-9]+]], 0x13480{{$}}
; GCN-DAG: s_mov_b32 s[[OFFSET0:[0-9]+]], 0x13480{{$}}
; SI-DAG: s_add_i32 s[[OFFSET1:[0-9]+]], s[[OFFSET0]], 16
; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[OFFSET0]]:{{[0-9]+}}], 0 addr64{{$}}
@@ -197,6 +191,7 @@ entry:
; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[OFFSET2]]:{{[0-9]+}}], 0 addr64{{$}}
; GCN-DAG: s_add_i32 s[[OFFSET3:[0-9]+]], s[[OFFSET2]], 16
; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[OFFSET3]]:{{[0-9]+}}], 0 addr64{{$}}
; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
@@ -205,14 +200,12 @@ entry:
; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
; GCN: buffer_store_dword
; GCN: buffer_store_dword
; GCN: buffer_store_dword
; GCN: buffer_store_dword
; GCN: buffer_store_dword
; GCN: buffer_store_dword
; GCN: buffer_store_dword
; GCN: buffer_store_dword
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4
; GCN: s_endpgm
define void @smrd_valu_ci_offset_x16(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(2)* %in, <16 x i32> %c) #1 {
entry:
%tmp = call i32 @llvm.r600.read.tidig.x() #0