AMDGPU: Use more custom insert/extract_vector_elt lowering

Apply to i8 vectors.

llvm-svn: 334044
This commit is contained in:
Matt Arsenault 2018-06-05 19:52:46 +00:00
parent b984ffcc71
commit 9224c00d2b
5 changed files with 164 additions and 99 deletions

View File

@ -296,6 +296,13 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i8, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i8, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i8, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i8, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i8, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i8, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i16, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f16, Custom);
@ -4069,14 +4076,18 @@ SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
SDValue InsVal = Op.getOperand(1);
SDValue Idx = Op.getOperand(2);
EVT VecVT = Vec.getValueType();
EVT EltVT = VecVT.getVectorElementType();
unsigned VecSize = VecVT.getSizeInBits();
unsigned EltSize = EltVT.getSizeInBits();
assert(VecVT.getScalarSizeInBits() == 16);
assert(VecSize <= 64);
unsigned NumElts = VecVT.getVectorNumElements();
SDLoc SL(Op);
auto KIdx = dyn_cast<ConstantSDNode>(Idx);
if (NumElts == 4 && KIdx) {
if (NumElts == 4 && EltSize == 16 && KIdx) {
SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Vec);
SDValue LoHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
@ -4103,22 +4114,24 @@ SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
return DAG.getNode(ISD::BITCAST, SL, VecVT, Concat);
}
assert(NumElts == 2 || NumElts == 4);
if (isa<ConstantSDNode>(Idx))
return SDValue();
EVT IntVT = NumElts == 2 ? MVT::i32 : MVT::i64;
MVT IntVT = MVT::getIntegerVT(VecSize);
// Avoid stack access for dynamic indexing.
SDValue Val = DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal);
SDValue Val = InsVal;
if (InsVal.getValueType() == MVT::f16)
Val = DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal);
// v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
SDValue ExtVal = DAG.getNode(ISD::ZERO_EXTEND, SL, IntVT, Val);
assert(isPowerOf2_32(EltSize));
SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
// Convert vector index to bit-index.
SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx,
DAG.getConstant(4, SL, MVT::i32));
SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
SDValue BFM = DAG.getNode(ISD::SHL, SL, IntVT,
@ -4141,8 +4154,9 @@ SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
SDValue Vec = Op.getOperand(0);
SDValue Idx = Op.getOperand(1);
EVT VecVT = Vec.getValueType();
unsigned NumElts = VecVT.getVectorNumElements();
assert(VecVT.getScalarSizeInBits() == 16 && (NumElts == 2 || NumElts == 4));
unsigned VecSize = VecVT.getSizeInBits();
EVT EltVT = VecVT.getVectorElementType();
assert(VecSize <= 64);
DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
@ -4153,11 +4167,14 @@ SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
return Combined;
EVT IntVT = NumElts == 2 ? MVT::i32 : MVT::i64;
SDValue Four = DAG.getConstant(4, SL, MVT::i32);
unsigned EltSize = EltVT.getSizeInBits();
assert(isPowerOf2_32(EltSize));
// Convert vector index to bit-index (* 16)
SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, Four);
MVT IntVT = MVT::getIntegerVT(VecSize);
SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
// Convert vector index to bit-index (* EltSize)
SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
SDValue BC = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
SDValue Elt = DAG.getNode(ISD::SRL, SL, IntVT, BC, ScaledIdx);

View File

@ -252,8 +252,8 @@ entry:
; R600-VECT: MOVA_INT
; SI-PROMOTE-VECT-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4 ; encoding:
; SI-PROMOTE-VECT-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:5 ; encoding:
; SI-PROMOTE-VECT-DAG: s_lshl_b32
; SI-PROMOTE-VECT-DAG: v_lshrrev
; SI-ALLOCA-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4 ; encoding: [0x04,0x00,0x60,0xe0
; SI-ALLOCA-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:5 ; encoding: [0x05,0x00,0x60,0xe0

View File

@ -1,7 +1,7 @@
; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
; FUNC-LABEL: {{^}}extract_vector_elt_v1i8:
; GCN-LABEL: {{^}}extract_vector_elt_v1i8:
; GCN: buffer_load_ubyte
; GCN: buffer_store_byte
define amdgpu_kernel void @extract_vector_elt_v1i8(i8 addrspace(1)* %out, <1 x i8> %foo) #0 {
@ -10,7 +10,7 @@ define amdgpu_kernel void @extract_vector_elt_v1i8(i8 addrspace(1)* %out, <1 x i
ret void
}
; FUNC-LABEL: {{^}}extract_vector_elt_v2i8:
; GCN-LABEL: {{^}}extract_vector_elt_v2i8:
; GCN: buffer_load_ubyte
; GCN: buffer_load_ubyte
; GCN: buffer_store_byte
@ -24,7 +24,7 @@ define amdgpu_kernel void @extract_vector_elt_v2i8(i8 addrspace(1)* %out, <2 x i
ret void
}
; FUNC-LABEL: {{^}}extract_vector_elt_v3i8:
; GCN-LABEL: {{^}}extract_vector_elt_v3i8:
; GCN: buffer_load_ubyte
; GCN: buffer_load_ubyte
; GCN: buffer_store_byte
@ -38,7 +38,7 @@ define amdgpu_kernel void @extract_vector_elt_v3i8(i8 addrspace(1)* %out, <3 x i
ret void
}
; FUNC-LABEL: {{^}}extract_vector_elt_v4i8:
; GCN-LABEL: {{^}}extract_vector_elt_v4i8:
; GCN: buffer_load_ubyte
; GCN: buffer_load_ubyte
; GCN: buffer_store_byte
@ -52,7 +52,7 @@ define amdgpu_kernel void @extract_vector_elt_v4i8(i8 addrspace(1)* %out, <4 x i
ret void
}
; FUNC-LABEL: {{^}}extract_vector_elt_v8i8:
; GCN-LABEL: {{^}}extract_vector_elt_v8i8:
; GCN: buffer_load_ubyte
; GCN: buffer_load_ubyte
; GCN: buffer_store_byte
@ -66,7 +66,7 @@ define amdgpu_kernel void @extract_vector_elt_v8i8(i8 addrspace(1)* %out, <8 x i
ret void
}
; FUNC-LABEL: {{^}}extract_vector_elt_v16i8:
; GCN-LABEL: {{^}}extract_vector_elt_v16i8:
; GCN: buffer_load_ubyte
; GCN: buffer_load_ubyte
; GCN: buffer_store_byte
@ -80,7 +80,7 @@ define amdgpu_kernel void @extract_vector_elt_v16i8(i8 addrspace(1)* %out, <16 x
ret void
}
; FUNC-LABEL: {{^}}extract_vector_elt_v32i8:
; GCN-LABEL: {{^}}extract_vector_elt_v32i8:
; GCN: buffer_load_ubyte
; GCN: buffer_load_ubyte
; GCN: buffer_store_byte
@ -94,7 +94,7 @@ define amdgpu_kernel void @extract_vector_elt_v32i8(i8 addrspace(1)* %out, <32 x
ret void
}
; FUNC-LABEL: {{^}}extract_vector_elt_v64i8:
; GCN-LABEL: {{^}}extract_vector_elt_v64i8:
; GCN: buffer_load_ubyte
; GCN: buffer_load_ubyte
; GCN: buffer_store_byte
@ -108,18 +108,32 @@ define amdgpu_kernel void @extract_vector_elt_v64i8(i8 addrspace(1)* %out, <64 x
ret void
}
; FUNC-LABEL: {{^}}dynamic_extract_vector_elt_v3i8:
; GCN: buffer_load_ubyte
; GCN: buffer_load_ubyte
; GCN: buffer_load_ubyte
; FIXME: SI generates much worse code from that's a pain to match
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v2i8:
; VI-DAG: buffer_load_ushort [[LOAD:v[0-9]+]],
; VI-DAG: s_load_dword [[IDX:s[0-9]+]], s[0:1], 0x30
; GCN: buffer_store_byte
; GCN: buffer_load_ubyte
; GCN: buffer_store_byte
; VI: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
; VI: v_lshrrev_b16_e32 [[EXTRACT:v[0-9]+]], [[SCALED_IDX]], [[LOAD]]
; VI: buffer_store_byte [[EXTRACT]]
define amdgpu_kernel void @dynamic_extract_vector_elt_v2i8(i8 addrspace(1)* %out, <2 x i8> %foo, i32 %idx) #0 {
%elt = extractelement <2 x i8> %foo, i32 %idx
store i8 %elt, i8 addrspace(1)* %out
ret void
}
; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v3i8:
; VI-DAG: buffer_load_ubyte [[LOAD2:v[0-9]+]],
; VI-DAG: buffer_load_ushort [[LOAD01:v[0-9]+]],
; VI-DAG: s_load_dword [[IDX:s[0-9]+]], s[0:1], 0x30
; VI: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
; VI: v_lshlrev_b32_e32 [[ELT2:v[0-9]+]], 16, [[LOAD2]]
; VI: v_or_b32_e32 [[VEC3:v[0-9]+]], [[LOAD01]], [[ELT2]]
; VI: v_lshrrev_b32_e32 [[EXTRACT:v[0-9]+]], [[SCALED_IDX]], [[VEC3]]
; VI: buffer_store_byte [[EXTRACT]]
define amdgpu_kernel void @dynamic_extract_vector_elt_v3i8(i8 addrspace(1)* %out, <3 x i8> %foo, i32 %idx) #0 {
%p0 = extractelement <3 x i8> %foo, i32 %idx
%out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
@ -127,20 +141,14 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v3i8(i8 addrspace(1)* %out
ret void
}
; FUNC-LABEL: {{^}}dynamic_extract_vector_elt_v4i8:
; GCN: buffer_load_ubyte
; GCN: buffer_load_ubyte
; GCN: buffer_load_ubyte
; GCN: buffer_load_ubyte
; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v4i8:
; VI-DAG: s_load_dword [[VEC3:s[0-9]+]], s[0:1], 0x2c
; VI-DAG: s_load_dword [[IDX:s[0-9]+]], s[0:1], 0x30
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: buffer_load_ubyte
; GCN: buffer_store_byte
; VI: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
; VI: s_lshr_b32 [[EXTRACT:s[0-9]+]], [[VEC3]], [[SCALED_IDX]]
; VI: v_mov_b32_e32 [[V_EXTRACT:v[0-9]+]], [[EXTRACT]]
; VI: buffer_store_byte [[V_EXTRACT]]
define amdgpu_kernel void @dynamic_extract_vector_elt_v4i8(i8 addrspace(1)* %out, <4 x i8> %foo, i32 %idx) #0 {
%p0 = extractelement <4 x i8> %foo, i32 %idx
%out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
@ -148,4 +156,19 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v4i8(i8 addrspace(1)* %out
ret void
}
; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v8i8:
; VI-DAG: s_load_dwordx2 [[VEC3:s\[[0-9]+:[0-9]+\]]], s[0:1], 0x2c
; VI-DAG: s_load_dword [[IDX:s[0-9]+]], s[0:1], 0x34
; VI: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
; VI: s_lshr_b64 s{{\[}}[[EXTRACT_LO:[0-9]+]]:{{[0-9]+\]}}, [[VEC3]], [[SCALED_IDX]]
; VI: v_mov_b32_e32 [[V_EXTRACT:v[0-9]+]], s[[EXTRACT_LO]]
; VI: buffer_store_byte [[V_EXTRACT]]
define amdgpu_kernel void @dynamic_extract_vector_elt_v8i8(i8 addrspace(1)* %out, <8 x i8> %foo, i32 %idx) #0 {
%p0 = extractelement <8 x i8> %foo, i32 %idx
%out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
store i8 %p0, i8 addrspace(1)* %out
ret void
}
attributes #0 = { nounwind }

View File

@ -349,10 +349,9 @@ define <16 x i8> @v16i8_func_void() #0 {
; FIXME: Should pack
; GCN-LABEL: {{^}}v4i8_func_void:
; GCN: buffer_load_dword v0
; GCN-DAG: v_lshrrev_b32_e32 v1, 8, v0
; GCN-DAG: v_lshrrev_b32_e32 v2, 16, v0
; GCN-DAG: v_lshrrev_b32_e32 v3, 24, v0
; CI-DAG: v_bfe_u32 v1, v0, 8, 8
; GFX89-DAG: v_lshrrev_b16_e32 v1, 8, v0
; GCN: s_setpc_b64
define <4 x i8> @v4i8_func_void() #0 {
%ptr = load volatile <4 x i8> addrspace(1)*, <4 x i8> addrspace(1)* addrspace(4)* undef

View File

@ -202,19 +202,15 @@ define amdgpu_kernel void @dynamic_insertelement_v3i16(<3 x i16> addrspace(1)* %
}
; GCN-LABEL: {{^}}dynamic_insertelement_v2i8:
; GCN: buffer_load_ubyte v{{[0-9]+}}, off
; GCN: buffer_load_ubyte v{{[0-9]+}}, off
; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:5
; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4
; GCN: buffer_store_byte v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
; GCN-NO-TONGA: buffer_load_ubyte
; GCN-NO-TONGA: buffer_load_ubyte
; GCN-TONGA: buffer_load_ushort
; GCN: buffer_store_short v{{[0-9]+}}, off
; VI: buffer_load_ushort [[LOAD:v[0-9]]]
; VI: s_load_dword [[IDX:s[0-9]]]
; VI: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
; VI: v_lshlrev_b16_e64 [[SHL:v[0-9]+]], [[SCALED_IDX]], -1
; VI: v_xor_b32_e32 [[NOT:v[0-9]+]], -1, [[SHL]]
; VI: v_and_b32_e32 [[AND0:v[0-9]+]], 5, [[SHL]]
; VI: v_and_b32_e32 [[AND1:v[0-9]+]], [[NOT]], [[LOAD]]
; VI: v_or_b32_e32 [[OR:v[0-9]+]], [[AND0]], [[AND1]]
; VI: buffer_store_short [[OR]]
define amdgpu_kernel void @dynamic_insertelement_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> %a, i32 %b) nounwind {
%vecins = insertelement <2 x i8> %a, i8 5, i32 %b
store <2 x i8> %vecins, <2 x i8> addrspace(1)* %out, align 8
@ -222,22 +218,16 @@ define amdgpu_kernel void @dynamic_insertelement_v2i8(<2 x i8> addrspace(1)* %ou
}
; GCN-LABEL: {{^}}dynamic_insertelement_v3i8:
; GCN: buffer_load_ubyte v{{[0-9]+}}, off
; GCN: buffer_load_ubyte v{{[0-9]+}}, off
; GCN: buffer_load_ubyte v{{[0-9]+}}, off
; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4
; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:5
; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:6
; GCN-NO-TONGA: buffer_load_ubyte
; GCN-NO-TONGA: buffer_load_ubyte
; GCN-NO-TONGA: buffer_load_ubyte
; GCN-TONGA: buffer_load_ushort
; GCN-TONGA: buffer_load_ubyte
; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off
; GCN-DAG: buffer_store_short v{{[0-9]+}}, off
; VI: buffer_load_ubyte
; VI: buffer_load_ushort
; VI: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 3
; VI: s_lshl_b32 s{{[0-9]+}}, 0xffff,
; VI: s_not_b32
; VI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
; VI: v_or_b32_e32
; VI: v_and_b32
; VI: v_bfi_b32
; VI: v_lshrrev_b32
define amdgpu_kernel void @dynamic_insertelement_v3i8(<3 x i8> addrspace(1)* %out, <3 x i8> %a, i32 %b) nounwind {
%vecins = insertelement <3 x i8> %a, i8 5, i32 %b
store <3 x i8> %vecins, <3 x i8> addrspace(1)* %out, align 4
@ -245,25 +235,12 @@ define amdgpu_kernel void @dynamic_insertelement_v3i8(<3 x i8> addrspace(1)* %ou
}
; GCN-LABEL: {{^}}dynamic_insertelement_v4i8:
; GCN: buffer_load_ubyte v{{[0-9]+}}, off
; GCN: buffer_load_ubyte v{{[0-9]+}}, off
; GCN: buffer_load_ubyte v{{[0-9]+}}, off
; GCN: buffer_load_ubyte v{{[0-9]+}}, off
; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:7
; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:6
; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:5
; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4
; GCN: buffer_store_byte v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
; GCN-NO-TONGA: buffer_load_ubyte
; GCN-NO-TONGA: buffer_load_ubyte
; GCN-NO-TONGA: buffer_load_ubyte
; GCN-NO-TONGA: buffer_load_ubyte
; GCN-TONGA: buffer_load_dword
; GCN: buffer_store_dword v{{[0-9]+}}, off
; VI: s_load_dword [[VEC:s[0-9]+]]
; VI: s_load_dword [[IDX:s[0-9]]]
; VI-DAG: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
; VI-DAG: s_lshl_b32 [[MASK:s[0-9]+]], 0xffff, [[SCALED_IDX]]
; VI-DAG: v_mov_b32_e32 [[V_VEC:v[0-9]+]], [[VEC]]
; VI: v_bfi_b32 [[BFI:v[0-9]+]], [[MASK]], 5, [[V_VEC]]
define amdgpu_kernel void @dynamic_insertelement_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, i32 %b) nounwind {
%vecins = insertelement <4 x i8> %a, i8 5, i32 %b
store <4 x i8> %vecins, <4 x i8> addrspace(1)* %out, align 4
@ -271,6 +248,19 @@ define amdgpu_kernel void @dynamic_insertelement_v4i8(<4 x i8> addrspace(1)* %ou
}
; GCN-LABEL: {{^}}dynamic_insertelement_v8i8:
; VI: s_load_dwordx2 [[VEC:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
; VI: s_load_dword [[IDX:s[0-9]]]
; VI-DAG: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
; VI-DAG: s_mov_b32 s[[MASK_HI:[0-9]+]], 0
; VI-DAG: s_mov_b32 s[[MASK_LO:[0-9]+]], 0xffff
; VI: s_lshl_b64 s{{\[}}[[MASK_SHIFT_LO:[0-9]+]]:[[MASK_SHIFT_HI:[0-9]+]]{{\]}}, s{{\[}}[[MASK_LO]]:[[MASK_HI]]{{\]}}, [[SCALED_IDX]]
; VI: s_not_b64 [[NOT_MASK:s\[[0-9]+:[0-9]+\]]], s{{\[}}[[MASK_SHIFT_LO]]:[[MASK_SHIFT_HI]]{{\]}}
; VI: s_and_b64 [[AND:s\[[0-9]+:[0-9]+\]]], [[NOT_MASK]], [[VEC]]
; VI: s_and_b32 s[[INS:[0-9]+]], s[[MASK_SHIFT_LO]], 5
; VI: s_or_b64 s{{\[}}[[RESULT0:[0-9]+]]:[[RESULT1:[0-9]+]]{{\]}}, s{{\[}}[[INS]]:[[MASK_HI]]{{\]}}, [[AND]]
; VI: v_mov_b32_e32 v[[V_RESULT0:[0-9]+]], s[[RESULT0]]
; VI: v_mov_b32_e32 v[[V_RESULT1:[0-9]+]], s[[RESULT1]]
; VI: buffer_store_dwordx2 v{{\[}}[[V_RESULT0]]:[[V_RESULT1]]{{\]}}
define amdgpu_kernel void @dynamic_insertelement_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> %a, i32 %b) nounwind {
%vecins = insertelement <8 x i8> %a, i8 5, i32 %b
store <8 x i8> %vecins, <8 x i8> addrspace(1)* %out, align 8
@ -278,6 +268,42 @@ define amdgpu_kernel void @dynamic_insertelement_v8i8(<8 x i8> addrspace(1)* %ou
}
; GCN-LABEL: {{^}}dynamic_insertelement_v16i8:
; GCN: buffer_load_ubyte
; GCN: buffer_load_ubyte
; GCN: buffer_load_ubyte
; GCN: buffer_load_ubyte
; GCN: buffer_load_ubyte
; GCN: buffer_load_ubyte
; GCN: buffer_load_ubyte
; GCN: buffer_load_ubyte
; GCN: buffer_load_ubyte
; GCN: buffer_load_ubyte
; GCN: buffer_load_ubyte
; GCN: buffer_load_ubyte
; GCN: buffer_load_ubyte
; GCN: buffer_load_ubyte
; GCN: buffer_load_ubyte
; GCN: buffer_load_ubyte
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: buffer_store_dwordx4
define amdgpu_kernel void @dynamic_insertelement_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> %a, i32 %b) nounwind {
%vecins = insertelement <16 x i8> %a, i8 5, i32 %b
store <16 x i8> %vecins, <16 x i8> addrspace(1)* %out, align 16