[X86] Don't fold non-LSB extracts into truncating broadcasts.
We used to incorrectly assume that the offset we're extracting from was a multiple of the element size. So, we'd fold:

    (v8i16 (shufflevector (v8i16 (bitcast (v4i32 (build_vector X, Y, ...)))), <1,1,...,1>))

into:

    (v8i16 (vbroadcast (i16 (trunc Y))))

whereas we should have extracted the higher bits from X.

Instead, bail out if the assumption doesn't hold.

llvm-svn: 252361
This commit is contained in:
parent
b126f6b6c8
commit
68614a36d1
|
@ -7861,6 +7861,54 @@ static SDValue lowerVectorShuffleAsElementInsertion(
|
|||
return V2;
|
||||
}
|
||||
|
||||
/// \brief Try to lower broadcast of a single - truncated - integer element,
|
||||
/// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.
|
||||
///
|
||||
/// This assumes we have AVX2.
|
||||
static SDValue lowerVectorShuffleAsTruncBroadcast(SDLoc DL, MVT VT, SDValue V0,
|
||||
int BroadcastIdx,
|
||||
const X86Subtarget *Subtarget,
|
||||
SelectionDAG &DAG) {
|
||||
assert(Subtarget->hasAVX2() &&
|
||||
"We can only lower integer broadcasts with AVX2!");
|
||||
|
||||
EVT EltVT = VT.getVectorElementType();
|
||||
EVT V0VT = V0.getValueType();
|
||||
|
||||
assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
|
||||
assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");
|
||||
|
||||
EVT V0EltVT = V0VT.getVectorElementType();
|
||||
if (!V0EltVT.isInteger())
|
||||
return SDValue();
|
||||
|
||||
const unsigned EltSize = EltVT.getSizeInBits();
|
||||
const unsigned V0EltSize = V0EltVT.getSizeInBits();
|
||||
|
||||
// This is only a truncation if the original element type is larger.
|
||||
if (V0EltSize <= EltSize)
|
||||
return SDValue();
|
||||
|
||||
assert(((V0EltSize % EltSize) == 0) &&
|
||||
"Scalar type sizes must all be powers of 2 on x86!");
|
||||
|
||||
const unsigned V0Opc = V0.getOpcode();
|
||||
const unsigned Scale = V0EltSize / EltSize;
|
||||
const unsigned V0BroadcastIdx = BroadcastIdx / Scale;
|
||||
|
||||
// If we're extracting non-least-significant bits, this isn't a truncation.
|
||||
if (BroadcastIdx % Scale)
|
||||
return SDValue();
|
||||
|
||||
if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
|
||||
V0Opc != ISD::BUILD_VECTOR)
|
||||
return SDValue();
|
||||
|
||||
SDValue Scalar = V0.getOperand(V0BroadcastIdx);
|
||||
return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
|
||||
DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
|
||||
}
|
||||
|
||||
/// \brief Try to lower broadcast of a single element.
|
||||
///
|
||||
/// For convenience, this code also bundles all of the subtarget feature set
|
||||
|
@ -7924,18 +7972,10 @@ static SDValue lowerVectorShuffleAsBroadcast(SDLoc DL, MVT VT, SDValue V,
|
|||
// First, look through bitcast: if the original value has a larger element
|
||||
// type than the shuffle, the broadcast element is in essence truncated.
|
||||
// Make that explicit to ease folding.
|
||||
if (V.getOpcode() == ISD::BITCAST && VT.isInteger()) {
|
||||
MVT EltVT = VT.getVectorElementType();
|
||||
SDValue V0 = V.getOperand(0);
|
||||
MVT V0VT = V0.getSimpleValueType();
|
||||
|
||||
if (V0VT.isInteger() && V0VT.getVectorElementType().bitsGT(EltVT) &&
|
||||
((V0.getOpcode() == ISD::BUILD_VECTOR ||
|
||||
(V0.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)))) {
|
||||
V = DAG.getNode(ISD::TRUNCATE, DL, EltVT, V0.getOperand(BroadcastIdx));
|
||||
BroadcastIdx = 0;
|
||||
}
|
||||
}
|
||||
if (V.getOpcode() == ISD::BITCAST && VT.isInteger())
|
||||
if (SDValue TruncBroadcast = lowerVectorShuffleAsTruncBroadcast(
|
||||
DL, VT, V.getOperand(0), BroadcastIdx, Subtarget, DAG))
|
||||
return TruncBroadcast;
|
||||
|
||||
// Also check the simpler case, where we can directly reuse the scalar.
|
||||
if (V.getOpcode() == ISD::BUILD_VECTOR ||
|
||||
|
|
|
@ -1459,3 +1459,149 @@ define <16 x i8> @insert_dup_mem_v16i8_sext_i8(i8* %ptr) {
|
|||
%tmp4 = shufflevector <16 x i8> %tmp3, <16 x i8> undef, <16 x i32> zeroinitializer
|
||||
ret <16 x i8> %tmp4
|
||||
}
|
||||
|
||||
; Splat byte 1 of the <16 x i8> view of a <4 x i32> whose element 0 is an i32
; loaded from %ptr. Byte 1 is not the lowest byte of that i32.
define <16 x i8> @insert_dup_elt1_mem_v16i8_i32(i32* %ptr) {
; SSE2-LABEL: insert_dup_elt1_mem_v16i8_i32:
; SSE2: # BB#0:
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: insert_dup_elt1_mem_v16i8_i32:
; SSSE3: # BB#0:
; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: insert_dup_elt1_mem_v16i8_i32:
; SSE41: # BB#0:
; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; SSE41-NEXT: retq
;
; AVX-LABEL: insert_dup_elt1_mem_v16i8_i32:
; AVX: # BB#0:
; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX-NEXT: retq
  %val = load i32, i32* %ptr, align 4
  %ins = insertelement <4 x i32> zeroinitializer, i32 %val, i32 0
  %bytes = bitcast <4 x i32> %ins to <16 x i8>
  %splat = shufflevector <16 x i8> %bytes, <16 x i8> undef, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  ret <16 x i8> %splat
}
|
||||
|
||||
; Splat byte 2 of the <16 x i8> view of a <4 x i32> whose element 0 is an i32
; loaded from %ptr. Byte 2 is not the lowest byte of that i32.
define <16 x i8> @insert_dup_elt2_mem_v16i8_i32(i32* %ptr) {
; SSE2-LABEL: insert_dup_elt2_mem_v16i8_i32:
; SSE2: # BB#0:
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: insert_dup_elt2_mem_v16i8_i32:
; SSSE3: # BB#0:
; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: insert_dup_elt2_mem_v16i8_i32:
; SSE41: # BB#0:
; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
; SSE41-NEXT: retq
;
; AVX-LABEL: insert_dup_elt2_mem_v16i8_i32:
; AVX: # BB#0:
; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
; AVX-NEXT: retq
  %val = load i32, i32* %ptr, align 4
  %ins = insertelement <4 x i32> zeroinitializer, i32 %val, i32 0
  %bytes = bitcast <4 x i32> %ins to <16 x i8>
  %splat = shufflevector <16 x i8> %bytes, <16 x i8> undef, <16 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  ret <16 x i8> %splat
}
|
||||
|
||||
; Splat byte 1 of the <16 x i8> view of a <4 x i32> whose element 0 is an i8
; loaded from %ptr and sign-extended to i32.
define <16 x i8> @insert_dup_elt1_mem_v16i8_sext_i8(i8* %ptr) {
; SSE2-LABEL: insert_dup_elt1_mem_v16i8_sext_i8:
; SSE2: # BB#0:
; SSE2-NEXT: movsbl (%rdi), %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: insert_dup_elt1_mem_v16i8_sext_i8:
; SSSE3: # BB#0:
; SSSE3-NEXT: movsbl (%rdi), %eax
; SSSE3-NEXT: movd %eax, %xmm0
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: insert_dup_elt1_mem_v16i8_sext_i8:
; SSE41: # BB#0:
; SSE41-NEXT: movsbl (%rdi), %eax
; SSE41-NEXT: movd %eax, %xmm0
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; SSE41-NEXT: retq
;
; AVX-LABEL: insert_dup_elt1_mem_v16i8_sext_i8:
; AVX: # BB#0:
; AVX-NEXT: movsbl (%rdi), %eax
; AVX-NEXT: vmovd %eax, %xmm0
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX-NEXT: retq
  %val = load i8, i8* %ptr, align 1
  %ext = sext i8 %val to i32
  %ins = insertelement <4 x i32> zeroinitializer, i32 %ext, i32 0
  %bytes = bitcast <4 x i32> %ins to <16 x i8>
  %splat = shufflevector <16 x i8> %bytes, <16 x i8> undef, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  ret <16 x i8> %splat
}
|
||||
|
||||
; Splat byte 2 of the <16 x i8> view of a <4 x i32> whose element 0 is an i8
; loaded from %ptr and sign-extended to i32.
define <16 x i8> @insert_dup_elt2_mem_v16i8_sext_i8(i8* %ptr) {
; SSE2-LABEL: insert_dup_elt2_mem_v16i8_sext_i8:
; SSE2: # BB#0:
; SSE2-NEXT: movsbl (%rdi), %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: insert_dup_elt2_mem_v16i8_sext_i8:
; SSSE3: # BB#0:
; SSSE3-NEXT: movsbl (%rdi), %eax
; SSSE3-NEXT: movd %eax, %xmm0
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: insert_dup_elt2_mem_v16i8_sext_i8:
; SSE41: # BB#0:
; SSE41-NEXT: movsbl (%rdi), %eax
; SSE41-NEXT: movd %eax, %xmm0
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
; SSE41-NEXT: retq
;
; AVX-LABEL: insert_dup_elt2_mem_v16i8_sext_i8:
; AVX: # BB#0:
; AVX-NEXT: movsbl (%rdi), %eax
; AVX-NEXT: vmovd %eax, %xmm0
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
; AVX-NEXT: retq
  %val = load i8, i8* %ptr, align 1
  %ext = sext i8 %val to i32
  %ins = insertelement <4 x i32> zeroinitializer, i32 %ext, i32 0
  %bytes = bitcast <4 x i32> %ins to <16 x i8>
  %splat = shufflevector <16 x i8> %bytes, <16 x i8> undef, <16 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  ret <16 x i8> %splat
}
|
||||
|
|
|
@ -2228,3 +2228,145 @@ define <8 x i16> @insert_dup_mem_v8i16_sext_i16(i16* %ptr) {
|
|||
%tmp4 = shufflevector <8 x i16> %tmp3, <8 x i16> undef, <8 x i32> zeroinitializer
|
||||
ret <8 x i16> %tmp4
|
||||
}
|
||||
|
||||
; Splat i16 element 1 of the <8 x i16> view of a <4 x i32> whose element 0 is
; an i32 loaded from %ptr. Element 1 is the upper half of that i32.
define <8 x i16> @insert_dup_elt1_mem_v8i16_i32(i32* %ptr) {
; SSE2-LABEL: insert_dup_elt1_mem_v8i16_i32:
; SSE2: # BB#0:
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: insert_dup_elt1_mem_v8i16_i32:
; SSSE3: # BB#0:
; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: insert_dup_elt1_mem_v8i16_i32:
; SSE41: # BB#0:
; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3]
; SSE41-NEXT: retq
;
; AVX-LABEL: insert_dup_elt1_mem_v8i16_i32:
; AVX: # BB#0:
; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3]
; AVX-NEXT: retq
  %val = load i32, i32* %ptr, align 4
  %ins = insertelement <4 x i32> zeroinitializer, i32 %val, i32 0
  %halves = bitcast <4 x i32> %ins to <8 x i16>
  %splat = shufflevector <8 x i16> %halves, <8 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  ret <8 x i16> %splat
}
|
||||
|
||||
; Splat i16 element 3 of the <8 x i16> view of a <4 x i32> whose element 1 is
; an i32 loaded from %ptr. Element 3 is the upper half of that i32.
define <8 x i16> @insert_dup_elt3_mem_v8i16_i32(i32* %ptr) {
; SSE2-LABEL: insert_dup_elt3_mem_v8i16_i32:
; SSE2: # BB#0:
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,0]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: insert_dup_elt3_mem_v8i16_i32:
; SSSE3: # BB#0:
; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: insert_dup_elt3_mem_v8i16_i32:
; SSE41: # BB#0:
; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3]
; SSE41-NEXT: retq
;
; AVX-LABEL: insert_dup_elt3_mem_v8i16_i32:
; AVX: # BB#0:
; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3]
; AVX-NEXT: retq
  %val = load i32, i32* %ptr, align 4
  %ins = insertelement <4 x i32> zeroinitializer, i32 %val, i32 1
  %halves = bitcast <4 x i32> %ins to <8 x i16>
  %splat = shufflevector <8 x i16> %halves, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
  ret <8 x i16> %splat
}
|
||||
|
||||
; Splat i16 element 1 of the <8 x i16> view of a <4 x i32> whose element 0 is
; an i16 loaded from %ptr and sign-extended to i32.
define <8 x i16> @insert_dup_elt1_mem_v8i16_sext_i16(i16* %ptr) {
; SSE2-LABEL: insert_dup_elt1_mem_v8i16_sext_i16:
; SSE2: # BB#0:
; SSE2-NEXT: movswl (%rdi), %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: insert_dup_elt1_mem_v8i16_sext_i16:
; SSSE3: # BB#0:
; SSSE3-NEXT: movswl (%rdi), %eax
; SSSE3-NEXT: movd %eax, %xmm0
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: insert_dup_elt1_mem_v8i16_sext_i16:
; SSE41: # BB#0:
; SSE41-NEXT: movswl (%rdi), %eax
; SSE41-NEXT: movd %eax, %xmm0
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3]
; SSE41-NEXT: retq
;
; AVX-LABEL: insert_dup_elt1_mem_v8i16_sext_i16:
; AVX: # BB#0:
; AVX-NEXT: movswl (%rdi), %eax
; AVX-NEXT: vmovd %eax, %xmm0
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3]
; AVX-NEXT: retq
  %val = load i16, i16* %ptr, align 2
  %ext = sext i16 %val to i32
  %ins = insertelement <4 x i32> zeroinitializer, i32 %ext, i32 0
  %halves = bitcast <4 x i32> %ins to <8 x i16>
  %splat = shufflevector <8 x i16> %halves, <8 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  ret <8 x i16> %splat
}
|
||||
|
||||
; Splat i16 element 3 of the <8 x i16> view of a <4 x i32> whose element 1 is
; an i16 loaded from %ptr and sign-extended to i32.
define <8 x i16> @insert_dup_elt3_mem_v8i16_sext_i16(i16* %ptr) {
; SSE2-LABEL: insert_dup_elt3_mem_v8i16_sext_i16:
; SSE2: # BB#0:
; SSE2-NEXT: movswl (%rdi), %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,0]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: insert_dup_elt3_mem_v8i16_sext_i16:
; SSSE3: # BB#0:
; SSSE3-NEXT: movswl (%rdi), %eax
; SSSE3-NEXT: movd %eax, %xmm0
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: insert_dup_elt3_mem_v8i16_sext_i16:
; SSE41: # BB#0:
; SSE41-NEXT: movswl (%rdi), %eax
; SSE41-NEXT: movd %eax, %xmm0
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3]
; SSE41-NEXT: retq
;
; AVX-LABEL: insert_dup_elt3_mem_v8i16_sext_i16:
; AVX: # BB#0:
; AVX-NEXT: movswl (%rdi), %eax
; AVX-NEXT: vmovd %eax, %xmm0
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3]
; AVX-NEXT: retq
  %val = load i16, i16* %ptr, align 2
  %ext = sext i16 %val to i32
  %ins = insertelement <4 x i32> zeroinitializer, i32 %ext, i32 1
  %halves = bitcast <4 x i32> %ins to <8 x i16>
  %splat = shufflevector <8 x i16> %halves, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
  ret <8 x i16> %splat
}
|
||||
|
|
|
@ -3331,3 +3331,45 @@ define <16 x i16> @insert_dup_mem_v16i16_sext_i16(i16* %ptr) {
|
|||
%tmp4 = shufflevector <8 x i16> %tmp3, <8 x i16> undef, <16 x i32> zeroinitializer
|
||||
ret <16 x i16> %tmp4
|
||||
}
|
||||
|
||||
; Splat i16 element 1 of the <8 x i16> view of a <4 x i32> (element 0 is an
; i32 loaded from %ptr) across a <16 x i16> result.
define <16 x i16> @insert_dup_elt1_mem_v16i16_i32(i32* %ptr) #0 {
; AVX1-LABEL: insert_dup_elt1_mem_v16i16_i32:
; AVX1: # BB#0:
; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: insert_dup_elt1_mem_v16i16_i32:
; AVX2: # BB#0:
; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3]
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX2-NEXT: retq
  %val = load i32, i32* %ptr, align 4
  %ins = insertelement <4 x i32> zeroinitializer, i32 %val, i32 0
  %halves = bitcast <4 x i32> %ins to <8 x i16>
  %splat = shufflevector <8 x i16> %halves, <8 x i16> undef, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  ret <16 x i16> %splat
}
|
||||
|
||||
; Splat i16 element 3 of the <8 x i16> view of a <4 x i32> (element 1 is an
; i32 loaded from %ptr) across a <16 x i16> result.
define <16 x i16> @insert_dup_elt3_mem_v16i16_i32(i32* %ptr) #0 {
; AVX1-LABEL: insert_dup_elt3_mem_v16i16_i32:
; AVX1: # BB#0:
; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: insert_dup_elt3_mem_v16i16_i32:
; AVX2: # BB#0:
; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3]
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX2-NEXT: retq
  %val = load i32, i32* %ptr, align 4
  %ins = insertelement <4 x i32> zeroinitializer, i32 %val, i32 1
  %halves = bitcast <4 x i32> %ins to <8 x i16>
  %splat = shufflevector <8 x i16> %halves, <8 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
  ret <16 x i16> %splat
}
|
||||
|
|
|
@ -2018,3 +2018,69 @@ define <32 x i8> @insert_dup_mem_v32i8_sext_i8(i8* %ptr) {
|
|||
%tmp4 = shufflevector <16 x i8> %tmp3, <16 x i8> undef, <32 x i32> zeroinitializer
|
||||
ret <32 x i8> %tmp4
|
||||
}
|
||||
|
||||
; Splat byte 1 of the <16 x i8> view of a <4 x i32> (element 0 is an i32
; loaded from %ptr) across a <32 x i8> result.
define <32 x i8> @insert_dup_elt1_mem_v32i8_i32(i32* %ptr) {
; AVX1-LABEL: insert_dup_elt1_mem_v32i8_i32:
; AVX1: # BB#0:
; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: insert_dup_elt1_mem_v32i8_i32:
; AVX2: # BB#0:
; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX2-NEXT: retq
  %val = load i32, i32* %ptr, align 4
  %ins = insertelement <4 x i32> zeroinitializer, i32 %val, i32 0
  %bytes = bitcast <4 x i32> %ins to <16 x i8>
  %splat = shufflevector <16 x i8> %bytes, <16 x i8> undef, <32 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  ret <32 x i8> %splat
}
|
||||
|
||||
; Splat byte 3 of the <16 x i8> view of a <4 x i32> (element 0 is an i32
; loaded from %ptr) across a <32 x i8> result.
define <32 x i8> @insert_dup_elt3_mem_v32i8_i32(i32* %ptr) {
; AVX1-LABEL: insert_dup_elt3_mem_v32i8_i32:
; AVX1: # BB#0:
; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: insert_dup_elt3_mem_v32i8_i32:
; AVX2: # BB#0:
; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX2-NEXT: retq
  %val = load i32, i32* %ptr, align 4
  %ins = insertelement <4 x i32> zeroinitializer, i32 %val, i32 0
  %bytes = bitcast <4 x i32> %ins to <16 x i8>
  %splat = shufflevector <16 x i8> %bytes, <16 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
  ret <32 x i8> %splat
}
|
||||
|
||||
; Splat byte 1 of the <16 x i8> view of a <4 x i32> (element 0 is an i8 loaded
; from %ptr and sign-extended to i32) across a <32 x i8> result.
define <32 x i8> @insert_dup_elt1_mem_v32i8_sext_i8(i8* %ptr) {
; AVX1-LABEL: insert_dup_elt1_mem_v32i8_sext_i8:
; AVX1: # BB#0:
; AVX1-NEXT: movsbl (%rdi), %eax
; AVX1-NEXT: vmovd %eax, %xmm0
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: insert_dup_elt1_mem_v32i8_sext_i8:
; AVX2: # BB#0:
; AVX2-NEXT: movsbl (%rdi), %eax
; AVX2-NEXT: vmovd %eax, %xmm0
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX2-NEXT: retq
  %val = load i8, i8* %ptr, align 1
  %ext = sext i8 %val to i32
  %ins = insertelement <4 x i32> zeroinitializer, i32 %ext, i32 0
  %bytes = bitcast <4 x i32> %ins to <16 x i8>
  %splat = shufflevector <16 x i8> %bytes, <16 x i8> undef, <32 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  ret <32 x i8> %splat
}
|
||||
|
|
Loading…
Reference in New Issue