diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index cf1583d02aad..3bf434946c67 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -7543,6 +7543,38 @@ static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1,
   }
 }
 
+/// \brief Try to lower as a blend of elements from two inputs followed by
+/// a single-input permutation.
+///
+/// This matches the pattern where we can blend elements from two inputs and
+/// then reduce the shuffle to a single-input permutation.
+static SDValue lowerVectorShuffleAsBlendAndPermute(SDLoc DL, MVT VT, SDValue V1,
+                                                   SDValue V2,
+                                                   ArrayRef<int> Mask,
+                                                   SelectionDAG &DAG) {
+  // We build up the blend mask while checking whether a blend is a viable way
+  // to reduce the shuffle.
+  SmallVector<int, 32> BlendMask(Mask.size(), -1);
+  SmallVector<int, 32> PermuteMask(Mask.size(), -1);
+
+  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
+    if (Mask[i] < 0)
+      continue;
+
+    assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
+
+    if (BlendMask[Mask[i] % Size] == -1)
+      BlendMask[Mask[i] % Size] = Mask[i];
+    else if (BlendMask[Mask[i] % Size] != Mask[i])
+      return SDValue(); // Can't blend in the needed input!
+
+    PermuteMask[i] = Mask[i] % Size;
+  }
+
+  SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
+  return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
+}
+
 /// \brief Generic routine to lower a shuffle and blend as a decomposed set of
 /// unblended shuffles followed by an unshuffled blend.
 ///
@@ -8552,6 +8584,24 @@ static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                      DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
 }
 
+/// \brief Test whether this can be lowered with a single SHUFPS instruction.
+///
+/// This is used to disable more specialized lowerings when the shufps lowering
+/// will happen to be efficient.
+static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
+  // This routine only handles 128-bit shufps.
+  assert(Mask.size() == 4 && "Unsupported mask size!");
+
+  // To lower with a single SHUFPS we need to have the low half and high half
+  // each requiring a single input.
+  if (Mask[0] != -1 && Mask[1] != -1 && (Mask[0] < 4 != Mask[1] < 4))
+    return false;
+  if (Mask[2] != -1 && Mask[3] != -1 && (Mask[2] < 4 != Mask[3] < 4))
+    return false;
+
+  return true;
+}
+
 /// \brief Lower a vector shuffle using the SHUFPS instruction.
 ///
 /// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
@@ -8712,6 +8762,11 @@ static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
     // Use INSERTPS if we can complete the shuffle efficiently.
    if (SDValue V = lowerVectorShuffleAsInsertPS(Op, V1, V2, Mask, DAG))
       return V;
+
+    if (!isSingleSHUFPSMask(Mask))
+      if (SDValue BlendPerm = lowerVectorShuffleAsBlendAndPermute(
+              DL, MVT::v4f32, V1, V2, Mask, DAG))
+        return BlendPerm;
   }
 
   // Otherwise fall back to a SHUFPS lowering strategy.
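
For readers skimming the patch, here is a minimal standalone sketch of the decomposition that lowerVectorShuffleAsBlendAndPermute performs, with the SelectionDAG plumbing stripped out. The decomposeBlendAndPermute helper and the main() driver are illustrative inventions, not part of LLVM; the sample mask <2,5,0,7> is read off the blendps+shufps sequence the updated combine_nested_undef_test16 checks below expect.

#include <cassert>
#include <cstdio>
#include <vector>

// Standalone model of the blend+permute decomposition: each demanded source
// element Mask[i] is first blended into lane Mask[i] % Size of a temporary
// vector (blends are lane-preserving), then moved to lane i by a
// single-input permute. Fails when the two inputs fight over one blend lane.
static bool decomposeBlendAndPermute(const std::vector<int> &Mask,
                                     std::vector<int> &BlendMask,
                                     std::vector<int> &PermuteMask) {
  int Size = (int)Mask.size();
  BlendMask.assign(Size, -1);
  PermuteMask.assign(Size, -1);
  for (int i = 0; i < Size; ++i) {
    if (Mask[i] < 0)
      continue; // Undef lanes constrain nothing.
    assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
    int Lane = Mask[i] % Size;
    if (BlendMask[Lane] == -1)
      BlendMask[Lane] = Mask[i];
    else if (BlendMask[Lane] != Mask[i])
      return false; // Both inputs demanded in the same blend lane.
    PermuteMask[i] = Lane; // The permute moves it to its final lane.
  }
  return true;
}

int main() {
  // <2,5,0,7> takes elements 2 and 0 of V1 and 5 and 7 of V2. It decomposes
  // into BlendMask <0,5,2,7> (a lane-preserving two-input blend) followed by
  // PermuteMask <2,1,0,3> (a single-input permute).
  std::vector<int> Blend, Permute;
  if (decomposeBlendAndPermute({2, 5, 0, 7}, Blend, Permute))
    for (int i = 0; i < 4; ++i)
      std::printf("lane %d: blend=%2d permute=%d\n", i, Blend[i], Permute[i]);
  return 0;
}
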
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
index 45d05a32069e..7c237651d522 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
@@ -295,11 +295,11 @@ define <8 x float> @shuffle_v8f32_08192a3b(<8 x float> %a, <8 x float> %b) {
 define <8 x float> @shuffle_v8f32_08991abb(<8 x float> %a, <8 x float> %b) {
 ; AVX1-LABEL: shuffle_v8f32_08991abb:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vshufps {{.*#+}} xmm2 = xmm0[1,0],xmm1[2,0]
-; AVX1-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[3,3]
-; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
-; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,1]
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    vshufps {{.*#+}} xmm2 = xmm0[0,0],xmm1[0,0]
+; AVX1-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[1,1]
+; AVX1-NEXT:    vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,2,3,3]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: shuffle_v8f32_08991abb:
@@ -1133,11 +1133,11 @@ define <8 x i32> @shuffle_v8i32_08192a3b(<8 x i32> %a, <8 x i32> %b) {
 define <8 x i32> @shuffle_v8i32_08991abb(<8 x i32> %a, <8 x i32> %b) {
 ; AVX1-LABEL: shuffle_v8i32_08991abb:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vshufps {{.*#+}} xmm2 = xmm0[1,0],xmm1[2,0]
-; AVX1-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[3,3]
-; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
-; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,1]
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    vshufps {{.*#+}} xmm2 = xmm0[0,0],xmm1[0,0]
+; AVX1-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[1,1]
+; AVX1-NEXT:    vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,2,3,3]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: shuffle_v8i32_08991abb:
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
index 67da5b84ebdb..613f7c66794e 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
@@ -875,16 +875,28 @@ define <4 x i32> @combine_nested_undef_test15(<4 x i32> %A, <4 x i32> %B) {
 }
 
 define <4 x i32> @combine_nested_undef_test16(<4 x i32> %A, <4 x i32> %B) {
-; SSE-LABEL: combine_nested_undef_test16:
-; SSE:       # BB#0:
-; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,3]
-; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
-; SSE-NEXT:    retq
+; SSE2-LABEL: combine_nested_undef_test16:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,3]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: combine_nested_undef_test16:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,3]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: combine_nested_undef_test16:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,1,0,3]
+; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: combine_nested_undef_test16:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,3]
-; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,1,0,3]
 ; AVX-NEXT:    retq
   %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> 
   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> 
@@ -892,16 +904,28 @@ define <4 x i32> @combine_nested_undef_test16(<4 x i32> %A, <4 x i32> %B) {
 }
 
 define <4 x i32> @combine_nested_undef_test17(<4 x i32> %A, <4 x i32> %B) {
-; SSE-LABEL: combine_nested_undef_test17:
-; SSE:       # BB#0:
-; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[1,0]
-; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[0,2]
-; SSE-NEXT:    retq
+; SSE2-LABEL: combine_nested_undef_test17:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[1,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[0,2]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: combine_nested_undef_test17:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[1,0]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[0,2]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: combine_nested_undef_test17:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,0,1]
+; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: combine_nested_undef_test17:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[1,0]
-; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[0,2]
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,0,1]
 ; AVX-NEXT:    retq
   %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> 
   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> 
@@ -924,17 +948,30 @@ define <4 x i32> @combine_nested_undef_test18(<4 x i32> %A, <4 x i32> %B) {
 }
 
 define <4 x i32> @combine_nested_undef_test19(<4 x i32> %A, <4 x i32> %B) {
-; SSE-LABEL: combine_nested_undef_test19:
-; SSE:       # BB#0:
-; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0]
-; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,0]
-; SSE-NEXT:    movaps %xmm1, %xmm0
-; SSE-NEXT:    retq
+; SSE2-LABEL: combine_nested_undef_test19:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,0]
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: combine_nested_undef_test19:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,0]
+; SSSE3-NEXT:    movaps %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: combine_nested_undef_test19:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0,0,0]
+; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: combine_nested_undef_test19:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0]
-; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[0,2],xmm0[0,0]
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,0,0,0]
 ; AVX-NEXT:    retq
   %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> 
   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> 
@@ -942,17 +979,30 @@ define <4 x i32> @combine_nested_undef_test19(<4 x i32> %A, <4 x i32> %B) {
 }
 
 define <4 x i32> @combine_nested_undef_test20(<4 x i32> %A, <4 x i32> %B) {
-; SSE-LABEL: combine_nested_undef_test20:
-; SSE:       # BB#0:
-; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,3]
-; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2,3,1]
-; SSE-NEXT:    movaps %xmm1, %xmm0
-; SSE-NEXT:    retq
+; SSE2-LABEL: combine_nested_undef_test20:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,3]
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2,3,1]
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: combine_nested_undef_test20:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,3]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2,3,1]
+; SSSE3-NEXT:    movaps %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: combine_nested_undef_test20:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,3,0]
+; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: combine_nested_undef_test20:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[0,0],xmm0[2,3]
-; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2,3,1]
+; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,3,0]
 ; AVX-NEXT:    retq
   %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> 
   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> 
@@ -960,17 +1010,30 @@ define <4 x i32> @combine_nested_undef_test20(<4 x i32> %A, <4 x i32> %B) {
 }
 
 define <4 x i32> @combine_nested_undef_test21(<4 x i32> %A, <4 x i32> %B) {
-; SSE-LABEL: combine_nested_undef_test21:
-; SSE:       # BB#0:
-; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[1,1]
-; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2,1,3]
-; SSE-NEXT:    movaps %xmm1, %xmm0
-; SSE-NEXT:    retq
+; SSE2-LABEL: combine_nested_undef_test21:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[1,1]
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2,1,3]
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: combine_nested_undef_test21:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[1,1]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2,1,3]
+; SSSE3-NEXT:    movaps %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: combine_nested_undef_test21:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
+; SSE41-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
+; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: combine_nested_undef_test21:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[0,0],xmm0[1,1]
-; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
+; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
 ; AVX-NEXT:    retq
   %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> 
   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> 
@@ -1599,16 +1662,28 @@ define <4 x float> @combine_test2b(<4 x float> %a, <4 x float> %b) {
 }
 
 define <4 x float> @combine_test3b(<4 x float> %a, <4 x float> %b) {
-; SSE-LABEL: combine_test3b:
-; SSE:       # BB#0:
-; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0]
-; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
-; SSE-NEXT:    retq
+; SSE2-LABEL: combine_test3b:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: combine_test3b:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: combine_test3b:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,3,2,3]
+; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: combine_test3b:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0]
-; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
+; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,3,2,3]
 ; AVX-NEXT:    retq
   %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> 
   %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> 
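
To see why the new lowering is gated on !isSingleSHUFPSMask(Mask): when one SHUFPS already covers the whole mask, a blend followed by a permute would replace a single shuffle with two instructions. Below is a minimal standalone model of that predicate; std::array stands in for ArrayRef, and the main() driver with its sample masks is an illustrative assumption, not part of the patch.

#include <array>
#include <cstdio>

// Standalone model of isSingleSHUFPSMask: SHUFPS fills the two low lanes of
// the result from one source and the two high lanes from the other, so a
// 4-lane mask is a single-SHUFPS candidate exactly when neither half mixes
// elements of the two inputs (lanes 0-3 name V1, lanes 4-7 name V2, -1 is
// undef and constrains nothing).
static bool isSingleSHUFPSMask(const std::array<int, 4> &Mask) {
  if (Mask[0] != -1 && Mask[1] != -1 && (Mask[0] < 4) != (Mask[1] < 4))
    return false;
  if (Mask[2] != -1 && Mask[3] != -1 && (Mask[2] < 4) != (Mask[3] < 4))
    return false;
  return true;
}

int main() {
  // <0,1,4,5> keeps each half within one input, so one SHUFPS suffices and
  // the blend+permute path is skipped. <2,5,0,7> mixes the inputs in both
  // halves, so the new lowering is allowed to fire (prints "1 0").
  std::printf("%d %d\n", (int)isSingleSHUFPSMask({0, 1, 4, 5}),
              (int)isSingleSHUFPSMask({2, 5, 0, 7}));
  return 0;
}
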