From 90e87af303acf052b9e67345327e42b19cd54e9e Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Sat, 15 Jun 2019 18:30:43 +0000
Subject: [PATCH] [X86][AVX] Handle lane-crossing shuffle(extract_subvector(x,c1),extract_subvector(y,c2),m1) shuffles

Pull out the existing (non)lane-crossing fold into a helper lambda and
use for lane-crossing unary shuffles as well.

Fixes PR34380

llvm-svn: 363500
---
 llvm/lib/Target/X86/X86ISelLowering.cpp       | 128 +++--
 .../X86/avx512-shuffles/partial_permute.ll    | 486 ++++++++----------
 .../test/CodeGen/X86/vector-shuffle-512-v8.ll |  22 +-
 3 files changed, 321 insertions(+), 315 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 786349653a64..0cae5f1a22a3 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -31939,6 +31939,58 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
   bool MaskContainsZeros =
       any_of(Mask, [](int M) { return M == SM_SentinelZero; });

+  // Unwrap shuffle(extract_subvector(x,c1),extract_subvector(y,c2),m1) ->
+  // shuffle(x,y,m2)
+  auto CombineShuffleWithExtract =
+      [&](SDValue &NewRoot, SmallVectorImpl<int> &NewMask,
+          SmallVectorImpl<SDValue> &NewInputs) -> bool {
+    assert(NewMask.empty() && NewInputs.empty() && "Non-empty shuffle mask");
+    if (UnaryShuffle || V1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
+        V2.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
+        !isa<ConstantSDNode>(V1.getOperand(1)) ||
+        !isa<ConstantSDNode>(V2.getOperand(1)))
+      return false;
+
+    SDValue Src1 = V1.getOperand(0);
+    SDValue Src2 = V2.getOperand(0);
+    if (Src1.getValueType() != Src2.getValueType())
+      return false;
+
+    unsigned Offset1 = V1.getConstantOperandVal(1);
+    unsigned Offset2 = V2.getConstantOperandVal(1);
+    assert(((Offset1 % VT1.getVectorNumElements()) == 0 ||
+            (Offset2 % VT2.getVectorNumElements()) == 0 ||
+            (Src1.getValueSizeInBits() % RootSizeInBits) == 0) &&
+           "Unexpected subvector extraction");
+    unsigned Scale = Src1.getValueSizeInBits() / RootSizeInBits;
+
+    // Convert extraction indices to mask size.
+    Offset1 /= VT1.getVectorNumElements();
+    Offset2 /= VT2.getVectorNumElements();
+    Offset1 *= NumMaskElts;
+    Offset2 *= NumMaskElts;
+
+    NewInputs.push_back(Src1);
+    if (Src1 != Src2) {
+      NewInputs.push_back(Src2);
+      Offset2 += Scale * NumMaskElts;
+    }
+
+    // Create new mask for larger type.
+    NewMask.append(Mask.begin(), Mask.end());
+    for (int &M : NewMask) {
+      if (M < 0)
+        continue;
+      if (M < (int)NumMaskElts)
+        M += Offset1;
+      else
+        M = (M - NumMaskElts) + Offset2;
+    }
+    NewMask.append((Scale - 1) * NumMaskElts, SM_SentinelUndef);
+    NewRoot = Src1;
+    return true;
+  };
+
   if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
     // If we have a single input lane-crossing shuffle then lower to VPERMV.
     if (UnaryShuffle && AllowVariableMask && !MaskContainsZeros &&
@@ -31982,6 +32034,21 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
       return DAG.getBitcast(RootVT, Res);
     }

+    // If that failed and both inputs are extracted from the same source type
+    // then try to combine as an unary shuffle with the larger type.
+    SDValue NewRoot;
+    SmallVector<int, 64> NewMask;
+    SmallVector<SDValue, 2> NewInputs;
+    if (CombineShuffleWithExtract(NewRoot, NewMask, NewInputs)) {
+      if (SDValue Res = combineX86ShuffleChain(
+              NewInputs, NewRoot, NewMask, Depth, HasVariableMask,
+              AllowVariableMask, DAG, Subtarget)) {
+        Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT1, Res,
+                          DAG.getIntPtrConstant(0, DL));
+        return DAG.getBitcast(RootVT, Res);
+      }
+    }
+
     // If we have a dual input lane-crossing shuffle then lower to VPERMV3.
     if (AllowVariableMask && !MaskContainsZeros &&
         ((Subtarget.hasAVX512() &&
@@ -32147,55 +32214,18 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
     return DAG.getBitcast(RootVT, Res);
   }

-  // If that failed and both inputs are extracted from the same source then
-  // try to combine as an unary shuffle with the larger type.
-  if (!UnaryShuffle && V1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
-      V2.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
-      isa<ConstantSDNode>(V1.getOperand(1)) &&
-      isa<ConstantSDNode>(V2.getOperand(1))) {
-    SDValue Src1 = V1.getOperand(0);
-    SDValue Src2 = V2.getOperand(0);
-    if (Src1.getValueType() == Src2.getValueType()) {
-      unsigned Offset1 = V1.getConstantOperandVal(1);
-      unsigned Offset2 = V2.getConstantOperandVal(1);
-      assert(((Offset1 % VT1.getVectorNumElements()) == 0 ||
-              (Offset2 % VT2.getVectorNumElements()) == 0 ||
-              (Src1.getValueSizeInBits() % RootSizeInBits) == 0) &&
-             "Unexpected subvector extraction");
-      unsigned Scale = Src1.getValueSizeInBits() / RootSizeInBits;
-
-      // Convert extraction indices to mask size.
-      Offset1 /= VT1.getVectorNumElements();
-      Offset2 /= VT2.getVectorNumElements();
-      Offset1 *= NumMaskElts;
-      Offset2 *= NumMaskElts;
-
-      SmallVector<SDValue, 2> NewInputs;
-      NewInputs.push_back(Src1);
-      if (Src1 != Src2) {
-        NewInputs.push_back(Src2);
-        Offset2 += Scale * NumMaskElts;
-      }
-
-      // Create new mask for larger type.
-      SmallVector<int, 64> NewMask(Mask);
-      for (int &M : NewMask) {
-        if (M < 0)
-          continue;
-        if (M < (int)NumMaskElts)
-          M += Offset1;
-        else
-          M = (M - NumMaskElts) + Offset2;
-      }
-      NewMask.append((Scale - 1) * NumMaskElts, SM_SentinelUndef);
-
-      if (SDValue Res = combineX86ShuffleChain(
-              NewInputs, Src1, NewMask, Depth, HasVariableMask,
-              AllowVariableMask, DAG, Subtarget)) {
-        Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT1, Res,
-                          DAG.getIntPtrConstant(0, DL));
-        return DAG.getBitcast(RootVT, Res);
-      }
+  // If that failed and both inputs are extracted from the same source type
+  // then try to combine as an unary shuffle with the larger type.
+  SDValue NewRoot;
+  SmallVector<int, 64> NewMask;
+  SmallVector<SDValue, 2> NewInputs;
+  if (CombineShuffleWithExtract(NewRoot, NewMask, NewInputs)) {
+    if (SDValue Res = combineX86ShuffleChain(NewInputs, NewRoot, NewMask, Depth,
+                                             HasVariableMask, AllowVariableMask,
+                                             DAG, Subtarget)) {
+      Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT1, Res,
+                        DAG.getIntPtrConstant(0, DL));
+      return DAG.getBitcast(RootVT, Res);
     }
   }

diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
index 685efdec1109..ec3f32381be1 100644
--- a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
+++ b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
@@ -1198,10 +1198,10 @@ define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask3(<8 x i32>* %vp, <4
 define <8 x i32> @test_16xi32_to_8xi32_perm_mask0(<16 x i32> %vec) {
 ; CHECK-LABEL: test_16xi32_to_8xi32_perm_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [9,5,3,6,15,2,9,14]
-; CHECK-NEXT: vpermi2d %ymm0, %ymm2, %ymm1
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [1,13,11,14,7,10,1,6,1,13,11,14,7,10,1,6]
+; CHECK-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
+; CHECK-NEXT: vpermd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-NEXT: retq
   %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32>
   ret <8 x i32> %res
@@ -1209,11 +1209,11 @@ define <8 x i32> @test_16xi32_to_8xi32_perm_mask0(<16 x i32> %vec) {
 define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mask0(<16 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [9,5,3,6,15,2,9,14]
-; CHECK-NEXT: vpermi2d %ymm0, %ymm3, %ymm4
+; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [1,13,11,14,7,10,1,6,1,13,11,14,7,10,1,6]
+; CHECK-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm0
 ; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1
-; CHECK-NEXT: vpblendmd %ymm4, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT: retq
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -1224,11 +1224,10 @@ define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mask0(<16 x i32> %vec, <8 x i
 define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mask0(<16 x i32> %vec, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [9,5,3,6,15,2,9,14]
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [1,13,11,14,7,10,1,6]
 ; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
-; CHECK-NEXT: vpermi2d %ymm0, %ymm3, %ymm2 {%k1} {z}
-; CHECK-NEXT: vmovdqa %ymm2, %ymm0
+; CHECK-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z}
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-NEXT: retq
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -1238,11 +1237,11 @@ define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mask0(<16 x i32> %vec, <8 x
 define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mask1(<16 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mask1:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [3,0,15,3,2,3,6,8]
-; CHECK-NEXT: vpermi2d %ymm3, %ymm0, %ymm4
+; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [3,0,15,3,2,3,6,8,3,0,15,3,2,3,6,8]
+; CHECK-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm0
 ; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1
-; CHECK-NEXT: vpblendmd %ymm4, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT: retq
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -1253,11 +1252,10 @@ define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mask1(<16 x i32> %vec, <8 x i
 define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mask1(<16 x i32> %vec, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mask1:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [3,0,15,3,2,3,6,8]
 ; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
-; CHECK-NEXT: vpermi2d %ymm3, %ymm0, %ymm2 {%k1} {z}
-; CHECK-NEXT: vmovdqa %ymm2, %ymm0
+; CHECK-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z}
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-NEXT: retq
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -1267,11 +1265,11 @@ define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mask1(<16 x i32> %vec, <8 x
 define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mask2(<16 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mask2:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [2,15,15,2,6,10,14,7]
-; CHECK-NEXT: vpermi2d %ymm3, %ymm0, %ymm4
+; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [2,15,15,2,6,10,14,7,2,15,15,2,6,10,14,7]
+; CHECK-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm0
 ; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1
-; CHECK-NEXT: vpblendmd %ymm4, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT: retq
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -1282,11 +1280,10 @@ define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mask2(<16 x i32> %vec, <8 x i
 define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mask2(<16 x i32> %vec, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mask2:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [2,15,15,2,6,10,14,7]
 ; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
-; CHECK-NEXT: vpermi2d %ymm3, %ymm0, %ymm2 {%k1} {z}
-; CHECK-NEXT: vmovdqa %ymm2, %ymm0
+; CHECK-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z}
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-NEXT: retq
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -1296,10 +1293,10 @@ define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mask2(<16 x i32> %vec, <8 x
 define <8 x i32> @test_16xi32_to_8xi32_perm_mask3(<16 x i32> %vec) {
 ; CHECK-LABEL: test_16xi32_to_8xi32_perm_mask3:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [14,5,7,7,10,3,9,3]
-; CHECK-NEXT: vpermi2d %ymm2, %ymm0, %ymm1
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [14,5,7,7,10,3,9,3,14,5,7,7,10,3,9,3]
+; CHECK-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
+; CHECK-NEXT: vpermd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-NEXT: retq
   %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32>
   ret <8 x i32> %res
@@ -1307,11 +1304,11 @@ define <8 x i32> @test_16xi32_to_8xi32_perm_mask3(<16 x i32> %vec) {
 define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mask3(<16 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mask3:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [14,5,7,7,10,3,9,3]
-; CHECK-NEXT: vpermi2d %ymm3, %ymm0, %ymm4
+; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [14,5,7,7,10,3,9,3,14,5,7,7,10,3,9,3]
+; CHECK-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm0
 ; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1
-; CHECK-NEXT: vpblendmd %ymm4, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT: retq
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -1322,11 +1319,10 @@ define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mask3(<16 x i32> %vec, <8 x i
 define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mask3(<16 x i32> %vec, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mask3:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [14,5,7,7,10,3,9,3]
 ; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
-; CHECK-NEXT: vpermi2d %ymm3, %ymm0, %ymm2 {%k1} {z}
-; CHECK-NEXT: vmovdqa %ymm2, %ymm0
+; CHECK-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z}
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-NEXT: retq
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -1336,10 +1332,9 @@ define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mask3(<16 x i32> %vec, <8 x
 define <4 x i32> @test_16xi32_to_4xi32_perm_mask0(<16 x i32> %vec) {
 ; CHECK-LABEL: test_16xi32_to_4xi32_perm_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,12,4,6,4,12]
-; CHECK-NEXT: vpermi2d %ymm2, %ymm0, %ymm1
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,12,4,6,4,12]
+; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
   %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32>
@@ -1348,11 +1343,10 @@ define <4 x i32> @test_16xi32_to_4xi32_perm_mask0(<16 x i32> %vec) {
 define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask0(<16 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,12,4,6,4,12]
-; CHECK-NEXT: vpermi2d %ymm3, %ymm0, %ymm4
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [0,2,4,12,4,6,4,12]
+; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm0
 ; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1
-; CHECK-NEXT: vpblendmd %xmm4, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32>
@@ -1364,11 +1358,10 @@ define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask0(<16 x i32> %vec, <4 x i
 define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask0(<16 x i32> %vec, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,12,4,6,4,12]
 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
-; CHECK-NEXT: vpermi2d %ymm3, %ymm0, %ymm2 {%k1} {z}
-; CHECK-NEXT: vmovdqa %xmm2, %xmm0
+; CHECK-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z}
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32>
@@ -1411,8 +1404,7 @@ define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask2(<16 x i32> %vec, <4 x i
 ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mask2:
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,13,0]
-; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm4
-; CHECK-NEXT: vpermt2d %ymm4, %ymm3, %ymm0
+; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm0
 ; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1
 ; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
 ; CHECK-NEXT: vzeroupper
@@ -1426,10 +1418,10 @@ define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask2(<16 x i32> %vec, <4 x i
 define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask2(<16 x i32> %vec, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mask2:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [1,1,13,0]
-; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,1,13,0,1,1,13,0]
+; CHECK-NEXT: # ymm2 = mem[0,1,0,1]
 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
-; CHECK-NEXT: vpermt2d %ymm3, %ymm2, %ymm0 {%k1} {z}
+; CHECK-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z}
 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
@@ -1441,9 +1433,8 @@ define <4 x i32> @test_16xi32_to_4xi32_perm_mask3(<16 x i32> %vec) {
 ; CHECK-LABEL: test_16xi32_to_4xi32_perm_mask3:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [3,0,0,13]
-; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT: vpermt2d %ymm2, %ymm1, %ymm0
+; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [3,0,0,13]
+; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0
 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
   %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32>
@@ -1454,8 +1445,7 @@ define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask3(<16 x i32> %vec, <4 x i
 ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mask3:
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [3,0,0,13]
-; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm4
-; CHECK-NEXT: vpermt2d %ymm4, %ymm3, %ymm0
+; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm0
 ; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1
 ; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
 ; CHECK-NEXT: vzeroupper
@@ -1469,10 +1459,10 @@ define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask3(<16 x i32> %vec, <4 x i
 define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask3(<16 x i32> %vec, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mask3:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [3,0,0,13]
-; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [3,0,0,13,3,0,0,13]
+; CHECK-NEXT: # ymm2 = mem[0,1,0,1]
 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
-; CHECK-NEXT: vpermt2d %ymm3, %ymm2, %ymm0 {%k1} {z}
+; CHECK-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z}
 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
@@ -1805,10 +1795,9 @@ define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask3(<16 x i32>* %vp,
 define <4 x i32> @test_16xi32_to_4xi32_perm_mask9(<16 x i32> %vec) {
 ; CHECK-LABEL: test_16xi32_to_4xi32_perm_mask9:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [4,1,12,2]
-; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; CHECK-NEXT: vpermt2d %ymm0, %ymm2, %ymm1
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [12,9,4,10]
+; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
   %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32>
@@ -1985,11 +1974,11 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask0(<8 x i64> %vec, <4 x i
 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask1(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask1:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [2,0,2,5]
-; CHECK-NEXT: vpermi2q %ymm0, %ymm3, %ymm4
+; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [6,4,6,1,6,4,6,1]
+; CHECK-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; CHECK-NEXT: vpermq %zmm0, %zmm3, %zmm0
 ; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1
-; CHECK-NEXT: vpblendmq %ymm4, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT: retq
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -2000,11 +1989,10 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask1(<8 x i64> %vec, <4 x i64
 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask1(<8 x i64> %vec, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask1:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [2,0,2,5]
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [6,4,6,1]
 ; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
-; CHECK-NEXT: vpermi2q %ymm0, %ymm3, %ymm2 {%k1} {z}
-; CHECK-NEXT: vmovdqa %ymm2, %ymm0
+; CHECK-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z}
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-NEXT: retq
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -2014,12 +2002,11 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask1(<8 x i64> %vec, <4 x i
 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask2(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask2:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; CHECK-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [2,7,2,7]
-; CHECK-NEXT: # ymm4 = mem[0,1,0,1]
-; CHECK-NEXT: vpermi2q %ymm0, %ymm3, %ymm4
+; CHECK-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [6,3,6,3,6,3,6,3]
+; CHECK-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; CHECK-NEXT: vpermq %zmm0, %zmm3, %zmm0
 ; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1
-; CHECK-NEXT: vpblendmq %ymm4, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT: retq
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -2030,12 +2017,11 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask2(<8 x i64> %vec, <4 x i64
 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask2(<8 x i64> %vec, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask2:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; CHECK-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [2,7,2,7]
+; CHECK-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [6,3,6,3]
 ; CHECK-NEXT: # ymm2 = mem[0,1,0,1]
 ; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
-; CHECK-NEXT: vpermi2q %ymm0, %ymm3, %ymm2 {%k1} {z}
-; CHECK-NEXT: vmovdqa %ymm2, %ymm0
+; CHECK-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z}
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-NEXT: retq
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -2045,10 +2031,10 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask2(<8 x i64> %vec, <4 x i
 define <4 x i64> @test_8xi64_to_4xi64_perm_mask3(<8 x i64> %vec) {
 ; CHECK-LABEL: test_8xi64_to_4xi64_perm_mask3:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [2,4,4,3]
-; CHECK-NEXT: vpermi2q %ymm0, %ymm2, %ymm1
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [6,0,0,7,6,0,0,7]
+; CHECK-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
+; CHECK-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-NEXT: retq
   %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32>
   ret <4 x i64> %res
@@ -2056,11 +2042,11 @@ define <4 x i64> @test_8xi64_to_4xi64_perm_mask3(<8 x i64> %vec) {
 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask3(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask3:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [2,4,4,3]
-; CHECK-NEXT: vpermi2q %ymm0, %ymm3, %ymm4
+; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [6,0,0,7,6,0,0,7]
+; CHECK-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; CHECK-NEXT: vpermq %zmm0, %zmm3, %zmm0
 ; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1
-; CHECK-NEXT: vpblendmq %ymm4, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT: retq
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -2071,11 +2057,10 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask3(<8 x i64> %vec, <4 x i64
 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask3(<8 x i64> %vec, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask3:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [2,4,4,3]
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [6,0,0,7]
 ; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
-; CHECK-NEXT: vpermi2q %ymm0, %ymm3, %ymm2 {%k1} {z}
-; CHECK-NEXT: vmovdqa %ymm2, %ymm0
+; CHECK-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z}
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-NEXT: retq
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -2085,11 +2070,11 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask3(<8 x i64> %vec, <4 x i
 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask4(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask4:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [7,3,3,1]
-; CHECK-NEXT: vpermi2q %ymm0, %ymm3, %ymm4
+; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [3,7,7,5,3,7,7,5]
+; CHECK-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; CHECK-NEXT: vpermq %zmm0, %zmm3, %zmm0
 ; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1
-; CHECK-NEXT: vpblendmq %ymm4, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT: retq
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -2100,11 +2085,10 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask4(<8 x i64> %vec, <4 x i64
 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask4(<8 x i64> %vec, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask4:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [7,3,3,1]
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [3,7,7,5]
 ; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
-; CHECK-NEXT: vpermi2q %ymm0, %ymm3, %ymm2 {%k1} {z}
-; CHECK-NEXT: vmovdqa %ymm2, %ymm0
+; CHECK-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z}
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-NEXT: retq
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -2114,11 +2098,11 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask4(<8 x i64> %vec, <4 x i
 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask5(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask5:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [4,1,0,6]
-; CHECK-NEXT: vpermi2q %ymm3, %ymm0, %ymm4
+; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,1,0,6,4,1,0,6]
+; CHECK-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; CHECK-NEXT: vpermq %zmm0, %zmm3, %zmm0
 ; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1
-; CHECK-NEXT: vpblendmq %ymm4, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT: retq
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -2129,11 +2113,10 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask5(<8 x i64> %vec, <4 x i64
 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask5(<8 x i64> %vec, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask5:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [4,1,0,6]
 ; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
-; CHECK-NEXT: vpermi2q %ymm3, %ymm0, %ymm2 {%k1} {z}
-; CHECK-NEXT: vmovdqa %ymm2, %ymm0
+; CHECK-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z}
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-NEXT: retq
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -2143,10 +2126,10 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask5(<8 x i64> %vec, <4 x i
 define <4 x i64> @test_8xi64_to_4xi64_perm_mask6(<8 x i64> %vec) {
 ; CHECK-LABEL: test_8xi64_to_4xi64_perm_mask6:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [3,2,1,7]
-; CHECK-NEXT: vpermi2q %ymm0, %ymm2, %ymm1
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [7,6,5,3,7,6,5,3]
+; CHECK-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
+; CHECK-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-NEXT: retq
   %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32>
   ret <4 x i64> %res
@@ -2154,11 +2137,11 @@ define <4 x i64> @test_8xi64_to_4xi64_perm_mask6(<8 x i64> %vec) {
 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask6(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask6:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [3,2,1,7]
-; CHECK-NEXT: vpermi2q %ymm0, %ymm3, %ymm4
+; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [7,6,5,3,7,6,5,3]
+; CHECK-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; CHECK-NEXT: vpermq %zmm0, %zmm3, %zmm0
 ; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1
-; CHECK-NEXT: vpblendmq %ymm4, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT: retq
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -2169,11 +2152,10 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask6(<8 x i64> %vec, <4 x i64
 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask6(<8 x i64> %vec, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask6:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [3,2,1,7]
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [7,6,5,3]
 ; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
-; CHECK-NEXT: vpermi2q %ymm0, %ymm3, %ymm2 {%k1} {z}
-; CHECK-NEXT: vmovdqa %ymm2, %ymm0
+; CHECK-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z}
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-NEXT: retq
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -2935,10 +2917,10 @@ define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask3(<8 x float>*
 define <8 x float> @test_16xfloat_to_8xfloat_perm_mask0(<16 x float> %vec) {
 ; CHECK-LABEL: test_16xfloat_to_8xfloat_perm_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [0,4,12,10,8,2,11,7]
-; CHECK-NEXT: vpermi2ps %ymm2, %ymm0, %ymm1
-; CHECK-NEXT: vmovaps %ymm1, %ymm0
+; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,4,12,10,8,2,11,7,0,4,12,10,8,2,11,7]
+; CHECK-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
+; CHECK-NEXT: vpermd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-NEXT: retq
   %res = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32>
   ret <8 x float> %res
@@ -2946,12 +2928,12 @@ define <8 x float> @test_16xfloat_to_8xfloat_perm_mask0(<16 x float> %vec) {
 define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mask0(<16 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
 ; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
-; CHECK-NEXT: vmovaps {{.*#+}} ymm4 = [0,4,12,10,8,2,11,7]
-; CHECK-NEXT: vpermi2ps %ymm3, %ymm0, %ymm4
-; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: vcmpeqps %ymm0, %ymm2, %k1
-; CHECK-NEXT: vblendmps %ymm4, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,4,12,10,8,2,11,7,0,4,12,10,8,2,11,7]
+; CHECK-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm0
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
+; CHECK-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT: retq
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32>
   %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
@@ -2962,12 +2944,11 @@ define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mask0(<16 x float> %vec
 define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mask0(<16 x float> %vec, <8 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
 ; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [0,4,12,10,8,2,11,7]
-; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vcmpeqps %ymm4, %ymm1, %k1
-; CHECK-NEXT: vpermi2ps %ymm3, %ymm0, %ymm2 {%k1} {z}
-; CHECK-NEXT: vmovaps %ymm2, %ymm0
+; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %ymm3, %ymm1, %k1
+; CHECK-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z}
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-NEXT: retq
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32>
   %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
@@ -2977,12 +2958,12 @@ define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mask0(<16 x float> %v
 define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mask1(<16 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
 ; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mask1:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
-; CHECK-NEXT: vmovaps {{.*#+}} ymm4 = [2,4,11,4,12,7,9,6]
-; CHECK-NEXT: vpermi2ps %ymm0, %ymm3, %ymm4
-; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: vcmpeqps %ymm0, %ymm2, %k1
-; CHECK-NEXT: vblendmps %ymm4, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [10,12,3,12,4,15,1,14,10,12,3,12,4,15,1,14]
+; CHECK-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm0
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
+; CHECK-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT: retq
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32>
   %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
@@ -2993,12 +2974,11 @@ define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mask1(<16 x float> %vec
 define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mask1(<16 x float> %vec, <8 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mask1:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
-; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [2,4,11,4,12,7,9,6]
-; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vcmpeqps %ymm4, %ymm1, %k1
-; CHECK-NEXT: vpermi2ps %ymm0, %ymm3, %ymm2 {%k1} {z}
-; CHECK-NEXT: vmovaps %ymm2, %ymm0
+; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [10,12,3,12,4,15,1,14]
+; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %ymm3, %ymm1, %k1
+; CHECK-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z}
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-NEXT: retq
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32>
   %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
@@ -3039,10 +3019,10 @@ define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mask2(<16 x float> %v
 define <8 x float> @test_16xfloat_to_8xfloat_perm_mask3(<16 x float> %vec) {
 ; CHECK-LABEL: test_16xfloat_to_8xfloat_perm_mask3:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [4,6,1,8,4,12,13,0]
-; CHECK-NEXT: vpermi2ps %ymm0, %ymm2, %ymm1
-; CHECK-NEXT: vmovaps %ymm1, %ymm0
+; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [12,14,9,0,12,4,5,8,12,14,9,0,12,4,5,8]
+; CHECK-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
+; CHECK-NEXT: vpermd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-NEXT: retq
   %res = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32>
   ret <8 x float> %res
@@ -3050,12 +3030,12 @@ define <8 x float> @test_16xfloat_to_8xfloat_perm_mask3(<16 x float> %vec) {
 define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mask3(<16 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
 ; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mask3:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
-; CHECK-NEXT: vmovaps {{.*#+}} ymm4 = [4,6,1,8,4,12,13,0]
-; CHECK-NEXT: vpermi2ps %ymm0, %ymm3, %ymm4
-; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: vcmpeqps %ymm0, %ymm2, %k1
-; CHECK-NEXT: vblendmps %ymm4, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [12,14,9,0,12,4,5,8,12,14,9,0,12,4,5,8]
+; CHECK-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm0
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
+; CHECK-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT: retq
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32>
   %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
@@ -3066,12 +3046,11 @@ define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mask3(<16 x float> %vec
 define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mask3(<16 x float> %vec, <8 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mask3:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
-; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [4,6,1,8,4,12,13,0]
-; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vcmpeqps %ymm4, %ymm1, %k1
-; CHECK-NEXT: vpermi2ps %ymm0, %ymm3, %ymm2 {%k1} {z}
-; CHECK-NEXT: vmovaps %ymm2, %ymm0
+; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [12,14,9,0,12,4,5,8]
+; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %ymm3, %ymm1, %k1
+; CHECK-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z}
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-NEXT: retq
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32>
   %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
@@ -3081,10 +3060,9 @@ define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mask3(<16 x float> %v
 define <4 x float> @test_16xfloat_to_4xfloat_perm_mask0(<16 x float> %vec) {
 ; CHECK-LABEL: test_16xfloat_to_4xfloat_perm_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps {{.*#+}} xmm2 = [12,0,1,2]
-; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm1
-; CHECK-NEXT: vpermt2ps %ymm0, %ymm2, %ymm1
-; CHECK-NEXT: vmovaps %xmm1, %xmm0
+; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [4,8,9,10]
+; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
   %res = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32>
@@ -3093,12 +3071,11 @@ define <4 x float> @test_16xfloat_to_4xfloat_perm_mask0(<16 x float> %vec) {
 define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask0(<16 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [12,0,1,2]
-; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm4
-; CHECK-NEXT: vpermt2ps %ymm0, %ymm3, %ymm4
-; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: vcmpeqps %xmm0, %xmm2, %k1
-; CHECK-NEXT: vblendmps %xmm4, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [4,8,9,10]
+; CHECK-NEXT: vpermps %zmm0, %zmm3, %zmm0
+; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1
+; CHECK-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1}
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32>
@@ -3110,12 +3087,12 @@ define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask0(<16 x float> %vec
 define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask0(<16 x float> %vec, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [12,0,1,2]
-; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vcmpeqps %xmm4, %xmm1, %k1
-; CHECK-NEXT: vpermt2ps %ymm0, %ymm3, %ymm2 {%k1} {z}
-; CHECK-NEXT: vmovaps %xmm2, %xmm0
+; CHECK-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [4,8,9,10,4,8,9,10]
+; CHECK-NEXT: # ymm2 = mem[0,1,0,1]
+; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1
+; CHECK-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z}
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32>
@@ -3126,12 +3103,11 @@ define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask0(<16 x float> %v
 define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask1(<16 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mask1:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [0,14,2,14]
-; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm4
-; CHECK-NEXT: vpermt2ps %ymm0, %ymm3, %ymm4
-; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: vcmpeqps %xmm0, %xmm2, %k1
-; CHECK-NEXT: vblendmps %xmm4, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [8,6,10,6]
+; CHECK-NEXT: vpermps %zmm0, %zmm3, %zmm0
+; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1
+; CHECK-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1}
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32>
@@ -3143,12 +3119,11 @@ define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask1(<16 x float> %vec
 define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask1(<16 x float> %vec, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mask1:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [0,14,2,14]
-; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vcmpeqps %xmm4, %xmm1, %k1
-; CHECK-NEXT: vpermt2ps %ymm0, %ymm3, %ymm2 {%k1} {z}
-; CHECK-NEXT: vmovaps %xmm2, %xmm0
+; CHECK-NEXT: vmovaps {{.*#+}} xmm2 = [8,6,10,6]
+; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1
+; CHECK-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z}
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32>
@@ -3707,11 +3682,10 @@ define <2 x double> @test_masked_z_4xdouble_to_2xdouble_perm_mem_mask1(<4 x doub
 define <4 x double> @test_8xdouble_to_4xdouble_perm_mask0(<8 x double> %vec) {
 ; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [3,7,3,7]
-; CHECK-NEXT: # ymm1 = mem[0,1,0,1]
-; CHECK-NEXT: vpermi2pd %ymm0, %ymm2, %ymm1
-; CHECK-NEXT: vmovapd %ymm1, %ymm0
+; CHECK-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [7,3,7,3,7,3,7,3]
+; CHECK-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; CHECK-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-NEXT: retq
   %res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32>
   ret <4 x double> %res
@@ -3719,13 +3693,12 @@ define <4 x double> @test_8xdouble_to_4xdouble_perm_mask0(<8 x double> %vec) {
 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask0(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
-; CHECK-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [3,7,3,7]
-; CHECK-NEXT: # ymm4 = mem[0,1,0,1]
-; CHECK-NEXT: vpermi2pd %ymm0, %ymm3, %ymm4
-; CHECK-NEXT: vxorpd %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: vcmpeqpd %ymm0, %ymm2, %k1
-; CHECK-NEXT: vblendmpd %ymm4, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [7,3,7,3,7,3,7,3]
+; CHECK-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; CHECK-NEXT: vpermq %zmm0, %zmm3, %zmm0
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT: retq
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32>
   %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
@@ -3736,13 +3709,12 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask0(<8 x double> %v
 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask0(<8 x double> %vec, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
-; CHECK-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [3,7,3,7]
+; CHECK-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [7,3,7,3]
 ; CHECK-NEXT: # ymm2 = mem[0,1,0,1]
-; CHECK-NEXT: vxorpd %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vcmpeqpd %ymm4, %ymm1, %k1
-; CHECK-NEXT: vpermi2pd %ymm0, %ymm3, %ymm2 {%k1} {z}
-; CHECK-NEXT: vmovapd %ymm2, %ymm0
+; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
+; CHECK-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z}
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-NEXT: retq
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32>
   %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
@@ -3752,12 +3724,12 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask0(<8 x double>
 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask1(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask1:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
-; CHECK-NEXT: vmovapd {{.*#+}} ymm4 = [2,0,7,6]
-; CHECK-NEXT: vpermi2pd %ymm3, %ymm0, %ymm4
-; CHECK-NEXT: vxorpd %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: vcmpeqpd %ymm0, %ymm2, %k1
-; CHECK-NEXT: vblendmpd %ymm4, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [2,0,7,6,2,0,7,6]
+; CHECK-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; CHECK-NEXT: vpermq %zmm0, %zmm3, %zmm0
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT: retq
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32>
   %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
@@ -3768,12 +3740,11 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask1(<8 x double> %v
 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask1(<8 x double> %vec, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask1:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
 ; CHECK-NEXT: vmovapd {{.*#+}} ymm2 = [2,0,7,6]
-; CHECK-NEXT: vxorpd %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vcmpeqpd %ymm4, %ymm1, %k1
-; CHECK-NEXT: vpermi2pd %ymm3, %ymm0, %ymm2 {%k1} {z}
-; CHECK-NEXT: vmovapd %ymm2, %ymm0
+; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
+; CHECK-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z}
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-NEXT: retq
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32>
   %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
@@ -3882,12 +3853,12 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask4(<8 x double>
 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask5(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask5:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
-; CHECK-NEXT: vmovapd {{.*#+}} ymm4 = [2,6,2,2]
-; CHECK-NEXT: vpermi2pd %ymm3, %ymm0, %ymm4
-; CHECK-NEXT: vxorpd %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: vcmpeqpd %ymm0, %ymm2, %k1
-; CHECK-NEXT: vblendmpd %ymm4, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [2,6,2,2,2,6,2,2]
+; CHECK-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; CHECK-NEXT: vpermq %zmm0, %zmm3, %zmm0
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT: retq
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32>
   %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
@@ -3898,12 +3869,11 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask5(<8 x double> %v
 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask5(<8 x double> %vec, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask5:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
 ; CHECK-NEXT: vmovapd {{.*#+}} ymm2 = [2,6,2,2]
-; CHECK-NEXT: vxorpd %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vcmpeqpd %ymm4, %ymm1, %k1
-; CHECK-NEXT: vpermi2pd %ymm3, %ymm0, %ymm2 {%k1} {z}
-; CHECK-NEXT: vmovapd %ymm2, %ymm0
+; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
+; CHECK-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z}
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-NEXT: retq
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32>
   %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
@@ -3913,10 +3883,10 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask5(<8 x double>
 define <4 x double> @test_8xdouble_to_4xdouble_perm_mask6(<8 x double> %vec) {
 ; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mask6:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT: vmovapd {{.*#+}} ymm1 = [1,4,3,4]
-; CHECK-NEXT: vpermi2pd %ymm0, %ymm2, %ymm1
-; CHECK-NEXT: vmovapd %ymm1, %ymm0
+; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [5,0,7,0,5,0,7,0]
+; CHECK-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
+; CHECK-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-NEXT: retq
   %res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32>
   ret <4 x double> %res
@@ -3924,12 +3894,12 @@ define <4 x double> @test_8xdouble_to_4xdouble_perm_mask6(<8 x double> %vec) {
 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask6(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask6:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
-; CHECK-NEXT: vmovapd {{.*#+}} ymm4 = [1,4,3,4]
-; CHECK-NEXT: vpermi2pd %ymm0, %ymm3, %ymm4
-; CHECK-NEXT: vxorpd %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: vcmpeqpd %ymm0, %ymm2, %k1
-; CHECK-NEXT: vblendmpd %ymm4, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [5,0,7,0,5,0,7,0]
+; CHECK-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; CHECK-NEXT: vpermq %zmm0, %zmm3, %zmm0
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT: retq
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32>
   %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
@@ -3940,12 +3910,11 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask6(<8 x double> %v
 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask6(<8 x double> %vec, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask6:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
-; CHECK-NEXT: vmovapd {{.*#+}} ymm2 = [1,4,3,4]
-; CHECK-NEXT: vxorpd %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vcmpeqpd %ymm4, %ymm1, %k1
-; CHECK-NEXT: vpermi2pd %ymm0, %ymm3, %ymm2 {%k1} {z}
-; CHECK-NEXT: vmovapd %ymm2, %ymm0
+; CHECK-NEXT: vmovapd {{.*#+}} ymm2 = [5,0,7,0]
+; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
+; CHECK-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z}
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-NEXT: retq
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32>
   %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
@@ -3955,12 +3924,12 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask6(<8 x double>
 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask7(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask7:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
-; CHECK-NEXT: vmovapd {{.*#+}} ymm4 = [3,5,0,6]
-; CHECK-NEXT: vpermi2pd %ymm3, %ymm0, %ymm4
-; CHECK-NEXT: vxorpd %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: vcmpeqpd %ymm0, %ymm2, %k1
-; CHECK-NEXT: vblendmpd %ymm4, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [3,5,0,6,3,5,0,6]
+; CHECK-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; CHECK-NEXT: vpermq %zmm0, %zmm3, %zmm0
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT: retq
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32>
   %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
@@ -3971,12 +3940,11 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask7(<8 x double> %v
 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask7(<8 x double> %vec, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask7:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
 ; CHECK-NEXT: vmovapd {{.*#+}} ymm2 = [3,5,0,6]
-; CHECK-NEXT: vxorpd %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vcmpeqpd %ymm4, %ymm1, %k1
-; CHECK-NEXT: vpermi2pd %ymm3, %ymm0, %ymm2 {%k1} {z}
-; CHECK-NEXT: vmovapd %ymm2, %ymm0
+; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
+; CHECK-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z}
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-NEXT: retq
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32>
   %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll
index bdf4dbd546d6..ba0707a5bba2 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll
@@ -2280,13 +2280,21 @@ define <2 x double> @test_v8f64_34 (<8 x double> %v) {

 ; FIXME: vpcompress
 define <4 x i64> @test_v8i64_1257 (<8 x i64> %v) {
-; ALL-LABEL: test_v8i64_1257:
-; ALL: # %bb.0:
-; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm1
-; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,3]
-; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,2,2,3]
-; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; ALL-NEXT: ret{{[l|q]}}
+; AVX512F-LABEL: test_v8i64_1257:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [1,2,5,7,1,2,5,7]
+; AVX512F-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512F-NEXT: retq
+;
+; AVX512F-32-LABEL: test_v8i64_1257:
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [1,0,2,0,5,0,7,0,1,0,2,0,5,0,7,0]
+; AVX512F-32-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512F-32-NEXT: retl
   %res = shufflevector <8 x i64> %v, <8 x i64> undef, <4 x i32>
   ret <4 x i64> %res
 }
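
--
For reference, a minimal IR sketch of the pattern this patch improves, modelled on test_v8i64_1257 above (the function name is hypothetical and the asm behaviour described in the comments is paraphrased from the CHECK lines in this patch, not freshly generated output):

define <4 x i64> @shuffle_of_extracts(<8 x i64> %v) {
  ; A lane-crossing <4 x i64> shuffle that reads elements 1, 2, 5 and 7 of a
  ; 512-bit source. The shuffle combiner sees this as
  ; shuffle(extract_subvector(%v,0), extract_subvector(%v,4)); before this
  ; patch it lowered to two ymm vpermpd ops plus a vblendps, and with the
  ; CombineShuffleWithExtract fold it becomes a single zmm vpermq whose low
  ; ymm is then extracted.
  %res = shufflevector <8 x i64> %v, <8 x i64> undef, <4 x i32> <i32 1, i32 2, i32 5, i32 7>
  ret <4 x i64> %res
}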