diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 004b9fa431a1..793176e4bf48 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -25959,7 +25959,9 @@ static bool combineX86ShuffleChain(ArrayRef Inputs, SDValue Root, (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 || MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) || (Subtarget.hasBWI() && MaskVT == MVT::v32i16) || - (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16))) { + (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) || + (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) || + (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) { MVT VPermMaskSVT = MVT::getIntegerVT(MaskEltSizeInBits); MVT VPermMaskVT = MVT::getVectorVT(VPermMaskSVT, NumMaskElts); SDValue VPermMask = getConstVector(Mask, VPermMaskVT, DAG, DL, true); diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512vbmi.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512vbmi.ll index bc29a0b19a1b..d589a9265f10 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512vbmi.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512vbmi.ll @@ -73,16 +73,14 @@ define <16 x i8> @combine_vpermi2var_16i8_as_vpshufb(<16 x i8> %x0, <16 x i8> %x define <32 x i8> @combine_vpermi2var_32i8_as_vpermb(<32 x i8> %x0, <32 x i8> %x1) { ; X32-LABEL: combine_vpermi2var_32i8_as_vpermb: ; X32: # BB#0: -; X32-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] -; X32-NEXT: vmovdqu8 {{.*#+}} ymm0 = [0,32,2,30,4,28,6,26,8,28,10,26,12,24,14,22,0,32,2,30,4,28,6,26,8,28,10,26,12,24,14,22] -; X32-NEXT: vpermi2b %ymm1, %ymm1, %ymm0 +; X32-NEXT: vmovdqu8 {{.*#+}} ymm1 = [0,0,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,0,1,23,2,22,3,21,4,22,5,21,6,20,7,19] +; X32-NEXT: vpermb %ymm0, %ymm1, %ymm0 ; X32-NEXT: retl ; ; X64-LABEL: combine_vpermi2var_32i8_as_vpermb: ; X64: # BB#0: -; X64-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] -; X64-NEXT: vmovdqu8 {{.*#+}} ymm0 = [0,32,2,30,4,28,6,26,8,28,10,26,12,24,14,22,0,32,2,30,4,28,6,26,8,28,10,26,12,24,14,22] -; X64-NEXT: vpermi2b %ymm1, %ymm1, %ymm0 +; X64-NEXT: vmovdqu8 {{.*#+}} ymm1 = [0,0,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,0,1,23,2,22,3,21,4,22,5,21,6,20,7,19] +; X64-NEXT: vpermb %ymm0, %ymm1, %ymm0 ; X64-NEXT: retq %res0 = shufflevector <32 x i8> %x0, <32 x i8> %x1, <32 x i32> %res1 = call <32 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.256(<32 x i8> %res0, <32 x i8> , <32 x i8> %res0, i32 -1) @@ -91,16 +89,14 @@ define <32 x i8> @combine_vpermi2var_32i8_as_vpermb(<32 x i8> %x0, <32 x i8> %x1 define <64 x i8> @combine_vpermi2var_64i8_as_vpermb(<64 x i8> %x0, <64 x i8> %x1) { ; X32-LABEL: combine_vpermi2var_64i8_as_vpermb: ; X32: # BB#0: -; X32-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55] -; X32-NEXT: vmovdqu8 {{.*#+}} zmm0 = [0,32,2,30,4,28,6,26,8,28,10,26,12,24,14,22,0,32,2,30,4,28,6,26,8,28,10,26,12,24,14,22,0,32,2,30,4,28,6,26,8,28,10,26,12,24,14,22,0,32,2,30,4,28,6,26,8,28,10,26,12,24,14,22] -; X32-NEXT: vpermi2b %zmm1, %zmm1, %zmm0 +; X32-NEXT: vmovdqu8 {{.*#+}} zmm1 = [0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19] +; X32-NEXT: vpermb %zmm0, %zmm1, %zmm0 ; X32-NEXT: retl ; ; X64-LABEL: combine_vpermi2var_64i8_as_vpermb: ; X64: # BB#0: -; X64-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55] -; X64-NEXT: vmovdqu8 {{.*#+}} zmm0 = [0,32,2,30,4,28,6,26,8,28,10,26,12,24,14,22,0,32,2,30,4,28,6,26,8,28,10,26,12,24,14,22,0,32,2,30,4,28,6,26,8,28,10,26,12,24,14,22,0,32,2,30,4,28,6,26,8,28,10,26,12,24,14,22] -; X64-NEXT: vpermi2b %zmm1, %zmm1, %zmm0 +; X64-NEXT: vmovdqu8 {{.*#+}} zmm1 = [0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19] +; X64-NEXT: vpermb %zmm0, %zmm1, %zmm0 ; X64-NEXT: retq %res0 = shufflevector <64 x i8> %x0, <64 x i8> %x1, <64 x i32> %res1 = call <64 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.512(<64 x i8> %res0, <64 x i8> , <64 x i8> %res0, i64 -1)