[X86] Teach lowerShuffleAsBlend to use bit blend for v16i8/v32i8/v16i16 when avx512vl is enabled but not avx512bw.

Probably not super important since there are no real CPUs with
avx512vl and not avx512bw. But vpternlog should be better than
vblendvb.

I do wonder if we should use vpternlog even with BWI. We
currently use vblendmb or vpblendmw by putting the mask into a GPR
and moving it to a k-register. But I don't think we hoist the
GPR to k-register copy in machine LICM. Using VPTERNLOG would use
a constant pool load, but has the advantage that we're pretty good
at hoisting and rematerializing those.

Reviewed By: RKSimon

Differential Revision: https://reviews.llvm.org/D83156
This commit is contained in:
Craig Topper 2020-07-04 10:26:56 -07:00
parent b4eb415a99
commit e652c0f8f3
8 changed files with 36 additions and 31 deletions

View File

@ -11731,6 +11731,12 @@ static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
}
// If we have VPTERNLOG, we can use that as a bit blend.
if (Subtarget.hasVLX())
if (SDValue BitBlend =
lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
return BitBlend;
// Scale the blend by the number of bytes per element.
int Scale = VT.getScalarSizeInBits() / 8;

View File

@ -148,18 +148,17 @@ define <32 x i1> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0
; AVX256VL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
; AVX256VL-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
; AVX256VL-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7]
; AVX256VL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[6,7,12,13,u,u,8,9,6,7,14,15,14,15,0,1,22,23,28,29,18,19,26,27,22,23,u,u,30,31,16,17]
; AVX256VL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[6,7,12,13],zero,zero,ymm1[8,9,6,7,14,15,14,15,0,1,22,23,28,29,18,19,26,27,22,23],zero,zero,ymm1[30,31,16,17]
; AVX256VL-NEXT: vmovdqa32 %ymm0, %ymm2 {%k1} {z}
; AVX256VL-NEXT: vpmovdw %ymm2, %xmm2
; AVX256VL-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX256VL-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
; AVX256VL-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,1,2,1]
; AVX256VL-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255]
; AVX256VL-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
; AVX256VL-NEXT: vpmovsxwd %xmm1, %ymm2
; AVX256VL-NEXT: vpslld $31, %ymm2, %ymm2
; AVX256VL-NEXT: vptestmd %ymm2, %ymm2, %k1
; AVX256VL-NEXT: vextracti128 $1, %ymm1, %xmm1
; AVX256VL-NEXT: vpternlogq $220, {{.*}}(%rip), %ymm1, %ymm2
; AVX256VL-NEXT: vpmovsxwd %xmm2, %ymm1
; AVX256VL-NEXT: vpslld $31, %ymm1, %ymm1
; AVX256VL-NEXT: vptestmd %ymm1, %ymm1, %k1
; AVX256VL-NEXT: vextracti128 $1, %ymm2, %xmm1
; AVX256VL-NEXT: vpmovsxwd %xmm1, %ymm1
; AVX256VL-NEXT: vpslld $31, %ymm1, %ymm1
; AVX256VL-NEXT: vptestmd %ymm1, %ymm1, %k0

View File

@ -1345,10 +1345,11 @@ define <16 x i8> @negative(<32 x i8> %v, <32 x i8> %w) nounwind {
;
; AVX512VL-LABEL: negative:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512VL-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = zero,ymm0[2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512VL-NEXT: # ymm2 = mem[0,1,0,1]
; AVX512VL-NEXT: vpternlogq $206, %ymm1, %ymm0, %ymm2
; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm2[0,3,2,3]
; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq

View File

@ -2905,8 +2905,8 @@ define <16 x i8> @constant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwind {
; AVX512VL-NEXT: vpsllvd {{.*}}(%rip), %zmm2, %zmm2
; AVX512VL-NEXT: vpord %zmm1, %zmm2, %zmm1
; AVX512VL-NEXT: vpmovdb %zmm1, %xmm1
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
; AVX512VL-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm2 = [18446744073709551360,18446744073709551360]
; AVX512VL-NEXT: vpternlogq $216, %xmm2, %xmm1, %xmm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;

View File

@ -2376,8 +2376,8 @@ define <32 x i8> @constant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwind {
; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX512VL-NEXT: vpackuswb %ymm4, %ymm1, %ymm1
; AVX512VL-NEXT: vpor %ymm1, %ymm2, %ymm1
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
; AVX512VL-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm2 = [18446744073709551360,18446744073709551360,18446744073709551360,18446744073709551360]
; AVX512VL-NEXT: vpternlogq $216, %ymm2, %ymm1, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: constant_funnnel_v32i8:

View File

@ -2651,9 +2651,9 @@ define <16 x i8> @constant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwind {
; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512VL-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0
; AVX512VL-NEXT: vpord %zmm2, %zmm0, %zmm0
; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
; AVX512VL-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
; AVX512VL-NEXT: vpmovdb %zmm0, %xmm2
; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm0 = [18446744073709551360,18446744073709551360]
; AVX512VL-NEXT: vpternlogq $202, %xmm1, %xmm2, %xmm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;

View File

@ -2083,9 +2083,9 @@ define <32 x i8> @constant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwind {
; AVX512VL-NEXT: vpmullw {{.*}}(%rip), %ymm2, %ymm2
; AVX512VL-NEXT: vpsrlw $8, %ymm2, %ymm2
; AVX512VL-NEXT: vpackuswb %ymm3, %ymm2, %ymm2
; AVX512VL-NEXT: vpor %ymm2, %ymm0, %ymm0
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
; AVX512VL-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX512VL-NEXT: vpor %ymm2, %ymm0, %ymm2
; AVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm0 = [18446744073709551360,18446744073709551360,18446744073709551360,18446744073709551360]
; AVX512VL-NEXT: vpternlogq $202, %ymm1, %ymm2, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: constant_funnnel_v32i8:

View File

@ -52,17 +52,16 @@ define <32 x i8> @foo(<48 x i8>* %x0) {
;
; AVX512F-LABEL: foo:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqu 32(%rdi), %xmm0
; AVX512F-NEXT: vmovdqu (%rdi), %ymm1
; AVX512F-NEXT: vmovdqu (%rdi), %ymm0
; AVX512F-NEXT: vmovdqu 32(%rdi), %xmm1
; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,1,2,4,5,7,8,10,11,13,14]
; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX512F-NEXT: vmovdqu 16(%rdi), %xmm2
; AVX512F-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,u,u,0,2,3,5,6]
; AVX512F-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,3,4,6,7,9,10,12,13,15,u,u,u,u,u,24,25,27,28,30,31,u,u,u,u,u,u,u,u,u,u]
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,1,2,4,5,7,8,10,11,13,14]
; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512F-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[0,2,3,5,6],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,3,4,6,7,9,10,12,13,15],zero,zero,zero,zero,zero,ymm0[24,25,27,28,30,31,u,u,u,u,u,u,u,u,u,u]
; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: foo: