[X86][SSE] Remove unnecessary bit-and in pshufb vector ctlz (PR39703)

SSE PSHUFB vector ctlz lowering works at the i4 nibble level. As detailed in PR39703, we were masking off the upper nibble to extract the lower nibble, but that lower-nibble lookup is only actually used in the case where the upper nibble is known to be zero, making it safe to remove the mask and save an instruction.

Differential Revision: https://reviews.llvm.org/D54707

llvm-svn: 347242
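
For illustration, a minimal scalar model of the nibble-LUT scheme described above (plain C++, not the LLVM lowering itself; the table matches the [4,3,2,2,1,1,1,1,0,...] pshufb constant in the test diffs below):

#include <cstdint>
#include <cstdio>

// Per-nibble leading-zero counts, i.e. the pshufb lookup table.
static const uint8_t CTLZ4[16] = {4, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0};

static uint8_t ctlz8(uint8_t X) {
  uint8_t Hi = X >> 4;                   // upper nibble
  uint8_t HiZ = (Hi == 0) ? 0xFF : 0x00; // pcmpeqb/vptestnmb against zero
  // Scalar array indexing must stay in bounds, hence the "& 0xF" below; in the
  // vector lowering the low-nibble lookup is masked out by HiZ whenever
  // Hi != 0, and when Hi == 0 the byte already equals its low nibble, which is
  // why the explicit AND could be dropped.
  uint8_t LoLZ = CTLZ4[X & 0xF] & HiZ;   // pshufb + pand
  uint8_t HiLZ = CTLZ4[Hi];              // pshufb on the shifted nibble
  return LoLZ + HiLZ;                    // paddb
}

int main() {
  std::printf("%d %d %d\n", ctlz8(0x00), ctlz8(0x05), ctlz8(0x50)); // prints: 8 5 1
  return 0;
}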
Author: Simon Pilgrim
Date:   2018-11-19 18:40:59 +00:00
Commit: c4861ab170 (parent: 5a47dc607e)

6 changed files with 873 additions and 1076 deletions


@@ -23103,9 +23103,8 @@ static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
 SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
 SDValue Zero = getZeroVector(CurrVT, Subtarget, DAG, DL);
-SDValue NibbleMask = DAG.getConstant(0xF, DL, CurrVT);
 SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
-SDValue Lo = DAG.getNode(ISD::AND, DL, CurrVT, Op0, NibbleMask);
+SDValue Lo = Op0;
 SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
 SDValue HiZ;
 if (CurrVT.is512BitVector()) {
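
For context, here is what the surrounding LowerVectorCTLZInRegLUT code builds per i8 lane, reconstructed from this hunk, the commit message and the test diffs (a sketch, not a verbatim excerpt):

// Hi   = srl(Op0, 4)               -> psrlw $4 (plus a nibble mask on the Hi path)
// HiZ  = (Hi == 0) ? all-ones : 0  -> pcmpeqb / vptestnmb against the zero vector
// LoLZ = pshufb(LUT, Lo) & HiZ     -> Lo is now Op0 itself; the "& 0xF" is gone
// HiLZ = pshufb(LUT, Hi)
// CTLZ = LoLZ + HiLZ               -> paddb
// For i16/i32 elements the per-byte counts are then merged by the later
// psrlw $8 / paddw / psrld $16 steps visible in the test diffs.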


@@ -347,50 +347,48 @@ define <4 x i32> @combine_vec_lshr_lzcnt_bit1(<4 x i32> %x) {
; SSE-LABEL: combine_vec_lshr_lzcnt_bit1:
; SSE: # %bb.0:
; SSE-NEXT: pand {{.*}}(%rip), %xmm0
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; SSE-NEXT: movdqa %xmm2, %xmm3
; SSE-NEXT: pshufb %xmm0, %xmm3
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psrlw $4, %xmm1
; SSE-NEXT: pxor %xmm2, %xmm2
; SSE-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; SSE-NEXT: movdqa %xmm3, %xmm4
; SSE-NEXT: pshufb %xmm1, %xmm3
; SSE-NEXT: pcmpeqb %xmm2, %xmm1
; SSE-NEXT: movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE-NEXT: pand %xmm0, %xmm5
; SSE-NEXT: pshufb %xmm5, %xmm4
; SSE-NEXT: pand %xmm1, %xmm4
; SSE-NEXT: paddb %xmm4, %xmm3
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: pcmpeqb %xmm2, %xmm1
; SSE-NEXT: psrlw $8, %xmm1
; SSE-NEXT: pxor %xmm4, %xmm4
; SSE-NEXT: pshufb %xmm1, %xmm2
; SSE-NEXT: pcmpeqb %xmm4, %xmm1
; SSE-NEXT: pand %xmm3, %xmm1
; SSE-NEXT: psrlw $8, %xmm3
; SSE-NEXT: paddw %xmm1, %xmm3
; SSE-NEXT: pcmpeqw %xmm2, %xmm0
; SSE-NEXT: paddb %xmm2, %xmm1
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: pcmpeqb %xmm4, %xmm2
; SSE-NEXT: psrlw $8, %xmm2
; SSE-NEXT: pand %xmm1, %xmm2
; SSE-NEXT: psrlw $8, %xmm1
; SSE-NEXT: paddw %xmm2, %xmm1
; SSE-NEXT: pcmpeqw %xmm4, %xmm0
; SSE-NEXT: psrld $16, %xmm0
; SSE-NEXT: pand %xmm3, %xmm0
; SSE-NEXT: psrld $16, %xmm3
; SSE-NEXT: paddd %xmm3, %xmm0
; SSE-NEXT: psrld $5, %xmm0
; SSE-NEXT: pand %xmm1, %xmm0
; SSE-NEXT: psrld $16, %xmm1
; SSE-NEXT: paddd %xmm0, %xmm1
; SSE-NEXT: psrld $5, %xmm1
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_lshr_lzcnt_bit1:
; AVX: # %bb.0:
; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vpsrlw $4, %xmm0, %xmm1
; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm3
; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm4
; AVX-NEXT: vmovdqa {{.*#+}} xmm5 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX-NEXT: vpshufb %xmm4, %xmm5, %xmm4
; AVX-NEXT: vpand %xmm3, %xmm4, %xmm3
; AVX-NEXT: vpshufb %xmm1, %xmm5, %xmm1
; AVX-NEXT: vpaddb %xmm1, %xmm3, %xmm1
; AVX-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm3
; AVX-NEXT: vpsrlw $8, %xmm3, %xmm3
; AVX-NEXT: vpand %xmm3, %xmm1, %xmm3
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm2
; AVX-NEXT: vpsrlw $4, %xmm0, %xmm3
; AVX-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX-NEXT: vpcmpeqb %xmm4, %xmm3, %xmm5
; AVX-NEXT: vpand %xmm5, %xmm2, %xmm2
; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX-NEXT: vpaddb %xmm1, %xmm2, %xmm1
; AVX-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm2
; AVX-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX-NEXT: vpand %xmm2, %xmm1, %xmm2
; AVX-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX-NEXT: vpaddw %xmm3, %xmm1, %xmm1
; AVX-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpaddw %xmm2, %xmm1, %xmm1
; AVX-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm0
; AVX-NEXT: vpsrld $16, %xmm0, %xmm0
; AVX-NEXT: vpand %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpsrld $16, %xmm1, %xmm1


@@ -38,17 +38,15 @@ define <8 x i16> @testv8i16(<8 x i16> %in) {
define <16 x i8> @testv16i8(<16 x i8> %in) {
; AVX256-LABEL: testv16i8:
; AVX256: # %bb.0:
; AVX256-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX256-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX256-NEXT: vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX256-NEXT: vpshufb %xmm2, %xmm3, %xmm2
; AVX256-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX256-NEXT: vpshufb %xmm0, %xmm1, %xmm2
; AVX256-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX256-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX256-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX256-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm1
; AVX256-NEXT: vpand %xmm1, %xmm2, %xmm1
; AVX256-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX256-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX256-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX256-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX256-NEXT: vpcmpeqb %xmm3, %xmm0, %xmm3
; AVX256-NEXT: vpand %xmm3, %xmm2, %xmm2
; AVX256-NEXT: vpshufb %xmm0, %xmm1, %xmm0
; AVX256-NEXT: vpaddb %xmm0, %xmm2, %xmm0
; AVX256-NEXT: retq
;
; AVX512-LABEL: testv16i8:
@@ -93,17 +91,15 @@ define <16 x i16> @testv16i16(<16 x i16> %in) {
define <32 x i8> @testv32i8(<32 x i8> %in) {
; AVX256-LABEL: testv32i8:
; AVX256: # %bb.0:
; AVX256-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX256-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX256-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX256-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX256-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX256-NEXT: vpshufb %ymm0, %ymm1, %ymm2
; AVX256-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX256-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX256-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX256-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm1
; AVX256-NEXT: vpand %ymm1, %ymm2, %ymm1
; AVX256-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; AVX256-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX256-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX256-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX256-NEXT: vpcmpeqb %ymm3, %ymm0, %ymm3
; AVX256-NEXT: vpand %ymm3, %ymm2, %ymm2
; AVX256-NEXT: vpshufb %ymm0, %ymm1, %ymm0
; AVX256-NEXT: vpaddb %ymm0, %ymm2, %ymm0
; AVX256-NEXT: retq
;
; AVX512-LABEL: testv32i8:

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -359,16 +359,15 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind {
;
; AVX512BW-LABEL: testv32i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm4
; AVX512BW-NEXT: vptestnmb %zmm1, %zmm4, %k0
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm3
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vptestnmb %zmm4, %zmm3, %k0
; AVX512BW-NEXT: vpmovm2b %k0, %zmm5
; AVX512BW-NEXT: vpandq %zmm5, %zmm2, %zmm2
; AVX512BW-NEXT: vpandq %zmm1, %zmm4, %zmm1
; AVX512BW-NEXT: vpshufb %zmm1, %zmm3, %zmm1
; AVX512BW-NEXT: vpandq %zmm4, %zmm3, %zmm3
; AVX512BW-NEXT: vpshufb %zmm3, %zmm1, %zmm1
; AVX512BW-NEXT: vpaddb %zmm1, %zmm2, %zmm1
; AVX512BW-NEXT: vptestnmb %zmm0, %zmm0, %k0
; AVX512BW-NEXT: vpmovm2b %k0, %zmm0
@@ -380,29 +379,27 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind {
;
; AVX512DQ-LABEL: testv32i16:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512DQ-NEXT: vpand %ymm2, %ymm0, %ymm3
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX512DQ-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512DQ-NEXT: vpsrlw $4, %ymm0, %ymm5
; AVX512DQ-NEXT: vpand %ymm2, %ymm5, %ymm5
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX512DQ-NEXT: vpshufb %ymm0, %ymm2, %ymm3
; AVX512DQ-NEXT: vpsrlw $4, %ymm0, %ymm4
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512DQ-NEXT: vpand %ymm5, %ymm4, %ymm4
; AVX512DQ-NEXT: vpxor %xmm6, %xmm6, %xmm6
; AVX512DQ-NEXT: vpcmpeqb %ymm6, %ymm5, %ymm7
; AVX512DQ-NEXT: vpcmpeqb %ymm6, %ymm4, %ymm7
; AVX512DQ-NEXT: vpand %ymm7, %ymm3, %ymm3
; AVX512DQ-NEXT: vpshufb %ymm5, %ymm4, %ymm5
; AVX512DQ-NEXT: vpaddb %ymm5, %ymm3, %ymm3
; AVX512DQ-NEXT: vpshufb %ymm4, %ymm2, %ymm4
; AVX512DQ-NEXT: vpaddb %ymm4, %ymm3, %ymm3
; AVX512DQ-NEXT: vpcmpeqb %ymm6, %ymm0, %ymm0
; AVX512DQ-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX512DQ-NEXT: vpand %ymm0, %ymm3, %ymm0
; AVX512DQ-NEXT: vpsrlw $8, %ymm3, %ymm3
; AVX512DQ-NEXT: vpaddw %ymm0, %ymm3, %ymm0
; AVX512DQ-NEXT: vpand %ymm2, %ymm1, %ymm3
; AVX512DQ-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512DQ-NEXT: vpsrlw $4, %ymm1, %ymm5
; AVX512DQ-NEXT: vpand %ymm2, %ymm5, %ymm2
; AVX512DQ-NEXT: vpcmpeqb %ymm6, %ymm2, %ymm5
; AVX512DQ-NEXT: vpshufb %ymm1, %ymm2, %ymm3
; AVX512DQ-NEXT: vpsrlw $4, %ymm1, %ymm4
; AVX512DQ-NEXT: vpand %ymm5, %ymm4, %ymm4
; AVX512DQ-NEXT: vpcmpeqb %ymm6, %ymm4, %ymm5
; AVX512DQ-NEXT: vpand %ymm5, %ymm3, %ymm3
; AVX512DQ-NEXT: vpshufb %ymm2, %ymm4, %ymm2
; AVX512DQ-NEXT: vpshufb %ymm4, %ymm2, %ymm2
; AVX512DQ-NEXT: vpaddb %ymm2, %ymm3, %ymm2
; AVX512DQ-NEXT: vpcmpeqb %ymm6, %ymm1, %ymm1
; AVX512DQ-NEXT: vpsrlw $8, %ymm1, %ymm1
@@ -445,16 +442,15 @@ define <32 x i16> @testv32i16u(<32 x i16> %in) nounwind {
;
; AVX512BW-LABEL: testv32i16u:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm4
; AVX512BW-NEXT: vptestnmb %zmm1, %zmm4, %k0
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm3
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vptestnmb %zmm4, %zmm3, %k0
; AVX512BW-NEXT: vpmovm2b %k0, %zmm5
; AVX512BW-NEXT: vpandq %zmm5, %zmm2, %zmm2
; AVX512BW-NEXT: vpandq %zmm1, %zmm4, %zmm1
; AVX512BW-NEXT: vpshufb %zmm1, %zmm3, %zmm1
; AVX512BW-NEXT: vpandq %zmm4, %zmm3, %zmm3
; AVX512BW-NEXT: vpshufb %zmm3, %zmm1, %zmm1
; AVX512BW-NEXT: vpaddb %zmm1, %zmm2, %zmm1
; AVX512BW-NEXT: vptestnmb %zmm0, %zmm0, %k0
; AVX512BW-NEXT: vpmovm2b %k0, %zmm0
@@ -466,29 +462,27 @@ define <32 x i16> @testv32i16u(<32 x i16> %in) nounwind {
;
; AVX512DQ-LABEL: testv32i16u:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512DQ-NEXT: vpand %ymm2, %ymm0, %ymm3
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX512DQ-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512DQ-NEXT: vpsrlw $4, %ymm0, %ymm5
; AVX512DQ-NEXT: vpand %ymm2, %ymm5, %ymm5
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX512DQ-NEXT: vpshufb %ymm0, %ymm2, %ymm3
; AVX512DQ-NEXT: vpsrlw $4, %ymm0, %ymm4
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512DQ-NEXT: vpand %ymm5, %ymm4, %ymm4
; AVX512DQ-NEXT: vpxor %xmm6, %xmm6, %xmm6
; AVX512DQ-NEXT: vpcmpeqb %ymm6, %ymm5, %ymm7
; AVX512DQ-NEXT: vpcmpeqb %ymm6, %ymm4, %ymm7
; AVX512DQ-NEXT: vpand %ymm7, %ymm3, %ymm3
; AVX512DQ-NEXT: vpshufb %ymm5, %ymm4, %ymm5
; AVX512DQ-NEXT: vpaddb %ymm5, %ymm3, %ymm3
; AVX512DQ-NEXT: vpshufb %ymm4, %ymm2, %ymm4
; AVX512DQ-NEXT: vpaddb %ymm4, %ymm3, %ymm3
; AVX512DQ-NEXT: vpcmpeqb %ymm6, %ymm0, %ymm0
; AVX512DQ-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX512DQ-NEXT: vpand %ymm0, %ymm3, %ymm0
; AVX512DQ-NEXT: vpsrlw $8, %ymm3, %ymm3
; AVX512DQ-NEXT: vpaddw %ymm0, %ymm3, %ymm0
; AVX512DQ-NEXT: vpand %ymm2, %ymm1, %ymm3
; AVX512DQ-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512DQ-NEXT: vpsrlw $4, %ymm1, %ymm5
; AVX512DQ-NEXT: vpand %ymm2, %ymm5, %ymm2
; AVX512DQ-NEXT: vpcmpeqb %ymm6, %ymm2, %ymm5
; AVX512DQ-NEXT: vpshufb %ymm1, %ymm2, %ymm3
; AVX512DQ-NEXT: vpsrlw $4, %ymm1, %ymm4
; AVX512DQ-NEXT: vpand %ymm5, %ymm4, %ymm4
; AVX512DQ-NEXT: vpcmpeqb %ymm6, %ymm4, %ymm5
; AVX512DQ-NEXT: vpand %ymm5, %ymm3, %ymm3
; AVX512DQ-NEXT: vpshufb %ymm2, %ymm4, %ymm2
; AVX512DQ-NEXT: vpshufb %ymm4, %ymm2, %ymm2
; AVX512DQ-NEXT: vpaddb %ymm2, %ymm3, %ymm2
; AVX512DQ-NEXT: vpcmpeqb %ymm6, %ymm1, %ymm1
; AVX512DQ-NEXT: vpsrlw $8, %ymm1, %ymm1
@@ -555,40 +549,37 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind {
;
; AVX512BW-LABEL: testv64i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vptestnmb %zmm1, %zmm0, %k0
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vptestnmb %zmm3, %zmm0, %k0
; AVX512BW-NEXT: vpmovm2b %k0, %zmm4
; AVX512BW-NEXT: vpandq %zmm4, %zmm2, %zmm2
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0
; AVX512BW-NEXT: vpandq %zmm3, %zmm0, %zmm0
; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: vpaddb %zmm0, %zmm2, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: testv64i8:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512DQ-NEXT: vpand %ymm2, %ymm0, %ymm3
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX512DQ-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX512DQ-NEXT: vpshufb %ymm0, %ymm2, %ymm3
; AVX512DQ-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512DQ-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512DQ-NEXT: vpand %ymm4, %ymm0, %ymm0
; AVX512DQ-NEXT: vpxor %xmm5, %xmm5, %xmm5
; AVX512DQ-NEXT: vpcmpeqb %ymm5, %ymm0, %ymm6
; AVX512DQ-NEXT: vpand %ymm6, %ymm3, %ymm3
; AVX512DQ-NEXT: vpshufb %ymm0, %ymm4, %ymm0
; AVX512DQ-NEXT: vpshufb %ymm0, %ymm2, %ymm0
; AVX512DQ-NEXT: vpaddb %ymm0, %ymm3, %ymm0
; AVX512DQ-NEXT: vpand %ymm2, %ymm1, %ymm3
; AVX512DQ-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512DQ-NEXT: vpshufb %ymm1, %ymm2, %ymm3
; AVX512DQ-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512DQ-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX512DQ-NEXT: vpcmpeqb %ymm5, %ymm1, %ymm2
; AVX512DQ-NEXT: vpand %ymm2, %ymm3, %ymm2
; AVX512DQ-NEXT: vpshufb %ymm1, %ymm4, %ymm1
; AVX512DQ-NEXT: vpaddb %ymm1, %ymm2, %ymm1
; AVX512DQ-NEXT: vpand %ymm4, %ymm1, %ymm1
; AVX512DQ-NEXT: vpcmpeqb %ymm5, %ymm1, %ymm4
; AVX512DQ-NEXT: vpand %ymm4, %ymm3, %ymm3
; AVX512DQ-NEXT: vpshufb %ymm1, %ymm2, %ymm1
; AVX512DQ-NEXT: vpaddb %ymm1, %ymm3, %ymm1
; AVX512DQ-NEXT: retq
%out = call <64 x i8> @llvm.ctlz.v64i8(<64 x i8> %in, i1 0)
ret <64 x i8> %out
@@ -649,40 +640,37 @@ define <64 x i8> @testv64i8u(<64 x i8> %in) nounwind {
;
; AVX512BW-LABEL: testv64i8u:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vptestnmb %zmm1, %zmm0, %k0
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vptestnmb %zmm3, %zmm0, %k0
; AVX512BW-NEXT: vpmovm2b %k0, %zmm4
; AVX512BW-NEXT: vpandq %zmm4, %zmm2, %zmm2
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0
; AVX512BW-NEXT: vpandq %zmm3, %zmm0, %zmm0
; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: vpaddb %zmm0, %zmm2, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: testv64i8u:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512DQ-NEXT: vpand %ymm2, %ymm0, %ymm3
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX512DQ-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX512DQ-NEXT: vpshufb %ymm0, %ymm2, %ymm3
; AVX512DQ-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512DQ-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512DQ-NEXT: vpand %ymm4, %ymm0, %ymm0
; AVX512DQ-NEXT: vpxor %xmm5, %xmm5, %xmm5
; AVX512DQ-NEXT: vpcmpeqb %ymm5, %ymm0, %ymm6
; AVX512DQ-NEXT: vpand %ymm6, %ymm3, %ymm3
; AVX512DQ-NEXT: vpshufb %ymm0, %ymm4, %ymm0
; AVX512DQ-NEXT: vpshufb %ymm0, %ymm2, %ymm0
; AVX512DQ-NEXT: vpaddb %ymm0, %ymm3, %ymm0
; AVX512DQ-NEXT: vpand %ymm2, %ymm1, %ymm3
; AVX512DQ-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512DQ-NEXT: vpshufb %ymm1, %ymm2, %ymm3
; AVX512DQ-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512DQ-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX512DQ-NEXT: vpcmpeqb %ymm5, %ymm1, %ymm2
; AVX512DQ-NEXT: vpand %ymm2, %ymm3, %ymm2
; AVX512DQ-NEXT: vpshufb %ymm1, %ymm4, %ymm1
; AVX512DQ-NEXT: vpaddb %ymm1, %ymm2, %ymm1
; AVX512DQ-NEXT: vpand %ymm4, %ymm1, %ymm1
; AVX512DQ-NEXT: vpcmpeqb %ymm5, %ymm1, %ymm4
; AVX512DQ-NEXT: vpand %ymm4, %ymm3, %ymm3
; AVX512DQ-NEXT: vpshufb %ymm1, %ymm2, %ymm1
; AVX512DQ-NEXT: vpaddb %ymm1, %ymm3, %ymm1
; AVX512DQ-NEXT: retq
%out = call <64 x i8> @llvm.ctlz.v64i8(<64 x i8> %in, i1 -1)
ret <64 x i8> %out