[X86][SSE] Remove unnecessary bit-and in pshufb vector ctlz (PR39703)
The SSE PSHUFB vector ctlz lowering works at the i4 nibble level. As detailed in PR39703, we were isolating the lower nibble with an AND mask, but the lower-nibble lookup result is only actually used in the case where the upper nibble is known to be zero, making it safe to remove the mask and save an instruction. Differential Revision: https://reviews.llvm.org/D54707 llvm-svn: 347242
This commit is contained in:
parent
5a47dc607e
commit
c4861ab170
|
@ -23103,9 +23103,8 @@ static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
|
|||
SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
|
||||
SDValue Zero = getZeroVector(CurrVT, Subtarget, DAG, DL);
|
||||
|
||||
SDValue NibbleMask = DAG.getConstant(0xF, DL, CurrVT);
|
||||
SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
|
||||
SDValue Lo = DAG.getNode(ISD::AND, DL, CurrVT, Op0, NibbleMask);
|
||||
SDValue Lo = Op0;
|
||||
SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
|
||||
SDValue HiZ;
|
||||
if (CurrVT.is512BitVector()) {
|
||||
|
|
|
@ -347,50 +347,48 @@ define <4 x i32> @combine_vec_lshr_lzcnt_bit1(<4 x i32> %x) {
|
|||
; SSE-LABEL: combine_vec_lshr_lzcnt_bit1:
|
||||
; SSE: # %bb.0:
|
||||
; SSE-NEXT: pand {{.*}}(%rip), %xmm0
|
||||
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
|
||||
; SSE-NEXT: movdqa %xmm2, %xmm3
|
||||
; SSE-NEXT: pshufb %xmm0, %xmm3
|
||||
; SSE-NEXT: movdqa %xmm0, %xmm1
|
||||
; SSE-NEXT: psrlw $4, %xmm1
|
||||
; SSE-NEXT: pxor %xmm2, %xmm2
|
||||
; SSE-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
|
||||
; SSE-NEXT: movdqa %xmm3, %xmm4
|
||||
; SSE-NEXT: pshufb %xmm1, %xmm3
|
||||
; SSE-NEXT: pcmpeqb %xmm2, %xmm1
|
||||
; SSE-NEXT: movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
|
||||
; SSE-NEXT: pand %xmm0, %xmm5
|
||||
; SSE-NEXT: pshufb %xmm5, %xmm4
|
||||
; SSE-NEXT: pand %xmm1, %xmm4
|
||||
; SSE-NEXT: paddb %xmm4, %xmm3
|
||||
; SSE-NEXT: movdqa %xmm0, %xmm1
|
||||
; SSE-NEXT: pcmpeqb %xmm2, %xmm1
|
||||
; SSE-NEXT: psrlw $8, %xmm1
|
||||
; SSE-NEXT: pxor %xmm4, %xmm4
|
||||
; SSE-NEXT: pshufb %xmm1, %xmm2
|
||||
; SSE-NEXT: pcmpeqb %xmm4, %xmm1
|
||||
; SSE-NEXT: pand %xmm3, %xmm1
|
||||
; SSE-NEXT: psrlw $8, %xmm3
|
||||
; SSE-NEXT: paddw %xmm1, %xmm3
|
||||
; SSE-NEXT: pcmpeqw %xmm2, %xmm0
|
||||
; SSE-NEXT: paddb %xmm2, %xmm1
|
||||
; SSE-NEXT: movdqa %xmm0, %xmm2
|
||||
; SSE-NEXT: pcmpeqb %xmm4, %xmm2
|
||||
; SSE-NEXT: psrlw $8, %xmm2
|
||||
; SSE-NEXT: pand %xmm1, %xmm2
|
||||
; SSE-NEXT: psrlw $8, %xmm1
|
||||
; SSE-NEXT: paddw %xmm2, %xmm1
|
||||
; SSE-NEXT: pcmpeqw %xmm4, %xmm0
|
||||
; SSE-NEXT: psrld $16, %xmm0
|
||||
; SSE-NEXT: pand %xmm3, %xmm0
|
||||
; SSE-NEXT: psrld $16, %xmm3
|
||||
; SSE-NEXT: paddd %xmm3, %xmm0
|
||||
; SSE-NEXT: psrld $5, %xmm0
|
||||
; SSE-NEXT: pand %xmm1, %xmm0
|
||||
; SSE-NEXT: psrld $16, %xmm1
|
||||
; SSE-NEXT: paddd %xmm0, %xmm1
|
||||
; SSE-NEXT: psrld $5, %xmm1
|
||||
; SSE-NEXT: movdqa %xmm1, %xmm0
|
||||
; SSE-NEXT: retq
|
||||
;
|
||||
; AVX-LABEL: combine_vec_lshr_lzcnt_bit1:
|
||||
; AVX: # %bb.0:
|
||||
; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
|
||||
; AVX-NEXT: vpsrlw $4, %xmm0, %xmm1
|
||||
; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
|
||||
; AVX-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm3
|
||||
; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm4
|
||||
; AVX-NEXT: vmovdqa {{.*#+}} xmm5 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
|
||||
; AVX-NEXT: vpshufb %xmm4, %xmm5, %xmm4
|
||||
; AVX-NEXT: vpand %xmm3, %xmm4, %xmm3
|
||||
; AVX-NEXT: vpshufb %xmm1, %xmm5, %xmm1
|
||||
; AVX-NEXT: vpaddb %xmm1, %xmm3, %xmm1
|
||||
; AVX-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm3
|
||||
; AVX-NEXT: vpsrlw $8, %xmm3, %xmm3
|
||||
; AVX-NEXT: vpand %xmm3, %xmm1, %xmm3
|
||||
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
|
||||
; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm2
|
||||
; AVX-NEXT: vpsrlw $4, %xmm0, %xmm3
|
||||
; AVX-NEXT: vpxor %xmm4, %xmm4, %xmm4
|
||||
; AVX-NEXT: vpcmpeqb %xmm4, %xmm3, %xmm5
|
||||
; AVX-NEXT: vpand %xmm5, %xmm2, %xmm2
|
||||
; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1
|
||||
; AVX-NEXT: vpaddb %xmm1, %xmm2, %xmm1
|
||||
; AVX-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm2
|
||||
; AVX-NEXT: vpsrlw $8, %xmm2, %xmm2
|
||||
; AVX-NEXT: vpand %xmm2, %xmm1, %xmm2
|
||||
; AVX-NEXT: vpsrlw $8, %xmm1, %xmm1
|
||||
; AVX-NEXT: vpaddw %xmm3, %xmm1, %xmm1
|
||||
; AVX-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm0
|
||||
; AVX-NEXT: vpaddw %xmm2, %xmm1, %xmm1
|
||||
; AVX-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm0
|
||||
; AVX-NEXT: vpsrld $16, %xmm0, %xmm0
|
||||
; AVX-NEXT: vpand %xmm0, %xmm1, %xmm0
|
||||
; AVX-NEXT: vpsrld $16, %xmm1, %xmm1
|
||||
|
|
|
@ -38,17 +38,15 @@ define <8 x i16> @testv8i16(<8 x i16> %in) {
|
|||
define <16 x i8> @testv16i8(<16 x i8> %in) {
|
||||
; AVX256-LABEL: testv16i8:
|
||||
; AVX256: # %bb.0:
|
||||
; AVX256-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
|
||||
; AVX256-NEXT: vpand %xmm1, %xmm0, %xmm2
|
||||
; AVX256-NEXT: vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
|
||||
; AVX256-NEXT: vpshufb %xmm2, %xmm3, %xmm2
|
||||
; AVX256-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
|
||||
; AVX256-NEXT: vpshufb %xmm0, %xmm1, %xmm2
|
||||
; AVX256-NEXT: vpsrlw $4, %xmm0, %xmm0
|
||||
; AVX256-NEXT: vpand %xmm1, %xmm0, %xmm0
|
||||
; AVX256-NEXT: vpxor %xmm1, %xmm1, %xmm1
|
||||
; AVX256-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm1
|
||||
; AVX256-NEXT: vpand %xmm1, %xmm2, %xmm1
|
||||
; AVX256-NEXT: vpshufb %xmm0, %xmm3, %xmm0
|
||||
; AVX256-NEXT: vpaddb %xmm0, %xmm1, %xmm0
|
||||
; AVX256-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
|
||||
; AVX256-NEXT: vpxor %xmm3, %xmm3, %xmm3
|
||||
; AVX256-NEXT: vpcmpeqb %xmm3, %xmm0, %xmm3
|
||||
; AVX256-NEXT: vpand %xmm3, %xmm2, %xmm2
|
||||
; AVX256-NEXT: vpshufb %xmm0, %xmm1, %xmm0
|
||||
; AVX256-NEXT: vpaddb %xmm0, %xmm2, %xmm0
|
||||
; AVX256-NEXT: retq
|
||||
;
|
||||
; AVX512-LABEL: testv16i8:
|
||||
|
@ -93,17 +91,15 @@ define <16 x i16> @testv16i16(<16 x i16> %in) {
|
|||
define <32 x i8> @testv32i8(<32 x i8> %in) {
|
||||
; AVX256-LABEL: testv32i8:
|
||||
; AVX256: # %bb.0:
|
||||
; AVX256-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
|
||||
; AVX256-NEXT: vpand %ymm1, %ymm0, %ymm2
|
||||
; AVX256-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
|
||||
; AVX256-NEXT: vpshufb %ymm2, %ymm3, %ymm2
|
||||
; AVX256-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
|
||||
; AVX256-NEXT: vpshufb %ymm0, %ymm1, %ymm2
|
||||
; AVX256-NEXT: vpsrlw $4, %ymm0, %ymm0
|
||||
; AVX256-NEXT: vpand %ymm1, %ymm0, %ymm0
|
||||
; AVX256-NEXT: vpxor %xmm1, %xmm1, %xmm1
|
||||
; AVX256-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm1
|
||||
; AVX256-NEXT: vpand %ymm1, %ymm2, %ymm1
|
||||
; AVX256-NEXT: vpshufb %ymm0, %ymm3, %ymm0
|
||||
; AVX256-NEXT: vpaddb %ymm0, %ymm1, %ymm0
|
||||
; AVX256-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
|
||||
; AVX256-NEXT: vpxor %xmm3, %xmm3, %xmm3
|
||||
; AVX256-NEXT: vpcmpeqb %ymm3, %ymm0, %ymm3
|
||||
; AVX256-NEXT: vpand %ymm3, %ymm2, %ymm2
|
||||
; AVX256-NEXT: vpshufb %ymm0, %ymm1, %ymm0
|
||||
; AVX256-NEXT: vpaddb %ymm0, %ymm2, %ymm0
|
||||
; AVX256-NEXT: retq
|
||||
;
|
||||
; AVX512-LABEL: testv32i8:
|
||||
|
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -359,16 +359,15 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind {
|
|||
;
|
||||
; AVX512BW-LABEL: testv32i16:
|
||||
; AVX512BW: # %bb.0:
|
||||
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
|
||||
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
|
||||
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
|
||||
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
|
||||
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm4
|
||||
; AVX512BW-NEXT: vptestnmb %zmm1, %zmm4, %k0
|
||||
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
|
||||
; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm2
|
||||
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm3
|
||||
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
|
||||
; AVX512BW-NEXT: vptestnmb %zmm4, %zmm3, %k0
|
||||
; AVX512BW-NEXT: vpmovm2b %k0, %zmm5
|
||||
; AVX512BW-NEXT: vpandq %zmm5, %zmm2, %zmm2
|
||||
; AVX512BW-NEXT: vpandq %zmm1, %zmm4, %zmm1
|
||||
; AVX512BW-NEXT: vpshufb %zmm1, %zmm3, %zmm1
|
||||
; AVX512BW-NEXT: vpandq %zmm4, %zmm3, %zmm3
|
||||
; AVX512BW-NEXT: vpshufb %zmm3, %zmm1, %zmm1
|
||||
; AVX512BW-NEXT: vpaddb %zmm1, %zmm2, %zmm1
|
||||
; AVX512BW-NEXT: vptestnmb %zmm0, %zmm0, %k0
|
||||
; AVX512BW-NEXT: vpmovm2b %k0, %zmm0
|
||||
|
@ -380,29 +379,27 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind {
|
|||
;
|
||||
; AVX512DQ-LABEL: testv32i16:
|
||||
; AVX512DQ: # %bb.0:
|
||||
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
|
||||
; AVX512DQ-NEXT: vpand %ymm2, %ymm0, %ymm3
|
||||
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
|
||||
; AVX512DQ-NEXT: vpshufb %ymm3, %ymm4, %ymm3
|
||||
; AVX512DQ-NEXT: vpsrlw $4, %ymm0, %ymm5
|
||||
; AVX512DQ-NEXT: vpand %ymm2, %ymm5, %ymm5
|
||||
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
|
||||
; AVX512DQ-NEXT: vpshufb %ymm0, %ymm2, %ymm3
|
||||
; AVX512DQ-NEXT: vpsrlw $4, %ymm0, %ymm4
|
||||
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
|
||||
; AVX512DQ-NEXT: vpand %ymm5, %ymm4, %ymm4
|
||||
; AVX512DQ-NEXT: vpxor %xmm6, %xmm6, %xmm6
|
||||
; AVX512DQ-NEXT: vpcmpeqb %ymm6, %ymm5, %ymm7
|
||||
; AVX512DQ-NEXT: vpcmpeqb %ymm6, %ymm4, %ymm7
|
||||
; AVX512DQ-NEXT: vpand %ymm7, %ymm3, %ymm3
|
||||
; AVX512DQ-NEXT: vpshufb %ymm5, %ymm4, %ymm5
|
||||
; AVX512DQ-NEXT: vpaddb %ymm5, %ymm3, %ymm3
|
||||
; AVX512DQ-NEXT: vpshufb %ymm4, %ymm2, %ymm4
|
||||
; AVX512DQ-NEXT: vpaddb %ymm4, %ymm3, %ymm3
|
||||
; AVX512DQ-NEXT: vpcmpeqb %ymm6, %ymm0, %ymm0
|
||||
; AVX512DQ-NEXT: vpsrlw $8, %ymm0, %ymm0
|
||||
; AVX512DQ-NEXT: vpand %ymm0, %ymm3, %ymm0
|
||||
; AVX512DQ-NEXT: vpsrlw $8, %ymm3, %ymm3
|
||||
; AVX512DQ-NEXT: vpaddw %ymm0, %ymm3, %ymm0
|
||||
; AVX512DQ-NEXT: vpand %ymm2, %ymm1, %ymm3
|
||||
; AVX512DQ-NEXT: vpshufb %ymm3, %ymm4, %ymm3
|
||||
; AVX512DQ-NEXT: vpsrlw $4, %ymm1, %ymm5
|
||||
; AVX512DQ-NEXT: vpand %ymm2, %ymm5, %ymm2
|
||||
; AVX512DQ-NEXT: vpcmpeqb %ymm6, %ymm2, %ymm5
|
||||
; AVX512DQ-NEXT: vpshufb %ymm1, %ymm2, %ymm3
|
||||
; AVX512DQ-NEXT: vpsrlw $4, %ymm1, %ymm4
|
||||
; AVX512DQ-NEXT: vpand %ymm5, %ymm4, %ymm4
|
||||
; AVX512DQ-NEXT: vpcmpeqb %ymm6, %ymm4, %ymm5
|
||||
; AVX512DQ-NEXT: vpand %ymm5, %ymm3, %ymm3
|
||||
; AVX512DQ-NEXT: vpshufb %ymm2, %ymm4, %ymm2
|
||||
; AVX512DQ-NEXT: vpshufb %ymm4, %ymm2, %ymm2
|
||||
; AVX512DQ-NEXT: vpaddb %ymm2, %ymm3, %ymm2
|
||||
; AVX512DQ-NEXT: vpcmpeqb %ymm6, %ymm1, %ymm1
|
||||
; AVX512DQ-NEXT: vpsrlw $8, %ymm1, %ymm1
|
||||
|
@ -445,16 +442,15 @@ define <32 x i16> @testv32i16u(<32 x i16> %in) nounwind {
|
|||
;
|
||||
; AVX512BW-LABEL: testv32i16u:
|
||||
; AVX512BW: # %bb.0:
|
||||
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
|
||||
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
|
||||
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
|
||||
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
|
||||
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm4
|
||||
; AVX512BW-NEXT: vptestnmb %zmm1, %zmm4, %k0
|
||||
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
|
||||
; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm2
|
||||
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm3
|
||||
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
|
||||
; AVX512BW-NEXT: vptestnmb %zmm4, %zmm3, %k0
|
||||
; AVX512BW-NEXT: vpmovm2b %k0, %zmm5
|
||||
; AVX512BW-NEXT: vpandq %zmm5, %zmm2, %zmm2
|
||||
; AVX512BW-NEXT: vpandq %zmm1, %zmm4, %zmm1
|
||||
; AVX512BW-NEXT: vpshufb %zmm1, %zmm3, %zmm1
|
||||
; AVX512BW-NEXT: vpandq %zmm4, %zmm3, %zmm3
|
||||
; AVX512BW-NEXT: vpshufb %zmm3, %zmm1, %zmm1
|
||||
; AVX512BW-NEXT: vpaddb %zmm1, %zmm2, %zmm1
|
||||
; AVX512BW-NEXT: vptestnmb %zmm0, %zmm0, %k0
|
||||
; AVX512BW-NEXT: vpmovm2b %k0, %zmm0
|
||||
|
@ -466,29 +462,27 @@ define <32 x i16> @testv32i16u(<32 x i16> %in) nounwind {
|
|||
;
|
||||
; AVX512DQ-LABEL: testv32i16u:
|
||||
; AVX512DQ: # %bb.0:
|
||||
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
|
||||
; AVX512DQ-NEXT: vpand %ymm2, %ymm0, %ymm3
|
||||
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
|
||||
; AVX512DQ-NEXT: vpshufb %ymm3, %ymm4, %ymm3
|
||||
; AVX512DQ-NEXT: vpsrlw $4, %ymm0, %ymm5
|
||||
; AVX512DQ-NEXT: vpand %ymm2, %ymm5, %ymm5
|
||||
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
|
||||
; AVX512DQ-NEXT: vpshufb %ymm0, %ymm2, %ymm3
|
||||
; AVX512DQ-NEXT: vpsrlw $4, %ymm0, %ymm4
|
||||
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
|
||||
; AVX512DQ-NEXT: vpand %ymm5, %ymm4, %ymm4
|
||||
; AVX512DQ-NEXT: vpxor %xmm6, %xmm6, %xmm6
|
||||
; AVX512DQ-NEXT: vpcmpeqb %ymm6, %ymm5, %ymm7
|
||||
; AVX512DQ-NEXT: vpcmpeqb %ymm6, %ymm4, %ymm7
|
||||
; AVX512DQ-NEXT: vpand %ymm7, %ymm3, %ymm3
|
||||
; AVX512DQ-NEXT: vpshufb %ymm5, %ymm4, %ymm5
|
||||
; AVX512DQ-NEXT: vpaddb %ymm5, %ymm3, %ymm3
|
||||
; AVX512DQ-NEXT: vpshufb %ymm4, %ymm2, %ymm4
|
||||
; AVX512DQ-NEXT: vpaddb %ymm4, %ymm3, %ymm3
|
||||
; AVX512DQ-NEXT: vpcmpeqb %ymm6, %ymm0, %ymm0
|
||||
; AVX512DQ-NEXT: vpsrlw $8, %ymm0, %ymm0
|
||||
; AVX512DQ-NEXT: vpand %ymm0, %ymm3, %ymm0
|
||||
; AVX512DQ-NEXT: vpsrlw $8, %ymm3, %ymm3
|
||||
; AVX512DQ-NEXT: vpaddw %ymm0, %ymm3, %ymm0
|
||||
; AVX512DQ-NEXT: vpand %ymm2, %ymm1, %ymm3
|
||||
; AVX512DQ-NEXT: vpshufb %ymm3, %ymm4, %ymm3
|
||||
; AVX512DQ-NEXT: vpsrlw $4, %ymm1, %ymm5
|
||||
; AVX512DQ-NEXT: vpand %ymm2, %ymm5, %ymm2
|
||||
; AVX512DQ-NEXT: vpcmpeqb %ymm6, %ymm2, %ymm5
|
||||
; AVX512DQ-NEXT: vpshufb %ymm1, %ymm2, %ymm3
|
||||
; AVX512DQ-NEXT: vpsrlw $4, %ymm1, %ymm4
|
||||
; AVX512DQ-NEXT: vpand %ymm5, %ymm4, %ymm4
|
||||
; AVX512DQ-NEXT: vpcmpeqb %ymm6, %ymm4, %ymm5
|
||||
; AVX512DQ-NEXT: vpand %ymm5, %ymm3, %ymm3
|
||||
; AVX512DQ-NEXT: vpshufb %ymm2, %ymm4, %ymm2
|
||||
; AVX512DQ-NEXT: vpshufb %ymm4, %ymm2, %ymm2
|
||||
; AVX512DQ-NEXT: vpaddb %ymm2, %ymm3, %ymm2
|
||||
; AVX512DQ-NEXT: vpcmpeqb %ymm6, %ymm1, %ymm1
|
||||
; AVX512DQ-NEXT: vpsrlw $8, %ymm1, %ymm1
|
||||
|
@ -555,40 +549,37 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind {
|
|||
;
|
||||
; AVX512BW-LABEL: testv64i8:
|
||||
; AVX512BW: # %bb.0:
|
||||
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
|
||||
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
|
||||
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
|
||||
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
|
||||
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
|
||||
; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm2
|
||||
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
|
||||
; AVX512BW-NEXT: vptestnmb %zmm1, %zmm0, %k0
|
||||
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
|
||||
; AVX512BW-NEXT: vptestnmb %zmm3, %zmm0, %k0
|
||||
; AVX512BW-NEXT: vpmovm2b %k0, %zmm4
|
||||
; AVX512BW-NEXT: vpandq %zmm4, %zmm2, %zmm2
|
||||
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
|
||||
; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0
|
||||
; AVX512BW-NEXT: vpandq %zmm3, %zmm0, %zmm0
|
||||
; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0
|
||||
; AVX512BW-NEXT: vpaddb %zmm0, %zmm2, %zmm0
|
||||
; AVX512BW-NEXT: retq
|
||||
;
|
||||
; AVX512DQ-LABEL: testv64i8:
|
||||
; AVX512DQ: # %bb.0:
|
||||
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
|
||||
; AVX512DQ-NEXT: vpand %ymm2, %ymm0, %ymm3
|
||||
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
|
||||
; AVX512DQ-NEXT: vpshufb %ymm3, %ymm4, %ymm3
|
||||
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
|
||||
; AVX512DQ-NEXT: vpshufb %ymm0, %ymm2, %ymm3
|
||||
; AVX512DQ-NEXT: vpsrlw $4, %ymm0, %ymm0
|
||||
; AVX512DQ-NEXT: vpand %ymm2, %ymm0, %ymm0
|
||||
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
|
||||
; AVX512DQ-NEXT: vpand %ymm4, %ymm0, %ymm0
|
||||
; AVX512DQ-NEXT: vpxor %xmm5, %xmm5, %xmm5
|
||||
; AVX512DQ-NEXT: vpcmpeqb %ymm5, %ymm0, %ymm6
|
||||
; AVX512DQ-NEXT: vpand %ymm6, %ymm3, %ymm3
|
||||
; AVX512DQ-NEXT: vpshufb %ymm0, %ymm4, %ymm0
|
||||
; AVX512DQ-NEXT: vpshufb %ymm0, %ymm2, %ymm0
|
||||
; AVX512DQ-NEXT: vpaddb %ymm0, %ymm3, %ymm0
|
||||
; AVX512DQ-NEXT: vpand %ymm2, %ymm1, %ymm3
|
||||
; AVX512DQ-NEXT: vpshufb %ymm3, %ymm4, %ymm3
|
||||
; AVX512DQ-NEXT: vpshufb %ymm1, %ymm2, %ymm3
|
||||
; AVX512DQ-NEXT: vpsrlw $4, %ymm1, %ymm1
|
||||
; AVX512DQ-NEXT: vpand %ymm2, %ymm1, %ymm1
|
||||
; AVX512DQ-NEXT: vpcmpeqb %ymm5, %ymm1, %ymm2
|
||||
; AVX512DQ-NEXT: vpand %ymm2, %ymm3, %ymm2
|
||||
; AVX512DQ-NEXT: vpshufb %ymm1, %ymm4, %ymm1
|
||||
; AVX512DQ-NEXT: vpaddb %ymm1, %ymm2, %ymm1
|
||||
; AVX512DQ-NEXT: vpand %ymm4, %ymm1, %ymm1
|
||||
; AVX512DQ-NEXT: vpcmpeqb %ymm5, %ymm1, %ymm4
|
||||
; AVX512DQ-NEXT: vpand %ymm4, %ymm3, %ymm3
|
||||
; AVX512DQ-NEXT: vpshufb %ymm1, %ymm2, %ymm1
|
||||
; AVX512DQ-NEXT: vpaddb %ymm1, %ymm3, %ymm1
|
||||
; AVX512DQ-NEXT: retq
|
||||
%out = call <64 x i8> @llvm.ctlz.v64i8(<64 x i8> %in, i1 0)
|
||||
ret <64 x i8> %out
|
||||
|
@ -649,40 +640,37 @@ define <64 x i8> @testv64i8u(<64 x i8> %in) nounwind {
|
|||
;
|
||||
; AVX512BW-LABEL: testv64i8u:
|
||||
; AVX512BW: # %bb.0:
|
||||
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
|
||||
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
|
||||
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
|
||||
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
|
||||
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
|
||||
; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm2
|
||||
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
|
||||
; AVX512BW-NEXT: vptestnmb %zmm1, %zmm0, %k0
|
||||
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
|
||||
; AVX512BW-NEXT: vptestnmb %zmm3, %zmm0, %k0
|
||||
; AVX512BW-NEXT: vpmovm2b %k0, %zmm4
|
||||
; AVX512BW-NEXT: vpandq %zmm4, %zmm2, %zmm2
|
||||
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
|
||||
; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0
|
||||
; AVX512BW-NEXT: vpandq %zmm3, %zmm0, %zmm0
|
||||
; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0
|
||||
; AVX512BW-NEXT: vpaddb %zmm0, %zmm2, %zmm0
|
||||
; AVX512BW-NEXT: retq
|
||||
;
|
||||
; AVX512DQ-LABEL: testv64i8u:
|
||||
; AVX512DQ: # %bb.0:
|
||||
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
|
||||
; AVX512DQ-NEXT: vpand %ymm2, %ymm0, %ymm3
|
||||
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
|
||||
; AVX512DQ-NEXT: vpshufb %ymm3, %ymm4, %ymm3
|
||||
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
|
||||
; AVX512DQ-NEXT: vpshufb %ymm0, %ymm2, %ymm3
|
||||
; AVX512DQ-NEXT: vpsrlw $4, %ymm0, %ymm0
|
||||
; AVX512DQ-NEXT: vpand %ymm2, %ymm0, %ymm0
|
||||
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
|
||||
; AVX512DQ-NEXT: vpand %ymm4, %ymm0, %ymm0
|
||||
; AVX512DQ-NEXT: vpxor %xmm5, %xmm5, %xmm5
|
||||
; AVX512DQ-NEXT: vpcmpeqb %ymm5, %ymm0, %ymm6
|
||||
; AVX512DQ-NEXT: vpand %ymm6, %ymm3, %ymm3
|
||||
; AVX512DQ-NEXT: vpshufb %ymm0, %ymm4, %ymm0
|
||||
; AVX512DQ-NEXT: vpshufb %ymm0, %ymm2, %ymm0
|
||||
; AVX512DQ-NEXT: vpaddb %ymm0, %ymm3, %ymm0
|
||||
; AVX512DQ-NEXT: vpand %ymm2, %ymm1, %ymm3
|
||||
; AVX512DQ-NEXT: vpshufb %ymm3, %ymm4, %ymm3
|
||||
; AVX512DQ-NEXT: vpshufb %ymm1, %ymm2, %ymm3
|
||||
; AVX512DQ-NEXT: vpsrlw $4, %ymm1, %ymm1
|
||||
; AVX512DQ-NEXT: vpand %ymm2, %ymm1, %ymm1
|
||||
; AVX512DQ-NEXT: vpcmpeqb %ymm5, %ymm1, %ymm2
|
||||
; AVX512DQ-NEXT: vpand %ymm2, %ymm3, %ymm2
|
||||
; AVX512DQ-NEXT: vpshufb %ymm1, %ymm4, %ymm1
|
||||
; AVX512DQ-NEXT: vpaddb %ymm1, %ymm2, %ymm1
|
||||
; AVX512DQ-NEXT: vpand %ymm4, %ymm1, %ymm1
|
||||
; AVX512DQ-NEXT: vpcmpeqb %ymm5, %ymm1, %ymm4
|
||||
; AVX512DQ-NEXT: vpand %ymm4, %ymm3, %ymm3
|
||||
; AVX512DQ-NEXT: vpshufb %ymm1, %ymm2, %ymm1
|
||||
; AVX512DQ-NEXT: vpaddb %ymm1, %ymm3, %ymm1
|
||||
; AVX512DQ-NEXT: retq
|
||||
%out = call <64 x i8> @llvm.ctlz.v64i8(<64 x i8> %in, i1 -1)
|
||||
ret <64 x i8> %out
|
||||
|
|
Loading…
Reference in New Issue