From 65165d54bbfc2baae569f09014a040a491e76081 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Fri, 15 Mar 2019 16:16:49 +0000 Subject: [PATCH] [X86] Add SimplifyDemandedBitsForTargetNode support for PINSRB/PINSRW llvm-svn: 356270 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 47 ++- llvm/test/CodeGen/X86/avg.ll | 344 ++++++++++----------- llvm/test/CodeGen/X86/vector-sext-widen.ll | 46 +-- llvm/test/CodeGen/X86/vector-sext.ll | 46 +-- 4 files changed, 260 insertions(+), 223 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 222a6e82729a..d862469db84f 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -33353,6 +33353,38 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode( } break; } + case X86ISD::PINSRB: + case X86ISD::PINSRW: { + SDValue Vec = Op.getOperand(0); + SDValue Scl = Op.getOperand(1); + auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2)); + MVT VecVT = Vec.getSimpleValueType(); + + if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements())) { + unsigned Idx = CIdx->getZExtValue(); + if (!OriginalDemandedElts[Idx]) + return TLO.CombineTo(Op, Vec); + + KnownBits KnownVec; + APInt DemandedVecElts(OriginalDemandedElts); + DemandedVecElts.clearBit(Idx); + if (SimplifyDemandedBits(Vec, OriginalDemandedBits, DemandedVecElts, + KnownVec, TLO, Depth + 1)) + return true; + + KnownBits KnownScl; + unsigned NumSclBits = Scl.getScalarValueSizeInBits(); + APInt DemandedSclBits = OriginalDemandedBits.zext(NumSclBits); + if (SimplifyDemandedBits(Scl, DemandedSclBits, KnownScl, TLO, Depth + 1)) + return true; + + KnownScl = KnownScl.trunc(VecVT.getScalarSizeInBits()); + Known.One = KnownVec.One & KnownScl.One; + Known.Zero = KnownVec.Zero & KnownScl.Zero; + return false; + } + break; + } case X86ISD::PCMPGT: // icmp sgt(0, R) == ashr(R, BitWidth-1). // iff we only need the sign bit then we can use R directly. 
@@ -36634,11 +36666,16 @@ static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG, static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { - assert( - ((N->getOpcode() == X86ISD::PINSRB && N->getValueType(0) == MVT::v16i8) || - (N->getOpcode() == X86ISD::PINSRW && - N->getValueType(0) == MVT::v8i16)) && - "Unexpected vector insertion"); + EVT VT = N->getValueType(0); + assert(((N->getOpcode() == X86ISD::PINSRB && VT == MVT::v16i8) || + (N->getOpcode() == X86ISD::PINSRW && VT == MVT::v8i16)) && + "Unexpected vector insertion"); + + unsigned NumBitsPerElt = VT.getScalarSizeInBits(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (TLI.SimplifyDemandedBits(SDValue(N, 0), + APInt::getAllOnesValue(NumBitsPerElt), DCI)) + return SDValue(N, 0); // Attempt to combine PINSRB/PINSRW patterns to a shuffle. SDValue Op(N, 0); diff --git a/llvm/test/CodeGen/X86/avg.ll b/llvm/test/CodeGen/X86/avg.ll index f9678b92f666..943fb4e3a8f4 100644 --- a/llvm/test/CodeGen/X86/avg.ll +++ b/llvm/test/CodeGen/X86/avg.ll @@ -2173,115 +2173,115 @@ define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind ; AVX2-NEXT: pushq %r12 ; AVX2-NEXT: pushq %rbx ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero ; AVX2-NEXT: 
vpmovzxwd {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm0 -; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm0 -; AVX2-NEXT: vpextrq $1, %xmm4, %r14 -; AVX2-NEXT: vmovq %xmm4, %r13 -; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX2-NEXT: vpextrq $1, %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT: vmovq %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT: vmovq %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT: vpextrq $1, %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm10 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm6 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm5 -; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm4 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm7 -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm5 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero +; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm4 +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm9 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX2-NEXT: vextracti128 $1, %ymm9, %xmm7 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm1 ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = 
xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; AVX2-NEXT: vmovd %xmm4, %r12d -; AVX2-NEXT: vpextrd $2, %xmm4, %r15d -; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpextrq $1, %xmm2, %r15 +; AVX2-NEXT: vmovq %xmm2, %r14 +; AVX2-NEXT: vpextrq $1, %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX2-NEXT: vmovq %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX2-NEXT: vextracti128 $1, %ymm10, %xmm1 +; AVX2-NEXT: vpextrq $1, %xmm1, %r13 +; AVX2-NEXT: vmovq %xmm1, %r11 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm11 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm8 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX2-NEXT: vextracti128 $1, %ymm8, %xmm1 +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm6 +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm0 +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-NEXT: vmovd %xmm9, %r12d +; AVX2-NEXT: vpextrd $2, %xmm9, %r9d +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm0 ; AVX2-NEXT: vmovd %xmm7, %ecx ; AVX2-NEXT: vpextrd $2, %xmm7, %edi -; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm7 -; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm7 = 
xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero -; AVX2-NEXT: vmovd %xmm6, %ebx -; AVX2-NEXT: vpextrd $2, %xmm6, %esi -; AVX2-NEXT: vextracti128 $1, %ymm7, %xmm6 -; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero -; AVX2-NEXT: vmovd %xmm5, %edx -; AVX2-NEXT: vpextrd $2, %xmm5, %ebp -; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm5 -; AVX2-NEXT: vpextrd $2, %xmm6, %eax +; AVX2-NEXT: vmovd %xmm5, %ebx +; AVX2-NEXT: vpextrd $2, %xmm5, %esi +; AVX2-NEXT: vmovd %xmm4, %edx +; AVX2-NEXT: vpextrd $2, %xmm4, %ebp +; AVX2-NEXT: vpextrd $2, %xmm1, %eax ; AVX2-NEXT: leal -1(%rbp,%rax), %eax ; AVX2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: vmovd %xmm6, %eax +; AVX2-NEXT: vmovd %xmm1, %eax ; AVX2-NEXT: leal -1(%rdx,%rax), %eax ; AVX2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: vpextrd $2, %xmm7, %eax -; AVX2-NEXT: leal -1(%rsi,%rax), %r11d -; AVX2-NEXT: vmovd %xmm7, %eax +; AVX2-NEXT: vpextrd $2, %xmm8, %eax +; AVX2-NEXT: leal -1(%rsi,%rax), %eax +; AVX2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: vmovd %xmm8, %eax ; AVX2-NEXT: leal -1(%rbx,%rax), %r10d -; AVX2-NEXT: vpextrd $2, %xmm5, %eax -; AVX2-NEXT: leal -1(%rdi,%rax), %r9d -; AVX2-NEXT: vmovd %xmm5, %eax -; AVX2-NEXT: leal -1(%rcx,%rax), %r8d +; AVX2-NEXT: vpextrd $2, %xmm6, %eax +; AVX2-NEXT: leal -1(%rdi,%rax), %r8d +; AVX2-NEXT: vmovd %xmm6, %eax +; AVX2-NEXT: leal -1(%rcx,%rax), %edi ; AVX2-NEXT: vpextrd $2, %xmm3, %eax -; AVX2-NEXT: leal -1(%r15,%rax), %r15d +; AVX2-NEXT: leal -1(%r9,%rax), %r9d ; AVX2-NEXT: vmovd %xmm3, %ecx ; AVX2-NEXT: leal -1(%r12,%rcx), %r12d +; AVX2-NEXT: vpextrq $1, %xmm0, %rcx +; AVX2-NEXT: leal -1(%r15,%rcx), %r15d +; AVX2-NEXT: vmovq %xmm0, %rcx +; AVX2-NEXT: leal -1(%r14,%rcx), %r14d ; AVX2-NEXT: vpextrq $1, %xmm2, %rdx ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT: leaq -1(%rax,%rdx), %rdx -; AVX2-NEXT: vmovq %xmm2, %rsi -; AVX2-NEXT: 
movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT: leaq -1(%rax,%rsi), %rsi -; AVX2-NEXT: vmovq %xmm4, %rbx -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT: leaq -1(%rax,%rbx), %rbx -; AVX2-NEXT: vpextrq $1, %xmm4, %rbp -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT: leaq -1(%rax,%rbp), %rbp -; AVX2-NEXT: vmovq %xmm1, %rdi -; AVX2-NEXT: leaq -1(%r13,%rdi), %rdi -; AVX2-NEXT: vpextrq $1, %xmm1, %rax -; AVX2-NEXT: leaq -1(%r14,%rax), %rax -; AVX2-NEXT: vmovq %xmm0, %rcx -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX2-NEXT: vmovq %xmm1, %r13 -; AVX2-NEXT: leaq -1(%rcx,%r13), %r13 -; AVX2-NEXT: vpextrq $1, %xmm0, %rcx -; AVX2-NEXT: vpextrq $1, %xmm1, %r14 -; AVX2-NEXT: leaq -1(%rcx,%r14), %rcx -; AVX2-NEXT: shrq %rsi -; AVX2-NEXT: vmovd %esi, %xmm0 -; AVX2-NEXT: shrq %rdx -; AVX2-NEXT: vpinsrb $1, %edx, %xmm0, %xmm0 -; AVX2-NEXT: shrq %rbx -; AVX2-NEXT: vpinsrb $2, %ebx, %xmm0, %xmm0 +; AVX2-NEXT: leal -1(%rax,%rdx), %edx +; AVX2-NEXT: vmovq %xmm2, %rax +; AVX2-NEXT: vextracti128 $1, %ymm11, %xmm0 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; AVX2-NEXT: leal -1(%rcx,%rax), %eax +; AVX2-NEXT: vpextrq $1, %xmm0, %rsi +; AVX2-NEXT: leal -1(%r13,%rsi), %esi +; AVX2-NEXT: vmovq %xmm0, %rbx +; AVX2-NEXT: leal -1(%r11,%rbx), %ebx +; AVX2-NEXT: vpextrq $1, %xmm10, %rcx +; AVX2-NEXT: vpextrq $1, %xmm11, %r13 +; AVX2-NEXT: leal -1(%rcx,%r13), %ecx +; AVX2-NEXT: vmovq %xmm10, %r13 +; AVX2-NEXT: vmovq %xmm11, %r11 +; AVX2-NEXT: leaq -1(%r13,%r11), %rbp ; AVX2-NEXT: shrq %rbp -; AVX2-NEXT: vpinsrb $3, %ebp, %xmm0, %xmm0 -; AVX2-NEXT: shrq %rdi -; AVX2-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 -; AVX2-NEXT: shrq %rax -; AVX2-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 -; AVX2-NEXT: shrq %r13 -; AVX2-NEXT: vpinsrb $6, %r13d, %xmm0, %xmm0 -; AVX2-NEXT: shrq %rcx -; AVX2-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %ebp, %xmm0 +; AVX2-NEXT: shrl %ecx +; AVX2-NEXT: vpinsrb $1, %ecx, 
%xmm0, %xmm0 +; AVX2-NEXT: shrl %ebx +; AVX2-NEXT: vpinsrb $2, %ebx, %xmm0, %xmm0 +; AVX2-NEXT: shrl %esi +; AVX2-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; AVX2-NEXT: shrl %eax +; AVX2-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 +; AVX2-NEXT: shrl %edx +; AVX2-NEXT: vpinsrb $5, %edx, %xmm0, %xmm0 +; AVX2-NEXT: shrl %r14d +; AVX2-NEXT: vpinsrb $6, %r14d, %xmm0, %xmm0 +; AVX2-NEXT: shrl %r15d +; AVX2-NEXT: vpinsrb $7, %r15d, %xmm0, %xmm0 ; AVX2-NEXT: shrl %r12d ; AVX2-NEXT: vpinsrb $8, %r12d, %xmm0, %xmm0 -; AVX2-NEXT: shrl %r15d -; AVX2-NEXT: vpinsrb $9, %r15d, %xmm0, %xmm0 -; AVX2-NEXT: shrl %r8d -; AVX2-NEXT: vpinsrb $10, %r8d, %xmm0, %xmm0 ; AVX2-NEXT: shrl %r9d -; AVX2-NEXT: vpinsrb $11, %r9d, %xmm0, %xmm0 +; AVX2-NEXT: vpinsrb $9, %r9d, %xmm0, %xmm0 +; AVX2-NEXT: shrl %edi +; AVX2-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; AVX2-NEXT: shrl %r8d +; AVX2-NEXT: vpinsrb $11, %r8d, %xmm0, %xmm0 ; AVX2-NEXT: shrl %r10d ; AVX2-NEXT: vpinsrb $12, %r10d, %xmm0, %xmm0 -; AVX2-NEXT: shrl %r11d -; AVX2-NEXT: vpinsrb $13, %r11d, %xmm0, %xmm0 +; AVX2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload +; AVX2-NEXT: shrl %eax +; AVX2-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 ; AVX2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload ; AVX2-NEXT: shrl %eax ; AVX2-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 @@ -2307,115 +2307,115 @@ define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind ; AVX512-NEXT: pushq %r12 ; AVX512-NEXT: pushq %rbx ; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm0 = 
mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero ; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm0 -; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX512-NEXT: vextracti128 $1, %ymm4, %xmm0 -; AVX512-NEXT: vpextrq $1, %xmm4, %r14 -; AVX512-NEXT: vmovq %xmm4, %r13 -; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX512-NEXT: vpextrq $1, %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT: vmovq %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT: vmovq %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT: vpextrq $1, %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm10 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero ; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm6 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; AVX512-NEXT: vextracti128 $1, %ymm6, %xmm5 -; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm4 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; AVX512-NEXT: vextracti128 $1, %ymm4, %xmm7 -; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm5 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero +; AVX512-NEXT: vextracti128 $1, %ymm5, 
%xmm4 +; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm9 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX512-NEXT: vextracti128 $1, %ymm9, %xmm7 ; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm1 ; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; AVX512-NEXT: vmovd %xmm4, %r12d -; AVX512-NEXT: vpextrd $2, %xmm4, %r15d -; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512-NEXT: vpextrq $1, %xmm2, %r15 +; AVX512-NEXT: vmovq %xmm2, %r14 +; AVX512-NEXT: vpextrq $1, %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX512-NEXT: vmovq %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX512-NEXT: vextracti128 $1, %ymm10, %xmm1 +; AVX512-NEXT: vpextrq $1, %xmm1, %r13 +; AVX512-NEXT: vmovq %xmm1, %r11 +; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm11 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm8 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX512-NEXT: vextracti128 $1, %ymm8, %xmm1 +; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm6 +; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm0 +; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; 
AVX512-NEXT: vmovd %xmm9, %r12d +; AVX512-NEXT: vpextrd $2, %xmm9, %r9d +; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm0 ; AVX512-NEXT: vmovd %xmm7, %ecx ; AVX512-NEXT: vpextrd $2, %xmm7, %edi -; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm7 -; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm7 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero -; AVX512-NEXT: vmovd %xmm6, %ebx -; AVX512-NEXT: vpextrd $2, %xmm6, %esi -; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm6 -; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero -; AVX512-NEXT: vmovd %xmm5, %edx -; AVX512-NEXT: vpextrd $2, %xmm5, %ebp -; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm5 -; AVX512-NEXT: vpextrd $2, %xmm6, %eax +; AVX512-NEXT: vmovd %xmm5, %ebx +; AVX512-NEXT: vpextrd $2, %xmm5, %esi +; AVX512-NEXT: vmovd %xmm4, %edx +; AVX512-NEXT: vpextrd $2, %xmm4, %ebp +; AVX512-NEXT: vpextrd $2, %xmm1, %eax ; AVX512-NEXT: leal -1(%rbp,%rax), %eax ; AVX512-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: vmovd %xmm6, %eax +; AVX512-NEXT: vmovd %xmm1, %eax ; AVX512-NEXT: leal -1(%rdx,%rax), %eax ; AVX512-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: vpextrd $2, %xmm7, %eax -; AVX512-NEXT: leal -1(%rsi,%rax), %r11d -; AVX512-NEXT: vmovd %xmm7, %eax +; AVX512-NEXT: vpextrd $2, %xmm8, %eax +; AVX512-NEXT: leal -1(%rsi,%rax), %eax +; AVX512-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: vmovd %xmm8, %eax ; AVX512-NEXT: leal -1(%rbx,%rax), %r10d -; AVX512-NEXT: vpextrd $2, %xmm5, %eax -; AVX512-NEXT: leal -1(%rdi,%rax), %r9d -; AVX512-NEXT: vmovd %xmm5, %eax -; AVX512-NEXT: leal -1(%rcx,%rax), %r8d +; AVX512-NEXT: vpextrd $2, %xmm6, %eax +; AVX512-NEXT: leal -1(%rdi,%rax), %r8d +; AVX512-NEXT: vmovd %xmm6, %eax +; AVX512-NEXT: leal -1(%rcx,%rax), %edi ; AVX512-NEXT: vpextrd $2, %xmm3, %eax -; AVX512-NEXT: leal -1(%r15,%rax), %r15d +; AVX512-NEXT: leal -1(%r9,%rax), %r9d ; AVX512-NEXT: vmovd %xmm3, %ecx ; AVX512-NEXT: leal 
-1(%r12,%rcx), %r12d +; AVX512-NEXT: vpextrq $1, %xmm0, %rcx +; AVX512-NEXT: leal -1(%r15,%rcx), %r15d +; AVX512-NEXT: vmovq %xmm0, %rcx +; AVX512-NEXT: leal -1(%r14,%rcx), %r14d ; AVX512-NEXT: vpextrq $1, %xmm2, %rdx ; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: leaq -1(%rax,%rdx), %rdx -; AVX512-NEXT: vmovq %xmm2, %rsi -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: leaq -1(%rax,%rsi), %rsi -; AVX512-NEXT: vmovq %xmm4, %rbx -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: leaq -1(%rax,%rbx), %rbx -; AVX512-NEXT: vpextrq $1, %xmm4, %rbp -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: leaq -1(%rax,%rbp), %rbp -; AVX512-NEXT: vmovq %xmm1, %rdi -; AVX512-NEXT: leaq -1(%r13,%rdi), %rdi -; AVX512-NEXT: vpextrq $1, %xmm1, %rax -; AVX512-NEXT: leaq -1(%r14,%rax), %rax -; AVX512-NEXT: vmovq %xmm0, %rcx -; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX512-NEXT: vmovq %xmm1, %r13 -; AVX512-NEXT: leaq -1(%rcx,%r13), %r13 -; AVX512-NEXT: vpextrq $1, %xmm0, %rcx -; AVX512-NEXT: vpextrq $1, %xmm1, %r14 -; AVX512-NEXT: leaq -1(%rcx,%r14), %rcx -; AVX512-NEXT: shrq %rsi -; AVX512-NEXT: vmovd %esi, %xmm0 -; AVX512-NEXT: shrq %rdx -; AVX512-NEXT: vpinsrb $1, %edx, %xmm0, %xmm0 -; AVX512-NEXT: shrq %rbx -; AVX512-NEXT: vpinsrb $2, %ebx, %xmm0, %xmm0 +; AVX512-NEXT: leal -1(%rax,%rdx), %edx +; AVX512-NEXT: vmovq %xmm2, %rax +; AVX512-NEXT: vextracti128 $1, %ymm11, %xmm0 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; AVX512-NEXT: leal -1(%rcx,%rax), %eax +; AVX512-NEXT: vpextrq $1, %xmm0, %rsi +; AVX512-NEXT: leal -1(%r13,%rsi), %esi +; AVX512-NEXT: vmovq %xmm0, %rbx +; AVX512-NEXT: leal -1(%r11,%rbx), %ebx +; AVX512-NEXT: vpextrq $1, %xmm10, %rcx +; AVX512-NEXT: vpextrq $1, %xmm11, %r13 +; AVX512-NEXT: leal -1(%rcx,%r13), %ecx +; AVX512-NEXT: vmovq %xmm10, %r13 +; AVX512-NEXT: vmovq %xmm11, %r11 +; AVX512-NEXT: leaq 
-1(%r13,%r11), %rbp ; AVX512-NEXT: shrq %rbp -; AVX512-NEXT: vpinsrb $3, %ebp, %xmm0, %xmm0 -; AVX512-NEXT: shrq %rdi -; AVX512-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 -; AVX512-NEXT: shrq %rax -; AVX512-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 -; AVX512-NEXT: shrq %r13 -; AVX512-NEXT: vpinsrb $6, %r13d, %xmm0, %xmm0 -; AVX512-NEXT: shrq %rcx -; AVX512-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %ebp, %xmm0 +; AVX512-NEXT: shrl %ecx +; AVX512-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 +; AVX512-NEXT: shrl %ebx +; AVX512-NEXT: vpinsrb $2, %ebx, %xmm0, %xmm0 +; AVX512-NEXT: shrl %esi +; AVX512-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; AVX512-NEXT: shrl %eax +; AVX512-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 +; AVX512-NEXT: shrl %edx +; AVX512-NEXT: vpinsrb $5, %edx, %xmm0, %xmm0 +; AVX512-NEXT: shrl %r14d +; AVX512-NEXT: vpinsrb $6, %r14d, %xmm0, %xmm0 +; AVX512-NEXT: shrl %r15d +; AVX512-NEXT: vpinsrb $7, %r15d, %xmm0, %xmm0 ; AVX512-NEXT: shrl %r12d ; AVX512-NEXT: vpinsrb $8, %r12d, %xmm0, %xmm0 -; AVX512-NEXT: shrl %r15d -; AVX512-NEXT: vpinsrb $9, %r15d, %xmm0, %xmm0 -; AVX512-NEXT: shrl %r8d -; AVX512-NEXT: vpinsrb $10, %r8d, %xmm0, %xmm0 ; AVX512-NEXT: shrl %r9d -; AVX512-NEXT: vpinsrb $11, %r9d, %xmm0, %xmm0 +; AVX512-NEXT: vpinsrb $9, %r9d, %xmm0, %xmm0 +; AVX512-NEXT: shrl %edi +; AVX512-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; AVX512-NEXT: shrl %r8d +; AVX512-NEXT: vpinsrb $11, %r8d, %xmm0, %xmm0 ; AVX512-NEXT: shrl %r10d ; AVX512-NEXT: vpinsrb $12, %r10d, %xmm0, %xmm0 -; AVX512-NEXT: shrl %r11d -; AVX512-NEXT: vpinsrb $13, %r11d, %xmm0, %xmm0 +; AVX512-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload +; AVX512-NEXT: shrl %eax +; AVX512-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 ; AVX512-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload ; AVX512-NEXT: shrl %eax ; AVX512-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-sext-widen.ll b/llvm/test/CodeGen/X86/vector-sext-widen.ll index 6c9a644c9769..327e6fbbdd82 100644 --- 
a/llvm/test/CodeGen/X86/vector-sext-widen.ll +++ b/llvm/test/CodeGen/X86/vector-sext-widen.ll @@ -2151,7 +2151,7 @@ define <8 x i16> @load_sext_8i1_to_8i16(<8 x i1> *%ptr) { ; SSE41-NEXT: shlq $57, %rcx ; SSE41-NEXT: sarq $63, %rcx ; SSE41-NEXT: pinsrw $6, %ecx, %xmm0 -; SSE41-NEXT: shrq $7, %rax +; SSE41-NEXT: shrl $7, %eax ; SSE41-NEXT: pinsrw $7, %eax, %xmm0 ; SSE41-NEXT: retq ; @@ -2186,7 +2186,7 @@ define <8 x i16> @load_sext_8i1_to_8i16(<8 x i1> *%ptr) { ; AVX1-NEXT: shlq $57, %rcx ; AVX1-NEXT: sarq $63, %rcx ; AVX1-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 -; AVX1-NEXT: shrq $7, %rax +; AVX1-NEXT: shrl $7, %eax ; AVX1-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 ; AVX1-NEXT: retq ; @@ -2221,7 +2221,7 @@ define <8 x i16> @load_sext_8i1_to_8i16(<8 x i1> *%ptr) { ; AVX2-NEXT: shlq $57, %rcx ; AVX2-NEXT: sarq $63, %rcx ; AVX2-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 -; AVX2-NEXT: shrq $7, %rax +; AVX2-NEXT: shrl $7, %eax ; AVX2-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 ; AVX2-NEXT: retq ; @@ -3068,7 +3068,7 @@ define <16 x i8> @load_sext_16i1_to_16i8(<16 x i1> *%ptr) nounwind readnone { ; SSE41-NEXT: sarq $63, %rcx ; SSE41-NEXT: pinsrb $6, %ecx, %xmm0 ; SSE41-NEXT: movsbq %al, %rcx -; SSE41-NEXT: shrq $7, %rcx +; SSE41-NEXT: shrl $7, %ecx ; SSE41-NEXT: pinsrb $7, %ecx, %xmm0 ; SSE41-NEXT: movq %rax, %rcx ; SSE41-NEXT: shlq $55, %rcx @@ -3098,7 +3098,7 @@ define <16 x i8> @load_sext_16i1_to_16i8(<16 x i1> *%ptr) nounwind readnone { ; SSE41-NEXT: shlq $49, %rcx ; SSE41-NEXT: sarq $63, %rcx ; SSE41-NEXT: pinsrb $14, %ecx, %xmm0 -; SSE41-NEXT: shrq $15, %rax +; SSE41-NEXT: shrl $15, %eax ; SSE41-NEXT: pinsrb $15, %eax, %xmm0 ; SSE41-NEXT: retq ; @@ -3134,7 +3134,7 @@ define <16 x i8> @load_sext_16i1_to_16i8(<16 x i1> *%ptr) nounwind readnone { ; AVX1-NEXT: sarq $63, %rcx ; AVX1-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; AVX1-NEXT: movsbq %al, %rcx -; AVX1-NEXT: shrq $7, %rcx +; AVX1-NEXT: shrl $7, %ecx ; AVX1-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 ; AVX1-NEXT: movq %rax, %rcx ; AVX1-NEXT: shlq 
$55, %rcx @@ -3164,7 +3164,7 @@ define <16 x i8> @load_sext_16i1_to_16i8(<16 x i1> *%ptr) nounwind readnone { ; AVX1-NEXT: shlq $49, %rcx ; AVX1-NEXT: sarq $63, %rcx ; AVX1-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 -; AVX1-NEXT: shrq $15, %rax +; AVX1-NEXT: shrl $15, %eax ; AVX1-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; AVX1-NEXT: retq ; @@ -3200,7 +3200,7 @@ define <16 x i8> @load_sext_16i1_to_16i8(<16 x i1> *%ptr) nounwind readnone { ; AVX2-NEXT: sarq $63, %rcx ; AVX2-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; AVX2-NEXT: movsbq %al, %rcx -; AVX2-NEXT: shrq $7, %rcx +; AVX2-NEXT: shrl $7, %ecx ; AVX2-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 ; AVX2-NEXT: movq %rax, %rcx ; AVX2-NEXT: shlq $55, %rcx @@ -3230,7 +3230,7 @@ define <16 x i8> @load_sext_16i1_to_16i8(<16 x i1> *%ptr) nounwind readnone { ; AVX2-NEXT: shlq $49, %rcx ; AVX2-NEXT: sarq $63, %rcx ; AVX2-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 -; AVX2-NEXT: shrq $15, %rax +; AVX2-NEXT: shrl $15, %eax ; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; AVX2-NEXT: retq ; @@ -3688,7 +3688,7 @@ define <16 x i16> @load_sext_16i1_to_16i16(<16 x i1> *%ptr) { ; AVX1-NEXT: movq %rax, %r11 ; AVX1-NEXT: movq %rax, %r14 ; AVX1-NEXT: movq %rax, %r15 -; AVX1-NEXT: movq %rax, %r9 +; AVX1-NEXT: movl %eax, %r9d ; AVX1-NEXT: movq %rax, %r12 ; AVX1-NEXT: movq %rax, %r13 ; AVX1-NEXT: movq %rax, %rbx @@ -3715,7 +3715,7 @@ define <16 x i16> @load_sext_16i1_to_16i16(<16 x i1> *%ptr) { ; AVX1-NEXT: shlq $49, %r15 ; AVX1-NEXT: sarq $63, %r15 ; AVX1-NEXT: vpinsrw $6, %r15d, %xmm0, %xmm0 -; AVX1-NEXT: shrq $15, %r9 +; AVX1-NEXT: shrl $15, %r9d ; AVX1-NEXT: vpinsrw $7, %r9d, %xmm0, %xmm0 ; AVX1-NEXT: shlq $63, %r13 ; AVX1-NEXT: sarq $63, %r13 @@ -3738,7 +3738,7 @@ define <16 x i16> @load_sext_16i1_to_16i16(<16 x i1> *%ptr) { ; AVX1-NEXT: shlq $57, %rsi ; AVX1-NEXT: sarq $63, %rsi ; AVX1-NEXT: vpinsrw $6, %esi, %xmm1, %xmm1 -; AVX1-NEXT: shrq $7, %rbp +; AVX1-NEXT: shrl $7, %ebp ; AVX1-NEXT: vpinsrw $7, %ebp, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, 
%ymm1, %ymm0 ; AVX1-NEXT: popq %rbx @@ -3785,7 +3785,7 @@ define <16 x i16> @load_sext_16i1_to_16i16(<16 x i1> *%ptr) { ; AVX2-NEXT: movq %rax, %r11 ; AVX2-NEXT: movq %rax, %r14 ; AVX2-NEXT: movq %rax, %r15 -; AVX2-NEXT: movq %rax, %r9 +; AVX2-NEXT: movl %eax, %r9d ; AVX2-NEXT: movq %rax, %r12 ; AVX2-NEXT: movq %rax, %r13 ; AVX2-NEXT: movq %rax, %rbx @@ -3812,7 +3812,7 @@ define <16 x i16> @load_sext_16i1_to_16i16(<16 x i1> *%ptr) { ; AVX2-NEXT: shlq $49, %r15 ; AVX2-NEXT: sarq $63, %r15 ; AVX2-NEXT: vpinsrw $6, %r15d, %xmm0, %xmm0 -; AVX2-NEXT: shrq $15, %r9 +; AVX2-NEXT: shrl $15, %r9d ; AVX2-NEXT: vpinsrw $7, %r9d, %xmm0, %xmm0 ; AVX2-NEXT: shlq $63, %r13 ; AVX2-NEXT: sarq $63, %r13 @@ -3835,7 +3835,7 @@ define <16 x i16> @load_sext_16i1_to_16i16(<16 x i1> *%ptr) { ; AVX2-NEXT: shlq $57, %rsi ; AVX2-NEXT: sarq $63, %rsi ; AVX2-NEXT: vpinsrw $6, %esi, %xmm1, %xmm1 -; AVX2-NEXT: shrq $7, %rbp +; AVX2-NEXT: shrl $7, %ebp ; AVX2-NEXT: vpinsrw $7, %ebp, %xmm1, %xmm1 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX2-NEXT: popq %rbx @@ -4408,7 +4408,7 @@ define <32 x i8> @load_sext_32i1_to_32i8(<32 x i1> *%ptr) nounwind readnone { ; SSE41-NEXT: sarq $63, %rcx ; SSE41-NEXT: pinsrb $6, %ecx, %xmm0 ; SSE41-NEXT: movsbq %al, %rcx -; SSE41-NEXT: shrq $7, %rcx +; SSE41-NEXT: shrl $7, %ecx ; SSE41-NEXT: pinsrb $7, %ecx, %xmm0 ; SSE41-NEXT: movq %rax, %rcx ; SSE41-NEXT: shlq $55, %rcx @@ -4438,7 +4438,7 @@ define <32 x i8> @load_sext_32i1_to_32i8(<32 x i1> *%ptr) nounwind readnone { ; SSE41-NEXT: shlq $49, %rcx ; SSE41-NEXT: sarq $63, %rcx ; SSE41-NEXT: pinsrb $14, %ecx, %xmm0 -; SSE41-NEXT: shrq $15, %rax +; SSE41-NEXT: shrl $15, %eax ; SSE41-NEXT: pinsrb $15, %eax, %xmm0 ; SSE41-NEXT: movswq 2(%rdi), %rax ; SSE41-NEXT: movq %rax, %rcx @@ -4470,7 +4470,7 @@ define <32 x i8> @load_sext_32i1_to_32i8(<32 x i1> *%ptr) nounwind readnone { ; SSE41-NEXT: sarq $63, %rcx ; SSE41-NEXT: pinsrb $6, %ecx, %xmm1 ; SSE41-NEXT: movsbq %al, %rcx -; SSE41-NEXT: shrq $7, %rcx +; 
SSE41-NEXT: shrl $7, %ecx ; SSE41-NEXT: pinsrb $7, %ecx, %xmm1 ; SSE41-NEXT: movq %rax, %rcx ; SSE41-NEXT: shlq $55, %rcx @@ -4500,7 +4500,7 @@ define <32 x i8> @load_sext_32i1_to_32i8(<32 x i1> *%ptr) nounwind readnone { ; SSE41-NEXT: shlq $49, %rcx ; SSE41-NEXT: sarq $63, %rcx ; SSE41-NEXT: pinsrb $14, %ecx, %xmm1 -; SSE41-NEXT: shrq $15, %rax +; SSE41-NEXT: shrl $15, %eax ; SSE41-NEXT: pinsrb $15, %eax, %xmm1 ; SSE41-NEXT: retq ; @@ -4613,7 +4613,7 @@ define <32 x i8> @load_sext_32i1_to_32i8(<32 x i1> *%ptr) nounwind readnone { ; AVX1-NEXT: shlq $57, %r10 ; AVX1-NEXT: sarq $63, %r10 ; AVX1-NEXT: vpinsrb $6, %r10d, %xmm1, %xmm1 -; AVX1-NEXT: shrq $7, %r11 +; AVX1-NEXT: shrl $7, %r11d ; AVX1-NEXT: vpinsrb $7, %r11d, %xmm1, %xmm1 ; AVX1-NEXT: shlq $55, %r9 ; AVX1-NEXT: sarq $63, %r9 @@ -4636,7 +4636,7 @@ define <32 x i8> @load_sext_32i1_to_32i8(<32 x i1> *%ptr) nounwind readnone { ; AVX1-NEXT: shlq $49, %rdx ; AVX1-NEXT: sarq $63, %rdx ; AVX1-NEXT: vpinsrb $14, %edx, %xmm1, %xmm1 -; AVX1-NEXT: shrq $15, %rax +; AVX1-NEXT: shrl $15, %eax ; AVX1-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: popq %rbx @@ -4756,7 +4756,7 @@ define <32 x i8> @load_sext_32i1_to_32i8(<32 x i1> *%ptr) nounwind readnone { ; AVX2-NEXT: shlq $57, %r10 ; AVX2-NEXT: sarq $63, %r10 ; AVX2-NEXT: vpinsrb $6, %r10d, %xmm1, %xmm1 -; AVX2-NEXT: shrq $7, %r11 +; AVX2-NEXT: shrl $7, %r11d ; AVX2-NEXT: vpinsrb $7, %r11d, %xmm1, %xmm1 ; AVX2-NEXT: shlq $55, %r9 ; AVX2-NEXT: sarq $63, %r9 @@ -4779,7 +4779,7 @@ define <32 x i8> @load_sext_32i1_to_32i8(<32 x i1> *%ptr) nounwind readnone { ; AVX2-NEXT: shlq $49, %rdx ; AVX2-NEXT: sarq $63, %rdx ; AVX2-NEXT: vpinsrb $14, %edx, %xmm1, %xmm1 -; AVX2-NEXT: shrq $15, %rax +; AVX2-NEXT: shrl $15, %eax ; AVX2-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX2-NEXT: popq %rbx diff --git a/llvm/test/CodeGen/X86/vector-sext.ll b/llvm/test/CodeGen/X86/vector-sext.ll index 
61bb1e99ba79..a01efcbf17ca 100644 --- a/llvm/test/CodeGen/X86/vector-sext.ll +++ b/llvm/test/CodeGen/X86/vector-sext.ll @@ -2151,7 +2151,7 @@ define <8 x i16> @load_sext_8i1_to_8i16(<8 x i1> *%ptr) { ; SSE41-NEXT: shlq $57, %rcx ; SSE41-NEXT: sarq $63, %rcx ; SSE41-NEXT: pinsrw $6, %ecx, %xmm0 -; SSE41-NEXT: shrq $7, %rax +; SSE41-NEXT: shrl $7, %eax ; SSE41-NEXT: pinsrw $7, %eax, %xmm0 ; SSE41-NEXT: retq ; @@ -2186,7 +2186,7 @@ define <8 x i16> @load_sext_8i1_to_8i16(<8 x i1> *%ptr) { ; AVX1-NEXT: shlq $57, %rcx ; AVX1-NEXT: sarq $63, %rcx ; AVX1-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 -; AVX1-NEXT: shrq $7, %rax +; AVX1-NEXT: shrl $7, %eax ; AVX1-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 ; AVX1-NEXT: retq ; @@ -2221,7 +2221,7 @@ define <8 x i16> @load_sext_8i1_to_8i16(<8 x i1> *%ptr) { ; AVX2-NEXT: shlq $57, %rcx ; AVX2-NEXT: sarq $63, %rcx ; AVX2-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 -; AVX2-NEXT: shrq $7, %rax +; AVX2-NEXT: shrl $7, %eax ; AVX2-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 ; AVX2-NEXT: retq ; @@ -3068,7 +3068,7 @@ define <16 x i8> @load_sext_16i1_to_16i8(<16 x i1> *%ptr) nounwind readnone { ; SSE41-NEXT: sarq $63, %rcx ; SSE41-NEXT: pinsrb $6, %ecx, %xmm0 ; SSE41-NEXT: movsbq %al, %rcx -; SSE41-NEXT: shrq $7, %rcx +; SSE41-NEXT: shrl $7, %ecx ; SSE41-NEXT: pinsrb $7, %ecx, %xmm0 ; SSE41-NEXT: movq %rax, %rcx ; SSE41-NEXT: shlq $55, %rcx @@ -3098,7 +3098,7 @@ define <16 x i8> @load_sext_16i1_to_16i8(<16 x i1> *%ptr) nounwind readnone { ; SSE41-NEXT: shlq $49, %rcx ; SSE41-NEXT: sarq $63, %rcx ; SSE41-NEXT: pinsrb $14, %ecx, %xmm0 -; SSE41-NEXT: shrq $15, %rax +; SSE41-NEXT: shrl $15, %eax ; SSE41-NEXT: pinsrb $15, %eax, %xmm0 ; SSE41-NEXT: retq ; @@ -3134,7 +3134,7 @@ define <16 x i8> @load_sext_16i1_to_16i8(<16 x i1> *%ptr) nounwind readnone { ; AVX1-NEXT: sarq $63, %rcx ; AVX1-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; AVX1-NEXT: movsbq %al, %rcx -; AVX1-NEXT: shrq $7, %rcx +; AVX1-NEXT: shrl $7, %ecx ; AVX1-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 ; AVX1-NEXT: movq 
%rax, %rcx ; AVX1-NEXT: shlq $55, %rcx @@ -3164,7 +3164,7 @@ define <16 x i8> @load_sext_16i1_to_16i8(<16 x i1> *%ptr) nounwind readnone { ; AVX1-NEXT: shlq $49, %rcx ; AVX1-NEXT: sarq $63, %rcx ; AVX1-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 -; AVX1-NEXT: shrq $15, %rax +; AVX1-NEXT: shrl $15, %eax ; AVX1-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; AVX1-NEXT: retq ; @@ -3200,7 +3200,7 @@ define <16 x i8> @load_sext_16i1_to_16i8(<16 x i1> *%ptr) nounwind readnone { ; AVX2-NEXT: sarq $63, %rcx ; AVX2-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; AVX2-NEXT: movsbq %al, %rcx -; AVX2-NEXT: shrq $7, %rcx +; AVX2-NEXT: shrl $7, %ecx ; AVX2-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 ; AVX2-NEXT: movq %rax, %rcx ; AVX2-NEXT: shlq $55, %rcx @@ -3230,7 +3230,7 @@ define <16 x i8> @load_sext_16i1_to_16i8(<16 x i1> *%ptr) nounwind readnone { ; AVX2-NEXT: shlq $49, %rcx ; AVX2-NEXT: sarq $63, %rcx ; AVX2-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 -; AVX2-NEXT: shrq $15, %rax +; AVX2-NEXT: shrl $15, %eax ; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; AVX2-NEXT: retq ; @@ -3688,7 +3688,7 @@ define <16 x i16> @load_sext_16i1_to_16i16(<16 x i1> *%ptr) { ; AVX1-NEXT: movq %rax, %r11 ; AVX1-NEXT: movq %rax, %r14 ; AVX1-NEXT: movq %rax, %r15 -; AVX1-NEXT: movq %rax, %r9 +; AVX1-NEXT: movl %eax, %r9d ; AVX1-NEXT: movq %rax, %r12 ; AVX1-NEXT: movq %rax, %r13 ; AVX1-NEXT: movq %rax, %rbx @@ -3715,7 +3715,7 @@ define <16 x i16> @load_sext_16i1_to_16i16(<16 x i1> *%ptr) { ; AVX1-NEXT: shlq $49, %r15 ; AVX1-NEXT: sarq $63, %r15 ; AVX1-NEXT: vpinsrw $6, %r15d, %xmm0, %xmm0 -; AVX1-NEXT: shrq $15, %r9 +; AVX1-NEXT: shrl $15, %r9d ; AVX1-NEXT: vpinsrw $7, %r9d, %xmm0, %xmm0 ; AVX1-NEXT: shlq $63, %r13 ; AVX1-NEXT: sarq $63, %r13 @@ -3738,7 +3738,7 @@ define <16 x i16> @load_sext_16i1_to_16i16(<16 x i1> *%ptr) { ; AVX1-NEXT: shlq $57, %rsi ; AVX1-NEXT: sarq $63, %rsi ; AVX1-NEXT: vpinsrw $6, %esi, %xmm1, %xmm1 -; AVX1-NEXT: shrq $7, %rbp +; AVX1-NEXT: shrl $7, %ebp ; AVX1-NEXT: vpinsrw $7, %ebp, %xmm1, %xmm1 ; 
AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: popq %rbx @@ -3785,7 +3785,7 @@ define <16 x i16> @load_sext_16i1_to_16i16(<16 x i1> *%ptr) { ; AVX2-NEXT: movq %rax, %r11 ; AVX2-NEXT: movq %rax, %r14 ; AVX2-NEXT: movq %rax, %r15 -; AVX2-NEXT: movq %rax, %r9 +; AVX2-NEXT: movl %eax, %r9d ; AVX2-NEXT: movq %rax, %r12 ; AVX2-NEXT: movq %rax, %r13 ; AVX2-NEXT: movq %rax, %rbx @@ -3812,7 +3812,7 @@ define <16 x i16> @load_sext_16i1_to_16i16(<16 x i1> *%ptr) { ; AVX2-NEXT: shlq $49, %r15 ; AVX2-NEXT: sarq $63, %r15 ; AVX2-NEXT: vpinsrw $6, %r15d, %xmm0, %xmm0 -; AVX2-NEXT: shrq $15, %r9 +; AVX2-NEXT: shrl $15, %r9d ; AVX2-NEXT: vpinsrw $7, %r9d, %xmm0, %xmm0 ; AVX2-NEXT: shlq $63, %r13 ; AVX2-NEXT: sarq $63, %r13 @@ -3835,7 +3835,7 @@ define <16 x i16> @load_sext_16i1_to_16i16(<16 x i1> *%ptr) { ; AVX2-NEXT: shlq $57, %rsi ; AVX2-NEXT: sarq $63, %rsi ; AVX2-NEXT: vpinsrw $6, %esi, %xmm1, %xmm1 -; AVX2-NEXT: shrq $7, %rbp +; AVX2-NEXT: shrl $7, %ebp ; AVX2-NEXT: vpinsrw $7, %ebp, %xmm1, %xmm1 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX2-NEXT: popq %rbx @@ -4408,7 +4408,7 @@ define <32 x i8> @load_sext_32i1_to_32i8(<32 x i1> *%ptr) nounwind readnone { ; SSE41-NEXT: sarq $63, %rcx ; SSE41-NEXT: pinsrb $6, %ecx, %xmm0 ; SSE41-NEXT: movsbq %al, %rcx -; SSE41-NEXT: shrq $7, %rcx +; SSE41-NEXT: shrl $7, %ecx ; SSE41-NEXT: pinsrb $7, %ecx, %xmm0 ; SSE41-NEXT: movq %rax, %rcx ; SSE41-NEXT: shlq $55, %rcx @@ -4438,7 +4438,7 @@ define <32 x i8> @load_sext_32i1_to_32i8(<32 x i1> *%ptr) nounwind readnone { ; SSE41-NEXT: shlq $49, %rcx ; SSE41-NEXT: sarq $63, %rcx ; SSE41-NEXT: pinsrb $14, %ecx, %xmm0 -; SSE41-NEXT: shrq $15, %rax +; SSE41-NEXT: shrl $15, %eax ; SSE41-NEXT: pinsrb $15, %eax, %xmm0 ; SSE41-NEXT: movswq 2(%rdi), %rax ; SSE41-NEXT: movq %rax, %rcx @@ -4470,7 +4470,7 @@ define <32 x i8> @load_sext_32i1_to_32i8(<32 x i1> *%ptr) nounwind readnone { ; SSE41-NEXT: sarq $63, %rcx ; SSE41-NEXT: pinsrb $6, %ecx, %xmm1 ; SSE41-NEXT: movsbq %al, %rcx -; 
SSE41-NEXT: shrq $7, %rcx +; SSE41-NEXT: shrl $7, %ecx ; SSE41-NEXT: pinsrb $7, %ecx, %xmm1 ; SSE41-NEXT: movq %rax, %rcx ; SSE41-NEXT: shlq $55, %rcx @@ -4500,7 +4500,7 @@ define <32 x i8> @load_sext_32i1_to_32i8(<32 x i1> *%ptr) nounwind readnone { ; SSE41-NEXT: shlq $49, %rcx ; SSE41-NEXT: sarq $63, %rcx ; SSE41-NEXT: pinsrb $14, %ecx, %xmm1 -; SSE41-NEXT: shrq $15, %rax +; SSE41-NEXT: shrl $15, %eax ; SSE41-NEXT: pinsrb $15, %eax, %xmm1 ; SSE41-NEXT: retq ; @@ -4613,7 +4613,7 @@ define <32 x i8> @load_sext_32i1_to_32i8(<32 x i1> *%ptr) nounwind readnone { ; AVX1-NEXT: shlq $57, %r10 ; AVX1-NEXT: sarq $63, %r10 ; AVX1-NEXT: vpinsrb $6, %r10d, %xmm1, %xmm1 -; AVX1-NEXT: shrq $7, %r11 +; AVX1-NEXT: shrl $7, %r11d ; AVX1-NEXT: vpinsrb $7, %r11d, %xmm1, %xmm1 ; AVX1-NEXT: shlq $55, %r9 ; AVX1-NEXT: sarq $63, %r9 @@ -4636,7 +4636,7 @@ define <32 x i8> @load_sext_32i1_to_32i8(<32 x i1> *%ptr) nounwind readnone { ; AVX1-NEXT: shlq $49, %rdx ; AVX1-NEXT: sarq $63, %rdx ; AVX1-NEXT: vpinsrb $14, %edx, %xmm1, %xmm1 -; AVX1-NEXT: shrq $15, %rax +; AVX1-NEXT: shrl $15, %eax ; AVX1-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: popq %rbx @@ -4756,7 +4756,7 @@ define <32 x i8> @load_sext_32i1_to_32i8(<32 x i1> *%ptr) nounwind readnone { ; AVX2-NEXT: shlq $57, %r10 ; AVX2-NEXT: sarq $63, %r10 ; AVX2-NEXT: vpinsrb $6, %r10d, %xmm1, %xmm1 -; AVX2-NEXT: shrq $7, %r11 +; AVX2-NEXT: shrl $7, %r11d ; AVX2-NEXT: vpinsrb $7, %r11d, %xmm1, %xmm1 ; AVX2-NEXT: shlq $55, %r9 ; AVX2-NEXT: sarq $63, %r9 @@ -4779,7 +4779,7 @@ define <32 x i8> @load_sext_32i1_to_32i8(<32 x i1> *%ptr) nounwind readnone { ; AVX2-NEXT: shlq $49, %rdx ; AVX2-NEXT: sarq $63, %rdx ; AVX2-NEXT: vpinsrb $14, %edx, %xmm1, %xmm1 -; AVX2-NEXT: shrq $15, %rax +; AVX2-NEXT: shrl $15, %eax ; AVX2-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX2-NEXT: popq %rbx