[X86] Add 64 bit pattern matching for PSADBW

PSADBW pattern currently supports the 32 bit IR pattern and only GLT (greather than) comparison. The patch extends the pattern to catch also 64 bit IR pattern and includes all other comparison types (not only GLT). Differential Revision: https://reviews.llvm.org/D31577 llvm-svn: 299425
2017-04-04 10:23:18 +00:00 · 2017-04-04 10:23:18 +00:00 · 568fb197da
parent 447f175eb5
commit 568fb197da
2 changed files with 388 additions and 13 deletions
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@ -29144,12 +29144,18 @@ static bool detectZextAbsDiff(const SDValue &Select, SDValue &Op0,
  if (SetCC.getOpcode() != ISD::SETCC)
    return false;
  ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
-  if (CC != ISD::SETGT)
+  if (CC != ISD::SETGT && CC != ISD::SETLT)
    return false;

  SDValue SelectOp1 = Select->getOperand(1);
  SDValue SelectOp2 = Select->getOperand(2);

+  // The following instructions assume SelectOp1 is the subtraction operand
+  // and SelectOp2 is the negation operand.
+  // In the case of SETLT this is the other way around.
+  if (CC == ISD::SETLT)
+    std::swap(SelectOp1, SelectOp2);
+
  // The second operand of the select should be the negation of the first
  // operand, which is implemented as 0 - SelectOp1.
  if (!(SelectOp2.getOpcode() == ISD::SUB &&
@ -29162,8 +29168,17 @@ static bool detectZextAbsDiff(const SDValue &Select, SDValue &Op0,
  if (SetCC.getOperand(0) != SelectOp1)
    return false;

-  // The second operand of the comparison can be either -1 or 0.
-  if (!(ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()) ||
+  // In SetLT case, The second operand of the comparison can be either 1 or 0.
+  APInt SplatVal;
+  if ((CC == ISD::SETLT) &&
+      !((ISD::isConstantSplatVector(SetCC.getOperand(1).getNode(), SplatVal) &&
+         SplatVal == 1) ||
+        (ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()))))
+    return false;
+
+  // In SetGT case, The second operand of the comparison can be either -1 or 0.
+  if ((CC == ISD::SETGT) &&
+      !(ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()) ||
        ISD::isBuildVectorAllOnes(SetCC.getOperand(1).getNode())))
    return false;

@ -29292,11 +29307,9 @@ static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
  if (!Subtarget.hasSSE2())
    return SDValue();

-  // Verify the type we're extracting from is appropriate
-  // TODO: There's nothing special about i32, any integer type above i16 should
-  // work just as well.
+  // Verify the type we're extracting from is any integer type above i16.
  EVT VT = Extract->getOperand(0).getValueType();
-  if (!VT.isSimple() || !(VT.getVectorElementType() == MVT::i32))
+  if (!VT.isSimple() || !(VT.getVectorElementType().getSizeInBits() > 16))
    return SDValue();

  unsigned RegSize = 128;
@ -29305,15 +29318,28 @@ static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
  else if (Subtarget.hasAVX2())
    RegSize = 256;

-  // We only handle v16i32 for SSE2 / v32i32 for AVX2 / v64i32 for AVX512.
+  // We handle upto v16i* for SSE2 / v32i* for AVX2 / v64i* for AVX512.
  // TODO: We should be able to handle larger vectors by splitting them before
  // feeding them into several SADs, and then reducing over those.
-  if (VT.getSizeInBits() / 4 > RegSize)
+  if (RegSize / VT.getVectorNumElements() < 8)
    return SDValue();

  // Match shuffle + add pyramid.
  SDValue Root = matchBinOpReduction(Extract, ISD::ADD);

+  // The operand is expected to be zero extended from i8
+  // (verified in detectZextAbsDiff). 
+  // In order to convert to i64 and above, additional any/zero/sign 
+  // extend is expected.
+  // The zero extend from 32 bit has no mathematical effect on the result.
+  // Also the sign extend is basically zero extend 
+  // (extends the sign bit which is zero).
+  // So it is correct to skip the sign/zero extend instruction.
+  if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND || 
+	  Root.getOpcode() == ISD::ZERO_EXTEND ||
+	  Root.getOpcode() == ISD::ANY_EXTEND))
+    Root = Root.getOperand(0);
+
  // If there was a match, we want Root to be a select that is the root of an
  // abs-diff pattern.
  if (!Root || (Root.getOpcode() != ISD::VSELECT))
@ -29324,7 +29350,7 @@ static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
  if (!detectZextAbsDiff(Root, Zext0, Zext1))
    return SDValue();

-  // Create the SAD instruction
+  // Create the SAD instruction.
  SDLoc DL(Extract);
  SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL);

@ -29346,10 +29372,12 @@ static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
    }
  }

-  // Return the lowest i32.
-  MVT ResVT = MVT::getVectorVT(MVT::i32, SadVT.getSizeInBits() / 32);
+  MVT Type = Extract->getSimpleValueType(0);
+  unsigned TypeSizeInBits = Type.getSizeInBits();
+  // Return the lowest TypeSizeInBits bits.
+  MVT ResVT = MVT::getVectorVT(Type, SadVT.getSizeInBits() / TypeSizeInBits);
  SAD = DAG.getNode(ISD::BITCAST, DL, ResVT, SAD);
-  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, SAD,
+  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Type, SAD,
                     Extract->getOperand(1));
 }

--- a/llvm/test/CodeGen/X86/sad_variations.ll
+++ b/llvm/test/CodeGen/X86/sad_variations.ll
@ -0,0 +1,347 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512F
+
+define i32 @sad8_32bit_icmp_sge(i8* nocapture readonly %cur, i8* nocapture readonly %ref, i32 %stride) local_unnamed_addr #0 {
+; SSE2-LABEL: sad8_32bit_icmp_sge:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; SSE2-NEXT:    psadbw %xmm0, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    retq
+;
+; AVX2-LABEL: sad8_32bit_icmp_sge:
+; AVX2:       # BB#0: # %entry
+; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX2-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX2-NEXT:    vpsadbw %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vmovd %xmm0, %eax
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: sad8_32bit_icmp_sge:
+; AVX512F:       # BB#0: # %entry
+; AVX512F-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX512F-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512F-NEXT:    vpsadbw %xmm0, %xmm1, %xmm0
+; AVX512F-NEXT:    vmovd %xmm0, %eax
+; AVX512F-NEXT:    retq
+
+entry:
+  %idx.ext = zext i32 %stride to i64
+  br label %for.body
+
+for.body:                                         ; preds = %entry
+  %0 = bitcast i8* %cur to <8 x i8>*
+  %1 = load <8 x i8>, <8 x i8>* %0, align 1
+  %2 = zext <8 x i8> %1 to <8 x i32>
+  %3 = bitcast i8* %ref to <8 x i8>*
+  %4 = load <8 x i8>, <8 x i8>* %3, align 1
+  %5 = zext <8 x i8> %4 to <8 x i32>
+  %6 = sub nsw <8 x i32> %2, %5
+  %7 = icmp sgt <8 x i32> %6, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
+  %8 = sub nsw <8 x i32> zeroinitializer, %6
+  %9 = select <8 x i1> %7, <8 x i32> %6, <8 x i32> %8
+  %rdx.shuf = shufflevector <8 x i32> %9, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx = add <8 x i32> %9, %rdx.shuf
+  %rdx.shuf229 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx230 = add <8 x i32> %bin.rdx, %rdx.shuf229
+  %rdx.shuf231 = shufflevector <8 x i32> %bin.rdx230, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx232 = add <8 x i32> %bin.rdx230, %rdx.shuf231
+  %10 = extractelement <8 x i32> %bin.rdx232, i32 0
+  ret i32 %10
+}
+
+define i32 @sad8_32bit_icmp_sgt(i8* nocapture readonly %cur, i8* nocapture readonly %ref, i32 %stride) local_unnamed_addr #1 {
+; SSE2-LABEL: sad8_32bit_icmp_sgt:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; SSE2-NEXT:    psadbw %xmm0, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    retq
+;
+; AVX2-LABEL: sad8_32bit_icmp_sgt:
+; AVX2:       # BB#0: # %entry
+; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX2-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX2-NEXT:    vpsadbw %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vmovd %xmm0, %eax
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: sad8_32bit_icmp_sgt:
+; AVX512F:       # BB#0: # %entry
+; AVX512F-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX512F-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512F-NEXT:    vpsadbw %xmm0, %xmm1, %xmm0
+; AVX512F-NEXT:    vmovd %xmm0, %eax
+; AVX512F-NEXT:    retq
+entry:
+  %idx.ext = zext i32 %stride to i64
+  br label %for.body
+
+for.body:                                         ; preds = %entry
+  %0 = bitcast i8* %cur to <8 x i8>*
+  %1 = load <8 x i8>, <8 x i8>* %0, align 1
+  %2 = zext <8 x i8> %1 to <8 x i32>
+  %3 = bitcast i8* %ref to <8 x i8>*
+  %4 = load <8 x i8>, <8 x i8>* %3, align 1
+  %5 = zext <8 x i8> %4 to <8 x i32>
+  %6 = sub nsw <8 x i32> %2, %5
+  %7 = icmp sgt <8 x i32> %6, zeroinitializer
+  %8 = sub nsw <8 x i32> zeroinitializer, %6
+  %9 = select <8 x i1> %7, <8 x i32> %6, <8 x i32> %8
+  %rdx.shuf = shufflevector <8 x i32> %9, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx = add <8 x i32> %9, %rdx.shuf
+  %rdx.shuf229 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx230 = add <8 x i32> %bin.rdx, %rdx.shuf229
+  %rdx.shuf231 = shufflevector <8 x i32> %bin.rdx230, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx232 = add <8 x i32> %bin.rdx230, %rdx.shuf231
+  %10 = extractelement <8 x i32> %bin.rdx232, i32 0
+  ret i32 %10
+}
+
+define i32 @sad8_32bit_icmp_sle(i8* nocapture readonly %cur, i8* nocapture readonly %ref, i32 %stride) local_unnamed_addr #2 {
+; SSE2-LABEL: sad8_32bit_icmp_sle:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; SSE2-NEXT:    psadbw %xmm0, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    retq
+;
+; AVX2-LABEL: sad8_32bit_icmp_sle:
+; AVX2:       # BB#0: # %entry
+; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX2-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX2-NEXT:    vpsadbw %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vmovd %xmm0, %eax
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: sad8_32bit_icmp_sle:
+; AVX512F:       # BB#0: # %entry
+; AVX512F-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX512F-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512F-NEXT:    vpsadbw %xmm0, %xmm1, %xmm0
+; AVX512F-NEXT:    vmovd %xmm0, %eax
+; AVX512F-NEXT:    retq
+entry:
+  %idx.ext = zext i32 %stride to i64
+  br label %for.body
+
+for.body:                                         ; preds = %entry
+  %0 = bitcast i8* %cur to <8 x i8>*
+  %1 = load <8 x i8>, <8 x i8>* %0, align 1
+  %2 = zext <8 x i8> %1 to <8 x i32>
+  %3 = bitcast i8* %ref to <8 x i8>*
+  %4 = load <8 x i8>, <8 x i8>* %3, align 1
+  %5 = zext <8 x i8> %4 to <8 x i32>
+  %6 = sub nsw <8 x i32> %2, %5
+  %7 = icmp slt <8 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %8 = sub nsw <8 x i32> zeroinitializer, %6
+  %9 = select <8 x i1> %7, <8 x i32> %8, <8 x i32> %6
+  %rdx.shuf = shufflevector <8 x i32> %9, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx = add <8 x i32> %9, %rdx.shuf
+  %rdx.shuf229 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx230 = add <8 x i32> %bin.rdx, %rdx.shuf229
+  %rdx.shuf231 = shufflevector <8 x i32> %bin.rdx230, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx232 = add <8 x i32> %bin.rdx230, %rdx.shuf231
+  %10 = extractelement <8 x i32> %bin.rdx232, i32 0
+  ret i32 %10
+}
+
+define i32 @sad8_32bit_icmp_slt(i8* nocapture readonly %cur, i8* nocapture readonly %ref, i32 %stride) local_unnamed_addr #3 {
+; SSE2-LABEL: sad8_32bit_icmp_slt:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; SSE2-NEXT:    psadbw %xmm0, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    retq
+;
+; AVX2-LABEL: sad8_32bit_icmp_slt:
+; AVX2:       # BB#0: # %entry
+; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX2-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX2-NEXT:    vpsadbw %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vmovd %xmm0, %eax
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: sad8_32bit_icmp_slt:
+; AVX512F:       # BB#0: # %entry
+; AVX512F-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX512F-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512F-NEXT:    vpsadbw %xmm0, %xmm1, %xmm0
+; AVX512F-NEXT:    vmovd %xmm0, %eax
+; AVX512F-NEXT:    retq
+entry:
+  %idx.ext = zext i32 %stride to i64
+  br label %for.body
+
+for.body:                                         ; preds = %entry
+  %0 = bitcast i8* %cur to <8 x i8>*
+  %1 = load <8 x i8>, <8 x i8>* %0, align 1
+  %2 = zext <8 x i8> %1 to <8 x i32>
+  %3 = bitcast i8* %ref to <8 x i8>*
+  %4 = load <8 x i8>, <8 x i8>* %3, align 1
+  %5 = zext <8 x i8> %4 to <8 x i32>
+  %6 = sub nsw <8 x i32> %2, %5
+  %7 = icmp slt <8 x i32> %6, zeroinitializer
+  %8 = sub nsw <8 x i32> zeroinitializer, %6
+  %9 = select <8 x i1> %7, <8 x i32> %8, <8 x i32> %6
+  %rdx.shuf = shufflevector <8 x i32> %9, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx = add <8 x i32> %9, %rdx.shuf
+  %rdx.shuf229 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx230 = add <8 x i32> %bin.rdx, %rdx.shuf229
+  %rdx.shuf231 = shufflevector <8 x i32> %bin.rdx230, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx232 = add <8 x i32> %bin.rdx230, %rdx.shuf231
+  %10 = extractelement <8 x i32> %bin.rdx232, i32 0
+  ret i32 %10
+}
+
+define i64 @sad8_64bit_icmp_sext_slt(i8* nocapture readonly %cur, i8* nocapture readonly %ref, i64 %stride) local_unnamed_addr #4 {
+; SSE2-LABEL: sad8_64bit_icmp_sext_slt:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; SSE2-NEXT:    psadbw %xmm0, %xmm1
+; SSE2-NEXT:    movd %xmm1, %rax
+; SSE2-NEXT:    retq
+;
+; AVX2-LABEL: sad8_64bit_icmp_sext_slt:
+; AVX2:       # BB#0: # %entry
+; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX2-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX2-NEXT:    vpsadbw %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vmovq %xmm0, %rax
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: sad8_64bit_icmp_sext_slt:
+; AVX512F:       # BB#0: # %entry
+; AVX512F-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX512F-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512F-NEXT:    vpsadbw %xmm0, %xmm1, %xmm0
+; AVX512F-NEXT:    vmovq %xmm0, %rax
+; AVX512F-NEXT:    retq
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry
+  %0 = bitcast i8* %cur to <8 x i8>*
+  %1 = load <8 x i8>, <8 x i8>* %0, align 1
+  %2 = zext <8 x i8> %1 to <8 x i32>
+  %3 = bitcast i8* %ref to <8 x i8>*
+  %4 = load <8 x i8>, <8 x i8>* %3, align 1
+  %5 = zext <8 x i8> %4 to <8 x i32>
+  %6 = sub nsw <8 x i32> %2, %5
+  %7 = icmp slt <8 x i32> %6, zeroinitializer
+  %8 = sub nsw <8 x i32> zeroinitializer, %6
+  %9 = select <8 x i1> %7, <8 x i32> %8, <8 x i32> %6
+  %10 = sext <8 x i32> %9 to <8 x i64>
+  %rdx.shuf = shufflevector <8 x i64> %10, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx = add <8 x i64> %rdx.shuf, %10
+  %rdx.shuf236 = shufflevector <8 x i64> %bin.rdx, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx237 = add <8 x i64> %bin.rdx, %rdx.shuf236
+  %rdx.shuf238 = shufflevector <8 x i64> %bin.rdx237, <8 x i64> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx239 = add <8 x i64> %bin.rdx237, %rdx.shuf238
+  %11 = extractelement <8 x i64> %bin.rdx239, i32 0
+  ret i64 %11
+}
+
+define i64 @sad8_64bit_icmp_zext_slt(i8* nocapture readonly %cur, i8* nocapture readonly %ref, i64 %stride) local_unnamed_addr #4 {
+; SSE2-LABEL: sad8_64bit_icmp_zext_slt:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; SSE2-NEXT:    psadbw %xmm0, %xmm1
+; SSE2-NEXT:    movd %xmm1, %rax
+; SSE2-NEXT:    retq
+;
+; AVX2-LABEL: sad8_64bit_icmp_zext_slt:
+; AVX2:       # BB#0: # %entry
+; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX2-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX2-NEXT:    vpsadbw %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vmovq %xmm0, %rax
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: sad8_64bit_icmp_zext_slt:
+; AVX512F:       # BB#0: # %entry
+; AVX512F-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX512F-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512F-NEXT:    vpsadbw %xmm0, %xmm1, %xmm0
+; AVX512F-NEXT:    vmovq %xmm0, %rax
+; AVX512F-NEXT:    retq
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry
+  %0 = bitcast i8* %cur to <8 x i8>*
+  %1 = load <8 x i8>, <8 x i8>* %0, align 1
+  %2 = zext <8 x i8> %1 to <8 x i32>
+  %3 = bitcast i8* %ref to <8 x i8>*
+  %4 = load <8 x i8>, <8 x i8>* %3, align 1
+  %5 = zext <8 x i8> %4 to <8 x i32>
+  %6 = sub nsw <8 x i32> %2, %5
+  %7 = icmp slt <8 x i32> %6, zeroinitializer
+  %8 = sub nsw <8 x i32> zeroinitializer, %6
+  %9 = select <8 x i1> %7, <8 x i32> %8, <8 x i32> %6
+  %10 = zext <8 x i32> %9 to <8 x i64>
+  %rdx.shuf = shufflevector <8 x i64> %10, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx = add <8 x i64> %rdx.shuf, %10
+  %rdx.shuf236 = shufflevector <8 x i64> %bin.rdx, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx237 = add <8 x i64> %bin.rdx, %rdx.shuf236
+  %rdx.shuf238 = shufflevector <8 x i64> %bin.rdx237, <8 x i64> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx239 = add <8 x i64> %bin.rdx237, %rdx.shuf238
+  %11 = extractelement <8 x i64> %bin.rdx239, i32 0
+  ret i64 %11
+}
+
+define i64 @sad8_early_64bit_icmp_zext_slt(i8* nocapture readonly %cur, i8* nocapture readonly %ref, i64 %stride) local_unnamed_addr #4 {
+; SSE2-LABEL: sad8_early_64bit_icmp_zext_slt:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; SSE2-NEXT:    psadbw %xmm0, %xmm1
+; SSE2-NEXT:    movd %xmm1, %rax
+; SSE2-NEXT:    retq
+;
+; AVX2-LABEL: sad8_early_64bit_icmp_zext_slt:
+; AVX2:       # BB#0: # %entry
+; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX2-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX2-NEXT:    vpsadbw %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vmovq %xmm0, %rax
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: sad8_early_64bit_icmp_zext_slt:
+; AVX512F:       # BB#0: # %entry
+; AVX512F-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX512F-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512F-NEXT:    vpsadbw %xmm0, %xmm1, %xmm0
+; AVX512F-NEXT:    vmovq %xmm0, %rax
+; AVX512F-NEXT:    retq
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry
+  %0 = bitcast i8* %cur to <8 x i8>*
+  %1 = load <8 x i8>, <8 x i8>* %0, align 1
+  %2 = zext <8 x i8> %1 to <8 x i64>
+  %3 = bitcast i8* %ref to <8 x i8>*
+  %4 = load <8 x i8>, <8 x i8>* %3, align 1
+  %5 = zext <8 x i8> %4 to <8 x i64>
+  %6 = sub nsw <8 x i64> %2, %5
+  %7 = icmp slt <8 x i64> %6, zeroinitializer
+  %8 = sub nsw <8 x i64> zeroinitializer, %6
+  %9 = select <8 x i1> %7, <8 x i64> %8, <8 x i64> %6
+  %rdx.shuf = shufflevector <8 x i64> %9, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx = add <8 x i64> %rdx.shuf, %9
+  %rdx.shuf236 = shufflevector <8 x i64> %bin.rdx, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx237 = add <8 x i64> %bin.rdx, %rdx.shuf236
+  %rdx.shuf238 = shufflevector <8 x i64> %bin.rdx237, <8 x i64> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx239 = add <8 x i64> %bin.rdx237, %rdx.shuf238
+  %10 = extractelement <8 x i64> %bin.rdx239, i32 0
+  ret i64 %10
+}