[x86] scalarize extract element 0 of FP math

This is another step towards ensuring that we produce optimal code for reductions,
but there are other potential benefits, as seen in the test diffs:

  1. Memory loads may get scalarized, resulting in more efficient code.
  2. Memory stores may get scalarized, resulting in more efficient code.
  3. Complex ops like fdiv/sqrt get scalarized, which may use faster instructions depending on the uarch.
  4. Even simple ops like addss/subss/mulss/roundss may result in faster operation/less frequency throttling when scalarized, depending on the uarch.

The TODO comment suggests one or more follow-ups for opcodes that can currently result in regressions.

Differential Revision: https://reviews.llvm.org/D58282

llvm-svn: 355130
Sanjay Patel 2019-02-28 19:47:04 +00:00
parent fadb22f4e2
commit 7fc6ef7dd7
12 changed files with 306 additions and 258 deletions
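As a concrete illustration of the fold, the stand-alone IR below reproduces the fadd_v4f32 case that appears in the test diffs further down (the test file names are not shown in this view, so the surrounding context is assumed). The vector fadd has a single use, an extract of element 0, so the new combine rewrites the extract as a scalar fadd of the extracted operands, and the AVX output changes from vaddps to vaddss.

; Mirrors the fadd_v4f32 test updated in this commit.
; Before this patch: vaddps %xmm1, %xmm0, %xmm0
; After this patch:  vaddss %xmm1, %xmm0, %xmm0
define float @fadd_v4f32(<4 x float> %x, <4 x float> %y) nounwind {
  %v = fadd <4 x float> %x, %y
  %r = extractelement <4 x float> %v, i32 0
  ret float %r
}

The same mechanism accounts for the larger wins in the v4f64 tests below: once the math is performed on the extracted scalar, the 256-bit vsqrtpd/vroundpd and the "# kill" subregister annotation are replaced by a single vsqrtsd/vroundsd.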


@ -34240,6 +34240,62 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
  return SDValue();
}

/// Extracting a scalar FP value from vector element 0 is free, so extract each
/// operand first, then perform the math as a scalar op.
static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG) {
  assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Expected extract");
  SDValue Vec = ExtElt->getOperand(0);
  SDValue Index = ExtElt->getOperand(1);
  EVT VT = ExtElt->getValueType(0);
  EVT VecVT = Vec.getValueType();

  // TODO: If this is a unary/expensive/expand op, allow extraction from a
  // non-zero element because the shuffle+scalar op will be cheaper?
  if (!Vec.hasOneUse() || !isNullConstant(Index) || VecVT.getScalarType() != VT)
    return SDValue();

  if (VT != MVT::f32 && VT != MVT::f64)
    return SDValue();

  // TODO: This switch could include FNEG, the x86-specific FP logic ops
  // (FAND, FANDN, FOR, FXOR), FRSQRT/FRCP and other FP math ops. But that may
  // require enhancements to avoid missed load folding and fma+fneg combining.
  switch (Vec.getOpcode()) {
  case ISD::FMA: // Begin 3 operands
  case ISD::FMAD:
  case ISD::FADD: // Begin 2 operands
  case ISD::FSUB:
  case ISD::FMUL:
  case ISD::FDIV:
  case ISD::FREM:
  case ISD::FCOPYSIGN:
  case ISD::FMINNUM:
  case ISD::FMAXNUM:
  case ISD::FMINNUM_IEEE:
  case ISD::FMAXNUM_IEEE:
  case ISD::FMAXIMUM:
  case ISD::FMINIMUM:
  case ISD::FABS: // Begin 1 operand
  case ISD::FSQRT:
  case ISD::FRINT:
  case ISD::FCEIL:
  case ISD::FTRUNC:
  case ISD::FNEARBYINT:
  case ISD::FROUND:
  case ISD::FFLOOR: {
    // extract (fp X, Y, ...), 0 --> fp (extract X, 0), (extract Y, 0), ...
    SDLoc DL(ExtElt);
    SmallVector<SDValue, 4> ExtOps;
    for (SDValue Op : Vec->ops())
      ExtOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op, Index));
    return DAG.getNode(Vec.getOpcode(), DL, VT, ExtOps);
  }
  default:
    return SDValue();
  }
  llvm_unreachable("All opcodes should return within switch");
}
/// Detect vector gather/scatter index generation and convert it from being a
/// bunch of shuffles and extracts into a somewhat faster sequence.
/// For i686, the best sequence is apparently storing the value and loading
@ -34310,6 +34366,9 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
  if (SDValue MinMax = combineHorizontalMinMaxResult(N, DAG, Subtarget))
    return MinMax;

  if (SDValue V = scalarizeExtEltFP(N, DAG))
    return V;

  return SDValue();
}


@ -8,14 +8,14 @@ define void @test1(float* %A, float* %C) #0 {
; X86: ## %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: vmovaps (%ecx), %xmm0
; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT: vandps LCPI0_0, %xmm0, %xmm0
; X86-NEXT: vmovss %xmm0, (%eax)
; X86-NEXT: retl
;
; X64-LABEL: test1:
; X64: ## %bb.0:
; X64-NEXT: vmovaps (%rdi), %xmm0
; X64-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
; X64-NEXT: vmovss %xmm0, (%rsi)
; X64-NEXT: retq


@ -62,7 +62,7 @@ define float @fhadd_16(<16 x float> %x225) {
; KNL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; KNL-NEXT: vaddps %xmm1, %xmm0, %xmm0
; KNL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; KNL-NEXT: vaddps %xmm1, %xmm0, %xmm0
; KNL-NEXT: vaddss %xmm1, %xmm0, %xmm0
; KNL-NEXT: retq
;
; SKX-LABEL: fhadd_16:
@ -70,7 +70,7 @@ define float @fhadd_16(<16 x float> %x225) {
; SKX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; SKX-NEXT: vaddps %xmm1, %xmm0, %xmm0
; SKX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SKX-NEXT: vaddps %xmm1, %xmm0, %xmm0
; SKX-NEXT: vaddss %xmm1, %xmm0, %xmm0
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%x226 = shufflevector <16 x float> %x225, <16 x float> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@ -87,7 +87,7 @@ define float @fhsub_16(<16 x float> %x225) {
; KNL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; KNL-NEXT: vaddps %xmm1, %xmm0, %xmm0
; KNL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; KNL-NEXT: vsubps %xmm1, %xmm0, %xmm0
; KNL-NEXT: vsubss %xmm1, %xmm0, %xmm0
; KNL-NEXT: retq
;
; SKX-LABEL: fhsub_16:
@ -95,7 +95,7 @@ define float @fhsub_16(<16 x float> %x225) {
; SKX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; SKX-NEXT: vaddps %xmm1, %xmm0, %xmm0
; SKX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SKX-NEXT: vsubps %xmm1, %xmm0, %xmm0
; SKX-NEXT: vsubss %xmm1, %xmm0, %xmm0
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%x226 = shufflevector <16 x float> %x225, <16 x float> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@ -227,13 +227,13 @@ define double @fadd_noundef_eel(<8 x double> %x225, <8 x double> %x227) {
; KNL-LABEL: fadd_noundef_eel:
; KNL: # %bb.0:
; KNL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; KNL-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; KNL-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; KNL-NEXT: retq
;
; SKX-LABEL: fadd_noundef_eel:
; SKX: # %bb.0:
; SKX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; SKX-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; SKX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%x226 = shufflevector <8 x double> %x225, <8 x double> %x227, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>


@ -7289,8 +7289,8 @@ define double @test_mm512_reduce_add_pd(<8 x double> %__W) {
; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovlpd %xmm0, (%esp)
; X86-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovsd %xmm0, (%esp)
; X86-NEXT: fldl (%esp)
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
@ -7305,7 +7305,7 @@ define double @test_mm512_reduce_add_pd(<8 x double> %__W) {
; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; X64-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
@ -7336,8 +7336,8 @@ define double @test_mm512_reduce_mul_pd(<8 x double> %__W) {
; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovlpd %xmm0, (%esp)
; X86-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovsd %xmm0, (%esp)
; X86-NEXT: fldl (%esp)
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
@ -7352,7 +7352,7 @@ define double @test_mm512_reduce_mul_pd(<8 x double> %__W) {
; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; X64-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
@ -7380,7 +7380,7 @@ define float @test_mm512_reduce_add_ps(<16 x float> %__W) {
; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT: vaddps %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X86-NEXT: vaddps %xmm1, %xmm0, %xmm0
; X86-NEXT: vaddss %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
@ -7397,7 +7397,7 @@ define float @test_mm512_reduce_add_ps(<16 x float> %__W) {
; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT: vaddps %xmm1, %xmm0, %xmm0
; X64-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X64-NEXT: vaddps %xmm1, %xmm0, %xmm0
; X64-NEXT: vaddss %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
@ -7430,7 +7430,7 @@ define float @test_mm512_reduce_mul_ps(<16 x float> %__W) {
; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT: vmulps %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X86-NEXT: vmulps %xmm1, %xmm0, %xmm0
; X86-NEXT: vmulss %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
@ -7447,7 +7447,7 @@ define float @test_mm512_reduce_mul_ps(<16 x float> %__W) {
; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT: vmulps %xmm1, %xmm0, %xmm0
; X64-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X64-NEXT: vmulps %xmm1, %xmm0, %xmm0
; X64-NEXT: vmulss %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
@ -7486,8 +7486,8 @@ define double @test_mm512_mask_reduce_add_pd(i8 zeroext %__M, <8 x double> %__W)
; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovlpd %xmm0, (%esp)
; X86-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovsd %xmm0, (%esp)
; X86-NEXT: fldl (%esp)
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
@ -7504,7 +7504,7 @@ define double @test_mm512_mask_reduce_add_pd(i8 zeroext %__M, <8 x double> %__W)
; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; X64-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
@ -7541,8 +7541,8 @@ define double @test_mm512_mask_reduce_mul_pd(i8 zeroext %__M, <8 x double> %__W)
; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovlpd %xmm0, (%esp)
; X86-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovsd %xmm0, (%esp)
; X86-NEXT: fldl (%esp)
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
@ -7560,7 +7560,7 @@ define double @test_mm512_mask_reduce_mul_pd(i8 zeroext %__M, <8 x double> %__W)
; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; X64-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
@ -7593,7 +7593,7 @@ define float @test_mm512_mask_reduce_add_ps(i16 zeroext %__M, <16 x float> %__W)
; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT: vaddps %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X86-NEXT: vaddps %xmm1, %xmm0, %xmm0
; X86-NEXT: vaddss %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
@ -7612,7 +7612,7 @@ define float @test_mm512_mask_reduce_add_ps(i16 zeroext %__M, <16 x float> %__W)
; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT: vaddps %xmm1, %xmm0, %xmm0
; X64-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X64-NEXT: vaddps %xmm1, %xmm0, %xmm0
; X64-NEXT: vaddss %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
@ -7651,7 +7651,7 @@ define float @test_mm512_mask_reduce_mul_ps(i16 zeroext %__M, <16 x float> %__W)
; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT: vmulps %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X86-NEXT: vmulps %xmm1, %xmm0, %xmm0
; X86-NEXT: vmulss %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
@ -7671,7 +7671,7 @@ define float @test_mm512_mask_reduce_mul_ps(i16 zeroext %__M, <16 x float> %__W)
; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT: vmulps %xmm1, %xmm0, %xmm0
; X64-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X64-NEXT: vmulps %xmm1, %xmm0, %xmm0
; X64-NEXT: vmulss %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:


@ -32,14 +32,14 @@ define void @store_floats(<4 x float> %x, i64* %p) {
define void @store_double(<2 x double> %x, i64* %p) {
; SSE-LABEL: store_double:
; SSE: # %bb.0:
; SSE-NEXT: addpd %xmm0, %xmm0
; SSE-NEXT: movlpd %xmm0, (%rdi)
; SSE-NEXT: addsd %xmm0, %xmm0
; SSE-NEXT: movsd %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: store_double:
; AVX: # %bb.0:
; AVX-NEXT: vaddpd %xmm0, %xmm0, %xmm0
; AVX-NEXT: vmovlpd %xmm0, (%rdi)
; AVX-NEXT: vaddsd %xmm0, %xmm0, %xmm0
; AVX-NEXT: vmovsd %xmm0, (%rdi)
; AVX-NEXT: retq
%a = fadd <2 x double> %x, %x
%b = extractelement <2 x double> %a, i32 0


@ -27,7 +27,7 @@ define double @fneg_v4f64(<4 x double> %x) nounwind {
define float @fadd_v4f32(<4 x float> %x, <4 x float> %y) nounwind {
; CHECK-LABEL: fadd_v4f32:
; CHECK: # %bb.0:
; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%v = fadd <4 x float> %x, %y
%r = extractelement <4 x float> %v, i32 0
@ -37,7 +37,7 @@ define float @fadd_v4f32(<4 x float> %x, <4 x float> %y) nounwind {
define double @fadd_v4f64(<4 x double> %x, <4 x double> %y) nounwind {
; CHECK-LABEL: fadd_v4f64:
; CHECK: # %bb.0:
; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%v = fadd <4 x double> %x, %y
@ -48,7 +48,7 @@ define double @fadd_v4f64(<4 x double> %x, <4 x double> %y) nounwind {
define float @fsub_v4f32(<4 x float> %x, <4 x float> %y) nounwind {
; CHECK-LABEL: fsub_v4f32:
; CHECK: # %bb.0:
; CHECK-NEXT: vsubps %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vsubss %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%v = fsub <4 x float> %x, %y
%r = extractelement <4 x float> %v, i32 0
@ -58,7 +58,7 @@ define float @fsub_v4f32(<4 x float> %x, <4 x float> %y) nounwind {
define double @fsub_v4f64(<4 x double> %x, <4 x double> %y) nounwind {
; CHECK-LABEL: fsub_v4f64:
; CHECK: # %bb.0:
; CHECK-NEXT: vsubpd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vsubsd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%v = fsub <4 x double> %x, %y
@ -69,7 +69,7 @@ define double @fsub_v4f64(<4 x double> %x, <4 x double> %y) nounwind {
define float @fmul_v4f32(<4 x float> %x, <4 x float> %y) nounwind {
; CHECK-LABEL: fmul_v4f32:
; CHECK: # %bb.0:
; CHECK-NEXT: vmulps %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vmulss %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%v = fmul <4 x float> %x, %y
%r = extractelement <4 x float> %v, i32 0
@ -79,7 +79,7 @@ define float @fmul_v4f32(<4 x float> %x, <4 x float> %y) nounwind {
define double @fmul_v4f64(<4 x double> %x, <4 x double> %y) nounwind {
; CHECK-LABEL: fmul_v4f64:
; CHECK: # %bb.0:
; CHECK-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%v = fmul <4 x double> %x, %y
@ -90,7 +90,7 @@ define double @fmul_v4f64(<4 x double> %x, <4 x double> %y) nounwind {
define float @fdiv_v4f32(<4 x float> %x, <4 x float> %y) nounwind {
; CHECK-LABEL: fdiv_v4f32:
; CHECK: # %bb.0:
; CHECK-NEXT: vdivps %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vdivss %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%v = fdiv <4 x float> %x, %y
%r = extractelement <4 x float> %v, i32 0
@ -100,7 +100,7 @@ define float @fdiv_v4f32(<4 x float> %x, <4 x float> %y) nounwind {
define double @fdiv_v4f64(<4 x double> %x, <4 x double> %y) nounwind {
; CHECK-LABEL: fdiv_v4f64:
; CHECK: # %bb.0:
; CHECK-NEXT: vdivpd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vdivsd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%v = fdiv <4 x double> %x, %y
@ -132,7 +132,7 @@ define double @frem_v4f64(<4 x double> %x, <4 x double> %y) nounwind {
define float @fsqrt_v4f32(<4 x float> %x) nounwind {
; CHECK-LABEL: fsqrt_v4f32:
; CHECK: # %bb.0:
; CHECK-NEXT: vsqrtps %xmm0, %xmm0
; CHECK-NEXT: vsqrtss %xmm0, %xmm0, %xmm0
; CHECK-NEXT: retq
%v = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %x)
%r = extractelement <4 x float> %v, i32 0
@ -142,8 +142,7 @@ define float @fsqrt_v4f32(<4 x float> %x) nounwind {
define double @fsqrt_v4f64(<4 x double> %x) nounwind {
; CHECK-LABEL: fsqrt_v4f64:
; CHECK: # %bb.0:
; CHECK-NEXT: vsqrtpd %ymm0, %ymm0
; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; CHECK-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%v = call <4 x double> @llvm.sqrt.v4f64(<4 x double> %x)
@ -174,7 +173,7 @@ define double @fsin_v4f64(<4 x double> %x) nounwind {
define float @fma_v4f32(<4 x float> %x, <4 x float> %y, <4 x float> %z) nounwind {
; CHECK-LABEL: fma_v4f32:
; CHECK: # %bb.0:
; CHECK-NEXT: vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
; CHECK-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
; CHECK-NEXT: retq
%v = call <4 x float> @llvm.fma.v4f32(<4 x float> %x, <4 x float> %y, <4 x float> %z)
%r = extractelement <4 x float> %v, i32 0
@ -184,7 +183,7 @@ define float @fma_v4f32(<4 x float> %x, <4 x float> %y, <4 x float> %z) nounwind
define double @fma_v4f64(<4 x double> %x, <4 x double> %y, <4 x double> %z) nounwind {
; CHECK-LABEL: fma_v4f64:
; CHECK: # %bb.0:
; CHECK-NEXT: vfmadd213pd {{.*#+}} ymm0 = (ymm1 * ymm0) + ymm2
; CHECK-NEXT: vfmadd213sd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@ -207,8 +206,7 @@ define float @fabs_v4f32(<4 x float> %x) nounwind {
define double @fabs_v4f64(<4 x double> %x) nounwind {
; CHECK-LABEL: fabs_v4f64:
; CHECK: # %bb.0:
; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm1 = [NaN,NaN,NaN,NaN]
; CHECK-NEXT: vandps %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%v = call <4 x double> @llvm.fabs.v4f64(<4 x double> %x)
@ -219,8 +217,8 @@ define double @fabs_v4f64(<4 x double> %x) nounwind {
define float @fmaxnum_v4f32(<4 x float> %x, <4 x float> %y) nounwind {
; CHECK-LABEL: fmaxnum_v4f32:
; CHECK: # %bb.0:
; CHECK-NEXT: vmaxps %xmm0, %xmm1, %xmm2
; CHECK-NEXT: vcmpunordps %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vmaxss %xmm0, %xmm1, %xmm2
; CHECK-NEXT: vcmpunordss %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0
; CHECK-NEXT: retq
%v = call <4 x float> @llvm.maxnum.v4f32(<4 x float> %x, <4 x float> %y)
@ -231,10 +229,9 @@ define float @fmaxnum_v4f32(<4 x float> %x, <4 x float> %y) nounwind {
define double @fmaxnum_v4f64(<4 x double> %x, <4 x double> %y) nounwind {
; CHECK-LABEL: fmaxnum_v4f64:
; CHECK: # %bb.0:
; CHECK-NEXT: vmaxpd %ymm0, %ymm1, %ymm2
; CHECK-NEXT: vcmpunordpd %ymm0, %ymm0, %ymm0
; CHECK-NEXT: vblendvpd %ymm0, %ymm1, %ymm2, %ymm0
; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; CHECK-NEXT: vmaxsd %xmm0, %xmm1, %xmm2
; CHECK-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vblendvpd %xmm0, %xmm1, %xmm2, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%v = call <4 x double> @llvm.maxnum.v4f64(<4 x double> %x, <4 x double> %y)
@ -245,8 +242,8 @@ define double @fmaxnum_v4f64(<4 x double> %x, <4 x double> %y) nounwind {
define float @fminnum_v4f32(<4 x float> %x, <4 x float> %y) nounwind {
; CHECK-LABEL: fminnum_v4f32:
; CHECK: # %bb.0:
; CHECK-NEXT: vminps %xmm0, %xmm1, %xmm2
; CHECK-NEXT: vcmpunordps %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vminss %xmm0, %xmm1, %xmm2
; CHECK-NEXT: vcmpunordss %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0
; CHECK-NEXT: retq
%v = call <4 x float> @llvm.minnum.v4f32(<4 x float> %x, <4 x float> %y)
@ -257,10 +254,9 @@ define float @fminnum_v4f32(<4 x float> %x, <4 x float> %y) nounwind {
define double @fminnum_v4f64(<4 x double> %x, <4 x double> %y) nounwind {
; CHECK-LABEL: fminnum_v4f64:
; CHECK: # %bb.0:
; CHECK-NEXT: vminpd %ymm0, %ymm1, %ymm2
; CHECK-NEXT: vcmpunordpd %ymm0, %ymm0, %ymm0
; CHECK-NEXT: vblendvpd %ymm0, %ymm1, %ymm2, %ymm0
; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; CHECK-NEXT: vminsd %xmm0, %xmm1, %xmm2
; CHECK-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vblendvpd %xmm0, %xmm1, %xmm2, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%v = call <4 x double> @llvm.minnum.v4f64(<4 x double> %x, <4 x double> %y)
@ -309,10 +305,8 @@ define float @copysign_v4f32(<4 x float> %x, <4 x float> %y) nounwind {
define double @copysign_v4f64(<4 x double> %x, <4 x double> %y) nounwind {
; CHECK-LABEL: copysign_v4f64:
; CHECK: # %bb.0:
; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm2 = [NaN,NaN,NaN,NaN]
; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; CHECK-NEXT: vandps %xmm3, %xmm1, %xmm1
; CHECK-NEXT: vandps %xmm2, %xmm0, %xmm0
; CHECK-NEXT: vandps {{.*}}(%rip), %xmm1, %xmm1
; CHECK-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
; CHECK-NEXT: vorps %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@ -324,7 +318,7 @@ define double @copysign_v4f64(<4 x double> %x, <4 x double> %y) nounwind {
define float @floor_v4f32(<4 x float> %x) nounwind {
; CHECK-LABEL: floor_v4f32:
; CHECK: # %bb.0:
; CHECK-NEXT: vroundps $9, %xmm0, %xmm0
; CHECK-NEXT: vroundss $9, %xmm0, %xmm0, %xmm0
; CHECK-NEXT: retq
%v = call <4 x float> @llvm.floor.v4f32(<4 x float> %x)
%r = extractelement <4 x float> %v, i32 0
@ -334,8 +328,7 @@ define float @floor_v4f32(<4 x float> %x) nounwind {
define double @floor_v4f64(<4 x double> %x) nounwind {
; CHECK-LABEL: floor_v4f64:
; CHECK: # %bb.0:
; CHECK-NEXT: vroundpd $9, %ymm0, %ymm0
; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; CHECK-NEXT: vroundsd $9, %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%v = call <4 x double> @llvm.floor.v4f64(<4 x double> %x)
@ -346,7 +339,7 @@ define double @floor_v4f64(<4 x double> %x) nounwind {
define float @ceil_v4f32(<4 x float> %x) nounwind {
; CHECK-LABEL: ceil_v4f32:
; CHECK: # %bb.0:
; CHECK-NEXT: vroundps $10, %xmm0, %xmm0
; CHECK-NEXT: vroundss $10, %xmm0, %xmm0, %xmm0
; CHECK-NEXT: retq
%v = call <4 x float> @llvm.ceil.v4f32(<4 x float> %x)
%r = extractelement <4 x float> %v, i32 0
@ -356,8 +349,7 @@ define float @ceil_v4f32(<4 x float> %x) nounwind {
define double @ceil_v4f64(<4 x double> %x) nounwind {
; CHECK-LABEL: ceil_v4f64:
; CHECK: # %bb.0:
; CHECK-NEXT: vroundpd $10, %ymm0, %ymm0
; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; CHECK-NEXT: vroundsd $10, %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%v = call <4 x double> @llvm.ceil.v4f64(<4 x double> %x)
@ -368,7 +360,7 @@ define double @ceil_v4f64(<4 x double> %x) nounwind {
define float @trunc_v4f32(<4 x float> %x) nounwind {
; CHECK-LABEL: trunc_v4f32:
; CHECK: # %bb.0:
; CHECK-NEXT: vroundps $11, %xmm0, %xmm0
; CHECK-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0
; CHECK-NEXT: retq
%v = call <4 x float> @llvm.trunc.v4f32(<4 x float> %x)
%r = extractelement <4 x float> %v, i32 0
@ -378,8 +370,7 @@ define float @trunc_v4f32(<4 x float> %x) nounwind {
define double @trunc_v4f64(<4 x double> %x) nounwind {
; CHECK-LABEL: trunc_v4f64:
; CHECK: # %bb.0:
; CHECK-NEXT: vroundpd $11, %ymm0, %ymm0
; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; CHECK-NEXT: vroundsd $11, %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%v = call <4 x double> @llvm.trunc.v4f64(<4 x double> %x)
@ -390,7 +381,7 @@ define double @trunc_v4f64(<4 x double> %x) nounwind {
define float @rint_v4f32(<4 x float> %x) nounwind {
; CHECK-LABEL: rint_v4f32:
; CHECK: # %bb.0:
; CHECK-NEXT: vroundps $4, %xmm0, %xmm0
; CHECK-NEXT: vroundss $4, %xmm0, %xmm0, %xmm0
; CHECK-NEXT: retq
%v = call <4 x float> @llvm.rint.v4f32(<4 x float> %x)
%r = extractelement <4 x float> %v, i32 0
@ -400,8 +391,7 @@ define float @rint_v4f32(<4 x float> %x) nounwind {
define double @rint_v4f64(<4 x double> %x) nounwind {
; CHECK-LABEL: rint_v4f64:
; CHECK: # %bb.0:
; CHECK-NEXT: vroundpd $4, %ymm0, %ymm0
; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; CHECK-NEXT: vroundsd $4, %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%v = call <4 x double> @llvm.rint.v4f64(<4 x double> %x)
@ -412,7 +402,7 @@ define double @rint_v4f64(<4 x double> %x) nounwind {
define float @nearbyint_v4f32(<4 x float> %x) nounwind {
; CHECK-LABEL: nearbyint_v4f32:
; CHECK: # %bb.0:
; CHECK-NEXT: vroundps $12, %xmm0, %xmm0
; CHECK-NEXT: vroundss $12, %xmm0, %xmm0, %xmm0
; CHECK-NEXT: retq
%v = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %x)
%r = extractelement <4 x float> %v, i32 0
@ -422,8 +412,7 @@ define float @nearbyint_v4f32(<4 x float> %x) nounwind {
define double @nearbyint_v4f64(<4 x double> %x) nounwind {
; CHECK-LABEL: nearbyint_v4f64:
; CHECK: # %bb.0:
; CHECK-NEXT: vroundpd $12, %ymm0, %ymm0
; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; CHECK-NEXT: vroundsd $12, %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%v = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> %x)


@ -43,7 +43,7 @@ define double @trunc_unsigned_f64(double %x) #0 {
; SSE2-NEXT: subpd {{.*}}(%rip), %xmm1
; SSE2-NEXT: movapd %xmm1, %xmm0
; SSE2-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE2-NEXT: addpd %xmm1, %xmm0
; SSE2-NEXT: addsd %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: trunc_unsigned_f64:


@ -1366,7 +1366,7 @@ define float @fadd_reduce_v8f32(float %a0, <8 x float> %a1) {
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE3-SLOW-NEXT: addps %xmm1, %xmm2
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm0 = xmm2[1,1,3,3]
; SSE3-SLOW-NEXT: addps %xmm2, %xmm0
; SSE3-SLOW-NEXT: addss %xmm2, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: fadd_reduce_v8f32:
@ -1385,7 +1385,7 @@ define float @fadd_reduce_v8f32(float %a0, <8 x float> %a1) {
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
@ -1408,7 +1408,7 @@ define double @fadd_reduce_v4f64(double %a0, <4 x double> %a1) {
; SSE3-SLOW-NEXT: addpd %xmm2, %xmm1
; SSE3-SLOW-NEXT: movapd %xmm1, %xmm0
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE3-SLOW-NEXT: addpd %xmm1, %xmm0
; SSE3-SLOW-NEXT: addsd %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: fadd_reduce_v4f64:
@ -1423,7 +1423,7 @@ define double @fadd_reduce_v4f64(double %a0, <4 x double> %a1) {
; AVX-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm0
; AVX-SLOW-NEXT: vaddpd %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;


@ -638,8 +638,8 @@ define double @u64_to_d(i64 %a) nounwind {
; SSE2_32-NEXT: subpd {{\.LCPI.*}}, %xmm0
; SSE2_32-NEXT: movapd %xmm0, %xmm1
; SSE2_32-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE2_32-NEXT: addpd %xmm0, %xmm1
; SSE2_32-NEXT: movlpd %xmm1, (%esp)
; SSE2_32-NEXT: addsd %xmm0, %xmm1
; SSE2_32-NEXT: movsd %xmm1, (%esp)
; SSE2_32-NEXT: fldl (%esp)
; SSE2_32-NEXT: movl %ebp, %esp
; SSE2_32-NEXT: popl %ebp
@ -652,7 +652,7 @@ define double @u64_to_d(i64 %a) nounwind {
; SSE2_64-NEXT: subpd {{.*}}(%rip), %xmm1
; SSE2_64-NEXT: movapd %xmm1, %xmm0
; SSE2_64-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE2_64-NEXT: addpd %xmm1, %xmm0
; SSE2_64-NEXT: addsd %xmm1, %xmm0
; SSE2_64-NEXT: retq
;
; X87-LABEL: u64_to_d:


@ -7,15 +7,15 @@ define void @test1(<4 x float>* %F, float* %f) nounwind {
; X32: # %bb.0: # %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movaps (%ecx), %xmm0
; X32-NEXT: addps %xmm0, %xmm0
; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT: addss %xmm0, %xmm0
; X32-NEXT: movss %xmm0, (%eax)
; X32-NEXT: retl
;
; X64-LABEL: test1:
; X64: # %bb.0: # %entry
; X64-NEXT: movaps (%rdi), %xmm0
; X64-NEXT: addps %xmm0, %xmm0
; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NEXT: addss %xmm0, %xmm0
; X64-NEXT: movss %xmm0, (%rsi)
; X64-NEXT: retq
entry:


@ -15,25 +15,25 @@ define float @test_v2f32(float %a0, <2 x float> %a1) {
; SSE2: # %bb.0:
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3]
; SSE2-NEXT: addps %xmm1, %xmm0
; SSE2-NEXT: addss %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v2f32:
; SSE41: # %bb.0:
; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE41-NEXT: addps %xmm1, %xmm0
; SSE41-NEXT: addss %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v2f32:
; AVX: # %bb.0:
; AVX-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; AVX-NEXT: vaddps %xmm0, %xmm1, %xmm0
; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v2f32:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; AVX512-NEXT: vaddps %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0
; AVX512-NEXT: retq
%1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v2f32(float %a0, <2 x float> %a1)
ret float %1
@ -47,7 +47,7 @@ define float @test_v4f32(float %a0, <4 x float> %a1) {
; SSE2-NEXT: addps %xmm1, %xmm2
; SSE2-NEXT: movaps %xmm2, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[2,3]
; SSE2-NEXT: addps %xmm2, %xmm0
; SSE2-NEXT: addss %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v4f32:
@ -56,7 +56,7 @@ define float @test_v4f32(float %a0, <4 x float> %a1) {
; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE41-NEXT: addps %xmm1, %xmm2
; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm2[1,1,3,3]
; SSE41-NEXT: addps %xmm2, %xmm0
; SSE41-NEXT: addss %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v4f32:
@ -64,7 +64,7 @@ define float @test_v4f32(float %a0, <4 x float> %a1) {
; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0]
; AVX-NEXT: vaddps %xmm0, %xmm1, %xmm0
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v4f32:
@ -72,7 +72,7 @@ define float @test_v4f32(float %a0, <4 x float> %a1) {
; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0]
; AVX512-NEXT: vaddps %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
%1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float %a0, <4 x float> %a1)
ret float %1
@ -87,7 +87,7 @@ define float @test_v8f32(float %a0, <8 x float> %a1) {
; SSE2-NEXT: addps %xmm1, %xmm2
; SSE2-NEXT: movaps %xmm2, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[2,3]
; SSE2-NEXT: addps %xmm2, %xmm0
; SSE2-NEXT: addss %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v8f32:
@ -97,7 +97,7 @@ define float @test_v8f32(float %a0, <8 x float> %a1) {
; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE41-NEXT: addps %xmm1, %xmm2
; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm2[1,1,3,3]
; SSE41-NEXT: addps %xmm2, %xmm0
; SSE41-NEXT: addss %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v8f32:
@ -107,7 +107,7 @@ define float @test_v8f32(float %a0, <8 x float> %a1) {
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
@ -118,7 +118,7 @@ define float @test_v8f32(float %a0, <8 x float> %a1) {
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v8f32(float %a0, <8 x float> %a1)
@ -136,7 +136,7 @@ define float @test_v16f32(float %a0, <16 x float> %a1) {
; SSE2-NEXT: addps %xmm1, %xmm2
; SSE2-NEXT: movaps %xmm2, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[2,3]
; SSE2-NEXT: addps %xmm2, %xmm0
; SSE2-NEXT: addss %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v16f32:
@ -148,7 +148,7 @@ define float @test_v16f32(float %a0, <16 x float> %a1) {
; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE41-NEXT: addps %xmm1, %xmm2
; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm2[1,1,3,3]
; SSE41-NEXT: addps %xmm2, %xmm0
; SSE41-NEXT: addss %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v16f32:
@ -159,7 +159,7 @@ define float @test_v16f32(float %a0, <16 x float> %a1) {
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
@ -172,7 +172,7 @@ define float @test_v16f32(float %a0, <16 x float> %a1) {
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v16f32(float %a0, <16 x float> %a1)
@ -188,26 +188,26 @@ define float @test_v2f32_zero(<2 x float> %a0) {
; SSE2: # %bb.0:
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3]
; SSE2-NEXT: addps %xmm0, %xmm1
; SSE2-NEXT: addss %xmm0, %xmm1
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v2f32_zero:
; SSE41: # %bb.0:
; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT: addps %xmm1, %xmm0
; SSE41-NEXT: addss %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v2f32_zero:
; AVX: # %bb.0:
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v2f32_zero:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
%1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v2f32(float 0.0, <2 x float> %a0)
ret float %1
@ -221,7 +221,7 @@ define float @test_v4f32_zero(<4 x float> %a0) {
; SSE2-NEXT: addps %xmm0, %xmm1
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3]
; SSE2-NEXT: addps %xmm1, %xmm0
; SSE2-NEXT: addss %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v4f32_zero:
@ -230,7 +230,7 @@ define float @test_v4f32_zero(<4 x float> %a0) {
; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT: addps %xmm0, %xmm1
; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE41-NEXT: addps %xmm0, %xmm1
; SSE41-NEXT: addss %xmm0, %xmm1
; SSE41-NEXT: movaps %xmm1, %xmm0
; SSE41-NEXT: retq
;
@ -239,7 +239,7 @@ define float @test_v4f32_zero(<4 x float> %a0) {
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v4f32_zero:
@ -247,7 +247,7 @@ define float @test_v4f32_zero(<4 x float> %a0) {
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
%1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float 0.0, <4 x float> %a0)
ret float %1
@ -262,7 +262,7 @@ define float @test_v8f32_zero(<8 x float> %a0) {
; SSE2-NEXT: addps %xmm0, %xmm1
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3]
; SSE2-NEXT: addps %xmm1, %xmm0
; SSE2-NEXT: addss %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v8f32_zero:
@ -272,7 +272,7 @@ define float @test_v8f32_zero(<8 x float> %a0) {
; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT: addps %xmm0, %xmm1
; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE41-NEXT: addps %xmm0, %xmm1
; SSE41-NEXT: addss %xmm0, %xmm1
; SSE41-NEXT: movaps %xmm1, %xmm0
; SSE41-NEXT: retq
;
@ -283,7 +283,7 @@ define float @test_v8f32_zero(<8 x float> %a0) {
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
@ -294,7 +294,7 @@ define float @test_v8f32_zero(<8 x float> %a0) {
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v8f32(float 0.0, <8 x float> %a0)
@ -312,7 +312,7 @@ define float @test_v16f32_zero(<16 x float> %a0) {
; SSE2-NEXT: addps %xmm0, %xmm1
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3]
; SSE2-NEXT: addps %xmm1, %xmm0
; SSE2-NEXT: addss %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v16f32_zero:
@ -324,7 +324,7 @@ define float @test_v16f32_zero(<16 x float> %a0) {
; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT: addps %xmm0, %xmm1
; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE41-NEXT: addps %xmm0, %xmm1
; SSE41-NEXT: addss %xmm0, %xmm1
; SSE41-NEXT: movaps %xmm1, %xmm0
; SSE41-NEXT: retq
;
@ -336,7 +336,7 @@ define float @test_v16f32_zero(<16 x float> %a0) {
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
@ -349,7 +349,7 @@ define float @test_v16f32_zero(<16 x float> %a0) {
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v16f32(float 0.0, <16 x float> %a0)
@ -365,26 +365,26 @@ define float @test_v2f32_undef(<2 x float> %a0) {
; SSE2: # %bb.0:
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3]
; SSE2-NEXT: addps %xmm0, %xmm1
; SSE2-NEXT: addss %xmm0, %xmm1
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v2f32_undef:
; SSE41: # %bb.0:
; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT: addps %xmm1, %xmm0
; SSE41-NEXT: addss %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v2f32_undef:
; AVX: # %bb.0:
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v2f32_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
%1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v2f32(float undef, <2 x float> %a0)
ret float %1
@ -398,7 +398,7 @@ define float @test_v4f32_undef(<4 x float> %a0) {
; SSE2-NEXT: addps %xmm0, %xmm1
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3]
; SSE2-NEXT: addps %xmm1, %xmm0
; SSE2-NEXT: addss %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v4f32_undef:
@ -407,7 +407,7 @@ define float @test_v4f32_undef(<4 x float> %a0) {
; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT: addps %xmm0, %xmm1
; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE41-NEXT: addps %xmm0, %xmm1
; SSE41-NEXT: addss %xmm0, %xmm1
; SSE41-NEXT: movaps %xmm1, %xmm0
; SSE41-NEXT: retq
;
@ -416,7 +416,7 @@ define float @test_v4f32_undef(<4 x float> %a0) {
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v4f32_undef:
@ -424,7 +424,7 @@ define float @test_v4f32_undef(<4 x float> %a0) {
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
%1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float undef, <4 x float> %a0)
ret float %1
@ -439,7 +439,7 @@ define float @test_v8f32_undef(<8 x float> %a0) {
; SSE2-NEXT: addps %xmm0, %xmm1
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3]
; SSE2-NEXT: addps %xmm1, %xmm0
; SSE2-NEXT: addss %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v8f32_undef:
@ -449,7 +449,7 @@ define float @test_v8f32_undef(<8 x float> %a0) {
; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT: addps %xmm0, %xmm1
; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE41-NEXT: addps %xmm0, %xmm1
; SSE41-NEXT: addss %xmm0, %xmm1
; SSE41-NEXT: movaps %xmm1, %xmm0
; SSE41-NEXT: retq
;
@ -460,7 +460,7 @@ define float @test_v8f32_undef(<8 x float> %a0) {
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
@ -471,7 +471,7 @@ define float @test_v8f32_undef(<8 x float> %a0) {
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v8f32(float undef, <8 x float> %a0)
@ -489,7 +489,7 @@ define float @test_v16f32_undef(<16 x float> %a0) {
; SSE2-NEXT: addps %xmm0, %xmm1
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3]
; SSE2-NEXT: addps %xmm1, %xmm0
; SSE2-NEXT: addss %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v16f32_undef:
@ -501,7 +501,7 @@ define float @test_v16f32_undef(<16 x float> %a0) {
; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT: addps %xmm0, %xmm1
; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE41-NEXT: addps %xmm0, %xmm1
; SSE41-NEXT: addss %xmm0, %xmm1
; SSE41-NEXT: movaps %xmm1, %xmm0
; SSE41-NEXT: retq
;
@ -513,7 +513,7 @@ define float @test_v16f32_undef(<16 x float> %a0) {
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
@ -526,7 +526,7 @@ define float @test_v16f32_undef(<16 x float> %a0) {
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v16f32(float undef, <16 x float> %a0)
@ -542,19 +542,19 @@ define double @test_v2f64(double %a0, <2 x double> %a1) {
; SSE: # %bb.0:
; SSE-NEXT: movapd %xmm1, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE-NEXT: addpd %xmm1, %xmm0
; SSE-NEXT: addsd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v2f64:
; AVX: # %bb.0:
; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0]
; AVX-NEXT: vaddpd %xmm0, %xmm1, %xmm0
; AVX-NEXT: vaddsd %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v2f64:
; AVX512: # %bb.0:
; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0]
; AVX512-NEXT: vaddpd %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vaddsd %xmm0, %xmm1, %xmm0
; AVX512-NEXT: retq
%1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v2f64(double %a0, <2 x double> %a1)
ret double %1
@ -566,7 +566,7 @@ define double @test_v4f64(double %a0, <4 x double> %a1) {
; SSE-NEXT: addpd %xmm2, %xmm1
; SSE-NEXT: movapd %xmm1, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE-NEXT: addpd %xmm1, %xmm0
; SSE-NEXT: addsd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v4f64:
@ -574,7 +574,7 @@ define double @test_v4f64(double %a0, <4 x double> %a1) {
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm0
; AVX-NEXT: vaddpd %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
@ -583,7 +583,7 @@ define double @test_v4f64(double %a0, <4 x double> %a1) {
; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm0
; AVX512-NEXT: vaddpd %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v4f64(double %a0, <4 x double> %a1)
@ -598,7 +598,7 @@ define double @test_v8f64(double %a0, <8 x double> %a1) {
; SSE-NEXT: addpd %xmm2, %xmm1
; SSE-NEXT: movapd %xmm1, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE-NEXT: addpd %xmm1, %xmm0
; SSE-NEXT: addsd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v8f64:
@ -607,7 +607,7 @@ define double @test_v8f64(double %a0, <8 x double> %a1) {
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
@ -618,7 +618,7 @@ define double @test_v8f64(double %a0, <8 x double> %a1) {
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v8f64(double %a0, <8 x double> %a1)
@ -637,7 +637,7 @@ define double @test_v16f64(double %a0, <16 x double> %a1) {
; SSE-NEXT: addpd %xmm1, %xmm4
; SSE-NEXT: movapd %xmm4, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1]
; SSE-NEXT: addpd %xmm4, %xmm0
; SSE-NEXT: addsd %xmm4, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v16f64:
@ -648,7 +648,7 @@ define double @test_v16f64(double %a0, <16 x double> %a1) {
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
@ -660,7 +660,7 @@ define double @test_v16f64(double %a0, <16 x double> %a1) {
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v16f64(double %a0, <16 x double> %a1)
@ -676,20 +676,20 @@ define double @test_v2f64_zero(<2 x double> %a0) {
; SSE: # %bb.0:
; SSE-NEXT: movapd %xmm0, %xmm1
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT: addpd %xmm0, %xmm1
; SSE-NEXT: addsd %xmm0, %xmm1
; SSE-NEXT: movapd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v2f64_zero:
; AVX: # %bb.0:
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v2f64_zero:
; AVX512: # %bb.0:
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
%1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v2f64(double 0.0, <2 x double> %a0)
ret double %1
@ -701,7 +701,7 @@ define double @test_v4f64_zero(<4 x double> %a0) {
; SSE-NEXT: addpd %xmm1, %xmm0
; SSE-NEXT: movapd %xmm0, %xmm1
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT: addpd %xmm0, %xmm1
; SSE-NEXT: addsd %xmm0, %xmm1
; SSE-NEXT: movapd %xmm1, %xmm0
; SSE-NEXT: retq
;
@ -710,7 +710,7 @@ define double @test_v4f64_zero(<4 x double> %a0) {
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
@ -719,7 +719,7 @@ define double @test_v4f64_zero(<4 x double> %a0) {
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v4f64(double 0.0, <4 x double> %a0)
@ -734,7 +734,7 @@ define double @test_v8f64_zero(<8 x double> %a0) {
; SSE-NEXT: addpd %xmm1, %xmm0
; SSE-NEXT: movapd %xmm0, %xmm1
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT: addpd %xmm0, %xmm1
; SSE-NEXT: addsd %xmm0, %xmm1
; SSE-NEXT: movapd %xmm1, %xmm0
; SSE-NEXT: retq
;
@ -744,7 +744,7 @@ define double @test_v8f64_zero(<8 x double> %a0) {
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
@ -755,7 +755,7 @@ define double @test_v8f64_zero(<8 x double> %a0) {
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v8f64(double 0.0, <8 x double> %a0)
@ -774,7 +774,7 @@ define double @test_v16f64_zero(<16 x double> %a0) {
; SSE-NEXT: addpd %xmm0, %xmm1
; SSE-NEXT: movapd %xmm1, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE-NEXT: addpd %xmm1, %xmm0
; SSE-NEXT: addsd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v16f64_zero:
@ -785,7 +785,7 @@ define double @test_v16f64_zero(<16 x double> %a0) {
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
@ -797,7 +797,7 @@ define double @test_v16f64_zero(<16 x double> %a0) {
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v16f64(double 0.0, <16 x double> %a0)
@ -813,20 +813,20 @@ define double @test_v2f64_undef(<2 x double> %a0) {
; SSE: # %bb.0:
; SSE-NEXT: movapd %xmm0, %xmm1
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT: addpd %xmm0, %xmm1
; SSE-NEXT: addsd %xmm0, %xmm1
; SSE-NEXT: movapd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v2f64_undef:
; AVX: # %bb.0:
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v2f64_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
%1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v2f64(double undef, <2 x double> %a0)
ret double %1
@ -838,7 +838,7 @@ define double @test_v4f64_undef(<4 x double> %a0) {
; SSE-NEXT: addpd %xmm1, %xmm0
; SSE-NEXT: movapd %xmm0, %xmm1
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT: addpd %xmm0, %xmm1
; SSE-NEXT: addsd %xmm0, %xmm1
; SSE-NEXT: movapd %xmm1, %xmm0
; SSE-NEXT: retq
;
@ -847,7 +847,7 @@ define double @test_v4f64_undef(<4 x double> %a0) {
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
@ -856,7 +856,7 @@ define double @test_v4f64_undef(<4 x double> %a0) {
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v4f64(double undef, <4 x double> %a0)
@ -871,7 +871,7 @@ define double @test_v8f64_undef(<8 x double> %a0) {
; SSE-NEXT: addpd %xmm1, %xmm0
; SSE-NEXT: movapd %xmm0, %xmm1
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT: addpd %xmm0, %xmm1
; SSE-NEXT: addsd %xmm0, %xmm1
; SSE-NEXT: movapd %xmm1, %xmm0
; SSE-NEXT: retq
;
@ -881,7 +881,7 @@ define double @test_v8f64_undef(<8 x double> %a0) {
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
@ -892,7 +892,7 @@ define double @test_v8f64_undef(<8 x double> %a0) {
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v8f64(double undef, <8 x double> %a0)
@ -911,7 +911,7 @@ define double @test_v16f64_undef(<16 x double> %a0) {
; SSE-NEXT: addpd %xmm0, %xmm1
; SSE-NEXT: movapd %xmm1, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE-NEXT: addpd %xmm1, %xmm0
; SSE-NEXT: addsd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v16f64_undef:
@ -922,7 +922,7 @@ define double @test_v16f64_undef(<16 x double> %a0) {
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
@ -934,7 +934,7 @@ define double @test_v16f64_undef(<16 x double> %a0) {
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v16f64(double undef, <16 x double> %a0)


@ -15,25 +15,25 @@ define float @test_v2f32(float %a0, <2 x float> %a1) {
; SSE2: # %bb.0:
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3]
; SSE2-NEXT: mulps %xmm1, %xmm0
; SSE2-NEXT: mulss %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v2f32:
; SSE41: # %bb.0:
; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE41-NEXT: mulps %xmm1, %xmm0
; SSE41-NEXT: mulss %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v2f32:
; AVX: # %bb.0:
; AVX-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; AVX-NEXT: vmulps %xmm0, %xmm1, %xmm0
; AVX-NEXT: vmulss %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v2f32:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; AVX512-NEXT: vmulps %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0
; AVX512-NEXT: retq
%1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v2f32(float %a0, <2 x float> %a1)
ret float %1
@ -47,7 +47,7 @@ define float @test_v4f32(float %a0, <4 x float> %a1) {
; SSE2-NEXT: mulps %xmm1, %xmm2
; SSE2-NEXT: movaps %xmm2, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[2,3]
; SSE2-NEXT: mulps %xmm2, %xmm0
; SSE2-NEXT: mulss %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v4f32:
@ -56,7 +56,7 @@ define float @test_v4f32(float %a0, <4 x float> %a1) {
; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE41-NEXT: mulps %xmm1, %xmm2
; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm2[1,1,3,3]
; SSE41-NEXT: mulps %xmm2, %xmm0
; SSE41-NEXT: mulss %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v4f32:
@ -64,7 +64,7 @@ define float @test_v4f32(float %a0, <4 x float> %a1) {
; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0]
; AVX-NEXT: vmulps %xmm0, %xmm1, %xmm0
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v4f32:
@ -72,7 +72,7 @@ define float @test_v4f32(float %a0, <4 x float> %a1) {
; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0]
; AVX512-NEXT: vmulps %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
%1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v4f32(float %a0, <4 x float> %a1)
ret float %1
@ -87,7 +87,7 @@ define float @test_v8f32(float %a0, <8 x float> %a1) {
; SSE2-NEXT: mulps %xmm1, %xmm2
; SSE2-NEXT: movaps %xmm2, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[2,3]
; SSE2-NEXT: mulps %xmm2, %xmm0
; SSE2-NEXT: mulss %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v8f32:
@ -97,7 +97,7 @@ define float @test_v8f32(float %a0, <8 x float> %a1) {
; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE41-NEXT: mulps %xmm1, %xmm2
; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm2[1,1,3,3]
; SSE41-NEXT: mulps %xmm2, %xmm0
; SSE41-NEXT: mulss %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v8f32:
@ -107,7 +107,7 @@ define float @test_v8f32(float %a0, <8 x float> %a1) {
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
@ -118,7 +118,7 @@ define float @test_v8f32(float %a0, <8 x float> %a1) {
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v8f32(float %a0, <8 x float> %a1)
@ -136,7 +136,7 @@ define float @test_v16f32(float %a0, <16 x float> %a1) {
; SSE2-NEXT: mulps %xmm1, %xmm2
; SSE2-NEXT: movaps %xmm2, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[2,3]
; SSE2-NEXT: mulps %xmm2, %xmm0
; SSE2-NEXT: mulss %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v16f32:
@ -148,7 +148,7 @@ define float @test_v16f32(float %a0, <16 x float> %a1) {
; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE41-NEXT: mulps %xmm1, %xmm2
; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm2[1,1,3,3]
; SSE41-NEXT: mulps %xmm2, %xmm0
; SSE41-NEXT: mulss %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v16f32:
@ -159,7 +159,7 @@ define float @test_v16f32(float %a0, <16 x float> %a1) {
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
@ -172,7 +172,7 @@ define float @test_v16f32(float %a0, <16 x float> %a1) {
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v16f32(float %a0, <16 x float> %a1)
@ -188,26 +188,26 @@ define float @test_v2f32_zero(<2 x float> %a0) {
; SSE2: # %bb.0:
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3]
; SSE2-NEXT: mulps %xmm0, %xmm1
; SSE2-NEXT: mulss %xmm0, %xmm1
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v2f32_zero:
; SSE41: # %bb.0:
; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT: mulps %xmm1, %xmm0
; SSE41-NEXT: mulss %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v2f32_zero:
; AVX: # %bb.0:
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v2f32_zero:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
%1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v2f32(float 1.0, <2 x float> %a0)
ret float %1
@ -221,7 +221,7 @@ define float @test_v4f32_zero(<4 x float> %a0) {
; SSE2-NEXT: mulps %xmm0, %xmm1
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3]
; SSE2-NEXT: mulps %xmm1, %xmm0
; SSE2-NEXT: mulss %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v4f32_zero:
@ -230,7 +230,7 @@ define float @test_v4f32_zero(<4 x float> %a0) {
; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT: mulps %xmm0, %xmm1
; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE41-NEXT: mulps %xmm0, %xmm1
; SSE41-NEXT: mulss %xmm0, %xmm1
; SSE41-NEXT: movaps %xmm1, %xmm0
; SSE41-NEXT: retq
;
@ -239,7 +239,7 @@ define float @test_v4f32_zero(<4 x float> %a0) {
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v4f32_zero:
@ -247,7 +247,7 @@ define float @test_v4f32_zero(<4 x float> %a0) {
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
%1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v4f32(float 1.0, <4 x float> %a0)
ret float %1
@ -262,7 +262,7 @@ define float @test_v8f32_zero(<8 x float> %a0) {
; SSE2-NEXT: mulps %xmm0, %xmm1
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3]
; SSE2-NEXT: mulps %xmm1, %xmm0
; SSE2-NEXT: mulss %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v8f32_zero:
@ -272,7 +272,7 @@ define float @test_v8f32_zero(<8 x float> %a0) {
; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT: mulps %xmm0, %xmm1
; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE41-NEXT: mulps %xmm0, %xmm1
; SSE41-NEXT: mulss %xmm0, %xmm1
; SSE41-NEXT: movaps %xmm1, %xmm0
; SSE41-NEXT: retq
;
@ -283,7 +283,7 @@ define float @test_v8f32_zero(<8 x float> %a0) {
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
@ -294,7 +294,7 @@ define float @test_v8f32_zero(<8 x float> %a0) {
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v8f32(float 1.0, <8 x float> %a0)
@ -312,7 +312,7 @@ define float @test_v16f32_zero(<16 x float> %a0) {
; SSE2-NEXT: mulps %xmm0, %xmm1
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3]
; SSE2-NEXT: mulps %xmm1, %xmm0
; SSE2-NEXT: mulss %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v16f32_zero:
@ -324,7 +324,7 @@ define float @test_v16f32_zero(<16 x float> %a0) {
; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT: mulps %xmm0, %xmm1
; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE41-NEXT: mulps %xmm0, %xmm1
; SSE41-NEXT: mulss %xmm0, %xmm1
; SSE41-NEXT: movaps %xmm1, %xmm0
; SSE41-NEXT: retq
;
@ -336,7 +336,7 @@ define float @test_v16f32_zero(<16 x float> %a0) {
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
@ -349,7 +349,7 @@ define float @test_v16f32_zero(<16 x float> %a0) {
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v16f32(float 1.0, <16 x float> %a0)
@ -365,26 +365,26 @@ define float @test_v2f32_undef(<2 x float> %a0) {
; SSE2: # %bb.0:
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3]
; SSE2-NEXT: mulps %xmm0, %xmm1
; SSE2-NEXT: mulss %xmm0, %xmm1
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v2f32_undef:
; SSE41: # %bb.0:
; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT: mulps %xmm1, %xmm0
; SSE41-NEXT: mulss %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v2f32_undef:
; AVX: # %bb.0:
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v2f32_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
%1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v2f32(float undef, <2 x float> %a0)
ret float %1
@ -398,7 +398,7 @@ define float @test_v4f32_undef(<4 x float> %a0) {
; SSE2-NEXT: mulps %xmm0, %xmm1
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3]
; SSE2-NEXT: mulps %xmm1, %xmm0
; SSE2-NEXT: mulss %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v4f32_undef:
@ -407,7 +407,7 @@ define float @test_v4f32_undef(<4 x float> %a0) {
; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT: mulps %xmm0, %xmm1
; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE41-NEXT: mulps %xmm0, %xmm1
; SSE41-NEXT: mulss %xmm0, %xmm1
; SSE41-NEXT: movaps %xmm1, %xmm0
; SSE41-NEXT: retq
;
@ -416,7 +416,7 @@ define float @test_v4f32_undef(<4 x float> %a0) {
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v4f32_undef:
@ -424,7 +424,7 @@ define float @test_v4f32_undef(<4 x float> %a0) {
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
%1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v4f32(float undef, <4 x float> %a0)
ret float %1
@ -439,7 +439,7 @@ define float @test_v8f32_undef(<8 x float> %a0) {
; SSE2-NEXT: mulps %xmm0, %xmm1
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3]
; SSE2-NEXT: mulps %xmm1, %xmm0
; SSE2-NEXT: mulss %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v8f32_undef:
@ -449,7 +449,7 @@ define float @test_v8f32_undef(<8 x float> %a0) {
; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT: mulps %xmm0, %xmm1
; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE41-NEXT: mulps %xmm0, %xmm1
; SSE41-NEXT: mulss %xmm0, %xmm1
; SSE41-NEXT: movaps %xmm1, %xmm0
; SSE41-NEXT: retq
;
@ -460,7 +460,7 @@ define float @test_v8f32_undef(<8 x float> %a0) {
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
@ -471,7 +471,7 @@ define float @test_v8f32_undef(<8 x float> %a0) {
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v8f32(float undef, <8 x float> %a0)
@ -489,7 +489,7 @@ define float @test_v16f32_undef(<16 x float> %a0) {
; SSE2-NEXT: mulps %xmm0, %xmm1
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3]
; SSE2-NEXT: mulps %xmm1, %xmm0
; SSE2-NEXT: mulss %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v16f32_undef:
@ -501,7 +501,7 @@ define float @test_v16f32_undef(<16 x float> %a0) {
; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT: mulps %xmm0, %xmm1
; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE41-NEXT: mulps %xmm0, %xmm1
; SSE41-NEXT: mulss %xmm0, %xmm1
; SSE41-NEXT: movaps %xmm1, %xmm0
; SSE41-NEXT: retq
;
@ -513,7 +513,7 @@ define float @test_v16f32_undef(<16 x float> %a0) {
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
@ -526,7 +526,7 @@ define float @test_v16f32_undef(<16 x float> %a0) {
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v16f32(float undef, <16 x float> %a0)
@ -542,19 +542,19 @@ define double @test_v2f64(double %a0, <2 x double> %a1) {
; SSE: # %bb.0:
; SSE-NEXT: movapd %xmm1, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE-NEXT: mulpd %xmm1, %xmm0
; SSE-NEXT: mulsd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v2f64:
; AVX: # %bb.0:
; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0]
; AVX-NEXT: vmulpd %xmm0, %xmm1, %xmm0
; AVX-NEXT: vmulsd %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v2f64:
; AVX512: # %bb.0:
; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0]
; AVX512-NEXT: vmulpd %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vmulsd %xmm0, %xmm1, %xmm0
; AVX512-NEXT: retq
%1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v2f64(double %a0, <2 x double> %a1)
ret double %1
@ -566,7 +566,7 @@ define double @test_v4f64(double %a0, <4 x double> %a1) {
; SSE-NEXT: mulpd %xmm2, %xmm1
; SSE-NEXT: movapd %xmm1, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE-NEXT: mulpd %xmm1, %xmm0
; SSE-NEXT: mulsd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v4f64:
@ -574,7 +574,7 @@ define double @test_v4f64(double %a0, <4 x double> %a1) {
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm0
; AVX-NEXT: vmulpd %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
@ -583,7 +583,7 @@ define double @test_v4f64(double %a0, <4 x double> %a1) {
; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm0
; AVX512-NEXT: vmulpd %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v4f64(double %a0, <4 x double> %a1)
@ -598,7 +598,7 @@ define double @test_v8f64(double %a0, <8 x double> %a1) {
; SSE-NEXT: mulpd %xmm2, %xmm1
; SSE-NEXT: movapd %xmm1, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE-NEXT: mulpd %xmm1, %xmm0
; SSE-NEXT: mulsd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v8f64:
@ -607,7 +607,7 @@ define double @test_v8f64(double %a0, <8 x double> %a1) {
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
@ -618,7 +618,7 @@ define double @test_v8f64(double %a0, <8 x double> %a1) {
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v8f64(double %a0, <8 x double> %a1)
@ -637,7 +637,7 @@ define double @test_v16f64(double %a0, <16 x double> %a1) {
; SSE-NEXT: mulpd %xmm1, %xmm4
; SSE-NEXT: movapd %xmm4, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1]
; SSE-NEXT: mulpd %xmm4, %xmm0
; SSE-NEXT: mulsd %xmm4, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v16f64:
@ -648,7 +648,7 @@ define double @test_v16f64(double %a0, <16 x double> %a1) {
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
@ -660,7 +660,7 @@ define double @test_v16f64(double %a0, <16 x double> %a1) {
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v16f64(double %a0, <16 x double> %a1)
@ -676,20 +676,20 @@ define double @test_v2f64_zero(<2 x double> %a0) {
; SSE: # %bb.0:
; SSE-NEXT: movapd %xmm0, %xmm1
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT: mulpd %xmm0, %xmm1
; SSE-NEXT: mulsd %xmm0, %xmm1
; SSE-NEXT: movapd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v2f64_zero:
; AVX: # %bb.0:
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v2f64_zero:
; AVX512: # %bb.0:
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
%1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v2f64(double 1.0, <2 x double> %a0)
ret double %1
@ -701,7 +701,7 @@ define double @test_v4f64_zero(<4 x double> %a0) {
; SSE-NEXT: mulpd %xmm1, %xmm0
; SSE-NEXT: movapd %xmm0, %xmm1
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT: mulpd %xmm0, %xmm1
; SSE-NEXT: mulsd %xmm0, %xmm1
; SSE-NEXT: movapd %xmm1, %xmm0
; SSE-NEXT: retq
;
@ -710,7 +710,7 @@ define double @test_v4f64_zero(<4 x double> %a0) {
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
@ -719,7 +719,7 @@ define double @test_v4f64_zero(<4 x double> %a0) {
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v4f64(double 1.0, <4 x double> %a0)
@ -734,7 +734,7 @@ define double @test_v8f64_zero(<8 x double> %a0) {
; SSE-NEXT: mulpd %xmm1, %xmm0
; SSE-NEXT: movapd %xmm0, %xmm1
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT: mulpd %xmm0, %xmm1
; SSE-NEXT: mulsd %xmm0, %xmm1
; SSE-NEXT: movapd %xmm1, %xmm0
; SSE-NEXT: retq
;
@ -744,7 +744,7 @@ define double @test_v8f64_zero(<8 x double> %a0) {
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
@ -755,7 +755,7 @@ define double @test_v8f64_zero(<8 x double> %a0) {
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v8f64(double 1.0, <8 x double> %a0)
@ -774,7 +774,7 @@ define double @test_v16f64_zero(<16 x double> %a0) {
; SSE-NEXT: mulpd %xmm0, %xmm1
; SSE-NEXT: movapd %xmm1, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE-NEXT: mulpd %xmm1, %xmm0
; SSE-NEXT: mulsd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v16f64_zero:
@ -785,7 +785,7 @@ define double @test_v16f64_zero(<16 x double> %a0) {
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
@ -797,7 +797,7 @@ define double @test_v16f64_zero(<16 x double> %a0) {
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v16f64(double 1.0, <16 x double> %a0)
@ -813,20 +813,20 @@ define double @test_v2f64_undef(<2 x double> %a0) {
; SSE: # %bb.0:
; SSE-NEXT: movapd %xmm0, %xmm1
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT: mulpd %xmm0, %xmm1
; SSE-NEXT: mulsd %xmm0, %xmm1
; SSE-NEXT: movapd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v2f64_undef:
; AVX: # %bb.0:
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v2f64_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
%1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v2f64(double undef, <2 x double> %a0)
ret double %1
@ -838,7 +838,7 @@ define double @test_v4f64_undef(<4 x double> %a0) {
; SSE-NEXT: mulpd %xmm1, %xmm0
; SSE-NEXT: movapd %xmm0, %xmm1
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT: mulpd %xmm0, %xmm1
; SSE-NEXT: mulsd %xmm0, %xmm1
; SSE-NEXT: movapd %xmm1, %xmm0
; SSE-NEXT: retq
;
@ -847,7 +847,7 @@ define double @test_v4f64_undef(<4 x double> %a0) {
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
@ -856,7 +856,7 @@ define double @test_v4f64_undef(<4 x double> %a0) {
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v4f64(double undef, <4 x double> %a0)
@ -871,7 +871,7 @@ define double @test_v8f64_undef(<8 x double> %a0) {
; SSE-NEXT: mulpd %xmm1, %xmm0
; SSE-NEXT: movapd %xmm0, %xmm1
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT: mulpd %xmm0, %xmm1
; SSE-NEXT: mulsd %xmm0, %xmm1
; SSE-NEXT: movapd %xmm1, %xmm0
; SSE-NEXT: retq
;
@ -881,7 +881,7 @@ define double @test_v8f64_undef(<8 x double> %a0) {
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
@ -892,7 +892,7 @@ define double @test_v8f64_undef(<8 x double> %a0) {
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v8f64(double undef, <8 x double> %a0)
@ -911,7 +911,7 @@ define double @test_v16f64_undef(<16 x double> %a0) {
; SSE-NEXT: mulpd %xmm0, %xmm1
; SSE-NEXT: movapd %xmm1, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE-NEXT: mulpd %xmm1, %xmm0
; SSE-NEXT: mulsd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v16f64_undef:
@ -922,7 +922,7 @@ define double @test_v16f64_undef(<16 x double> %a0) {
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
@ -934,7 +934,7 @@ define double @test_v16f64_undef(<16 x double> %a0) {
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v16f64(double undef, <16 x double> %a0)