diff --git a/llvm/test/Analysis/CostModel/X86/reduce-add.ll b/llvm/test/Analysis/CostModel/X86/reduce-add.ll index 090813a597b8..2efadd801b05 100644 --- a/llvm/test/Analysis/CostModel/X86/reduce-add.ll +++ b/llvm/test/Analysis/CostModel/X86/reduce-add.ll @@ -8,6 +8,8 @@ ; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW ; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512DQ +; RUN: opt < %s -cost-model -mtriple=x86_64-apple-darwin -analyze -mcpu=slm | FileCheck %s --check-prefixes=SLM + define i32 @reduce_i64(i32 %arg) { ; SSE-LABEL: 'reduce_i64' ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.add.v1i64(<1 x i64> undef) @@ -32,6 +34,14 @@ define i32 @reduce_i64(i32 %arg) { ; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> undef) ; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> undef) ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SLM-LABEL: 'reduce_i64' +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.add.v1i64(<1 x i64> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %V1 = call i64 @llvm.experimental.vector.reduce.add.v1i64(<1 x i64> undef) %V2 = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> undef) @@ -65,6 +75,14 @@ define i32 @reduce_i32(i32 %arg) { ; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> undef) ; AVX512-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.add.v32i32(<32 x i32> undef) ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SLM-LABEL: 'reduce_i32' +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.add.v2i32(<2 x i32> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.add.v32i32(<32 x i32> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %V2 = call i32 @llvm.experimental.vector.reduce.add.v2i32(<2 x i32> undef) %V4 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> undef) @@ -119,6 +137,15 @@ define i32 @reduce_i16(i32 %arg) { ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.v64i16(<64 x i16> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SLM-LABEL: 'reduce_i16' +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.v64i16(<64 x i16> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %V2 = call i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16> undef) %V4 = call i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16> undef) @@ -179,6 +206,16 @@ define i32 @reduce_i8(i32 %arg) { ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SLM-LABEL: 'reduce_i8' +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %V2 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> undef) %V4 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> undef) diff --git a/llvm/test/Analysis/CostModel/X86/reduction.ll b/llvm/test/Analysis/CostModel/X86/reduction.ll index eafe580b3cc1..fe48435d0865 100644 --- a/llvm/test/Analysis/CostModel/X86/reduction.ll +++ b/llvm/test/Analysis/CostModel/X86/reduction.ll @@ -5,6 +5,8 @@ ; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1 ; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2 +; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mcpu=slm | FileCheck %s --check-prefixes=SLM + ; Check that we recognize the tree starting at the extractelement as a ; reduction. ; NOTE: We're only really interested in the extractelement cost, which represents the entire reduction. @@ -41,6 +43,14 @@ define fastcc float @reduction_cost_float(<4 x float> %rdx) { ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7 ; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0 ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r +; +; SLM-LABEL: 'reduction_cost_float' +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7 +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; %rdx.shuf = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf @@ -81,6 +91,16 @@ define fastcc i32 @reduction_cost_int(<8 x i32> %rdx) { ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.3 = add <8 x i32> %bin.rdx.2, %rdx.shuf.3 ; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <8 x i32> %bin.rdx.3, i32 0 ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r +; +; SLM-LABEL: 'reduction_cost_int' +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = add <8 x i32> %rdx, %rdx.shuf +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.2 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.2 = add <8 x i32> %bin.rdx, %rdx.shuf.2 +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.3 = shufflevector <8 x i32> %bin.rdx.2, <8 x i32> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.3 = add <8 x i32> %bin.rdx.2, %rdx.shuf.3 +; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %r = extractelement <8 x i32> %bin.rdx.3, i32 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r ; %rdx.shuf = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> %rdx, float %f1) { ; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r2 = fadd float %r, %f1 ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r2 +; +; SLM-LABEL: 'pairwise_hadd' +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1 +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r2 = fadd float %r, %f1 +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r2 ; %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> @@ -204,6 +235,17 @@ define fastcc float @pairwise_hadd_assoc(<4 x float> %rdx, float %f1) { ; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r2 = fadd float %r, %f1 ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r2 +; +; SLM-LABEL: 'pairwise_hadd_assoc' +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.1, %rdx.shuf.0.0 +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r2 = fadd float %r, %f1 +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r2 ; %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> @@ -261,6 +303,16 @@ define fastcc float @pairwise_hadd_skip_first(<4 x float> %rdx, float %f1) { ; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r2 = fadd float %r, %f1 ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r2 +; +; SLM-LABEL: 'pairwise_hadd_skip_first' +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.1 = fadd <4 x float> %bin.rdx.0, %rdx.shuf.1.1 +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r2 = fadd float %r, %f1 +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r2 ; %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> @@ -300,6 +352,12 @@ define fastcc double @no_pairwise_reduction2double(<2 x double> %rdx, double %f1 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = fadd <2 x double> %rdx, %rdx.shuf ; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r = extractelement <2 x double> %bin.rdx, i32 0 ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r +; +; SLM-LABEL: 'no_pairwise_reduction2double' +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <2 x double> %rdx, %rdx.shuf +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r = extractelement <2 x double> %bin.rdx, i32 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r ; %rdx.shuf = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> %bin.rdx = fadd <2 x double> %rdx, %rdx.shuf @@ -340,6 +398,14 @@ define fastcc float @no_pairwise_reduction4float(<4 x float> %rdx, float %f1) { ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7 ; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0 ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r +; +; SLM-LABEL: 'no_pairwise_reduction4float' +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7 +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; %rdx.shuf = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf @@ -390,6 +456,14 @@ define fastcc double @no_pairwise_reduction4double(<4 x double> %rdx, double %f1 ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = fadd <4 x double> %bin.rdx, %rdx.shuf7 ; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0 ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r +; +; SLM-LABEL: 'no_pairwise_reduction4double' +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx = fadd <4 x double> %rdx, %rdx.shuf +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf7 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = fadd <4 x double> %bin.rdx, %rdx.shuf7 +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r ; %rdx.shuf = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> %bin.rdx = fadd <4 x double> %rdx, %rdx.shuf @@ -450,6 +524,16 @@ define fastcc float @no_pairwise_reduction8float(<8 x float> %rdx, float %f1) { ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = fadd <8 x float> %bin.rdx, %rdx.shuf7 ; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <8 x float> %bin.rdx8, i32 0 ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r +; +; SLM-LABEL: 'no_pairwise_reduction8float' +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf3 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx4 = fadd <8 x float> %rdx, %rdx.shuf3 +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf = shufflevector <8 x float> %bin.rdx4, <8 x float> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <8 x float> %bin.rdx4, %rdx.shuf +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf7 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <8 x float> %bin.rdx, %rdx.shuf7 +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %r = extractelement <8 x float> %bin.rdx8, i32 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; %rdx.shuf3 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> %bin.rdx4 = fadd <8 x float> %rdx, %rdx.shuf3 @@ -474,6 +558,12 @@ define fastcc i64 @no_pairwise_reduction2i64(<2 x i64> %rdx, i64 %f1) { ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = add <2 x i64> %rdx, %rdx.shuf ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <2 x i64> %bin.rdx, i32 0 ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r +; +; SLM-LABEL: 'no_pairwise_reduction2i64' +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx = add <2 x i64> %rdx, %rdx.shuf +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r = extractelement <2 x i64> %bin.rdx, i32 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r ; %rdx.shuf = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> %bin.rdx = add <2 x i64> %rdx, %rdx.shuf @@ -490,6 +580,14 @@ define fastcc i32 @no_pairwise_reduction4i32(<4 x i32> %rdx, i32 %f1) { ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <4 x i32> %bin.rdx, %rdx.shuf7 ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %r = extractelement <4 x i32> %bin.rdx8, i32 0 ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r +; +; SLM-LABEL: 'no_pairwise_reduction4i32' +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <4 x i32> %rdx, <4 x i32> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = add <4 x i32> %rdx, %rdx.shuf +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <4 x i32> %bin.rdx, %rdx.shuf7 +; SLM-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %r = extractelement <4 x i32> %bin.rdx8, i32 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r ; %rdx.shuf = shufflevector <4 x i32> %rdx, <4 x i32> undef, <4 x i32> %bin.rdx = add <4 x i32> %rdx, %rdx.shuf @@ -524,6 +622,14 @@ define fastcc i64 @no_pairwise_reduction4i64(<4 x i64> %rdx, i64 %f1) { ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <4 x i64> %bin.rdx, %rdx.shuf7 ; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %r = extractelement <4 x i64> %bin.rdx8, i32 0 ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r +; +; SLM-LABEL: 'no_pairwise_reduction4i64' +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bin.rdx = add <4 x i64> %rdx, %rdx.shuf +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf7 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bin.rdx8 = add <4 x i64> %bin.rdx, %rdx.shuf7 +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <4 x i64> %bin.rdx8, i32 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r ; %rdx.shuf = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> %bin.rdx = add <4 x i64> %rdx, %rdx.shuf @@ -574,6 +680,16 @@ define fastcc i16 @no_pairwise_reduction8i16(<8 x i16> %rdx, i16 %f1) { ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <8 x i16> %bin.rdx, %rdx.shuf7 ; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <8 x i16> %bin.rdx8, i32 0 ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %r +; +; SLM-LABEL: 'no_pairwise_reduction8i16' +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf3 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx4 = add <8 x i16> %rdx, %rdx.shuf3 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <8 x i16> %bin.rdx4, <8 x i16> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = add <8 x i16> %bin.rdx4, %rdx.shuf +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <8 x i16> %bin.rdx, %rdx.shuf7 +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <8 x i16> %bin.rdx8, i32 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %r ; %rdx.shuf3 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> %bin.rdx4 = add <8 x i16> %rdx, %rdx.shuf3 @@ -616,6 +732,16 @@ define fastcc i32 @no_pairwise_reduction8i32(<8 x i32> %rdx, i32 %f1) { ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <8 x i32> %bin.rdx, %rdx.shuf7 ; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <8 x i32> %bin.rdx8, i32 0 ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r +; +; SLM-LABEL: 'no_pairwise_reduction8i32' +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf3 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx4 = add <8 x i32> %rdx, %rdx.shuf3 +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf = shufflevector <8 x i32> %bin.rdx4, <8 x i32> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = add <8 x i32> %bin.rdx4, %rdx.shuf +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf7 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = add <8 x i32> %bin.rdx, %rdx.shuf7 +; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %r = extractelement <8 x i32> %bin.rdx8, i32 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r ; %rdx.shuf3 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> %bin.rdx4 = add <8 x i32> %rdx, %rdx.shuf3 @@ -656,6 +782,13 @@ define fastcc double @pairwise_reduction2double(<2 x double> %rdx, double %f1) { ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = fadd <2 x double> %rdx.shuf.1.0, %rdx.shuf.1.1 ; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r = extractelement <2 x double> %bin.rdx8, i32 0 ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r +; +; SLM-LABEL: 'pairwise_reduction2double' +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <2 x double> %rdx.shuf.1.0, %rdx.shuf.1.1 +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r = extractelement <2 x double> %bin.rdx8, i32 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r ; %rdx.shuf.1.0 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> %rdx.shuf.1.1 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> @@ -705,6 +838,16 @@ define fastcc float @pairwise_reduction4float(<4 x float> %rdx, float %f1) { ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 ; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0 ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r +; +; SLM-LABEL: 'pairwise_reduction4float' +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1 +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> @@ -767,6 +910,16 @@ define fastcc double @pairwise_reduction4double(<4 x double> %rdx, double %f1) { ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = fadd <4 x double> %rdx.shuf.1.0, %rdx.shuf.1.1 ; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0 ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r +; +; SLM-LABEL: 'pairwise_reduction4double' +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.0 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.1 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx = fadd <4 x double> %rdx.shuf.0.0, %rdx.shuf.0.1 +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.1.1 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = fadd <4 x double> %rdx.shuf.1.0, %rdx.shuf.1.1 +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r ; %rdx.shuf.0.0 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> %rdx.shuf.0.1 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> @@ -844,6 +997,19 @@ define fastcc float @pairwise_reduction8float(<8 x float> %rdx, float %f1) { ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx9 = fadd <8 x float> %rdx.shuf.2.0, %rdx.shuf.2.1 ; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %r = extractelement <8 x float> %bin.rdx9, i32 0 ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r +; +; SLM-LABEL: 'pairwise_reduction8float' +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.0.0 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.0.1 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <8 x float> %rdx.shuf.0.0, %rdx.shuf.0.1 +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.1.0 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.1.1 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <8 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.2.1 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx9 = fadd <8 x float> %rdx.shuf.2.0, %rdx.shuf.2.1 +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %r = extractelement <8 x float> %bin.rdx9, i32 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; %rdx.shuf.0.0 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> %rdx.shuf.0.1 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> @@ -873,6 +1039,13 @@ define fastcc i64 @pairwise_reduction2i64(<2 x i64> %rdx, i64 %f1) { ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <2 x i64> %rdx.shuf.1.0, %rdx.shuf.1.1 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <2 x i64> %bin.rdx8, i32 0 ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r +; +; SLM-LABEL: 'pairwise_reduction2i64' +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = add <2 x i64> %rdx.shuf.1.0, %rdx.shuf.1.1 +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r = extractelement <2 x i64> %bin.rdx8, i32 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r ; %rdx.shuf.1.0 = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> %rdx.shuf.1.1 = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> @@ -892,6 +1065,16 @@ define fastcc i32 @pairwise_reduction4i32(<4 x i32> %rdx, i32 %f1) { ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <4 x i32> %rdx.shuf.1.0, %rdx.shuf.1.1 ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %r = extractelement <4 x i32> %bin.rdx8, i32 0 ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r +; +; SLM-LABEL: 'pairwise_reduction4i32' +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <4 x i32> %rdx, <4 x i32> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <4 x i32> %rdx, <4 x i32> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = add <4 x i32> %rdx.shuf.0.0, %rdx.shuf.0.1 +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <4 x i32> %rdx.shuf.1.0, %rdx.shuf.1.1 +; SLM-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %r = extractelement <4 x i32> %bin.rdx8, i32 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r ; %rdx.shuf.0.0 = shufflevector <4 x i32> %rdx, <4 x i32> undef, <4 x i32> %rdx.shuf.0.1 = shufflevector <4 x i32> %rdx, <4 x i32> undef, <4 x i32> @@ -934,6 +1117,16 @@ define fastcc i64 @pairwise_reduction4i64(<4 x i64> %rdx, i64 %f1) { ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <4 x i64> %rdx.shuf.1.0, %rdx.shuf.1.1 ; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <4 x i64> %bin.rdx8, i32 0 ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r +; +; SLM-LABEL: 'pairwise_reduction4i64' +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.0 = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.1 = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bin.rdx = add <4 x i64> %rdx.shuf.0.0, %rdx.shuf.0.1 +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.1.1 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bin.rdx8 = add <4 x i64> %rdx.shuf.1.0, %rdx.shuf.1.1 +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <4 x i64> %bin.rdx8, i32 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r ; %rdx.shuf.0.0 = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> %rdx.shuf.0.1 = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> @@ -998,6 +1191,19 @@ define fastcc i16 @pairwise_reduction8i16(<8 x i16> %rdx, i16 %f1) { ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx9 = add <8 x i16> %rdx.shuf.2.0, %rdx.shuf.2.1 ; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <8 x i16> %bin.rdx9, i32 0 ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %r +; +; SLM-LABEL: 'pairwise_reduction8i16' +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = add <8 x i16> %rdx.shuf.0.0, %rdx.shuf.0.1 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.0 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <8 x i16> %rdx.shuf.1.0, %rdx.shuf.1.1 +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.2.1 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx9 = add <8 x i16> %rdx.shuf.2.0, %rdx.shuf.2.1 +; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <8 x i16> %bin.rdx9, i32 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %r ; %rdx.shuf.0.0 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> %rdx.shuf.0.1 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> @@ -1052,6 +1258,19 @@ define fastcc i32 @pairwise_reduction8i32(<8 x i32> %rdx, i32 %f1) { ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx9 = add <8 x i32> %rdx.shuf.2.0, %rdx.shuf.2.1 ; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <8 x i32> %bin.rdx9, i32 0 ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r +; +; SLM-LABEL: 'pairwise_reduction8i32' +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.0.0 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.0.1 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = add <8 x i32> %rdx.shuf.0.0, %rdx.shuf.0.1 +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.1.0 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.1.1 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = add <8 x i32> %rdx.shuf.1.0, %rdx.shuf.1.1 +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.2.1 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx9 = add <8 x i32> %rdx.shuf.2.0, %rdx.shuf.2.1 +; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %r = extractelement <8 x i32> %bin.rdx9, i32 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r ; %rdx.shuf.0.0 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> %rdx.shuf.0.1 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32>