From 61d6ddbf0abf6e841344c231450c7929e259161d Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Fri, 23 Feb 2018 20:13:42 +0000 Subject: [PATCH] [X86] Add DAG combine to remove (and X, 1) from in front of a v1i1 scalar to vector. These can be created by type legalization promoting the inputs to select to match scalar boolean contents. We were trying to pattern match them away during isel, but it's better to just remove them from the DAG. I've cleaned up some patterns to not check for this 'and' anymore. But I suspect this has also opened up opportunities for pattern removal. llvm-svn: 325949 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 20 ++++++++++++++++++++ llvm/lib/Target/X86/X86InstrAVX512.td | 8 ++++---- llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll | 6 ++---- 3 files changed, 26 insertions(+), 8 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index d6e36d5122a2..87f7e52cfe00 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -1653,6 +1653,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // We have target-specific dag combine patterns for the following nodes: setTargetDAGCombine(ISD::VECTOR_SHUFFLE); + setTargetDAGCombine(ISD::SCALAR_TO_VECTOR); setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); setTargetDAGCombine(ISD::INSERT_SUBVECTOR); setTargetDAGCombine(ISD::EXTRACT_SUBVECTOR); @@ -38042,11 +38043,30 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG, return SDValue(); } +static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG) { + EVT VT = N->getValueType(0); + SDValue Src = N->getOperand(0); + + // If this is a scalar to vector to v1i1 from an AND with 1, bypass the and. + // This occurs frequently in our masked scalar intrinsic code and our + // floating point select lowering with AVX512. + // TODO: SimplifyDemandedBits instead? 
+ if (VT == MVT::v1i1 && Src.getOpcode() == ISD::AND && Src.hasOneUse()) + if (auto *C = dyn_cast<ConstantSDNode>(Src.getOperand(1))) + if (C->getAPIntValue().isOneValue()) + return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), MVT::v1i1, + Src.getOperand(0)); + + return SDValue(); +} + SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; switch (N->getOpcode()) { default: break; + case ISD::SCALAR_TO_VECTOR: + return combineScalarToVector(N, DAG); case ISD::EXTRACT_VECTOR_ELT: case X86ISD::PEXTRW: case X86ISD::PEXTRB: diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td index a5a15e841893..f24927fb2a03 100644 --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -3870,7 +3870,7 @@ multiclass avx512_move_scalar_lowering(InstrStr#rrk) @@ -3881,7 +3881,7 @@ def : Pat<(_.VT (OpNode _.RC:$src0, def : Pat<(_.VT (OpNode _.RC:$src0, (_.VT (scalar_to_vector - (_.EltVT (X86selects (scalar_to_vector (and (i8 (trunc GR32:$mask)), (i8 1))), + (_.EltVT (X86selects (scalar_to_vector (i8 (trunc GR32:$mask))), (_.EltVT _.FRC:$src1), (_.EltVT ZeroFP))))))), (!cast<Instruction>(InstrStr#rrkz) @@ -3993,7 +3993,7 @@ defm : avx512_load_scalar_lowering_subreg<"VMOVSSZ", avx512vl_f32_info, defm : avx512_load_scalar_lowering_subreg<"VMOVSDZ", avx512vl_f64_info, (v8i1 (bitconvert (i8 (and GR8:$mask, (i8 1))))), GR8, sub_8bit>; -def : Pat<(f32 (X86selects (scalar_to_vector (and GR8:$mask, (i8 1))), +def : Pat<(f32 (X86selects (scalar_to_vector GR8:$mask), (f32 FR32X:$src1), (f32 FR32X:$src2))), (COPY_TO_REGCLASS (VMOVSSZrrk (COPY_TO_REGCLASS FR32X:$src2, VR128X), @@ -4007,7 +4007,7 @@ def : Pat<(f32 (X86selects VK1WM:$mask, (f32 FR32X:$src1), (f32 FR32X:$src2))), VK1WM:$mask, (v4f32 (IMPLICIT_DEF)), (COPY_TO_REGCLASS FR32X:$src1, VR128X)), FR32X)>; -def : Pat<(f64 (X86selects (scalar_to_vector (and GR8:$mask, (i8 1))), +def : Pat<(f64 (X86selects (scalar_to_vector GR8:$mask), (f64 
FR64X:$src1), (f64 FR64X:$src2))), (COPY_TO_REGCLASS (VMOVSDZrrk (COPY_TO_REGCLASS FR64X:$src2, VR128X), diff --git a/llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll b/llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll index 60e041b05abe..6a5c4a8df26d 100644 --- a/llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll +++ b/llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll @@ -1117,9 +1117,8 @@ define <4 x float> @add_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> %c, ; ; AVX512-LABEL: add_ss_mask: ; AVX512: # %bb.0: -; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm1 ; AVX512-NEXT: kmovw %edi, %k1 -; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm2 {%k1} +; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm2 {%k1} ; AVX512-NEXT: vmovaps %xmm2, %xmm0 ; AVX512-NEXT: retq %1 = extractelement <4 x float> %a, i64 0 @@ -1172,9 +1171,8 @@ define <2 x double> @add_sd_mask(<2 x double> %a, <2 x double> %b, <2 x double> ; ; AVX512-LABEL: add_sd_mask: ; AVX512: # %bb.0: -; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm1 ; AVX512-NEXT: kmovw %edi, %k1 -; AVX512-NEXT: vmovsd %xmm1, %xmm0, %xmm2 {%k1} +; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm2 {%k1} ; AVX512-NEXT: vmovapd %xmm2, %xmm0 ; AVX512-NEXT: retq %1 = extractelement <2 x double> %a, i64 0