From 659279450e5bbf7fefb971884243eeadc4fbfe8d Mon Sep 17 00:00:00 2001
From: Sanjay Patel
Date: Tue, 12 Sep 2017 23:24:05 +0000
Subject: [PATCH] [x86] eliminate unnecessary vector compare for AVX masked store

The masked store instruction only cares about the sign-bit of each mask
element, so the compare s<0 isn't needed.

As noted in PR11210:
https://bugs.llvm.org/show_bug.cgi?id=11210
...fixing this should allow us to eliminate x86-specific masked store
intrinsics in IR. (Although more testing will be needed to confirm that.)

I filed a bug to track improvements for AVX512:
https://bugs.llvm.org/show_bug.cgi?id=34584

Differential Revision: https://reviews.llvm.org/D37446

llvm-svn: 313089
---
 llvm/lib/Target/X86/X86ISelLowering.cpp | 29 +++++++++++++++++++++++--
 llvm/test/CodeGen/X86/masked_memop.ll   | 11 ++++-------
 2 files changed, 31 insertions(+), 9 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 725e2eac506e..942669c958c0 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -33145,8 +33145,33 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
   if (Mst->isCompressingStore())
     return SDValue();
 
-  if (!Mst->isTruncatingStore())
-    return reduceMaskedStoreToScalarStore(Mst, DAG);
+  if (!Mst->isTruncatingStore()) {
+    if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG))
+      return ScalarStore;
+
+    // If the mask is checking (0 > X), we're creating a vector with all-zeros
+    // or all-ones elements based on the sign bits of X. AVX1 masked store only
+    // cares about the sign bit of each mask element, so eliminate the compare:
+    // mstore val, ptr, (pcmpgt 0, X) --> mstore val, ptr, X
+    // Note that by waiting to match an x86-specific PCMPGT node, we're
+    // eliminating potentially more complex matching of a setcc node which has
+    // a full range of predicates.
+    SDValue Mask = Mst->getMask();
+    if (Mask.getOpcode() == X86ISD::PCMPGT &&
+        ISD::isBuildVectorAllZeros(Mask.getOperand(0).getNode())) {
+      assert(Mask.getValueType() == Mask.getOperand(1).getValueType() &&
+             "Unexpected type for PCMPGT");
+      return DAG.getMaskedStore(
+          Mst->getChain(), SDLoc(N), Mst->getValue(), Mst->getBasePtr(),
+          Mask.getOperand(1), Mst->getMemoryVT(), Mst->getMemOperand());
+    }
+
+    // TODO: AVX512 targets should also be able to simplify something like the
+    // pattern above, but that pattern will be different. It will either need
+    // to match setcc more generally or match PCMPGTM later (in tablegen?).
+
+    return SDValue();
+  }
 
   // Resolve truncating stores.
   EVT VT = Mst->getValue().getValueType();
diff --git a/llvm/test/CodeGen/X86/masked_memop.ll b/llvm/test/CodeGen/X86/masked_memop.ll
index fa540c7643f4..b1923ebb081d 100644
--- a/llvm/test/CodeGen/X86/masked_memop.ll
+++ b/llvm/test/CodeGen/X86/masked_memop.ll
@@ -1140,21 +1140,18 @@ define <8 x double> @load_one_mask_bit_set5(<8 x double>* %addr, <8 x double> %v
   ret <8 x double> %res
 }
 
-; FIXME: The mask bit for each data element is the most significant bit of the mask operand, so a compare isn't needed.
+; The mask bit for each data element is the most significant bit of the mask operand, so a compare isn't needed.
+; FIXME: The AVX512 code should be improved to use 'vpmovd2m'. Add tests for 512-bit vectors when implementing that.
 
 define void @trunc_mask(<4 x float> %x, <4 x float>* %ptr, <4 x float> %y, <4 x i32> %mask) {
 ; AVX-LABEL: trunc_mask:
 ; AVX:       ## BB#0:
-; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vpcmpgtd %xmm2, %xmm1, %xmm1
-; AVX-NEXT:    vmaskmovps %xmm0, %xmm1, (%rdi)
+; AVX-NEXT:    vmaskmovps %xmm0, %xmm2, (%rdi)
 ; AVX-NEXT:    retq
 ;
 ; AVX512F-LABEL: trunc_mask:
 ; AVX512F:       ## BB#0:
-; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT:    vpcmpgtd %xmm2, %xmm1, %xmm1
-; AVX512F-NEXT:    vmaskmovps %xmm0, %xmm1, (%rdi)
+; AVX512F-NEXT:    vmaskmovps %xmm0, %xmm2, (%rdi)
 ; AVX512F-NEXT:    retq
 ;
 ; SKX-LABEL: trunc_mask:
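
Note: the IR shape this combine targets looks roughly like the sketch below. This is
a minimal illustration in the spirit of the trunc_mask test above; the function name
@store_negative_mask and the chosen alignment are hypothetical and not part of this
patch. The mask fed to the masked-store intrinsic is a signed compare against zero,
which on AVX previously lowered to vpxor + vpcmpgtd feeding vmaskmovps:

  ; Illustrative only: a masked store whose mask is "is element negative?".
  define void @store_negative_mask(<4 x float> %x, <4 x float>* %ptr, <4 x i32> %mask) {
    ; Sign-bit test: produces all-ones lanes exactly where %mask is negative.
    %bool_mask = icmp slt <4 x i32> %mask, zeroinitializer
    call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %x, <4 x float>* %ptr, i32 1, <4 x i1> %bool_mask)
    ret void
  }

  declare void @llvm.masked.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32, <4 x i1>)

Because vmaskmovps already tests only the sign bit of each mask element, the PCMPGT
against zero recomputes information the instruction ignores, so the combine above
drops the compare and uses %mask directly as the store mask.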