[x86] convert masked store of one element to scalar store

Another opportunity to reduce masked stores: in D16691, we decided not to attempt the 'one mask element is set'
transform in InstCombine, but this should be a win for any AVX machine.

Code comments note that this transform could be extended for other targets / cases.
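As a standalone illustration (not part of this patch), the equivalence being exploited is: a masked store whose mask enables exactly one lane writes the same bytes as extracting that lane and doing an ordinary scalar store at the matching byte offset. A minimal sketch with AVX intrinsics, assuming an AVX-capable host and a compiler flag such as -mavx:

// Illustration only: a one-lane masked store equals a scalar store of the
// selected element at the right byte offset (lane 2 of <4 x float> here).
#include <immintrin.h>
#include <cstring>
#include <cstdio>

int main() {
  alignas(16) float masked[4] = {0.0f, 0.0f, 0.0f, 0.0f};
  alignas(16) float scalar[4] = {0.0f, 0.0f, 0.0f, 0.0f};
  __m128 val = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);

  // Only lane 2 is enabled; vmaskmovps keys off the sign bit of each mask lane.
  __m128i mask = _mm_setr_epi32(0, 0, -1, 0);
  _mm_maskstore_ps(masked, mask, val);

  // The scalar equivalent: extract lane 2 and store 4 bytes at offset 8.
  alignas(16) float tmp[4];
  _mm_store_ps(tmp, val);
  scalar[2] = tmp[2];

  std::puts(std::memcmp(masked, scalar, sizeof masked) == 0 ? "equivalent" : "different");
  return 0;
}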

Differential Revision: http://reviews.llvm.org/D16828

llvm-svn: 260145
Sanjay Patel 2016-02-08 21:05:08 +00:00
parent 9fd2039faf
commit 264d7e5b68
2 changed files with 161 additions and 29 deletions


@@ -26784,13 +26784,86 @@ static SDValue PerformMLOADCombine(SDNode *N, SelectionDAG &DAG,
return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true);
}
/// If exactly one element of the mask is set for a non-truncating masked store,
/// it is a vector extract and scalar store.
/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
/// mask have already been optimized in IR, so we don't bother with those here.
static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
SelectionDAG &DAG) {
// TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
// However, some target hooks may need to be added to know when the transform
// is profitable. Endianness would also have to be considered.
// If V is a build vector of boolean constants and exactly one of those
// constants is true, return the operand index of that true element.
// Otherwise, return -1.
auto getOneTrueElt = [](SDValue V) {
// This needs to be a build vector of booleans.
// TODO: Checking for the i1 type matches the IR definition for the mask,
// but the mask check could be loosened to i8 or other types. That might
// also require checking more than 'allOnesValue'; eg, the x86 HW
// instructions only require that the MSB is set for each mask element.
// The ISD::MSTORE comments/definition do not specify how the mask operand
// is formatted.
auto *BV = dyn_cast<BuildVectorSDNode>(V);
if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1)
return -1;
int TrueIndex = -1;
unsigned NumElts = BV->getValueType(0).getVectorNumElements();
for (unsigned i = 0; i < NumElts; ++i) {
const SDValue &Op = BV->getOperand(i);
if (Op.getOpcode() == ISD::UNDEF)
continue;
auto *ConstNode = dyn_cast<ConstantSDNode>(Op);
if (!ConstNode)
return -1;
if (ConstNode->getAPIntValue().isAllOnesValue()) {
// If we already found a one, this is too many.
if (TrueIndex >= 0)
return -1;
TrueIndex = i;
}
}
return TrueIndex;
};
int TrueMaskElt = getOneTrueElt(MS->getMask());
if (TrueMaskElt < 0)
return SDValue();
SDLoc DL(MS);
EVT VT = MS->getValue().getValueType();
EVT EltVT = VT.getVectorElementType();
// Extract the one scalar element that is actually being stored.
SDValue ExtractIndex = DAG.getIntPtrConstant(TrueMaskElt, DL);
SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
MS->getValue(), ExtractIndex);
// Store that element at the appropriate offset from the base pointer.
SDValue StoreAddr = MS->getBasePtr();
unsigned EltSize = EltVT.getStoreSize();
if (TrueMaskElt != 0) {
unsigned StoreOffset = TrueMaskElt * EltSize;
SDValue StoreOffsetVal = DAG.getIntPtrConstant(StoreOffset, DL);
StoreAddr = DAG.getNode(ISD::ADD, DL, StoreAddr.getValueType(), StoreAddr,
StoreOffsetVal);
}
unsigned Alignment = MinAlign(MS->getAlignment(), EltSize);
return DAG.getStore(MS->getChain(), DL, Extract, StoreAddr,
MS->getPointerInfo(), MS->isVolatile(),
MS->isNonTemporal(), Alignment);
}
static SDValue PerformMSTORECombine(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
if (!Mst->isTruncatingStore())
return reduceMaskedStoreToScalarStore(Mst, DAG);

// Resolve truncating stores.
EVT VT = Mst->getValue().getValueType();
unsigned NumElems = VT.getVectorNumElements();
EVT StVT = Mst->getMemoryVT();
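The arithmetic performed by reduceMaskedStoreToScalarStore above (find the single true mask lane, scale it by the element size to get a byte offset, and shrink the alignment) can be modeled outside of SelectionDAG. The following is a hypothetical plain-C++ sketch, not LLVM code; oneTrueElt and minAlign are illustrative names, though minAlign uses the same lowest-set-bit formula as llvm::MinAlign.

// Hypothetical scalar model of the combine's index/offset/alignment math.
// This is illustration only, not the LLVM implementation.
#include <cstdint>
#include <cstdio>
#include <vector>

// Index of the single true element, or -1 for zero or multiple true elements
// (same contract as the getOneTrueElt lambda above).
int oneTrueElt(const std::vector<bool> &Mask) {
  int TrueIndex = -1;
  for (size_t i = 0; i < Mask.size(); ++i) {
    if (!Mask[i])
      continue;
    if (TrueIndex >= 0)
      return -1; // more than one lane is set
    TrueIndex = static_cast<int>(i);
  }
  return TrueIndex;
}

// Largest power of two dividing both A and B (the formula used by llvm::MinAlign).
uint64_t minAlign(uint64_t A, uint64_t B) { return (A | B) & (~(A | B) + 1); }

int main() {
  std::vector<bool> Mask = {false, false, true, false}; // <0,0,1,0>
  unsigned EltSize = 4;     // i32/f32 element, in bytes
  unsigned MaskedAlign = 4; // alignment of the original masked store

  int Lane = oneTrueElt(Mask);
  if (Lane < 0)
    return 0; // no single-lane mask: the combine would bail out
  unsigned Offset = static_cast<unsigned>(Lane) * EltSize; // 2 * 4 = 8 bytes
  uint64_t Align = minAlign(MaskedAlign, EltSize);         // 4
  std::printf("scalar store at base+%u with align %llu\n", Offset,
              static_cast<unsigned long long>(Align));
  return 0;
}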


@@ -991,36 +991,92 @@ define void @test21(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %val) {
ret void
}
; When only one element of the mask is set, reduce to a scalar store.
define void @one_mask_bit_set1(<4 x i32>* %addr, <4 x i32> %val) {
; AVX-LABEL: one_mask_bit_set1:
; AVX: ## BB#0:
; AVX-NEXT: vmovd %xmm0, (%rdi)
; AVX-NEXT: retq
;
; AVX512-LABEL: one_mask_bit_set1:
; AVX512: ## BB#0:
; AVX512-NEXT: vmovd %xmm0, (%rdi)
; AVX512-NEXT: retq
call void @llvm.masked.store.v4i32(<4 x i32> %val, <4 x i32>* %addr, i32 4, <4 x i1><i1 true, i1 false, i1 false, i1 false>)
ret void
}
; Choose a different element to show that the correct address offset is produced.
define void @one_mask_bit_set2(<4 x float>* %addr, <4 x float> %val) {
; AVX-LABEL: one_mask_bit_set2:
; AVX: ## BB#0:
; AVX-NEXT: vextractps $2, %xmm0, 8(%rdi)
; AVX-NEXT: retq
;
; AVX512-LABEL: one_mask_bit_set2:
; AVX512: ## BB#0:
; AVX512-NEXT: vextractps $2, %xmm0, 8(%rdi)
; AVX512-NEXT: retq
call void @llvm.masked.store.v4f32(<4 x float> %val, <4 x float>* %addr, i32 4, <4 x i1><i1 false, i1 false, i1 true, i1 false>)
ret void
}
; Choose a different scalar type and a high element of a 256-bit vector because AVX doesn't support those evenly.
define void @one_mask_bit_set3(<4 x i64>* %addr, <4 x i64> %val) {
; AVX-LABEL: one_mask_bit_set3:
; AVX: ## BB#0:
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT: vmovlps %xmm0, 16(%rdi)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: one_mask_bit_set3:
; AVX512: ## BB#0:
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512-NEXT: vmovq %xmm0, 16(%rdi)
; AVX512-NEXT: retq
call void @llvm.masked.store.v4i64(<4 x i64> %val, <4 x i64>* %addr, i32 4, <4 x i1><i1 false, i1 false, i1 true, i1 false>)
ret void
}
; Choose a different scalar type and a high element of a 256-bit vector because AVX doesn't support those evenly.
define void @one_mask_bit_set4(<4 x double>* %addr, <4 x double> %val) {
; AVX-LABEL: one_mask_bit_set4:
; AVX: ## BB#0:
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT: vmovhpd %xmm0, 24(%rdi)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: one_mask_bit_set4:
; AVX512: ## BB#0:
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512-NEXT: vmovhpd %xmm0, 24(%rdi)
; AVX512-NEXT: retq
call void @llvm.masked.store.v4f64(<4 x double> %val, <4 x double>* %addr, i32 4, <4 x i1><i1 false, i1 false, i1 false, i1 true>)
ret void
}
; Try a 512-bit vector to make sure AVX doesn't die and AVX512 works as expected.
define void @one_mask_bit_set5(<8 x double>* %addr, <8 x double> %val) {
; AVX-LABEL: one_mask_bit_set5:
; AVX: ## BB#0:
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm0
; AVX-NEXT: vmovlps %xmm0, 48(%rdi)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: one_mask_bit_set5:
; AVX512: ## BB#0:
; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0
; AVX512-NEXT: vmovlpd %xmm0, 48(%rdi)
; AVX512-NEXT: retq
call void @llvm.masked.store.v8f64(<8 x double> %val, <8 x double>* %addr, i32 4, <8 x i1><i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 false>)
ret void
}
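The FileCheck lines above only pin down the instructions that are emitted; the underlying property, that a one-lane masked store touches exactly the bytes of the selected element, can also be observed at runtime. A small sketch mirroring @one_mask_bit_set4 (lane 3 of <4 x double>, so 8 bytes at offset 24, the case where the upper 128-bit half must be extracted first), again assuming an AVX-capable host:

// Runtime illustration of the property behind @one_mask_bit_set4: with only
// lane 3 enabled, the masked store writes the 8 bytes at offset 24 and
// leaves every other byte untouched. Assumes AVX (compile with -mavx).
#include <immintrin.h>
#include <cstring>
#include <cstdio>

int main() {
  alignas(32) double masked[4] = {-1.0, -1.0, -1.0, -1.0};
  alignas(32) double scalar[4] = {-1.0, -1.0, -1.0, -1.0};
  __m256d val = _mm256_setr_pd(10.0, 20.0, 30.0, 40.0);

  // Sign bit of each 64-bit mask element selects the lane; only lane 3 here.
  __m256i mask = _mm256_setr_epi64x(0, 0, 0, -1);
  _mm256_maskstore_pd(masked, mask, val);

  scalar[3] = 40.0; // the equivalent scalar store: element 3 at base + 24 bytes

  std::puts(std::memcmp(masked, scalar, sizeof masked) == 0 ? "match" : "mismatch");
  return 0;
}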
@@ -1030,8 +1086,10 @@ declare <2 x i32> @llvm.masked.load.v2i32(<2 x i32>*, i32, <2 x i1>, <2 x i32>)
declare void @llvm.masked.store.v16i32(<16 x i32>, <16 x i32>*, i32, <16 x i1>)
declare void @llvm.masked.store.v8i32(<8 x i32>, <8 x i32>*, i32, <8 x i1>)
declare void @llvm.masked.store.v4i32(<4 x i32>, <4 x i32>*, i32, <4 x i1>)
declare void @llvm.masked.store.v4i64(<4 x i64>, <4 x i64>*, i32, <4 x i1>)
declare void @llvm.masked.store.v2f32(<2 x float>, <2 x float>*, i32, <2 x i1>)
declare void @llvm.masked.store.v2i32(<2 x i32>, <2 x i32>*, i32, <2 x i1>)
declare void @llvm.masked.store.v4f32(<4 x float>, <4 x float>*, i32, <4 x i1>)
declare void @llvm.masked.store.v16f32(<16 x float>, <16 x float>*, i32, <16 x i1>)
declare void @llvm.masked.store.v16f32p(<16 x float>*, <16 x float>**, i32, <16 x i1>)
declare <16 x float> @llvm.masked.load.v16f32(<16 x float>*, i32, <16 x i1>, <16 x float>)
@@ -1043,6 +1101,7 @@ declare <8 x double> @llvm.masked.load.v8f64(<8 x double>*, i32, <8 x i1>, <8 x
declare <4 x double> @llvm.masked.load.v4f64(<4 x double>*, i32, <4 x i1>, <4 x double>)
declare <2 x double> @llvm.masked.load.v2f64(<2 x double>*, i32, <2 x i1>, <2 x double>)
declare void @llvm.masked.store.v8f64(<8 x double>, <8 x double>*, i32, <8 x i1>)
declare void @llvm.masked.store.v4f64(<4 x double>, <4 x double>*, i32, <4 x i1>)
declare void @llvm.masked.store.v2f64(<2 x double>, <2 x double>*, i32, <2 x i1>)
declare void @llvm.masked.store.v2i64(<2 x i64>, <2 x i64>*, i32, <2 x i1>)