[X86] Lower avx2/avx512f gather intrinsics to X86MaskedGatherSDNode instead of going directly to MachineSDNode.

This sends these intrinsics through isel in a much more normal way. This should allow addressing mode matching in isel to make better use of the displacement field.

Differential Revision: https://reviews.llvm.org/D56827

llvm-svn: 351570
This commit is contained in:
Craig Topper 2019-01-18 18:22:26 +00:00
parent ae17ff0781
commit b9d4461f9f
5 changed files with 100 additions and 89 deletions

View File

@ -4810,6 +4810,18 @@ bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.flags |= MachineMemOperand::MOStore;
break;
}
case GATHER:
case GATHER_AVX2: {
Info.ptrVal = nullptr;
MVT DataVT = MVT::getVT(I.getType());
MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
unsigned NumElts = std::min(DataVT.getVectorNumElements(),
IndexVT.getVectorNumElements());
Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
Info.align = 1;
Info.flags |= MachineMemOperand::MOLoad;
break;
}
default:
return false;
}
@ -22376,25 +22388,26 @@ static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
if (!C)
return SDValue();
SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
EVT MaskVT = Mask.getValueType();
EVT MaskVT = Mask.getValueType().changeVectorElementTypeToInteger();
SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
SDValue Segment = DAG.getRegister(0, MVT::i32);
// If source is undef or we know it won't be used, use a zero vector
// to break register dependency.
// TODO: use undef instead and let BreakFalseDeps deal with it?
if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
SDValue Ops[] = {Src, Base, Scale, Index, Disp, Segment, Mask, Chain};
SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
return DAG.getMergeValues(RetOps, dl);
MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
SDValue Res = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
VTs, Ops, dl, MemIntr->getMemoryVT(), MemIntr->getMemOperand());
return DAG.getMergeValues({ Res, Res.getValue(2) }, dl);
}
static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
SDValue Src, SDValue Mask, SDValue Base,
SDValue Index, SDValue ScaleOp, SDValue Chain,
const X86Subtarget &Subtarget) {
static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG,
SDValue Src, SDValue Mask, SDValue Base,
SDValue Index, SDValue ScaleOp, SDValue Chain,
const X86Subtarget &Subtarget) {
MVT VT = Op.getSimpleValueType();
SDLoc dl(Op);
auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
@ -22412,17 +22425,18 @@ static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
SDValue Segment = DAG.getRegister(0, MVT::i32);
// If source is undef or we know it won't be used, use a zero vector
// to break register dependency.
// TODO: use undef instead and let BreakFalseDeps deal with it?
if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
SDValue Ops[] = {Src, Mask, Base, Scale, Index, Disp, Segment, Chain};
SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
return DAG.getMergeValues(RetOps, dl);
MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
SDValue Res = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
VTs, Ops, dl, MemIntr->getMemoryVT(), MemIntr->getMemOperand());
return DAG.getMergeValues({ Res, Res.getValue(2) }, dl);
}
static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
@ -22787,7 +22801,7 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
SDValue Index = Op.getOperand(4);
SDValue Mask = Op.getOperand(5);
SDValue Scale = Op.getOperand(6);
return getGatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale,
return getGatherNode(Op, DAG, Src, Mask, Base, Index, Scale,
Chain, Subtarget);
}
case SCATTER: {

View File

@ -8363,7 +8363,7 @@ multiclass avx2_gather<bits<8> opc, string OpcodeStr, ValueType VTx,
VEX, VEX_L, Sched<[WriteLoad]>;
}
let Predicates = [UseAVX2] in {
let Predicates = [HasAVX2] in {
let mayLoad = 1, hasSideEffects = 0, Constraints
= "@earlyclobber $dst,@earlyclobber $mask_wb, $src1 = $dst, $mask = $mask_wb"
in {

View File

@ -64,47 +64,47 @@ struct IntrinsicData {
* the alphabetical order.
*/
static const IntrinsicData IntrinsicsWithChain[] = {
X86_INTRINSIC_DATA(avx2_gather_d_d, GATHER_AVX2, X86::VPGATHERDDrm, 0),
X86_INTRINSIC_DATA(avx2_gather_d_d_256, GATHER_AVX2, X86::VPGATHERDDYrm, 0),
X86_INTRINSIC_DATA(avx2_gather_d_pd, GATHER_AVX2, X86::VGATHERDPDrm, 0),
X86_INTRINSIC_DATA(avx2_gather_d_pd_256, GATHER_AVX2, X86::VGATHERDPDYrm, 0),
X86_INTRINSIC_DATA(avx2_gather_d_ps, GATHER_AVX2, X86::VGATHERDPSrm, 0),
X86_INTRINSIC_DATA(avx2_gather_d_ps_256, GATHER_AVX2, X86::VGATHERDPSYrm, 0),
X86_INTRINSIC_DATA(avx2_gather_d_q, GATHER_AVX2, X86::VPGATHERDQrm, 0),
X86_INTRINSIC_DATA(avx2_gather_d_q_256, GATHER_AVX2, X86::VPGATHERDQYrm, 0),
X86_INTRINSIC_DATA(avx2_gather_q_d, GATHER_AVX2, X86::VPGATHERQDrm, 0),
X86_INTRINSIC_DATA(avx2_gather_q_d_256, GATHER_AVX2, X86::VPGATHERQDYrm, 0),
X86_INTRINSIC_DATA(avx2_gather_q_pd, GATHER_AVX2, X86::VGATHERQPDrm, 0),
X86_INTRINSIC_DATA(avx2_gather_q_pd_256, GATHER_AVX2, X86::VGATHERQPDYrm, 0),
X86_INTRINSIC_DATA(avx2_gather_q_ps, GATHER_AVX2, X86::VGATHERQPSrm, 0),
X86_INTRINSIC_DATA(avx2_gather_q_ps_256, GATHER_AVX2, X86::VGATHERQPSYrm, 0),
X86_INTRINSIC_DATA(avx2_gather_q_q, GATHER_AVX2, X86::VPGATHERQQrm, 0),
X86_INTRINSIC_DATA(avx2_gather_q_q_256, GATHER_AVX2, X86::VPGATHERQQYrm, 0),
X86_INTRINSIC_DATA(avx2_gather_d_d, GATHER_AVX2, 0, 0),
X86_INTRINSIC_DATA(avx2_gather_d_d_256, GATHER_AVX2, 0, 0),
X86_INTRINSIC_DATA(avx2_gather_d_pd, GATHER_AVX2, 0, 0),
X86_INTRINSIC_DATA(avx2_gather_d_pd_256, GATHER_AVX2, 0, 0),
X86_INTRINSIC_DATA(avx2_gather_d_ps, GATHER_AVX2, 0, 0),
X86_INTRINSIC_DATA(avx2_gather_d_ps_256, GATHER_AVX2, 0, 0),
X86_INTRINSIC_DATA(avx2_gather_d_q, GATHER_AVX2, 0, 0),
X86_INTRINSIC_DATA(avx2_gather_d_q_256, GATHER_AVX2, 0, 0),
X86_INTRINSIC_DATA(avx2_gather_q_d, GATHER_AVX2, 0, 0),
X86_INTRINSIC_DATA(avx2_gather_q_d_256, GATHER_AVX2, 0, 0),
X86_INTRINSIC_DATA(avx2_gather_q_pd, GATHER_AVX2, 0, 0),
X86_INTRINSIC_DATA(avx2_gather_q_pd_256, GATHER_AVX2, 0, 0),
X86_INTRINSIC_DATA(avx2_gather_q_ps, GATHER_AVX2, 0, 0),
X86_INTRINSIC_DATA(avx2_gather_q_ps_256, GATHER_AVX2, 0, 0),
X86_INTRINSIC_DATA(avx2_gather_q_q, GATHER_AVX2, 0, 0),
X86_INTRINSIC_DATA(avx2_gather_q_q_256, GATHER_AVX2, 0, 0),
X86_INTRINSIC_DATA(avx512_gather_dpd_512, GATHER, X86::VGATHERDPDZrm, 0),
X86_INTRINSIC_DATA(avx512_gather_dpi_512, GATHER, X86::VPGATHERDDZrm, 0),
X86_INTRINSIC_DATA(avx512_gather_dpq_512, GATHER, X86::VPGATHERDQZrm, 0),
X86_INTRINSIC_DATA(avx512_gather_dps_512, GATHER, X86::VGATHERDPSZrm, 0),
X86_INTRINSIC_DATA(avx512_gather_qpd_512, GATHER, X86::VGATHERQPDZrm, 0),
X86_INTRINSIC_DATA(avx512_gather_qpi_512, GATHER, X86::VPGATHERQDZrm, 0),
X86_INTRINSIC_DATA(avx512_gather_qpq_512, GATHER, X86::VPGATHERQQZrm, 0),
X86_INTRINSIC_DATA(avx512_gather_qps_512, GATHER, X86::VGATHERQPSZrm, 0),
X86_INTRINSIC_DATA(avx512_gather3div2_df, GATHER, X86::VGATHERQPDZ128rm, 0),
X86_INTRINSIC_DATA(avx512_gather3div2_di, GATHER, X86::VPGATHERQQZ128rm, 0),
X86_INTRINSIC_DATA(avx512_gather3div4_df, GATHER, X86::VGATHERQPDZ256rm, 0),
X86_INTRINSIC_DATA(avx512_gather3div4_di, GATHER, X86::VPGATHERQQZ256rm, 0),
X86_INTRINSIC_DATA(avx512_gather3div4_sf, GATHER, X86::VGATHERQPSZ128rm, 0),
X86_INTRINSIC_DATA(avx512_gather3div4_si, GATHER, X86::VPGATHERQDZ128rm, 0),
X86_INTRINSIC_DATA(avx512_gather3div8_sf, GATHER, X86::VGATHERQPSZ256rm, 0),
X86_INTRINSIC_DATA(avx512_gather3div8_si, GATHER, X86::VPGATHERQDZ256rm, 0),
X86_INTRINSIC_DATA(avx512_gather3siv2_df, GATHER, X86::VGATHERDPDZ128rm, 0),
X86_INTRINSIC_DATA(avx512_gather3siv2_di, GATHER, X86::VPGATHERDQZ128rm, 0),
X86_INTRINSIC_DATA(avx512_gather3siv4_df, GATHER, X86::VGATHERDPDZ256rm, 0),
X86_INTRINSIC_DATA(avx512_gather3siv4_di, GATHER, X86::VPGATHERDQZ256rm, 0),
X86_INTRINSIC_DATA(avx512_gather3siv4_sf, GATHER, X86::VGATHERDPSZ128rm, 0),
X86_INTRINSIC_DATA(avx512_gather3siv4_si, GATHER, X86::VPGATHERDDZ128rm, 0),
X86_INTRINSIC_DATA(avx512_gather3siv8_sf, GATHER, X86::VGATHERDPSZ256rm, 0),
X86_INTRINSIC_DATA(avx512_gather3siv8_si, GATHER, X86::VPGATHERDDZ256rm, 0),
X86_INTRINSIC_DATA(avx512_gather_dpd_512, GATHER, 0, 0),
X86_INTRINSIC_DATA(avx512_gather_dpi_512, GATHER, 0, 0),
X86_INTRINSIC_DATA(avx512_gather_dpq_512, GATHER, 0, 0),
X86_INTRINSIC_DATA(avx512_gather_dps_512, GATHER, 0, 0),
X86_INTRINSIC_DATA(avx512_gather_qpd_512, GATHER, 0, 0),
X86_INTRINSIC_DATA(avx512_gather_qpi_512, GATHER, 0, 0),
X86_INTRINSIC_DATA(avx512_gather_qpq_512, GATHER, 0, 0),
X86_INTRINSIC_DATA(avx512_gather_qps_512, GATHER, 0, 0),
X86_INTRINSIC_DATA(avx512_gather3div2_df, GATHER, 0, 0),
X86_INTRINSIC_DATA(avx512_gather3div2_di, GATHER, 0, 0),
X86_INTRINSIC_DATA(avx512_gather3div4_df, GATHER, 0, 0),
X86_INTRINSIC_DATA(avx512_gather3div4_di, GATHER, 0, 0),
X86_INTRINSIC_DATA(avx512_gather3div4_sf, GATHER, 0, 0),
X86_INTRINSIC_DATA(avx512_gather3div4_si, GATHER, 0, 0),
X86_INTRINSIC_DATA(avx512_gather3div8_sf, GATHER, 0, 0),
X86_INTRINSIC_DATA(avx512_gather3div8_si, GATHER, 0, 0),
X86_INTRINSIC_DATA(avx512_gather3siv2_df, GATHER, 0, 0),
X86_INTRINSIC_DATA(avx512_gather3siv2_di, GATHER, 0, 0),
X86_INTRINSIC_DATA(avx512_gather3siv4_df, GATHER, 0, 0),
X86_INTRINSIC_DATA(avx512_gather3siv4_di, GATHER, 0, 0),
X86_INTRINSIC_DATA(avx512_gather3siv4_sf, GATHER, 0, 0),
X86_INTRINSIC_DATA(avx512_gather3siv4_si, GATHER, 0, 0),
X86_INTRINSIC_DATA(avx512_gather3siv8_sf, GATHER, 0, 0),
X86_INTRINSIC_DATA(avx512_gather3siv8_si, GATHER, 0, 0),
X86_INTRINSIC_DATA(avx512_gatherpf_dpd_512, PREFETCH,
X86::VGATHERPF0DPDm, X86::VGATHERPF1DPDm),
@ -115,30 +115,30 @@ static const IntrinsicData IntrinsicsWithChain[] = {
X86_INTRINSIC_DATA(avx512_gatherpf_qps_512, PREFETCH,
X86::VGATHERPF0QPSm, X86::VGATHERPF1QPSm),
X86_INTRINSIC_DATA(avx512_mask_gather_dpd_512, GATHER, X86::VGATHERDPDZrm, 0),
X86_INTRINSIC_DATA(avx512_mask_gather_dpi_512, GATHER, X86::VPGATHERDDZrm, 0),
X86_INTRINSIC_DATA(avx512_mask_gather_dpq_512, GATHER, X86::VPGATHERDQZrm, 0),
X86_INTRINSIC_DATA(avx512_mask_gather_dps_512, GATHER, X86::VGATHERDPSZrm, 0),
X86_INTRINSIC_DATA(avx512_mask_gather_qpd_512, GATHER, X86::VGATHERQPDZrm, 0),
X86_INTRINSIC_DATA(avx512_mask_gather_qpi_512, GATHER, X86::VPGATHERQDZrm, 0),
X86_INTRINSIC_DATA(avx512_mask_gather_qpq_512, GATHER, X86::VPGATHERQQZrm, 0),
X86_INTRINSIC_DATA(avx512_mask_gather_qps_512, GATHER, X86::VGATHERQPSZrm, 0),
X86_INTRINSIC_DATA(avx512_mask_gather3div2_df, GATHER, X86::VGATHERQPDZ128rm, 0),
X86_INTRINSIC_DATA(avx512_mask_gather3div2_di, GATHER, X86::VPGATHERQQZ128rm, 0),
X86_INTRINSIC_DATA(avx512_mask_gather3div4_df, GATHER, X86::VGATHERQPDZ256rm, 0),
X86_INTRINSIC_DATA(avx512_mask_gather3div4_di, GATHER, X86::VPGATHERQQZ256rm, 0),
X86_INTRINSIC_DATA(avx512_mask_gather3div4_sf, GATHER, X86::VGATHERQPSZ128rm, 0),
X86_INTRINSIC_DATA(avx512_mask_gather3div4_si, GATHER, X86::VPGATHERQDZ128rm, 0),
X86_INTRINSIC_DATA(avx512_mask_gather3div8_sf, GATHER, X86::VGATHERQPSZ256rm, 0),
X86_INTRINSIC_DATA(avx512_mask_gather3div8_si, GATHER, X86::VPGATHERQDZ256rm, 0),
X86_INTRINSIC_DATA(avx512_mask_gather3siv2_df, GATHER, X86::VGATHERDPDZ128rm, 0),
X86_INTRINSIC_DATA(avx512_mask_gather3siv2_di, GATHER, X86::VPGATHERDQZ128rm, 0),
X86_INTRINSIC_DATA(avx512_mask_gather3siv4_df, GATHER, X86::VGATHERDPDZ256rm, 0),
X86_INTRINSIC_DATA(avx512_mask_gather3siv4_di, GATHER, X86::VPGATHERDQZ256rm, 0),
X86_INTRINSIC_DATA(avx512_mask_gather3siv4_sf, GATHER, X86::VGATHERDPSZ128rm, 0),
X86_INTRINSIC_DATA(avx512_mask_gather3siv4_si, GATHER, X86::VPGATHERDDZ128rm, 0),
X86_INTRINSIC_DATA(avx512_mask_gather3siv8_sf, GATHER, X86::VGATHERDPSZ256rm, 0),
X86_INTRINSIC_DATA(avx512_mask_gather3siv8_si, GATHER, X86::VPGATHERDDZ256rm, 0),
X86_INTRINSIC_DATA(avx512_mask_gather_dpd_512, GATHER, 0, 0),
X86_INTRINSIC_DATA(avx512_mask_gather_dpi_512, GATHER, 0, 0),
X86_INTRINSIC_DATA(avx512_mask_gather_dpq_512, GATHER, 0, 0),
X86_INTRINSIC_DATA(avx512_mask_gather_dps_512, GATHER, 0, 0),
X86_INTRINSIC_DATA(avx512_mask_gather_qpd_512, GATHER, 0, 0),
X86_INTRINSIC_DATA(avx512_mask_gather_qpi_512, GATHER, 0, 0),
X86_INTRINSIC_DATA(avx512_mask_gather_qpq_512, GATHER, 0, 0),
X86_INTRINSIC_DATA(avx512_mask_gather_qps_512, GATHER, 0, 0),
X86_INTRINSIC_DATA(avx512_mask_gather3div2_df, GATHER, 0, 0),
X86_INTRINSIC_DATA(avx512_mask_gather3div2_di, GATHER, 0, 0),
X86_INTRINSIC_DATA(avx512_mask_gather3div4_df, GATHER, 0, 0),
X86_INTRINSIC_DATA(avx512_mask_gather3div4_di, GATHER, 0, 0),
X86_INTRINSIC_DATA(avx512_mask_gather3div4_sf, GATHER, 0, 0),
X86_INTRINSIC_DATA(avx512_mask_gather3div4_si, GATHER, 0, 0),
X86_INTRINSIC_DATA(avx512_mask_gather3div8_sf, GATHER, 0, 0),
X86_INTRINSIC_DATA(avx512_mask_gather3div8_si, GATHER, 0, 0),
X86_INTRINSIC_DATA(avx512_mask_gather3siv2_df, GATHER, 0, 0),
X86_INTRINSIC_DATA(avx512_mask_gather3siv2_di, GATHER, 0, 0),
X86_INTRINSIC_DATA(avx512_mask_gather3siv4_df, GATHER, 0, 0),
X86_INTRINSIC_DATA(avx512_mask_gather3siv4_di, GATHER, 0, 0),
X86_INTRINSIC_DATA(avx512_mask_gather3siv4_sf, GATHER, 0, 0),
X86_INTRINSIC_DATA(avx512_mask_gather3siv4_si, GATHER, 0, 0),
X86_INTRINSIC_DATA(avx512_mask_gather3siv8_sf, GATHER, 0, 0),
X86_INTRINSIC_DATA(avx512_mask_gather3siv8_si, GATHER, 0, 0),
X86_INTRINSIC_DATA(avx512_mask_pmov_db_mem_128, TRUNCATE_TO_MEM_VI8,
X86ISD::VTRUNC, 0),

View File

@ -152,9 +152,8 @@ define <4 x float> @gather_global(<4 x i64>, i32* nocapture readnone) {
; X32-LABEL: gather_global:
; X32: # %bb.0:
; X32-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; X32-NEXT: movl $x, %eax
; X32-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X32-NEXT: vgatherqps %xmm2, (%eax,%ymm0,4), %xmm1
; X32-NEXT: vgatherqps %xmm2, x(,%ymm0,4), %xmm1
; X32-NEXT: vmovaps %xmm1, %xmm0
; X32-NEXT: vzeroupper
; X32-NEXT: retl
@ -162,9 +161,8 @@ define <4 x float> @gather_global(<4 x i64>, i32* nocapture readnone) {
; X64-LABEL: gather_global:
; X64: # %bb.0:
; X64-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; X64-NEXT: movl $x, %eax
; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X64-NEXT: vgatherqps %xmm2, (%rax,%ymm0,4), %xmm1
; X64-NEXT: vgatherqps %xmm2, x(,%ymm0,4), %xmm1
; X64-NEXT: vmovaps %xmm1, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq

View File

@ -881,9 +881,8 @@ define <8 x float> @gather_global(<8 x i64>, i32* nocapture readnone) {
; CHECK-LABEL: gather_global:
; CHECK: # %bb.0:
; CHECK-NEXT: kxnorw %k0, %k0, %k1
; CHECK-NEXT: movl $x, %eax
; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
; CHECK-NEXT: vgatherqps (%rax,%zmm0,4), %ymm1 {%k1}
; CHECK-NEXT: vgatherqps x(,%zmm0,4), %ymm1 {%k1}
; CHECK-NEXT: vmovaps %ymm1, %ymm0
; CHECK-NEXT: retq
%3 = tail call <8 x float> @llvm.x86.avx512.mask.gather.qps.512(<8 x float> zeroinitializer, i8* bitcast ([1024 x float]* @x to i8*), <8 x i64> %0, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 4)