From 01e4667e021a7d65ae9e9cbe53ef3699b65e6b9d Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Tue, 25 Oct 2016 04:00:29 +0000 Subject: [PATCH] [AVX-512] Add support for creating SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG for 512-bit vectors to support vpmovzxbq and vpmovsxbq. Summary: The one tricky thing about this is that the sign/zero_extend_inreg uses v64i8 as an input type which isn't legal without BWI support. Though the vpmovsxbq and vpmovzxbq instructions themselves don't require BWI. To support this we need to add custom lowering for ZERO_EXTEND_VECTOR_INREG with v64i8 input. This can mostly reuse the existing sign extend code with a couple checks for sign extend vs zero extend added. Reviewers: delena, RKSimon Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D25594 llvm-svn: 285053 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 50 +++++++++++++++++++------ llvm/test/CodeGen/X86/avx512-pmovxrm.ll | 24 +++--------- llvm/test/CodeGen/X86/vector-sext.ll | 4 +- llvm/test/CodeGen/X86/vector-zext.ll | 7 +--- 4 files changed, 47 insertions(+), 38 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 3ffebf84521c..4423b01f7d2a 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -1309,6 +1309,13 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FNEARBYINT, VT, Legal); } + setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i64, Custom); + setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v16i32, Custom); + + // Without BWI we need to use custom lowering to handle MVT::v64i8 input. + setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v64i8, Custom); + setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, MVT::v64i8, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f64, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f32, Custom); @@ -1509,6 +1516,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::UMIN, MVT::v64i8, Legal); setOperationAction(ISD::UMIN, MVT::v32i16, Legal); + setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v32i16, Custom); + setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal); if (Subtarget.hasVLX()) { setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal); @@ -16368,9 +16377,13 @@ static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, return DAG.getNode(X86ISD::VTRUNC, dl, VT, V); } -static SDValue LowerSIGN_EXTEND_VECTOR_INREG(SDValue Op, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { +// Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG. +// For sign extend this needs to handle all vector sizes and SSE4.1 and +// non-SSE4.1 targets. For zero extend this should only handle inputs of +// MVT::v64i8 when BWI is not supported, but AVX512 is. +static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { SDValue In = Op->getOperand(0); MVT VT = Op->getSimpleValueType(0); MVT InVT = In.getSimpleValueType(); @@ -16385,20 +16398,33 @@ static SDValue LowerSIGN_EXTEND_VECTOR_INREG(SDValue Op, if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8) return SDValue(); if (!(VT.is128BitVector() && Subtarget.hasSSE2()) && - !(VT.is256BitVector() && Subtarget.hasInt256())) + !(VT.is256BitVector() && Subtarget.hasInt256()) && + !(VT.is512BitVector() && Subtarget.hasAVX512())) return SDValue(); SDLoc dl(Op); // For 256-bit vectors, we only need the lower (128-bit) half of the input. - if (VT.is256BitVector()) - In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, - MVT::getVectorVT(InSVT, InVT.getVectorNumElements() / 2), - In, DAG.getIntPtrConstant(0, dl)); + // For 512-bit vectors, we need 128-bits or 256-bits. + if (VT.getSizeInBits() > 128) { + // Input needs to be at least the same number of elements as output, and + // at least 128-bits. + int InSize = InSVT.getSizeInBits() * VT.getVectorNumElements(); + In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128)); + } + + assert((Op.getOpcode() != ISD::ZERO_EXTEND_VECTOR_INREG || + InVT == MVT::v64i8) && "Zero extend only for v64i8 input!"); // SSE41 targets can use the pmovsx* instructions directly. + unsigned ExtOpc = Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG ? + X86ISD::VSEXT : X86ISD::VZEXT; if (Subtarget.hasSSE41()) - return DAG.getNode(X86ISD::VSEXT, dl, VT, In); + return DAG.getNode(ExtOpc, dl, VT, In); + + // We should only get here for sign extend. + assert(Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG && + "Unexpected opcode!"); // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI. SDValue Curr = In; @@ -22075,8 +22101,9 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG); case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG); case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG); + case ISD::ZERO_EXTEND_VECTOR_INREG: case ISD::SIGN_EXTEND_VECTOR_INREG: - return LowerSIGN_EXTEND_VECTOR_INREG(Op, Subtarget, DAG); + return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG); case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG); case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG); case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG); @@ -31120,7 +31147,8 @@ static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG, // ISD::*_EXTEND_VECTOR_INREG which ensures lowering to X86ISD::V*EXT. // Also use this if we don't have SSE41 to allow the legalizer do its job. if (!Subtarget.hasSSE41() || VT.is128BitVector() || - (VT.is256BitVector() && Subtarget.hasInt256())) { + (VT.is256BitVector() && Subtarget.hasInt256()) || + (VT.is512BitVector() && Subtarget.hasAVX512())) { SDValue ExOp = ExtendVecSize(DL, N0, VT.getSizeInBits()); return Opcode == ISD::SIGN_EXTEND ? DAG.getSignExtendVectorInReg(ExOp, DL, VT) diff --git a/llvm/test/CodeGen/X86/avx512-pmovxrm.ll b/llvm/test/CodeGen/X86/avx512-pmovxrm.ll index 91f0afec9d78..7c3965e08632 100644 --- a/llvm/test/CodeGen/X86/avx512-pmovxrm.ll +++ b/llvm/test/CodeGen/X86/avx512-pmovxrm.ll @@ -38,16 +38,12 @@ define <8 x i64> @test_llvm_x86_avx512_pmovsxbq(<16 x i8>* %a) { ; X32-LABEL: test_llvm_x86_avx512_pmovsxbq: ; X32: ## BB#0: ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: vpmovzxbq {{.*#+}} zmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero,mem[4],zero,zero,zero,zero,zero,zero,zero,mem[5],zero,zero,zero,zero,zero,zero,zero,mem[6],zero,zero,zero,zero,zero,zero,zero,mem[7],zero,zero,zero,zero,zero,zero,zero -; X32-NEXT: vpsllq $56, %zmm0, %zmm0 -; X32-NEXT: vpsraq $56, %zmm0, %zmm0 +; X32-NEXT: vpmovsxbq (%eax), %zmm0 ; X32-NEXT: retl ; ; X64-LABEL: test_llvm_x86_avx512_pmovsxbq: ; X64: ## BB#0: -; X64-NEXT: vpmovzxbq {{.*#+}} zmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero,mem[4],zero,zero,zero,zero,zero,zero,zero,mem[5],zero,zero,zero,zero,zero,zero,zero,mem[6],zero,zero,zero,zero,zero,zero,zero,mem[7],zero,zero,zero,zero,zero,zero,zero -; X64-NEXT: vpsllq $56, %zmm0, %zmm0 -; X64-NEXT: vpsraq $56, %zmm0, %zmm0 +; X64-NEXT: vpmovsxbq (%rdi), %zmm0 ; X64-NEXT: retq %1 = load <16 x i8>, <16 x i8>* %a, align 1 %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <8 x i32> @@ -139,22 +135,14 @@ define <8 x i64> @test_llvm_x86_avx512_pmovzxbq(<16 x i8>* %a) { ; X32-LABEL: test_llvm_x86_avx512_pmovzxbq: ; X32: ## BB#0: ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: vpmovzxbq {{.*#+}} zmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero,mem[4],zero,zero,zero,zero,zero,zero,zero,mem[5],zero,zero,zero,zero,zero,zero,zero,mem[6],zero,zero,zero,zero,zero,zero,zero,mem[7],zero,zero,zero,zero,zero,zero,zero -; X32-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; X32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] -; X32-NEXT: vpand %ymm2, %ymm1, %ymm1 -; X32-NEXT: vpand %ymm2, %ymm0, %ymm0 -; X32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; X32-NEXT: vmovdqu (%eax), %xmm0 +; X32-NEXT: vpmovzxbq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero ; X32-NEXT: retl ; ; X64-LABEL: test_llvm_x86_avx512_pmovzxbq: ; X64: ## BB#0: -; X64-NEXT: vpmovzxbq {{.*#+}} zmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero,mem[4],zero,zero,zero,zero,zero,zero,zero,mem[5],zero,zero,zero,zero,zero,zero,zero,mem[6],zero,zero,zero,zero,zero,zero,zero,mem[7],zero,zero,zero,zero,zero,zero,zero -; X64-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; X64-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] -; X64-NEXT: vpand %ymm2, %ymm1, %ymm1 -; X64-NEXT: vpand %ymm2, %ymm0, %ymm0 -; X64-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; X64-NEXT: vmovdqu (%rdi), %xmm0 +; X64-NEXT: vpmovzxbq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero ; X64-NEXT: retq %1 = load <16 x i8>, <16 x i8>* %a, align 1 %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <8 x i32> diff --git a/llvm/test/CodeGen/X86/vector-sext.ll b/llvm/test/CodeGen/X86/vector-sext.ll index f332f488800d..de946c04ea0c 100644 --- a/llvm/test/CodeGen/X86/vector-sext.ll +++ b/llvm/test/CodeGen/X86/vector-sext.ll @@ -599,9 +599,7 @@ define <8 x i64> @sext_16i8_to_8i64(<16 x i8> %A) nounwind uwtable readnone ssp ; ; AVX512-LABEL: sext_16i8_to_8i64: ; AVX512: # BB#0: # %entry -; AVX512-NEXT: vpmovzxbq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vpsllq $56, %zmm0, %zmm0 -; AVX512-NEXT: vpsraq $56, %zmm0, %zmm0 +; AVX512-NEXT: vpmovsxbq %xmm0, %zmm0 ; AVX512-NEXT: retq ; ; X32-SSE41-LABEL: sext_16i8_to_8i64: diff --git a/llvm/test/CodeGen/X86/vector-zext.ll b/llvm/test/CodeGen/X86/vector-zext.ll index 72dda578338c..e679cc9bc42f 100644 --- a/llvm/test/CodeGen/X86/vector-zext.ll +++ b/llvm/test/CodeGen/X86/vector-zext.ll @@ -461,17 +461,12 @@ define <8 x i64> @zext_16i8_to_8i64(<16 x i8> %A) nounwind uwtable readnone ssp ; AVX512F-LABEL: zext_16i8_to_8i64: ; AVX512F: # BB#0: # %entry ; AVX512F-NEXT: vpmovzxbq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero -; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: zext_16i8_to_8i64: ; AVX512BW: # BB#0: # %entry +; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %ZMM0 ; AVX512BW-NEXT: vpmovzxbq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] -; AVX512BW-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX512BW-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq entry: %B = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32>