[AVX512] Add popcount support for v32i16 and v64i8.

llvm-svn: 266858
This commit is contained in:
Craig Topper 2016-04-20 05:18:55 +00:00
parent 580c1b6952
commit 99e60e9f1f
2 changed files with 72 additions and 44 deletions

View File

@ -1528,6 +1528,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SRA, VT, Custom);
setOperationAction(ISD::MLOAD, VT, Legal);
setOperationAction(ISD::MSTORE, VT, Legal);
setOperationAction(ISD::CTPOP, VT, Custom);
setOperationPromotedToType(ISD::AND, VT, MVT::v8i64);
setOperationPromotedToType(ISD::OR, VT, MVT::v8i64);
@ -20580,7 +20581,7 @@ static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, SDLoc DL,
int NumByteElts = VecSize / 8;
MVT ByteVecVT = MVT::getVectorVT(MVT::i8, NumByteElts);
SDValue In = DAG.getBitcast(ByteVecVT, Op);
SmallVector<SDValue, 16> LUTVec;
SmallVector<SDValue, 64> LUTVec;
for (int i = 0; i < NumByteElts; ++i)
LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
SDValue InRegLUT = DAG.getNode(ISD::BUILD_VECTOR, DL, ByteVecVT, LUTVec);
@ -20676,8 +20677,7 @@ static SDValue LowerVectorCTPOPBitmath(SDValue Op, SDLoc DL,
static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
// FIXME: Need to add AVX-512 support here!
assert((VT.is256BitVector() || VT.is128BitVector()) &&
assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
"Unknown CTPOP type to handle");
SDLoc DL(Op.getNode());
SDValue Op0 = Op.getOperand(0);

View File

@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512cd | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512CD
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
define <8 x i64> @testv8i64(<8 x i64> %in) nounwind {
; ALL-LABEL: testv8i64:
@ -106,51 +107,78 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind {
}
; Codegen test for @llvm.ctpop.v32i16: the <32 x i16> argument is passed
; straight to the intrinsic and the intrinsic's result is returned unchanged.
; The AVX512F checks below expect two ymm-wide sequences (one on ymm0, one on
; ymm1); the AVX512BW checks expect a single zmm-wide sequence.
; NOTE(review): check lines under the ALL prefix and under the AVX512F and
; AVX512BW prefixes both appear for this function. Presumably this text is a
; flattened diff and the ALL lines are its removed side -- confirm before
; running it as a test.
define <32 x i16> @testv32i16(<32 x i16> %in) nounwind {
; ALL-LABEL: testv32i16:
; ALL: ## BB#0:
; ALL-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; ALL-NEXT: vpand %ymm2, %ymm0, %ymm3
; ALL-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; ALL-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; ALL-NEXT: vpsrlw $4, %ymm0, %ymm0
; ALL-NEXT: vpand %ymm2, %ymm0, %ymm0
; ALL-NEXT: vpshufb %ymm0, %ymm4, %ymm0
; ALL-NEXT: vpaddb %ymm3, %ymm0, %ymm0
; ALL-NEXT: vpsllw $8, %ymm0, %ymm3
; ALL-NEXT: vpaddb %ymm0, %ymm3, %ymm0
; ALL-NEXT: vpsrlw $8, %ymm0, %ymm0
; ALL-NEXT: vpand %ymm2, %ymm1, %ymm3
; ALL-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; ALL-NEXT: vpsrlw $4, %ymm1, %ymm1
; ALL-NEXT: vpand %ymm2, %ymm1, %ymm1
; ALL-NEXT: vpshufb %ymm1, %ymm4, %ymm1
; ALL-NEXT: vpaddb %ymm3, %ymm1, %ymm1
; ALL-NEXT: vpsllw $8, %ymm1, %ymm2
; ALL-NEXT: vpaddb %ymm1, %ymm2, %ymm1
; ALL-NEXT: vpsrlw $8, %ymm1, %ymm1
; ALL-NEXT: retq
; AVX512F-LABEL: testv32i16:
; AVX512F: ## BB#0:
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm3
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vpshufb %ymm0, %ymm4, %ymm0
; AVX512F-NEXT: vpaddb %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm3
; AVX512F-NEXT: vpaddb %ymm0, %ymm3, %ymm0
; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm1
; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vpsllw $8, %ymm1, %ymm2
; AVX512F-NEXT: vpaddb %ymm1, %ymm2, %ymm1
; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: testv32i16:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0
; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: vpsllw $8, %zmm0, %zmm1
; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
; AVX512BW-NEXT: retq
%out = call <32 x i16> @llvm.ctpop.v32i16(<32 x i16> %in)
ret <32 x i16> %out
}
; Codegen test for @llvm.ctpop.v64i8: the <64 x i8> argument is passed
; straight to the intrinsic and the intrinsic's result is returned unchanged.
; The AVX512F checks below expect two ymm-wide sequences (one on ymm0, one on
; ymm1); the AVX512BW checks expect a single zmm-wide sequence.
; NOTE(review): check lines under the ALL prefix and under the AVX512F and
; AVX512BW prefixes both appear for this function. Presumably this text is a
; flattened diff and the ALL lines are its removed side -- confirm before
; running it as a test.
define <64 x i8> @testv64i8(<64 x i8> %in) nounwind {
; ALL-LABEL: testv64i8:
; ALL: ## BB#0:
; ALL-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; ALL-NEXT: vpand %ymm2, %ymm0, %ymm3
; ALL-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; ALL-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; ALL-NEXT: vpsrlw $4, %ymm0, %ymm0
; ALL-NEXT: vpand %ymm2, %ymm0, %ymm0
; ALL-NEXT: vpshufb %ymm0, %ymm4, %ymm0
; ALL-NEXT: vpaddb %ymm3, %ymm0, %ymm0
; ALL-NEXT: vpand %ymm2, %ymm1, %ymm3
; ALL-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; ALL-NEXT: vpsrlw $4, %ymm1, %ymm1
; ALL-NEXT: vpand %ymm2, %ymm1, %ymm1
; ALL-NEXT: vpshufb %ymm1, %ymm4, %ymm1
; ALL-NEXT: vpaddb %ymm3, %ymm1, %ymm1
; ALL-NEXT: retq
; AVX512F-LABEL: testv64i8:
; AVX512F: ## BB#0:
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm3
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vpshufb %ymm0, %ymm4, %ymm0
; AVX512F-NEXT: vpaddb %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm1
; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: testv64i8:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0
; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: retq
%out = call <64 x i8> @llvm.ctpop.v64i8(<64 x i8> %in)
ret <64 x i8> %out
}