[X86][SSE] combineTruncateWithSat - use truncateVectorWithPACK down to 64-bit subvectors

Add support for chaining PACKSS/PACKUS down to 64-bit vectors by using only a single 128-bit input.

llvm-svn: 325494
Simon Pilgrim 2018-02-19 13:29:20 +00:00
parent ab8e393f62
commit c302a581a0
3 changed files with 44 additions and 182 deletions
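For reference, an illustrative hand-written IR sketch (not taken verbatim from the test files, but mirroring the trunc_ssat_v8i32_v8i8_store coverage updated below): a v8i32 value clamped to [-128, 127] and then truncated to v8i8 for a 64-bit store. With 64-bit destinations now handled, SSE2 can lower the whole clamp-and-truncate to packssdw + packsswb + movq instead of the previous pcmpgtd/pand/pandn select chains.

define void @trunc_ssat_v8i32_v8i8_store_sketch(<8 x i32> %a0, <8 x i8>* %p1) {
  ; Clamp each i32 lane to [-128, 127] via the smin/smax select idiom...
  %lt = icmp slt <8 x i32> %a0, <i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
  %min = select <8 x i1> %lt, <8 x i32> %a0, <8 x i32> <i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
  %gt = icmp sgt <8 x i32> %min, <i32 -128, i32 -128, i32 -128, i32 -128, i32 -128, i32 -128, i32 -128, i32 -128>
  %max = select <8 x i1> %gt, <8 x i32> %min, <8 x i32> <i32 -128, i32 -128, i32 -128, i32 -128, i32 -128, i32 -128, i32 -128, i32 -128>
  ; ...then truncate to bytes and store the low 64 bits.
  %t = trunc <8 x i32> %max to <8 x i8>
  store <8 x i8> %t, <8 x i8>* %p1
  ret void
}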


@@ -16597,8 +16597,8 @@ static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
   assert((Opcode == X86ISD::PACKSS || Opcode == X86ISD::PACKUS) &&
          "Unexpected PACK opcode");
-  // Requires SSE2 but AVX512 has fast truncate.
-  if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
+  // Requires SSE2 but AVX512 has fast vector truncate.
+  if (!Subtarget.hasSSE2() || Subtarget.hasAVX512() || !DstVT.isVector())
     return SDValue();
   EVT SrcVT = In.getValueType();
@@ -16607,25 +16607,23 @@ static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
   if (SrcVT == DstVT)
     return In;
-  // We only support vector truncation to 128bits or greater from a
-  // 256bits or greater source.
+  // We only support vector truncation to 64bits or greater from a
+  // 128bits or greater source.
   unsigned DstSizeInBits = DstVT.getSizeInBits();
   unsigned SrcSizeInBits = SrcVT.getSizeInBits();
-  if ((DstSizeInBits % 128) != 0 || (SrcSizeInBits % 256) != 0)
+  if ((DstSizeInBits % 64) != 0 || (SrcSizeInBits % 128) != 0)
     return SDValue();
+  unsigned NumElems = SrcVT.getVectorNumElements();
+  if (!isPowerOf2_32(NumElems))
+    return SDValue();
   LLVMContext &Ctx = *DAG.getContext();
-  unsigned NumElems = SrcVT.getVectorNumElements();
   assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
   assert(SrcSizeInBits > DstSizeInBits && "Illegal truncation");
   EVT PackedSVT = EVT::getIntegerVT(Ctx, SrcVT.getScalarSizeInBits() / 2);
-  // Extract lower/upper subvectors.
-  unsigned NumSubElts = NumElems / 2;
-  SDValue Lo = extractSubVector(In, 0 * NumSubElts, DAG, DL, SrcSizeInBits / 2);
-  SDValue Hi = extractSubVector(In, 1 * NumSubElts, DAG, DL, SrcSizeInBits / 2);
   // Pack to the largest type possible:
   // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
   EVT InVT = MVT::i16, OutVT = MVT::i8;
@@ -16635,12 +16633,27 @@ static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
     OutVT = MVT::i16;
   }
+  // 128bit -> 64bit truncate - PACK 128-bit src in the lower subvector.
+  if (SrcVT.is128BitVector()) {
+    InVT = EVT::getVectorVT(Ctx, InVT, 128 / InVT.getSizeInBits());
+    OutVT = EVT::getVectorVT(Ctx, OutVT, 128 / OutVT.getSizeInBits());
+    SDValue Res = DAG.getNode(Opcode, DL, OutVT,
+                              DAG.getBitcast(InVT, In), DAG.getUNDEF(InVT));
+    Res = extractSubVector(Res, 0, DAG, DL, 64);
+    return DAG.getBitcast(DstVT, Res);
+  }
+  // Extract lower/upper subvectors.
+  unsigned NumSubElts = NumElems / 2;
+  SDValue Lo = extractSubVector(In, 0 * NumSubElts, DAG, DL, SrcSizeInBits / 2);
+  SDValue Hi = extractSubVector(In, 1 * NumSubElts, DAG, DL, SrcSizeInBits / 2);
   unsigned SubSizeInBits = SrcSizeInBits / 2;
   InVT = EVT::getVectorVT(Ctx, InVT, SubSizeInBits / InVT.getSizeInBits());
   OutVT = EVT::getVectorVT(Ctx, OutVT, SubSizeInBits / OutVT.getSizeInBits());
   // 256bit -> 128bit truncate - PACK lower/upper 128-bit subvectors.
-  if (SrcVT.is256BitVector()) {
+  if (SrcVT.is256BitVector() && DstVT.is128BitVector()) {
     Lo = DAG.getBitcast(InVT, Lo);
     Hi = DAG.getBitcast(InVT, Hi);
     SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
@@ -16669,7 +16682,7 @@ static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
   }
   // Recursively pack lower/upper subvectors, concat result and pack again.
-  assert(SrcSizeInBits >= 512 && "Expected 512-bit vector or greater");
+  assert(SrcSizeInBits >= 256 && "Expected 256-bit vector or greater");
   EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumSubElts);
   Lo = truncateVectorWithPACK(Opcode, PackedVT, Lo, DL, DAG, Subtarget);
   Hi = truncateVectorWithPACK(Opcode, PackedVT, Hi, DL, DAG, Subtarget);


@@ -3099,98 +3099,27 @@ define <8 x i8> @trunc_packus_v8i32_v8i8(<8 x i32> %a0) {
 }
 define void @trunc_packus_v8i32_v8i8_store(<8 x i32> %a0, <8 x i8> *%p1) {
-; SSE2-LABEL: trunc_packus_v8i32_v8i8_store:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255]
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm1
-; SSE2-NEXT: pandn %xmm2, %xmm3
-; SSE2-NEXT: por %xmm1, %xmm3
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: pandn %xmm2, %xmm1
-; SSE2-NEXT: por %xmm0, %xmm1
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm4
-; SSE2-NEXT: movdqa %xmm3, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm5
-; SSE2-NEXT: pand %xmm2, %xmm5
-; SSE2-NEXT: pand %xmm3, %xmm5
-; SSE2-NEXT: pand %xmm2, %xmm4
-; SSE2-NEXT: pand %xmm1, %xmm4
-; SSE2-NEXT: packuswb %xmm5, %xmm4
-; SSE2-NEXT: packuswb %xmm4, %xmm4
-; SSE2-NEXT: movq %xmm4, (%rdi)
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: trunc_packus_v8i32_v8i8_store:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255]
-; SSSE3-NEXT: movdqa %xmm2, %xmm3
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm3
-; SSSE3-NEXT: pand %xmm3, %xmm1
-; SSSE3-NEXT: pandn %xmm2, %xmm3
-; SSSE3-NEXT: por %xmm1, %xmm3
-; SSSE3-NEXT: movdqa %xmm2, %xmm1
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm1
-; SSSE3-NEXT: pand %xmm1, %xmm0
-; SSSE3-NEXT: pandn %xmm2, %xmm1
-; SSSE3-NEXT: por %xmm0, %xmm1
-; SSSE3-NEXT: pxor %xmm0, %xmm0
-; SSSE3-NEXT: movdqa %xmm1, %xmm2
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm2
-; SSSE3-NEXT: pand %xmm1, %xmm2
-; SSSE3-NEXT: movdqa %xmm3, %xmm1
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm1
-; SSSE3-NEXT: pand %xmm3, %xmm1
-; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; SSSE3-NEXT: pshufb %xmm0, %xmm1
-; SSSE3-NEXT: pshufb %xmm0, %xmm2
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
-; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; SSSE3-NEXT: movq %xmm2, (%rdi)
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: trunc_packus_v8i32_v8i8_store:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255]
-; SSE41-NEXT: pminsd %xmm2, %xmm0
-; SSE41-NEXT: pminsd %xmm2, %xmm1
-; SSE41-NEXT: pxor %xmm2, %xmm2
-; SSE41-NEXT: pmaxsd %xmm2, %xmm1
-; SSE41-NEXT: pmaxsd %xmm2, %xmm0
-; SSE41-NEXT: packssdw %xmm1, %xmm0
-; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; SSE41-NEXT: movq %xmm0, (%rdi)
-; SSE41-NEXT: retq
+; SSE-LABEL: trunc_packus_v8i32_v8i8_store:
+; SSE: # %bb.0:
+; SSE-NEXT: packssdw %xmm1, %xmm0
+; SSE-NEXT: packuswb %xmm0, %xmm0
+; SSE-NEXT: movq %xmm0, (%rdi)
+; SSE-NEXT: retq
 ;
 ; AVX1-LABEL: trunc_packus_v8i32_v8i8_store:
 ; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255]
-; AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpmaxsd %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpackssdw %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
 ; AVX1-NEXT: vmovq %xmm0, (%rdi)
 ; AVX1-NEXT: vzeroupper
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: trunc_packus_v8i32_v8i8_store:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255]
-; AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
 ; AVX2-NEXT: vmovq %xmm0, (%rdi)
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq


@@ -2959,93 +2959,17 @@ define <8 x i8> @trunc_ssat_v8i32_v8i8(<8 x i32> %a0) {
 }
 define void @trunc_ssat_v8i32_v8i8_store(<8 x i32> %a0, <8 x i8> *%p1) {
-; SSE2-LABEL: trunc_ssat_v8i32_v8i8_store:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [127,127,127,127]
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm1
-; SSE2-NEXT: pandn %xmm2, %xmm3
-; SSE2-NEXT: por %xmm1, %xmm3
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: pandn %xmm2, %xmm1
-; SSE2-NEXT: por %xmm0, %xmm1
-; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [4294967168,4294967168,4294967168,4294967168]
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: pandn %xmm0, %xmm2
-; SSE2-NEXT: por %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm3, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm3
-; SSE2-NEXT: pandn %xmm0, %xmm1
-; SSE2-NEXT: por %xmm3, %xmm1
-; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: pand %xmm0, %xmm2
-; SSE2-NEXT: packuswb %xmm1, %xmm2
-; SSE2-NEXT: packuswb %xmm2, %xmm2
-; SSE2-NEXT: movq %xmm2, (%rdi)
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: trunc_ssat_v8i32_v8i8_store:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [127,127,127,127]
-; SSSE3-NEXT: movdqa %xmm2, %xmm3
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm3
-; SSSE3-NEXT: pand %xmm3, %xmm1
-; SSSE3-NEXT: pandn %xmm2, %xmm3
-; SSSE3-NEXT: por %xmm1, %xmm3
-; SSSE3-NEXT: movdqa %xmm2, %xmm1
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm1
-; SSSE3-NEXT: pand %xmm1, %xmm0
-; SSSE3-NEXT: pandn %xmm2, %xmm1
-; SSSE3-NEXT: por %xmm0, %xmm1
-; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [4294967168,4294967168,4294967168,4294967168]
-; SSSE3-NEXT: movdqa %xmm1, %xmm2
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm2
-; SSSE3-NEXT: pand %xmm2, %xmm1
-; SSSE3-NEXT: pandn %xmm0, %xmm2
-; SSSE3-NEXT: por %xmm1, %xmm2
-; SSSE3-NEXT: movdqa %xmm3, %xmm1
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm1
-; SSSE3-NEXT: pand %xmm1, %xmm3
-; SSSE3-NEXT: pandn %xmm0, %xmm1
-; SSSE3-NEXT: por %xmm3, %xmm1
-; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; SSSE3-NEXT: pshufb %xmm0, %xmm1
-; SSSE3-NEXT: pshufb %xmm0, %xmm2
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
-; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; SSSE3-NEXT: movq %xmm2, (%rdi)
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: trunc_ssat_v8i32_v8i8_store:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [127,127,127,127]
-; SSE41-NEXT: pminsd %xmm2, %xmm0
-; SSE41-NEXT: pminsd %xmm2, %xmm1
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [4294967168,4294967168,4294967168,4294967168]
-; SSE41-NEXT: pmaxsd %xmm2, %xmm1
-; SSE41-NEXT: pmaxsd %xmm2, %xmm0
-; SSE41-NEXT: packssdw %xmm1, %xmm0
-; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; SSE41-NEXT: movq %xmm0, (%rdi)
-; SSE41-NEXT: retq
+; SSE-LABEL: trunc_ssat_v8i32_v8i8_store:
+; SSE: # %bb.0:
+; SSE-NEXT: packssdw %xmm1, %xmm0
+; SSE-NEXT: packsswb %xmm0, %xmm0
+; SSE-NEXT: movq %xmm0, (%rdi)
+; SSE-NEXT: retq
 ;
 ; AVX1-LABEL: trunc_ssat_v8i32_v8i8_store:
 ; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [127,127,127,127]
-; AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [4294967168,4294967168,4294967168,4294967168]
-; AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpmaxsd %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpackssdw %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
 ; AVX1-NEXT: vmovq %xmm0, (%rdi)
 ; AVX1-NEXT: vzeroupper
@@ -3053,10 +2977,6 @@ define void @trunc_ssat_v8i32_v8i8_store(<8 x i32> %a0, <8 x i8> *%p1) {
 ;
 ; AVX2-LABEL: trunc_ssat_v8i32_v8i8_store:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [127,127,127,127,127,127,127,127]
-; AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168]
-; AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0