[X86][SSE] Add support for target shuffle constant folding

Initial support for target shuffle constant folding in cases where all shuffle inputs are constant. We may be able to relax this and merge shuffles with only some constant inputs in the future.
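
As a rough, self-contained sketch of the idea (hypothetical code, not the actual DAG combine, which works on APInt bit patterns as shown in the patch below): once every shuffle input is constant, the shuffle can be evaluated directly by indexing the constant elements with the resolved mask, with the undef and zero sentinels handled specially.

// Hypothetical illustration only: evaluate a shuffle whose inputs are all
// constant by indexing the constant elements with the resolved mask.
// Sentinel values mirror the SM_SentinelUndef/SM_SentinelZero convention.
#include <cassert>
#include <cstdint>
#include <optional>
#include <vector>

constexpr int SentinelUndef = -1;
constexpr int SentinelZero = -2;

// An element is either a known constant or undef (std::nullopt).
using ConstVec = std::vector<std::optional<uint64_t>>;

static ConstVec foldConstantShuffle(const std::vector<ConstVec> &Ops,
                                    const std::vector<int> &Mask) {
  const size_t NumElts = Mask.size();
  ConstVec Result(NumElts);
  for (size_t i = 0; i != NumElts; ++i) {
    int M = Mask[i];
    if (M == SentinelUndef)
      continue;                              // element stays undef
    if (M == SentinelZero) {
      Result[i] = 0;                         // forced-zero lane
      continue;
    }
    size_t Op = size_t(M) / NumElts;         // which constant input
    size_t Idx = size_t(M) % NumElts;        // which element of that input
    assert(Op < Ops.size() && Idx < Ops[Op].size());
    Result[i] = Ops[Op][Idx];                // undef propagates, constants copy
  }
  return Result;
}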

I've added the helper function getTargetConstantBitsFromNode (based on a similar function in X86ShuffleDecodeConstantPool.cpp), which could be reused for other cases requiring constant vector extraction.
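
A minimal sketch of what that helper does conceptually, assuming element widths of at most 64 bits and a destination width that divides the total size (the real code uses APInt and also handles undef, integer and floating-point constant pool entries): pack the source elements into one flat bit pattern, then re-slice it at the requested element width, marking a result element undef only when every bit it covers was undef.

// Hypothetical sketch, not the LLVM API: repack a constant vector's elements
// from a source element width to a destination element width while tracking
// undef, in the spirit of getTargetConstantBitsFromNode.
#include <cstdint>
#include <vector>

struct SlicedConstant {
  std::vector<bool> UndefElts;   // per result element: were all covered bits undef?
  std::vector<uint64_t> EltBits; // per result element: the constant bits
};

static SlicedConstant sliceConstantBits(const std::vector<uint64_t> &SrcElts,
                                        const std::vector<bool> &SrcUndef,
                                        unsigned SrcEltBits, unsigned DstEltBits) {
  const unsigned SizeInBits = SrcEltBits * unsigned(SrcElts.size());
  // Pack every source element (little-endian element order) into flat bit arrays.
  std::vector<bool> Bits(SizeInBits, false), Undef(SizeInBits, false);
  for (unsigned i = 0; i != SrcElts.size(); ++i)
    for (unsigned b = 0; b != SrcEltBits; ++b) {
      Bits[i * SrcEltBits + b] = (SrcElts[i] >> b) & 1;
      Undef[i * SrcEltBits + b] = SrcUndef[i];
    }
  // Re-slice the flat bits at the destination element width.
  const unsigned NumDstElts = SizeInBits / DstEltBits;
  SlicedConstant Out{std::vector<bool>(NumDstElts, false),
                     std::vector<uint64_t>(NumDstElts, 0)};
  for (unsigned i = 0; i != NumDstElts; ++i) {
    bool AllUndef = true;
    uint64_t V = 0;
    for (unsigned b = 0; b != DstEltBits; ++b) {
      AllUndef &= bool(Undef[i * DstEltBits + b]);
      V |= uint64_t(Bits[i * DstEltBits + b]) << b;
    }
    // Only an element whose bits are all undef stays undef; partially undef
    // elements are treated as constants with the undef bits cleared to zero.
    Out.UndefElts[i] = AllUndef;
    Out.EltBits[i] = AllUndef ? 0 : V;
  }
  return Out;
}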

Differential Revision: https://reviews.llvm.org/D27220

llvm-svn: 288250
Simon Pilgrim 2016-11-30 16:33:46 +00:00
parent c6d8b4c044
commit 288c088c17
9 changed files with 217 additions and 95 deletions


@@ -4622,9 +4622,9 @@ static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
return ConstsNode;
}
static SDValue getConstVector(ArrayRef<APInt> Values, SmallBitVector &Undefs,
static SDValue getConstVector(ArrayRef<APInt> Bits, SmallBitVector &Undefs,
MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
assert(Values.size() == Undefs.size() && "Unequal constant and undef arrays");
assert(Bits.size() == Undefs.size() && "Unequal constant and undef arrays");
SmallVector<SDValue, 32> Ops;
bool Split = false;
@@ -4637,16 +4637,22 @@ static SDValue getConstVector(ArrayRef<APInt> Values, SmallBitVector &Undefs,
}
MVT EltVT = ConstVecVT.getVectorElementType();
for (unsigned i = 0, e = Values.size(); i != e; ++i) {
for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
if (Undefs[i]) {
Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
continue;
}
const APInt &V = Values[i];
const APInt &V = Bits[i];
assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
if (Split) {
Ops.push_back(DAG.getConstant(V.trunc(32), dl, EltVT));
Ops.push_back(DAG.getConstant(V.lshr(32).trunc(32), dl, EltVT));
} else if (EltVT == MVT::f32) {
APFloat FV(APFloat::IEEEsingle, V);
Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
} else if (EltVT == MVT::f64) {
APFloat FV(APFloat::IEEEdouble, V);
Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
} else {
Ops.push_back(DAG.getConstant(V, dl, EltVT));
}
@@ -5037,6 +5043,77 @@ static const Constant *getTargetConstantFromNode(SDValue Op) {
return dyn_cast<Constant>(CNode->getConstVal());
}
// Extract constant bits from constant pool vector.
static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
SmallBitVector &UndefElts,
SmallVectorImpl<APInt> &EltBits) {
assert(UndefElts.empty() && "Expected an empty UndefElts vector");
assert(EltBits.empty() && "Expected an empty EltBits vector");
EVT VT = Op.getValueType();
unsigned SizeInBits = VT.getSizeInBits();
assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!");
unsigned NumElts = SizeInBits / EltSizeInBits;
auto *Cst = getTargetConstantFromNode(Op);
if (!Cst)
return false;
Type *CstTy = Cst->getType();
if (!CstTy->isVectorTy() || (SizeInBits != CstTy->getPrimitiveSizeInBits()))
return false;
// Extract all the undef/constant element data and pack into single bitsets.
APInt UndefBits(SizeInBits, 0);
APInt MaskBits(SizeInBits, 0);
unsigned CstEltSizeInBits = CstTy->getScalarSizeInBits();
for (unsigned i = 0, e = CstTy->getVectorNumElements(); i != e; ++i) {
auto *COp = Cst->getAggregateElement(i);
if (!COp ||
!(isa<UndefValue>(COp) || isa<ConstantInt>(COp) ||
isa<ConstantFP>(COp)))
return false;
if (isa<UndefValue>(COp)) {
APInt EltUndef = APInt::getLowBitsSet(SizeInBits, CstEltSizeInBits);
UndefBits |= EltUndef.shl(i * CstEltSizeInBits);
continue;
}
APInt Bits;
if (auto *CInt = dyn_cast<ConstantInt>(COp))
Bits = CInt->getValue();
else if (auto *CFP = dyn_cast<ConstantFP>(COp))
Bits = CFP->getValueAPF().bitcastToAPInt();
Bits = Bits.zextOrTrunc(SizeInBits);
MaskBits |= Bits.shl(i * CstEltSizeInBits);
}
UndefElts = SmallBitVector(NumElts, false);
EltBits.resize(NumElts, APInt(EltSizeInBits, 0));
// Now extract the undef/constant bit data into the target elts.
for (unsigned i = 0; i != NumElts; ++i) {
APInt UndefEltBits = UndefBits.lshr(i * EltSizeInBits);
UndefEltBits = UndefEltBits.zextOrTrunc(EltSizeInBits);
// Only treat the element as UNDEF if all bits are UNDEF, otherwise
// treat it as zero.
if (UndefEltBits.isAllOnesValue()) {
UndefElts[i] = true;
continue;
}
APInt Bits = MaskBits.lshr(i * EltSizeInBits);
Bits = Bits.zextOrTrunc(EltSizeInBits);
EltBits[i] = Bits.getZExtValue();
}
return true;
}
static bool getTargetShuffleMaskIndices(SDValue MaskNode,
unsigned MaskEltSizeInBits,
SmallVectorImpl<uint64_t> &RawMask) {
@@ -26308,6 +26385,93 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
return false;
}
// Attempt to constant fold all of the constant source ops.
// Returns true if the entire shuffle is folded to a constant.
// TODO: Extend this to merge multiple constant Ops and update the mask.
static bool combineX86ShufflesConstants(const SmallVectorImpl<SDValue> &Ops,
ArrayRef<int> Mask, SDValue Root,
bool HasVariableMask, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
MVT VT = Root.getSimpleValueType();
unsigned SizeInBits = VT.getSizeInBits();
unsigned NumMaskElts = Mask.size();
unsigned MaskSizeInBits = SizeInBits / NumMaskElts;
unsigned NumOps = Ops.size();
// Extract constant bits from each source op.
bool OneUseConstantOp = false;
SmallVector<SmallBitVector, 4> UndefEltsOps(NumOps);
SmallVector<SmallVector<APInt, 8>, 4> RawBitsOps(NumOps);
for (unsigned i = 0; i != NumOps; ++i) {
SDValue SrcOp = Ops[i];
OneUseConstantOp |= SrcOp.hasOneUse();
if (!getTargetConstantBitsFromNode(SrcOp, MaskSizeInBits, UndefEltsOps[i],
RawBitsOps[i]))
return false;
}
// Only fold if at least one of the constants is only used once or
// the combined shuffle has included a variable mask shuffle, this
// is to avoid constant pool bloat.
if (!OneUseConstantOp && !HasVariableMask)
return false;
// Shuffle the constant bits according to the mask.
SmallBitVector UndefElts(NumMaskElts, false);
SmallBitVector ZeroElts(NumMaskElts, false);
SmallBitVector ConstantElts(NumMaskElts, false);
SmallVector<APInt, 8> ConstantBitData(NumMaskElts,
APInt::getNullValue(MaskSizeInBits));
for (unsigned i = 0; i != NumMaskElts; ++i) {
int M = Mask[i];
if (M == SM_SentinelUndef) {
UndefElts[i] = true;
continue;
} else if (M == SM_SentinelZero) {
ZeroElts[i] = true;
continue;
}
assert(0 <= M && M < (int)(NumMaskElts * NumOps));
unsigned SrcOpIdx = (unsigned)M / NumMaskElts;
unsigned SrcMaskIdx = (unsigned)M % NumMaskElts;
auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];
if (SrcUndefElts[SrcMaskIdx]) {
UndefElts[i] = true;
continue;
}
auto &SrcEltBits = RawBitsOps[SrcOpIdx];
APInt &Bits = SrcEltBits[SrcMaskIdx];
if (!Bits) {
ZeroElts[i] = true;
continue;
}
ConstantElts[i] = true;
ConstantBitData[i] = Bits;
}
assert((UndefElts | ZeroElts | ConstantElts).count() == NumMaskElts);
// Create the constant data.
MVT MaskSVT;
if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64))
MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits);
else
MaskSVT = MVT::getIntegerVT(MaskSizeInBits);
MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);
SDLoc DL(Root);
SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
DCI.AddToWorklist(CstOp.getNode());
DCI.CombineTo(Root.getNode(), DAG.getBitcast(VT, CstOp));
return true;
}
/// \brief Fully generic combining of x86 shuffle instructions.
///
/// This should be the last combine run over the x86 shuffle instructions. Once
@@ -26491,6 +26655,11 @@ static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps,
HasVariableMask, DAG, DCI, Subtarget))
return true;
// Attempt to constant fold all of the constant source ops.
if (combineX86ShufflesConstants(Ops, Mask, Root, HasVariableMask, DAG, DCI,
Subtarget))
return true;
// We can only combine unary and binary shuffle mask cases.
if (Ops.size() > 2)
return false;


@@ -361,12 +361,12 @@ define <4 x float> @combine_vpermilvar_4f32_as_insertps(<4 x float> %a0) {
define <2 x double> @constant_fold_vpermilvar_pd() {
; X32-LABEL: constant_fold_vpermilvar_pd:
; X32: # BB#0:
; X32-NEXT: vpermilpd {{.*#+}} xmm0 = mem[1,0]
; X32-NEXT: vmovaps {{.*#+}} xmm0 = [2.000000e+00,1.000000e+00]
; X32-NEXT: retl
;
; X64-LABEL: constant_fold_vpermilvar_pd:
; X64: # BB#0:
; X64-NEXT: vpermilpd {{.*#+}} xmm0 = mem[1,0]
; X64-NEXT: vmovaps {{.*#+}} xmm0 = [2.000000e+00,1.000000e+00]
; X64-NEXT: retq
%1 = call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> <double 1.0, double 2.0>, <2 x i64> <i64 2, i64 0>)
ret <2 x double> %1
@@ -375,12 +375,12 @@ define <2 x double> @constant_fold_vpermilvar_pd() {
define <4 x double> @constant_fold_vpermilvar_pd_256() {
; X32-LABEL: constant_fold_vpermilvar_pd_256:
; X32: # BB#0:
; X32-NEXT: vpermilpd {{.*#+}} ymm0 = mem[1,0,2,3]
; X32-NEXT: vmovaps {{.*#+}} ymm0 = [2.000000e+00,1.000000e+00,3.000000e+00,4.000000e+00]
; X32-NEXT: retl
;
; X64-LABEL: constant_fold_vpermilvar_pd_256:
; X64: # BB#0:
; X64-NEXT: vpermilpd {{.*#+}} ymm0 = mem[1,0,2,3]
; X64-NEXT: vmovaps {{.*#+}} ymm0 = [2.000000e+00,1.000000e+00,3.000000e+00,4.000000e+00]
; X64-NEXT: retq
%1 = call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> <double 1.0, double 2.0, double 3.0, double 4.0>, <4 x i64> <i64 2, i64 0, i64 0, i64 2>)
ret <4 x double> %1
@@ -389,12 +389,12 @@ define <4 x double> @constant_fold_vpermilvar_pd_256() {
define <4 x float> @constant_fold_vpermilvar_ps() {
; X32-LABEL: constant_fold_vpermilvar_ps:
; X32: # BB#0:
; X32-NEXT: vpermilps {{.*#+}} xmm0 = mem[3,0,2,1]
; X32-NEXT: vmovaps {{.*#+}} xmm0 = [4.000000e+00,1.000000e+00,3.000000e+00,2.000000e+00]
; X32-NEXT: retl
;
; X64-LABEL: constant_fold_vpermilvar_ps:
; X64: # BB#0:
; X64-NEXT: vpermilps {{.*#+}} xmm0 = mem[3,0,2,1]
; X64-NEXT: vmovaps {{.*#+}} xmm0 = [4.000000e+00,1.000000e+00,3.000000e+00,2.000000e+00]
; X64-NEXT: retq
%1 = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, <4 x i32> <i32 3, i32 0, i32 2, i32 1>)
ret <4 x float> %1
@@ -403,14 +403,12 @@ define <4 x float> @constant_fold_vpermilvar_ps() {
define <8 x float> @constant_fold_vpermilvar_ps_256() {
; X32-LABEL: constant_fold_vpermilvar_ps_256:
; X32: # BB#0:
; X32-NEXT: vmovaps {{.*#+}} ymm0 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00]
; X32-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,1,4,5,5,5]
; X32-NEXT: vmovaps {{.*#+}} ymm0 = [1.000000e+00,1.000000e+00,3.000000e+00,2.000000e+00,5.000000e+00,6.000000e+00,6.000000e+00,6.000000e+00]
; X32-NEXT: retl
;
; X64-LABEL: constant_fold_vpermilvar_ps_256:
; X64: # BB#0:
; X64-NEXT: vmovaps {{.*#+}} ymm0 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00]
; X64-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,1,4,5,5,5]
; X64-NEXT: vmovaps {{.*#+}} ymm0 = [1.000000e+00,1.000000e+00,3.000000e+00,2.000000e+00,5.000000e+00,6.000000e+00,6.000000e+00,6.000000e+00]
; X64-NEXT: retq
%1 = call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, <8 x i32> <i32 4, i32 0, i32 2, i32 1, i32 0, i32 1, i32 1, i32 1>)
ret <8 x float> %1


@@ -685,14 +685,12 @@ define <32 x i8> @combine_psrlq_pshufb(<4 x i64> %a0) {
define <8 x i32> @constant_fold_permd() {
; X32-LABEL: constant_fold_permd:
; X32: # BB#0:
; X32-NEXT: vmovdqa {{.*#+}} ymm0 = [4,6,2,1,7,1,5,0]
; X32-NEXT: vpermd {{\.LCPI.*}}, %ymm0, %ymm0
; X32-NEXT: vmovaps {{.*#+}} ymm0 = [5,7,3,2,8,2,6,1]
; X32-NEXT: retl
;
; X64-LABEL: constant_fold_permd:
; X64: # BB#0:
; X64-NEXT: vmovdqa {{.*#+}} ymm0 = [4,6,2,1,7,1,5,0]
; X64-NEXT: vpermd {{.*}}(%rip), %ymm0, %ymm0
; X64-NEXT: vmovaps {{.*#+}} ymm0 = [5,7,3,2,8,2,6,1]
; X64-NEXT: retq
%1 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>, <8 x i32> <i32 4, i32 6, i32 2, i32 1, i32 7, i32 1, i32 5, i32 0>)
ret <8 x i32> %1
@@ -701,14 +699,12 @@ define <8 x i32> @constant_fold_permd() {
define <8 x float> @constant_fold_permps() {
; X32-LABEL: constant_fold_permps:
; X32: # BB#0:
; X32-NEXT: vmovaps {{.*#+}} ymm0 = [4,6,2,1,7,1,5,0]
; X32-NEXT: vpermps {{\.LCPI.*}}, %ymm0, %ymm0
; X32-NEXT: vmovaps {{.*#+}} ymm0 = [5.000000e+00,7.000000e+00,3.000000e+00,2.000000e+00,8.000000e+00,2.000000e+00,6.000000e+00,1.000000e+00]
; X32-NEXT: retl
;
; X64-LABEL: constant_fold_permps:
; X64: # BB#0:
; X64-NEXT: vmovaps {{.*#+}} ymm0 = [4,6,2,1,7,1,5,0]
; X64-NEXT: vpermps {{.*}}(%rip), %ymm0, %ymm0
; X64-NEXT: vmovaps {{.*#+}} ymm0 = [5.000000e+00,7.000000e+00,3.000000e+00,2.000000e+00,8.000000e+00,2.000000e+00,6.000000e+00,1.000000e+00]
; X64-NEXT: retq
%1 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, <8 x i32> <i32 4, i32 6, i32 2, i32 1, i32 7, i32 1, i32 5, i32 0>)
ret <8 x float> %1
@@ -717,14 +713,12 @@ define <8 x float> @constant_fold_permps() {
define <32 x i8> @constant_fold_pshufb_256() {
; X32-LABEL: constant_fold_pshufb_256:
; X32: # BB#0:
; X32-NEXT: vmovdqa {{.*#+}} ymm0 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,0,255,254,253,252,251,250,249,248,247,246,245,244,243,242,241]
; X32-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1],zero,zero,zero,ymm0[u,u],zero,zero,ymm0[15],zero,zero,zero,zero,zero,ymm0[7,6,17],zero,zero,zero,ymm0[u,u],zero,zero,ymm0[31],zero,zero,zero,zero,zero,ymm0[23,22]
; X32-NEXT: vmovaps {{.*#+}} ymm0 = <14,0,0,0,u,u,0,0,0,0,0,0,0,0,8,9,255,0,0,0,u,u,0,0,241,0,0,0,0,0,249,250>
; X32-NEXT: retl
;
; X64-LABEL: constant_fold_pshufb_256:
; X64: # BB#0:
; X64-NEXT: vmovdqa {{.*#+}} ymm0 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,0,255,254,253,252,251,250,249,248,247,246,245,244,243,242,241]
; X64-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1],zero,zero,zero,ymm0[u,u],zero,zero,ymm0[15],zero,zero,zero,zero,zero,ymm0[7,6,17],zero,zero,zero,ymm0[u,u],zero,zero,ymm0[31],zero,zero,zero,zero,zero,ymm0[23,22]
; X64-NEXT: vmovaps {{.*#+}} ymm0 = <14,0,0,0,u,u,0,0,0,0,0,0,0,0,8,9,255,0,0,0,u,u,0,0,241,0,0,0,0,0,249,250>
; X64-NEXT: retq
%1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> <i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 -1, i8 -2, i8 -3, i8 -4, i8 -5, i8 -6, i8 -7, i8 -8, i8 -9, i8 -10, i8 -11, i8 -12, i8 -13, i8 -14, i8 -15>, <32 x i8> <i8 1, i8 -1, i8 -1, i8 -1, i8 undef, i8 undef, i8 -1, i8 -1, i8 15, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 7, i8 6, i8 1, i8 -1, i8 -1, i8 -1, i8 undef, i8 undef, i8 -1, i8 -1, i8 15, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 7, i8 6>)
ret <32 x i8> %1


@@ -511,14 +511,12 @@ define <16 x i8> @combine_unpckl_arg1_pshufb(<16 x i8> %a0, <16 x i8> %a1) {
define <16 x i8> @constant_fold_pshufb() {
; SSE-LABEL: constant_fold_pshufb:
; SSE: # BB#0:
; SSE-NEXT: movdqa {{.*#+}} xmm0 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1],zero,zero,zero,xmm0[u,u],zero,zero,xmm0[15],zero,zero,zero,zero,zero,xmm0[7,6]
; SSE-NEXT: movaps {{.*#+}} xmm0 = <14,0,0,0,u,u,0,0,0,0,0,0,0,0,8,9>
; SSE-NEXT: retq
;
; AVX-LABEL: constant_fold_pshufb:
; AVX: # BB#0:
; AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1],zero,zero,zero,xmm0[u,u],zero,zero,xmm0[15],zero,zero,zero,zero,zero,xmm0[7,6]
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = <14,0,0,0,u,u,0,0,0,0,0,0,0,0,8,9>
; AVX-NEXT: retq
%1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> <i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>, <16 x i8> <i8 1, i8 -1, i8 -1, i8 -1, i8 undef, i8 undef, i8 -1, i8 -1, i8 15, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 7, i8 6>)
ret <16 x i8> %1


@@ -261,14 +261,12 @@ define <4 x i32> @combine_vpperm_10zz32BA(<4 x i32> %a0, <4 x i32> %a1) {
define <2 x double> @constant_fold_vpermil2pd() {
; X32-LABEL: constant_fold_vpermil2pd:
; X32: # BB#0:
; X32-NEXT: vmovapd {{.*#+}} xmm0 = [-2.000000e+00,-1.000000e+00]
; X32-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],mem[1]
; X32-NEXT: vmovaps {{.*#+}} xmm0 = [-2.000000e+00,2.000000e+00]
; X32-NEXT: retl
;
; X64-LABEL: constant_fold_vpermil2pd:
; X64: # BB#0:
; X64-NEXT: vmovapd {{.*#+}} xmm0 = [-2.000000e+00,-1.000000e+00]
; X64-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],mem[1]
; X64-NEXT: vmovaps {{.*#+}} xmm0 = [-2.000000e+00,2.000000e+00]
; X64-NEXT: retq
%1 = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> <double 1.0, double 2.0>, <2 x double> <double -2.0, double -1.0>, <2 x i64> <i64 4, i64 2>, i8 2)
ret <2 x double> %1
@@ -277,16 +275,12 @@ define <2 x double> @constant_fold_vpermil2pd() {
define <4 x double> @constant_fold_vpermil2pd_256() {
; X32-LABEL: constant_fold_vpermil2pd_256:
; X32: # BB#0:
; X32-NEXT: vmovapd {{.*#+}} ymm0 = [-4.000000e+00,-3.000000e+00,-2.000000e+00,-1.000000e+00]
; X32-NEXT: vmovapd {{.*#+}} ymm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00]
; X32-NEXT: vpermil2pd {{.*#+}} ymm0 = ymm0[0],zero,ymm1[3,2]
; X32-NEXT: vmovaps {{.*#+}} ymm0 = [-4.000000e+00,0.000000e+00,4.000000e+00,3.000000e+00]
; X32-NEXT: retl
;
; X64-LABEL: constant_fold_vpermil2pd_256:
; X64: # BB#0:
; X64-NEXT: vmovapd {{.*#+}} ymm0 = [-4.000000e+00,-3.000000e+00,-2.000000e+00,-1.000000e+00]
; X64-NEXT: vmovapd {{.*#+}} ymm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00]
; X64-NEXT: vpermil2pd {{.*#+}} ymm0 = ymm0[0],zero,ymm1[3,2]
; X64-NEXT: vmovaps {{.*#+}} ymm0 = [-4.000000e+00,0.000000e+00,4.000000e+00,3.000000e+00]
; X64-NEXT: retq
%1 = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> <double 1.0, double 2.0, double 3.0, double 4.0>, <4 x double> <double -4.0, double -3.0, double -2.0, double -1.0>, <4 x i64> <i64 4, i64 8, i64 2, i64 0>, i8 2)
ret <4 x double> %1
@@ -295,16 +289,12 @@ define <4 x double> @constant_fold_vpermil2pd_256() {
define <4 x float> @constant_fold_vpermil2ps() {
; X32-LABEL: constant_fold_vpermil2ps:
; X32: # BB#0:
; X32-NEXT: vmovaps {{.*#+}} xmm0 = [-4.000000e+00,-3.000000e+00,-2.000000e+00,-1.000000e+00]
; X32-NEXT: vmovaps {{.*#+}} xmm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00]
; X32-NEXT: vpermil2ps {{.*#+}} xmm0 = xmm0[0],xmm1[0,2],zero
; X32-NEXT: vmovaps {{.*#+}} xmm0 = [-4.000000e+00,1.000000e+00,3.000000e+00,0.000000e+00]
; X32-NEXT: retl
;
; X64-LABEL: constant_fold_vpermil2ps:
; X64: # BB#0:
; X64-NEXT: vmovaps {{.*#+}} xmm0 = [-4.000000e+00,-3.000000e+00,-2.000000e+00,-1.000000e+00]
; X64-NEXT: vmovaps {{.*#+}} xmm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00]
; X64-NEXT: vpermil2ps {{.*#+}} xmm0 = xmm0[0],xmm1[0,2],zero
; X64-NEXT: vmovaps {{.*#+}} xmm0 = [-4.000000e+00,1.000000e+00,3.000000e+00,0.000000e+00]
; X64-NEXT: retq
%1 = call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, <4 x float> <float -4.0, float -3.0, float -2.0, float -1.0>, <4 x i32> <i32 4, i32 0, i32 2, i32 8>, i8 2)
ret <4 x float> %1
@@ -313,16 +303,12 @@ define <4 x float> @constant_fold_vpermil2ps() {
define <8 x float> @constant_fold_vpermil2ps_256() {
; X32-LABEL: constant_fold_vpermil2ps_256:
; X32: # BB#0:
; X32-NEXT: vmovaps {{.*#+}} ymm0 = [-8.000000e+00,-7.000000e+00,-6.000000e+00,-5.000000e+00,-4.000000e+00,-3.000000e+00,-2.000000e+00,-1.000000e+00]
; X32-NEXT: vmovaps {{.*#+}} ymm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00]
; X32-NEXT: vpermil2ps {{.*#+}} ymm0 = ymm0[0],ymm1[0,2],zero,ymm1[4],zero,ymm1[4,6]
; X32-NEXT: vmovaps {{.*#+}} ymm0 = [-8.000000e+00,1.000000e+00,3.000000e+00,0.000000e+00,5.000000e+00,0.000000e+00,5.000000e+00,7.000000e+00]
; X32-NEXT: retl
;
; X64-LABEL: constant_fold_vpermil2ps_256:
; X64: # BB#0:
; X64-NEXT: vmovaps {{.*#+}} ymm0 = [-8.000000e+00,-7.000000e+00,-6.000000e+00,-5.000000e+00,-4.000000e+00,-3.000000e+00,-2.000000e+00,-1.000000e+00]
; X64-NEXT: vmovaps {{.*#+}} ymm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00]
; X64-NEXT: vpermil2ps {{.*#+}} ymm0 = ymm0[0],ymm1[0,2],zero,ymm1[4],zero,ymm1[4,6]
; X64-NEXT: vmovaps {{.*#+}} ymm0 = [-8.000000e+00,1.000000e+00,3.000000e+00,0.000000e+00,5.000000e+00,0.000000e+00,5.000000e+00,7.000000e+00]
; X64-NEXT: retq
%1 = call <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, <8 x float> <float -8.0, float -7.0, float -6.0, float -5.0, float -4.0, float -3.0, float -2.0, float -1.0>, <8 x i32> <i32 4, i32 0, i32 2, i32 8, i32 0, i32 8, i32 0, i32 2>, i8 2)
ret <8 x float> %1
@@ -331,16 +317,12 @@ define <8 x float> @constant_fold_vpermil2ps_256() {
define <16 x i8> @constant_fold_vpperm() {
; X32-LABEL: constant_fold_vpperm:
; X32: # BB#0:
; X32-NEXT: vmovdqa {{.*#+}} xmm0 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; X32-NEXT: vmovdqa {{.*#+}} xmm1 = [0,255,254,253,252,251,250,249,248,247,246,245,244,243,242,241]
; X32-NEXT: vpperm {{.*#+}} xmm0 = xmm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; X32-NEXT: vmovaps {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; X32-NEXT: retl
;
; X64-LABEL: constant_fold_vpperm:
; X64: # BB#0:
; X64-NEXT: vmovdqa {{.*#+}} xmm0 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; X64-NEXT: vmovdqa {{.*#+}} xmm1 = [0,255,254,253,252,251,250,249,248,247,246,245,244,243,242,241]
; X64-NEXT: vpperm {{.*#+}} xmm0 = xmm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; X64-NEXT: vmovaps {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; X64-NEXT: retq
%1 = call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> <i8 0, i8 -1, i8 -2, i8 -3, i8 -4, i8 -5, i8 -6, i8 -7, i8 -8, i8 -9, i8 -10, i8 -11, i8 -12, i8 -13, i8 -14, i8 -15>, <16 x i8> <i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>, <16 x i8> <i8 31, i8 30, i8 29, i8 28, i8 27, i8 26, i8 25, i8 24, i8 23, i8 22, i8 21, i8 20, i8 19, i8 18, i8 17, i8 16>)
ret <16 x i8> %1


@@ -42,10 +42,8 @@ define void @test1() {
; X32-NEXT: xorps %xmm0, %xmm0
; X32-NEXT: movlps %xmm0, (%esp)
; X32-NEXT: movq (%esp), %mm0
; X32-NEXT: pshuflw {{.*#+}} xmm0 = mem[0,2,2,3,4,5,6,7]
; X32-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X32-NEXT: movq %xmm0, {{[0-9]+}}(%esp)
; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; X32-NEXT: movsd %xmm0, {{[0-9]+}}(%esp)
; X32-NEXT: movq {{[0-9]+}}(%esp), %mm1
; X32-NEXT: xorl %edi, %edi
; X32-NEXT: maskmovq %mm1, %mm0
@@ -58,10 +56,8 @@ define void @test1() {
; X64-NEXT: xorps %xmm0, %xmm0
; X64-NEXT: movlps %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
; X64-NEXT: pshuflw {{.*#+}} xmm0 = mem[0,2,2,3,4,5,6,7]
; X64-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT: movq {{.*}}(%rip), %rax
; X64-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
; X64-NEXT: movq -{{[0-9]+}}(%rsp), %mm1
; X64-NEXT: xorl %edi, %edi
; X64-NEXT: maskmovq %mm1, %mm0

View File

@@ -18,13 +18,10 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
define void @test(<4 x i16>* %a, <4 x i16>* %b) {
; AVX-LABEL: test:
; AVX: ## BB#0: ## %body
; AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [65533,124,125,14807]
; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm1
; AVX-NEXT: vmovq %xmm1, (%rdi)
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,65535]
; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0
; AVX-NEXT: vmovq %xmm0, (%rsi)
; AVX-NEXT: movq {{.*}}(%rip), %rax
; AVX-NEXT: movq %rax, (%rdi)
; AVX-NEXT: movq {{.*}}(%rip), %rax
; AVX-NEXT: movq %rax, (%rsi)
; AVX-NEXT: retq
body:
%predphi = select <4 x i1> <i1 true, i1 false, i1 false, i1 true>, <4 x i16> <i16 -3, i16 545, i16 4385, i16 14807>, <4 x i16> <i16 123, i16 124, i16 125, i16 127>


@@ -372,15 +372,10 @@ define void @rot(%i8vec3pack* nocapture sret %result, %i8vec3pack* %X, %i8vec3pa
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movdqa {{.*#+}} xmm0 = <0,4,8,128,u,u,u,u,u,u,u,u,u,u,u,u>
; X86-NEXT: movdqa {{.*#+}} xmm1 = <158,158,158,u>
; X86-NEXT: pshufb %xmm0, %xmm1
; X86-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; X86-NEXT: pextrw $0, %xmm1, (%edx)
; X86-NEXT: pmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
; X86-NEXT: pextrw $0, %xmm0, (%edx)
; X86-NEXT: movb $-98, 2(%edx)
; X86-NEXT: movdqa {{.*#+}} xmm1 = <1,1,1,u>
; X86-NEXT: pshufb %xmm0, %xmm1
; X86-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; X86-NEXT: pmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
; X86-NEXT: pextrw $0, %xmm0, (%ecx)
; X86-NEXT: movb $1, 2(%ecx)
; X86-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
@@ -396,15 +391,10 @@ define void @rot(%i8vec3pack* nocapture sret %result, %i8vec3pack* %X, %i8vec3pa
;
; X64-LABEL: rot:
; X64: # BB#0: # %entry
; X64-NEXT: movdqa {{.*#+}} xmm0 = <0,4,8,128,u,u,u,u,u,u,u,u,u,u,u,u>
; X64-NEXT: movdqa {{.*#+}} xmm1 = <158,158,158,u>
; X64-NEXT: pshufb %xmm0, %xmm1
; X64-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; X64-NEXT: pextrw $0, %xmm1, (%rsi)
; X64-NEXT: pmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
; X64-NEXT: pextrw $0, %xmm0, (%rsi)
; X64-NEXT: movb $-98, 2(%rsi)
; X64-NEXT: movdqa {{.*#+}} xmm1 = <1,1,1,u>
; X64-NEXT: pshufb %xmm0, %xmm1
; X64-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; X64-NEXT: pmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
; X64-NEXT: pextrw $0, %xmm0, (%rdx)
; X64-NEXT: movb $1, 2(%rdx)
; X64-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero


@@ -111,16 +111,14 @@ define void @shuf5(<8 x i8>* %p) nounwind {
; X86-LABEL: shuf5:
; X86: # BB#0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movdqa {{.*#+}} xmm0 = [33,33,33,33,33,33,33,33]
; X86-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; X86-NEXT: movq %xmm0, (%eax)
; X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT: movsd %xmm0, (%eax)
; X86-NEXT: retl
;
; X64-LABEL: shuf5:
; X64: # BB#0:
; X64-NEXT: movdqa {{.*#+}} xmm0 = [33,33,33,33,33,33,33,33]
; X64-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; X64-NEXT: movq %xmm0, (%rdi)
; X64-NEXT: movq {{.*}}(%rip), %rax
; X64-NEXT: movq %rax, (%rdi)
; X64-NEXT: retq
%v = shufflevector <2 x i8> <i8 4, i8 33>, <2 x i8> undef, <8 x i32> <i32 1, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
store <8 x i8> %v, <8 x i8>* %p, align 8