[InstCombine][SSE] Add DemandedElts support for PSHUFB instructions

Simplify a pshufb shuffle mask based on the elements of the mask that are actually demanded.

Differential Revision: https://reviews.llvm.org/D28745

llvm-svn: 292101
This commit is contained in:
Simon Pilgrim 2017-01-16 11:30:41 +00:00
parent 59d725cabf
commit 73a68c25a0
3 changed files with 30 additions and 15 deletions

View File

@ -2315,10 +2315,20 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
case Intrinsic::x86_ssse3_pshuf_b_128:
case Intrinsic::x86_avx2_pshuf_b:
case Intrinsic::x86_avx512_pshuf_b_512:
case Intrinsic::x86_avx512_pshuf_b_512: {
if (Value *V = simplifyX86pshufb(*II, *Builder))
return replaceInstUsesWith(*II, V);
unsigned VWidth = II->getType()->getVectorNumElements();
APInt UndefElts(VWidth, 0);
APInt DemandedElts = APInt::getAllOnesValue(VWidth);
if (Value *V = SimplifyDemandedVectorElts(II, DemandedElts, UndefElts)) {
if (V != II)
return replaceInstUsesWith(*II, V);
return II;
}
break;
}
case Intrinsic::x86_avx_vpermilvar_ps:
case Intrinsic::x86_avx_vpermilvar_ps_256:

View File

@ -1472,6 +1472,16 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
break;
}
case Intrinsic::x86_ssse3_pshuf_b_128:
case Intrinsic::x86_avx2_pshuf_b:
case Intrinsic::x86_avx512_pshuf_b_512: {
Value *Op1 = II->getArgOperand(1);
TmpV = SimplifyDemandedVectorElts(Op1, DemandedElts, UndefElts,
Depth + 1);
if (TmpV) { II->setArgOperand(1, TmpV); MadeChange = true; }
break;
}
// SSE4A instructions leave the upper 64-bits of the 128-bit result
// in an undefined state.
case Intrinsic::x86_sse4a_extrq:

View File

@ -469,15 +469,12 @@ define <64 x i8> @fold_with_allundef_elts_avx512(<64 x i8> %InVec) {
}
; Demanded elts tests.
; FIXME: Missed opportunities to pass demanded elts through the pshufb shuffle mask
define <16 x i8> @demanded_elts_insertion(<16 x i8> %InVec, <16 x i8> %BaseMask, i8 %M0, i8 %M15) {
; CHECK-LABEL: @demanded_elts_insertion(
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <16 x i8> %BaseMask, i8 %M0, i32 0
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <16 x i8> [[TMP1]], i8 %M15, i32 15
; CHECK-NEXT: [[TMP3:%.*]] = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %InVec, <16 x i8> [[TMP2]])
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x i8> [[TMP3]], <16 x i8> undef, <16 x i32> <i32 undef, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 undef>
; CHECK-NEXT: ret <16 x i8> [[TMP4]]
; CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %InVec, <16 x i8> %BaseMask)
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> undef, <16 x i32> <i32 undef, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 undef>
; CHECK-NEXT: ret <16 x i8> [[TMP2]]
;
%1 = insertelement <16 x i8> %BaseMask, i8 %M0, i32 0
%2 = insertelement <16 x i8> %1, i8 %M15, i32 15
@ -489,9 +486,8 @@ define <16 x i8> @demanded_elts_insertion(<16 x i8> %InVec, <16 x i8> %BaseMask,
define <32 x i8> @demanded_elts_insertion_avx2(<32 x i8> %InVec, <32 x i8> %BaseMask, i8 %M0, i8 %M22) {
; CHECK-LABEL: @demanded_elts_insertion_avx2(
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <32 x i8> %BaseMask, i8 %M0, i32 0
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <32 x i8> [[TMP1]], i8 %M22, i32 22
; CHECK-NEXT: [[TMP3:%.*]] = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %InVec, <32 x i8> [[TMP2]])
; CHECK-NEXT: ret <32 x i8> [[TMP3]]
; CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %InVec, <32 x i8> [[TMP1]])
; CHECK-NEXT: ret <32 x i8> [[TMP2]]
;
%1 = insertelement <32 x i8> %BaseMask, i8 %M0, i32 0
%2 = insertelement <32 x i8> %1, i8 %M22, i32 22
@ -502,11 +498,10 @@ define <32 x i8> @demanded_elts_insertion_avx2(<32 x i8> %InVec, <32 x i8> %Base
define <64 x i8> @demanded_elts_insertion_avx512(<64 x i8> %InVec, <64 x i8> %BaseMask, i8 %M0, i8 %M30) {
; CHECK-LABEL: @demanded_elts_insertion_avx512(
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <64 x i8> %BaseMask, i8 %M0, i32 0
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <64 x i8> [[TMP1]], i8 %M30, i32 30
; CHECK-NEXT: [[TMP3:%.*]] = tail call <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8> %InVec, <64 x i8> [[TMP2]])
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <64 x i8> [[TMP3]], <64 x i8> undef, <64 x i32> zeroinitializer
; CHECK-NEXT: ret <64 x i8> [[TMP4]]
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <64 x i8> undef, i8 %M0, i32 0
; CHECK-NEXT: [[TMP2:%.*]] = tail call <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8> %InVec, <64 x i8> [[TMP1]])
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <64 x i8> [[TMP2]], <64 x i8> undef, <64 x i32> zeroinitializer
; CHECK-NEXT: ret <64 x i8> [[TMP3]]
;
%1 = insertelement <64 x i8> %BaseMask, i8 %M0, i32 0
%2 = insertelement <64 x i8> %1, i8 %M30, i32 30