[X86][AVX2] Missing AVX2 memory folding instructions

Added most of the missing vector folding patterns for AVX2, and fixed the existing vpermpd and vpermq patterns.

Differential Revision: http://reviews.llvm.org/D7492

llvm-svn: 228688
Simon Pilgrim 2015-02-10 13:22:57 +00:00
parent e76eb41c21
commit d142ab7d08
2 changed files with 229 additions and 29 deletions
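For context: each table entry pairs a register-form opcode with its memory-form equivalent, so that an instruction whose source operand was spilled can consume the stack slot directly instead of requiring a separate reload. A minimal sketch of the effect of one of the new entries (VPSUBQYrr -> VPSUBQYrm), with an arbitrary stack offset and register assignment:

# Before folding: the spilled operand must be reloaded explicitly.
vmovdqa 32(%rsp), %ymm1          # reload the spilled value
vpsubq  %ymm1, %ymm0, %ymm0      # then subtract register-to-register
# After folding via the table entry: the reload merges into the op.
vpsubq  32(%rsp), %ymm0, %ymm0   # single instruction, memory operand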

View File

@@ -358,6 +358,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::TEST32ri, X86::TEST32mi, TB_FOLDED_LOAD },
{ X86::TEST64ri32, X86::TEST64mi32, TB_FOLDED_LOAD },
{ X86::TEST8ri, X86::TEST8mi, TB_FOLDED_LOAD },
// AVX 128-bit versions of foldable instructions
{ X86::VEXTRACTPSrr,X86::VEXTRACTPSmr, TB_FOLDED_STORE },
{ X86::VEXTRACTF128rr, X86::VEXTRACTF128mr, TB_FOLDED_STORE | TB_ALIGN_16 },
@@ -372,6 +373,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VMOVUPSrr, X86::VMOVUPSmr, TB_FOLDED_STORE },
{ X86::VPEXTRDrr, X86::VPEXTRDmr, TB_FOLDED_STORE },
{ X86::VPEXTRQrr, X86::VPEXTRQmr, TB_FOLDED_STORE },
// AVX 256-bit foldable instructions
{ X86::VEXTRACTI128rr, X86::VEXTRACTI128mr, TB_FOLDED_STORE | TB_ALIGN_16 },
{ X86::VMOVAPDYrr, X86::VMOVAPDYmr, TB_FOLDED_STORE | TB_ALIGN_32 },
@@ -379,6 +381,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VMOVDQAYrr, X86::VMOVDQAYmr, TB_FOLDED_STORE | TB_ALIGN_32 },
{ X86::VMOVUPDYrr, X86::VMOVUPDYmr, TB_FOLDED_STORE },
{ X86::VMOVUPSYrr, X86::VMOVUPSYmr, TB_FOLDED_STORE },
// AVX-512 foldable instructions
{ X86::VMOVPDI2DIZrr, X86::VMOVPDI2DIZmr, TB_FOLDED_STORE },
{ X86::VMOVAPDZrr, X86::VMOVAPDZmr, TB_FOLDED_STORE | TB_ALIGN_64 },
@@ -391,6 +394,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VMOVDQU16Zrr, X86::VMOVDQU16Zmr, TB_FOLDED_STORE },
{ X86::VMOVDQU32Zrr, X86::VMOVDQU32Zmr, TB_FOLDED_STORE },
{ X86::VMOVDQU64Zrr, X86::VMOVDQU64Zmr, TB_FOLDED_STORE },
// AVX-512 foldable instructions (256-bit versions)
{ X86::VMOVAPDZ256rr, X86::VMOVAPDZ256mr, TB_FOLDED_STORE | TB_ALIGN_32 },
{ X86::VMOVAPSZ256rr, X86::VMOVAPSZ256mr, TB_FOLDED_STORE | TB_ALIGN_32 },
@@ -402,6 +406,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VMOVDQU16Z256rr, X86::VMOVDQU16Z256mr, TB_FOLDED_STORE },
{ X86::VMOVDQU32Z256rr, X86::VMOVDQU32Z256mr, TB_FOLDED_STORE },
{ X86::VMOVDQU64Z256rr, X86::VMOVDQU64Z256mr, TB_FOLDED_STORE },
// AVX-512 foldable instructions (128-bit versions)
{ X86::VMOVAPDZ128rr, X86::VMOVAPDZ128mr, TB_FOLDED_STORE | TB_ALIGN_16 },
{ X86::VMOVAPSZ128rr, X86::VMOVAPSZ128mr, TB_FOLDED_STORE | TB_ALIGN_16 },
@@ -413,6 +418,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VMOVDQU16Z128rr, X86::VMOVDQU16Z128mr, TB_FOLDED_STORE },
{ X86::VMOVDQU32Z128rr, X86::VMOVDQU32Z128mr, TB_FOLDED_STORE },
{ X86::VMOVDQU64Z128rr, X86::VMOVDQU64Z128mr, TB_FOLDED_STORE },
// F16C foldable instructions
{ X86::VCVTPS2PHrr, X86::VCVTPS2PHmr, TB_FOLDED_STORE },
{ X86::VCVTPS2PHYrr, X86::VCVTPS2PHYmr, TB_FOLDED_STORE }
@@ -540,6 +546,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
// FIXME: TEST*rr EAX,EAX ---> CMP [mem], 0
{ X86::UCOMISDrr, X86::UCOMISDrm, 0 },
{ X86::UCOMISSrr, X86::UCOMISSrm, 0 },
// AVX 128-bit versions of foldable instructions
{ X86::Int_VCOMISDrr, X86::Int_VCOMISDrm, 0 },
{ X86::Int_VCOMISSrr, X86::Int_VCOMISSrm, 0 },
@@ -656,6 +663,28 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPABSBrr256, X86::VPABSBrm256, 0 },
{ X86::VPABSDrr256, X86::VPABSDrm256, 0 },
{ X86::VPABSWrr256, X86::VPABSWrm256, 0 },
{ X86::VPBROADCASTBrr, X86::VPBROADCASTBrm, 0 },
{ X86::VPBROADCASTBYrr, X86::VPBROADCASTBYrm, 0 },
{ X86::VPBROADCASTDrr, X86::VPBROADCASTDrm, 0 },
{ X86::VPBROADCASTDYrr, X86::VPBROADCASTDYrm, 0 },
{ X86::VPBROADCASTQrr, X86::VPBROADCASTQrm, 0 },
{ X86::VPBROADCASTQYrr, X86::VPBROADCASTQYrm, 0 },
{ X86::VPBROADCASTWrr, X86::VPBROADCASTWrm, 0 },
{ X86::VPBROADCASTWYrr, X86::VPBROADCASTWYrm, 0 },
{ X86::VPERMPDYri, X86::VPERMPDYmi, 0 },
{ X86::VPERMQYri, X86::VPERMQYmi, 0 },
{ X86::VPMOVSXBDYrr, X86::VPMOVSXBDYrm, 0 },
{ X86::VPMOVSXBQYrr, X86::VPMOVSXBQYrm, 0 },
{ X86::VPMOVSXBWYrr, X86::VPMOVSXBWYrm, 0 },
{ X86::VPMOVSXDQYrr, X86::VPMOVSXDQYrm, 0 },
{ X86::VPMOVSXWDYrr, X86::VPMOVSXWDYrm, 0 },
{ X86::VPMOVSXWQYrr, X86::VPMOVSXWQYrm, 0 },
{ X86::VPMOVZXBDYrr, X86::VPMOVZXBDYrm, 0 },
{ X86::VPMOVZXBQYrr, X86::VPMOVZXBQYrm, 0 },
{ X86::VPMOVZXBWYrr, X86::VPMOVZXBWYrm, 0 },
{ X86::VPMOVZXDQYrr, X86::VPMOVZXDQYrm, 0 },
{ X86::VPMOVZXWDYrr, X86::VPMOVZXWDYrm, 0 },
{ X86::VPMOVZXWQYrr, X86::VPMOVZXWQYrm, 0 },
{ X86::VPSHUFDYri, X86::VPSHUFDYmi, 0 },
{ X86::VPSHUFHWYri, X86::VPSHUFHWYmi, 0 },
{ X86::VPSHUFLWYri, X86::VPSHUFLWYmi, 0 },
@@ -765,6 +794,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPABSQZrr, X86::VPABSQZrm, 0 },
{ X86::VBROADCASTSSZr, X86::VBROADCASTSSZm, TB_NO_REVERSE },
{ X86::VBROADCASTSDZr, X86::VBROADCASTSDZm, TB_NO_REVERSE },
// AVX-512 foldable instructions (256-bit versions)
{ X86::VMOVAPDZ256rr, X86::VMOVAPDZ256rm, TB_ALIGN_32 },
{ X86::VMOVAPSZ256rr, X86::VMOVAPSZ256rm, TB_ALIGN_32 },
@@ -778,6 +808,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VMOVUPSZ256rr, X86::VMOVUPSZ256rm, 0 },
{ X86::VBROADCASTSSZ256r, X86::VBROADCASTSSZ256m, TB_NO_REVERSE },
{ X86::VBROADCASTSDZ256r, X86::VBROADCASTSDZ256m, TB_NO_REVERSE },
// AVX-512 foldable instructions (128-bit versions)
{ X86::VMOVAPDZ128rr, X86::VMOVAPDZ128rm, TB_ALIGN_16 },
{ X86::VMOVAPSZ128rr, X86::VMOVAPSZ128rm, TB_ALIGN_16 },
@@ -1319,6 +1350,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPAVGWYrr, X86::VPAVGWYrm, 0 },
{ X86::VPBLENDDrri, X86::VPBLENDDrmi, 0 },
{ X86::VPBLENDDYrri, X86::VPBLENDDYrmi, 0 },
{ X86::VPBLENDVBYrr, X86::VPBLENDVBYrm, 0 },
{ X86::VPBLENDWYrri, X86::VPBLENDWYrmi, 0 },
{ X86::VPCMPEQBYrr, X86::VPCMPEQBYrm, 0 },
{ X86::VPCMPEQDYrr, X86::VPCMPEQDYrm, 0 },
@@ -1330,9 +1362,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPCMPGTWYrr, X86::VPCMPGTWYrm, 0 },
{ X86::VPERM2I128rr, X86::VPERM2I128rm, 0 },
{ X86::VPERMDYrr, X86::VPERMDYrm, 0 },
{ X86::VPERMPDYri, X86::VPERMPDYmi, 0 },
{ X86::VPERMPSYrr, X86::VPERMPSYrm, 0 },
{ X86::VPERMQYri, X86::VPERMQYmi, 0 },
{ X86::VPHADDDYrr, X86::VPHADDDYrm, 0 },
{ X86::VPHADDSWrr256, X86::VPHADDSWrm256, 0 },
{ X86::VPHADDWYrr, X86::VPHADDWYrm, 0 },
@@ -1387,8 +1417,11 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPSRLVQYrr, X86::VPSRLVQYrm, 0 },
{ X86::VPSUBBYrr, X86::VPSUBBYrm, 0 },
{ X86::VPSUBDYrr, X86::VPSUBDYrm, 0 },
{ X86::VPSUBQYrr, X86::VPSUBQYrm, 0 },
{ X86::VPSUBSBYrr, X86::VPSUBSBYrm, 0 },
{ X86::VPSUBSWYrr, X86::VPSUBSWYrm, 0 },
{ X86::VPSUBUSBYrr, X86::VPSUBUSBYrm, 0 },
{ X86::VPSUBUSWYrr, X86::VPSUBUSWYrm, 0 },
{ X86::VPSUBWYrr, X86::VPSUBWYrm, 0 },
{ X86::VPUNPCKHBWYrr, X86::VPUNPCKHBWYrm, 0 },
{ X86::VPUNPCKHDQYrr, X86::VPUNPCKHDQYrm, 0 },
@@ -1399,7 +1432,6 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPUNPCKLQDQYrr, X86::VPUNPCKLQDQYrm, 0 },
{ X86::VPUNPCKLWDYrr, X86::VPUNPCKLWDYrm, 0 },
{ X86::VPXORYrr, X86::VPXORYrm, 0 },
// FIXME: add AVX 256-bit foldable instructions
// FMA4 foldable patterns
{ X86::VFMADDSS4rr, X86::VFMADDSS4mr, 0 },
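
Note the vpermpd/vpermq fix visible above: VPERMPDYri and VPERMQYri were previously listed among the two-source-operand entries (next to VPERMDYrr), but these instructions take a single vector source plus an immediate control, so they now live in the single-source table. A small sketch of the only legal folded form, with an arbitrary immediate and stack offset:

vpermq $0xe1, 32(%rsp), %ymm0    # fold the lone vector source
# There is no second register source to fold; treating vpermq as a
# binary op (as the old table placement implied) matches no encoding.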

View File

@@ -253,7 +253,13 @@ define <8 x i32> @stack_fold_pblendd_ymm(<8 x i32> %a0, <8 x i32> %a1) {
ret <8 x i32> %2
}
; TODO stack_fold_pblendvb
define <32 x i8> @stack_fold_pblendvb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> %c) {
;CHECK-LABEL: stack_fold_pblendvb
;CHECK: vpblendvb {{%ymm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8> %a1, <32 x i8> %c, <32 x i8> %a0)
ret <32 x i8> %2
}
declare <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8>, <32 x i8>, <32 x i8>) nounwind readnone
define <16 x i16> @stack_fold_pblendw(<16 x i16> %a0, <16 x i16> %a1) {
@@ -265,28 +271,84 @@ define <16 x i16> @stack_fold_pblendw(<16 x i16> %a0, <16 x i16> %a1) {
}
declare <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16>, <16 x i16>, i8) nounwind readnone
; TODO stack_fold_pbroadcastb
define <16 x i8> @stack_fold_pbroadcastb(<16 x i8> %a0) {
;CHECK-LABEL: stack_fold_pbroadcastb
;CHECK: vpbroadcastb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <16 x i8> @llvm.x86.avx2.pbroadcastb.128(<16 x i8> %a0)
ret <16 x i8> %2
}
declare <16 x i8> @llvm.x86.avx2.pbroadcastb.128(<16 x i8>) nounwind readonly
; TODO stack_fold_pbroadcastb_ymm
define <32 x i8> @stack_fold_pbroadcastb_ymm(<16 x i8> %a0) {
;CHECK-LABEL: stack_fold_pbroadcastb_ymm
;CHECK: vpbroadcastb {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <32 x i8> @llvm.x86.avx2.pbroadcastb.256(<16 x i8> %a0)
ret <32 x i8> %2
}
declare <32 x i8> @llvm.x86.avx2.pbroadcastb.256(<16 x i8>) nounwind readonly
; TODO stack_fold_pbroadcastd
define <4 x i32> @stack_fold_pbroadcastd(<4 x i32> %a0) {
;CHECK-LABEL: stack_fold_pbroadcastd
;CHECK: vpbroadcastd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <4 x i32> @llvm.x86.avx2.pbroadcastd.128(<4 x i32> %a0)
; add forces execution domain
%3 = add <4 x i32> %2, <i32 1, i32 1, i32 1, i32 1>
ret <4 x i32> %3
}
declare <4 x i32> @llvm.x86.avx2.pbroadcastd.128(<4 x i32>) nounwind readonly
; TODO stack_fold_pbroadcastd_ymm
define <8 x i32> @stack_fold_pbroadcastd_ymm(<4 x i32> %a0) {
;CHECK-LABEL: stack_fold_pbroadcastd_ymm
;CHECK: vpbroadcastd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <8 x i32> @llvm.x86.avx2.pbroadcastd.256(<4 x i32> %a0)
; add forces execution domain
%3 = add <8 x i32> %2, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
ret <8 x i32> %3
}
declare <8 x i32> @llvm.x86.avx2.pbroadcastd.256(<4 x i32>) nounwind readonly
; TODO stack_fold_pbroadcastq
define <2 x i64> @stack_fold_pbroadcastq(<2 x i64> %a0) {
;CHECK-LABEL: stack_fold_pbroadcastq
;CHECK: vpbroadcastq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <2 x i64> @llvm.x86.avx2.pbroadcastq.128(<2 x i64> %a0)
; add forces execution domain
%3 = add <2 x i64> %2, <i64 1, i64 1>
ret <2 x i64> %3
}
declare <2 x i64> @llvm.x86.avx2.pbroadcastq.128(<2 x i64>) nounwind readonly
; TODO stack_fold_pbroadcastq_ymm
define <4 x i64> @stack_fold_pbroadcastq_ymm(<2 x i64> %a0) {
;CHECK-LABEL: stack_fold_pbroadcastq_ymm
;CHECK: vpbroadcastq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <4 x i64> @llvm.x86.avx2.pbroadcastq.256(<2 x i64> %a0)
; add forces execution domain
%3 = add <4 x i64> %2, <i64 1, i64 1, i64 1, i64 1>
ret <4 x i64> %3
}
declare <4 x i64> @llvm.x86.avx2.pbroadcastq.256(<2 x i64>) nounwind readonly
; TODO stack_fold_pbroadcastw
define <8 x i16> @stack_fold_pbroadcastw(<8 x i16> %a0) {
;CHECK-LABEL: stack_fold_pbroadcastw
;CHECK: vpbroadcastw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <8 x i16> @llvm.x86.avx2.pbroadcastw.128(<8 x i16> %a0)
ret <8 x i16> %2
}
declare <8 x i16> @llvm.x86.avx2.pbroadcastw.128(<8 x i16>) nounwind readonly
; TODO stack_fold_pbroadcastw_ymm
define <16 x i16> @stack_fold_pbroadcastw_ymm(<8 x i16> %a0) {
;CHECK-LABEL: stack_fold_pbroadcastw_ymm
;CHECK: vpbroadcastw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <16 x i16> @llvm.x86.avx2.pbroadcastw.256(<8 x i16> %a0)
ret <16 x i16> %2
}
declare <16 x i16> @llvm.x86.avx2.pbroadcastw.256(<8 x i16>) nounwind readonly
define <32 x i8> @stack_fold_pcmpeqb(<32 x i8> %a0, <32 x i8> %a1) {
@@ -380,7 +442,15 @@ define <8 x i32> @stack_fold_permd(<8 x i32> %a0, <8 x i32> %a1) {
}
declare <8 x i32> @llvm.x86.avx2.permd(<8 x i32>, <8 x i32>) nounwind readonly
; TODO stack_fold_permpd
define <4 x double> @stack_fold_permpd(<4 x double> %a0) {
;CHECK-LABEL: stack_fold_permpd
;CHECK: vpermpd $255, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
; fadd forces execution domain
%3 = fadd <4 x double> %2, <double 0x0, double 0x0, double 0x0, double 0x0>
ret <4 x double> %3
}
define <8 x float> @stack_fold_permps(<8 x float> %a0, <8 x float> %a1) {
;CHECK-LABEL: stack_fold_permps
@@ -391,7 +461,15 @@ define <8 x float> @stack_fold_permps(<8 x float> %a0, <8 x float> %a1) {
}
declare <8 x float> @llvm.x86.avx2.permps(<8 x float>, <8 x float>) nounwind readonly
; TODO stack_fold_permq
define <4 x i64> @stack_fold_permq(<4 x i64> %a0) {
;CHECK-LABEL: stack_fold_permq
;CHECK: vpermq $255, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
; add forces execution domain
%3 = add <4 x i64> %2, <i64 1, i64 1, i64 1, i64 1>
ret <4 x i64> %3
}
define <8 x i32> @stack_fold_phaddd(<8 x i32> %a0, <8 x i32> %a1) {
;CHECK-LABEL: stack_fold_phaddd
@@ -573,40 +651,112 @@ define <16 x i16> @stack_fold_pminuw(<16 x i16> %a0, <16 x i16> %a1) {
}
declare <16 x i16> @llvm.x86.avx2.pminu.w(<16 x i16>, <16 x i16>) nounwind readnone
; TODO stack_fold_pmovsxbd
define <8 x i32> @stack_fold_pmovsxbd(<16 x i8> %a0) {
;CHECK-LABEL: stack_fold_pmovsxbd
;CHECK: vpmovsxbd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <8 x i32> @llvm.x86.avx2.pmovsxbd(<16 x i8> %a0)
ret <8 x i32> %2
}
declare <8 x i32> @llvm.x86.avx2.pmovsxbd(<16 x i8>) nounwind readnone
; TODO stack_fold_pmovsxbq
define <4 x i64> @stack_fold_pmovsxbq(<16 x i8> %a0) {
;CHECK-LABEL: stack_fold_pmovsxbq
;CHECK: vpmovsxbq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <4 x i64> @llvm.x86.avx2.pmovsxbq(<16 x i8> %a0)
ret <4 x i64> %2
}
declare <4 x i64> @llvm.x86.avx2.pmovsxbq(<16 x i8>) nounwind readnone
; TODO stack_fold_pmovsxbw
define <16 x i16> @stack_fold_pmovsxbw(<16 x i8> %a0) {
;CHECK-LABEL: stack_fold_pmovsxbw
;CHECK: vpmovsxbw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <16 x i16> @llvm.x86.avx2.pmovsxbw(<16 x i8> %a0)
ret <16 x i16> %2
}
declare <16 x i16> @llvm.x86.avx2.pmovsxbw(<16 x i8>) nounwind readnone
; TODO stack_fold_pmovsxdq
define <4 x i64> @stack_fold_pmovsxdq(<4 x i32> %a0) {
;CHECK-LABEL: stack_fold_pmovsxdq
;CHECK: vpmovsxdq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <4 x i64> @llvm.x86.avx2.pmovsxdq(<4 x i32> %a0)
ret <4 x i64> %2
}
declare <4 x i64> @llvm.x86.avx2.pmovsxdq(<4 x i32>) nounwind readnone
; TODO stack_fold_pmovsxwd
define <8 x i32> @stack_fold_pmovsxwd(<8 x i16> %a0) {
;CHECK-LABEL: stack_fold_pmovsxwd
;CHECK: vpmovsxwd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <8 x i32> @llvm.x86.avx2.pmovsxwd(<8 x i16> %a0)
ret <8 x i32> %2
}
declare <8 x i32> @llvm.x86.avx2.pmovsxwd(<8 x i16>) nounwind readnone
; TODO stack_fold_pmovsxwq
define <4 x i64> @stack_fold_pmovsxwq(<8 x i16> %a0) {
;CHECK-LABEL: stack_fold_pmovsxwq
;CHECK: vpmovsxwq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <4 x i64> @llvm.x86.avx2.pmovsxwq(<8 x i16> %a0)
ret <4 x i64> %2
}
declare <4 x i64> @llvm.x86.avx2.pmovsxwq(<8 x i16>) nounwind readnone
; TODO stack_fold_pmovzxbd
define <8 x i32> @stack_fold_pmovzxbd(<16 x i8> %a0) {
;CHECK-LABEL: stack_fold_pmovzxbd
;CHECK: vpmovzxbd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <8 x i32> @llvm.x86.avx2.pmovzxbd(<16 x i8> %a0)
ret <8 x i32> %2
}
declare <8 x i32> @llvm.x86.avx2.pmovzxbd(<16 x i8>) nounwind readnone
; TODO stack_fold_pmovzxbq
define <4 x i64> @stack_fold_pmovzxbq(<16 x i8> %a0) {
;CHECK-LABEL: stack_fold_pmovzxbq
;CHECK: vpmovzxbq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <4 x i64> @llvm.x86.avx2.pmovzxbq(<16 x i8> %a0)
ret <4 x i64> %2
}
declare <4 x i64> @llvm.x86.avx2.pmovzxbq(<16 x i8>) nounwind readnone
; TODO stack_fold_pmovzxbw
define <16 x i16> @stack_fold_pmovzxbw(<16 x i8> %a0) {
;CHECK-LABEL: stack_fold_pmovzxbw
;CHECK: vpmovzxbw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <16 x i16> @llvm.x86.avx2.pmovzxbw(<16 x i8> %a0)
ret <16 x i16> %2
}
declare <16 x i16> @llvm.x86.avx2.pmovzxbw(<16 x i8>) nounwind readnone
; TODO stack_fold_pmovzxdq
define <4 x i64> @stack_fold_pmovzxdq(<4 x i32> %a0) {
;CHECK-LABEL: stack_fold_pmovzxdq
;CHECK: vpmovzxdq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <4 x i64> @llvm.x86.avx2.pmovzxdq(<4 x i32> %a0)
ret <4 x i64> %2
}
declare <4 x i64> @llvm.x86.avx2.pmovzxdq(<4 x i32>) nounwind readnone
; TODO stack_fold_pmovzxwd
define <8 x i32> @stack_fold_pmovzxwd(<8 x i16> %a0) {
;CHECK-LABEL: stack_fold_pmovzxwd
;CHECK: vpmovzxwd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <8 x i32> @llvm.x86.avx2.pmovzxwd(<8 x i16> %a0)
ret <8 x i32> %2
}
declare <8 x i32> @llvm.x86.avx2.pmovzxwd(<8 x i16>) nounwind readnone
; TODO stack_fold_pmovzxwq
define <4 x i64> @stack_fold_pmovzxwq(<8 x i16> %a0) {
;CHECK-LABEL: stack_fold_pmovzxwq
;CHECK: vpmovzxwq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <4 x i64> @llvm.x86.avx2.pmovzxwq(<8 x i16> %a0)
ret <4 x i64> %2
}
declare <4 x i64> @llvm.x86.avx2.pmovzxwq(<8 x i16>) nounwind readnone
define <4 x i64> @stack_fold_pmuldq(<8 x i32> %a0, <8 x i32> %a1) {
@@ -915,7 +1065,13 @@ define <8 x i32> @stack_fold_psubd(<8 x i32> %a0, <8 x i32> %a1) {
ret <8 x i32> %2
}
; TODO stack_fold_psubq
define <4 x i64> @stack_fold_psubq(<4 x i64> %a0, <4 x i64> %a1) {
;CHECK-LABEL: stack_fold_psubq
;CHECK: vpsubq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = sub <4 x i64> %a0, %a1
ret <4 x i64> %2
}
define <32 x i8> @stack_fold_psubsb(<32 x i8> %a0, <32 x i8> %a1) {
;CHECK-LABEL: stack_fold_psubsb
@@ -935,10 +1091,22 @@ define <16 x i16> @stack_fold_psubsw(<16 x i16> %a0, <16 x i16> %a1) {
}
declare <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16>, <16 x i16>) nounwind readnone
; TODO stack_fold_psubusb
define <32 x i8> @stack_fold_psubusb(<32 x i8> %a0, <32 x i8> %a1) {
;CHECK-LABEL: stack_fold_psubusb
;CHECK: vpsubusb {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <32 x i8> @llvm.x86.avx2.psubus.b(<32 x i8> %a0, <32 x i8> %a1)
ret <32 x i8> %2
}
declare <32 x i8> @llvm.x86.avx2.psubus.b(<32 x i8>, <32 x i8>) nounwind readnone
; TODO stack_fold_psubusw
define <16 x i16> @stack_fold_psubusw(<16 x i16> %a0, <16 x i16> %a1) {
;CHECK-LABEL: stack_fold_psubusw
;CHECK: vpsubusw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <16 x i16> @llvm.x86.avx2.psubus.w(<16 x i16> %a0, <16 x i16> %a1)
ret <16 x i16> %2
}
declare <16 x i16> @llvm.x86.avx2.psubus.w(<16 x i16>, <16 x i16>) nounwind readnone
define <16 x i16> @stack_fold_psubw(<16 x i16> %a0, <16 x i16> %a1) {