[X86][SSE] Add MINSD/MAXSD/MINSS/MAXSS intrinsic scalar load folding support

These are no different in load behaviour from the existing ADD/SUB/MUL/DIV scalar ops, but were missing from isNonFoldablePartialRegisterLoad.

llvm-svn: 279652
This commit is contained in:
Simon Pilgrim 2016-08-24 18:40:53 +00:00
parent a45c31a5b4
commit e14653e17d
2 changed files with 50 additions and 6 deletions

View File

@@ -6190,6 +6190,8 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI,
switch (UserOpc) {
case X86::ADDSSrr_Int: case X86::VADDSSrr_Int: case X86::VADDSSZrr_Int:
case X86::DIVSSrr_Int: case X86::VDIVSSrr_Int: case X86::VDIVSSZrr_Int:
case X86::MAXSSrr_Int: case X86::VMAXSSrr_Int: case X86::VMAXSSZrr_Int:
case X86::MINSSrr_Int: case X86::VMINSSrr_Int: case X86::VMINSSZrr_Int:
case X86::MULSSrr_Int: case X86::VMULSSrr_Int: case X86::VMULSSZrr_Int:
case X86::SUBSSrr_Int: case X86::VSUBSSrr_Int: case X86::VSUBSSZrr_Int:
case X86::VFMADD132SSr_Int: case X86::VFNMADD132SSr_Int:
@@ -6212,6 +6214,8 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI,
switch (UserOpc) {
case X86::ADDSDrr_Int: case X86::VADDSDrr_Int: case X86::VADDSDZrr_Int:
case X86::DIVSDrr_Int: case X86::VDIVSDrr_Int: case X86::VDIVSDZrr_Int:
case X86::MAXSDrr_Int: case X86::VMAXSDrr_Int: case X86::VMAXSDZrr_Int:
case X86::MINSDrr_Int: case X86::VMINSDrr_Int: case X86::VMINSDZrr_Int:
case X86::MULSDrr_Int: case X86::VMULSDrr_Int: case X86::VMULSDZrr_Int:
case X86::SUBSDrr_Int: case X86::VSUBSDrr_Int: case X86::VSUBSDZrr_Int:
case X86::VFMADD132SDr_Int: case X86::VFNMADD132SDr_Int:

View File

@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-apple-darwin9 -mattr=+sse,+sse2,+sse4.1 | FileCheck %s --check-prefix=CHECK --check-prefix=X32
; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mattr=+sse,+sse2,+sse4.1 | FileCheck %s --check-prefix=CHECK --check-prefix=X64
; RUN: llc < %s -mtriple=i686-apple-darwin9 -mattr=+sse,+sse2,+sse4.1 | FileCheck %s --check-prefix=X32
; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mattr=+sse,+sse2,+sse4.1 | FileCheck %s --check-prefix=X64
define i16 @test1(float %f) nounwind {
; X32-LABEL: test1:
@@ -9,8 +9,7 @@ define i16 @test1(float %f) nounwind {
; X32-NEXT: xorps %xmm1, %xmm1
; X32-NEXT: subss LCPI0_0, %xmm0
; X32-NEXT: mulss LCPI0_1, %xmm0
; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X32-NEXT: minss %xmm2, %xmm0
; X32-NEXT: minss LCPI0_2, %xmm0
; X32-NEXT: maxss %xmm1, %xmm0
; X32-NEXT: cvttss2si %xmm0, %eax
; X32-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
@@ -22,8 +21,7 @@ define i16 @test1(float %f) nounwind {
; X64-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; X64-NEXT: subss {{.*}}(%rip), %xmm0
; X64-NEXT: mulss {{.*}}(%rip), %xmm0
; X64-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X64-NEXT: minss %xmm2, %xmm0
; X64-NEXT: minss {{.*}}(%rip), %xmm0
; X64-NEXT: maxss %xmm1, %xmm0
; X64-NEXT: cvttss2si %xmm0, %eax
; X64-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
@@ -156,3 +154,45 @@ entry:
}
declare <2 x double> @llvm.x86.sse2.cvtsi2sd(<2 x double>, i32) nounwind readnone
; Regression test: the scalar load of %x must fold into minss's memory
; operand (minss (%eax) / (%rdi)) rather than being loaded into a spare
; register first. The second min.ss operand has the loaded float in lane 0
; and zeros in lanes 1-3, so the partial-register load is foldable.
define <4 x float> @minss_fold(float* %x, <4 x float> %y) {
; X32-LABEL: minss_fold:
; X32: ## BB#0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: minss (%eax), %xmm0
; X32-NEXT: retl
;
; X64-LABEL: minss_fold:
; X64: ## BB#0: ## %entry
; X64-NEXT: minss (%rdi), %xmm0
; X64-NEXT: retq
entry:
  ; NOTE(review): align 1 looks deliberate (unaligned scalar load is still
  ; foldable for scalar SSE ops) — confirm against the folding tables.
  %0 = load float, float* %x, align 1
  ; Build <4 x float> = <%0, 0.0, 0.0, 0.0>: only lane 0 carries data.
  %vecinit.i = insertelement <4 x float> undef, float %0, i32 0
  %vecinit2.i = insertelement <4 x float> %vecinit.i, float 0.000000e+00, i32 1
  %vecinit3.i = insertelement <4 x float> %vecinit2.i, float 0.000000e+00, i32 2
  %vecinit4.i = insertelement <4 x float> %vecinit3.i, float 0.000000e+00, i32 3
  %1 = tail call <4 x float> @llvm.x86.sse.min.ss(<4 x float> %y, <4 x float> %vecinit4.i)
  ret <4 x float> %1
}
; Regression test: the scalar load of %x must fold into maxss's memory
; operand (maxss (%eax) / (%rdi)) rather than being loaded into a spare
; register first. The second max.ss operand has the loaded float in lane 0
; and zeros in lanes 1-3, so the partial-register load is foldable.
define <4 x float> @maxss_fold(float* %x, <4 x float> %y) {
; X32-LABEL: maxss_fold:
; X32: ## BB#0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: maxss (%eax), %xmm0
; X32-NEXT: retl
;
; X64-LABEL: maxss_fold:
; X64: ## BB#0: ## %entry
; X64-NEXT: maxss (%rdi), %xmm0
; X64-NEXT: retq
entry:
  ; NOTE(review): align 1 looks deliberate (unaligned scalar load is still
  ; foldable for scalar SSE ops) — confirm against the folding tables.
  %0 = load float, float* %x, align 1
  ; Build <4 x float> = <%0, 0.0, 0.0, 0.0>: only lane 0 carries data.
  %vecinit.i = insertelement <4 x float> undef, float %0, i32 0
  %vecinit2.i = insertelement <4 x float> %vecinit.i, float 0.000000e+00, i32 1
  %vecinit3.i = insertelement <4 x float> %vecinit2.i, float 0.000000e+00, i32 2
  %vecinit4.i = insertelement <4 x float> %vecinit3.i, float 0.000000e+00, i32 3
  %1 = tail call <4 x float> @llvm.x86.sse.max.ss(<4 x float> %y, <4 x float> %vecinit4.i)
  ret <4 x float> %1
}