[X86][SSE] Enable commutation between MOVHLPS and UNPCKHPD
Assuming SSE2 is available, we can safely commute between these, removing some unnecessary register moves and improving memory-folding opportunities. VEX-encoded versions don't benefit, so I haven't added support for them. llvm-svn: 277930
This commit is contained in:
parent
45a574130e
commit
7d168e19e8
|
@ -3709,6 +3709,22 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
|
|||
return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
|
||||
OpIdx1, OpIdx2);
|
||||
}
|
||||
case X86::MOVHLPSrr:
|
||||
case X86::UNPCKHPDrr: {
|
||||
if (!Subtarget.hasSSE2())
|
||||
return nullptr;
|
||||
|
||||
unsigned Opc = MI.getOpcode();
|
||||
switch (Opc) {
|
||||
default: llvm_unreachable("Unreachable!");
|
||||
case X86::MOVHLPSrr: Opc = X86::UNPCKHPDrr; break;
|
||||
case X86::UNPCKHPDrr: Opc = X86::MOVHLPSrr; break;
|
||||
}
|
||||
auto &WorkingMI = cloneIfNew(MI);
|
||||
WorkingMI.setDesc(get(Opc));
|
||||
return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
|
||||
OpIdx1, OpIdx2);
|
||||
}
|
||||
case X86::CMOVB16rr: case X86::CMOVB32rr: case X86::CMOVB64rr:
|
||||
case X86::CMOVAE16rr: case X86::CMOVAE32rr: case X86::CMOVAE64rr:
|
||||
case X86::CMOVE16rr: case X86::CMOVE32rr: case X86::CMOVE64rr:
|
||||
|
|
|
@ -1371,6 +1371,7 @@ let Constraints = "$src1 = $dst", AddedComplexity = 20 in {
|
|||
[(set VR128:$dst,
|
||||
(v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))],
|
||||
IIC_SSE_MOV_LH>, Sched<[WriteFShuffle]>;
|
||||
let isCommutable = 1 in
|
||||
def MOVHLPSrr : PSI<0x12, MRMSrcReg, (outs VR128:$dst),
|
||||
(ins VR128:$src1, VR128:$src2),
|
||||
"movhlps\t{$src2, $dst|$dst, $src2}",
|
||||
|
@ -2641,7 +2642,8 @@ let Predicates = [UseSSE2] in {
|
|||
multiclass sse12_unpack_interleave<bits<8> opc, SDNode OpNode, ValueType vt,
|
||||
PatFrag mem_frag, RegisterClass RC,
|
||||
X86MemOperand x86memop, string asm,
|
||||
Domain d> {
|
||||
Domain d, bit IsCommutable = 0> {
|
||||
let isCommutable = IsCommutable in
|
||||
def rr : PI<opc, MRMSrcReg,
|
||||
(outs RC:$dst), (ins RC:$src1, RC:$src2),
|
||||
asm, [(set RC:$dst,
|
||||
|
@ -2689,7 +2691,7 @@ let Constraints = "$src1 = $dst" in {
|
|||
SSEPackedSingle>, PS;
|
||||
defm UNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, memopv2f64,
|
||||
VR128, f128mem, "unpckhpd\t{$src2, $dst|$dst, $src2}",
|
||||
SSEPackedDouble>, PD;
|
||||
SSEPackedDouble, 1>, PD;
|
||||
defm UNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, memopv4f32,
|
||||
VR128, f128mem, "unpcklps\t{$src2, $dst|$dst, $src2}",
|
||||
SSEPackedSingle>, PS;
|
||||
|
|
|
@ -182,10 +182,10 @@ define void @test12() nounwind {
|
|||
; CHECK-NEXT: movapd 0, %xmm0
|
||||
; CHECK-NEXT: movapd {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
|
||||
; CHECK-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
|
||||
; CHECK-NEXT: xorpd %xmm2, %xmm2
|
||||
; CHECK-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
|
||||
; CHECK-NEXT: addps %xmm1, %xmm0
|
||||
; CHECK-NEXT: movaps %xmm0, 0
|
||||
; CHECK-NEXT: xorps %xmm2, %xmm2
|
||||
; CHECK-NEXT: movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1]
|
||||
; CHECK-NEXT: addps %xmm1, %xmm2
|
||||
; CHECK-NEXT: movaps %xmm2, 0
|
||||
; CHECK-NEXT: retl
|
||||
%tmp1 = load <4 x float>, <4 x float>* null ; <<4 x float>> [#uses=2]
|
||||
%tmp2 = shufflevector <4 x float> %tmp1, <4 x float> < float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00 >, <4 x i32> < i32 0, i32 1, i32 6, i32 7 > ; <<4 x float>> [#uses=1]
|
||||
|
|
|
@ -58,16 +58,14 @@ define <4 x float> @t3(<4 x float>* %P) nounwind {
|
|||
; X32-LABEL: t3:
|
||||
; X32: # BB#0:
|
||||
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
|
||||
; X32-NEXT: movapd (%eax), %xmm0
|
||||
; X32-NEXT: xorpd %xmm1, %xmm1
|
||||
; X32-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
|
||||
; X32-NEXT: xorps %xmm0, %xmm0
|
||||
; X32-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
|
||||
; X32-NEXT: retl
|
||||
;
|
||||
; X64-LABEL: t3:
|
||||
; X64: # BB#0:
|
||||
; X64-NEXT: movapd (%rdi), %xmm0
|
||||
; X64-NEXT: xorpd %xmm1, %xmm1
|
||||
; X64-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
|
||||
; X64-NEXT: xorps %xmm0, %xmm0
|
||||
; X64-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
|
||||
; X64-NEXT: retq
|
||||
%tmp1 = load <4 x float>, <4 x float>* %P
|
||||
%tmp2 = shufflevector <4 x float> %tmp1, <4 x float> zeroinitializer, <4 x i32> < i32 2, i32 3, i32 4, i32 4 >
|
||||
|
|
|
@ -154,7 +154,7 @@ define <2 x double> @shuffle_v2f64_10(<2 x double> %a, <2 x double> %b) {
|
|||
define <2 x double> @shuffle_v2f64_11(<2 x double> %a, <2 x double> %b) {
|
||||
; SSE-LABEL: shuffle_v2f64_11:
|
||||
; SSE: # BB#0:
|
||||
; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
|
||||
; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
|
||||
; SSE-NEXT: retq
|
||||
;
|
||||
; AVX-LABEL: shuffle_v2f64_11:
|
||||
|
|
|
@ -319,8 +319,7 @@ define <4 x float> @shuffle_v4f32_0145(<4 x float> %a, <4 x float> %b) {
|
|||
define <4 x float> @shuffle_v4f32_6723(<4 x float> %a, <4 x float> %b) {
|
||||
; SSE-LABEL: shuffle_v4f32_6723:
|
||||
; SSE: # BB#0:
|
||||
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
|
||||
; SSE-NEXT: movapd %xmm1, %xmm0
|
||||
; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
|
||||
; SSE-NEXT: retq
|
||||
;
|
||||
; AVX-LABEL: shuffle_v4f32_6723:
|
||||
|
|
|
@ -1406,8 +1406,7 @@ define <4 x float> @combine_test3(<4 x float> %a, <4 x float> %b) {
|
|||
define <4 x float> @combine_test4(<4 x float> %a, <4 x float> %b) {
|
||||
; SSE-LABEL: combine_test4:
|
||||
; SSE: # BB#0:
|
||||
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
|
||||
; SSE-NEXT: movapd %xmm1, %xmm0
|
||||
; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
|
||||
; SSE-NEXT: retq
|
||||
;
|
||||
; AVX-LABEL: combine_test4:
|
||||
|
@ -2326,8 +2325,7 @@ define <4 x float> @combine_undef_input_test3(<4 x float> %a, <4 x float> %b) {
|
|||
define <4 x float> @combine_undef_input_test4(<4 x float> %a, <4 x float> %b) {
|
||||
; SSE-LABEL: combine_undef_input_test4:
|
||||
; SSE: # BB#0:
|
||||
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
|
||||
; SSE-NEXT: movapd %xmm1, %xmm0
|
||||
; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
|
||||
; SSE-NEXT: retq
|
||||
;
|
||||
; AVX-LABEL: combine_undef_input_test4:
|
||||
|
@ -2432,7 +2430,7 @@ define <4 x float> @combine_undef_input_test8(<4 x float> %a) {
|
|||
define <4 x float> @combine_undef_input_test9(<4 x float> %a) {
|
||||
; SSE-LABEL: combine_undef_input_test9:
|
||||
; SSE: # BB#0:
|
||||
; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
|
||||
; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
|
||||
; SSE-NEXT: retq
|
||||
;
|
||||
; AVX-LABEL: combine_undef_input_test9:
|
||||
|
@ -2511,8 +2509,7 @@ define <4 x float> @combine_undef_input_test13(<4 x float> %a, <4 x float> %b) {
|
|||
define <4 x float> @combine_undef_input_test14(<4 x float> %a, <4 x float> %b) {
|
||||
; SSE-LABEL: combine_undef_input_test14:
|
||||
; SSE: # BB#0:
|
||||
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
|
||||
; SSE-NEXT: movapd %xmm1, %xmm0
|
||||
; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
|
||||
; SSE-NEXT: retq
|
||||
;
|
||||
; AVX-LABEL: combine_undef_input_test14:
|
||||
|
@ -2623,7 +2620,7 @@ define <4 x float> @combine_undef_input_test18(<4 x float> %a) {
|
|||
define <4 x float> @combine_undef_input_test19(<4 x float> %a) {
|
||||
; SSE-LABEL: combine_undef_input_test19:
|
||||
; SSE: # BB#0:
|
||||
; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
|
||||
; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
|
||||
; SSE-NEXT: retq
|
||||
;
|
||||
; AVX-LABEL: combine_undef_input_test19:
|
||||
|
|
Loading…
Reference in New Issue