From a1c0f09a89690f39683bf22126fe8999e62a6645 Mon Sep 17 00:00:00 2001
From: David Green
Date: Mon, 28 Jun 2021 08:54:03 +0100
Subject: [PATCH] [ARM] Add an extra fold for f32 extract(vdup(i32))

This adds another small fold for extract of a vdup, between an i32 and
an f32, converting to a BITCAST. This allows some extra folding to
happen, simplifying the resulting code.

Differential Revision: https://reviews.llvm.org/D104857
---
 llvm/lib/Target/ARM/ARMISelLowering.cpp |  2 +
 llvm/test/CodeGen/Thumb2/mve-vst4.ll    | 98 ++++++++++++-------------
 2 files changed, 48 insertions(+), 52 deletions(-)

diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index cfdff80585c8..0bd4306309f2 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -14650,6 +14650,8 @@ static SDValue PerformExtractEltCombine(SDNode *N,
     return DCI.DAG.getNode(ARMISD::VMOVhr, dl, VT, X);
   if (VT == MVT::i32 && X.getValueType() == MVT::f16)
     return DCI.DAG.getNode(ARMISD::VMOVrh, dl, VT, X);
+  if (VT == MVT::f32 && X.getValueType() == MVT::i32)
+    return DCI.DAG.getNode(ISD::BITCAST, dl, VT, X);
 
   while (X.getValueType() != VT && X->getOpcode() == ISD::BITCAST)
     X = X->getOperand(0);
diff --git a/llvm/test/CodeGen/Thumb2/mve-vst4.ll b/llvm/test/CodeGen/Thumb2/mve-vst4.ll
index 7a46f7920b04..cb933dc41f15 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vst4.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vst4.ll
@@ -205,37 +205,34 @@ define void @vst4_v4i32_align1(<4 x i32> *%src, <16 x i32> *%dst) {
 ; CHECK: @ %bb.0: @ %entry
 ; CHECK-NEXT: .save {r7, lr}
 ; CHECK-NEXT: push {r7, lr}
-; CHECK-NEXT: .vsave {d8, d9}
-; CHECK-NEXT: vpush {d8, d9}
+; CHECK-NEXT: .vsave {d8, d9, d10}
+; CHECK-NEXT: vpush {d8, d9, d10}
 ; CHECK-NEXT: vldrw.u32 q0, [r0, #32]
-; CHECK-NEXT: vldrw.u32 q3, [r0]
-; CHECK-NEXT: vldrw.u32 q1, [r0, #16]
+; CHECK-NEXT: vldrw.u32 q4, [r0]
 ; CHECK-NEXT: vmov r2, r3, d1
 ; CHECK-NEXT: vmov r12, lr, d0
-; CHECK-NEXT: vdup.32 q4, r3
-; CHECK-NEXT: vmov.f64 d0, d6
-; CHECK-NEXT: vmov.f32 s1, s4
-; CHECK-NEXT: vmov.f32 s4, s13
-; CHECK-NEXT: vmov.f64 d4, d7
-; CHECK-NEXT: vmov.f32 s12, s15
-; CHECK-NEXT: vmov.f32 s13, s7
-; CHECK-NEXT: vmov.f32 s14, s18
-; CHECK-NEXT: vmov.f32 s15, s19
+; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
+; CHECK-NEXT: vmov.f64 d2, d8
+; CHECK-NEXT: vmov.f32 s5, s0
+; CHECK-NEXT: vmov s10, r2
+; CHECK-NEXT: vmov s14, r3
+; CHECK-NEXT: vmov.f32 s8, s18
+; CHECK-NEXT: vmov s20, lr
+; CHECK-NEXT: vmov.f32 s9, s2
+; CHECK-NEXT: vmov s6, r12
+; CHECK-NEXT: vmov.f32 s0, s17
+; CHECK-NEXT: vmov.f32 s12, s19
+; CHECK-NEXT: vmov.f32 s13, s3
+; CHECK-NEXT: vmov.f32 s2, s20
+; CHECK-NEXT: vmov.f32 s15, s14
+; CHECK-NEXT: vmov.f32 s11, s10
 ; CHECK-NEXT: vstrb.8 q3, [r1, #48]
-; CHECK-NEXT: vmov.f32 s9, s6
-; CHECK-NEXT: vdup.32 q3, r2
-; CHECK-NEXT: vmov.f32 s10, s14
-; CHECK-NEXT: vmov.f32 s11, s15
+; CHECK-NEXT: vmov.f32 s3, s20
 ; CHECK-NEXT: vstrb.8 q2, [r1, #32]
-; CHECK-NEXT: vdup.32 q2, lr
-; CHECK-NEXT: vmov.f32 s6, s10
-; CHECK-NEXT: vmov.f32 s7, s11
-; CHECK-NEXT: vstrb.8 q1, [r1, #16]
-; CHECK-NEXT: vdup.32 q1, r12
-; CHECK-NEXT: vmov.f32 s2, s6
-; CHECK-NEXT: vmov.f32 s3, s7
-; CHECK-NEXT: vstrb.8 q0, [r1]
-; CHECK-NEXT: vpop {d8, d9}
+; CHECK-NEXT: vmov.f32 s7, s6
+; CHECK-NEXT: vstrb.8 q0, [r1, #16]
+; CHECK-NEXT: vstrb.8 q1, [r1]
+; CHECK-NEXT: vpop {d8, d9, d10}
 ; CHECK-NEXT: pop {r7, pc}
 entry:
   %s1 = getelementptr <4 x i32>, <4 x i32>* %src, i32 0
@@ -975,37 +972,34 @@ define void @vst4_v4f32_align1(<4 x float> *%src, <16 x float> *%dst) {
 ; CHECK: @ %bb.0: @ %entry
 ; CHECK-NEXT: .save {r7, lr}
 ; CHECK-NEXT: push {r7, lr}
-; CHECK-NEXT: .vsave {d8, d9, d10, d11}
-; CHECK-NEXT: vpush {d8, d9, d10, d11}
-; CHECK-NEXT: vldrw.u32 q4, [r0]
+; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12}
+; CHECK-NEXT: vpush {d8, d9, d10, d11, d12}
 ; CHECK-NEXT: vldrw.u32 q0, [r0, #32]
-; CHECK-NEXT: vldrw.u32 q2, [r0, #16]
-; CHECK-NEXT: vmov.f64 d2, d8
+; CHECK-NEXT: vldrw.u32 q5, [r0]
+; CHECK-NEXT: vldrw.u32 q1, [r0, #16]
 ; CHECK-NEXT: vmov r2, r3, d1
-; CHECK-NEXT: vmov.f32 s5, s8
-; CHECK-NEXT: vdup.32 q5, r3
-; CHECK-NEXT: vmov.f32 s8, s17
-; CHECK-NEXT: vmov.f64 d6, d9
-; CHECK-NEXT: vmov.f32 s16, s19
-; CHECK-NEXT: vmov.f32 s17, s11
-; CHECK-NEXT: vmov.f32 s18, s22
-; CHECK-NEXT: vmov.f32 s19, s23
-; CHECK-NEXT: vstrb.8 q4, [r1, #48]
-; CHECK-NEXT: vmov.f32 s13, s10
-; CHECK-NEXT: vdup.32 q4, r2
 ; CHECK-NEXT: vmov r12, lr, d0
-; CHECK-NEXT: vmov.f32 s14, s18
+; CHECK-NEXT: vmov.f64 d4, d10
+; CHECK-NEXT: vmov.f32 s9, s4
+; CHECK-NEXT: vmov s14, r2
+; CHECK-NEXT: vmov s18, r3
+; CHECK-NEXT: vmov.f32 s12, s22
+; CHECK-NEXT: vmov s24, lr
+; CHECK-NEXT: vmov.f32 s13, s6
+; CHECK-NEXT: vmov.f32 s4, s21
+; CHECK-NEXT: vmov.f32 s16, s23
+; CHECK-NEXT: vmov.f32 s17, s7
+; CHECK-NEXT: vmov s10, r12
+; CHECK-NEXT: vmov.f32 s6, s24
+; CHECK-NEXT: vmov.f32 s19, s18
 ; CHECK-NEXT: vmov.f32 s15, s2
+; CHECK-NEXT: vstrb.8 q4, [r1, #48]
+; CHECK-NEXT: vmov.f32 s7, s24
 ; CHECK-NEXT: vstrb.8 q3, [r1, #32]
-; CHECK-NEXT: vdup.32 q3, lr
-; CHECK-NEXT: vmov.f32 s10, s14
-; CHECK-NEXT: vmov.f32 s11, s15
-; CHECK-NEXT: vstrb.8 q2, [r1, #16]
-; CHECK-NEXT: vdup.32 q2, r12
-; CHECK-NEXT: vmov.f32 s6, s10
-; CHECK-NEXT: vmov.f32 s7, s0
-; CHECK-NEXT: vstrb.8 q1, [r1]
-; CHECK-NEXT: vpop {d8, d9, d10, d11}
+; CHECK-NEXT: vmov.f32 s11, s0
+; CHECK-NEXT: vstrb.8 q1, [r1, #16]
+; CHECK-NEXT: vstrb.8 q2, [r1]
+; CHECK-NEXT: vpop {d8, d9, d10, d11, d12}
 ; CHECK-NEXT: pop {r7, pc}
 entry:
   %s1 = getelementptr <4 x float>, <4 x float>* %src, i32 0