From fc35391f2b4c36ec1a35d6e464ad972cf7117cdb Mon Sep 17 00:00:00 2001
From: Hal Finkel
Date: Thu, 31 Mar 2016 20:39:41 +0000
Subject: [PATCH] [PowerPC] Add a late MI-level pass for QPX load/splat
 simplification

Chapter 3 of the QPX manual states that, "Scalar floating-point load
instructions, defined in the Power ISA, cause a replication of the source
data across all elements of the target register." Thus, if we have a load
followed by a QPX splat (from the first lane), the splat is redundant. This
adds a late MI-level pass to remove the redundant splats in some of these
cases (specifically when both occur in the same basic block).

This optimization is scheduled just prior to post-RA scheduling. It can't
happen before anything that might replace the load with some already-computed
quantity (i.e. store-to-load forwarding).

llvm-svn: 265047
---
 llvm/lib/Target/PowerPC/CMakeLists.txt       |   1 +
 llvm/lib/Target/PowerPC/PPC.h                |   1 +
 llvm/lib/Target/PowerPC/PPCISelLowering.cpp  |   3 -
 llvm/lib/Target/PowerPC/PPCQPXLoadSplat.cpp  | 156 +++++++++++++++++++
 llvm/lib/Target/PowerPC/PPCTargetMachine.cpp |  13 +-
 llvm/test/CodeGen/PowerPC/qpx-load-splat.ll  |  83 ++++++++++
 6 files changed, 253 insertions(+), 4 deletions(-)
 create mode 100644 llvm/lib/Target/PowerPC/PPCQPXLoadSplat.cpp
 create mode 100644 llvm/test/CodeGen/PowerPC/qpx-load-splat.ll

diff --git a/llvm/lib/Target/PowerPC/CMakeLists.txt b/llvm/lib/Target/PowerPC/CMakeLists.txt
index e8316e937cbb..53c2ed3d51ea 100644
--- a/llvm/lib/Target/PowerPC/CMakeLists.txt
+++ b/llvm/lib/Target/PowerPC/CMakeLists.txt
@@ -29,6 +29,7 @@ add_llvm_target(PowerPCCodeGen
   PPCMachineFunctionInfo.cpp
   PPCMIPeephole.cpp
   PPCRegisterInfo.cpp
+  PPCQPXLoadSplat.cpp
   PPCSubtarget.cpp
   PPCTargetMachine.cpp
   PPCTargetObjectFile.cpp
diff --git a/llvm/lib/Target/PowerPC/PPC.h b/llvm/lib/Target/PowerPC/PPC.h
index a4235fa6e042..d4eee2204cf0 100644
--- a/llvm/lib/Target/PowerPC/PPC.h
+++ b/llvm/lib/Target/PowerPC/PPC.h
@@ -42,6 +42,7 @@ namespace llvm {
   FunctionPass *createPPCVSXSwapRemovalPass();
   FunctionPass *createPPCMIPeepholePass();
   FunctionPass *createPPCBranchSelectionPass();
+  FunctionPass *createPPCQPXLoadSplatPass();
   FunctionPass *createPPCISelDag(PPCTargetMachine &TM);
   FunctionPass *createPPCTLSDynamicCallPass();
   FunctionPass *createPPCBoolRetToIntPass();
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index d0f43434c39c..c645e076ae69 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -7187,9 +7187,6 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
       SplatIdx -= 4;
     }
 
-    // FIXME: If SplatIdx == 0 and the input came from a load, then there is
-    // nothing to do.
-
     return DAG.getNode(PPCISD::QVESPLATI, dl, VT, V1,
                        DAG.getConstant(SplatIdx, dl, MVT::i32));
   }
diff --git a/llvm/lib/Target/PowerPC/PPCQPXLoadSplat.cpp b/llvm/lib/Target/PowerPC/PPCQPXLoadSplat.cpp
new file mode 100644
index 000000000000..e15751e444cb
--- /dev/null
+++ b/llvm/lib/Target/PowerPC/PPCQPXLoadSplat.cpp
@@ -0,0 +1,156 @@
+//===----- PPCQPXLoadSplat.cpp - QPX Load Splat Simplification ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// The QPX vector registers overlay the scalar floating-point registers, and
+// any scalar floating-point loads splat their value across all vector lanes.
+// Thus, if we have a scalar load followed by a splat, we can remove the splat
+// (i.e. replace the load with a load-and-splat pseudo instruction).
+//
+// This pass must run after anything that might do store-to-load forwarding.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPC.h"
+#include "PPCInstrBuilder.h"
+#include "PPCInstrInfo.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "ppc-qpx-load-splat"
+
+STATISTIC(NumSimplified, "Number of QPX load splats simplified");
+
+namespace llvm {
+  void initializePPCQPXLoadSplatPass(PassRegistry&);
+}
+
+namespace {
+  struct PPCQPXLoadSplat : public MachineFunctionPass {
+    static char ID;
+    PPCQPXLoadSplat() : MachineFunctionPass(ID) {
+      initializePPCQPXLoadSplatPass(*PassRegistry::getPassRegistry());
+    }
+
+    bool runOnMachineFunction(MachineFunction &Fn) override;
+
+    const char *getPassName() const override {
+      return "PowerPC QPX Load Splat Simplification";
+    }
+  };
+  char PPCQPXLoadSplat::ID = 0;
+}
+
+INITIALIZE_PASS(PPCQPXLoadSplat, "ppc-qpx-load-splat",
+                "PowerPC QPX Load Splat Simplification",
+                false, false)
+
+FunctionPass *llvm::createPPCQPXLoadSplatPass() {
+  return new PPCQPXLoadSplat();
+}
+
+bool PPCQPXLoadSplat::runOnMachineFunction(MachineFunction &MF) {
+  bool MadeChange = false;
+  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+
+  for (auto MFI = MF.begin(), MFIE = MF.end(); MFI != MFIE; ++MFI) {
+    MachineBasicBlock *MBB = &*MFI;
+    SmallVector<MachineInstr *, 4> Splats;
+
+    for (auto MBBI = MBB->rbegin(); MBBI != MBB->rend(); ++MBBI) {
+      MachineInstr *MI = &*MBBI;
+
+      if (MI->hasUnmodeledSideEffects() || MI->isCall()) {
+        Splats.clear();
+        continue;
+      }
+
+      // We're looking for a sequence like this:
+      // %F0<def> = LFD 0, %X3<kill>, %QF0<imp-def>; mem:LD8[%a](tbaa=!2)
+      // %QF1<def> = QVESPLATI %QF0<kill>, 0, %RM<imp-use>
+
+      for (auto SI = Splats.begin(); SI != Splats.end();) {
+        MachineInstr *SMI = *SI;
+        unsigned SplatReg = SMI->getOperand(0).getReg();
+        unsigned SrcReg = SMI->getOperand(1).getReg();
+
+        if (MI->modifiesRegister(SrcReg, TRI)) {
+          switch (MI->getOpcode()) {
+          default:
+            SI = Splats.erase(SI);
+            continue;
+          case PPC::LFS:
+          case PPC::LFD:
+          case PPC::LFSU:
+          case PPC::LFDU:
+          case PPC::LFSUX:
+          case PPC::LFDUX:
+          case PPC::LFSX:
+          case PPC::LFDX:
+          case PPC::LFIWAX:
+          case PPC::LFIWZX:
+            if (SplatReg != SrcReg) {
+              // We need to change the load to define the scalar subregister of
+              // the QPX splat source register.
+              unsigned SubRegIndex =
+                TRI->getSubRegIndex(SrcReg, MI->getOperand(0).getReg());
+              unsigned SplatSubReg = TRI->getSubReg(SplatReg, SubRegIndex);
+
+              // Substitute both the explicit defined register, and also the
+              // implicit def of the containing QPX register.
+              MI->getOperand(0).setReg(SplatSubReg);
+              MI->substituteRegister(SrcReg, SplatReg, 0, *TRI);
+            }
+
+            SI = Splats.erase(SI);
+
+            // If SMI is directly after MI, then MBBI's base iterator is
+            // pointing at SMI. Adjust MBBI around the call to erase SMI to
+            // avoid invalidating MBBI.
+            ++MBBI;
+            SMI->eraseFromParent();
+            --MBBI;
+
+            ++NumSimplified;
+            MadeChange = true;
+            continue;
+          }
+        }
+
+        if (MI->modifiesRegister(SplatReg, TRI)) {
+          SI = Splats.erase(SI);
+          continue;
+        }
+
+        ++SI;
+      }
+
+      if (MI->getOpcode() != PPC::QVESPLATI &&
+          MI->getOpcode() != PPC::QVESPLATIs &&
+          MI->getOpcode() != PPC::QVESPLATIb)
+        continue;
+      if (MI->getOperand(2).getImm() != 0)
+        continue;
+
+      // If there are other uses of the scalar value after this, replacing
+      // those uses might be non-trivial.
+      if (!MI->getOperand(1).isKill())
+        continue;
+
+      Splats.push_back(MI);
+    }
+  }
+
+  return MadeChange;
+}
+
diff --git a/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp b/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp
index 8d5af9458be5..5d47e0e3ebd3 100644
--- a/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp
+++ b/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp
@@ -42,6 +42,10 @@ static cl::
 opt<bool> DisableVSXSwapRemoval("disable-ppc-vsx-swap-removal", cl::Hidden,
                                 cl::desc("Disable VSX Swap Removal for PPC"));
 
+static cl::
+opt<bool> DisableQPXLoadSplat("disable-ppc-qpx-load-splat", cl::Hidden,
+                              cl::desc("Disable QPX load splat simplification"));
+
 static cl::
 opt<bool> DisableMIPeephole("disable-ppc-peephole", cl::Hidden,
                             cl::desc("Disable machine peepholes for PPC"));
@@ -388,8 +392,15 @@ void PPCPassConfig::addPreRegAlloc() {
 }
 
 void PPCPassConfig::addPreSched2() {
-  if (getOptLevel() != CodeGenOpt::None)
+  if (getOptLevel() != CodeGenOpt::None) {
     addPass(&IfConverterID);
+
+    // This optimization must happen after anything that might do store-to-load
+    // forwarding. Here we're after RA (and, thus, when spills are inserted)
+    // but before post-RA scheduling.
+    if (!DisableQPXLoadSplat)
+      addPass(createPPCQPXLoadSplatPass());
+  }
 }
 
 void PPCPassConfig::addPreEmitPass() {
diff --git a/llvm/test/CodeGen/PowerPC/qpx-load-splat.ll b/llvm/test/CodeGen/PowerPC/qpx-load-splat.ll
new file mode 100644
index 000000000000..9feceb996a66
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/qpx-load-splat.ll
@@ -0,0 +1,83 @@
+; RUN: llc < %s | FileCheck %s
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-bgq-linux"
+
+; Function Attrs: norecurse nounwind readonly
+define <4 x double> @foo(double* nocapture readonly %a) #0 {
+entry:
+  %0 = load double, double* %a, align 8, !tbaa !1
+  %vecinit.i = insertelement <4 x double> undef, double %0, i32 0
+  %shuffle.i = shufflevector <4 x double> %vecinit.i, <4 x double> undef, <4 x i32> zeroinitializer
+  ret <4 x double> %shuffle.i
+
+; CHECK-LABEL: @foo
+; CHECK: lfd 1, 0(3)
+; CHECK: blr
+}
+
+define <4 x double> @foox(double* nocapture readonly %a, i64 %idx) #0 {
+entry:
+  %p = getelementptr double, double* %a, i64 %idx
+  %0 = load double, double* %p, align 8, !tbaa !1
+  %vecinit.i = insertelement <4 x double> undef, double %0, i32 0
+  %shuffle.i = shufflevector <4 x double> %vecinit.i, <4 x double> undef, <4 x i32> zeroinitializer
+  ret <4 x double> %shuffle.i
+
+; CHECK-LABEL: @foox
+; CHECK: sldi [[REG1:[0-9]+]], 4, 3
+; CHECK: lfdx 1, 3, [[REG1]]
+; CHECK: blr
+}
+
+define <4 x double> @fooxu(double* nocapture readonly %a, i64 %idx, double** %pptr) #0 {
+entry:
+  %p = getelementptr double, double* %a, i64 %idx
+  %0 = load double, double* %p, align 8, !tbaa !1
+  %vecinit.i = insertelement <4 x double> undef, double %0, i32 0
+  %shuffle.i = shufflevector <4 x double> %vecinit.i, <4 x double> undef, <4 x i32> zeroinitializer
+  store double* %p, double** %pptr, align 8
+  ret <4 x double> %shuffle.i
+
+; CHECK-LABEL: @fooxu
+; CHECK: sldi [[REG1:[0-9]+]], 4, 3
+; CHECK: lfdux 1, 3, [[REG1]]
+; CHECK: std 3, 0(5)
+; CHECK: blr
+}
+
+define <4 x float> @foof(float* nocapture readonly %a) #0 {
+entry:
+  %0 = load float, float* %a, align 4, !tbaa !1
+  %vecinit.i = insertelement <4 x float> undef, float %0, i32 0
+  %shuffle.i = shufflevector <4 x float> %vecinit.i, <4 x float> undef, <4 x i32> zeroinitializer
+  ret <4 x float> %shuffle.i
+
+; CHECK-LABEL: @foof
+; CHECK: lfs 1, 0(3)
+; CHECK: blr
+}
+
+define <4 x float> @foofx(float* nocapture readonly %a, i64 %idx) #0 {
+entry:
+  %p = getelementptr float, float* %a, i64 %idx
+  %0 = load float, float* %p, align 4, !tbaa !1
+  %vecinit.i = insertelement <4 x float> undef, float %0, i32 0
+  %shuffle.i = shufflevector <4 x float> %vecinit.i, <4 x float> undef, <4 x i32> zeroinitializer
+  ret <4 x float> %shuffle.i
+
+; CHECK-LABEL: @foofx
+; CHECK: sldi [[REG1:[0-9]+]], 4, 2
+; CHECK: lfsx 1, 3, [[REG1]]
+; CHECK: blr
+}
+
+attributes #0 = { norecurse nounwind readonly "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="a2q" "target-features"="+qpx,-altivec,-bpermd,-crypto,-direct-move,-extdiv,-power8-vector,-vsx" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!llvm.ident = !{!0}
+
+!0 = !{!"bgclang r264510-20160326 clang version 3.9.0 (based on LLVM 3.9.0svn)"}
+!1 = !{!2, !2, i64 0}
+!2 = !{!"double", !3, i64 0}
+!3 = !{!"omnipotent char", !4, i64 0}
+!4 = !{!"Simple C/C++ TBAA"}