From fc35391f2b4c36ec1a35d6e464ad972cf7117cdb Mon Sep 17 00:00:00 2001
From: Hal Finkel
Date: Thu, 31 Mar 2016 20:39:41 +0000
Subject: [PATCH] [PowerPC] Add a late MI-level pass for QPX load/splat
 simplification

Chapter 3 of the QPX manual states that, "Scalar floating-point load
instructions, defined in the Power ISA, cause a replication of the source
data across all elements of the target register." Thus, if we have a load
followed by a QPX splat (from the first lane), the splat is redundant. This
adds a late MI-level pass to remove the redundant splats in some of these
cases (specifically when both occur in the same basic block).

This optimization is scheduled just prior to post-RA scheduling. It can't
happen before anything that might replace the load with some already-computed
quantity (i.e. store-to-load forwarding).

llvm-svn: 265047
---
 llvm/lib/Target/PowerPC/CMakeLists.txt       |   1 +
 llvm/lib/Target/PowerPC/PPC.h                |   1 +
 llvm/lib/Target/PowerPC/PPCISelLowering.cpp  |   3 -
 llvm/lib/Target/PowerPC/PPCQPXLoadSplat.cpp  | 156 +++++++++++++++++++
 llvm/lib/Target/PowerPC/PPCTargetMachine.cpp |  13 +-
 llvm/test/CodeGen/PowerPC/qpx-load-splat.ll  |  83 ++++++++++
 6 files changed, 253 insertions(+), 4 deletions(-)
 create mode 100644 llvm/lib/Target/PowerPC/PPCQPXLoadSplat.cpp
 create mode 100644 llvm/test/CodeGen/PowerPC/qpx-load-splat.ll

diff --git a/llvm/lib/Target/PowerPC/CMakeLists.txt b/llvm/lib/Target/PowerPC/CMakeLists.txt
index e8316e937cbb..53c2ed3d51ea 100644
--- a/llvm/lib/Target/PowerPC/CMakeLists.txt
+++ b/llvm/lib/Target/PowerPC/CMakeLists.txt
@@ -29,6 +29,7 @@ add_llvm_target(PowerPCCodeGen
   PPCMachineFunctionInfo.cpp
   PPCMIPeephole.cpp
   PPCRegisterInfo.cpp
+  PPCQPXLoadSplat.cpp
   PPCSubtarget.cpp
   PPCTargetMachine.cpp
   PPCTargetObjectFile.cpp
diff --git a/llvm/lib/Target/PowerPC/PPC.h b/llvm/lib/Target/PowerPC/PPC.h
index a4235fa6e042..d4eee2204cf0 100644
--- a/llvm/lib/Target/PowerPC/PPC.h
+++ b/llvm/lib/Target/PowerPC/PPC.h
@@ -42,6 +42,7 @@ namespace llvm {
   FunctionPass *createPPCVSXSwapRemovalPass();
   FunctionPass *createPPCMIPeepholePass();
   FunctionPass *createPPCBranchSelectionPass();
+  FunctionPass *createPPCQPXLoadSplatPass();
   FunctionPass *createPPCISelDag(PPCTargetMachine &TM);
   FunctionPass *createPPCTLSDynamicCallPass();
   FunctionPass *createPPCBoolRetToIntPass();
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index d0f43434c39c..c645e076ae69 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -7187,9 +7187,6 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
       SplatIdx -= 4;
     }
 
-    // FIXME: If SplatIdx == 0 and the input came from a load, then there is
-    // nothing to do.
-
     return DAG.getNode(PPCISD::QVESPLATI, dl, VT, V1,
                        DAG.getConstant(SplatIdx, dl, MVT::i32));
   }
diff --git a/llvm/lib/Target/PowerPC/PPCQPXLoadSplat.cpp b/llvm/lib/Target/PowerPC/PPCQPXLoadSplat.cpp
new file mode 100644
index 000000000000..e15751e444cb
--- /dev/null
+++ b/llvm/lib/Target/PowerPC/PPCQPXLoadSplat.cpp
@@ -0,0 +1,156 @@
+//===----- PPCQPXLoadSplat.cpp - QPX Load Splat Simplification ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// The QPX vector registers overlay the scalar floating-point registers, and
+// any scalar floating-point loads splat their value across all vector lanes.
+// Thus, if we have a scalar load followed by a splat, we can remove the splat
+// (i.e. replace the load with a load-and-splat pseudo instruction).
+//
+// This pass must run after anything that might do store-to-load forwarding.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPC.h"
+#include "PPCInstrBuilder.h"
+#include "PPCInstrInfo.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "ppc-qpx-load-splat"
+
+STATISTIC(NumSimplified, "Number of QPX load splats simplified");
+
+namespace llvm {
+  void initializePPCQPXLoadSplatPass(PassRegistry&);
+}
+
+namespace {
+  struct PPCQPXLoadSplat : public MachineFunctionPass {
+    static char ID;
+    PPCQPXLoadSplat() : MachineFunctionPass(ID) {
+      initializePPCQPXLoadSplatPass(*PassRegistry::getPassRegistry());
+    }
+
+    bool runOnMachineFunction(MachineFunction &Fn) override;
+
+    const char *getPassName() const override {
+      return "PowerPC QPX Load Splat Simplification";
+    }
+  };
+  char PPCQPXLoadSplat::ID = 0;
+}
+
+INITIALIZE_PASS(PPCQPXLoadSplat, "ppc-qpx-load-splat",
+                "PowerPC QPX Load Splat Simplification",
+                false, false)
+
+FunctionPass *llvm::createPPCQPXLoadSplatPass() {
+  return new PPCQPXLoadSplat();
+}
+
+bool PPCQPXLoadSplat::runOnMachineFunction(MachineFunction &MF) {
+  bool MadeChange = false;
+  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+
+  for (auto MFI = MF.begin(), MFIE = MF.end(); MFI != MFIE; ++MFI) {
+    MachineBasicBlock *MBB = &*MFI;
+    SmallVector<MachineInstr *, 4> Splats;
+
+    for (auto MBBI = MBB->rbegin(); MBBI != MBB->rend(); ++MBBI) {
+      MachineInstr *MI = &*MBBI;
+
+      if (MI->hasUnmodeledSideEffects() || MI->isCall()) {
+        Splats.clear();
+        continue;
+      }
+
+      // We're looking for a sequence like this:
+      // %F0<def> = LFD 0, %X3<kill>, %QF0<imp-def>; mem:LD8[%a](tbaa=!2)
+      // %QF1<def> = QVESPLATI %QF0<kill>, 0, %RM<imp-use>
+
+      for (auto SI = Splats.begin(); SI != Splats.end();) {
+        MachineInstr *SMI = *SI;
+        unsigned SplatReg = SMI->getOperand(0).getReg();
+        unsigned SrcReg = SMI->getOperand(1).getReg();
+
+        if (MI->modifiesRegister(SrcReg, TRI)) {
+          switch (MI->getOpcode()) {
+          default:
+            SI = Splats.erase(SI);
+            continue;
+          case PPC::LFS:
+          case PPC::LFD:
+          case PPC::LFSU:
+          case PPC::LFDU:
+          case PPC::LFSUX:
+          case PPC::LFDUX:
+          case PPC::LFSX:
+          case PPC::LFDX:
+          case PPC::LFIWAX:
+          case PPC::LFIWZX:
+            if (SplatReg != SrcReg) {
+              // We need to change the load to define the scalar subregister of
+              // the QPX splat source register.
+              unsigned SubRegIndex =
+                TRI->getSubRegIndex(SrcReg, MI->getOperand(0).getReg());
+              unsigned SplatSubReg = TRI->getSubReg(SplatReg, SubRegIndex);
+
+              // Substitute both the explicit defined register, and also the
+              // implicit def of the containing QPX register.
+              MI->getOperand(0).setReg(SplatSubReg);
+              MI->substituteRegister(SrcReg, SplatReg, 0, *TRI);
+            }
+
+            SI = Splats.erase(SI);
+
+            // If SMI is directly after MI, then MBBI's base iterator is
+            // pointing at SMI. Adjust MBBI around the call to erase SMI to
+            // avoid invalidating MBBI.
+            ++MBBI;
+            SMI->eraseFromParent();
+            --MBBI;
+
+            ++NumSimplified;
+            MadeChange = true;
+            continue;
+          }
+        }
+
+        if (MI->modifiesRegister(SplatReg, TRI)) {
+          SI = Splats.erase(SI);
+          continue;
+        }
+
+        ++SI;
+      }
+
+      if (MI->getOpcode() != PPC::QVESPLATI &&
+          MI->getOpcode() != PPC::QVESPLATIs &&
+          MI->getOpcode() != PPC::QVESPLATIb)
+        continue;
+      if (MI->getOperand(2).getImm() != 0)
+        continue;
+
+      // If there are other uses of the scalar value after this, replacing
+      // those uses might be non-trivial.
+      if (!MI->getOperand(1).isKill())
+        continue;
+
+      Splats.push_back(MI);
+    }
+  }
+
+  return MadeChange;
+}
+
diff --git a/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp b/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp
index 8d5af9458be5..5d47e0e3ebd3 100644
--- a/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp
+++ b/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp
@@ -42,6 +42,10 @@ static cl::
 opt<bool> DisableVSXSwapRemoval("disable-ppc-vsx-swap-removal", cl::Hidden,
                                 cl::desc("Disable VSX Swap Removal for PPC"));
 
+static cl::
+opt<bool> DisableQPXLoadSplat("disable-ppc-qpx-load-splat", cl::Hidden,
+                              cl::desc("Disable QPX load splat simplification"));
+
 static cl::
 opt<bool> DisableMIPeephole("disable-ppc-peephole", cl::Hidden,
                             cl::desc("Disable machine peepholes for PPC"));
@@ -388,8 +392,15 @@ void PPCPassConfig::addPreRegAlloc() {
 }
 
 void PPCPassConfig::addPreSched2() {
-  if (getOptLevel() != CodeGenOpt::None)
+  if (getOptLevel() != CodeGenOpt::None) {
     addPass(&IfConverterID);
+
+    // This optimization must happen after anything that might do store-to-load
+    // forwarding. Here we're after RA (and, thus, when spills are inserted)
+    // but before post-RA scheduling.
+    if (!DisableQPXLoadSplat)
+      addPass(createPPCQPXLoadSplatPass());
+  }
 }
 
 void PPCPassConfig::addPreEmitPass() {
diff --git a/llvm/test/CodeGen/PowerPC/qpx-load-splat.ll b/llvm/test/CodeGen/PowerPC/qpx-load-splat.ll
new file mode 100644
index 000000000000..9feceb996a66
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/qpx-load-splat.ll
@@ -0,0 +1,83 @@
+; RUN: llc < %s | FileCheck %s
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-bgq-linux"
+
+; Function Attrs: norecurse nounwind readonly
+define <4 x double> @foo(double* nocapture readonly %a) #0 {
+entry:
+  %0 = load double, double* %a, align 8, !tbaa !1
+  %vecinit.i = insertelement <4 x double> undef, double %0, i32 0
+  %shuffle.i = shufflevector <4 x double> %vecinit.i, <4 x double> undef, <4 x i32> zeroinitializer
+  ret <4 x double> %shuffle.i
+
+; CHECK-LABEL: @foo
+; CHECK: lfd 1, 0(3)
+; CHECK: blr
+}
+
+define <4 x double> @foox(double* nocapture readonly %a, i64 %idx) #0 {
+entry:
+  %p = getelementptr double, double* %a, i64 %idx
+  %0 = load double, double* %p, align 8, !tbaa !1
+  %vecinit.i = insertelement <4 x double> undef, double %0, i32 0
+  %shuffle.i = shufflevector <4 x double> %vecinit.i, <4 x double> undef, <4 x i32> zeroinitializer
+  ret <4 x double> %shuffle.i
+
+; CHECK-LABEL: @foox
+; CHECK: sldi [[REG1:[0-9]+]], 4, 3
+; CHECK: lfdx 1, 3, [[REG1]]
+; CHECK: blr
+}
+
+define <4 x double> @fooxu(double* nocapture readonly %a, i64 %idx, double** %pptr) #0 {
+entry:
+  %p = getelementptr double, double* %a, i64 %idx
+  %0 = load double, double* %p, align 8, !tbaa !1
+  %vecinit.i = insertelement <4 x double> undef, double %0, i32 0
+  %shuffle.i = shufflevector <4 x double> %vecinit.i, <4 x double> undef, <4 x i32> zeroinitializer
+  store double* %p, double** %pptr, align 8
+  ret <4 x double> %shuffle.i
+
+; CHECK-LABEL: @fooxu
+; CHECK: sldi [[REG1:[0-9]+]], 4, 3
+; CHECK: lfdux 1, 3, [[REG1]]
+; CHECK: std 3, 0(5)
+; CHECK: blr
+}
+
+define <4 x float> @foof(float* nocapture readonly %a) #0 {
+entry:
+  %0 = load float, float* %a, align 4, !tbaa !1
+  %vecinit.i = insertelement <4 x float> undef, float %0, i32 0
+  %shuffle.i = shufflevector <4 x float> %vecinit.i, <4 x float> undef, <4 x i32> zeroinitializer
+  ret <4 x float> %shuffle.i
+
+; CHECK-LABEL: @foof
+; CHECK: lfs 1, 0(3)
+; CHECK: blr
+}
+
+define <4 x float> @foofx(float* nocapture readonly %a, i64 %idx) #0 {
+entry:
+  %p = getelementptr float, float* %a, i64 %idx
+  %0 = load float, float* %p, align 4, !tbaa !1
+  %vecinit.i = insertelement <4 x float> undef, float %0, i32 0
+  %shuffle.i = shufflevector <4 x float> %vecinit.i, <4 x float> undef, <4 x i32> zeroinitializer
+  ret <4 x float> %shuffle.i
+
+; CHECK-LABEL: @foofx
+; CHECK: sldi [[REG1:[0-9]+]], 4, 2
+; CHECK: lfsx 1, 3, [[REG1]]
+; CHECK: blr
+}
+
+attributes #0 = { norecurse nounwind readonly "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="a2q" "target-features"="+qpx,-altivec,-bpermd,-crypto,-direct-move,-extdiv,-power8-vector,-vsx" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!llvm.ident = !{!0}
+
+!0 = !{!"bgclang r264510-20160326 clang version 3.9.0 (based on LLVM 3.9.0svn)"}
+!1 = !{!2, !2, i64 0}
+!2 = !{!"double", !3, i64 0}
+!3 = !{!"omnipotent char", !4, i64 0}
+!4 = !{!"Simple C/C++ TBAA"}