[PowerPC] Add a late MI-level pass for QPX load/splat simplification
Chapter 3 of the QPX manual states that, "Scalar floating-point load instructions, defined in the Power ISA, cause a replication of the source data across all elements of the target register." Thus, if we have a load followed by a QPX splat (from the first lane), the splat is redundant. This adds a late MI-level pass to remove the redundant splats in some of these cases (specifically when both occur in the same basic block). This optimization is scheduled just prior to post-RA scheduling. It can't happen before anything that might replace the load with some already-computed quantity (i.e. store-to-load forwarding). llvm-svn: 265047
This commit is contained in:
parent
132cd62121
commit
fc35391f2b
|
@ -29,6 +29,7 @@ add_llvm_target(PowerPCCodeGen
|
|||
PPCMachineFunctionInfo.cpp
|
||||
PPCMIPeephole.cpp
|
||||
PPCRegisterInfo.cpp
|
||||
PPCQPXLoadSplat.cpp
|
||||
PPCSubtarget.cpp
|
||||
PPCTargetMachine.cpp
|
||||
PPCTargetObjectFile.cpp
|
||||
|
|
|
@ -42,6 +42,7 @@ namespace llvm {
|
|||
FunctionPass *createPPCVSXSwapRemovalPass();
|
||||
FunctionPass *createPPCMIPeepholePass();
|
||||
FunctionPass *createPPCBranchSelectionPass();
|
||||
FunctionPass *createPPCQPXLoadSplatPass();
|
||||
FunctionPass *createPPCISelDag(PPCTargetMachine &TM);
|
||||
FunctionPass *createPPCTLSDynamicCallPass();
|
||||
FunctionPass *createPPCBoolRetToIntPass();
|
||||
|
|
|
@ -7187,9 +7187,6 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
|
|||
SplatIdx -= 4;
|
||||
}
|
||||
|
||||
// FIXME: If SplatIdx == 0 and the input came from a load, then there is
|
||||
// nothing to do.
|
||||
|
||||
return DAG.getNode(PPCISD::QVESPLATI, dl, VT, V1,
|
||||
DAG.getConstant(SplatIdx, dl, MVT::i32));
|
||||
}
|
||||
|
|
|
@ -0,0 +1,156 @@
|
|||
//===----- PPCQPXLoadSplat.cpp - QPX Load Splat Simplification ------------===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// The QPX vector registers overlay the scalar floating-point registers, and
|
||||
// any scalar floating-point loads splat their value across all vector lanes.
|
||||
// Thus, if we have a scalar load followed by a splat, we can remove the splat
|
||||
// (i.e. make the load define the splat's result register and erase the splat).
|
||||
//
|
||||
// This pass must run after anything that might do store-to-load forwarding.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "PPC.h"
|
||||
#include "PPCInstrBuilder.h"
|
||||
#include "PPCInstrInfo.h"
|
||||
#include "llvm/ADT/SmallVector.h"
|
||||
#include "llvm/ADT/Statistic.h"
|
||||
#include "llvm/CodeGen/MachineFunctionPass.h"
|
||||
#include "llvm/Support/MathExtras.h"
|
||||
#include "llvm/Target/TargetMachine.h"
|
||||
#include "llvm/Target/TargetSubtargetInfo.h"
|
||||
using namespace llvm;
|
||||
|
||||
#define DEBUG_TYPE "ppc-qpx-load-splat"
|
||||
|
||||
STATISTIC(NumSimplified, "Number of QPX load splats simplified");
|
||||
|
||||
namespace llvm {
|
||||
// Forward declaration of the pass initializer; it is called from the
// PPCQPXLoadSplat constructor with the global PassRegistry.
// NOTE(review): presumably defined by the INITIALIZE_PASS expansion further
// down in this file -- confirm.
void initializePPCQPXLoadSplatPass(PassRegistry&);
|
||||
}
|
||||
|
||||
namespace {
|
||||
// Machine-function pass that removes QPX lane-0 splats whose source was
// produced by a scalar floating-point load in the same basic block (the
// matching and rewriting live in runOnMachineFunction).
struct PPCQPXLoadSplat : public MachineFunctionPass {
|
||||
// Pass identifier; its address is handed to the MachineFunctionPass base.
static char ID;
|
||||
// Constructing the pass also runs its initializer against the global
// pass registry.
PPCQPXLoadSplat() : MachineFunctionPass(ID) {
|
||||
initializePPCQPXLoadSplatPass(*PassRegistry::getPassRegistry());
|
||||
}
|
||||
|
||||
// Returns true if at least one splat was removed from the function.
bool runOnMachineFunction(MachineFunction &Fn) override;
|
||||
|
||||
// Human-readable pass name.
const char *getPassName() const override {
|
||||
return "PowerPC QPX Load Splat Simplification";
|
||||
}
|
||||
};
|
||||
// Out-of-class definition of the pass identifier.
char PPCQPXLoadSplat::ID = 0;
|
||||
}
|
||||
|
||||
// Pass registration boilerplate: pass class, command-line argument string
// (same text as DEBUG_TYPE), descriptive name, and two boolean flags.
// NOTE(review): the two flags are presumably "CFG-only" and "is analysis",
// both false here -- confirm against the INITIALIZE_PASS macro definition.
INITIALIZE_PASS(PPCQPXLoadSplat, "ppc-qpx-load-splat",
|
||||
"PowerPC QPX Load Splat Simplification",
|
||||
false, false)
|
||||
|
||||
// Factory for the QPX load/splat simplification pass. Returns a newly
// heap-allocated PPCQPXLoadSplat.
// NOTE(review): ownership presumably transfers to whoever adds the pass to a
// pass manager -- confirm.
FunctionPass *llvm::createPPCQPXLoadSplatPass() {
|
||||
return new PPCQPXLoadSplat();
|
||||
}
|
||||
|
||||
// Walks every basic block of MF bottom-up. Lane-0 QVESPLATI/QVESPLATIs/
// QVESPLATIb instructions whose source operand is killed are remembered as
// candidates; when an earlier scalar FP load that writes a candidate's source
// register is reached, the load is retargeted to the splat's destination and
// the splat is erased. Candidates are never carried across basic blocks.
// Returns true if any splat was removed.
bool PPCQPXLoadSplat::runOnMachineFunction(MachineFunction &MF) {
|
||||
bool MadeChange = false;
|
||||
const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
|
||||
|
||||
for (auto MFI = MF.begin(), MFIE = MF.end(); MFI != MFIE; ++MFI) {
|
||||
MachineBasicBlock *MBB = &*MFI;
|
||||
// Candidate splats found so far in this block (i.e. located after the
// instruction currently being visited, since the walk is in reverse).
SmallVector<MachineInstr *, 4> Splats;
|
||||
|
||||
for (auto MBBI = MBB->rbegin(); MBBI != MBB->rend(); ++MBBI) {
|
||||
MachineInstr *MI = &*MBBI;
|
||||
|
||||
// A call or an instruction with unmodeled side effects discards every
// pending candidate.
if (MI->hasUnmodeledSideEffects() || MI->isCall()) {
|
||||
Splats.clear();
|
||||
continue;
|
||||
}
|
||||
|
||||
// We're looking for a sequence like this:
|
||||
// %F0<def> = LFD 0, %X3<kill>, %QF0<imp-def>; mem:LD8[%a](tbaa=!2)
|
||||
// %QF1<def> = QVESPLATI %QF0<kill>, 0, %RM<imp-use>
|
||||
|
||||
// Check MI against each pending candidate. SI is advanced manually because
// entries are erased from Splats while iterating.
for (auto SI = Splats.begin(); SI != Splats.end();) {
|
||||
MachineInstr *SMI = *SI;
|
||||
unsigned SplatReg = SMI->getOperand(0).getReg();
|
||||
unsigned SrcReg = SMI->getOperand(1).getReg();
|
||||
|
||||
// MI writes the splat's source register.
if (MI->modifiesRegister(SrcReg, TRI)) {
|
||||
switch (MI->getOpcode()) {
|
||||
// Any writer other than the scalar FP loads listed below disqualifies the
// candidate. The `continue` resumes the loop over Splats.
default:
|
||||
SI = Splats.erase(SI);
|
||||
continue;
|
||||
case PPC::LFS:
|
||||
case PPC::LFD:
|
||||
case PPC::LFSU:
|
||||
case PPC::LFDU:
|
||||
case PPC::LFSUX:
|
||||
case PPC::LFDUX:
|
||||
case PPC::LFSX:
|
||||
case PPC::LFDX:
|
||||
case PPC::LFIWAX:
|
||||
case PPC::LFIWZX:
|
||||
// When the splat already writes its own source register, the load needs no
// rewriting and only the splat is removed.
if (SplatReg != SrcReg) {
|
||||
// We need to change the load to define the scalar subregister of
|
||||
// the QPX splat source register.
|
||||
unsigned SubRegIndex =
|
||||
TRI->getSubRegIndex(SrcReg, MI->getOperand(0).getReg());
|
||||
unsigned SplatSubReg = TRI->getSubReg(SplatReg, SubRegIndex);
|
||||
|
||||
// Substitute both the explicit defined register, and also the
|
||||
// implicit def of the containing QPX register.
|
||||
MI->getOperand(0).setReg(SplatSubReg);
|
||||
MI->substituteRegister(SrcReg, SplatReg, 0, *TRI);
|
||||
}
|
||||
|
||||
// Drop the candidate from the list before the instruction itself is deleted.
SI = Splats.erase(SI);
|
||||
|
||||
// If SMI is directly after MI, then MBBI's base iterator is
|
||||
// pointing at SMI. Adjust MBBI around the call to erase SMI to
|
||||
// avoid invalidating MBBI.
|
||||
++MBBI;
|
||||
SMI->eraseFromParent();
|
||||
--MBBI;
|
||||
|
||||
++NumSimplified;
|
||||
MadeChange = true;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
// MI writes the splat's destination register; drop the candidate.
if (MI->modifiesRegister(SplatReg, TRI)) {
|
||||
SI = Splats.erase(SI);
|
||||
continue;
|
||||
}
|
||||
|
||||
++SI;
|
||||
}
|
||||
|
||||
// From here on, decide whether MI itself becomes a new candidate: it must be
// one of the three QVESPLATI variants with an immediate (operand 2) of 0.
if (MI->getOpcode() != PPC::QVESPLATI &&
|
||||
MI->getOpcode() != PPC::QVESPLATIs &&
|
||||
MI->getOpcode() != PPC::QVESPLATIb)
|
||||
continue;
|
||||
if (MI->getOperand(2).getImm() != 0)
|
||||
continue;
|
||||
|
||||
// If there are other uses of the scalar value after this, replacing
|
||||
// those uses might be non-trivial.
|
||||
if (!MI->getOperand(1).isKill())
|
||||
continue;
|
||||
|
||||
Splats.push_back(MI);
|
||||
}
|
||||
}
|
||||
|
||||
return MadeChange;
|
||||
}
|
||||
|
|
@ -42,6 +42,10 @@ static cl::
|
|||
opt<bool> DisableVSXSwapRemoval("disable-ppc-vsx-swap-removal", cl::Hidden,
|
||||
cl::desc("Disable VSX Swap Removal for PPC"));
|
||||
|
||||
// Hidden boolean command-line option; when set, addPreSched2 does not add the
// QPX load/splat simplification pass.
static cl::
|
||||
opt<bool> DisableQPXLoadSplat("disable-ppc-qpx-load-splat", cl::Hidden,
|
||||
cl::desc("Disable QPX load splat simplification"));
|
||||
|
||||
static cl::
|
||||
opt<bool> DisableMIPeephole("disable-ppc-peephole", cl::Hidden,
|
||||
cl::desc("Disable machine peepholes for PPC"));
|
||||
|
@ -388,8 +392,15 @@ void PPCPassConfig::addPreRegAlloc() {
|
|||
}
|
||||
|
||||
// Adds the passes that run before the second scheduling phase. When
// optimizing, this adds the if-converter and then, unless disabled via
// DisableQPXLoadSplat, the QPX load/splat simplification pass.
// NOTE(review): the two consecutive identical opt-level conditions below
// appear to be the removed and added sides of the diff hunk rendered
// together; as written the outer test is redundant -- confirm against the
// actual file.
void PPCPassConfig::addPreSched2() {
|
||||
if (getOptLevel() != CodeGenOpt::None)
|
||||
if (getOptLevel() != CodeGenOpt::None) {
|
||||
addPass(&IfConverterID);
|
||||
|
||||
// This optimization must happen after anything that might do store-to-load
|
||||
// forwarding. Here we're after RA (and, thus, when spills are inserted)
|
||||
// but before post-RA scheduling.
|
||||
if (!DisableQPXLoadSplat)
|
||||
addPass(createPPCQPXLoadSplatPass());
|
||||
}
|
||||
}
|
||||
|
||||
void PPCPassConfig::addPreEmitPass() {
|
||||
|
|
|
@ -0,0 +1,83 @@
|
|||
; RUN: llc < %s | FileCheck %s
|
||||
target datalayout = "E-m:e-i64:64-n32:64"
|
||||
target triple = "powerpc64-bgq-linux"
|
||||
|
||||
; Function Attrs: norecurse nounwind readonly
|
||||
define <4 x double> @foo(double* nocapture readonly %a) #0 {
|
||||
entry:
|
||||
%0 = load double, double* %a, align 8, !tbaa !1
|
||||
%vecinit.i = insertelement <4 x double> undef, double %0, i32 0
|
||||
%shuffle.i = shufflevector <4 x double> %vecinit.i, <4 x double> undef, <4 x i32> zeroinitializer
|
||||
ret <4 x double> %shuffle.i
|
||||
|
||||
; CHECK-LABEL: @foo
|
||||
; CHECK: lfd 1, 0(3)
|
||||
; CHECK: blr
|
||||
}
|
||||
|
||||
define <4 x double> @foox(double* nocapture readonly %a, i64 %idx) #0 {
|
||||
entry:
|
||||
%p = getelementptr double, double* %a, i64 %idx
|
||||
%0 = load double, double* %p, align 8, !tbaa !1
|
||||
%vecinit.i = insertelement <4 x double> undef, double %0, i32 0
|
||||
%shuffle.i = shufflevector <4 x double> %vecinit.i, <4 x double> undef, <4 x i32> zeroinitializer
|
||||
ret <4 x double> %shuffle.i
|
||||
|
||||
; CHECK-LABEL: @foox
|
||||
; CHECK: sldi [[REG1:[0-9]+]], 4, 3
|
||||
; CHECK: lfdx 1, 3, [[REG1]]
|
||||
; CHECK: blr
|
||||
}
|
||||
|
||||
define <4 x double> @fooxu(double* nocapture readonly %a, i64 %idx, double** %pptr) #0 {
|
||||
entry:
|
||||
%p = getelementptr double, double* %a, i64 %idx
|
||||
%0 = load double, double* %p, align 8, !tbaa !1
|
||||
%vecinit.i = insertelement <4 x double> undef, double %0, i32 0
|
||||
%shuffle.i = shufflevector <4 x double> %vecinit.i, <4 x double> undef, <4 x i32> zeroinitializer
|
||||
store double* %p, double** %pptr, align 8
|
||||
ret <4 x double> %shuffle.i
|
||||
|
||||
; CHECK-LABEL: @fooxu
|
||||
; CHECK: sldi [[REG1:[0-9]+]], 4, 3
|
||||
; CHECK: lfdux 1, 3, [[REG1]]
|
||||
; CHECK: std 3, 0(5)
|
||||
; CHECK: blr
|
||||
}
|
||||
|
||||
define <4 x float> @foof(float* nocapture readonly %a) #0 {
|
||||
entry:
|
||||
%0 = load float, float* %a, align 4, !tbaa !1
|
||||
%vecinit.i = insertelement <4 x float> undef, float %0, i32 0
|
||||
%shuffle.i = shufflevector <4 x float> %vecinit.i, <4 x float> undef, <4 x i32> zeroinitializer
|
||||
ret <4 x float> %shuffle.i
|
||||
|
||||
; CHECK-LABEL: @foof
|
||||
; CHECK: lfs 1, 0(3)
|
||||
; CHECK: blr
|
||||
}
|
||||
|
||||
define <4 x float> @foofx(float* nocapture readonly %a, i64 %idx) #0 {
|
||||
entry:
|
||||
%p = getelementptr float, float* %a, i64 %idx
|
||||
%0 = load float, float* %p, align 4, !tbaa !1
|
||||
%vecinit.i = insertelement <4 x float> undef, float %0, i32 0
|
||||
%shuffle.i = shufflevector <4 x float> %vecinit.i, <4 x float> undef, <4 x i32> zeroinitializer
|
||||
ret <4 x float> %shuffle.i
|
||||
|
||||
; CHECK-LABEL: @foofx
|
||||
; CHECK: sldi [[REG1:[0-9]+]], 4, 2
|
||||
; CHECK: lfsx 1, 3, [[REG1]]
|
||||
; CHECK: blr
|
||||
}
|
||||
|
||||
attributes #0 = { norecurse nounwind readonly "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="a2q" "target-features"="+qpx,-altivec,-bpermd,-crypto,-direct-move,-extdiv,-power8-vector,-vsx" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
||||
|
||||
!llvm.ident = !{!0}
|
||||
|
||||
!0 = !{!"bgclang r264510-20160326 clang version 3.9.0 (based on LLVM 3.9.0svn)"}
|
||||
!1 = !{!2, !2, i64 0}
|
||||
!2 = !{!"double", !3, i64 0}
|
||||
!3 = !{!"omnipotent char", !4, i64 0}
|
||||
!4 = !{!"Simple C/C++ TBAA"}
|
||||
|
Loading…
Reference in New Issue