Improve instruction scheduling for the PPC POWER7

Aside from a few minor latency corrections, the major change here is a new
hazard recognizer which focuses on better dispatch-group formation on the
POWER7. As with the PPC970's hazard recognizer, the most important thing it
does is avoid load-after-store hazards within the same dispatch group. It uses
the POWER7's special dispatch-group-terminating nop instruction (instead of
inserting multiple regular nop instructions). This new hazard recognizer makes
use of the scheduling dependency graph itself, built using AA information, to
robustly detect the possibility of load-after-store hazards.

Significant test-suite performance changes (the error bars are 99.5% confidence
intervals based on 5 test-suite runs both with and without the change;
negative values are speedups):

speedups:

MultiSource/Benchmarks/FreeBench/pcompress2/pcompress2
	-0.55171% +/- 0.333168%

MultiSource/Benchmarks/TSVC/CrossingThresholds-dbl/CrossingThresholds-dbl
	-17.5576% +/- 14.598%

MultiSource/Benchmarks/TSVC/Reductions-dbl/Reductions-dbl
	-29.5708% +/- 7.09058%

MultiSource/Benchmarks/TSVC/Reductions-flt/Reductions-flt
	-34.9471% +/- 11.4391%

SingleSource/Benchmarks/BenchmarkGame/puzzle
	-25.1347% +/- 11.0104%

SingleSource/Benchmarks/Misc/flops-8
	-17.7297% +/- 9.79061%

SingleSource/Benchmarks/Shootout-C++/ary3
	-35.5018% +/- 23.9458%

SingleSource/Regression/C/uint64_to_float
	-56.3165% +/- 25.4234%

SingleSource/UnitTests/Vectorizer/gcc-loops
	-18.5309% +/- 6.8496%

regressions:

MultiSource/Benchmarks/ASCI_Purple/SMG2000/smg2000
	18.351% +/- 12.156%

SingleSource/Benchmarks/Shootout-C++/methcall
	27.3086% +/- 14.4733%

llvm-svn: 197099
This commit is contained in:
Hal Finkel 2013-12-12 00:19:11 +00:00
parent 03071ab74c
commit ceb1f12d9a
8 changed files with 369 additions and 4 deletions

View File

@ -15,12 +15,221 @@
#include "PPCHazardRecognizers.h"
#include "PPC.h"
#include "PPCInstrInfo.h"
#include "PPCTargetMachine.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
/// Return true if placing \p SU in the current dispatch group would create a
/// hazard: either the branch hazard reported by isBCTRAfterSet(), or \p SU may
/// load and has a memory-ordering or barrier dependence on an instruction that
/// may store and that is already recorded in the current group.
bool PPCDispatchGroupSBHazardRecognizer::isLoadAfterStore(SUnit *SU) {
  // FIXME: Move this.
  if (isBCTRAfterSet(SU))
    return true;

  // Nodes without an instruction description, and instructions that cannot
  // load, can never be the load half of the hazard.
  const MCInstrDesc *Desc = DAG->getInstrDesc(SU);
  if (!Desc || !Desc->mayLoad())
    return false;

  // Examine every predecessor edge: we are looking for a store we are ordered
  // after that has already been recorded in the current dispatch group.
  const unsigned NumPreds = (unsigned) SU->Preds.size();
  for (unsigned PredIdx = 0; PredIdx != NumPreds; ++PredIdx) {
    const SDep &Dep = SU->Preds[PredIdx];
    SUnit *PredSU = Dep.getSUnit();

    const MCInstrDesc *PredDesc = DAG->getInstrDesc(PredSU);
    const bool PredMayStore = PredDesc && PredDesc->mayStore();
    if (!PredMayStore)
      continue;

    // Only memory-ordering and barrier edges matter here.
    if (!(Dep.isNormalMemory() || Dep.isBarrier()))
      continue;

    for (unsigned Slot = 0, GroupSize = CurGroup.size(); Slot != GroupSize;
         ++Slot) {
      if (CurGroup[Slot] == PredSU)
        return true;
    }
  }

  return false;
}
/// Return true if \p SU is a branch that has a data dependence on an
/// instruction of scheduling class IIC_SprMTSPR (one that sets the counter
/// register) which is already recorded in the current dispatch group.
bool PPCDispatchGroupSBHazardRecognizer::isBCTRAfterSet(SUnit *SU) {
  const MCInstrDesc *Desc = DAG->getInstrDesc(SU);
  if (!Desc || !Desc->isBranch())
    return false;

  // SU is a branch. Look through its predecessors for one that sets the
  // counter register, is connected by a data dependence, and sits in the
  // current dispatch group.
  const unsigned NumPreds = (unsigned) SU->Preds.size();
  for (unsigned PredIdx = 0; PredIdx != NumPreds; ++PredIdx) {
    const SDep &Dep = SU->Preds[PredIdx];
    SUnit *PredSU = Dep.getSUnit();

    const MCInstrDesc *PredDesc = DAG->getInstrDesc(PredSU);
    const bool PredSetsCounter =
      PredDesc && PredDesc->getSchedClass() == PPC::Sched::IIC_SprMTSPR;
    if (!PredSetsCounter)
      continue;

    // Control edges do not count as a data dependence.
    if (Dep.isCtrl())
      continue;

    for (unsigned Slot = 0, GroupSize = CurGroup.size(); Slot != GroupSize;
         ++Slot) {
      if (CurGroup[Slot] == PredSU)
        return true;
    }
  }

  return false;
}
// FIXME: Remove this when we don't need this:
// Local forward declaration of PPC::getNonRecordFormOpcode. mustComeFirst()
// below calls it with an instruction opcode and treats any result other than
// -1 as meaning that the instruction is a record-form instruction.
namespace llvm { namespace PPC { extern int getNonRecordFormOpcode(uint16_t); } }
// FIXME: A lot of code in PPCDispatchGroupSBHazardRecognizer is P7 specific.
/// Classify the instruction described by \p MCID for dispatch-group formation.
///
/// \param MCID   Description of the instruction being classified.
/// \param NSlots [out] Number of dispatch slots the instruction is counted as
///               occupying (1, 2 or 4).
/// \returns true if the instruction has to be the first one in its dispatch
///          group.
bool PPCDispatchGroupSBHazardRecognizer::mustComeFirst(const MCInstrDesc *MCID,
                                                       unsigned &NSlots) {
  // FIXME: Indirectly, this information is contained in the itinerary, and
  // we should derive it from there instead of separately specifying it
  // here.
  const unsigned SchedClass = MCID->getSchedClass();

  // Step 1: slot count by scheduling class.
  switch (SchedClass) {
  case PPC::Sched::IIC_IntDivW:
  case PPC::Sched::IIC_IntDivD:
  case PPC::Sched::IIC_LdStLoadUpd:
  case PPC::Sched::IIC_LdStLDU:
  case PPC::Sched::IIC_LdStLFDU:
  case PPC::Sched::IIC_LdStLFDUX:
  case PPC::Sched::IIC_LdStLHA:
  case PPC::Sched::IIC_LdStLHAU:
  case PPC::Sched::IIC_LdStLWA:
  case PPC::Sched::IIC_LdStSTDU:
  case PPC::Sched::IIC_LdStSTFDU:
    NSlots = 2;
    break;

  case PPC::Sched::IIC_LdStLoadUpdX:
  case PPC::Sched::IIC_LdStLDUX:
  case PPC::Sched::IIC_LdStLHAUX:
  case PPC::Sched::IIC_LdStLWARX:
  case PPC::Sched::IIC_LdStLDARX:
  case PPC::Sched::IIC_LdStSTDUX:
  case PPC::Sched::IIC_LdStSTDCX:
  case PPC::Sched::IIC_LdStSTWCX:
  case PPC::Sched::IIC_BrMCRX: // mtcr
    // FIXME: Add sync/isync (here and in the itinerary).
    NSlots = 4;
    break;

  default:
    NSlots = 1;
    break;
  }

  // Step 2: an otherwise single-slot instruction that has a non-record-form
  // counterpart (i.e. it is itself record-form) is counted as two slots.
  // FIXME: record-form instructions need a different itinerary class.
  if (NSlots == 1) {
    if (PPC::getNonRecordFormOpcode(MCID->getOpcode()) != -1)
      NSlots = 2;
  }

  // Step 3: these classes must lead a group regardless of their slot count.
  const bool AlwaysFirst = SchedClass == PPC::Sched::IIC_SprMFCR ||
                           SchedClass == PPC::Sched::IIC_SprMFCRF ||
                           SchedClass == PPC::Sched::IIC_SprMTSPR;
  if (AlwaysFirst)
    return true;

  // All multi-slot instructions must come first.
  return NSlots > 1;
}
/// Report NoopHazard when \p Stalls is zero and isLoadAfterStore() flags
/// \p SU against the current dispatch group; otherwise defer to the
/// scoreboard-based recognizer.
ScheduleHazardRecognizer::HazardType
PPCDispatchGroupSBHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
  const bool GroupHazard = Stalls == 0 && isLoadAfterStore(SU);
  if (!GroupHazard)
    return ScoreboardHazardRecognizer::getHazardType(SU, Stalls);

  return NoopHazard;
}
/// Return true if \p SU must be first in a dispatch group while the current
/// group already has slots in use; otherwise defer to the scoreboard-based
/// recognizer.
bool PPCDispatchGroupSBHazardRecognizer::ShouldPreferAnother(SUnit *SU) {
  if (const MCInstrDesc *Desc = DAG->getInstrDesc(SU)) {
    // The slot count is required by the helper's signature but unused here.
    unsigned SlotsNeeded;
    const bool FirstOnly = mustComeFirst(Desc, SlotsNeeded);
    if (FirstOnly && CurSlots != 0)
      return true;
  }

  return ScoreboardHazardRecognizer::ShouldPreferAnother(SU);
}
/// Return the number of noops to emit before \p SU.
///
/// When \p SU is flagged by isLoadAfterStore(), the current group is padded:
/// a single noop on POWER6/POWER7 directives (which use a group-terminating
/// nop), otherwise enough noops to reach five occupied slots. In every other
/// case the scoreboard-based recognizer decides.
unsigned PPCDispatchGroupSBHazardRecognizer::PreEmitNoops(SUnit *SU) {
  // We only need to fill out a maximum of 5 slots here: The 6th slot could
  // only be a second branch, and otherwise the next instruction will start a
  // new group.
  if (!isLoadAfterStore(SU) || CurSlots >= 6)
    return ScoreboardHazardRecognizer::PreEmitNoops(SU);

  const unsigned Directive =
    DAG->TM.getSubtarget<PPCSubtarget>().getDarwinDirective();

  // If we're using a special group-terminating nop, then we need only one.
  const bool HasGroupTerminatingNop =
    Directive == PPC::DIR_PWR6 || Directive == PPC::DIR_PWR7;
  if (HasGroupTerminatingNop)
    return 1;

  return 5 - CurSlots;
}
/// Update the dispatch-group bookkeeping for \p SU and then forward to the
/// scoreboard-based recognizer.
///
/// If five slots are already in use, or \p SU is a branch and the group
/// already holds one, the group state is reset. Otherwise \p SU is appended to
/// the current group, first starting a fresh group if \p SU must come first
/// but the group is not empty.
void PPCDispatchGroupSBHazardRecognizer::EmitInstruction(SUnit *SU) {
  const MCInstrDesc *Desc = DAG->getInstrDesc(SU);
  if (Desc) {
    const bool GroupIsFull = CurSlots == 5;
    const bool IsSecondBranch = Desc->isBranch() && CurBranches == 1;

    if (GroupIsFull || IsSecondBranch) {
      CurGroup.clear();
      CurBranches = 0;
      CurSlots = 0;
    } else {
      DEBUG(dbgs() << "**** Adding to dispatch group: SU(" <<
                      SU->NodeNum << "): ");
      DEBUG(DAG->dumpNode(SU));

      unsigned SlotsNeeded;
      const bool MustBeFirst = mustComeFirst(Desc, SlotsNeeded);

      // If this instruction must come first, but does not, then it starts a
      // new group.
      if (MustBeFirst && CurSlots != 0) {
        CurGroup.clear();
        CurBranches = 0;
        CurSlots = 0;
      }

      CurGroup.push_back(SU);
      CurSlots += SlotsNeeded;
      if (Desc->isBranch())
        ++CurBranches;
    }
  }

  ScoreboardHazardRecognizer::EmitInstruction(SU);
}
/// Forward directly to the scoreboard-based recognizer; no dispatch-group
/// state is touched here.
void PPCDispatchGroupSBHazardRecognizer::AdvanceCycle() {
  ScoreboardHazardRecognizer::AdvanceCycle();
}
/// Bottom-up scheduling is not supported by this recognizer: reaching this
/// function is treated as a programming error.
void PPCDispatchGroupSBHazardRecognizer::RecedeCycle() {
llvm_unreachable("Bottom-up scheduling not supported");
}
/// Discard the dispatch group being tracked (members, slot count and branch
/// count) and then reset the scoreboard-based recognizer.
void PPCDispatchGroupSBHazardRecognizer::Reset() {
  CurBranches = 0;
  CurSlots = 0;
  CurGroup.clear();

  ScoreboardHazardRecognizer::Reset();
}
/// Account for a noop in the dispatch-group bookkeeping.
///
/// On POWER6/POWER7 directives, or when the slot count is already six, the
/// group state is reset. Otherwise the noop is recorded as a null entry in the
/// current group and takes one slot.
void PPCDispatchGroupSBHazardRecognizer::EmitNoop() {
  const unsigned Directive =
    DAG->TM.getSubtarget<PPCSubtarget>().getDarwinDirective();
  const bool NopTerminatesGroup =
    Directive == PPC::DIR_PWR6 || Directive == PPC::DIR_PWR7;

  // An ordinary noop in a group that still has room: it simply takes a slot.
  if (!NopTerminatesGroup && CurSlots != 6) {
    CurGroup.push_back(0);
    ++CurSlots;
    return;
  }

  // If the group has now filled all of its slots, or if we're using a special
  // group-terminating nop, the group is complete.
  CurGroup.clear();
  CurBranches = 0;
  CurSlots = 0;
}
//===----------------------------------------------------------------------===//
// PowerPC 970 Hazard Recognizer
//

View File

@ -21,6 +21,32 @@
namespace llvm {
/// PPCDispatchGroupSBHazardRecognizer - This class implements a scoreboard-based
/// hazard recognizer for PPC ooo processors with dispatch-group hazards.
///
/// On top of the scoreboard it inherits, it keeps a record of the dispatch
/// group currently being formed so that hazards between members of the same
/// group can be detected.
class PPCDispatchGroupSBHazardRecognizer : public ScoreboardHazardRecognizer {
  /// The scheduling DAG passed to the constructor.
  const ScheduleDAG *DAG;

  /// Scheduling units recorded for the dispatch group being formed.
  SmallVector<SUnit *, 7> CurGroup;

  /// Number of dispatch slots counted as used in the current group.
  unsigned CurSlots;

  /// Number of branches recorded in the current group.
  unsigned CurBranches;

  /// Hazard queries against the current group.
  bool isLoadAfterStore(SUnit *SU);
  bool isBCTRAfterSet(SUnit *SU);

  /// Returns whether the instruction must lead a group; its slot count is
  /// written to \p NSlots.
  bool mustComeFirst(const MCInstrDesc *MCID, unsigned &NSlots);

public:
  /// Build a recognizer over \p DAG_ using \p ItinData, starting with an empty
  /// dispatch group.
  PPCDispatchGroupSBHazardRecognizer(const InstrItineraryData *ItinData,
                                     const ScheduleDAG *DAG_)
    : ScoreboardHazardRecognizer(ItinData, DAG_), DAG(DAG_),
      CurSlots(0), CurBranches(0) {}

  virtual HazardType getHazardType(SUnit *SU, int Stalls);
  virtual bool ShouldPreferAnother(SUnit *SU);
  virtual unsigned PreEmitNoops(SUnit *SU);
  virtual void EmitInstruction(SUnit *SU);
  virtual void AdvanceCycle();
  virtual void RecedeCycle();
  virtual void Reset();
  virtual void EmitNoop();
};
/// PPCHazardRecognizer970 - This class defines a finite state automata that
/// models the dispatch logic on the PowerPC 970 (aka G5) processor. This
/// promotes good dispatch group formation and implements noop insertion to

View File

@ -258,6 +258,15 @@ class DForm_4_zero<bits<6> opcode, dag OOL, dag IOL, string asmstr,
let Addr = 0;
}
// DForm_4 variant whose encoding fields are fully fixed: A and B are both set
// to the register number R and C is set to 0. It is used for instructions of
// the form "ori R, R, 0" that take no operands.
// NOTE(review): A, B and C are declared by DForm_4, outside this view;
// presumably the two register fields and the immediate -- confirm there.
class DForm_4_fixedreg_zero<bits<6> opcode, bits<5> R, dag OOL, dag IOL,
string asmstr, InstrItinClass itin,
list<dag> pattern>
: DForm_4<opcode, OOL, IOL, asmstr, itin, pattern> {
let A = R;
let B = R;
let C = 0;
}
class IForm_and_DForm_1<bits<6> opcode1, bit aa, bit lk, bits<6> opcode2,
dag OOL, dag IOL, string asmstr,
InstrItinClass itin, list<dag> pattern>

View File

@ -74,6 +74,9 @@ ScheduleHazardRecognizer *PPCInstrInfo::CreateTargetPostRAHazardRecognizer(
const ScheduleDAG *DAG) const {
unsigned Directive = TM.getSubtarget<PPCSubtarget>().getDarwinDirective();
if (Directive == PPC::DIR_PWR7)
return new PPCDispatchGroupSBHazardRecognizer(II, DAG);
// Most subtargets use a PPC970 recognizer.
if (Directive != PPC::DIR_440 && Directive != PPC::DIR_A2 &&
Directive != PPC::DIR_E500mc && Directive != PPC::DIR_E5500) {
@ -85,6 +88,56 @@ ScheduleHazardRecognizer *PPCInstrInfo::CreateTargetPostRAHazardRecognizer(
return new ScoreboardHazardRecognizer(II, DAG);
}
int PPCInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
const MachineInstr *DefMI, unsigned DefIdx,
const MachineInstr *UseMI,
unsigned UseIdx) const {
int Latency = PPCGenInstrInfo::getOperandLatency(ItinData, DefMI, DefIdx,
UseMI, UseIdx);
const MachineOperand &DefMO = DefMI->getOperand(DefIdx);
unsigned Reg = DefMO.getReg();
const TargetRegisterInfo *TRI = &getRegisterInfo();
bool IsRegCR;
if (TRI->isVirtualRegister(Reg)) {
const MachineRegisterInfo *MRI =
&DefMI->getParent()->getParent()->getRegInfo();
IsRegCR = MRI->getRegClass(Reg)->hasSuperClassEq(&PPC::CRRCRegClass) ||
MRI->getRegClass(Reg)->hasSuperClassEq(&PPC::CRBITRCRegClass);
} else {
IsRegCR = PPC::CRRCRegClass.contains(Reg) ||
PPC::CRBITRCRegClass.contains(Reg);
}
if (UseMI->isBranch() && IsRegCR) {
if (Latency < 0)
Latency = getInstrLatency(ItinData, DefMI);
// On some cores, there is an additional delay between writing to a condition
// register, and using it from a branch.
unsigned Directive = TM.getSubtarget<PPCSubtarget>().getDarwinDirective();
switch (Directive) {
default: break;
case PPC::DIR_7400:
case PPC::DIR_750:
case PPC::DIR_970:
case PPC::DIR_E5500:
case PPC::DIR_PWR4:
case PPC::DIR_PWR5:
case PPC::DIR_PWR5X:
case PPC::DIR_PWR6:
case PPC::DIR_PWR6X:
case PPC::DIR_PWR7:
Latency += 2;
break;
}
}
return Latency;
}
// Detect 32 -> 64-bit extensions where we may reuse the low sub-register.
bool PPCInstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
unsigned &SrcReg, unsigned &DstReg,
@ -218,10 +271,19 @@ PPCInstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const {
void PPCInstrInfo::insertNoop(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI) const {
DebugLoc DL;
BuildMI(MBB, MI, DL, get(PPC::NOP));
}
// This function is used for scheduling, and the nop wanted here is the type
// that terminates dispatch groups on the POWER cores.
unsigned Directive = TM.getSubtarget<PPCSubtarget>().getDarwinDirective();
unsigned Opcode;
switch (Directive) {
default: Opcode = PPC::NOP; break;
case PPC::DIR_PWR6: Opcode = PPC::NOP_GT_PWR6; break;
case PPC::DIR_PWR7: Opcode = PPC::NOP_GT_PWR7; break;
}
DebugLoc DL;
BuildMI(MBB, MI, DL, get(Opcode));
}
// Branch analysis.
// Note: If the condition register is set to CTR or CTR8 then this is a

View File

@ -95,6 +95,18 @@ public:
CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
const ScheduleDAG *DAG) const;
/// Operand latency between a def operand of \p DefMI and a use operand of
/// \p UseMI. The out-of-line definition adjusts the generated result when a
/// condition register feeds a branch.
virtual
int getOperandLatency(const InstrItineraryData *ItinData,
const MachineInstr *DefMI, unsigned DefIdx,
const MachineInstr *UseMI, unsigned UseIdx) const;
/// SDNode flavour of getOperandLatency: forwards unchanged to the
/// PPCGenInstrInfo implementation.
/// NOTE(review): presumably spelled out here so that this overload is not
/// hidden by the MachineInstr overload declared in this class -- confirm.
virtual
int getOperandLatency(const InstrItineraryData *ItinData,
                      SDNode *DefNode, unsigned DefIdx,
                      SDNode *UseNode, unsigned UseIdx) const {
  const int GenericLatency =
    PPCGenInstrInfo::getOperandLatency(ItinData, DefNode, DefIdx,
                                       UseNode, UseIdx);
  return GenericLatency;
}
/// Detect 32 -> 64-bit extensions where the low sub-register may be reused.
/// \p SrcReg, \p DstReg and \p SubIdx are output parameters.
bool isCoalescableExtInstr(const MachineInstr &MI,
unsigned &SrcReg, unsigned &DstReg,
unsigned &SubIdx) const;

View File

@ -1616,8 +1616,17 @@ def XORI : DForm_4<26, (outs gprc:$dst), (ins gprc:$src1, u16imm:$src2),
def XORIS : DForm_4<27, (outs gprc:$dst), (ins gprc:$src1, u16imm:$src2),
"xoris $dst, $src1, $src2", IIC_IntSimple,
[(set i32:$dst, (xor i32:$src1, imm16ShiftedZExt:$src2))]>;
// The ordinary no-op, printed as "nop".
def NOP : DForm_4_zero<24, (outs), (ins), "nop", IIC_IntSimple,
[]>;
let isCodeGenOnly = 1 in {
// The POWER6 and POWER7 have special group-terminating nops.
// They are printed as "ori 1, 1, 0" (POWER6) and "ori 2, 2, 0" (POWER7), with
// the register number fixed through DForm_4_fixedreg_zero.
// PPCInstrInfo::insertNoop picks between NOP and these by subtarget directive.
def NOP_GT_PWR6 : DForm_4_fixedreg_zero<24, 1, (outs), (ins),
"ori 1, 1, 0", IIC_IntSimple, []>;
def NOP_GT_PWR7 : DForm_4_fixedreg_zero<24, 2, (outs), (ins),
"ori 2, 2, 0", IIC_IntSimple, []>;
}
let isCompare = 1, neverHasSideEffects = 1 in {
def CMPWI : DForm_5_ext<11, (outs crrc:$crD), (ins gprc:$rA, s16imm:$imm),
"cmpwi $crD, $rA, $imm", IIC_IntCompare>;

View File

@ -93,6 +93,7 @@ def P7Itineraries : ProcessorItineraries<
P7_DU3, P7_DU4], 0>,
InstrStage<1, [P7_FX1, P7_FX2]>],
[1, 1, 1]>,
// FIXME: Add record-form itinerary data.
InstrItinData<IIC_IntDivW , [InstrStage<1, [P7_DU1], 0>,
InstrStage<1, [P7_DU2], 0>,
InstrStage<36, [P7_FX1, P7_FX2]>],
@ -290,7 +291,10 @@ def P7Itineraries : ProcessorItineraries<
InstrStage<1, [P7_DU4], 0>,
InstrStage<1, [P7_LS1, P7_LS2]>],
[1, 1, 1]>,
InstrItinData<IIC_BrMCRX , [InstrStage<1, [P7_DU4], 0>,
InstrItinData<IIC_BrMCRX , [InstrStage<1, [P7_DU1], 0>,
InstrStage<1, [P7_DU2], 0>,
InstrStage<1, [P7_DU3], 0>,
InstrStage<1, [P7_DU4], 0>,
InstrStage<1, [P7_CRU]>,
InstrStage<1, [P7_FX1, P7_FX2]>],
[3, 1]>, // mtcr
@ -300,6 +304,9 @@ def P7Itineraries : ProcessorItineraries<
InstrItinData<IIC_SprMFCRF , [InstrStage<1, [P7_DU1], 0>,
InstrStage<1, [P7_CRU]>],
[3, 1]>,
InstrItinData<IIC_SprMTSPR , [InstrStage<1, [P7_DU1], 0>,
InstrStage<1, [P7_FX1]>],
[4, 1]>, // mtctr
InstrItinData<IIC_FPGeneral , [InstrStage<1, [P7_DU1, P7_DU2,
P7_DU3, P7_DU4], 0>,
InstrStage<1, [P7_VS1, P7_VS2]>],

View File

@ -0,0 +1,31 @@
; RUN: llc < %s -mcpu=pwr7 | FileCheck %s
target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
target triple = "powerpc64-unknown-linux-gnu"
; Function Attrs: nounwind
; Three load/store pairs through pointer arguments that are not marked
; noalias: b -> a, c -> b, a -> d. The expected pwr7 output keeps each
; load/store pair together and places the POWER7 group-terminating nop
; ("ori 2, 2, 0") after each of the first two stores, i.e. before the load
; that follows it.
define void @foo(float* nocapture %a, float* nocapture %b, float* nocapture readonly %c, float* nocapture %d) #0 {
; CHECK-LABEL: @foo
entry:
%0 = load float* %b, align 4
store float %0, float* %a, align 4
%1 = load float* %c, align 4
store float %1, float* %b, align 4
%2 = load float* %a, align 4
store float %2, float* %d, align 4
ret void
; Expected sequence: load, store, nop, load, store, nop, load, store, return.
; CHECK: lfs [[REG1:[0-9]+]], 0(4)
; CHECK: stfs [[REG1]], 0(3)
; CHECK: ori 2, 2, 0
; CHECK: lfs [[REG2:[0-9]+]], 0(5)
; CHECK: stfs [[REG2]], 0(4)
; CHECK: ori 2, 2, 0
; CHECK: lfs [[REG3:[0-9]+]], 0(3)
; CHECK: stfs [[REG3]], 0(6)
; CHECK: blr
}
attributes #0 = { nounwind }