Revert revision 171524. Original message:

URL: http://llvm.org/viewvc/llvm-project?rev=171524&view=rev
Log:
The current Intel Atom microarchitecture has a feature whereby when a function
returns early then it is slightly faster to execute a sequence of NOP
instructions to wait until the return address is ready,
as opposed to simply stalling on the ret instruction
until the return address is ready.

When compiling for X86 Atom only, this patch will run a pass, called
"X86PadShortFunction" which will add NOP instructions where less than four
cycles elapse between function entry and return.

It includes tests.

Patch by Andy Zhang.

llvm-svn: 171603
This commit is contained in:
Nadav Rotem 2013-01-05 05:42:48 +00:00
parent c1520bbb34
commit 478b6a47ec
11 changed files with 8 additions and 282 deletions

View File

@ -25,7 +25,6 @@ set(sources
X86JITInfo.cpp X86JITInfo.cpp
X86MCInstLower.cpp X86MCInstLower.cpp
X86MachineFunctionInfo.cpp X86MachineFunctionInfo.cpp
X86PadShortFunction.cpp
X86RegisterInfo.cpp X86RegisterInfo.cpp
X86SelectionDAGInfo.cpp X86SelectionDAGInfo.cpp
X86Subtarget.cpp X86Subtarget.cpp

View File

@ -63,11 +63,6 @@ FunctionPass *createX86JITCodeEmitterPass(X86TargetMachine &TM,
/// ///
FunctionPass *createEmitX86CodeToMemory(); FunctionPass *createEmitX86CodeToMemory();
/// createX86PadShortFunctions - Return a pass that pads short functions
/// with NOOPs. This will prevent a stall when returning from the function
/// on the Atom.
FunctionPass *createX86PadShortFunctions();
} // End llvm namespace } // End llvm namespace
#endif #endif

View File

@ -123,11 +123,8 @@ def FeatureRTM : SubtargetFeature<"rtm", "HasRTM", "true",
def FeatureLeaForSP : SubtargetFeature<"lea-sp", "UseLeaForSP", "true", def FeatureLeaForSP : SubtargetFeature<"lea-sp", "UseLeaForSP", "true",
"Use LEA for adjusting the stack pointer">; "Use LEA for adjusting the stack pointer">;
def FeatureSlowDivide : SubtargetFeature<"idiv-to-divb", def FeatureSlowDivide : SubtargetFeature<"idiv-to-divb",
"HasSlowDivide", "true", "HasSlowDivide", "true",
"Use small divide for positive values less than 256">; "Use small divide for positive values less than 256">;
def FeaturePadShortFunctions : SubtargetFeature<"pad-short-functions",
"PadShortFunctions", "true",
"Pad short functions">;
//===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===//
// X86 processors supported. // X86 processors supported.
@ -170,7 +167,7 @@ def : Proc<"penryn", [FeatureSSE41, FeatureCMPXCHG16B,
FeatureSlowBTMem]>; FeatureSlowBTMem]>;
def : AtomProc<"atom", [ProcIntelAtom, FeatureSSSE3, FeatureCMPXCHG16B, def : AtomProc<"atom", [ProcIntelAtom, FeatureSSSE3, FeatureCMPXCHG16B,
FeatureMOVBE, FeatureSlowBTMem, FeatureLeaForSP, FeatureMOVBE, FeatureSlowBTMem, FeatureLeaForSP,
FeatureSlowDivide, FeaturePadShortFunctions]>; FeatureSlowDivide]>;
// "Arrandale" along with corei3 and corei5 // "Arrandale" along with corei3 and corei5
def : Proc<"corei7", [FeatureSSE42, FeatureCMPXCHG16B, def : Proc<"corei7", [FeatureSSE42, FeatureCMPXCHG16B,
FeatureSlowBTMem, FeatureFastUAMem, FeatureSlowBTMem, FeatureFastUAMem,

View File

@ -1,184 +0,0 @@
//===-------- X86PadShortFunction.cpp - pad short functions -----------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the pass which will pad short functions to prevent
// a stall if a function returns before the return address is ready. This
// is needed for some Intel Atom processors.
//
//===----------------------------------------------------------------------===//
#include <map>
#include <algorithm>
#define DEBUG_TYPE "x86-pad-short-functions"
#include "X86.h"
#include "X86InstrInfo.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetInstrInfo.h"
using namespace llvm;
STATISTIC(NumBBsPadded, "Number of basic blocks padded");
namespace {
  // Machine-function pass that pads functions returning within fewer than
  // Threshold cycles of entry with NOOP instructions. Needed on Intel Atom,
  // where executing NOPs until the return address is ready is faster than
  // stalling on the ret instruction.
  struct PadShortFunc : public MachineFunctionPass {
    static char ID;
    PadShortFunc() : MachineFunctionPass(ID)
                   , Threshold(4)
    {}

    virtual bool runOnMachineFunction(MachineFunction &MF);

    virtual const char *getPassName() const
    {
      return "X86 Atom pad short functions";
    }

  private:
    // addPadding - Insert NOOPsToAdd cycles worth of NOOP instructions
    // immediately before the return at MBBI.
    bool addPadding(MachineFunction &MF,
                    MachineBasicBlock &MBB,
                    MachineBasicBlock::iterator &MBBI,
                    unsigned int NOOPsToAdd);

    // findReturn - Walk the CFG starting at MBB, recording into ReturnBBs
    // every block whose return is reached in fewer than Threshold cycles.
    void findReturn(MachineFunction &MF,
                    MachineBasicBlock &MBB,
                    unsigned int Cycles);

    // cyclesUntilReturn - Return true if MBB contains a (non-call) return
    // instruction. Accumulates instruction latencies into Cycles and, when
    // Location is non-null, reports the return's position through it.
    bool cyclesUntilReturn(MachineFunction &MF,
                           MachineBasicBlock &MBB,
                           unsigned int &Cycles,
                           MachineBasicBlock::iterator *Location = 0);

    // Minimum number of cycles that must elapse between function entry and
    // a return; shorter paths are padded up to this count (4 for Atom).
    const unsigned int Threshold;

    // Map from basic-block number to the largest cycle count at which a
    // return in that block was reached during the findReturn walk.
    std::map<int, unsigned int> ReturnBBs;
  };

  char PadShortFunc::ID = 0;
}
/// createX86PadShortFunctions - Factory for the Atom short-function
/// padding pass; ownership of the returned pass transfers to the caller
/// (normally the pass manager).
FunctionPass *llvm::createX86PadShortFunctions() {
  PadShortFunc *Pass = new PadShortFunc();
  return Pass;
}
/// runOnMachineFunction - Loop over all of the basic blocks, inserting
/// NOOP instructions before early exits.
///
/// Always returns false: padding inserts instructions but changes neither
/// the CFG nor any register state tracked by other analyses.
bool PadShortFunc::runOnMachineFunction(MachineFunction &MF) {
  // Results from a previously processed function must not leak into this
  // one; ReturnBBs is a member so it persists across runs.
  ReturnBBs.clear();

  // Search through basic blocks and mark the ones that have early returns.
  // NOTE(review): assumes MF has at least one basic block -- confirm the
  // pass never runs on an empty function.
  findReturn(MF, *MF.begin(), 0);

  // Pad the identified basic blocks with NOOPs.
  for (std::map<int, unsigned int>::iterator I = ReturnBBs.begin();
       I != ReturnBBs.end(); ++I) {
    int BBNum = I->first;
    unsigned int Cycles = I->second;

    if (Cycles < Threshold) {
      MachineBasicBlock *MBB = MF.getBlockNumbered(BBNum);

      // Re-scan the block to locate the return instruction itself. The
      // cycle count from this scan is unused (Cycles from the walk above
      // determines the padding), but the accumulator must start at zero:
      // previously BBCycles was declared uninitialized and incremented
      // inside cyclesUntilReturn, an uninitialized read.
      unsigned int BBCycles = 0;
      MachineBasicBlock::iterator ReturnLoc;
      if (!cyclesUntilReturn(MF, *MBB, BBCycles, &ReturnLoc))
        continue;

      addPadding(MF, *MBB, ReturnLoc, Threshold - Cycles);
      NumBBsPadded++;
    }
  }

  return false;
}
/// findReturn - Starting at MBB, follow control flow and add all
/// basic blocks that contain a return to ReturnBBs.
///
/// Cycles is the count accumulated from function entry to the start of
/// MBB; it is passed by value so each CFG path keeps its own running total.
void PadShortFunc::findReturn(MachineFunction &MF,
                              MachineBasicBlock &MBB,
                              unsigned int Cycles)
{
  // If this BB has a return, note how many cycles it takes to get there.
  // cyclesUntilReturn adds this block's instruction latencies (up to the
  // return, or the whole block if there is none) into our local Cycles.
  bool hasReturn = cyclesUntilReturn(MF, MBB, Cycles);

  // Paths already at or past Threshold cycles never need padding, so the
  // walk can stop here.
  if (Cycles >= Threshold)
    return;

  if (hasReturn) {
    int BBNum = MBB.getNumber();
    // operator[] default-inserts 0 on first visit, so std::max records the
    // largest cycle count over all paths reaching this returning block.
    ReturnBBs[BBNum] = std::max(ReturnBBs[BBNum], Cycles);
    return;
  }

  // Follow branches in BB and look for returns.
  // NOTE(review): termination relies on Cycles eventually reaching
  // Threshold; a CFG cycle through blocks that contribute zero cycles
  // would recurse indefinitely -- confirm this cannot occur in practice.
  for (MachineBasicBlock::succ_iterator I = MBB.succ_begin();
       I != MBB.succ_end(); ++I) {
    findReturn(MF, **I, Cycles);
  }
}
/// cyclesUntilReturn - if the MBB has a return instruction, set Location
/// to the instruction and return true. Return false otherwise.
/// Cycles will be incremented by the number of cycles taken to reach the
/// return or the end of the BB, whichever occurs first.
bool PadShortFunc::cyclesUntilReturn(MachineFunction &MF,
                                     MachineBasicBlock &MBB,
                                     unsigned int &Cycles,
                                     MachineBasicBlock::iterator *Location)
{
  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
  const TargetMachine &Target = MF.getTarget();

  for (MachineBasicBlock::iterator MBBI = MBB.begin(); MBBI != MBB.end();
       ++MBBI) {
    MachineInstr *MI = MBBI;

    // Mark basic blocks with a return instruction. Calls to other functions
    // do not count because the called function will be padded, if necessary.
    if (MI->isReturn() && !MI->isCall()) {
      // Location is optional; callers that only need the cycle count or
      // the boolean answer pass the default null pointer.
      if (Location)
        *Location = MBBI;
      return true;
    }

    // Accumulate the scheduled latency of each instruction preceding the
    // return, per the target's instruction itinerary.
    Cycles += TII.getInstrLatency(Target.getInstrItineraryData(), MI);
  }

  return false;
}
/// addPadding - Add the given number of NOOP instructions to the function
/// right before the return at MBBI. Returns true unconditionally (the
/// block is always modified when this is called).
bool PadShortFunc::addPadding(MachineFunction &MF,
                              MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator &MBBI,
                              unsigned int NOOPsToAdd)
{
  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();

  // Give the inserted NOOPs the same debug location as the return they
  // precede.
  DebugLoc DL = MBBI->getDebugLoc();

  while (NOOPsToAdd-- > 0) {
    // Since Atom has two instruction execution ports,
    // the code emits two noops, which will be executed in parallel
    // during one cycle.
    BuildMI(MBB, MBBI, DL, TII.get(X86::NOOP));
    BuildMI(MBB, MBBI, DL, TII.get(X86::NOOP));
  }

  return true;
}

View File

@ -350,7 +350,6 @@ X86Subtarget::X86Subtarget(const std::string &TT, const std::string &CPU,
, UseLeaForSP(false) , UseLeaForSP(false)
, HasSlowDivide(false) , HasSlowDivide(false)
, PostRAScheduler(false) , PostRAScheduler(false)
, PadShortFunctions(false)
, stackAlignment(4) , stackAlignment(4)
// FIXME: this is a known good value for Yonah. How about others? // FIXME: this is a known good value for Yonah. How about others?
, MaxInlineSizeThreshold(128) , MaxInlineSizeThreshold(128)

View File

@ -146,10 +146,6 @@ protected:
/// PostRAScheduler - True if using post-register-allocation scheduler. /// PostRAScheduler - True if using post-register-allocation scheduler.
bool PostRAScheduler; bool PostRAScheduler;
/// PadShortFunctions - True if the short functions should be padded to prevent
/// a stall when returning too early.
bool PadShortFunctions;
/// stackAlignment - The minimum alignment known to hold of the stack frame on /// stackAlignment - The minimum alignment known to hold of the stack frame on
/// entry to the function and which must be maintained by every function. /// entry to the function and which must be maintained by every function.
unsigned stackAlignment; unsigned stackAlignment;
@ -235,7 +231,6 @@ public:
bool hasCmpxchg16b() const { return HasCmpxchg16b; } bool hasCmpxchg16b() const { return HasCmpxchg16b; }
bool useLeaForSP() const { return UseLeaForSP; } bool useLeaForSP() const { return UseLeaForSP; }
bool hasSlowDivide() const { return HasSlowDivide; } bool hasSlowDivide() const { return HasSlowDivide; }
bool padShortFunctions() const { return PadShortFunctions; }
bool isAtom() const { return X86ProcFamily == IntelAtom; } bool isAtom() const { return X86ProcFamily == IntelAtom; }

View File

@ -190,10 +190,6 @@ bool X86PassConfig::addPreEmitPass() {
addPass(createX86IssueVZeroUpperPass()); addPass(createX86IssueVZeroUpperPass());
ShouldPrint = true; ShouldPrint = true;
} }
if (getX86Subtarget().padShortFunctions()){
addPass(createX86PadShortFunctions());
ShouldPrint = true;
}
return ShouldPrint; return ShouldPrint;
} }

View File

@ -1,71 +0,0 @@
; Verify that when targeting Intel Atom the X86PadShortFunction pass inserts
; NOP instructions before returns in functions shorter than the 4-cycle
; threshold.
; RUN: llc < %s -mcpu=atom -mtriple=i686-linux | FileCheck %s
declare void @external_function(...)

; A bare return of an argument is the shortest possible function body, so
; it must receive padding NOPs before the ret.
define i32 @test_return_val(i32 %a) nounwind {
; CHECK: test_return_val
; CHECK: movl
; CHECK: nop
; CHECK: nop
; CHECK: nop
; CHECK: nop
; CHECK: nop
; CHECK: nop
; CHECK: ret
ret i32 %a
}

; One add before the return still falls below the threshold; fewer NOPs are
; expected than in the bare-return case above.
define i32 @test_add(i32 %a, i32 %b) nounwind {
; CHECK: test_add
; CHECK: addl
; CHECK: nop
; CHECK: nop
; CHECK: nop
; CHECK: nop
; CHECK: ret
%result = add i32 %a, %b
ret i32 %result
}

; Each early-returning basic block is padded independently, so both rets
; get their own NOPs.
define i32 @test_multiple_ret(i32 %a, i32 %b, i1 %c) nounwind {
; CHECK: @test_multiple_ret
; CHECK: je
; CHECK: nop
; CHECK: nop
; CHECK: ret
; CHECK: nop
; CHECK: nop
; CHECK: ret
br i1 %c, label %bb1, label %bb2

bb1:
ret i32 %a

bb2:
ret i32 %b
}

; A tail call to another function is not treated as a return (the callee is
; padded instead, if necessary); only the plain ret path gets NOPs.
define void @test_call_others(i32 %x) nounwind
{
; CHECK: test_call_others
; CHECK: je
%tobool = icmp eq i32 %x, 0
br i1 %tobool, label %if.end, label %true.case

; CHECK: jmp external_function
true.case:
tail call void bitcast (void (...)* @external_function to void ()*)() nounwind
br label %if.end

; CHECK: nop
; CHECK: nop
; CHECK: nop
; CHECK: nop
; CHECK: ret
if.end:
ret void
}

View File

@ -1,5 +1,5 @@
; RUN: llc < %s -mattr=-avx -fast-isel -mcpu=core2 -O0 -regalloc=fast -asm-verbose=0 -fast-isel-abort | FileCheck %s ; RUN: llc < %s -mattr=-avx -fast-isel -O0 -regalloc=fast -asm-verbose=0 -fast-isel-abort | FileCheck %s
; RUN: llc < %s -mattr=+avx -fast-isel -mcpu=core2 -O0 -regalloc=fast -asm-verbose=0 -fast-isel-abort | FileCheck %s --check-prefix=AVX ; RUN: llc < %s -mattr=+avx -fast-isel -O0 -regalloc=fast -asm-verbose=0 -fast-isel-abort | FileCheck %s --check-prefix=AVX
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
target triple = "x86_64-apple-darwin10.0.0" target triple = "x86_64-apple-darwin10.0.0"

View File

@ -1,4 +1,4 @@
; RUN: llc < %s -mtriple=x86_64-apple-darwin11 -mcpu=core2 -mattr=+mmx,+sse2 | FileCheck %s ; RUN: llc < %s -mtriple=x86_64-apple-darwin11 -mattr=+mmx,+sse2 | FileCheck %s
; rdar://6602459 ; rdar://6602459
@g_v1di = external global <1 x i64> @g_v1di = external global <1 x i64>

View File

@ -282,7 +282,7 @@ define i32 @test13(i32 %a, i32 %b) nounwind {
; ATOM: test13: ; ATOM: test13:
; ATOM: cmpl ; ATOM: cmpl
; ATOM-NEXT: sbbl ; ATOM-NEXT: sbbl
; ATOM: ret ; ATOM-NEXT: ret
} }
define i32 @test14(i32 %a, i32 %b) nounwind { define i32 @test14(i32 %a, i32 %b) nounwind {
@ -299,7 +299,7 @@ define i32 @test14(i32 %a, i32 %b) nounwind {
; ATOM: cmpl ; ATOM: cmpl
; ATOM-NEXT: sbbl ; ATOM-NEXT: sbbl
; ATOM-NEXT: notl ; ATOM-NEXT: notl
; ATOM: ret ; ATOM-NEXT: ret
} }
; rdar://10961709 ; rdar://10961709