CXX_FAST_TLS calling convention: performance improvement for x86-64.
This is the same change on x86-64 as r255821 on AArch64. rdar://9001553 llvm-svn: 257428
This commit is contained in:
parent
921b04e9a4
commit
ed967f3752
|
@ -831,6 +831,12 @@ def CSR_Win64 : CalleeSavedRegs<(add RBX, RBP, RDI, RSI, R12, R13, R14, R15,
|
|||
// Callee-saved set for the Darwin 64-bit TLS convention: everything listed in
// CSR_64, followed by RCX, RDX, RSI and R8 through R11.
def CSR_64_TLS_Darwin
    : CalleeSavedRegs<(add CSR_64, RCX, RDX, RSI, R8, R9, R10, R11)>;
|
||||
|
||||
// CSRs that are handled by prologue, epilogue.
// RBP is kept in this set rather than in the via-copy set: when RBP serves as
// the frame pointer, the prologue/epilogue already save and restore it, and
// routing it through an explicit virtual-register copy would interfere with
// frame-pointer-relative accesses.
def CSR_64_CXX_TLS_Darwin_PE : CalleeSavedRegs<(add RBP)>;

// CSRs that are handled explicitly via copies.
// This is the Darwin TLS callee-saved set minus RBP, which is left to the
// prologue/epilogue (see CSR_64_CXX_TLS_Darwin_PE).
def CSR_64_CXX_TLS_Darwin_ViaCopy
    : CalleeSavedRegs<(sub CSR_64_TLS_Darwin, RBP)>;
|
||||
|
||||
// All GPRs - except r11
def CSR_64_RT_MostRegs
    : CalleeSavedRegs<(add CSR_64, RAX, RCX, RDX, RSI, RDI, R8, R9, R10, RSP)>;
|
||||
|
|
|
@ -1002,6 +1002,9 @@ bool X86FastISel::X86SelectRet(const Instruction *I) {
|
|||
if (!FuncInfo.CanLowerReturn)
|
||||
return false;
|
||||
|
||||
if (TLI.supportSplitCSR(FuncInfo.MF))
|
||||
return false;
|
||||
|
||||
CallingConv::ID CC = F.getCallingConv();
|
||||
if (CC != CallingConv::C &&
|
||||
CC != CallingConv::Fast &&
|
||||
|
|
|
@ -2311,6 +2311,18 @@ X86TargetLowering::LowerReturn(SDValue Chain,
|
|||
DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
|
||||
}
|
||||
|
||||
const X86RegisterInfo *TRI = Subtarget->getRegisterInfo();
|
||||
const MCPhysReg *I =
|
||||
TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
|
||||
if (I) {
|
||||
for (; *I; ++I) {
|
||||
if (X86::GR64RegClass.contains(*I))
|
||||
RetOps.push_back(DAG.getRegister(*I, MVT::i64));
|
||||
else
|
||||
llvm_unreachable("Unexpected register class in CSRsViaCopy!");
|
||||
}
|
||||
}
|
||||
|
||||
RetOps[0] = Chain; // Update chain.
|
||||
|
||||
// Add the flag if we have it.
|
||||
|
@ -28827,3 +28839,51 @@ bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeSet Attr) const {
|
|||
Attribute::MinSize);
|
||||
return OptSize && !VT.isVector();
|
||||
}
|
||||
|
||||
/// Mark the function that owns \p Entry as using split callee-saved-register
/// handling. Nothing is recorded unless the subtarget is 64-bit.
void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
  if (Subtarget->is64Bit()) {
    // Record IsSplitCSR in the function's X86MachineFunctionInfo.
    auto *FuncInfo = Entry->getParent()->getInfo<X86MachineFunctionInfo>();
    FuncInfo->setIsSplitCSR(true);
  }
}
|
||||
|
||||
/// Preserve the callee-saved registers that are handled "via copy".
///
/// For every register returned by getCalleeSavedRegsViaCopy, a COPY into a
/// fresh virtual register is inserted at the start of \p Entry, and a COPY
/// back into the physical register is inserted in each block of \p Exits.
/// Does nothing when the register info reports no via-copy registers.
///
/// \param Entry the block that receives the copies out of the CSRs.
/// \param Exits the blocks that receive the copies back into the CSRs.
void X86TargetLowering::insertCopiesSplitCSR(
    MachineBasicBlock *Entry,
    const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
  const X86RegisterInfo *TRI = Subtarget->getRegisterInfo();
  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
  if (!IStart)
    return;

  // FIXME: this currently does not emit CFI pseudo-instructions, it works
  // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
  // nounwind. If we want to generalize this later, we may need to emit
  // CFI pseudo-instructions.
  // The check does not depend on the register, so it is done once up front.
  assert(Entry->getParent()->getFunction()->hasFnAttribute(
             Attribute::NoUnwind) &&
         "Function should be nounwind in insertCopiesSplitCSR!");

  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
  for (const MCPhysReg *I = IStart; *I; ++I) {
    // Only 64-bit GPRs are expected in the via-copy list.
    if (!X86::GR64RegClass.contains(*I))
      llvm_unreachable("Unexpected register class in CSRsViaCopy!");
    const TargetRegisterClass *RC = &X86::GR64RegClass;

    unsigned NewVR = MRI->createVirtualRegister(RC);

    // Create copy from CSR to a virtual register.
    Entry->addLiveIn(*I);
    BuildMI(*Entry, Entry->begin(), DebugLoc(), TII->get(TargetOpcode::COPY),
            NewVR)
        .addReg(*I);

    // Insert the copy-back instructions right before the terminator. Placing
    // them at the beginning of the exit block is wrong: when the entry block
    // is also an exit block the restore would land in front of the save
    // emitted just above (reading NewVR before it is defined), and in any
    // exit block a later clobber of the register would not be undone.
    for (auto *Exit : Exits)
      BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
              TII->get(TargetOpcode::COPY), *I)
          .addReg(NewVR);
  }
}
|
||||
|
|
|
@ -1057,6 +1057,15 @@ namespace llvm {
|
|||
const SmallVectorImpl<SDValue> &OutVals,
|
||||
SDLoc dl, SelectionDAG &DAG) const override;
|
||||
|
||||
/// Split CSR handling is supported only for functions that use the
/// CXX_FAST_TLS calling convention and carry the nounwind attribute.
bool supportSplitCSR(MachineFunction *MF) const override {
  if (MF->getFunction()->getCallingConv() != CallingConv::CXX_FAST_TLS)
    return false;
  return MF->getFunction()->hasFnAttribute(Attribute::NoUnwind);
}
|
||||
void initializeSplitCSR(MachineBasicBlock *Entry) const override;
|
||||
void insertCopiesSplitCSR(
|
||||
MachineBasicBlock *Entry,
|
||||
const SmallVectorImpl<MachineBasicBlock *> &Exits) const override;
|
||||
|
||||
bool isUsedByReturnOnly(SDNode *N, SDValue &Chain) const override;
|
||||
|
||||
bool mayBeEmittedAsTailCall(CallInst *CI) const override;
|
||||
|
|
|
@ -92,6 +92,10 @@ class X86MachineFunctionInfo : public MachineFunctionInfo {
|
|||
/// used to address arguments in a function using a base pointer.
|
||||
int SEHFramePtrSaveIndex = 0;
|
||||
|
||||
/// True if this function has a subset of CSRs that is handled explicitly via
|
||||
/// copies.
|
||||
bool IsSplitCSR = false;
|
||||
|
||||
private:
|
||||
/// ForwardedMustTailRegParms - A list of virtual and physical registers
|
||||
/// that must be forwarded to every musttail call.
|
||||
|
@ -160,6 +164,9 @@ public:
|
|||
SmallVectorImpl<ForwardedRegister> &getForwardedMustTailRegParms() {
|
||||
return ForwardedMustTailRegParms;
|
||||
}
|
||||
|
||||
/// Returns the IsSplitCSR flag: true if this function has a subset of CSRs
/// that is handled explicitly via copies.
bool isSplitCSR() const { return IsSplitCSR; }
|
||||
/// Sets the IsSplitCSR flag, i.e. whether this function has a subset of CSRs
/// that is handled explicitly via copies.
void setIsSplitCSR(bool Enabled) {
  IsSplitCSR = Enabled;
}
|
||||
};
|
||||
|
||||
} // End llvm namespace
|
||||
|
|
|
@ -250,7 +250,8 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
|
|||
return CSR_64_RT_AllRegs_SaveList;
|
||||
case CallingConv::CXX_FAST_TLS:
|
||||
if (Is64Bit)
|
||||
return CSR_64_TLS_Darwin_SaveList;
|
||||
return MF->getInfo<X86MachineFunctionInfo>()->isSplitCSR() ?
|
||||
CSR_64_CXX_TLS_Darwin_PE_SaveList : CSR_64_TLS_Darwin_SaveList;
|
||||
break;
|
||||
case CallingConv::Intel_OCL_BI: {
|
||||
if (HasAVX512 && IsWin64)
|
||||
|
@ -305,6 +306,15 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
|
|||
return CSR_32_SaveList;
|
||||
}
|
||||
|
||||
/// Return the save list of callee-saved registers that are preserved through
/// explicit copies for \p MF, or nullptr when there is none. A list is only
/// returned for CXX_FAST_TLS functions whose function info has the split-CSR
/// flag set.
const MCPhysReg *X86RegisterInfo::getCalleeSavedRegsViaCopy(
    const MachineFunction *MF) const {
  assert(MF && "Invalid MachineFunction pointer.");

  // Any other calling convention has no via-copy registers.
  if (MF->getFunction()->getCallingConv() != CallingConv::CXX_FAST_TLS)
    return nullptr;

  // The function must also have been marked as using split CSR handling.
  if (!MF->getInfo<X86MachineFunctionInfo>()->isSplitCSR())
    return nullptr;

  return CSR_64_CXX_TLS_Darwin_ViaCopy_SaveList;
}
|
||||
|
||||
const uint32_t *
|
||||
X86RegisterInfo::getCallPreservedMask(const MachineFunction &MF,
|
||||
CallingConv::ID CC) const {
|
||||
|
|
|
@ -99,6 +99,8 @@ public:
|
|||
/// callee-save registers on this target.
|
||||
const MCPhysReg *
|
||||
getCalleeSavedRegs(const MachineFunction* MF) const override;
|
||||
const MCPhysReg *
|
||||
getCalleeSavedRegsViaCopy(const MachineFunction *MF) const override;
|
||||
const uint32_t *getCallPreservedMask(const MachineFunction &MF,
|
||||
CallingConv::ID) const override;
|
||||
const uint32_t *getNoPreservedMask() const override;
|
||||
|
|
|
@ -2,8 +2,8 @@
|
|||
; TLS functions were wrongly modeled and after fixing that, shrink-wrapping
|
||||
; cannot help here. To achieve the expected lowering, we need to play
|
||||
; tricks similar to AArch64 fast TLS calling convention (r255821).
|
||||
; Re-enable the following run line when
|
||||
; _RUN_: llc < %s -mtriple=x86_64-apple-darwin -enable-shrink-wrap=true | FileCheck --check-prefix=SHRINK %s
|
||||
; Applying tricks on x86-64 similar to r255821.
|
||||
; RUN: llc < %s -mtriple=x86_64-apple-darwin -enable-shrink-wrap=true | FileCheck %s
|
||||
%struct.S = type { i8 }
|
||||
|
||||
@sg = internal thread_local global %struct.S zeroinitializer, align 1
|
||||
|
@ -16,51 +16,28 @@ declare i32 @_tlv_atexit(void (i8*)*, i8*, i8*)
|
|||
|
||||
; Every GPR should be saved - except rdi, rax, and rsp
|
||||
; CHECK-LABEL: _ZTW2sg
|
||||
; CHECK: pushq %r11
|
||||
; CHECK: pushq %r10
|
||||
; CHECK: pushq %r9
|
||||
; CHECK: pushq %r8
|
||||
; CHECK: pushq %rsi
|
||||
; CHECK: pushq %rdx
|
||||
; CHECK: pushq %rcx
|
||||
; CHECK: pushq %rbx
|
||||
; CHECK-NOT: pushq %r11
|
||||
; CHECK-NOT: pushq %r10
|
||||
; CHECK-NOT: pushq %r9
|
||||
; CHECK-NOT: pushq %r8
|
||||
; CHECK-NOT: pushq %rsi
|
||||
; CHECK-NOT: pushq %rdx
|
||||
; CHECK-NOT: pushq %rcx
|
||||
; CHECK-NOT: pushq %rbx
|
||||
; CHECK: callq
|
||||
; CHECK: jne
|
||||
; CHECK: callq
|
||||
; CHECK: tlv_atexit
|
||||
; CHECK: callq
|
||||
; CHECK: popq %rbx
|
||||
; CHECK: popq %rcx
|
||||
; CHECK: popq %rdx
|
||||
; CHECK: popq %rsi
|
||||
; CHECK: popq %r8
|
||||
; CHECK: popq %r9
|
||||
; CHECK: popq %r10
|
||||
; CHECK: popq %r11
|
||||
; SHRINK-LABEL: _ZTW2sg
|
||||
; SHRINK: callq
|
||||
; SHRINK: jne
|
||||
; SHRINK: pushq %r11
|
||||
; SHRINK: pushq %r10
|
||||
; SHRINK: pushq %r9
|
||||
; SHRINK: pushq %r8
|
||||
; SHRINK: pushq %rsi
|
||||
; SHRINK: pushq %rdx
|
||||
; SHRINK: pushq %rcx
|
||||
; SHRINK: pushq %rbx
|
||||
; SHRINK: callq
|
||||
; SHRINK: tlv_atexit
|
||||
; SHRINK: popq %rbx
|
||||
; SHRINK: popq %rcx
|
||||
; SHRINK: popq %rdx
|
||||
; SHRINK: popq %rsi
|
||||
; SHRINK: popq %r8
|
||||
; SHRINK: popq %r9
|
||||
; SHRINK: popq %r10
|
||||
; SHRINK: popq %r11
|
||||
; SHRINK: LBB{{.*}}:
|
||||
; SHRINK: callq
|
||||
define cxx_fast_tlscc nonnull %struct.S* @_ZTW2sg() {
|
||||
; CHECK-NOT: popq %rbx
|
||||
; CHECK-NOT: popq %rcx
|
||||
; CHECK-NOT: popq %rdx
|
||||
; CHECK-NOT: popq %rsi
|
||||
; CHECK-NOT: popq %r8
|
||||
; CHECK-NOT: popq %r9
|
||||
; CHECK-NOT: popq %r10
|
||||
; CHECK-NOT: popq %r11
|
||||
define cxx_fast_tlscc nonnull %struct.S* @_ZTW2sg() nounwind {
|
||||
%.b.i = load i1, i1* @__tls_guard, align 1
|
||||
br i1 %.b.i, label %__tls_init.exit, label %init.i
|
||||
|
||||
|
|
Loading…
Reference in New Issue