[X86] Elide references to _chkstk for dynamic allocas

The _chkstk function is called by the compiler to probe the stack in an
order consistent with Windows' expectations.  However, it is possible to
elide the call to _chkstk and manually adjust the stack pointer if we
can prove that the allocation is fixed size and smaller than the probe
size.

This shrinks chrome.dll, chrome_child.dll and chrome.exe by a
cumulative ~133 KB.

Differential Revision: http://reviews.llvm.org/D17679

llvm-svn: 262370
This commit is contained in:
David Majnemer 2016-03-01 19:20:23 +00:00
parent 2abc587c1e
commit 791b88b6da
8 changed files with 67 additions and 40 deletions

View File

@ -16366,9 +16366,8 @@ SDValue
X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
SelectionDAG &DAG) const { SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction(); MachineFunction &MF = DAG.getMachineFunction();
const Function *F = MF.getFunction();
bool SplitStack = MF.shouldSplitStack(); bool SplitStack = MF.shouldSplitStack();
bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
SplitStack;
SDLoc dl(Op); SDLoc dl(Op);
// Get the inputs. // Get the inputs.
@ -16382,21 +16381,45 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
// pointer when other instructions are using the stack. // pointer when other instructions are using the stack.
Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, dl, true), dl); Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, dl, true), dl);
const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
bool Is64Bit = Subtarget.is64Bit(); bool Is64Bit = Subtarget.is64Bit();
MVT SPTy = getPointerTy(DAG.getDataLayout()); MVT SPTy = getPointerTy(DAG.getDataLayout());
bool CheckStack = SplitStack;
if (!CheckStack && Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) {
// The Windows ABI requires us to probe the stack for allocations beyond
// the probe size.
if (auto *SizeC = dyn_cast<ConstantSDNode>(Size)) {
// Try to elide the probe if we can prove that this dynamic allocation is
// smaller than the probe size.
unsigned StackProbeSize = 4096;
if (F->hasFnAttribute("stack-probe-size"))
F->getFnAttribute("stack-probe-size")
.getValueAsString()
.getAsInteger(0, StackProbeSize);
unsigned AlignedAlloc = SizeC->getZExtValue();
// Round the dynamic alloca's size up to its alignment.
if (Align)
AlignedAlloc = alignTo(AlignedAlloc, Align);
// If the aligned allocation is smaller than the probe size, then we don't
// need to probe the stack.
CheckStack = AlignedAlloc >= StackProbeSize;
} else {
// We cannot tell how big this dynamic alloca will be, probe the stack.
CheckStack = true;
}
}
SDValue Result; SDValue Result;
if (!Lower) { if (!CheckStack) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo(); const TargetLowering &TLI = DAG.getTargetLoweringInfo();
unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore(); unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore();
assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and" assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
" not tell us which reg is the stack pointer!"); " not tell us which reg is the stack pointer!");
EVT VT = Node->getValueType(0);
SDValue Tmp3 = Node->getOperand(2);
SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT); SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
Chain = SP.getValue(1); Chain = SP.getValue(1);
unsigned Align = cast<ConstantSDNode>(Tmp3)->getZExtValue();
const TargetFrameLowering &TFI = *Subtarget.getFrameLowering(); const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
unsigned StackAlign = TFI.getStackAlignment(); unsigned StackAlign = TFI.getStackAlignment();
Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
@ -16410,8 +16433,6 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
if (Is64Bit) { if (Is64Bit) {
// The 64 bit implementation of segmented stacks needs to clobber both r10 // The 64 bit implementation of segmented stacks needs to clobber both r10
// r11. This makes it impossible to use it along with nested parameters. // r11. This makes it impossible to use it along with nested parameters.
const Function *F = MF.getFunction();
for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end(); for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
I != E; ++I) I != E; ++I)
if (I->hasNestAttr()) if (I->hasNestAttr())
@ -16434,7 +16455,6 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag); Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag);
const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
unsigned SPReg = RegInfo->getStackRegister(); unsigned SPReg = RegInfo->getStackRegister();
SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy); SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
Chain = SP.getValue(1); Chain = SP.getValue(1);

View File

@ -38,8 +38,9 @@ ehcleanup: ; preds = %entry
; CHECK: pushl %ebp ; CHECK: pushl %ebp
; CHECK: movl %esp, %ebp ; CHECK: movl %esp, %ebp
; CHECK: subl ${{[0-9]+}}, %esp ; CHECK: subl ${{[0-9]+}}, %esp
; CHECK: movl $8, %eax ; CHECK: movl %esp, %[[tmp_sp1:.*]]
; CHECK: calll __chkstk ; CHECK: leal -8(%[[tmp_sp1]]), %[[tmp_sp2:.*]]
; CHECK: %[[tmp_sp2]], %esp
; CHECK: calll "??0A@@QAE@XZ" ; CHECK: calll "??0A@@QAE@XZ"
; CHECK: calll "??0A@@QAE@XZ" ; CHECK: calll "??0A@@QAE@XZ"
; CHECK: calll _takes_two ; CHECK: calll _takes_two

View File

@ -15,5 +15,8 @@ define void @bar() {
ret void ret void
} }
; CHECK-LABEL: _bar: ; CHECK-LABEL: _bar:
; CHECK: calll __chkstk ; CHECK: movl %esp, %ebp
; CHECK: movl %esp, %[[sp_tmp:.*]]
; CHECK: addl $-4, %[[sp_tmp]]
; CHECK: movl %[[sp_tmp]], %esp
; CHECK: retl ; CHECK: retl

View File

@ -10,13 +10,14 @@ declare void @Foo_ctor(%Foo* %this)
define void @g() { define void @g() {
entry: entry:
; CHECK: movl %esp, %ebp
%args = alloca inalloca %frame %args = alloca inalloca %frame
%c = getelementptr %frame, %frame* %args, i32 0, i32 2 %c = getelementptr %frame, %frame* %args, i32 0, i32 2
; CHECK: movl $20, %eax ; CHECK: movl %esp, %[[tmp_sp1:.*]]
; CHECK: calll __chkstk ; CHECK: leal -20(%[[tmp_sp1]]), %[[tmp_sp2:.*]]
; CHECK: movl %esp, ; CHECK: movl %[[tmp_sp2]], %esp
call void @Foo_ctor(%Foo* %c) call void @Foo_ctor(%Foo* %c)
; CHECK: leal 12(%{{.*}}), ; CHECK: leal -8(%[[tmp_sp1]]),
; CHECK-NEXT: pushl ; CHECK-NEXT: pushl
; CHECK-NEXT: calll _Foo_ctor ; CHECK-NEXT: calll _Foo_ctor
; CHECK: addl $4, %esp ; CHECK: addl $4, %esp

View File

@ -12,6 +12,7 @@ declare void @plus(%Iter* sret, %Iter*, i32)
declare void @reverse(%frame.reverse* inalloca align 4) declare void @reverse(%frame.reverse* inalloca align 4)
define i32 @main() personality i32 (...)* @pers { define i32 @main() personality i32 (...)* @pers {
; CHECK: movl %esp, %ebp
%temp.lvalue = alloca %Iter %temp.lvalue = alloca %Iter
br label %blah br label %blah
@ -21,9 +22,10 @@ blah:
%beg = getelementptr %frame.reverse, %frame.reverse* %rev_args, i32 0, i32 0 %beg = getelementptr %frame.reverse, %frame.reverse* %rev_args, i32 0, i32 0
%end = getelementptr %frame.reverse, %frame.reverse* %rev_args, i32 0, i32 1 %end = getelementptr %frame.reverse, %frame.reverse* %rev_args, i32 0, i32 1
; CHECK: calll __chkstk ; CHECK: movl %esp, %[[end:.*]]
; CHECK: movl %esp, %[[beg:[^ ]*]] ; CHECK: leal -24(%[[end]]), %[[beg:.*]]
; CHECK: leal 12(%[[beg]]), %[[end:[^ ]*]] ; CHECK: movl %[[beg]], %esp
; CHECK: addl $-12, %[[end]]
call void @begin(%Iter* sret %temp.lvalue) call void @begin(%Iter* sret %temp.lvalue)
; CHECK: calll _begin ; CHECK: calll _begin

View File

@ -7,16 +7,16 @@ declare x86_stdcallcc void @i(i32 %a)
define void @g() { define void @g() {
; CHECK-LABEL: _g: ; CHECK-LABEL: _g:
; CHECK: movl %esp, %ebp
%b = alloca inalloca %Foo %b = alloca inalloca %Foo
; CHECK: movl $8, %eax ; CHECK: movl %esp, %[[tmp_sp:.*]]
; CHECK: calll __chkstk ; CHECK: leal -8(%[[tmp_sp]]), %esp
%f1 = getelementptr %Foo, %Foo* %b, i32 0, i32 0 %f1 = getelementptr %Foo, %Foo* %b, i32 0, i32 0
%f2 = getelementptr %Foo, %Foo* %b, i32 0, i32 1 %f2 = getelementptr %Foo, %Foo* %b, i32 0, i32 1
store i32 13, i32* %f1 store i32 13, i32* %f1
store i32 42, i32* %f2 store i32 42, i32* %f2
; CHECK: movl %esp, %eax ; CHECK: movl $13, -8(%[[tmp_sp]])
; CHECK: movl $13, (%eax) ; CHECK: movl $42, -4(%[[tmp_sp]])
; CHECK: movl $42, 4(%eax)
call x86_stdcallcc void @f(%Foo* inalloca %b) call x86_stdcallcc void @f(%Foo* inalloca %b)
; CHECK: calll _f@8 ; CHECK: calll _f@8
; CHECK-NOT: %esp ; CHECK-NOT: %esp

View File

@ -7,16 +7,16 @@ declare void @f(%Foo* inalloca %b)
define void @a() { define void @a() {
; CHECK-LABEL: _a: ; CHECK-LABEL: _a:
entry: entry:
; CHECK: movl %esp, %ebp
%b = alloca inalloca %Foo %b = alloca inalloca %Foo
; CHECK: movl $8, %eax ; CHECK: movl %esp, %[[tmp_sp:.*]]
; CHECK: calll __chkstk ; CHECK: leal -8(%[[tmp_sp]]), %esp
%f1 = getelementptr %Foo, %Foo* %b, i32 0, i32 0 %f1 = getelementptr %Foo, %Foo* %b, i32 0, i32 0
%f2 = getelementptr %Foo, %Foo* %b, i32 0, i32 1 %f2 = getelementptr %Foo, %Foo* %b, i32 0, i32 1
store i32 13, i32* %f1 store i32 13, i32* %f1
store i32 42, i32* %f2 store i32 42, i32* %f2
; CHECK: movl %esp, %eax ; CHECK: movl $13, -8(%[[tmp_sp]])
; CHECK: movl $13, (%eax) ; CHECK: movl $42, -4(%[[tmp_sp]])
; CHECK: movl $42, 4(%eax)
call void @f(%Foo* inalloca %b) call void @f(%Foo* inalloca %b)
; CHECK: calll _f ; CHECK: calll _f
ret void ret void
@ -27,16 +27,16 @@ declare void @inreg_with_inalloca(i32 inreg %a, %Foo* inalloca %b)
define void @b() { define void @b() {
; CHECK-LABEL: _b: ; CHECK-LABEL: _b:
entry: entry:
; CHECK: movl %esp, %ebp
%b = alloca inalloca %Foo %b = alloca inalloca %Foo
; CHECK: movl $8, %eax ; CHECK: movl %esp, %[[tmp_sp:.*]]
; CHECK: calll __chkstk ; CHECK: leal -8(%[[tmp_sp]]), %esp
%f1 = getelementptr %Foo, %Foo* %b, i32 0, i32 0 %f1 = getelementptr %Foo, %Foo* %b, i32 0, i32 0
%f2 = getelementptr %Foo, %Foo* %b, i32 0, i32 1 %f2 = getelementptr %Foo, %Foo* %b, i32 0, i32 1
store i32 13, i32* %f1 store i32 13, i32* %f1
store i32 42, i32* %f2 store i32 42, i32* %f2
; CHECK: movl %esp, %eax ; CHECK: movl $13, -8(%[[tmp_sp]])
; CHECK: movl $13, (%eax) ; CHECK: movl $42, -4(%[[tmp_sp]])
; CHECK: movl $42, 4(%eax)
call void @inreg_with_inalloca(i32 inreg 1, %Foo* inalloca %b) call void @inreg_with_inalloca(i32 inreg 1, %Foo* inalloca %b)
; CHECK: movl $1, %eax ; CHECK: movl $1, %eax
; CHECK: calll _inreg_with_inalloca ; CHECK: calll _inreg_with_inalloca
@ -48,16 +48,16 @@ declare x86_thiscallcc void @thiscall_with_inalloca(i8* %a, %Foo* inalloca %b)
define void @c() { define void @c() {
; CHECK-LABEL: _c: ; CHECK-LABEL: _c:
entry: entry:
; CHECK: movl %esp, %ebp
%b = alloca inalloca %Foo %b = alloca inalloca %Foo
; CHECK: movl $8, %eax ; CHECK: movl %esp, %[[tmp_sp:.*]]
; CHECK: calll __chkstk ; CHECK: leal -8(%[[tmp_sp]]), %esp
%f1 = getelementptr %Foo, %Foo* %b, i32 0, i32 0 %f1 = getelementptr %Foo, %Foo* %b, i32 0, i32 0
%f2 = getelementptr %Foo, %Foo* %b, i32 0, i32 1 %f2 = getelementptr %Foo, %Foo* %b, i32 0, i32 1
store i32 13, i32* %f1 store i32 13, i32* %f1
store i32 42, i32* %f2 store i32 42, i32* %f2
; CHECK: movl %esp, %eax ; CHECK-DAG: movl $13, -8(%[[tmp_sp]])
; CHECK-DAG: movl $13, (%eax) ; CHECK-DAG: movl $42, -4(%[[tmp_sp]])
; CHECK-DAG: movl $42, 4(%eax)
call x86_thiscallcc void @thiscall_with_inalloca(i8* null, %Foo* inalloca %b) call x86_thiscallcc void @thiscall_with_inalloca(i8* null, %Foo* inalloca %b)
; CHECK-DAG: xorl %ecx, %ecx ; CHECK-DAG: xorl %ecx, %ecx
; CHECK: calll _thiscall_with_inalloca ; CHECK: calll _thiscall_with_inalloca

View File

@ -9,7 +9,7 @@ target triple = "i686-pc-windows-msvc18.0.0"
%struct.S = type { [12 x i8] } %struct.S = type { [12 x i8] }
define x86_thiscallcc void @call_inalloca(i1 %x) { define x86_thiscallcc void @call_inalloca(i1 %x) "stack-probe-size"="12" {
entry: entry:
%argmem = alloca inalloca <{ %struct.S }>, align 4 %argmem = alloca inalloca <{ %struct.S }>, align 4
%argidx1 = getelementptr inbounds <{ %struct.S }>, <{ %struct.S }>* %argmem, i32 0, i32 0, i32 0, i32 0 %argidx1 = getelementptr inbounds <{ %struct.S }>, <{ %struct.S }>* %argmem, i32 0, i32 0, i32 0, i32 0