[Stackmap] Emit multi-byte nops for X86.
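
The X86 asm printer previously padded the shadow of a stackmap or patchpoint
with one-byte nops. Emit the optimal multi-byte nop sequences instead, up to
15 bytes per instruction (a 10-byte nopw widened by up to five 0x66 prefixes),
and compute the exact length of the patchpoint call sequence: 13 bytes instead
of 12 when the scratch register is an x86-64 extended register.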

llvm-svn: 196334
author Juergen Ributzka 2013-12-04 00:39:08 +00:00
parent 72bfbd8615
commit 17e0d9ee6c
4 changed files with 350 additions and 29 deletions


@@ -674,27 +674,76 @@ static void LowerTlsAddr(MCStreamer &OutStreamer,
                        .addExpr(tlsRef));
 }
 
+/// \brief Emit the optimal amount of multi-byte nops on X86.
+static void EmitNops(MCStreamer &OS, unsigned NumBytes, bool Is64Bit) {
+  // This works only for 64-bit. For 32-bit we would have to check whether
+  // the CPU supports multi-byte nops.
+  assert(Is64Bit && "EmitNops only supports X86-64");
+  while (NumBytes) {
+    unsigned Opc, BaseReg, ScaleVal, IndexReg, Displacement, SegmentReg;
+    Opc = IndexReg = Displacement = SegmentReg = 0;
+    BaseReg = X86::RAX; ScaleVal = 1;
+    // Pick the largest nop that fits: a 1-byte nop, a 2-byte xchg, or a
+    // 3- to 10-byte nopl/nopw with an increasingly wide addressing mode.
+    switch (NumBytes) {
+    case 0: llvm_unreachable("Zero nops?"); break;
+    case 1: NumBytes -= 1; Opc = X86::NOOP; break;
+    case 2: NumBytes -= 2; Opc = X86::XCHG16ar; break;
+    case 3: NumBytes -= 3; Opc = X86::NOOPL; break;
+    case 4: NumBytes -= 4; Opc = X86::NOOPL; Displacement = 8; break;
+    case 5: NumBytes -= 5; Opc = X86::NOOPL; Displacement = 8;
+            IndexReg = X86::RAX; break;
+    case 6: NumBytes -= 6; Opc = X86::NOOPW; Displacement = 8;
+            IndexReg = X86::RAX; break;
+    case 7: NumBytes -= 7; Opc = X86::NOOPL; Displacement = 512; break;
+    case 8: NumBytes -= 8; Opc = X86::NOOPL; Displacement = 512;
+            IndexReg = X86::RAX; break;
+    case 9: NumBytes -= 9; Opc = X86::NOOPW; Displacement = 512;
+            IndexReg = X86::RAX; break;
+    default: NumBytes -= 10; Opc = X86::NOOPW; Displacement = 512;
+             IndexReg = X86::RAX; SegmentReg = X86::CS; break;
+    }
+
+    // Widen the nop with up to five 0x66 operand-size prefixes, for a
+    // maximum of 15 bytes per instruction.
+    unsigned NumPrefixes = std::min(NumBytes, 5U);
+    NumBytes -= NumPrefixes;
+    for (unsigned i = 0; i != NumPrefixes; ++i)
+      OS.EmitBytes("\x66");
+
+    switch (Opc) {
+    default: llvm_unreachable("Unexpected opcode"); break;
+    case X86::NOOP:
+      OS.EmitInstruction(MCInstBuilder(Opc));
+      break;
+    case X86::XCHG16ar:
+      OS.EmitInstruction(MCInstBuilder(Opc).addReg(X86::AX));
+      break;
+    case X86::NOOPL:
+    case X86::NOOPW:
+      OS.EmitInstruction(MCInstBuilder(Opc).addReg(BaseReg).addImm(ScaleVal)
+                                           .addReg(IndexReg)
+                                           .addImm(Displacement)
+                                           .addReg(SegmentReg));
+      break;
+    }
+  } // while (NumBytes)
+}
+
 // Lower a stackmap of the form:
 // <id>, <shadowBytes>, ...
-static void LowerSTACKMAP(MCStreamer &OutStreamer,
-                          StackMaps &SM,
-                          const MachineInstr &MI)
-{
-  unsigned NumNOPBytes = MI.getOperand(1).getImm();
+static void LowerSTACKMAP(MCStreamer &OS, StackMaps &SM,
+                          const MachineInstr &MI, bool Is64Bit) {
+  unsigned NumBytes = MI.getOperand(1).getImm();
   SM.recordStackMap(MI);
 
   // Emit padding.
   // FIXME: These nops ensure that the stackmap's shadow is covered by
   // instructions from the same basic block, but the nops should not be
   // necessary if instructions from the same block follow the stackmap.
-  for (unsigned i = 0; i < NumNOPBytes; ++i)
-    OutStreamer.EmitInstruction(MCInstBuilder(X86::NOOP));
+  EmitNops(OS, NumBytes, Is64Bit);
 }
 
 // Lower a patchpoint of the form:
 // [<def>], <id>, <numBytes>, <target>, <numArgs>, <cc>, ...
-static void LowerPATCHPOINT(MCStreamer &OutStreamer,
-                            StackMaps &SM,
-                            const MachineInstr &MI) {
+static void LowerPATCHPOINT(MCStreamer &OS, StackMaps &SM,
+                            const MachineInstr &MI, bool Is64Bit) {
+  assert(Is64Bit && "Patchpoint currently only supports X86-64");
   SM.recordPatchPoint(MI);
 
   PatchPointOpers opers(&MI);
@@ -704,22 +753,21 @@ static void LowerPATCHPOINT(MCStreamer &OutStreamer,
   if (CallTarget) {
     // Emit MOV to materialize the target address and the CALL to target.
     // This is encoded with 12-13 bytes, depending on which register is used.
-    // We conservatively assume that it is 12 bytes and emit in worst case one
-    // extra NOP byte.
-    EncodedBytes = 12;
-    OutStreamer.EmitInstruction(MCInstBuilder(X86::MOV64ri)
-                                .addReg(MI.getOperand(ScratchIdx).getReg())
-                                .addImm(CallTarget));
-    OutStreamer.EmitInstruction(MCInstBuilder(X86::CALL64r)
-                                .addReg(MI.getOperand(ScratchIdx).getReg()));
+    unsigned ScratchReg = MI.getOperand(ScratchIdx).getReg();
+    if (X86II::isX86_64ExtendedReg(ScratchReg))
+      EncodedBytes = 13;
+    else
+      EncodedBytes = 12;
+    OS.EmitInstruction(MCInstBuilder(X86::MOV64ri).addReg(ScratchReg)
+                                                  .addImm(CallTarget));
+    OS.EmitInstruction(MCInstBuilder(X86::CALL64r).addReg(ScratchReg));
   }
 
   // Emit padding.
   unsigned NumBytes = opers.getMetaOper(PatchPointOpers::NBytesPos).getImm();
   assert(NumBytes >= EncodedBytes &&
          "Patchpoint can't request size less than the length of a call.");
-  for (unsigned i = EncodedBytes; i < NumBytes; ++i)
-    OutStreamer.EmitInstruction(MCInstBuilder(X86::NOOP));
+  EmitNops(OS, NumBytes - EncodedBytes, Is64Bit);
 }
 
 void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
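
(For reference, the two call-sequence lengths distinguished above: a movabsq
into a legacy register such as %rax encodes as 48 B8 <imm64> and callq *%rax
as FF D0, 10 + 2 = 12 bytes, while an extended register such as %r11 needs a
REX.B prefix on both instructions, 49 BB <imm64> and 41 FF D3, 10 + 3 = 13
bytes. The trivial_patchpoint_codegen test below exercises the 13-byte %r11
form, leaving a 2-byte xchgw as padding.)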
@@ -813,10 +861,10 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
   }
 
   case TargetOpcode::STACKMAP:
-    return LowerSTACKMAP(OutStreamer, SM, *MI);
+    return LowerSTACKMAP(OutStreamer, SM, *MI, Subtarget->is64Bit());
 
   case TargetOpcode::PATCHPOINT:
-    return LowerPATCHPOINT(OutStreamer, SM, *MI);
+    return LowerPATCHPOINT(OutStreamer, SM, *MI, Subtarget->is64Bit());
 
   case X86::MORESTACK_RET:
     OutStreamer.EmitInstruction(MCInstBuilder(X86::RET));
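
The greedy split performed by EmitNops is easy to check outside the compiler.
Below is a minimal standalone sketch; the helper name NopChunks and the driver
are invented for illustration, and only the splitting arithmetic mirrors
EmitNops: take a base nop of min(bytes, 10) bytes, widen it with up to five
0x66 prefixes, repeat. Every emitted chunk except possibly the last is
therefore the full 15 bytes.

#include <algorithm>
#include <cstdio>
#include <vector>

// Split a padding request the way EmitNops does: each chunk is one base nop
// of min(bytes, 10) bytes, widened by up to five 0x66 prefixes (15 max).
static std::vector<unsigned> NopChunks(unsigned NumBytes) {
  std::vector<unsigned> Chunks;
  while (NumBytes) {
    unsigned BaseLen = std::min(NumBytes, 10u);
    NumBytes -= BaseLen;
    unsigned NumPrefixes = std::min(NumBytes, 5u);
    NumBytes -= NumPrefixes;
    Chunks.push_back(BaseLen + NumPrefixes);
  }
  return Chunks;
}

int main() {
  // A 17-byte shadow becomes one 15-byte nop followed by a 2-byte
  // xchgw %ax, %ax, matching the "; 17" case in stackmap-nops.ll below.
  for (unsigned Len : NopChunks(17))
    std::printf("%u ", Len); // prints "15 2 "
  std::printf("\n");
  return 0;
}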


@@ -7,10 +7,10 @@ entry:
 ; CHECK-LABEL: trivial_patchpoint_codegen:
 ; CHECK: movabsq $-559038736, %r11
 ; CHECK-NEXT: callq *%r11
-; CHECK-NEXT: nop
+; CHECK-NEXT: xchgw %ax, %ax
 ; CHECK: movq %rax, %[[REG:r.+]]
 ; CHECK: callq *%r11
-; CHECK-NEXT: nop
+; CHECK-NEXT: xchgw %ax, %ax
 ; CHECK: movq %[[REG]], %rax
 ; CHECK: ret
 %resolveCall2 = inttoptr i64 -559038736 to i8*
@@ -84,11 +84,7 @@ define void @small_patchpoint_codegen(i64 %p1, i64 %p2, i64 %p3, i64 %p4) {
 entry:
 ; CHECK-LABEL: small_patchpoint_codegen:
 ; CHECK: Ltmp
-; CHECK: nop
-; CHECK-NEXT: nop
-; CHECK-NEXT: nop
-; CHECK-NEXT: nop
-; CHECK-NEXT: nop
+; CHECK: nopl 8(%rax,%rax)
 ; CHECK-NEXT: popq
 ; CHECK-NEXT: ret
 %result = tail call i64 (i32, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i32 5, i32 5, i8* null, i32 2, i64 %p1, i64 %p2)


@@ -0,0 +1,230 @@
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 -disable-fp-elim | FileCheck %s
define void @nop_test() {
entry:
; CHECK-LABEL: nop_test:
; CHECK: nop
; CHECK: xchgw %ax, %ax
; CHECK: nopl (%rax)
; CHECK: nopl 8(%rax)
; CHECK: nopl 8(%rax,%rax)
; CHECK: nopw 8(%rax,%rax)
; CHECK: nopl 512(%rax)
; CHECK: nopl 512(%rax,%rax)
; CHECK: nopw 512(%rax,%rax)
; CHECK: nopw %cs:512(%rax,%rax)
; 11
; CHECK: .byte 102
; CHECK-NEXT: nopw %cs:512(%rax,%rax)
; 12
; CHECK: .byte 102
; CHECK-NEXT: .byte 102
; CHECK-NEXT: nopw %cs:512(%rax,%rax)
; 13
; CHECK: .byte 102
; CHECK-NEXT: .byte 102
; CHECK-NEXT: .byte 102
; CHECK-NEXT: nopw %cs:512(%rax,%rax)
; 14
; CHECK: .byte 102
; CHECK-NEXT: .byte 102
; CHECK-NEXT: .byte 102
; CHECK-NEXT: .byte 102
; CHECK-NEXT: nopw %cs:512(%rax,%rax)
; 15
; CHECK: .byte 102
; CHECK-NEXT: .byte 102
; CHECK-NEXT: .byte 102
; CHECK-NEXT: .byte 102
; CHECK-NEXT: .byte 102
; CHECK-NEXT: nopw %cs:512(%rax,%rax)
; 16
; CHECK: .byte 102
; CHECK-NEXT: .byte 102
; CHECK-NEXT: .byte 102
; CHECK-NEXT: .byte 102
; CHECK-NEXT: .byte 102
; CHECK-NEXT: nopw %cs:512(%rax,%rax)
; CHECK-NEXT: nop
; 17
; CHECK: .byte 102
; CHECK-NEXT: .byte 102
; CHECK-NEXT: .byte 102
; CHECK-NEXT: .byte 102
; CHECK-NEXT: .byte 102
; CHECK-NEXT: nopw %cs:512(%rax,%rax)
; CHECK-NEXT: xchgw %ax, %ax
; 18
; CHECK: .byte 102
; CHECK-NEXT: .byte 102
; CHECK-NEXT: .byte 102
; CHECK-NEXT: .byte 102
; CHECK-NEXT: .byte 102
; CHECK-NEXT: nopw %cs:512(%rax,%rax)
; CHECK-NEXT: nopl (%rax)
; 19
; CHECK: .byte 102
; CHECK-NEXT: .byte 102
; CHECK-NEXT: .byte 102
; CHECK-NEXT: .byte 102
; CHECK-NEXT: .byte 102
; CHECK-NEXT: nopw %cs:512(%rax,%rax)
; CHECK-NEXT: nopl 8(%rax)
; 20
; CHECK: .byte 102
; CHECK-NEXT: .byte 102
; CHECK-NEXT: .byte 102
; CHECK-NEXT: .byte 102
; CHECK-NEXT: .byte 102
; CHECK-NEXT: nopw %cs:512(%rax,%rax)
; CHECK-NEXT: nopl 8(%rax,%rax)
; 21
; CHECK: .byte 102
; CHECK-NEXT: .byte 102
; CHECK-NEXT: .byte 102
; CHECK-NEXT: .byte 102
; CHECK-NEXT: .byte 102
; CHECK-NEXT: nopw %cs:512(%rax,%rax)
; CHECK-NEXT: nopw 8(%rax,%rax)
; 22
; CHECK: .byte 102
; CHECK-NEXT: .byte 102
; CHECK-NEXT: .byte 102
; CHECK-NEXT: .byte 102
; CHECK-NEXT: .byte 102
; CHECK-NEXT: nopw %cs:512(%rax,%rax)
; CHECK-NEXT: nopl 512(%rax)
; 23
; CHECK: .byte 102
; CHECK-NEXT: .byte 102
; CHECK-NEXT: .byte 102
; CHECK-NEXT: .byte 102
; CHECK-NEXT: .byte 102
; CHECK-NEXT: nopw %cs:512(%rax,%rax)
; CHECK-NEXT: nopl 512(%rax,%rax)
; 24
; CHECK: .byte 102
; CHECK-NEXT: .byte 102
; CHECK-NEXT: .byte 102
; CHECK-NEXT: .byte 102
; CHECK-NEXT: .byte 102
; CHECK-NEXT: nopw %cs:512(%rax,%rax)
; CHECK-NEXT: nopw 512(%rax,%rax)
; 25
; CHECK: .byte 102
; CHECK-NEXT: .byte 102
; CHECK-NEXT: .byte 102
; CHECK-NEXT: .byte 102
; CHECK-NEXT: .byte 102
; CHECK-NEXT: nopw %cs:512(%rax,%rax)
; CHECK-NEXT: nopw %cs:512(%rax,%rax)
; 26
; CHECK: .byte 102
; CHECK-NEXT: .byte 102
; CHECK-NEXT: .byte 102
; CHECK-NEXT: .byte 102
; CHECK-NEXT: .byte 102
; CHECK-NEXT: nopw %cs:512(%rax,%rax)
; CHECK-NEXT: .byte 102
; CHECK-NEXT: nopw %cs:512(%rax,%rax)
; 27
; CHECK: .byte 102
; CHECK-NEXT: .byte 102
; CHECK-NEXT: .byte 102
; CHECK-NEXT: .byte 102
; CHECK-NEXT: .byte 102
; CHECK-NEXT: nopw %cs:512(%rax,%rax)
; CHECK-NEXT: .byte 102
; CHECK-NEXT: .byte 102
; CHECK-NEXT: nopw %cs:512(%rax,%rax)
; 28
; CHECK: .byte 102
; CHECK-NEXT: .byte 102
; CHECK-NEXT: .byte 102
; CHECK-NEXT: .byte 102
; CHECK-NEXT: .byte 102
; CHECK-NEXT: nopw %cs:512(%rax,%rax)
; CHECK-NEXT: .byte 102
; CHECK-NEXT: .byte 102
; CHECK-NEXT: .byte 102
; CHECK-NEXT: nopw %cs:512(%rax,%rax)
; 29
; CHECK: .byte 102
; CHECK-NEXT: .byte 102
; CHECK-NEXT: .byte 102
; CHECK-NEXT: .byte 102
; CHECK-NEXT: .byte 102
; CHECK-NEXT: nopw %cs:512(%rax,%rax)
; CHECK-NEXT: .byte 102
; CHECK-NEXT: .byte 102
; CHECK-NEXT: .byte 102
; CHECK-NEXT: .byte 102
; CHECK-NEXT: nopw %cs:512(%rax,%rax)
; 30
; CHECK: .byte 102
; CHECK-NEXT: .byte 102
; CHECK-NEXT: .byte 102
; CHECK-NEXT: .byte 102
; CHECK-NEXT: .byte 102
; CHECK-NEXT: nopw %cs:512(%rax,%rax)
; CHECK-NEXT: .byte 102
; CHECK-NEXT: .byte 102
; CHECK-NEXT: .byte 102
; CHECK-NEXT: .byte 102
; CHECK-NEXT: .byte 102
; CHECK-NEXT: nopw %cs:512(%rax,%rax)
tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 0, i32 0)
tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 1, i32 1)
tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 2, i32 2)
tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 3, i32 3)
tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 4, i32 4)
tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 5, i32 5)
tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 6, i32 6)
tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 7, i32 7)
tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 8, i32 8)
tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 9, i32 9)
tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 10, i32 10)
tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 11, i32 11)
tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 12, i32 12)
tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 13, i32 13)
tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 14, i32 14)
tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 15, i32 15)
tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 16, i32 16)
tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 17, i32 17)
tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 18, i32 18)
tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 19, i32 19)
tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 20, i32 20)
tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 21, i32 21)
tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 22, i32 22)
tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 23, i32 23)
tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 24, i32 24)
tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 25, i32 25)
tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 26, i32 26)
tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 27, i32 27)
tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 28, i32 28)
tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 29, i32 29)
tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 30, i32 30)
ret void
}
declare void @llvm.experimental.stackmap(i32, i32, ...)


@@ -0,0 +1,47 @@
; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=corei7 -disable-fp-elim -filetype=obj %s -o - | llvm-objdump -d - | FileCheck %s
; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=corei7 -disable-fp-elim -filetype=asm %s -o - | llvm-mc -filetype=obj - | llvm-objdump -d - | FileCheck %s
define void @nop_test() {
entry:
; CHECK: 0: 55
; CHECK: 1: 48 89 e5
; CHECK: 4: 90
; CHECK: 5: 66 90
; CHECK: 7: 0f 1f 00
; CHECK: a: 0f 1f 40 08
; CHECK: e: 0f 1f 44 00 08
; CHECK: 13: 66 0f 1f 44 00 08
; CHECK: 19: 0f 1f 80 00 02 00 00
; CHECK: 20: 0f 1f 84 00 00 02 00 00
; CHECK: 28: 66 0f 1f 84 00 00 02 00 00
; CHECK: 31: 2e 66 0f 1f 84 00 00 02 00 00
; CHECK: 3b: 66 2e 66 0f 1f 84 00 00 02 00 00
; CHECK: 46: 66 66 2e 66 0f 1f 84 00 00 02 00 00
; CHECK: 52: 66 66 66 2e 66 0f 1f 84 00 00 02 00 00
; CHECK: 5f: 66 66 66 66 2e 66 0f 1f 84 00 00 02 00 00
; CHECK: 6d: 66 66 66 66 66 2e 66 0f 1f 84 00 00 02 00 00
; CHECK: 7c: 5d
; CHECK: 7d: c3
tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 0, i32 0)
tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 1, i32 1)
tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 2, i32 2)
tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 3, i32 3)
tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 4, i32 4)
tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 5, i32 5)
tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 6, i32 6)
tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 7, i32 7)
tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 8, i32 8)
tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 9, i32 9)
tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 10, i32 10)
tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 11, i32 11)
tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 12, i32 12)
tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 13, i32 13)
tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 14, i32 14)
tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 15, i32 15)
ret void
}
declare void @llvm.experimental.stackmap(i32, i32, ...)