Add branch weights to branches for static initializers.

The initializer for a static local variable cannot be hot, because it runs at
most once per program. That's not quite the same thing as having a low branch
probability, but under the assumption that the function is invoked many times,
modeling this as a branch probability seems reasonable.

For TLS variables, the situation is less clear, since the initialization side
of the branch can run multiple times in a program execution, but we still
expect initialization to be rare relative to non-initialization uses. It would
seem worthwhile to add a PGO counter along this path to make this estimation
more accurate in future.

For globals with guarded initialization, we don't yet apply any branch weights.
Due to our use of COMDATs, the guard will be reached exactly once per DSO, but
we have no idea how many DSOs will define the variable.

llvm-svn: 309195
This commit is contained in:
Richard Smith 2017-07-26 22:01:09 +00:00
parent f63d4d121b
commit ae8d62c9c5
8 changed files with 192 additions and 13 deletions

View File

@ -18,6 +18,7 @@
#include "clang/Frontend/CodeGenOptions.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/Support/Path.h"
using namespace clang;
@ -259,6 +260,43 @@ void CodeGenFunction::EmitCXXGuardedInit(const VarDecl &D,
CGM.getCXXABI().EmitGuardedInit(*this, D, DeclPtr, PerformInit);
/// Emit the conditional branch that decides whether a guarded initializer
/// runs, attaching branch-weight metadata when a reasonable estimate of the
/// initialization frequency exists.
///
/// \param NeedsInit   Branch condition; when true, control goes to
///                    \p InitBlock, otherwise to \p NoInitBlock.
/// \param InitBlock   Block that performs (or attempts) the initialization.
/// \param NoInitBlock Block reached when no initialization is required.
/// \param Kind        Kind of guard being tested.
/// \param D           The guarded variable. May be null only when \p Kind is
///                    GuardKind::TlsGuard.
void CodeGenFunction::EmitCXXGuardedInitBranch(llvm::Value *NeedsInit,
                                               llvm::BasicBlock *InitBlock,
                                               llvm::BasicBlock *NoInitBlock,
                                               GuardKind Kind,
                                               const VarDecl *D) {
  assert((Kind == GuardKind::TlsGuard || D) && "no guarded variable");

  // A guess at how many times we will enter the initialization of a
  // variable, depending on the kind of variable.
  static const uint64_t InitsPerTLSVar = 1024;
  static const uint64_t InitsPerLocalVar = 1024 * 1024;

  llvm::MDNode *Weights;
  if (Kind == GuardKind::VariableGuard && !D->isLocalVarDecl()) {
    // For non-local variables, don't apply any weighting for now. Due to our
    // use of COMDATs, we expect there to be at most one initialization of the
    // variable per DSO, but we have no way to know how many DSOs will try to
    // initialize the variable.
    Weights = nullptr;
  } else {
    uint64_t NumInits;
    // FIXME: For the TLS case, collect and use profiling information to
    // determine a more accurate branch weight.
    if (Kind == GuardKind::TlsGuard || D->getTLSKind())
      NumInits = InitsPerTLSVar;
    else
      // Must be mutually exclusive with the TLS case above; otherwise the
      // thread-local estimate would be unconditionally overwritten.
      NumInits = InitsPerLocalVar;

    // The probability of us entering the initializer is
    //   1 / (total number of times we attempt to initialize the variable).
    llvm::MDBuilder MDHelper(CGM.getLLVMContext());
    Weights = MDHelper.createBranchWeights(1, NumInits - 1);
  }

  Builder.CreateCondBr(NeedsInit, InitBlock, NoInitBlock, Weights);
}
llvm::Function *CodeGenModule::CreateGlobalInitOrDestructFunction(
llvm::FunctionType *FTy, const Twine &Name, const CGFunctionInfo &FI,
SourceLocation Loc, bool TLS) {
@ -539,7 +577,8 @@ CodeGenFunction::GenerateCXXGlobalInitFunc(llvm::Function *Fn,
llvm::BasicBlock *InitBlock = createBasicBlock("init");
ExitBlock = createBasicBlock("exit");
Builder.CreateCondBr(Uninit, InitBlock, ExitBlock);
EmitCXXGuardedInitBranch(Uninit, InitBlock, ExitBlock,
GuardKind::TlsGuard, nullptr);
// Mark as initialized before initializing anything else. If the
// initializers use previously-initialized thread_local vars, that's

View File

@ -3496,6 +3496,14 @@ public:
void EmitCXXGuardedInit(const VarDecl &D, llvm::GlobalVariable *DeclPtr,
bool PerformInit);
/// The kind of guard tested by EmitCXXGuardedInitBranch.
///  - VariableGuard: a guard protecting the initialization of one specific
///    variable (the caller supplies that variable's declaration).
///  - TlsGuard: a guard for thread-local initialization that is not tied to
///    a single variable (the caller may pass a null declaration).
enum class GuardKind { VariableGuard, TlsGuard };
/// Emit a branch to select whether or not to perform guarded initialization.
///
/// The branch may carry branch-weight metadata marking the initialization
/// path as unlikely; no weights are attached for non-local variables.
///
/// \param NeedsInit   Condition that is true when initialization must run.
/// \param InitBlock   Destination taken when \p NeedsInit is true.
/// \param NoInitBlock Destination taken when \p NeedsInit is false.
/// \param Kind        Kind of guard being tested.
/// \param D           The guarded variable; must be non-null unless \p Kind
///                    is GuardKind::TlsGuard.
void EmitCXXGuardedInitBranch(llvm::Value *NeedsInit,
llvm::BasicBlock *InitBlock,
llvm::BasicBlock *NoInitBlock,
GuardKind Kind, const VarDecl *D);
/// GenerateCXXGlobalInitFunc - Generates code for initializing global
/// variables.
void GenerateCXXGlobalInitFunc(llvm::Function *Fn,

View File

@ -2113,13 +2113,14 @@ void ItaniumCXXABI::EmitGuardedInit(CodeGenFunction &CGF,
(UseARMGuardVarABI && !useInt8GuardVariable)
? Builder.CreateAnd(LI, llvm::ConstantInt::get(CGM.Int8Ty, 1))
: LI;
llvm::Value *isInitialized = Builder.CreateIsNull(V, "guard.uninitialized");
llvm::Value *NeedsInit = Builder.CreateIsNull(V, "guard.uninitialized");
llvm::BasicBlock *InitCheckBlock = CGF.createBasicBlock("init.check");
llvm::BasicBlock *EndBlock = CGF.createBasicBlock("init.end");
// Check if the first byte of the guard variable is zero.
Builder.CreateCondBr(isInitialized, InitCheckBlock, EndBlock);
CGF.EmitCXXGuardedInitBranch(NeedsInit, InitCheckBlock, EndBlock,
CodeGenFunction::GuardKind::VariableGuard, &D);

View File

@ -2463,11 +2463,12 @@ void MicrosoftCXXABI::EmitGuardedInit(CodeGenFunction &CGF, const VarDecl &D,
// Test our bit from the guard variable.
llvm::ConstantInt *Bit = llvm::ConstantInt::get(GuardTy, 1ULL << GuardNum);
llvm::LoadInst *LI = Builder.CreateLoad(GuardAddr);
llvm::Value *IsInitialized =
Builder.CreateICmpNE(Builder.CreateAnd(LI, Bit), Zero);
llvm::Value *NeedsInit =
Builder.CreateICmpEQ(Builder.CreateAnd(LI, Bit), Zero);
llvm::BasicBlock *InitBlock = CGF.createBasicBlock("init");
llvm::BasicBlock *EndBlock = CGF.createBasicBlock("init.end");
Builder.CreateCondBr(IsInitialized, EndBlock, InitBlock);
CGF.EmitCXXGuardedInitBranch(NeedsInit, InitBlock, EndBlock,
CodeGenFunction::GuardKind::VariableGuard, &D);
// Set our bit in the guard variable and emit the initializer and add a global
// destructor if appropriate.
@ -2502,7 +2503,8 @@ void MicrosoftCXXABI::EmitGuardedInit(CodeGenFunction &CGF, const VarDecl &D,
Builder.CreateICmpSGT(FirstGuardLoad, InitThreadEpoch);
llvm::BasicBlock *AttemptInitBlock = CGF.createBasicBlock("init.attempt");
llvm::BasicBlock *EndBlock = CGF.createBasicBlock("init.end");
Builder.CreateCondBr(IsUninitialized, AttemptInitBlock, EndBlock);
CGF.EmitCXXGuardedInitBranch(IsUninitialized, AttemptInitBlock, EndBlock,
CodeGenFunction::GuardKind::VariableGuard, &D);
// This BasicBlock attempts to determine whether or not this thread is
// responsible for doing the initialization.

View File

@ -146,7 +146,7 @@ inline S &getS() {
// CHECK-LABEL: define linkonce_odr dereferenceable({{[0-9]+}}) %struct.S* @"\01?getS@@YAAAUS@@XZ"() {{.*}} comdat
// CHECK: load i32, i32* @"\01??_B?1??getS@@YAAAUS@@XZ@51"
// CHECK: and i32 {{.*}}, 1
// CHECK: icmp ne i32 {{.*}}, 0
// CHECK: icmp eq i32 {{.*}}, 0
// CHECK: br i1
// init:
// CHECK: or i32 {{.*}}, 1

View File

@ -24,8 +24,8 @@ extern inline S &f() {
static thread_local S s;
// CHECK: %[[guard:.*]] = load i32, i32* @"\01??__J?1??f@@YAAAUS@@XZ@51"
// CHECK-NEXT: %[[mask:.*]] = and i32 %[[guard]], 1
// CHECK-NEXT: %[[cmp:.*]] = icmp ne i32 %[[mask]], 0
// CHECK-NEXT: br i1 %[[cmp]], label %[[init_end:.*]], label %[[init:.*]]
// CHECK-NEXT: %[[cmp:.*]] = icmp eq i32 %[[mask]], 0
// CHECK-NEXT: br i1 %[[cmp]], label %[[init:.*]], label %[[init_end:.*]], !prof ![[unlikely_threadlocal:.*]]
// CHECK: [[init]]:
// CHECK-NEXT: %[[or:.*]] = or i32 %[[guard]], 1
@ -56,7 +56,7 @@ extern inline S &g() {
// CHECK: %[[guard:.*]] = load atomic i32, i32* @"\01?$TSS0@?1??g@@YAAAUS@@XZ@4HA" unordered, align 4
// CHECK-NEXT: %[[epoch:.*]] = load i32, i32* @_Init_thread_epoch
// CHECK-NEXT: %[[cmp:.*]] = icmp sgt i32 %[[guard]], %[[epoch]]
// CHECK-NEXT: br i1 %[[cmp]], label %[[init_attempt:.*]], label %[[init_end:.*]]
// CHECK-NEXT: br i1 %[[cmp]], label %[[init_attempt:.*]], label %[[init_end:.*]], !prof ![[unlikely_staticlocal:.*]]
// CHECK: [[init_attempt]]:
// CHECK-NEXT: call void @_Init_thread_header(i32* @"\01?$TSS0@?1??g@@YAAAUS@@XZ@4HA")
@ -95,3 +95,6 @@ int g1() {
static int i = f1();
return i;
// CHECK-DAG: ![[unlikely_threadlocal]] = !{!"branch_weights", i32 1, i32 1023}
// CHECK-DAG: ![[unlikely_staticlocal]] = !{!"branch_weights", i32 1, i32 1048575}

View File

@ -20,7 +20,7 @@ void g() {
// WEBASSEMBLY32: %[[R0:.+]] = load atomic i8, i8* bitcast (i32* @_ZGVZ1gvE1a to i8*) acquire, align 4
// WEBASSEMBLY32-NEXT: %[[R1:.+]] = and i8 %[[R0]], 1
// WEBASSEMBLY32-NEXT: %[[R2:.+]] = icmp eq i8 %[[R1]], 0
// WEBASSEMBLY32-NEXT: br i1 %[[R2]], label %[[CHECK:.+]], label %[[END:.+]]
// WEBASSEMBLY32-NEXT: br i1 %[[R2]], label %[[CHECK:.+]], label %[[END:.+]],
// WEBASSEMBLY32: call i32 @__cxa_guard_acquire
@ -30,7 +30,7 @@ void g() {
// WEBASSEMBLY64: %[[R0:.+]] = load atomic i8, i8* bitcast (i64* @_ZGVZ1gvE1a to i8*) acquire, align 8
// WEBASSEMBLY64-NEXT: %[[R1:.+]] = and i8 %[[R0]], 1
// WEBASSEMBLY64-NEXT: %[[R2:.+]] = icmp eq i8 %[[R1]], 0
// WEBASSEMBLY64-NEXT: br i1 %[[R2]], label %[[CHECK:.+]], label %[[END:.+]]
// WEBASSEMBLY64-NEXT: br i1 %[[R2]], label %[[CHECK:.+]], label %[[END:.+]],
// WEBASSEMBLY64: call i32 @__cxa_guard_acquire

View File

@ -0,0 +1,126 @@
// RUN: %clang_cc1 -emit-llvm -std=c++1z %s -o - -triple=x86_64-linux-gnu | FileCheck %s
struct S { S(); ~S(); };
// CHECK-LABEL: define {{.*}}global_var_init
// CHECK-NOT: br
// CHECK: call void @_ZN1SC1Ev({{.*}}* @global)
S global;
// CHECK-LABEL: define {{.*}}global_var_init
// FIXME: Do we really need thread-safe initialization here? We don't run
// global ctors on multiple threads. (If we were to do so, we'd need thread-safe
// init for B<int>::member and B<int>::inline_member too.)
// CHECK: load atomic i8, i8* bitcast (i64* @_ZGV13inline_global to i8*) acquire,
// CHECK: icmp eq i8 {{.*}}, 0
// CHECK: br i1
// CHECK-NOT: !prof
// CHECK: call void @_ZN1SC1Ev({{.*}}* @inline_global)
inline S inline_global;
// CHECK-LABEL: define {{.*}}global_var_init
// CHECK-NOT: br
// CHECK: call void @_ZN1SC1Ev({{.*}}* @thread_local_global)
thread_local S thread_local_global;
// CHECK-LABEL: define {{.*}}global_var_init
// CHECK: load i8, i8* bitcast (i64* @_ZGV26thread_local_inline_global to i8*)
// CHECK: icmp eq i8 {{.*}}, 0
// CHECK: br i1
// CHECK-NOT: !prof
// CHECK: call void @_ZN1SC1Ev({{.*}}* @thread_local_inline_global)
thread_local inline S thread_local_inline_global;
struct A {
static S member;
static thread_local S thread_local_member;
// CHECK-LABEL: define {{.*}}global_var_init
// CHECK: load atomic i8, i8* bitcast (i64* @_ZGVN1A13inline_memberE to i8*) acquire,
// CHECK: icmp eq i8 {{.*}}, 0
// CHECK: br i1
// CHECK-NOT: !prof
// CHECK: call void @_ZN1SC1Ev({{.*}}* @_ZN1A13inline_memberE)
static inline S inline_member;
// CHECK-LABEL: define {{.*}}global_var_init
// CHECK: load i8, i8* bitcast (i64* @_ZGVN1A26thread_local_inline_memberE to i8*)
// CHECK: icmp eq i8 {{.*}}, 0
// CHECK: br i1
// CHECK-NOT: !prof
// CHECK: call void @_ZN1SC1Ev({{.*}}* @_ZN1A26thread_local_inline_memberE)
static thread_local inline S thread_local_inline_member;
// CHECK-LABEL: define void @_Z1fv()
void f() {
// CHECK: load atomic i8, i8* bitcast (i64* @_ZGVZ1fvE12static_local to i8*) acquire,
// CHECK: icmp eq i8 {{.*}}, 0
// CHECK: br i1 {{.*}}, !prof ![[WEIGHTS_LOCAL:[0-9]*]]
static S static_local;
// CHECK: load i8, i8* @_ZGVZ1fvE19static_thread_local,
// CHECK: icmp eq i8 {{.*}}, 0
// CHECK: br i1 {{.*}}, !prof ![[WEIGHTS_THREAD_LOCAL:[0-9]*]]
static thread_local S static_thread_local;
// CHECK-LABEL: define {{.*}}global_var_init
// CHECK-NOT: br
// CHECK: call void @_ZN1SC1Ev({{.*}}* @_ZN1A6memberE)
S A::member;
// CHECK-LABEL: define {{.*}}global_var_init
// CHECK-NOT: br
// CHECK: call void @_ZN1SC1Ev({{.*}}* @_ZN1A19thread_local_memberE)
thread_local S A::thread_local_member;
template <typename T> struct B {
// CHECK-LABEL: define {{.*}}global_var_init
// CHECK: load i8, i8* bitcast (i64* @_ZGVN1BIiE6memberE to i8*)
// CHECK: icmp eq i8 {{.*}}, 0
// CHECK: br i1
// CHECK-NOT: !prof
// CHECK: call void @_ZN1SC1Ev({{.*}}* @_ZN1BIiE6memberE)
static S member;
// CHECK-LABEL: define {{.*}}global_var_init
// CHECK: load i8, i8* bitcast (i64* @_ZGVN1BIiE13inline_memberE to i8*)
// CHECK: icmp eq i8 {{.*}}, 0
// CHECK: br i1
// CHECK-NOT: !prof
// CHECK: call void @_ZN1SC1Ev({{.*}}* @_ZN1BIiE13inline_memberE)
static inline S inline_member;
// CHECK-LABEL: define {{.*}}global_var_init
// CHECK: load i8, i8* bitcast (i64* @_ZGVN1BIiE19thread_local_memberE to i8*)
// CHECK: icmp eq i8 {{.*}}, 0
// CHECK: br i1
// CHECK-NOT: !prof
// CHECK: call void @_ZN1SC1Ev({{.*}}* @_ZN1BIiE19thread_local_memberE)
static thread_local S thread_local_member;
// CHECK-LABEL: define {{.*}}global_var_init
// CHECK: load i8, i8* bitcast (i64* @_ZGVN1BIiE26thread_local_inline_memberE to i8*)
// CHECK: icmp eq i8 {{.*}}, 0
// CHECK: br i1
// CHECK-NOT: !prof
// CHECK: call void @_ZN1SC1Ev({{.*}}* @_ZN1BIiE26thread_local_inline_memberE)
static thread_local inline S thread_local_inline_member;
template<typename T> S B<T>::member;
template<typename T> thread_local S B<T>::thread_local_member;
template<typename ...T> void use(T &...);
void use_b() {
use(B<int>::member, B<int>::inline_member, B<int>::thread_local_member,
// CHECK-LABEL: define {{.*}}tls_init()
// CHECK: load i8, i8* @__tls_guard, align 1
// CHECK: icmp eq i8 {{.*}}, 0
// CHECK: br i1 {{.*}}, !prof ![[WEIGHTS_THREAD_LOCAL]]
// CHECK-DAG: ![[WEIGHTS_THREAD_LOCAL]] = !{!"branch_weights", i32 1, i32 1023}
// CHECK-DAG: ![[WEIGHTS_LOCAL]] = !{!"branch_weights", i32 1, i32 1048575}