tsan: optimize memory access functions

The optimization is two-fold:
First, the memory access functions now use SSE instructions to
process all 4 shadow slots at once, which speeds up shadow
processing.
Second, if the shadow already contains the same access, we do not
store the event into the trace. This increases the effective
trace size, that is, tsan can remember up to 10x more
previous memory accesses.
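
As an illustration of the second point, here is a self-contained toy
(not tsan code; the names and the counting scheme are made up) showing
why the filter stretches the trace: repeated accesses to the same
location from one thread collapse into a single trace event.

#include <cstdint>
#include <cstdio>
#include <set>
#include <vector>

int main() {
  // A loop that touches the same 8 eight-byte locations 1000 times each.
  std::vector<uint64_t> accesses;
  for (int iter = 0; iter < 1000; iter++)
    for (uint64_t addr = 0x1000; addr < 0x1040; addr += 8)
      accesses.push_back(addr);

  // Old behavior: every access appends an event to the trace.
  size_t traced_before = accesses.size();

  // New behavior (simplified): an access whose shadow cell already records
  // an equivalent access is not traced. The set stands in for that shadow
  // check (the real check also looks at thread id, sync epoch and access kind).
  std::set<uint64_t> seen;
  size_t traced_after = 0;
  for (uint64_t a : accesses)
    if (seen.insert(a).second)
      traced_after++;

  printf("trace events: %zu -> %zu (%.1f%% filtered)\n",
         traced_before, traced_after,
         100.0 * (traced_before - traced_after) / traced_before);
}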

Performance impact:
Before:
[       OK ] DISABLED_BENCH.Mop8Read (2461 ms)
[       OK ] DISABLED_BENCH.Mop8Write (1836 ms)
After:
[       OK ] DISABLED_BENCH.Mop8Read (1204 ms)
[       OK ] DISABLED_BENCH.Mop8Write (976 ms)
These benchmarks measure only the fast path.
On large real applications the speedup is ~20%.

Trace size impact:
On app1:
Memory accesses                   :       1163265870
  Including same                  :        791312905 (68%)
On app2:
Memory accesses                   :        166875345
  Including same                  :        150449689 (90%)
Filtering out 90% of events means that only ~10% are stored, so the trace is effectively 10x larger.
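Back-of-the-envelope (ignoring non-access events in the trace): a trace that
holds N events with a duplicate-filter rate f covers about N / (1 - f)
accesses, so f = 0.90 gives N / 0.1 = 10*N (and f = 0.68 gives roughly 3x).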

llvm-svn: 209897
Dmitry Vyukov 2014-05-30 13:36:29 +00:00
parent a2332425c4
commit afdcc96d9f
10 changed files with 202 additions and 83 deletions

View File

@@ -268,9 +268,14 @@ void RunMultipleEpochsTest() {
}
EXPECT_EQ(d.testOnlyGetEpoch(), 4 * d.size());
#if TSAN_DEBUG == 0
// EXPECT_DEATH clones a thread with 4K stack,
// which is overflowed by tsan memory access functions in debug mode.
// Can not handle the locks from the previous epoch.
// The caller should update the lock id.
EXPECT_DEATH(d.onLock(&dtls, l0), "CHECK failed.*current_epoch_");
#endif
}
TEST(DeadlockDetector, MultipleEpochsTest) {

View File

@@ -8,11 +8,11 @@ PrintRes() {
PrintRes
mops="write1 \
wmops="write1 \
write2 \
write4 \
write8 \
read1 \
write8"
rmops="read1 \
read2 \
read4 \
read8"
@@ -27,10 +27,16 @@ check() {
fi
}
for f in $mops; do
check $f rsp 1 # To read caller pc.
check $f push 0
check $f pop 0
for f in $wmops; do
check $f rsp 3
check $f push 1
check $f pop 5
done
for f in $rmops; do
check $f rsp 3
check $f push 1
check $f pop 4
done
for f in $func; do

View File

@@ -1,4 +1,4 @@
CXXFLAGS = -std=c++11 -fPIE -g -Wall -Werror -fno-builtin -DTSAN_DEBUG=$(DEBUG) -DSANITIZER_DEBUG=$(DEBUG)
CXXFLAGS = -std=c++11 -fPIE -g -Wall -Werror -fno-builtin -msse3 -DTSAN_DEBUG=$(DEBUG) -DSANITIZER_DEBUG=$(DEBUG)
CLANG=clang
ifeq ($(DEBUG), 0)
CXXFLAGS += -O3

View File

@@ -54,6 +54,7 @@ const uptr kShadowCnt = TSAN_SHADOW_COUNT;
# endif
#else
// Count of shadow values in a shadow cell.
#define TSAN_SHADOW_COUNT 4
const uptr kShadowCnt = 4;
#endif

View File

@@ -25,6 +25,16 @@
#include "tsan_suppressions.h"
#include "tsan_symbolize.h"
#ifdef __SSE3__
// <emmintrin.h> transitively includes <stdlib.h>,
// and it's prohibited to include std headers into tsan runtime.
// So we do this dirty trick.
#define _MM_MALLOC_H_INCLUDED
#define __MM_MALLOC_H
#include <emmintrin.h>
typedef __m128i m128;
#endif
volatile int __tsan_resumed = 0;
extern "C" void __tsan_resume() {
@@ -471,7 +481,8 @@ void StoreIfNotYetStored(u64 *sp, u64 *s) {
*s = 0;
}
static inline void HandleRace(ThreadState *thr, u64 *shadow_mem,
ALWAYS_INLINE
void HandleRace(ThreadState *thr, u64 *shadow_mem,
Shadow cur, Shadow old) {
thr->racy_state[0] = cur.raw();
thr->racy_state[1] = old.raw();
@@ -483,16 +494,12 @@ static inline void HandleRace(ThreadState *thr, u64 *shadow_mem,
#endif
}
static inline bool OldIsInSameSynchEpoch(Shadow old, ThreadState *thr) {
return old.epoch() >= thr->fast_synch_epoch;
}
static inline bool HappensBefore(Shadow old, ThreadState *thr) {
return thr->clock.get(old.TidWithIgnore()) >= old.epoch();
}
ALWAYS_INLINE USED
void MemoryAccessImpl(ThreadState *thr, uptr addr,
ALWAYS_INLINE
void MemoryAccessImpl1(ThreadState *thr, uptr addr,
int kAccessSizeLog, bool kAccessIsWrite, bool kIsAtomic,
u64 *shadow_mem, Shadow cur) {
StatInc(thr, StatMop);
@@ -586,6 +593,90 @@ void UnalignedMemoryAccess(ThreadState *thr, uptr pc, uptr addr,
}
}
ALWAYS_INLINE
bool ContainsSameAccessSlow(u64 *s, u64 a, u64 sync_epoch, bool is_write) {
Shadow cur(a);
for (uptr i = 0; i < kShadowCnt; i++) {
Shadow old(LoadShadow(&s[i]));
if (Shadow::Addr0AndSizeAreEqual(cur, old) &&
old.TidWithIgnore() == cur.TidWithIgnore() &&
old.epoch() > sync_epoch &&
old.IsAtomic() == cur.IsAtomic() &&
old.IsRead() <= cur.IsRead())
return true;
}
return false;
}
#if defined(__SSE3__) && TSAN_SHADOW_COUNT == 4
#define SHUF(v0, v1, i0, i1, i2, i3) _mm_castps_si128(_mm_shuffle_ps( \
_mm_castsi128_ps(v0), _mm_castsi128_ps(v1), \
(i0)*1 + (i1)*4 + (i2)*16 + (i3)*64))
ALWAYS_INLINE
bool ContainsSameAccessFast(u64 *s, u64 a, u64 sync_epoch, bool is_write) {
// This is an optimized version of ContainsSameAccessSlow.
// load current access into access[0:63]
const m128 access = _mm_cvtsi64_si128(a);
// duplicate high part of access in addr0:
// addr0[0:31] = access[32:63]
// addr0[32:63] = access[32:63]
// addr0[64:95] = access[32:63]
// addr0[96:127] = access[32:63]
const m128 addr0 = SHUF(access, access, 1, 1, 1, 1);
// load 4 shadow slots
const m128 shadow0 = _mm_load_si128((__m128i*)s);
const m128 shadow1 = _mm_load_si128((__m128i*)s + 1);
// load high parts of 4 shadow slots into addr_vect:
// addr_vect[0:31] = shadow0[32:63]
// addr_vect[32:63] = shadow0[96:127]
// addr_vect[64:95] = shadow1[32:63]
// addr_vect[96:127] = shadow1[96:127]
m128 addr_vect = SHUF(shadow0, shadow1, 1, 3, 1, 3);
if (!is_write) {
// set IsRead bit in addr_vect
const m128 rw_mask1 = _mm_cvtsi64_si128(1<<15);
const m128 rw_mask = SHUF(rw_mask1, rw_mask1, 0, 0, 0, 0);
addr_vect = _mm_or_si128(addr_vect, rw_mask);
}
// addr0 == addr_vect?
const m128 addr_res = _mm_cmpeq_epi32(addr0, addr_vect);
// epoch1[0:63] = sync_epoch
const m128 epoch1 = _mm_cvtsi64_si128(sync_epoch);
// epoch[0:31] = sync_epoch[0:31]
// epoch[32:63] = sync_epoch[0:31]
// epoch[64:95] = sync_epoch[0:31]
// epoch[96:127] = sync_epoch[0:31]
const m128 epoch = SHUF(epoch1, epoch1, 0, 0, 0, 0);
// load low parts of shadow cell epochs into epoch_vect:
// epoch_vect[0:31] = shadow0[0:31]
// epoch_vect[32:63] = shadow0[64:95]
// epoch_vect[64:95] = shadow1[0:31]
// epoch_vect[96:127] = shadow1[64:95]
const m128 epoch_vect = SHUF(shadow0, shadow1, 0, 2, 0, 2);
// epoch_vect > sync_epoch?
const m128 epoch_res = _mm_cmpgt_epi32(epoch_vect, epoch);
// addr_res & epoch_res
const m128 res = _mm_and_si128(addr_res, epoch_res);
// mask[0] = res[7]
// mask[1] = res[15]
// ...
// mask[15] = res[127]
const int mask = _mm_movemask_epi8(res);
return mask != 0;
}
#endif
ALWAYS_INLINE
bool ContainsSameAccess(u64 *s, u64 a, u64 sync_epoch, bool is_write) {
#if defined(__SSE3__) && TSAN_SHADOW_COUNT == 4
bool res = ContainsSameAccessFast(s, a, sync_epoch, is_write);
DCHECK_EQ(res, ContainsSameAccessSlow(s, a, sync_epoch, is_write));
return res;
#else
return ContainsSameAccessSlow(s, a, sync_epoch, is_write);
#endif
}
ALWAYS_INLINE USED
void MemoryAccess(ThreadState *thr, uptr pc, uptr addr,
int kAccessSizeLog, bool kAccessIsWrite, bool kIsAtomic) {
@@ -618,14 +709,12 @@ void MemoryAccess(ThreadState *thr, uptr pc, uptr addr,
}
FastState fast_state = thr->fast_state;
if (fast_state.GetIgnoreBit())
if (fast_state.GetIgnoreBit()) {
StatInc(thr, StatMop);
StatInc(thr, kAccessIsWrite ? StatMopWrite : StatMopRead);
StatInc(thr, (StatType)(StatMop1 + kAccessSizeLog));
StatInc(thr, StatMopIgnored);
return;
if (kCollectHistory) {
fast_state.IncrementEpoch();
thr->fast_state = fast_state;
// We must not store to the trace if we do not store to the shadow.
// That is, this call must be moved somewhere below.
TraceAddEvent(thr, fast_state, EventTypeMop, pc);
}
Shadow cur(fast_state);
@@ -633,7 +722,40 @@ void MemoryAccess(ThreadState *thr, uptr pc, uptr addr,
cur.SetWrite(kAccessIsWrite);
cur.SetAtomic(kIsAtomic);
MemoryAccessImpl(thr, addr, kAccessSizeLog, kAccessIsWrite, kIsAtomic,
if (LIKELY(ContainsSameAccess(shadow_mem, cur.raw(),
thr->fast_synch_epoch, kAccessIsWrite))) {
StatInc(thr, StatMop);
StatInc(thr, kAccessIsWrite ? StatMopWrite : StatMopRead);
StatInc(thr, (StatType)(StatMop1 + kAccessSizeLog));
StatInc(thr, StatMopSame);
return;
}
if (kCollectHistory) {
fast_state.IncrementEpoch();
TraceAddEvent(thr, fast_state, EventTypeMop, pc);
thr->fast_state = fast_state;
cur.IncrementEpoch();
}
MemoryAccessImpl1(thr, addr, kAccessSizeLog, kAccessIsWrite, kIsAtomic,
shadow_mem, cur);
}
// Called by MemoryAccessRange in tsan_rtl_thread.cc
void MemoryAccessImpl(ThreadState *thr, uptr addr,
int kAccessSizeLog, bool kAccessIsWrite, bool kIsAtomic,
u64 *shadow_mem, Shadow cur) {
if (LIKELY(ContainsSameAccess(shadow_mem, cur.raw(),
thr->fast_synch_epoch, kAccessIsWrite))) {
StatInc(thr, StatMop);
StatInc(thr, kAccessIsWrite ? StatMopWrite : StatMopRead);
StatInc(thr, (StatType)(StatMop1 + kAccessSizeLog));
StatInc(thr, StatMopSame);
return;
}
MemoryAccessImpl1(thr, addr, kAccessSizeLog, kAccessIsWrite, kIsAtomic,
shadow_mem, cur);
}
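
Side note on the SHUF macro above: _mm_shuffle_ps selects two 32-bit lanes
from each operand, so SHUF(shadow0, shadow1, 1, 3, 1, 3) gathers the high
halves of the four 64-bit shadow words into one vector. A minimal standalone
sketch (outside the tsan runtime, so <emmintrin.h> can be included directly;
variable names are illustrative, and it only needs SSE2, which -msse3 implies):

#include <emmintrin.h>
#include <cstdint>
#include <cstdio>

#define SHUF(v0, v1, i0, i1, i2, i3) _mm_castps_si128(_mm_shuffle_ps( \
    _mm_castsi128_ps(v0), _mm_castsi128_ps(v1), \
    (i0)*1 + (i1)*4 + (i2)*16 + (i3)*64))

int main() {
  // Four 64-bit "shadow" words; the high 32 bits are 0x11111111, 0x22222222, ...
  alignas(16) uint64_t slots[4] = {
      0x1111111100000001ull, 0x2222222200000002ull,
      0x3333333300000003ull, 0x4444444400000004ull};
  const __m128i shadow0 = _mm_load_si128(reinterpret_cast<const __m128i*>(&slots[0]));
  const __m128i shadow1 = _mm_load_si128(reinterpret_cast<const __m128i*>(&slots[2]));
  // Lanes 1 and 3 of each vector hold the high halves of the two 64-bit words.
  const __m128i hi = SHUF(shadow0, shadow1, 1, 3, 1, 3);
  alignas(16) uint32_t out[4];
  _mm_store_si128(reinterpret_cast<__m128i*>(out), hi);
  // Prints: 11111111 22222222 33333333 44444444
  printf("%08x %08x %08x %08x\n", (unsigned)out[0], (unsigned)out[1],
         (unsigned)out[2], (unsigned)out[3]);
}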

View File

@@ -78,14 +78,14 @@ const u64 kShadowRodata = (u64)-1; // .rodata shadow marker
// FastState (from most significant bit):
// ignore : 1
// tid : kTidBits
// epoch : kClkBits
// unused : -
// history_size : 3
// epoch : kClkBits
class FastState {
public:
FastState(u64 tid, u64 epoch) {
x_ = tid << kTidShift;
x_ |= epoch << kClkShift;
x_ |= epoch;
DCHECK_EQ(tid, this->tid());
DCHECK_EQ(epoch, this->epoch());
DCHECK_EQ(GetIgnoreBit(), false);
@@ -110,13 +110,13 @@ class FastState {
}
u64 epoch() const {
u64 res = (x_ << (kTidBits + 1)) >> (64 - kClkBits);
u64 res = x_ & ((1ull << kClkBits) - 1);
return res;
}
void IncrementEpoch() {
u64 old_epoch = epoch();
x_ += 1 << kClkShift;
x_ += 1;
DCHECK_EQ(old_epoch + 1, epoch());
(void)old_epoch;
}
@@ -128,17 +128,19 @@ class FastState {
void SetHistorySize(int hs) {
CHECK_GE(hs, 0);
CHECK_LE(hs, 7);
x_ = (x_ & ~7) | hs;
x_ = (x_ & ~(kHistoryMask << kHistoryShift)) | (u64(hs) << kHistoryShift);
}
ALWAYS_INLINE
int GetHistorySize() const {
return (int)(x_ & 7);
return (int)((x_ >> kHistoryShift) & kHistoryMask);
}
void ClearHistorySize() {
x_ &= ~7;
SetHistorySize(0);
}
ALWAYS_INLINE
u64 GetTracePos() const {
const int hs = GetHistorySize();
// When hs == 0, the trace consists of 2 parts.
@@ -149,20 +151,21 @@
private:
friend class Shadow;
static const int kTidShift = 64 - kTidBits - 1;
static const int kClkShift = kTidShift - kClkBits;
static const u64 kIgnoreBit = 1ull << 63;
static const u64 kFreedBit = 1ull << 63;
static const u64 kHistoryShift = kClkBits;
static const u64 kHistoryMask = 7;
u64 x_;
};
// Shadow (from most significant bit):
// freed : 1
// tid : kTidBits
// epoch : kClkBits
// is_atomic : 1
// is_read : 1
// size_log : 2
// addr0 : 3
// epoch : kClkBits
class Shadow : public FastState {
public:
explicit Shadow(u64 x)
@@ -175,10 +178,10 @@ class Shadow : public FastState {
}
void SetAddr0AndSizeLog(u64 addr0, unsigned kAccessSizeLog) {
DCHECK_EQ(x_ & 31, 0);
DCHECK_EQ((x_ >> kClkBits) & 31, 0);
DCHECK_LE(addr0, 7);
DCHECK_LE(kAccessSizeLog, 3);
x_ |= (kAccessSizeLog << 3) | addr0;
x_ |= ((kAccessSizeLog << 3) | addr0) << kClkBits;
DCHECK_EQ(kAccessSizeLog, size_log());
DCHECK_EQ(addr0, this->addr0());
}
@@ -211,47 +214,34 @@ class Shadow : public FastState {
return shifted_xor == 0;
}
static inline bool Addr0AndSizeAreEqual(const Shadow s1, const Shadow s2) {
u64 masked_xor = (s1.x_ ^ s2.x_) & 31;
static ALWAYS_INLINE
bool Addr0AndSizeAreEqual(const Shadow s1, const Shadow s2) {
u64 masked_xor = ((s1.x_ ^ s2.x_) >> kClkBits) & 31;
return masked_xor == 0;
}
static inline bool TwoRangesIntersect(Shadow s1, Shadow s2,
static ALWAYS_INLINE bool TwoRangesIntersect(Shadow s1, Shadow s2,
unsigned kS2AccessSize) {
bool res = false;
u64 diff = s1.addr0() - s2.addr0();
if ((s64)diff < 0) { // s1.addr0 < s2.addr0 // NOLINT
// if (s1.addr0() + size1) > s2.addr0()) return true;
if (s1.size() > -diff) res = true;
if (s1.size() > -diff)
res = true;
} else {
// if (s2.addr0() + kS2AccessSize > s1.addr0()) return true;
if (kS2AccessSize > diff) res = true;
if (kS2AccessSize > diff)
res = true;
}
DCHECK_EQ(res, TwoRangesIntersectSLOW(s1, s2));
DCHECK_EQ(res, TwoRangesIntersectSLOW(s2, s1));
DCHECK_EQ(res, TwoRangesIntersectSlow(s1, s2));
DCHECK_EQ(res, TwoRangesIntersectSlow(s2, s1));
return res;
}
// The idea behind the offset is as follows.
// Consider that we have 8 bool's contained within a single 8-byte block
// (mapped to a single shadow "cell"). Now consider that we write to the bools
// from a single thread (which we consider the common case).
// W/o offsetting each access will have to scan 4 shadow values at average
// to find the corresponding shadow value for the bool.
// With offsetting we start scanning shadow with the offset so that
// each access hits necessary shadow straight off (at least in an expected
// optimistic case).
// This logic works seamlessly for any layout of user data. For example,
// if user data is {int, short, char, char}, then accesses to the int are
// offsetted to 0, short - 4, 1st char - 6, 2nd char - 7. Hopefully, accesses
// from a single thread won't need to scan all 8 shadow values.
unsigned ComputeSearchOffset() {
return x_ & 7;
}
u64 addr0() const { return x_ & 7; }
u64 size() const { return 1ull << size_log(); }
bool IsWrite() const { return !IsRead(); }
bool IsRead() const { return x_ & kReadBit; }
u64 ALWAYS_INLINE addr0() const { return (x_ >> kClkBits) & 7; }
u64 ALWAYS_INLINE size() const { return 1ull << size_log(); }
bool ALWAYS_INLINE IsWrite() const { return !IsRead(); }
bool ALWAYS_INLINE IsRead() const { return x_ & kReadBit; }
// The idea behind the freed bit is as follows.
// When the memory is freed (or otherwise unaccessible) we write to the shadow
@@ -276,15 +266,14 @@ class Shadow : public FastState {
return res;
}
bool IsBothReadsOrAtomic(bool kIsWrite, bool kIsAtomic) const {
// analyzes 5-th bit (is_read) and 6-th bit (is_atomic)
bool v = x_ & u64(((kIsWrite ^ 1) << kReadShift)
| (kIsAtomic << kAtomicShift));
bool ALWAYS_INLINE IsBothReadsOrAtomic(bool kIsWrite, bool kIsAtomic) const {
bool v = x_ & ((u64(kIsWrite ^ 1) << kReadShift)
| (u64(kIsAtomic) << kAtomicShift));
DCHECK_EQ(v, (!IsWrite() && !kIsWrite) || (IsAtomic() && kIsAtomic));
return v;
}
bool IsRWNotWeaker(bool kIsWrite, bool kIsAtomic) const {
bool ALWAYS_INLINE IsRWNotWeaker(bool kIsWrite, bool kIsAtomic) const {
bool v = ((x_ >> kReadShift) & 3)
<= u64((kIsWrite ^ 1) | (kIsAtomic << 1));
DCHECK_EQ(v, (IsAtomic() < kIsAtomic) ||
@@ -292,7 +281,7 @@
return v;
}
bool IsRWWeakerOrEqual(bool kIsWrite, bool kIsAtomic) const {
bool ALWAYS_INLINE IsRWWeakerOrEqual(bool kIsWrite, bool kIsAtomic) const {
bool v = ((x_ >> kReadShift) & 3)
>= u64((kIsWrite ^ 1) | (kIsAtomic << 1));
DCHECK_EQ(v, (IsAtomic() > kIsAtomic) ||
@@ -301,14 +290,14 @@
}
private:
static const u64 kReadShift = 5;
static const u64 kReadShift = 5 + kClkBits;
static const u64 kReadBit = 1ull << kReadShift;
static const u64 kAtomicShift = 6;
static const u64 kAtomicShift = 6 + kClkBits;
static const u64 kAtomicBit = 1ull << kAtomicShift;
u64 size_log() const { return (x_ >> 3) & 3; }
u64 size_log() const { return (x_ >> (3 + kClkBits)) & 3; }
static bool TwoRangesIntersectSLOW(const Shadow s1, const Shadow s2) {
static bool TwoRangesIntersectSlow(const Shadow s1, const Shadow s2) {
if (s1.addr0() == s2.addr0()) return true;
if (s1.addr0() < s2.addr0() && s1.addr0() + s1.size() > s2.addr0())
return true;
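
Side note on the FastState/Shadow layout changes above: the epoch now lives in
the low kClkBits bits (so IncrementEpoch is a plain x_ += 1) and history_size
sits just above it, while addr0/size_log/is_read/is_atomic in Shadow are
shifted up by kClkBits. A minimal standalone sketch of the FastState part,
using illustrative bit widths (the real kTidBits/kClkBits live in tsan_defs.h
and are not restated here):

#include <cassert>
#include <cstdint>

typedef uint64_t u64;
const int kTidBits = 13;  // illustrative width, not taken from this commit
const int kClkBits = 42;  // illustrative width, not taken from this commit

struct FastStateSketch {
  // Layout (from most significant bit):
  //   ignore : 1, tid : kTidBits, unused, history_size : 3, epoch : kClkBits
  static const int kTidShift = 64 - kTidBits - 1;
  static const u64 kHistoryShift = kClkBits;
  static const u64 kHistoryMask = 7;
  u64 x_;

  FastStateSketch(u64 tid, u64 epoch) : x_((tid << kTidShift) | epoch) {}
  u64 tid() const { return (x_ << 1) >> (kTidShift + 1); }
  u64 epoch() const { return x_ & ((1ull << kClkBits) - 1); }
  void IncrementEpoch() { x_ += 1; }  // epoch occupies the low bits
  void SetHistorySize(int hs) {
    x_ = (x_ & ~(kHistoryMask << kHistoryShift)) | (u64(hs) << kHistoryShift);
  }
  int GetHistorySize() const {
    return (int)((x_ >> kHistoryShift) & kHistoryMask);
  }
};

int main() {
  FastStateSketch s(/*tid=*/5, /*epoch=*/100);
  s.SetHistorySize(3);
  s.IncrementEpoch();
  assert(s.tid() == 5 && s.epoch() == 101 && s.GetHistorySize() == 3);
}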

View File

@@ -37,6 +37,7 @@ void StatOutput(u64 *stat) {
name[StatMop4] = " size 4 ";
name[StatMop8] = " size 8 ";
name[StatMopSame] = " Including same ";
name[StatMopIgnored] = " Including ignored ";
name[StatMopRange] = " Including range ";
name[StatMopRodata] = " Including .rodata ";
name[StatMopRangeRodata] = " Including .rodata range ";

View File

@@ -26,6 +26,7 @@ enum StatType {
StatMop4,
StatMop8,
StatMopSame,
StatMopIgnored,
StatMopRange,
StatMopRodata,
StatMopRangeRodata,

View File

@@ -16,8 +16,7 @@
do {
StatInc(thr, StatShadowProcessed);
const unsigned kAccessSize = 1 << kAccessSizeLog;
unsigned off = cur.ComputeSearchOffset();
u64 *sp = &shadow_mem[(idx + off) % kShadowCnt];
u64 *sp = &shadow_mem[idx];
old = LoadShadow(sp);
if (old.IsZero()) {
StatInc(thr, StatShadowZero);
@@ -33,16 +32,6 @@ do {
// same thread?
if (Shadow::TidsAreEqual(old, cur)) {
StatInc(thr, StatShadowSameThread);
if (OldIsInSameSynchEpoch(old, thr)) {
if (old.IsRWNotWeaker(kAccessIsWrite, kIsAtomic)) {
// found a slot that holds effectively the same info
// (that is, same tid, same sync epoch and same size)
StatInc(thr, StatMopSame);
return;
}
StoreIfNotYetStored(sp, &store_word);
break;
}
if (old.IsRWWeakerOrEqual(kAccessIsWrite, kIsAtomic))
StoreIfNotYetStored(sp, &store_word);
break;

View File

@@ -144,6 +144,11 @@ TEST(Mman, Stats) {
}
TEST(Mman, CallocOverflow) {
#if TSAN_DEBUG
// EXPECT_DEATH clones a thread with 4K stack,
// which is overflowed by tsan memory access functions in debug mode.
return;
#endif
size_t kArraySize = 4096;
volatile size_t kMaxSizeT = std::numeric_limits<size_t>::max();
volatile size_t kArraySize2 = kMaxSizeT / kArraySize + 10;