[STATS] Use partitioned timer scheme

This change replaces the current timers with ones that partition time properly.
The current timers are nested: if a new timer, B, starts while the current
timer, A, is already running, A's time will include B's. To eliminate this
problem, the partitioned timers stop the current timer (A), let the new timer
(B) run, and, when the new timer finishes, restart the previously running timer
(A). With this partitioning of time, a thread's timers all sum to the
OMP_worker_thread_life time and can now easily show the percentage of time a
thread spends in different parts of the runtime or user code.
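
To make the scheme concrete, here is a minimal, self-contained sketch of a
stack-of-timers partitioning. It is illustrative only: it uses std::chrono and
credits time at each push/pop transition, whereas the patch below keeps
per-timer pause/resume bookkeeping on its explicitTimer objects.

    #include <cassert>
    #include <chrono>
    #include <map>
    #include <string>
    #include <vector>

    using clk = std::chrono::steady_clock;

    class PartitionedTimers {
        struct Frame { std::string name; clk::time_point since; };
        std::vector<Frame> stack;  // nested timers; only the top one accumulates time
        std::map<std::string, clk::duration> totals;
        void credit_top(clk::time_point now) {
            totals[stack.back().name] += now - stack.back().since;
        }
    public:
        void push(const std::string& name) {  // pause current timer, start new one
            clk::time_point now = clk::now();
            if (!stack.empty()) credit_top(now);
            stack.push_back(Frame{name, now});
        }
        void pop() {  // stop current timer, resume the previously running one
            assert(!stack.empty());
            clk::time_point now = clk::now();
            credit_top(now);
            stack.pop_back();
            if (!stack.empty()) stack.back().since = now;
        }
        const std::map<std::string, clk::duration>& totals_by_name() const { return totals; }
    };

Every tick between the first push and the final pop is credited to exactly one
timer, so the per-timer totals always sum to the elapsed wall time, which is
what makes the reported percentages meaningful.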

There is also a new state variable associated with each thread that records
where it is executing a task. This corresponds to the OMP_task_* timers; e.g.,
if time is accumulated in OMP_task_taskwait, then that thread executed tasks
inside a #pragma omp taskwait construct.
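
A self-contained sketch of that state bookkeeping follows; the enum, guard, and
helper here are simplified stand-ins for the patch's stats_state_e,
blockThreadState, and the dispatch in __kmp_invoke_task, not the runtime's
actual types.

    #include <cstdio>

    enum class State { Idle, Serial, Taskwait, Taskgroup, ImplicitTask };

    thread_local State g_state = State::Idle;

    // RAII guard: set a new state on entry, restore the previous one on scope
    // exit -- the same pattern as KMP_SET_THREAD_STATE_BLOCK below.
    struct ScopedState {
        State old_state;
        explicit ScopedState(State s) : old_state(g_state) { g_state = s; }
        ~ScopedState() { g_state = old_state; }
    };

    // Choose the timer that should account for a task executed right now,
    // mirroring how __kmp_invoke_task consults KMP_GET_THREAD_STATE().
    const char* task_timer_for(State s) {
        switch (s) {
        case State::Taskwait:  return "OMP_task_taskwait";
        case State::Taskgroup: return "OMP_task_taskgroup";
        default:               return "OMP_task_immediate";
        }
    }

    int main() {
        ScopedState guard(State::Taskwait);  // e.g. inside #pragma omp taskwait
        std::printf("%s\n", task_timer_for(g_state));  // prints OMP_task_taskwait
    }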

The changes mostly consist of converting the existing macros to the new
PARTITIONED_* macros, adding the new partitionedTimers class and its methods,
and adding the new state logic.
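
Concretely, a typical call site changes along these lines (condensed from the
__kmp_fork_call hunks below):

    // Before: a nested block timer, whose time absorbed any nested timers.
    KMP_TIME_BLOCK(OMP_work);

    // After: a partitioned block timer plus a scoped thread-state marker.
    KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
    KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);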

Differential Revision: http://reviews.llvm.org/D19229

llvm-svn: 268640
Jonathan Peyton 2016-05-05 16:15:57 +00:00
parent 3e0b7837bf
commit 11dc82fa83
11 changed files with 369 additions and 68 deletions

runtime/src/kmp_barrier.cpp

@@ -1048,6 +1048,8 @@ __kmp_barrier(enum barrier_type bt, int gtid, int is_split, size_t reduce_size,
void *reduce_data, void (*reduce)(void *, void *))
{
KMP_TIME_DEVELOPER_BLOCK(KMP_barrier);
KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
KMP_TIME_PARTITIONED_BLOCK(OMP_plain_barrier);
register int tid = __kmp_tid_from_gtid(gtid);
register kmp_info_t *this_thr = __kmp_threads[gtid];
register kmp_team_t *team = this_thr->th.th_team;
@@ -1348,6 +1350,8 @@ __kmp_end_split_barrier(enum barrier_type bt, int gtid)
void
__kmp_join_barrier(int gtid)
{
KMP_TIME_PARTITIONED_BLOCK(OMP_fork_join_barrier);
KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
KMP_TIME_DEVELOPER_BLOCK(KMP_join_barrier);
register kmp_info_t *this_thr = __kmp_threads[gtid];
register kmp_team_t *team;
@@ -1463,6 +1467,18 @@ __kmp_join_barrier(int gtid)
__kmp_task_team_wait(this_thr, team
USE_ITT_BUILD_ARG(itt_sync_obj) );
}
#if KMP_STATS_ENABLED
// Have master thread flag the workers to indicate they are now waiting for the
// next parallel region. Also wake them up so they switch their timers to idle.
for (int i=0; i<team->t.t_nproc; ++i) {
kmp_info_t* team_thread = team->t.t_threads[i];
if (team_thread == this_thr)
continue;
team_thread->th.th_stats->setIdleFlag();
if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME && team_thread->th.th_sleep_loc != NULL)
__kmp_null_resume_wrapper(__kmp_gtid_from_thread(team_thread), team_thread->th.th_sleep_loc);
}
#endif
#if USE_ITT_BUILD
if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
__kmp_itt_barrier_middle(gtid, itt_sync_obj);
@@ -1546,6 +1562,8 @@ __kmp_join_barrier(int gtid)
void
__kmp_fork_barrier(int gtid, int tid)
{
KMP_TIME_PARTITIONED_BLOCK(OMP_fork_join_barrier);
KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
KMP_TIME_DEVELOPER_BLOCK(KMP_fork_barrier);
kmp_info_t *this_thr = __kmp_threads[gtid];
kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL;

runtime/src/kmp_csupport.c

@@ -290,7 +290,6 @@ __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...)
}
else
{
KMP_STOP_EXPLICIT_TIMER(OMP_serial);
KMP_COUNT_BLOCK(OMP_PARALLEL);
}
#endif
@@ -345,10 +344,6 @@ __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...)
}
#endif
}
#if (KMP_STATS_ENABLED)
if (!inParallel)
KMP_START_EXPLICIT_TIMER(OMP_serial);
#endif
}
#if OMP_40_ENABLED
@@ -669,7 +664,6 @@ void
__kmpc_barrier(ident_t *loc, kmp_int32 global_tid)
{
KMP_COUNT_BLOCK(OMP_BARRIER);
KMP_TIME_BLOCK(OMP_barrier);
KC_TRACE( 10, ("__kmpc_barrier: called T#%d\n", global_tid ) );
if (! TCR_4(__kmp_init_parallel))
@@ -713,7 +707,7 @@ __kmpc_master(ident_t *loc, kmp_int32 global_tid)
if( KMP_MASTER_GTID( global_tid )) {
KMP_COUNT_BLOCK(OMP_MASTER);
KMP_START_EXPLICIT_TIMER(OMP_master);
KMP_PUSH_PARTITIONED_TIMER(OMP_master);
status = 1;
}
@@ -763,7 +757,7 @@ __kmpc_end_master(ident_t *loc, kmp_int32 global_tid)
KC_TRACE( 10, ("__kmpc_end_master: called T#%d\n", global_tid ) );
KMP_DEBUG_ASSERT( KMP_MASTER_GTID( global_tid ));
KMP_STOP_EXPLICIT_TIMER(OMP_master);
KMP_POP_PARTITIONED_TIMER();
#if OMPT_SUPPORT && OMPT_TRACE
kmp_info_t *this_thr = __kmp_threads[ global_tid ];
@@ -1088,7 +1082,7 @@ __kmpc_critical( ident_t * loc, kmp_int32 global_tid, kmp_critical_name * crit )
__kmpc_critical_with_hint(loc, global_tid, crit, omp_lock_hint_none);
#else
KMP_COUNT_BLOCK(OMP_CRITICAL);
KMP_TIME_BLOCK(OMP_critical_wait); /* Time spent waiting to enter the critical section */
KMP_TIME_PARTITIONED_BLOCK(OMP_critical_wait); /* Time spent waiting to enter the critical section */
kmp_user_lock_p lck;
KC_TRACE( 10, ("__kmpc_critical: called T#%d\n", global_tid ) );
@@ -1250,6 +1244,7 @@ __kmpc_critical_with_hint( ident_t * loc, kmp_int32 global_tid, kmp_critical_nam
__kmp_itt_critical_acquired( lck );
#endif /* USE_ITT_BUILD */
KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
KA_TRACE( 15, ("__kmpc_critical: done T#%d\n", global_tid ));
} // __kmpc_critical_with_hint
@@ -1342,7 +1337,7 @@ __kmpc_end_critical(ident_t *loc, kmp_int32 global_tid, kmp_critical_name *crit)
#endif
#endif // KMP_USE_DYNAMIC_LOCK
KMP_STOP_EXPLICIT_TIMER(OMP_critical);
KMP_POP_PARTITIONED_TIMER();
KA_TRACE( 15, ("__kmpc_end_critical: done T#%d\n", global_tid ));
}
@@ -1464,7 +1459,7 @@ __kmpc_single(ident_t *loc, kmp_int32 global_tid)
if (rc) {
// We are going to execute the single statement, so we should count it.
KMP_COUNT_BLOCK(OMP_SINGLE);
KMP_START_EXPLICIT_TIMER(OMP_single);
KMP_PUSH_PARTITIONED_TIMER(OMP_single);
}
#if OMPT_SUPPORT && OMPT_TRACE
@@ -1507,7 +1502,7 @@ void
__kmpc_end_single(ident_t *loc, kmp_int32 global_tid)
{
__kmp_exit_single( global_tid );
KMP_STOP_EXPLICIT_TIMER(OMP_single);
KMP_POP_PARTITIONED_TIMER();
#if OMPT_SUPPORT && OMPT_TRACE
kmp_info_t *this_thr = __kmp_threads[ global_tid ];

runtime/src/kmp_dispatch.cpp

@@ -1424,7 +1424,7 @@ __kmp_dispatch_next(
// This is potentially slightly misleading, schedule(runtime) will appear here even if the actual runtime schedule
// is static. (Which points out a disadvantage of schedule(runtime): even when static scheduling is used it costs
// more than a compile time choice to use static scheduling would.)
KMP_TIME_BLOCK(FOR_dynamic_scheduling);
KMP_TIME_PARTITIONED_BLOCK(FOR_dynamic_scheduling);
int status;
dispatch_private_info_template< T > * pr;

runtime/src/kmp_runtime.c

@@ -1543,7 +1543,8 @@ __kmp_fork_call(
#endif
{
KMP_TIME_BLOCK(OMP_work);
KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
__kmp_invoke_microtask( microtask, gtid, 0, argc, parent_team->t.t_argv
#if OMPT_SUPPORT
, exit_runtime_p
@@ -1618,7 +1619,8 @@ __kmp_fork_call(
gtid, parent_team->t.t_id, parent_team->t.t_pkfn ) );
{
KMP_TIME_BLOCK(OMP_work);
KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
if (! parent_team->t.t_invoke( gtid )) {
KMP_ASSERT2( 0, "cannot invoke microtask for MASTER thread" );
}
@@ -1738,7 +1740,8 @@ __kmp_fork_call(
#endif
{
KMP_TIME_BLOCK(OMP_work);
KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
__kmp_invoke_microtask( microtask, gtid, 0, argc, parent_team->t.t_argv
#if OMPT_SUPPORT
, exit_runtime_p
@@ -1795,7 +1798,8 @@ __kmp_fork_call(
team->t.t_level--;
// AC: call special invoker for outer "parallel" of the teams construct
{
KMP_TIME_BLOCK(OMP_work);
KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
invoker(gtid);
}
} else {
@@ -1842,7 +1846,8 @@ __kmp_fork_call(
#endif
{
KMP_TIME_BLOCK(OMP_work);
KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
__kmp_invoke_microtask( microtask, gtid, 0, argc, args
#if OMPT_SUPPORT
, exit_runtime_p
@@ -2178,7 +2183,8 @@ __kmp_fork_call(
} // END of timer KMP_fork_call block
{
KMP_TIME_BLOCK(OMP_work);
KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
// KMP_TIME_DEVELOPER_BLOCK(USER_master_invoke);
if (! team->t.t_invoke( gtid )) {
KMP_ASSERT2( 0, "cannot invoke microtask for MASTER thread" );
@@ -5448,6 +5454,8 @@ __kmp_launch_thread( kmp_info_t *this_thr )
KMP_STOP_DEVELOPER_EXPLICIT_TIMER(USER_launch_thread_loop);
{
KMP_TIME_DEVELOPER_BLOCK(USER_worker_invoke);
KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
rc = (*pteam)->t.t_invoke( gtid );
}
KMP_START_DEVELOPER_EXPLICIT_TIMER(USER_launch_thread_loop);
@@ -6783,7 +6791,8 @@ __kmp_invoke_task_func( int gtid )
#endif
{
KMP_TIME_BLOCK(OMP_work);
KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
rc = __kmp_invoke_microtask( (microtask_t) TCR_SYNC_PTR(team->t.t_pkfn),
gtid, tid, (int) team->t.t_argc, (void **) team->t.t_argv
#if OMPT_SUPPORT

runtime/src/kmp_sched.cpp

@@ -84,7 +84,7 @@ __kmp_for_static_init(
typename traits_t< T >::signed_t chunk
) {
KMP_COUNT_BLOCK(OMP_FOR_static);
KMP_TIME_BLOCK (FOR_static_scheduling);
KMP_TIME_PARTITIONED_BLOCK(FOR_static_scheduling);
typedef typename traits_t< T >::unsigned_t UT;
typedef typename traits_t< T >::signed_t ST;

runtime/src/kmp_stats.cpp

@@ -157,6 +157,7 @@ std::string statistic::format(char unit, bool total) const
void explicitTimer::start(timer_e timerEnumValue) {
startTime = tsc_tick_count::now();
totalPauseTime = 0;
if(timeStat::logEvent(timerEnumValue)) {
__kmp_stats_thread_ptr->incrementNestValue();
}
@@ -170,7 +171,7 @@ void explicitTimer::stop(timer_e timerEnumValue) {
tsc_tick_count finishTime = tsc_tick_count::now();
//stat->addSample ((tsc_tick_count::now() - startTime).ticks());
stat->addSample ((finishTime - startTime).ticks());
stat->addSample(((finishTime - startTime) - totalPauseTime).ticks());
if(timeStat::logEvent(timerEnumValue)) {
__kmp_stats_thread_ptr->push_event(startTime.getValue() - __kmp_stats_start_time.getValue(), finishTime.getValue() - __kmp_stats_start_time.getValue(), __kmp_stats_thread_ptr->getNestValue(), timerEnumValue);
@@ -182,6 +183,74 @@ void explicitTimer::stop(timer_e timerEnumValue) {
return;
}
/* ************************************************************** */
/* ************* partitionedTimers member functions ************* */
partitionedTimers::partitionedTimers() {
timer_stack.reserve(8);
}
// add a timer to this collection of partitioned timers.
void partitionedTimers::add_timer(explicit_timer_e timer_index, explicitTimer* timer_pointer) {
KMP_DEBUG_ASSERT((int)timer_index < (int)EXPLICIT_TIMER_LAST+1);
timers[timer_index] = timer_pointer;
}
// initialize the partitioned timers with an initial timer
void partitionedTimers::init(timerPair init_timer_pair) {
KMP_DEBUG_ASSERT(this->timer_stack.size() == 0);
timer_stack.push_back(init_timer_pair);
timers[init_timer_pair.get_index()]->start(init_timer_pair.get_timer());
}
// stop/save the current timer, and start the new timer (timer_pair)
// Special case: if the timer being pushed is the same as the currently
// running timer, only the stack is manipulated; the running timer is
// neither stopped nor restarted.
void partitionedTimers::push(timerPair timer_pair) {
// get the current timer
// stop current timer
// push new timer
// start the new timer
KMP_DEBUG_ASSERT(this->timer_stack.size() > 0);
timerPair current_timer = timer_stack.back();
timer_stack.push_back(timer_pair);
if(current_timer != timer_pair) {
timers[current_timer.get_index()]->pause();
timers[timer_pair.get_index()]->start(timer_pair.get_timer());
}
}
// stop the current timer, pop it off the stack, and resume the previously saved timer
void partitionedTimers::pop() {
// get the current timer
// stop current timer
// pop current timer
// get the new current timer and start it back up
KMP_DEBUG_ASSERT(this->timer_stack.size() > 1);
timerPair current_timer = timer_stack.back();
timer_stack.pop_back();
timerPair new_timer = timer_stack.back();
if(current_timer != new_timer) {
timers[current_timer.get_index()]->stop(current_timer.get_timer());
timers[new_timer.get_index()]->resume();
}
}
// Wind up all the currently running timers.
// This pops off all the timers from the stack and clears the stack
// After this is called, init() must be run again to initialize the
// stack of timers
void partitionedTimers::windup() {
while(timer_stack.size() > 1) {
this->pop();
}
if(timer_stack.size() > 0) {
timerPair last_timer = timer_stack.back();
timer_stack.pop_back();
timers[last_timer.get_index()]->stop(last_timer.get_timer());
}
}
/* ******************************************************************* */
/* ************* kmp_stats_event_vector member functions ************* */
@@ -397,8 +466,10 @@ void kmp_stats_output_module::windupExplicitTimers()
// If the timer wasn't running, this won't record anything anyway.
kmp_stats_list::iterator it;
for(it = __kmp_stats_list.begin(); it != __kmp_stats_list.end(); it++) {
kmp_stats_list* ptr = *it;
ptr->getPartitionedTimers()->windup();
for (int timer=0; timer<EXPLICIT_TIMER_LAST; timer++) {
(*it)->getExplicitTimer(explicit_timer_e(timer))->stop((timer_e)timer);
ptr->getExplicitTimer(explicit_timer_e(timer))->stop((timer_e)timer);
}
}
}
@@ -595,11 +666,7 @@ void __kmp_reset_stats()
// reset the event vector so all previous events are "erased"
(*it)->resetEventVector();
// May need to restart the explicit timers in thread zero?
}
KMP_START_EXPLICIT_TIMER(OMP_serial);
KMP_START_EXPLICIT_TIMER(OMP_start_end);
}
// This function will reset all stats and stop all threads' explicit timers if they haven't been stopped already.

runtime/src/kmp_stats.h

@@ -27,6 +27,7 @@
#include <limits>
#include <math.h>
#include <vector>
#include <string>
#include <stdint.h>
#include <new> // placement new
@@ -51,6 +52,23 @@ enum stats_flags_e {
logEvent = 1<<4 //!< statistic can be logged on the event timeline when KMP_STATS_EVENTS is on (valid only for timers)
};
/*!
* @ingroup STATS_GATHERING
* \brief the states which a thread can be in
*
*/
enum stats_state_e {
IDLE,
SERIAL_REGION,
FORK_JOIN_BARRIER,
PLAIN_BARRIER,
TASKWAIT,
TASKYIELD,
TASKGROUP,
IMPLICIT_TASK,
EXPLICIT_TASK
};
/*!
* \brief Add new counters under KMP_FOREACH_COUNTER() macro in kmp_stats.h
*
@@ -103,18 +121,25 @@ enum stats_flags_e {
*
* @ingroup STATS_GATHERING2
*/
#define KMP_FOREACH_TIMER(macro, arg) \
macro (OMP_start_end, stats_flags_e::onlyInMaster | stats_flags_e::noTotal, arg) \
macro (OMP_serial, stats_flags_e::onlyInMaster | stats_flags_e::noTotal, arg) \
macro (OMP_work, 0, arg) \
macro (OMP_barrier, 0, arg) \
macro (FOR_static_scheduling, 0, arg) \
macro (FOR_dynamic_scheduling, 0, arg) \
macro (OMP_task, 0, arg) \
macro (OMP_critical, 0, arg) \
macro (OMP_critical_wait, 0, arg) \
macro (OMP_single, 0, arg) \
macro (OMP_master, 0, arg) \
#define KMP_FOREACH_TIMER(macro, arg) \
macro (OMP_worker_thread_life, 0, arg) \
macro (FOR_static_scheduling, 0, arg) \
macro (FOR_dynamic_scheduling, 0, arg) \
macro (OMP_critical, 0, arg) \
macro (OMP_critical_wait, 0, arg) \
macro (OMP_single, 0, arg) \
macro (OMP_master, 0, arg) \
macro (OMP_idle, 0, arg) \
macro (OMP_plain_barrier, 0, arg) \
macro (OMP_fork_join_barrier, 0, arg) \
macro (OMP_parallel, 0, arg) \
macro (OMP_task_immediate, 0, arg) \
macro (OMP_task_taskwait, 0, arg) \
macro (OMP_task_taskyield, 0, arg) \
macro (OMP_task_taskgroup, 0, arg) \
macro (OMP_task_join_bar, 0, arg) \
macro (OMP_task_plain_bar, 0, arg) \
macro (OMP_serial, 0, arg) \
macro (OMP_set_numthreads, stats_flags_e::noUnits | stats_flags_e::noTotal, arg) \
macro (OMP_PARALLEL_args, stats_flags_e::noUnits | stats_flags_e::noTotal, arg) \
macro (FOR_static_iterations, stats_flags_e::noUnits | stats_flags_e::noTotal, arg) \
@@ -129,7 +154,16 @@ enum stats_flags_e {
// OMP_barrier -- Time at "real" barriers (includes task time)
// FOR_static_scheduling -- Time spent doing scheduling for a static "for"
// FOR_dynamic_scheduling -- Time spent doing scheduling for a dynamic "for"
// OMP_task -- Time spent executing tasks
// OMP_idle -- Worker threads' time spent waiting for inclusion in a parallel region
// OMP_plain_barrier -- Time spent in a barrier construct
// OMP_fork_join_barrier -- Time spent in the fork-join barrier surrounding a parallel region
// OMP_parallel -- Time spent inside a parallel construct
// OMP_task_immediate -- Time spent executing non-deferred tasks
// OMP_task_taskwait -- Time spent executing tasks inside a taskwait construct
// OMP_task_taskyield -- Time spent executing tasks inside a taskyield construct
// OMP_task_taskgroup -- Time spent executing tasks inside a taskgroup construct
// OMP_task_join_bar -- Time spent executing tasks inside a join barrier
// OMP_task_plain_bar -- Time spent executing tasks inside a barrier construct
// OMP_single -- Time spent executing a "single" region
// OMP_master -- Time spent executing a "master" region
// OMP_set_numthreads -- Values passed to omp_set_num_threads
@@ -197,12 +231,25 @@ enum stats_flags_e {
*
* @ingroup STATS_GATHERING
*/
#define KMP_FOREACH_EXPLICIT_TIMER(macro, arg) \
macro(OMP_serial, 0, arg) \
macro(OMP_start_end, 0, arg) \
macro(OMP_critical, 0, arg) \
macro(OMP_single, 0, arg) \
macro(OMP_master, 0, arg) \
#define KMP_FOREACH_EXPLICIT_TIMER(macro, arg) \
macro(OMP_worker_thread_life, 0, arg) \
macro(FOR_static_scheduling, 0, arg) \
macro(FOR_dynamic_scheduling, 0, arg) \
macro(OMP_critical, 0, arg) \
macro(OMP_critical_wait, 0, arg) \
macro(OMP_single, 0, arg) \
macro(OMP_master, 0, arg) \
macro(OMP_idle, 0, arg) \
macro(OMP_plain_barrier, 0, arg) \
macro(OMP_fork_join_barrier, 0, arg) \
macro(OMP_parallel, 0, arg) \
macro(OMP_task_immediate, 0, arg) \
macro(OMP_task_taskwait, 0, arg) \
macro(OMP_task_taskyield, 0, arg) \
macro(OMP_task_taskgroup, 0, arg) \
macro(OMP_task_join_bar, 0, arg) \
macro(OMP_task_plain_bar, 0, arg) \
macro(OMP_serial, 0, arg) \
KMP_FOREACH_EXPLICIT_DEVELOPER_TIMER(macro,arg) \
macro(LAST, 0, arg)
@@ -227,6 +274,21 @@ enum counter_e {
};
#undef ENUMERATE
class timerPair {
explicit_timer_e timer_index;
timer_e timer;
public:
timerPair(explicit_timer_e ti, timer_e t) : timer_index(ti), timer(t) {}
inline explicit_timer_e get_index() const { return timer_index; }
inline timer_e get_timer() const { return timer; }
bool operator==(const timerPair & rhs) {
return this->get_index() == rhs.get_index();
}
bool operator!=(const timerPair & rhs) {
return !(*this == rhs);
}
};
class statistic
{
double minVal;
@@ -294,15 +356,19 @@ class explicitTimer
{
timeStat * stat;
tsc_tick_count startTime;
tsc_tick_count pauseStartTime;
tsc_tick_count::tsc_interval_t totalPauseTime;
public:
explicitTimer () : stat(0), startTime(0) { }
explicitTimer (timeStat * s) : stat(s), startTime() { }
explicitTimer () : stat(0), startTime(0), pauseStartTime(0), totalPauseTime() { }
explicitTimer (timeStat * s) : stat(s), startTime(), pauseStartTime(0), totalPauseTime() { }
void setStat (timeStat *s) { stat = s; }
void start(timer_e timerEnumValue);
void pause() { pauseStartTime = tsc_tick_count::now(); }
void resume() { totalPauseTime += (tsc_tick_count::now() - pauseStartTime); }
void stop(timer_e timerEnumValue);
void reset() { startTime = 0; }
void reset() { startTime = 0; pauseStartTime = 0; totalPauseTime = 0; }
};
// Where all you need is to time a block, this is enough.
@@ -315,6 +381,49 @@ class blockTimer : public explicitTimer
~blockTimer() { stop(timerEnumValue); }
};
// Where you need to partition a thread's clock ticks into separate states;
// e.g., a partitionedTimers class with two timers, EXECUTING_TASK and
// DOING_NOTHING, would maintain these invariants:
// time(EXECUTING_TASK) + time(DOING_NOTHING) = total time thread is alive
// No clock tick in EXECUTING_TASK is a member of DOING_NOTHING and vice versa
class partitionedTimers
{
private:
explicitTimer* timers[EXPLICIT_TIMER_LAST+1];
std::vector<timerPair> timer_stack;
public:
partitionedTimers();
void add_timer(explicit_timer_e timer_index, explicitTimer* timer_pointer);
void init(timerPair timer_index);
void push(timerPair timer_index);
void pop();
void windup();
};
// Special wrapper around the partitioned timers to aid timing code blocks
// It avoids the need for an explicit end; leaving the scope suffices.
class blockPartitionedTimer
{
partitionedTimers* part_timers;
timerPair timer_pair;
public:
blockPartitionedTimer(partitionedTimers* pt, timerPair tp) : part_timers(pt), timer_pair(tp) { part_timers->push(timer_pair); }
~blockPartitionedTimer() { part_timers->pop(); }
};
// Special wrapper around the thread state to aid in keeping state in code blocks
// It avoids the need for an explicit end; leaving the scope suffices.
class blockThreadState
{
stats_state_e* state_pointer;
stats_state_e old_state;
public:
blockThreadState(stats_state_e* thread_state_pointer, stats_state_e new_state) : state_pointer(thread_state_pointer), old_state(*thread_state_pointer) {
*state_pointer = new_state;
}
~blockThreadState() { *state_pointer = old_state; }
};
// If all you want is a count, then you can use this...
// The individual per-thread counts will be aggregated into a statistic at program exit.
class counter
@@ -473,14 +582,19 @@ class kmp_stats_list {
timeStat _timers[TIMER_LAST+1];
counter _counters[COUNTER_LAST+1];
explicitTimer _explicitTimers[EXPLICIT_TIMER_LAST+1];
partitionedTimers _partitionedTimers;
int _nestLevel; // one per thread
kmp_stats_event_vector _event_vector;
kmp_stats_list* next;
kmp_stats_list* prev;
stats_state_e state;
int thread_is_idle_flag;
public:
kmp_stats_list() : next(this) , prev(this) , _event_vector(), _nestLevel(0) {
kmp_stats_list() : _nestLevel(0), _event_vector(), next(this), prev(this),
state(IDLE), thread_is_idle_flag(0) {
#define doInit(name,ignore1,ignore2) \
getExplicitTimer(EXPLICIT_TIMER_##name)->setStat(getTimer(TIMER_##name));
getExplicitTimer(EXPLICIT_TIMER_##name)->setStat(getTimer(TIMER_##name)); \
_partitionedTimers.add_timer(EXPLICIT_TIMER_##name, getExplicitTimer(EXPLICIT_TIMER_##name));
KMP_FOREACH_EXPLICIT_TIMER(doInit,0);
#undef doInit
}
@@ -488,6 +602,7 @@ class kmp_stats_list {
inline timeStat * getTimer(timer_e idx) { return &_timers[idx]; }
inline counter * getCounter(counter_e idx) { return &_counters[idx]; }
inline explicitTimer * getExplicitTimer(explicit_timer_e idx) { return &_explicitTimers[idx]; }
inline partitionedTimers * getPartitionedTimers() { return &_partitionedTimers; }
inline timeStat * getTimers() { return _timers; }
inline counter * getCounters() { return _counters; }
inline explicitTimer * getExplicitTimers() { return _explicitTimers; }
@@ -498,6 +613,12 @@ class kmp_stats_list {
inline void decrementNestValue() { _nestLevel--; }
inline int getGtid() const { return gtid; }
inline void setGtid(int newgtid) { gtid = newgtid; }
inline void setState(stats_state_e newstate) { state = newstate; }
inline stats_state_e getState() const { return state; }
inline stats_state_e * getStatePointer() { return &state; }
inline bool isIdle() { return thread_is_idle_flag==1; }
inline void setIdleFlag() { thread_is_idle_flag = 1; }
inline void resetIdleFlag() { thread_is_idle_flag = 0; }
kmp_stats_list* push_back(int gtid); // returns newly created list node
inline void push_event(uint64_t start_time, uint64_t stop_time, int nest_level, timer_e name) {
_event_vector.push_back(start_time, stop_time, nest_level, name);
@@ -698,6 +819,35 @@ extern kmp_stats_output_module __kmp_stats_output;
#define KMP_OUTPUT_STATS(heading_string) \
__kmp_output_stats(heading_string)
/*!
* \brief Initializes the partitioned timers to begin with name.
*
* @param name timer which you want this thread to begin with
*
* @ingroup STATS_GATHERING
*/
#define KMP_INIT_PARTITIONED_TIMERS(name) \
__kmp_stats_thread_ptr->getPartitionedTimers()->init(timerPair(EXPLICIT_TIMER_##name, TIMER_##name))
#define KMP_TIME_PARTITIONED_BLOCK(name) \
blockPartitionedTimer __PBLOCKTIME__(__kmp_stats_thread_ptr->getPartitionedTimers(), \
timerPair(EXPLICIT_TIMER_##name, TIMER_##name))
#define KMP_PUSH_PARTITIONED_TIMER(name) \
__kmp_stats_thread_ptr->getPartitionedTimers()->push(timerPair(EXPLICIT_TIMER_##name, TIMER_##name))
#define KMP_POP_PARTITIONED_TIMER() \
__kmp_stats_thread_ptr->getPartitionedTimers()->pop()
#define KMP_SET_THREAD_STATE(state_name) \
__kmp_stats_thread_ptr->setState(state_name)
#define KMP_GET_THREAD_STATE() \
__kmp_stats_thread_ptr->getState()
#define KMP_SET_THREAD_STATE_BLOCK(state_name) \
blockThreadState __BTHREADSTATE__(__kmp_stats_thread_ptr->getStatePointer(), state_name)
/*!
* \brief resets all stats (counters to 0, timers to 0 elapsed ticks)
*
@@ -739,6 +889,13 @@ extern kmp_stats_output_module __kmp_stats_output;
#define KMP_COUNT_DEVELOPER_BLOCK(n) ((void)0)
#define KMP_START_DEVELOPER_EXPLICIT_TIMER(n) ((void)0)
#define KMP_STOP_DEVELOPER_EXPLICIT_TIMER(n) ((void)0)
#define KMP_INIT_PARTITIONED_TIMERS(name) ((void)0)
#define KMP_TIME_PARTITIONED_BLOCK(name) ((void)0)
#define KMP_PUSH_PARTITIONED_TIMER(name) ((void)0)
#define KMP_POP_PARTITIONED_TIMER() ((void)0)
#define KMP_SET_THREAD_STATE(state_name) ((void)0)
#define KMP_GET_THREAD_STATE() ((void)0)
#define KMP_SET_THREAD_STATE_BLOCK(state_name) ((void)0)
#endif // KMP_STATS_ENABLED
#endif // KMP_STATS_H

runtime/src/kmp_stats_timing.h

@@ -40,11 +40,16 @@ class tsc_tick_count {
#endif
double ticks() const { return double(value); }
int64_t getValue() const { return value; }
tsc_interval_t& operator=(int64_t nvalue) { value = nvalue; return *this; }
friend class tsc_tick_count;
friend tsc_interval_t operator-(
const tsc_tick_count t1, const tsc_tick_count t0);
friend tsc_interval_t operator-(const tsc_tick_count& t1,
const tsc_tick_count& t0);
friend tsc_interval_t operator-(const tsc_tick_count::tsc_interval_t& i1,
const tsc_tick_count::tsc_interval_t& i0);
friend tsc_interval_t& operator+=(tsc_tick_count::tsc_interval_t& i1,
const tsc_tick_count::tsc_interval_t& i0);
};
#if KMP_HAVE___BUILTIN_READCYCLECOUNTER
@@ -66,14 +71,25 @@ class tsc_tick_count {
static double tick_time(); // returns seconds per cycle (period) of clock
#endif
static tsc_tick_count now() { return tsc_tick_count(); } // returns the rdtsc register value
friend tsc_tick_count::tsc_interval_t operator-(const tsc_tick_count t1, const tsc_tick_count t0);
friend tsc_tick_count::tsc_interval_t operator-(const tsc_tick_count& t1, const tsc_tick_count& t0);
};
inline tsc_tick_count::tsc_interval_t operator-(const tsc_tick_count t1, const tsc_tick_count t0)
inline tsc_tick_count::tsc_interval_t operator-(const tsc_tick_count& t1, const tsc_tick_count& t0)
{
return tsc_tick_count::tsc_interval_t( t1.my_count-t0.my_count );
}
inline tsc_tick_count::tsc_interval_t operator-(const tsc_tick_count::tsc_interval_t& i1, const tsc_tick_count::tsc_interval_t& i0)
{
return tsc_tick_count::tsc_interval_t( i1.value-i0.value );
}
inline tsc_tick_count::tsc_interval_t& operator+=(tsc_tick_count::tsc_interval_t& i1, const tsc_tick_count::tsc_interval_t& i0)
{
i1.value += i0.value;
return i1;
}
#if KMP_HAVE_TICK_TIME
inline double tsc_tick_count::tsc_interval_t::seconds() const
{

runtime/src/kmp_tasking.c

@@ -36,16 +36,6 @@ static int __kmp_realloc_task_threads_data( kmp_info_t *thread, kmp_task_team_t
static void __kmp_bottom_half_finish_proxy( kmp_int32 gtid, kmp_task_t * ptask );
#endif
static inline void __kmp_null_resume_wrapper(int gtid, volatile void *flag) {
if (!flag) return;
// Attempt to wake up a thread: examine its type and call appropriate template
switch (((kmp_flag_64 *)flag)->get_type()) {
case flag32: __kmp_resume_32(gtid, NULL); break;
case flag64: __kmp_resume_64(gtid, NULL); break;
case flag_oncore: __kmp_resume_oncore(gtid, NULL); break;
}
}
#ifdef BUILD_TIED_TASK_STACK
//---------------------------------------------------------------------------
@@ -1207,8 +1197,17 @@ __kmp_invoke_task( kmp_int32 gtid, kmp_task_t *task, kmp_taskdata_t * current_ta
// Thunks generated by gcc take a different argument list.
//
if (!discard) {
#if KMP_STATS_ENABLED
KMP_COUNT_BLOCK(TASK_executed);
KMP_TIME_BLOCK (OMP_task);
switch(KMP_GET_THREAD_STATE()) {
case FORK_JOIN_BARRIER: KMP_PUSH_PARTITIONED_TIMER(OMP_task_join_bar); break;
case PLAIN_BARRIER: KMP_PUSH_PARTITIONED_TIMER(OMP_task_plain_bar); break;
case TASKYIELD: KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskyield); break;
case TASKWAIT: KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskwait); break;
case TASKGROUP: KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskgroup); break;
default: KMP_PUSH_PARTITIONED_TIMER(OMP_task_immediate); break;
}
#endif // KMP_STATS_ENABLED
#endif // OMP_40_ENABLED
#if OMPT_SUPPORT && OMPT_TRACE
@@ -1231,6 +1230,7 @@ __kmp_invoke_task( kmp_int32 gtid, kmp_task_t *task, kmp_taskdata_t * current_ta
{
(*(task->routine))(gtid, task);
}
KMP_POP_PARTITIONED_TIMER();
#if OMPT_SUPPORT && OMPT_TRACE
/* let OMPT know that we're returning to the callee task */
@@ -1369,6 +1369,7 @@ kmp_int32
__kmpc_omp_task( ident_t *loc_ref, kmp_int32 gtid, kmp_task_t * new_task)
{
kmp_int32 res;
KMP_SET_THREAD_STATE_BLOCK(EXPLICIT_TASK);
#if KMP_DEBUG
kmp_taskdata_t * new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
@@ -1392,6 +1393,7 @@ __kmpc_omp_taskwait( ident_t *loc_ref, kmp_int32 gtid )
kmp_taskdata_t * taskdata;
kmp_info_t * thread;
int thread_finished = FALSE;
KMP_SET_THREAD_STATE_BLOCK(TASKWAIT);
KA_TRACE(10, ("__kmpc_omp_taskwait(enter): T#%d loc=%p\n", gtid, loc_ref) );
@@ -1481,6 +1483,7 @@ __kmpc_omp_taskyield( ident_t *loc_ref, kmp_int32 gtid, int end_part )
int thread_finished = FALSE;
KMP_COUNT_BLOCK(OMP_TASKYIELD);
KMP_SET_THREAD_STATE_BLOCK(TASKYIELD);
KA_TRACE(10, ("__kmpc_omp_taskyield(enter): T#%d loc=%p end_part = %d\n",
gtid, loc_ref, end_part) );
@@ -1561,6 +1564,7 @@ __kmpc_end_taskgroup( ident_t* loc, int gtid )
KA_TRACE(10, ("__kmpc_end_taskgroup(enter): T#%d loc=%p\n", gtid, loc) );
KMP_DEBUG_ASSERT( taskgroup != NULL );
KMP_SET_THREAD_STATE_BLOCK(TASKGROUP);
if ( __kmp_tasking_mode != tskm_immediate_exec ) {
#if USE_ITT_BUILD

runtime/src/kmp_wait_release.h

@@ -18,6 +18,7 @@
#include "kmp.h"
#include "kmp_itt.h"
#include "kmp_stats.h"
/*!
@defgroup WAIT_RELEASE Wait/Release operations
@@ -104,6 +105,9 @@ __kmp_wait_template(kmp_info_t *this_thr, C *flag, int final_spin
}
th_gtid = this_thr->th.th_info.ds.ds_gtid;
KA_TRACE(20, ("__kmp_wait_sleep: T#%d waiting for flag(%p)\n", th_gtid, flag));
#if KMP_STATS_ENABLED
stats_state_e thread_state = KMP_GET_THREAD_STATE();
#endif
#if OMPT_SUPPORT && OMPT_BLAME
ompt_state_t ompt_state = this_thr->th.ompt_thread_info.state;
@@ -223,6 +227,15 @@ __kmp_wait_template(kmp_info_t *this_thr, C *flag, int final_spin
}
}
#if KMP_STATS_ENABLED
// Check if thread has been signalled to idle state
// This indicates that the logical "join-barrier" has finished
if (this_thr->th.th_stats->isIdle() && KMP_GET_THREAD_STATE() == FORK_JOIN_BARRIER) {
KMP_SET_THREAD_STATE(IDLE);
KMP_PUSH_PARTITIONED_TIMER(OMP_idle);
}
#endif
// Don't suspend if KMP_BLOCKTIME is set to "infinite"
if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME)
continue;
@@ -273,6 +286,14 @@ __kmp_wait_template(kmp_info_t *this_thr, C *flag, int final_spin
}
}
#endif
#if KMP_STATS_ENABLED
// If we were put into idle state, pop that off the state stack
if (KMP_GET_THREAD_STATE() == IDLE) {
KMP_POP_PARTITIONED_TIMER();
KMP_SET_THREAD_STATE(thread_state);
this_thr->th.th_stats->resetIdleFlag();
}
#endif
KMP_FSYNC_SPIN_ACQUIRED(spin);
}
@@ -556,6 +577,15 @@ public:
flag_type get_ptr_type() { return flag_oncore; }
};
// Used to wake up threads; the volatile void* flag is usually the th_sleep_loc
// associated with the given gtid.
static inline void __kmp_null_resume_wrapper(int gtid, volatile void *flag) {
switch (((kmp_flag_64 *)flag)->get_type()) {
case flag32: __kmp_resume_32(gtid, NULL); break;
case flag64: __kmp_resume_64(gtid, NULL); break;
case flag_oncore: __kmp_resume_oncore(gtid, NULL); break;
}
}
/*!
@}

runtime/src/z_Linux_util.c

@@ -697,6 +697,9 @@ __kmp_launch_worker( void *thr )
#if KMP_STATS_ENABLED
// set __thread local index to point to thread-specific stats
__kmp_stats_thread_ptr = ((kmp_info_t*)thr)->th.th_stats;
KMP_START_EXPLICIT_TIMER(OMP_worker_thread_life);
KMP_SET_THREAD_STATE(IDLE);
KMP_INIT_PARTITIONED_TIMERS(OMP_idle);
#endif
#if USE_ITT_BUILD
@@ -972,8 +975,9 @@ __kmp_create_worker( int gtid, kmp_info_t *th, size_t stack_size )
__kmp_stats_start_time = tsc_tick_count::now();
__kmp_stats_thread_ptr = th->th.th_stats;
__kmp_stats_init();
KMP_START_EXPLICIT_TIMER(OMP_serial);
KMP_START_EXPLICIT_TIMER(OMP_start_end);
KMP_START_EXPLICIT_TIMER(OMP_worker_thread_life);
KMP_SET_THREAD_STATE(SERIAL_REGION);
KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
}
__kmp_release_tas_lock(&__kmp_stats_lock, gtid);
@@ -1856,6 +1860,7 @@ void __kmp_resume_oncore(int target_gtid, kmp_flag_oncore *flag) {
void
__kmp_resume_monitor()
{
KMP_TIME_DEVELOPER_BLOCK(USER_resume);
int status;
#ifdef KMP_DEBUG
int gtid = TCR_4(__kmp_init_gtid) ? __kmp_get_gtid() : -1;