From 0c86244a5d56cb93f28545a6412ca8166c77178e Mon Sep 17 00:00:00 2001 From: Ye Luo Date: Tue, 26 Apr 2022 19:01:51 -0500 Subject: [PATCH 1/4] Add load imbalance measurement in batched drivers. --- src/QMCDrivers/DMC/DMCBatched.cpp | 4 +++- src/QMCDrivers/QMCDriverInput.cpp | 5 +++++ src/QMCDrivers/QMCDriverInput.h | 10 +++++++--- src/QMCDrivers/QMCDriverNew.cpp | 30 ++++++++++++++++++++++++++++-- src/QMCDrivers/QMCDriverNew.h | 7 +++++++ src/QMCDrivers/VMC/VMCBatched.cpp | 4 +++- 6 files changed, 53 insertions(+), 7 deletions(-) diff --git a/src/QMCDrivers/DMC/DMCBatched.cpp b/src/QMCDrivers/DMC/DMCBatched.cpp index deca4b21c..5f2630f35 100644 --- a/src/QMCDrivers/DMC/DMCBatched.cpp +++ b/src/QMCDrivers/DMC/DMCBatched.cpp @@ -103,7 +103,7 @@ void DMCBatched::advanceWalkers(const StateForThread& sft, } const int num_walkers = crowd.size(); - auto& pset_leader = walker_elecs.getLeader(); + auto& pset_leader = walker_elecs.getLeader(); const int num_particles = pset_leader.getTotalNum(); MCCoords drifts(num_walkers), drifts_reverse(num_walkers); @@ -475,6 +475,8 @@ bool DMCBatched::run() population_.redistributeWalkers(crowds_); } print_mem("DMCBatched after a block", app_debug_stream()); + if (qmcdriver_input_.get_measure_imbalance()) + measureImbalance(block); endBlock(); dmc_loop.stop(); diff --git a/src/QMCDrivers/QMCDriverInput.cpp b/src/QMCDrivers/QMCDriverInput.cpp index 3970ecd4f..3c96d52fb 100644 --- a/src/QMCDrivers/QMCDriverInput.cpp +++ b/src/QMCDrivers/QMCDriverInput.cpp @@ -36,6 +36,7 @@ void QMCDriverInput::readXML(xmlNodePtr cur) std::string serialize_walkers; std::string debug_checks_str; + std::string measure_imbalance_str; ParameterSet parameter_set; parameter_set.add(store_config_period_, "storeconfigs"); @@ -70,6 +71,7 @@ void QMCDriverInput::readXML(xmlNodePtr cur) parameter_set.add(max_disp_sq_, "maxDisplSq"); parameter_set.add(debug_checks_str, "debug_checks", {"no", "all", "checkGL_after_load", "checkGL_after_moves", "checkGL_after_tmove"}); 
+ parameter_set.add(measure_imbalance_str, "measure_imbalance", {"no", "yes"}); OhmmsAttributeSet aAttrib; // first stage in from QMCDriverFactory @@ -139,6 +141,9 @@ void QMCDriverInput::readXML(xmlNodePtr cur) debug_checks_ |= DriverDebugChecks::CHECKGL_AFTER_TMOVE; } + if (measure_imbalance_str == "yes") + measure_imbalance_ = true; + if (check_point_period_.period < 1) check_point_period_.period = max_blocks_; } diff --git a/src/QMCDrivers/QMCDriverInput.h b/src/QMCDrivers/QMCDriverInput.h index 08c545d97..7f23e2630 100644 --- a/src/QMCDrivers/QMCDriverInput.h +++ b/src/QMCDrivers/QMCDriverInput.h @@ -31,8 +31,8 @@ public: void readXML(xmlNodePtr cur); // To allow compile check if move constructor is still implicit - QMCDriverInput() = default; - QMCDriverInput(const QMCDriverInput&) = default; + QMCDriverInput() = default; + QMCDriverInput(const QMCDriverInput&) = default; QMCDriverInput& operator=(const QMCDriverInput&) = default; QMCDriverInput(QMCDriverInput&&) noexcept; QMCDriverInput& operator=(QMCDriverInput&&) noexcept; @@ -41,6 +41,9 @@ protected: bool scoped_profiling_ = false; /// determine additional checks for debugging purpose DriverDebugChecks debug_checks_ = DriverDebugChecks::ALL_OFF; + /// measure load imbalance (add a barrier) before data aggregation (obvious synchronization) + bool measure_imbalance_ = false; + /** @ingroup Input Parameters for QMCDriver base class * @{ * All input determined variables should be here @@ -128,13 +131,14 @@ public: DriverDebugChecks get_debug_checks() const { return debug_checks_; } bool get_scoped_profiling() const { return scoped_profiling_; } bool are_walkers_serialized() const { return crowd_serialize_walkers_; } + bool get_measure_imbalance() const { return measure_imbalance_; } const std::string get_drift_modifier() const { return drift_modifier_; } RealType get_drift_modifier_unr_a() const { return drift_modifier_unr_a_; } }; // These will cause a compiler error if the implicit move constructor has been 
broken -inline QMCDriverInput::QMCDriverInput(QMCDriverInput&&) noexcept = default; +inline QMCDriverInput::QMCDriverInput(QMCDriverInput&&) noexcept = default; inline QMCDriverInput& QMCDriverInput::operator=(QMCDriverInput&&) noexcept = default; } // namespace qmcplusplus diff --git a/src/QMCDrivers/QMCDriverNew.cpp b/src/QMCDrivers/QMCDriverNew.cpp index 9077e9f64..b1a866c3d 100644 --- a/src/QMCDrivers/QMCDriverNew.cpp +++ b/src/QMCDrivers/QMCDriverNew.cpp @@ -31,6 +31,7 @@ #include "Concurrency/Info.hpp" #include "QMCDrivers/GreenFunctionModifiers/DriftModifierBuilder.h" #include "Utilities/StlPrettyPrint.hpp" +#include "Utilities/Timer.h" #include "Message/UniformCommunicateError.h" namespace qmcplusplus @@ -234,8 +235,7 @@ bool QMCDriverNew::finalize(int block, bool dumpwalkers) return true; } -void QMCDriverNew::makeLocalWalkers(IndexType nwalkers, - RealType reserve) +void QMCDriverNew::makeLocalWalkers(IndexType nwalkers, RealType reserve) { ScopedTimer local_timer(timers_.create_walkers_timer); // ensure nwalkers local walkers in population_ @@ -462,6 +462,7 @@ QMCDriverNew::AdjustedWalkerCounts QMCDriverNew::adjustGlobalWalkerCount(int num */ void QMCDriverNew::endBlock() { + ScopedTimer local_timer(timers_.endblock_timer); RefVector all_scalar_estimators; FullPrecRealType total_block_weight = 0.0; @@ -570,4 +571,29 @@ void QMCDriverNew::checkLogAndGL(Crowd& crowd, const std::string_view location) throw std::runtime_error(std::string("checkLogAndGL failed at ") + std::string(location) + std::string("\n")); } +void QMCDriverNew::measureImbalance(int block) const +{ + ScopedTimer local_timer(timers_.imbalance_timer); + Timer only_this_barrier; + myComm->barrier(); + std::vector my_barrier_time(1, only_this_barrier.elapsed()); + std::vector barrier_time_all_ranks(myComm->size(), 0.0); + myComm->gather(my_barrier_time, barrier_time_all_ranks, 0); + if (!myComm->rank()) + { + auto const count = static_cast(barrier_time_all_ranks.size()); + const auto max_it = 
std::max_element(barrier_time_all_ranks.begin(), barrier_time_all_ranks.end()); + const auto min_it = std::min_element(barrier_time_all_ranks.begin(), barrier_time_all_ranks.end()); + app_log() << std::endl + << "Block " << block << " imbalance (slow ranks wait less):" << std::endl + << " min wait at " << std::distance(barrier_time_all_ranks.begin(), min_it) << " value = " << *min_it + << std::endl + << " max wait at " << std::distance(barrier_time_all_ranks.begin(), max_it) << " value = " << *max_it + << std::endl + << " average wait value = " + << std::accumulate(barrier_time_all_ranks.begin(), barrier_time_all_ranks.end(), 0.0) / count + << std::endl; + } +} + } // namespace qmcplusplus diff --git a/src/QMCDrivers/QMCDriverNew.h b/src/QMCDrivers/QMCDriverNew.h index dbb029eaf..b9376c55c 100644 --- a/src/QMCDrivers/QMCDriverNew.h +++ b/src/QMCDrivers/QMCDriverNew.h @@ -102,6 +102,9 @@ public: std::bitset qmc_driver_mode_; protected: + /// inject additional barrier and measure load imbalance. + void measureImbalance(int block) const; + /// end of a block operations. Aggregates statistics across all MPI ranks and write to disk. 
void endBlock(); /** This is a data structure strictly for QMCDriver and its derived classes * @@ -325,6 +328,8 @@ protected: NewTimer& hamiltonian_timer; NewTimer& collectables_timer; NewTimer& estimators_timer; + NewTimer& imbalance_timer; + NewTimer& endblock_timer; NewTimer& resource_timer; DriverTimers(const std::string& prefix) : checkpoint_timer(*timer_manager.createTimer(prefix + "CheckPoint", timer_level_medium)), @@ -336,6 +341,8 @@ protected: hamiltonian_timer(*timer_manager.createTimer(prefix + "Hamiltonian", timer_level_medium)), collectables_timer(*timer_manager.createTimer(prefix + "Collectables", timer_level_medium)), estimators_timer(*timer_manager.createTimer(prefix + "Estimators", timer_level_medium)), + imbalance_timer(*timer_manager.createTimer(prefix + "Imbalance", timer_level_medium)), + endblock_timer(*timer_manager.createTimer(prefix + "BlockEndDataAggregation", timer_level_medium)), resource_timer(*timer_manager.createTimer(prefix + "Resources", timer_level_medium)) {} }; diff --git a/src/QMCDrivers/VMC/VMCBatched.cpp b/src/QMCDrivers/VMC/VMCBatched.cpp index 75fd2a7b4..031dc8dea 100644 --- a/src/QMCDrivers/VMC/VMCBatched.cpp +++ b/src/QMCDrivers/VMC/VMCBatched.cpp @@ -66,7 +66,7 @@ void VMCBatched::advanceWalkers(const StateForThread& sft, timers.movepbyp_timer.start(); const int num_walkers = crowd.size(); - auto& walker_leader = walker_elecs.getLeader(); + auto& walker_leader = walker_elecs.getLeader(); const int num_particles = walker_leader.getTotalNum(); // Note std::vector is not like the rest of stl. std::vector moved(num_walkers, false); @@ -362,6 +362,8 @@ bool VMCBatched::run() } } print_mem("VMCBatched after a block", app_debug_stream()); + if (qmcdriver_input_.get_measure_imbalance()) + measureImbalance(block); endBlock(); vmc_loop.stop(); From 101fa5913f439f8acd752e463e38d8157e4f9e6d Mon Sep 17 00:00:00 2001 From: Ye Luo Date: Tue, 26 Apr 2022 19:21:46 -0500 Subject: [PATCH 2/4] Document measure_imbalance driver parameter. 
--- docs/methods.rst | 152 ++++++++++++++++++++++++----------------------- 1 file changed, 78 insertions(+), 74 deletions(-) diff --git a/docs/methods.rst b/docs/methods.rst index 7de0eb192..6ca89a1d0 100644 --- a/docs/methods.rst +++ b/docs/methods.rst @@ -316,39 +316,41 @@ Batched ``vmc`` driver (experimental) parameters: - +--------------------------------+--------------+-------------------------+-------------+-----------------------------------------------+ - | **Name** | **Datatype** | **Values** | **Default** | **Description** | - +================================+==============+=========================+=============+===============================================+ - | ``total_walkers`` | integer | :math:`> 0` | 1 | Total number of walkers over all MPI ranks | - +--------------------------------+--------------+-------------------------+-------------+-----------------------------------------------+ - | ``walkers_per_rank`` | integer | :math:`> 0` | 1 | Number of walkers per MPI rank | - +--------------------------------+--------------+-------------------------+-------------+-----------------------------------------------+ - | ``crowds`` | integer | :math:`> 0` | dep. 
| Number of desynchronized dwalker crowds | - +--------------------------------+--------------+-------------------------+-------------+-----------------------------------------------+ - | ``blocks`` | integer | :math:`\geq 0` | 1 | Number of blocks | - +--------------------------------+--------------+-------------------------+-------------+-----------------------------------------------+ - | ``steps`` | integer | :math:`\geq 0` | 1 | Number of steps per block | - +--------------------------------+--------------+-------------------------+-------------+-----------------------------------------------+ - | ``warmupsteps`` | integer | :math:`\geq 0` | 0 | Number of steps for warming up | - +--------------------------------+--------------+-------------------------+-------------+-----------------------------------------------+ - | ``substeps`` | integer | :math:`\geq 0` | 1 | Number of substeps per step | - +--------------------------------+--------------+-------------------------+-------------+-----------------------------------------------+ - | ``usedrift`` | text | yes,no | yes | Use the algorithm with drift | - +--------------------------------+--------------+-------------------------+-------------+-----------------------------------------------+ - | ``timestep`` | real | :math:`> 0` | 0.1 | Time step for each electron move | - +--------------------------------+--------------+-------------------------+-------------+-----------------------------------------------+ - | ``samples`` (not ready) | integer | :math:`\geq 0` | 0 | Number of walker samples for in this VMC run | - +--------------------------------+--------------+-------------------------+-------------+-----------------------------------------------+ - | ``storeconfigs`` (not ready) | integer | all values | 0 | Write configurations to files | - +--------------------------------+--------------+-------------------------+-------------+-----------------------------------------------+ - | ``blocks_between_recompute`` 
| integer | :math:`\geq 0` | dep. | Wavefunction recompute frequency | - +--------------------------------+--------------+-------------------------+-------------+-----------------------------------------------+ - | ``crowd_serialize_walkers`` | integer | yes, no | no | Force use of single walker APIs (for testing) | - +--------------------------------+--------------+-------------------------+-------------+-----------------------------------------------+ - | ``debug_checks`` | text | see additional info | dep. | Turn on/off additional recompute and checks | - +--------------------------------+--------------+-------------------------+-------------+-----------------------------------------------+ - | ``spin_mass`` | real | :math:`\geq 0` | 1.0 | Effective mass for spin sampling | - +--------------------------------+--------------+-------------------------+-------------+-----------------------------------------------+ + +--------------------------------+--------------+-------------------------+-------------+-------------------------------------------------+ + | **Name** | **Datatype** | **Values** | **Default** | **Description** | + +================================+==============+=========================+=============+=================================================+ + | ``total_walkers`` | integer | :math:`> 0` | 1 | Total number of walkers over all MPI ranks | + +--------------------------------+--------------+-------------------------+-------------+-------------------------------------------------+ + | ``walkers_per_rank`` | integer | :math:`> 0` | 1 | Number of walkers per MPI rank | + +--------------------------------+--------------+-------------------------+-------------+-------------------------------------------------+ + | ``crowds`` | integer | :math:`> 0` | dep. 
| Number of desynchronized walker crowds | + +--------------------------------+--------------+-------------------------+-------------+-------------------------------------------------+ + | ``blocks`` | integer | :math:`\geq 0` | 1 | Number of blocks | + +--------------------------------+--------------+-------------------------+-------------+-------------------------------------------------+ + | ``steps`` | integer | :math:`\geq 0` | 1 | Number of steps per block | + +--------------------------------+--------------+-------------------------+-------------+-------------------------------------------------+ + | ``warmupsteps`` | integer | :math:`\geq 0` | 0 | Number of steps for warming up | + +--------------------------------+--------------+-------------------------+-------------+-------------------------------------------------+ + | ``substeps`` | integer | :math:`\geq 0` | 1 | Number of substeps per step | + +--------------------------------+--------------+-------------------------+-------------+-------------------------------------------------+ + | ``usedrift`` | text | yes,no | yes | Use the algorithm with drift | + +--------------------------------+--------------+-------------------------+-------------+-------------------------------------------------+ + | ``timestep`` | real | :math:`> 0` | 0.1 | Time step for each electron move | + +--------------------------------+--------------+-------------------------+-------------+-------------------------------------------------+ + | ``samples`` (not ready) | integer | :math:`\geq 0` | 0 | Number of walker samples for this VMC run | + +--------------------------------+--------------+-------------------------+-------------+-------------------------------------------------+ + | ``storeconfigs`` (not ready) | integer | all values | 0 | Write configurations to files | + +--------------------------------+--------------+-------------------------+-------------+-------------------------------------------------+ + |
``blocks_between_recompute`` | integer | :math:`\geq 0` | dep. | Wavefunction recompute frequency | + +--------------------------------+--------------+-------------------------+-------------+-------------------------------------------------+ + | ``crowd_serialize_walkers`` | integer | yes, no | no | Force use of single walker APIs (for testing) | + +--------------------------------+--------------+-------------------------+-------------+-------------------------------------------------+ + | ``debug_checks`` | text | see additional info | dep. | Turn on/off additional recompute and checks | + +--------------------------------+--------------+-------------------------+-------------+-------------------------------------------------+ + | ``spin_mass`` | real | :math:`\geq 0` | 1.0 | Effective mass for spin sampling | + +--------------------------------+--------------+-------------------------+-------------+-------------------------------------------------+ + | ``measure_imbalance`` | text | yes,no | no | Measure load imbalance at the end of each block | + +--------------------------------+--------------+-------------------------+-------------+-------------------------------------------------+ Additional information: @@ -1564,47 +1566,49 @@ Batched ``dmc`` driver (experimental) parameters: - +--------------------------------+--------------+-------------------------+-------------+-----------------------------------------------+ - | **Name** | **Datatype** | **Values** | **Default** | **Description** | - +================================+==============+=========================+=============+===============================================+ - | ``total_walkers`` | integer | :math:`> 0` | 1 | Total number of walkers over all MPI ranks | - +--------------------------------+--------------+-------------------------+-------------+-----------------------------------------------+ - | ``walkers_per_rank`` | integer | :math:`> 0` | 1 | Number of walkers per MPI rank | - 
+--------------------------------+--------------+-------------------------+-------------+-----------------------------------------------+ - | ``crowds`` | integer | :math:`> 0` | dep. | Number of desynchronized dwalker crowds | - +--------------------------------+--------------+-------------------------+-------------+-----------------------------------------------+ - | ``blocks`` | integer | :math:`\geq 0` | 1 | Number of blocks | - +--------------------------------+--------------+-------------------------+-------------+-----------------------------------------------+ - | ``steps`` | integer | :math:`\geq 0` | 1 | Number of steps per block | - +--------------------------------+--------------+-------------------------+-------------+-----------------------------------------------+ - | ``warmupsteps`` | integer | :math:`\geq 0` | 0 | Number of steps for warming up | - +--------------------------------+--------------+-------------------------+-------------+-----------------------------------------------+ - | ``timestep`` | real | :math:`> 0` | 0.1 | Time step for each electron move | - +--------------------------------+--------------+-------------------------+-------------+-----------------------------------------------+ - | ``nonlocalmoves`` | string | yes, no, v0, v1, v3 | no | Run with T-moves | - +--------------------------------+--------------+-------------------------+-------------+-----------------------------------------------+ - | ``branching_cutoff_scheme`` | string | classic/DRV/ZSGMA/YL | classic | Branch cutoff scheme | - +--------------------------------+--------------+-------------------------+-------------+-----------------------------------------------+ - | ``blocks_between_recompute`` | integer | :math:`\geq 0` | dep. 
| Wavefunction recompute frequency | - +--------------------------------+--------------+-------------------------+-------------+-----------------------------------------------+ - | ``feedback`` | double | :math:`\geq 0` | 1.0 | Population feedback on the trial energy | - +--------------------------------+--------------+-------------------------+-------------+-----------------------------------------------+ - | ``sigmaBound`` | 10 | :math:`\geq 0` | 10 | Parameter to cutoff large weights | - +--------------------------------+--------------+-------------------------+-------------+-----------------------------------------------+ - | ``reconfiguration`` | string | yes/pure/other | no | Fixed population technique | - +--------------------------------+--------------+-------------------------+-------------+-----------------------------------------------+ - | ``storeconfigs`` | integer | all values | 0 | Store configurations | - +--------------------------------+--------------+-------------------------+-------------+-----------------------------------------------+ - | ``use_nonblocking`` | string | yes/no | yes | Using nonblocking send/recv | - +--------------------------------+--------------+-------------------------+-------------+-----------------------------------------------+ - | ``debug_disable_branching`` | string | yes/no | no | Disable branching for debugging | - +--------------------------------+--------------+-------------------------+-------------+-----------------------------------------------+ - | ``crowd_serialize_walkers`` | integer | yes, no | no | Force use of single walker APIs (for testing) | - +--------------------------------+--------------+-------------------------+-------------+-----------------------------------------------+ - | ``debug_checks`` | text | see additional info | dep. 
| Turn on/off additional recompute and checks | - +--------------------------------+--------------+-------------------------+-------------+-----------------------------------------------+ - | ``spin_mass`` | real | :math:`\geq 0` | 1.0 | Effective mass for spin sampling | - +--------------------------------+--------------+-------------------------+-------------+-----------------------------------------------+ + +--------------------------------+--------------+-------------------------+-------------+-------------------------------------------------+ + | **Name** | **Datatype** | **Values** | **Default** | **Description** | + +================================+==============+=========================+=============+=================================================+ + | ``total_walkers`` | integer | :math:`> 0` | 1 | Total number of walkers over all MPI ranks | + +--------------------------------+--------------+-------------------------+-------------+-------------------------------------------------+ + | ``walkers_per_rank`` | integer | :math:`> 0` | 1 | Number of walkers per MPI rank | + +--------------------------------+--------------+-------------------------+-------------+-------------------------------------------------+ + | ``crowds`` | integer | :math:`> 0` | dep. 
| Number of desynchronized walker crowds | + +--------------------------------+--------------+-------------------------+-------------+-------------------------------------------------+ + | ``blocks`` | integer | :math:`\geq 0` | 1 | Number of blocks | + +--------------------------------+--------------+-------------------------+-------------+-------------------------------------------------+ + | ``steps`` | integer | :math:`\geq 0` | 1 | Number of steps per block | + +--------------------------------+--------------+-------------------------+-------------+-------------------------------------------------+ + | ``warmupsteps`` | integer | :math:`\geq 0` | 0 | Number of steps for warming up | + +--------------------------------+--------------+-------------------------+-------------+-------------------------------------------------+ + | ``timestep`` | real | :math:`> 0` | 0.1 | Time step for each electron move | + +--------------------------------+--------------+-------------------------+-------------+-------------------------------------------------+ + | ``nonlocalmoves`` | string | yes, no, v0, v1, v3 | no | Run with T-moves | + +--------------------------------+--------------+-------------------------+-------------+-------------------------------------------------+ + | ``branching_cutoff_scheme`` | string | classic/DRV/ZSGMA/YL | classic | Branch cutoff scheme | + +--------------------------------+--------------+-------------------------+-------------+-------------------------------------------------+ + | ``blocks_between_recompute`` | integer | :math:`\geq 0` | dep.
| Wavefunction recompute frequency | + +--------------------------------+--------------+-------------------------+-------------+-------------------------------------------------+ + | ``feedback`` | double | :math:`\geq 0` | 1.0 | Population feedback on the trial energy | + +--------------------------------+--------------+-------------------------+-------------+-------------------------------------------------+ + | ``sigmaBound`` | 10 | :math:`\geq 0` | 10 | Parameter to cutoff large weights | + +--------------------------------+--------------+-------------------------+-------------+-------------------------------------------------+ + | ``reconfiguration`` | string | yes/pure/other | no | Fixed population technique | + +--------------------------------+--------------+-------------------------+-------------+-------------------------------------------------+ + | ``storeconfigs`` | integer | all values | 0 | Store configurations | + +--------------------------------+--------------+-------------------------+-------------+-------------------------------------------------+ + | ``use_nonblocking`` | string | yes/no | yes | Using nonblocking send/recv | + +--------------------------------+--------------+-------------------------+-------------+-------------------------------------------------+ + | ``debug_disable_branching`` | string | yes/no | no | Disable branching for debugging | + +--------------------------------+--------------+-------------------------+-------------+-------------------------------------------------+ + | ``crowd_serialize_walkers`` | integer | yes, no | no | Force use of single walker APIs (for testing) | + +--------------------------------+--------------+-------------------------+-------------+-------------------------------------------------+ + | ``debug_checks`` | text | see additional info | dep. 
| Turn on/off additional recompute and checks | + +--------------------------------+--------------+-------------------------+-------------+-------------------------------------------------+ + | ``spin_mass`` | real | :math:`\geq 0` | 1.0 | Effective mass for spin sampling | + +--------------------------------+--------------+-------------------------+-------------+-------------------------------------------------+ + | ``measure_imbalance`` | text | yes,no | no | Measure load imbalance at the end of each block | + +--------------------------------+--------------+-------------------------+-------------+-------------------------------------------------+ - ``crowds`` The number of crowds that the walkers are subdivided into on each MPI rank. If not provided, it is set equal to the number of OpenMP threads. From 23d3523e94c58a3ffd52dd90c487accc6c0ea407 Mon Sep 17 00:00:00 2001 From: Ye Luo Date: Tue, 26 Apr 2022 19:30:07 -0500 Subject: [PATCH 3/4] Update printing. --- src/QMCDrivers/QMCDriverNew.cpp | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/src/QMCDrivers/QMCDriverNew.cpp b/src/QMCDrivers/QMCDriverNew.cpp index b1a866c3d..175c01d0c 100644 --- a/src/QMCDrivers/QMCDriverNew.cpp +++ b/src/QMCDrivers/QMCDriverNew.cpp @@ -586,13 +586,12 @@ void QMCDriverNew::measureImbalance(int block) const const auto min_it = std::min_element(barrier_time_all_ranks.begin(), barrier_time_all_ranks.end()); app_log() << std::endl << "Block " << block << " imbalance (slow ranks wait less):" << std::endl - << " min wait at " << std::distance(barrier_time_all_ranks.begin(), min_it) << " value = " << *min_it - << std::endl - << " max wait at " << std::distance(barrier_time_all_ranks.begin(), max_it) << " value = " << *max_it - << std::endl - << " average wait value = " - << std::accumulate(barrier_time_all_ranks.begin(), barrier_time_all_ranks.end(), 0.0) / count - << std::endl; + << " average wait seconds = " + << 
std::accumulate(barrier_time_all_ranks.begin(), barrier_time_all_ranks.end(), 0.0) / count << std::endl + << " min wait at rank " << std::distance(barrier_time_all_ranks.begin(), min_it) + << ", seconds = " << *min_it << std::endl + << " max wait at rank " << std::distance(barrier_time_all_ranks.begin(), max_it) + << ", seconds = " << *max_it << std::endl; } } From 223de680de8d9602c7644ffa64399d4c2929650b Mon Sep 17 00:00:00 2001 From: Ye Luo Date: Wed, 27 Apr 2022 17:27:52 -0400 Subject: [PATCH 4/4] Refine timers in batched drivers. --- src/QMCDrivers/DMC/DMCBatched.cpp | 14 +++++++++----- src/QMCDrivers/QMCDriverNew.cpp | 12 ++++++++---- src/QMCDrivers/QMCDriverNew.h | 6 +++++- src/QMCDrivers/VMC/VMCBatched.cpp | 13 ++++++++++--- 4 files changed, 32 insertions(+), 13 deletions(-) diff --git a/src/QMCDrivers/DMC/DMCBatched.cpp b/src/QMCDrivers/DMC/DMCBatched.cpp index 5f2630f35..ac562dda0 100644 --- a/src/QMCDrivers/DMC/DMCBatched.cpp +++ b/src/QMCDrivers/DMC/DMCBatched.cpp @@ -428,18 +428,22 @@ bool DMCBatched::run() ScopedTimer local_timer(timers_.init_walkers_timer); ParallelExecutor<> section_start_task; section_start_task(crowds_.size(), initialLogEvaluation, std::ref(crowds_), std::ref(step_contexts_)); - } - print_mem("DMCBatched after initialLogEvaluation", app_summary()); - - { FullPrecRealType energy, variance; population_.measureGlobalEnergyVariance(*myComm, energy, variance); // false indicates we do not support kill at node crossings. branch_engine_->initParam(population_, energy, variance, dmcdriver_input_.get_reconfiguration(), false); walker_controller_->setTrialEnergy(branch_engine_->getEtrial()); + + print_mem("DMCBatched after initialLogEvaluation", app_summary()); + if (qmcdriver_input_.get_measure_imbalance()) + measureImbalance("InitialLogEvaluation"); } + // this barrier fences all previous load imbalance. Avoid block 0 timing pollution. 
+ myComm->barrier(); + + ScopedTimer local_timer(timers_.production_timer); ParallelExecutor<> crowd_task; for (int block = 0; block < num_blocks; ++block) @@ -476,7 +480,7 @@ bool DMCBatched::run() } print_mem("DMCBatched after a block", app_debug_stream()); if (qmcdriver_input_.get_measure_imbalance()) - measureImbalance(block); + measureImbalance("Block " + std::to_string(block)); endBlock(); dmc_loop.stop(); diff --git a/src/QMCDrivers/QMCDriverNew.cpp b/src/QMCDrivers/QMCDriverNew.cpp index 175c01d0c..8f3fcea48 100644 --- a/src/QMCDrivers/QMCDriverNew.cpp +++ b/src/QMCDrivers/QMCDriverNew.cpp @@ -123,6 +123,8 @@ void QMCDriverNew::checkNumCrowdsLTNumThreads(const int num_crowds) */ void QMCDriverNew::startup(xmlNodePtr cur, const QMCDriverNew::AdjustedWalkerCounts& awc) { + ScopedTimer local_timer(timers_.startup_timer); + app_summary() << QMCType << " Driver running with" << std::endl << " total_walkers = " << awc.global_walkers << std::endl << " walkers_per_rank = " << awc.walkers_per_rank << std::endl @@ -160,6 +162,9 @@ void QMCDriverNew::startup(xmlNodePtr cur, const QMCDriverNew::AdjustedWalkerCou // Once they are created move contexts can be created. 
createRngsStepContexts(crowds_.size()); + + if (qmcdriver_input_.get_measure_imbalance()) + measureImbalance("Startup"); } /** QMCDriverNew ignores h5name if you want to read and h5 config you have to explicitly @@ -219,9 +224,8 @@ void QMCDriverNew::recordBlock(int block) { if (qmcdriver_input_.get_dump_config() && block % qmcdriver_input_.get_check_point_period().period == 0) { - timers_.checkpoint_timer.start(); + ScopedTimer local_timer(timers_.checkpoint_timer); RandomNumberControl::write(root_name_, myComm); - timers_.checkpoint_timer.stop(); } } @@ -571,7 +575,7 @@ void QMCDriverNew::checkLogAndGL(Crowd& crowd, const std::string_view location) throw std::runtime_error(std::string("checkLogAndGL failed at ") + std::string(location) + std::string("\n")); } -void QMCDriverNew::measureImbalance(int block) const +void QMCDriverNew::measureImbalance(const std::string& tag) const { ScopedTimer local_timer(timers_.imbalance_timer); Timer only_this_barrier; @@ -585,7 +589,7 @@ void QMCDriverNew::measureImbalance(int block) const const auto max_it = std::max_element(barrier_time_all_ranks.begin(), barrier_time_all_ranks.end()); const auto min_it = std::min_element(barrier_time_all_ranks.begin(), barrier_time_all_ranks.end()); app_log() << std::endl - << "Block " << block << " imbalance (slow ranks wait less):" << std::endl + << tag << " imbalance (slow ranks wait less):" << std::endl << " average wait seconds = " << std::accumulate(barrier_time_all_ranks.begin(), barrier_time_all_ranks.end(), 0.0) / count << std::endl << " min wait at rank " << std::distance(barrier_time_all_ranks.begin(), min_it) diff --git a/src/QMCDrivers/QMCDriverNew.h b/src/QMCDrivers/QMCDriverNew.h index b9376c55c..28468c9d4 100644 --- a/src/QMCDrivers/QMCDriverNew.h +++ b/src/QMCDrivers/QMCDriverNew.h @@ -103,7 +103,7 @@ public: protected: /// inject additional barrier and measure load imbalance. 
- void measureImbalance(int block) const; + void measureImbalance(const std::string& tag) const; /// end of a block operations. Aggregates statistics across all MPI ranks and write to disk. void endBlock(); /** This is a data structure strictly for QMCDriver and its derived classes @@ -330,6 +330,8 @@ protected: NewTimer& estimators_timer; NewTimer& imbalance_timer; NewTimer& endblock_timer; + NewTimer& startup_timer; + NewTimer& production_timer; NewTimer& resource_timer; DriverTimers(const std::string& prefix) : checkpoint_timer(*timer_manager.createTimer(prefix + "CheckPoint", timer_level_medium)), @@ -343,6 +345,8 @@ protected: estimators_timer(*timer_manager.createTimer(prefix + "Estimators", timer_level_medium)), imbalance_timer(*timer_manager.createTimer(prefix + "Imbalance", timer_level_medium)), endblock_timer(*timer_manager.createTimer(prefix + "BlockEndDataAggregation", timer_level_medium)), + startup_timer(*timer_manager.createTimer(prefix + "Startup", timer_level_medium)), + production_timer(*timer_manager.createTimer(prefix + "Production", timer_level_medium)), resource_timer(*timer_manager.createTimer(prefix + "Resources", timer_level_medium)) {} }; diff --git a/src/QMCDrivers/VMC/VMCBatched.cpp b/src/QMCDrivers/VMC/VMCBatched.cpp index 031dc8dea..d21c05e93 100644 --- a/src/QMCDrivers/VMC/VMCBatched.cpp +++ b/src/QMCDrivers/VMC/VMCBatched.cpp @@ -299,10 +299,12 @@ bool VMCBatched::run() ScopedTimer local_timer(timers_.init_walkers_timer); ParallelExecutor<> section_start_task; section_start_task(crowds_.size(), initialLogEvaluation, std::ref(crowds_), std::ref(step_contexts_)); + print_mem("VMCBatched after initialLogEvaluation", app_summary()); + if (qmcdriver_input_.get_measure_imbalance()) + measureImbalance("InitialLogEvaluation"); } - print_mem("VMCBatched after initialLogEvaluation", app_summary()); - + ScopedTimer local_timer(timers_.production_timer); ParallelExecutor<> crowd_task; if (qmcdriver_input_.get_warmup_steps() > 0) @@ -331,8 
+333,13 @@ bool VMCBatched::run() app_log() << "Warm-up is completed!" << std::endl; print_mem("VMCBatched after Warmup", app_log()); + if (qmcdriver_input_.get_measure_imbalance()) + measureImbalance("Warmup"); } + // this barrier fences all previous load imbalance. Avoid block 0 timing pollution. + myComm->barrier(); + for (int block = 0; block < num_blocks; ++block) { vmc_loop.start(); @@ -363,7 +370,7 @@ bool VMCBatched::run() } print_mem("VMCBatched after a block", app_debug_stream()); if (qmcdriver_input_.get_measure_imbalance()) - measureImbalance(block); + measureImbalance("Block " + std::to_string(block)); endBlock(); vmc_loop.stop();