Traces throttling. Preliminary, please test before trusting.

git-svn-id: https://subversion.assembla.com/svn/qmcdev/trunk@6367 e5b18d87-469d-4833-9cc0-8cdfa06e9491
This commit is contained in:
Jaron Krogel 2014-10-03 21:44:09 +00:00
parent e63019fdee
commit 74ff608843
6 changed files with 64 additions and 65 deletions

View File

@ -1219,6 +1219,12 @@ struct TraceBuffer
} }
//report whether this trace buffer currently holds no data
//  returns true when buffer.size(0) is zero
//  NOTE(review): size(0) is presumably the extent of the first (row) dimension
//  of the buffer array -- confirm against the buffer's array type
inline bool empty()
{
return buffer.size(0)==0;
}
inline void set_samples(TraceSamples<T>& s) inline void set_samples(TraceSamples<T>& s)
{ {
samples = &s; samples = &s;
@ -1569,6 +1575,7 @@ public:
bool method_allows_traces; bool method_allows_traces;
bool streaming_traces; bool streaming_traces;
bool writing_traces; bool writing_traces;
int buffer_steps; //number of steps to buffer data between writes
bool verbose; bool verbose;
string format; string format;
bool hdf_format; bool hdf_format;
@ -1587,6 +1594,7 @@ public:
reset_permissions(); reset_permissions();
master_copy = true; master_copy = true;
communicator = comm; communicator = comm;
buffer_steps = 10;
format = "hdf"; format = "hdf";
default_domain = "scalars"; default_domain = "scalars";
request.set_scalar_domain(default_domain); request.set_scalar_domain(default_domain);
@ -1622,6 +1630,7 @@ public:
request = tm.request; request = tm.request;
streaming_traces = tm.streaming_traces; streaming_traces = tm.streaming_traces;
writing_traces = tm.writing_traces; writing_traces = tm.writing_traces;
buffer_steps = tm.buffer_steps;
verbose = tm.verbose; verbose = tm.verbose;
format = tm.format; format = tm.format;
hdf_format = tm.hdf_format; hdf_format = tm.hdf_format;
@ -1687,6 +1696,7 @@ public:
attrib.add(verbose_write, "verbose" ); attrib.add(verbose_write, "verbose" );
attrib.add(array, "particle" );//legacy attrib.add(array, "particle" );//legacy
attrib.add(array_defaults, "particle_defaults" );//legacy attrib.add(array_defaults, "particle_defaults" );//legacy
attrib.add(buffer_steps, "buffer" );//legacy
attrib.put(cur); attrib.put(cur);
writing_traces = writing == "yes"; writing_traces = writing == "yes";
bool scalars_on = scalar == "yes"; bool scalars_on = scalar == "yes";
@ -1694,6 +1704,8 @@ public:
bool use_scalar_defaults = scalar_defaults == "yes"; bool use_scalar_defaults = scalar_defaults == "yes";
bool use_array_defaults = array_defaults == "yes"; bool use_array_defaults = array_defaults == "yes";
verbose = verbose_write == "yes"; verbose = verbose_write == "yes";
if(buffer_steps<1)
APP_ABORT("TraceManager::put buffer steps cannot be less than 1");
tolower(format); tolower(format);
if(format=="hdf") if(format=="hdf")
{ {
@ -2051,34 +2063,43 @@ public:
} }
//write buffered trace data to file //monitor what step the run is on, and write when it is time
inline void write_buffers(vector<TraceManager*>& clones, int block) //(throttled write, threaded, critical)
inline void monitor_writes(int step,TraceManager* master)
{ {
if(master_copy) if(step%buffer_steps)
{ {
if(writing_traces) #pragma omp critical(TRACE_BUFFER_WRITE)
{ {
double tstart = MPI_Wtime(); write_buffers(*master);
if(verbose)
app_log()<<"TraceManager::write_buffers "<<master_copy<<endl;
if(hdf_format)
{
write_buffers_hdf(clones);
}
if(adios_format)
{
#ifdef HAVE_ADIOS
write_buffers_adios(clones, block);
#else
APP_ABORT("TraceManager::write_buffers (adios) ADIOS is not found");
#endif
//app_log()<<"TraceManager::write_buffers (adios) has not yet been implemented"<<endl;
app_log()<<" write_buffers() total time "<<MPI_Wtime()-tstart<<endl;
}
} }
} }
else }
APP_ABORT("TraceManager::write_buffers should not be called from non-master copy");
//write buffered trace data to file
//  master: TraceManager handed on to the format-specific writers
//          (write_buffers_hdf / write_buffers_adios)
//  does nothing unless writing_traces is set and buffers_empty() returns false
//  NOTE(review): no locking is done here; any serialization between threads
//  must be provided by the caller -- confirm at each call site
inline void write_buffers(TraceManager& master)
{
if(writing_traces && !buffers_empty())
{
//hdf and adios output are checked independently; both may be active
if(hdf_format)
write_buffers_hdf(master);
if(adios_format)
{
#ifdef HAVE_ADIOS
write_buffers_adios(master);
#else
//HAVE_ADIOS not defined at compile time: requesting adios output aborts the run
APP_ABORT("TraceManager::write_buffer (adios) ADIOS is not found");
#endif
}
//reset after every write
//NOTE(review): presumably this empties the buffers so the same rows are not
//written again on the next call -- confirm in reset_buffers()
reset_buffers();
}
}
inline bool buffers_empty()
{
return int_buffer.empty() && real_buffer.empty();
} }
@ -2139,7 +2160,7 @@ public:
inline void startRun(int blocks,vector<TraceManager*>& clones) inline void startRun(vector<TraceManager*>& clones)
{ {
if(verbose) if(verbose)
app_log()<<"TraceManager::startRun "<<master_copy<<endl; app_log()<<"TraceManager::startRun "<<master_copy<<endl;
@ -2148,18 +2169,23 @@ public:
initialize_traces(); initialize_traces();
check_clones(clones); check_clones(clones);
open_file(clones); open_file(clones);
for(int ip=0; ip<clones.size(); ++ip)
clones[ip]->reset_buffers();
} }
else else
APP_ABORT("TraceManager::startRun should not be called from non-master copy"); APP_ABORT("TraceManager::startRun should not be called from non-master copy");
} }
inline void stopRun() inline void stopRun(vector<TraceManager*>& clones)
{ {
if(verbose) if(verbose)
app_log()<<"TraceManager::stopRun "<<master_copy<<endl; app_log()<<"TraceManager::stopRun "<<master_copy<<endl;
if(master_copy) if(master_copy)
{ {
if(writing_traces)// flush any additional data from the buffers
for(int ip=0; ip<clones.size(); ++ip)
clones[ip]->write_buffers(*this);
close_file(); close_file();
finalize_traces(); finalize_traces();
} }
@ -2168,21 +2194,6 @@ public:
} }
inline void startBlock(int nsteps)
{
if(verbose)
app_log()<<"TraceManager::startBlock "<<master_copy<<endl;
reset_buffers();
}
inline void stopBlock()
{
if(verbose)
app_log()<<"TraceManager::stopBlock "<<master_copy<<endl;
}
inline void write_summary(string pad=" ") inline void write_summary(string pad=" ")
{ {
string pad2 = pad+" "; string pad2 = pad+" ";
@ -2252,16 +2263,10 @@ public:
} }
inline void write_buffers_hdf(vector<TraceManager*>& clones) inline void write_buffers_hdf(TraceManager& master)
{ {
if(verbose) int_buffer.write_hdf(*master.hdf_file,master.int_buffer.hdf_file_pointer);
app_log()<<"TraceManager::write_buffers_hdf "<<master_copy<<endl; real_buffer.write_hdf(*master.hdf_file,master.real_buffer.hdf_file_pointer);
for(int ip=0; ip<clones.size(); ++ip)
{
TraceManager& tm = *clones[ip];
tm.int_buffer.write_hdf(*hdf_file,int_buffer.hdf_file_pointer);
tm.real_buffer.write_hdf(*hdf_file,real_buffer.hdf_file_pointer);
}
} }

View File

@ -196,7 +196,7 @@ bool CSVMC::run()
Estimators->start(nBlocks); Estimators->start(nBlocks);
for (int ip=0; ip<NumThreads; ++ip) for (int ip=0; ip<NumThreads; ++ip)
CSMovers[ip]->startRun(nBlocks,false); CSMovers[ip]->startRun(nBlocks,false);
Traces->startRun(nBlocks,traceClones); Traces->startRun(traceClones);
const bool has_collectables=W.Collectables.size(); const bool has_collectables=W.Collectables.size();
ADIOS_PROFILE::profile_adios_init(nBlocks); ADIOS_PROFILE::profile_adios_init(nBlocks);
for (int block=0; block<nBlocks; ++block) for (int block=0; block<nBlocks; ++block)
@ -227,6 +227,7 @@ bool CSVMC::run()
wClones[ip]->saveEnsemble(wit,wit_end); wClones[ip]->saveEnsemble(wit,wit_end);
// if(storeConfigs && (now_loc%storeConfigs == 0)) // if(storeConfigs && (now_loc%storeConfigs == 0))
// ForwardWalkingHistory.storeConfigsForForwardWalking(*wClones[ip]); // ForwardWalkingHistory.storeConfigsForForwardWalking(*wClones[ip]);
traceClones[ip]->monitor_writes(now_loc,Traces);
} }
CSMovers[ip]->stopBlock(false); CSMovers[ip]->stopBlock(false);
// app_log()<<"THREAD "<<ip<<endl; // app_log()<<"THREAD "<<ip<<endl;
@ -236,11 +237,6 @@ bool CSVMC::run()
//Estimators->accumulateCollectables(wClones,nSteps); //Estimators->accumulateCollectables(wClones,nSteps);
CurrentStep+=nSteps; CurrentStep+=nSteps;
Estimators->stopBlock(estimatorClones); Estimators->stopBlock(estimatorClones);
ADIOS_PROFILE::profile_adios_end_comp(block);
ADIOS_PROFILE::profile_adios_start_trace(block);
Traces->write_buffers(traceClones, block);
ADIOS_PROFILE::profile_adios_end_trace(block);
ADIOS_PROFILE::profile_adios_start_checkpoint(block);
if(storeConfigs) if(storeConfigs)
recordBlock(block); recordBlock(block);
ADIOS_PROFILE::profile_adios_end_checkpoint(block); ADIOS_PROFILE::profile_adios_end_checkpoint(block);
@ -249,7 +245,7 @@ bool CSVMC::run()
Estimators->stop(estimatorClones); Estimators->stop(estimatorClones);
for (int ip=0; ip<NumThreads; ++ip) for (int ip=0; ip<NumThreads; ++ip)
CSMovers[ip]->stopRun2(); CSMovers[ip]->stopRun2();
Traces->stopRun(); Traces->stopRun(traceClones);
//copy back the random states //copy back the random states
for (int ip=0; ip<NumThreads; ++ip) for (int ip=0; ip<NumThreads; ++ip)
*(RandomNumberControl::Children[ip])=*(Rng[ip]); *(RandomNumberControl::Children[ip])=*(Rng[ip]);

View File

@ -243,7 +243,7 @@ bool DMCOMP::run()
Estimators->start(nBlocks); Estimators->start(nBlocks);
for(int ip=0; ip<NumThreads; ip++) for(int ip=0; ip<NumThreads; ip++)
Movers[ip]->startRun(nBlocks,false); Movers[ip]->startRun(nBlocks,false);
Traces->startRun(nBlocks,traceClones); Traces->startRun(traceClones);
Timer myclock; Timer myclock;
IndexType block = 0; IndexType block = 0;
IndexType updatePeriod=(QMCDriverMode[QMC_UPDATE_MODE])?Period4CheckProperties:(nBlocks+1)*nSteps; IndexType updatePeriod=(QMCDriverMode[QMC_UPDATE_MODE])?Period4CheckProperties:(nBlocks+1)*nSteps;
@ -280,6 +280,7 @@ bool DMCOMP::run()
Movers[ip]->setMultiplicity(wit,wit_end); Movers[ip]->setMultiplicity(wit,wit_end);
if(QMCDriverMode[QMC_UPDATE_MODE] && now%updatePeriod == 0) if(QMCDriverMode[QMC_UPDATE_MODE] && now%updatePeriod == 0)
Movers[ip]->updateWalkers(wit, wit_end); Movers[ip]->updateWalkers(wit, wit_end);
traceClones[ip]->monitor_writes(now,Traces);
} }
prof.pop(); //close dmc_advance prof.pop(); //close dmc_advance
@ -304,7 +305,6 @@ bool DMCOMP::run()
} }
// branchEngine->debugFWconfig(); // branchEngine->debugFWconfig();
Estimators->stopBlock(acceptRatio()); Estimators->stopBlock(acceptRatio());
Traces->write_buffers(traceClones, block);
block++; block++;
if(DumpConfig &&block%Period4CheckPoint == 0) if(DumpConfig &&block%Period4CheckPoint == 0)
{ {
@ -322,7 +322,7 @@ bool DMCOMP::run()
Estimators->stop(); Estimators->stop();
for (int ip=0; ip<NumThreads; ++ip) for (int ip=0; ip<NumThreads; ++ip)
Movers[ip]->stopRun2(); Movers[ip]->stopRun2();
Traces->stopRun(); Traces->stopRun(traceClones);
return finalize(nBlocks); return finalize(nBlocks);
} }

View File

@ -154,7 +154,6 @@ void QMCUpdateBase::stopRun2()
void QMCUpdateBase::startBlock(int steps) void QMCUpdateBase::startBlock(int steps)
{ {
Estimators->startBlock(steps); Estimators->startBlock(steps);
Traces->startBlock(steps);
nAccept = 0; nAccept = 0;
nReject=0; nReject=0;
nAllRejected=0; nAllRejected=0;
@ -165,7 +164,6 @@ void QMCUpdateBase::startBlock(int steps)
void QMCUpdateBase::stopBlock(bool collectall) void QMCUpdateBase::stopBlock(bool collectall)
{ {
Estimators->stopBlock(acceptRatio(),collectall); Estimators->stopBlock(acceptRatio(),collectall);
Traces->stopBlock();
} }
void QMCUpdateBase::initWalkers(WalkerIter_t it, WalkerIter_t it_end) void QMCUpdateBase::initWalkers(WalkerIter_t it, WalkerIter_t it_end)

View File

@ -52,7 +52,7 @@ bool RMCSingleOMP::run()
for (int ip=0; ip<NumThreads; ++ip) for (int ip=0; ip<NumThreads; ++ip)
Movers[ip]->startRun(nBlocks,false); Movers[ip]->startRun(nBlocks,false);
Traces->startRun(nBlocks,traceClones); Traces->startRun(traceClones);
const bool has_collectables=W.Collectables.size(); const bool has_collectables=W.Collectables.size();
for (int block=0; block<nBlocks; ++block) for (int block=0; block<nBlocks; ++block)
{ {

View File

@ -53,7 +53,7 @@ bool VMCSingleOMP::run()
Estimators->start(nBlocks); Estimators->start(nBlocks);
for (int ip=0; ip<NumThreads; ++ip) for (int ip=0; ip<NumThreads; ++ip)
Movers[ip]->startRun(nBlocks,false); Movers[ip]->startRun(nBlocks,false);
Traces->startRun(nBlocks,traceClones); Traces->startRun(traceClones);
const bool has_collectables=W.Collectables.size(); const bool has_collectables=W.Collectables.size();
for (int block=0; block<nBlocks; ++block) for (int block=0; block<nBlocks; ++block)
{ {
@ -82,20 +82,20 @@ bool VMCSingleOMP::run()
wClones[ip]->saveEnsemble(wit,wit_end); wClones[ip]->saveEnsemble(wit,wit_end);
// if(storeConfigs && (now_loc%storeConfigs == 0)) // if(storeConfigs && (now_loc%storeConfigs == 0))
// ForwardWalkingHistory.storeConfigsForForwardWalking(*wClones[ip]); // ForwardWalkingHistory.storeConfigsForForwardWalking(*wClones[ip]);
traceClones[ip]->monitor_writes(now_loc,Traces);
} }
Movers[ip]->stopBlock(false); Movers[ip]->stopBlock(false);
}//end-of-parallel for }//end-of-parallel for
//Estimators->accumulateCollectables(wClones,nSteps); //Estimators->accumulateCollectables(wClones,nSteps);
CurrentStep+=nSteps; CurrentStep+=nSteps;
Estimators->stopBlock(estimatorClones); Estimators->stopBlock(estimatorClones);
Traces->write_buffers(traceClones, block);
if(storeConfigs) if(storeConfigs)
recordBlock(block); recordBlock(block);
}//block }//block
Estimators->stop(estimatorClones); Estimators->stop(estimatorClones);
for (int ip=0; ip<NumThreads; ++ip) for (int ip=0; ip<NumThreads; ++ip)
Movers[ip]->stopRun2(); Movers[ip]->stopRun2();
Traces->stopRun(); Traces->stopRun(traceClones);
//copy back the random states //copy back the random states
for (int ip=0; ip<NumThreads; ++ip) for (int ip=0; ip<NumThreads; ++ip)
*(RandomNumberControl::Children[ip])=*(Rng[ip]); *(RandomNumberControl::Children[ip])=*(Rng[ip]);