Merge branch 'develop' into opt-momentum-estimator

This commit is contained in:
Paul R. C. Kent 2018-01-05 09:35:58 -05:00 committed by GitHub
commit 58fcddbb54
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
33 changed files with 1803 additions and 348 deletions

View File

@ -7,21 +7,21 @@ FILE( WRITE "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp/src_mkl.cxx"
"#include <iostream>\n #include <mkl.h>\n int main() { return 0; }\n" )
try_compile(HAVE_MKL ${CMAKE_BINARY_DIR}
${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp/src_mkl.cxx
CMAKE_FLAGS "${CMAKE_CXX_FLAGS} -mkl" )
COMPILE_DEFINITIONS "-mkl" )
# Check for mkl_vml_functions.h
FILE( WRITE "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp/src_mkl_vml.cxx"
"#include <iostream>\n #include <mkl_vml_functions.h>\n int main() { return 0; }\n" )
try_compile(HAVE_MKL_VML ${CMAKE_BINARY_DIR}
${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp/src_mkl_vml.cxx
CMAKE_FLAGS "${CMAKE_CXX_FLAGS} -mkl" )
COMPILE_DEFINITIONS "-mkl" )
# Check for fftw3
FILE( WRITE "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp/src_mkl_fftw3.cxx"
"#include <iostream>\n #include <fftw/fftw3.h>\n int main() { return 0; }\n" )
try_compile(HAVE_MKL_FFTW3 ${CMAKE_BINARY_DIR}
${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp/src_mkl_fftw3.cxx
CMAKE_FLAGS "${CMAKE_CXX_FLAGS} -mkl" )
COMPILE_DEFINITIONS "-mkl" )
IF ( HAVE_MKL )
SET( MKL_FOUND 1 )

38
config/build_alcf_mira.sh Normal file
View File

@ -0,0 +1,38 @@
#!/bin/bash
Compiler=Clang++11
for name in real_SoA real_SoA_MP cplx_SoA cplx_SoA_MP \
real real_MP cplx cplx_MP
do
CMAKE_FLAGS="-D CMAKE_TOOLCHAIN_FILE=../config/BGQ_${Compiler}_ToolChain.cmake"
if [[ $name == *"cplx"* ]]; then
CMAKE_FLAGS="$CMAKE_FLAGS -D QMC_COMPLEX=1"
fi
if [[ $name == *"_SoA"* ]]; then
CMAKE_FLAGS="$CMAKE_FLAGS -D ENABLE_SOA=1"
fi
if [[ $name == *"_MP"* ]]; then
CMAKE_FLAGS="$CMAKE_FLAGS -D QMC_MIXED_PRECISION=1"
fi
folder=build_${Compiler}_${name}
echo "**********************************"
echo "$folder"
echo "$CMAKE_FLAGS"
echo "**********************************"
mkdir $folder
cd $folder
if [ ! -f CMakeCache.txt ] ; then
cmake $CMAKE_FLAGS ..
cmake $CMAKE_FLAGS ..
fi
make -j24
cd ..
echo
done

View File

@ -0,0 +1,45 @@
module unload cray-libsci
module load cray-hdf5-parallel
export CC=cc
export CXX=CC
export BOOST_ROOT=/soft/libraries/boost/1.64.0/intel
export CRAYPE_LINK_TYPE=dynamic
#TYPE=RelWithDebInfo
TYPE=Release
Compiler=Intel
for name in real_SoA real_SoA_MP cplx_SoA cplx_SoA_MP \
real real_MP cplx cplx_MP
do
CMAKE_FLAGS="-D CMAKE_BUILD_TYPE=$TYPE"
if [[ $name == *"cplx"* ]]; then
CMAKE_FLAGS="$CMAKE_FLAGS -D QMC_COMPLEX=1"
fi
if [[ $name == *"_SoA"* ]]; then
CMAKE_FLAGS="$CMAKE_FLAGS -D ENABLE_SOA=1"
fi
if [[ $name == *"_MP"* ]]; then
CMAKE_FLAGS="$CMAKE_FLAGS -D QMC_MIXED_PRECISION=1"
fi
folder=build_KNL_${Compiler}_${name}
echo "**********************************"
echo "$folder"
echo "$CMAKE_FLAGS"
echo "**********************************"
mkdir $folder
cd $folder
if [ ! -f CMakeCache.txt ] ; then
cmake $CMAKE_FLAGS ..
fi
make -j32
cd ..
echo
done

15
manual/README.md Normal file
View File

@ -0,0 +1,15 @@
# QMCPACK Manual
This directory contains the LaTeX source for the QMCPACK manual. The
PDF manual is not currently created by the build process and must be
created manually. Note that due to the use of the bibtopic package to
support two bibliographies, bibtex must be invoked twice:
```
pdflatex qmcpack_manual.tex
bibtex qmcpack_manual1
bibtex qmcpack_manual2
pdflatex qmcpack_manual.tex
pdflatex qmcpack_manual.tex
```

View File

@ -1,6 +1,3 @@
\appendix
\chapter{Derivation of twist averaging efficiency}
\label{sec:app_ta_efficiency}
In this appendix we derive the relative statistical efficiency of

6
manual/build_manual.sh Executable file
View File

@ -0,0 +1,6 @@
#!/bin/sh
pdflatex qmcpack_manual.tex
bibtex qmcpack_manual1
bibtex qmcpack_manual2
pdflatex qmcpack_manual.tex
pdflatex qmcpack_manual.tex

View File

@ -609,9 +609,8 @@ S =
We note at the left and right extremes, the values and first two
derivatives of the functions are zero, while at the center, $h_{j0}$
has a value of 1, $h_{j1}$ has a first derivative of 1, and $h_{j2}$
has a second derivative of 1.}
has a second derivative of 1. \label{fig:LPQHI} }
\end{center}
\label{fig:LPQHI}
\end{figure}
Figure~\ref{fig:LPQHI} shows plots of these function shapes.

View File

@ -46,6 +46,7 @@
% & \texttt{wlen } & integer & $\ge 0$ & 0 & number of samples per thread \\
& \texttt{maxDisplSq } & real & all values & -1 & maximum particle move \\
& \texttt{storeconfigs } & integer & all values & 0 & store configurations \\
& \texttt{use\_nonblocking } & string & yes/other & yes & using non-blocking send/recv \\
& \texttt{blocks\_between\_recompute} & integer & $\ge 0$ & dep. & wavefunction recompute frequency \\
\hline
\end{tabularx}

View File

@ -1,4 +1,6 @@
\documentclass[11pt,usletter]{report}
\usepackage{bibtopic}
\bibliographystyle{ieeetr}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{delarray}
@ -147,6 +149,9 @@
\tableofcontents
\newpage
\begin{btUnit}
\begin{btSect}{bibliography}
\input{introduction}
\input{features}
\input{installation}
@ -198,15 +203,20 @@
\input{design_features}
\input{developing}
\input{appendices}
\newpage
\chapter{References}
%\begingroup % Avoid "References" section title
%\renewcommand{\section}[2]{}%
\bibliographystyle{ieeetr}
\bibliography{bibliography}
%\bibliographystyle{ieeetr}
%\bibliography{bibliography}
%\endgroup
\btPrintCited
\end{btSect}
\end{btUnit}
\appendix
\input{appendices}
\begin{btUnit}
\input{qmcpack_papers}
\end{btUnit}
\end{document}

1231
manual/qmcpack_papers.bib Normal file

File diff suppressed because it is too large Load Diff

15
manual/qmcpack_papers.tex Normal file
View File

@ -0,0 +1,15 @@
\chapter{QMCPACK papers}
% Uset the bibtopic package to generate a bibliography local to this btsection below.
% btPrintAll creates the bibliography citing all entries in the files qmcpack_papers.bib
\begin{btSect}{qmcpack_papers}
The following is a list of all papers, theses, and book chapters
known to use QMCPACK. Please let the developers know if your paper
is missing, if you know of other works, or an entry is incorrect. We
list papers whether they cite QMCPACK directly or not. This list
will be placed on the \url{http://www.qmcpack.org} website.
\btPrintAll
\end{btSect}

View File

@ -12,9 +12,6 @@ are performed. If the flag is absent, then the corresponding
option is disabled.
\begin{description}
\item[\texttt{-{}-async\_swap}]{ When enabled, this option will launch a new thread
to handle the communication between walkers. If it is not enabled,
then the communication happens on the current thread. }
\item[\texttt{-{}-dryrun}]{ Validate the input file without performing the simulation.
This is a good way to ensure that QMCPACK will do what you think it will. }
\item[\texttt{-{}-help}]{ Print version information as well as a list of optional

View File

@ -153,6 +153,7 @@ void
cublas_inverse (cublasHandle_t handle,
float *Alist_d[], float *Ainvlist_d[],
float *AWorklist_d[], float *AinvWorklist_d[],
int *PivotArray, int *infoArray,
int N, int rowStride, int numMats,
bool useHigherPrecision)
{
@ -160,8 +161,6 @@ cublas_inverse (cublasHandle_t handle,
// Info array tells if a matrix inversion is successful
// = 0 : successful
// = k : U(k,k) = 0; inversion failed
int *infoArray;
callAndCheckError( cudaMalloc((void**) &infoArray, numMats * sizeof(int)), __LINE__ );
// If double precision operations are desired...
if (useHigherPrecision)
@ -174,16 +173,16 @@ cublas_inverse (cublasHandle_t handle,
// (ii) call cublas to do matrix inversion
// LU decomposition
callAndCheckError( cublasDgetrfBatched( handle, N, (double**)AWorklist_d, rowStride, NULL,
callAndCheckError( cublasDgetrfBatched( handle, N, (double**)AWorklist_d, rowStride, PivotArray,
infoArray, numMats), __LINE__ );
// Inversion
#if (CUDA_VERSION >= 6050)
callAndCheckError( cublasDgetriBatched( handle, N, (const double**)AWorklist_d, rowStride, NULL,
(double**)AinvWorklist_d, rowStride, infoArray, numMats), __LINE__ );
callAndCheckError( cublasDgetriBatched( handle, N, (const double**)AWorklist_d, rowStride, PivotArray,
(double**)AinvWorklist_d, rowStride, infoArray+numMats, numMats), __LINE__ );
#else
callAndCheckError( cublasDgetriBatched( handle, N, (double**)AWorklist_d, rowStride, NULL,
(double**)AinvWorklist_d, rowStride, infoArray, numMats), __LINE__ );
callAndCheckError( cublasDgetriBatched( handle, N, (double**)AWorklist_d, rowStride, PivotArray,
(double**)AinvWorklist_d, rowStride, infoArray+numMats, numMats), __LINE__ );
#endif
// (iii) convert results back to single precision
@ -195,24 +194,20 @@ cublas_inverse (cublasHandle_t handle,
{
// Call cublas to do matrix inversion
// LU decomposition
callAndCheckError( cublasSgetrfBatched( handle, N, Alist_d, rowStride, NULL,
callAndCheckError( cublasSgetrfBatched( handle, N, Alist_d, rowStride, PivotArray,
infoArray, numMats), __LINE__ );
// Inversion
#if (CUDA_VERSION >= 6050)
callAndCheckError( cublasSgetriBatched( handle, N, (const float**) Alist_d, rowStride, NULL,
Ainvlist_d, rowStride, infoArray, numMats), __LINE__ );
callAndCheckError( cublasSgetriBatched( handle, N, (const float**) Alist_d, rowStride, PivotArray,
Ainvlist_d, rowStride, infoArray+numMats, numMats), __LINE__ );
#else
callAndCheckError( cublasSgetriBatched( handle, N, Alist_d, rowStride, NULL,
Ainvlist_d, rowStride, infoArray, numMats), __LINE__ );
callAndCheckError( cublasSgetriBatched( handle, N, Alist_d, rowStride, PivotArray,
Ainvlist_d, rowStride, infoArray+numMats, numMats), __LINE__ );
#endif
}
cudaDeviceSynchronize();
// Free resources
cudaFree(infoArray);
}
// 2. for double matrices
@ -220,6 +215,7 @@ void
cublas_inverse (cublasHandle_t handle,
double *Alist_d[], double *Ainvlist_d[],
double *AWorklist_d[], double *AinvWorklist_d[],
int *PivotArray, int *infoArray,
int N, int rowStride, int numMats,
bool useHigherPrecision)
{
@ -227,8 +223,6 @@ cublas_inverse (cublasHandle_t handle,
// Info array tells if a matrix inversion is successful
// = 0 : successful
// = k : U(k,k) = 0; inversion failed
int *infoArray;
callAndCheckError( cudaMalloc((void**) &infoArray, numMats * sizeof(int)), __LINE__ );
// (i) copy all the elements of Alist to AWorklist
dim3 dimBlockConvert (CONVERT_BS);
@ -237,22 +231,19 @@ cublas_inverse (cublasHandle_t handle,
// (ii) call cublas functions to do inversion
// LU decomposition
callAndCheckError( cublasDgetrfBatched( handle, N, AWorklist_d, rowStride, NULL,
callAndCheckError( cublasDgetrfBatched( handle, N, AWorklist_d, rowStride, PivotArray,
infoArray, numMats), __LINE__ );
// Inversion
#if (CUDA_VERSION >= 6050)
callAndCheckError( cublasDgetriBatched( handle, N, (const double**) AWorklist_d, rowStride, NULL,
Ainvlist_d, rowStride, infoArray, numMats), __LINE__ );
callAndCheckError( cublasDgetriBatched( handle, N, (const double**) AWorklist_d, rowStride, PivotArray,
Ainvlist_d, rowStride, infoArray+numMats, numMats), __LINE__ );
#else
callAndCheckError( cublasDgetriBatched( handle, N, AWorklist_d, rowStride, NULL,
Ainvlist_d, rowStride, infoArray, numMats), __LINE__ );
callAndCheckError( cublasDgetriBatched( handle, N, AWorklist_d, rowStride, PivotArray,
Ainvlist_d, rowStride, infoArray+numMats, numMats), __LINE__ );
#endif
cudaDeviceSynchronize();
cudaFree(infoArray);
}
@ -263,6 +254,7 @@ void
cublas_inverse (cublasHandle_t handle,
std::complex<float> *Alist_d[], std::complex<float> *Ainvlist_d[],
std::complex<float> *AWorklist_d[], std::complex<float> *AinvWorklist_d[],
int *PivotArray, int *infoArray,
int N, int rowStride, int numMats,
bool useHigherPrecision)
{
@ -270,8 +262,6 @@ cublas_inverse (cublasHandle_t handle,
// Info array tells if a matrix inversion is successful
// = 0 : successful
// = k : U(k,k) = 0; inversion failed
int *infoArray;
callAndCheckError( cudaMalloc((void**) &infoArray, numMats * sizeof(int)), __LINE__ );
// If double precision operations are desired...
if (useHigherPrecision)
@ -284,12 +274,12 @@ cublas_inverse (cublasHandle_t handle,
// (ii) call cublas to do matrix inversion
// LU decomposition
callAndCheckError( cublasZgetrfBatched( handle, N, (cuDoubleComplex**)AWorklist_d, rowStride, NULL, infoArray, numMats), __LINE__ );
callAndCheckError( cublasZgetrfBatched( handle, N, (cuDoubleComplex**)AWorklist_d, rowStride, PivotArray, infoArray, numMats), __LINE__ );
// Inversion
#if (CUDA_VERSION >= 6050)
callAndCheckError( cublasZgetriBatched( handle, N, (const cuDoubleComplex**)AWorklist_d, rowStride, NULL, (cuDoubleComplex**)AinvWorklist_d, rowStride, infoArray, numMats), __LINE__ );
callAndCheckError( cublasZgetriBatched( handle, N, (const cuDoubleComplex**)AWorklist_d, rowStride, PivotArray, (cuDoubleComplex**)AinvWorklist_d, rowStride, infoArray+numMats, numMats), __LINE__ );
#else
callAndCheckError( cublasZgetriBatched( handle, N, (cuDoubleComplex**)AWorklist_d, rowStride, NULL, (cuDoubleComplex**)AinvWorklist_d, rowStride, infoArray, numMats), __LINE__ );
callAndCheckError( cublasZgetriBatched( handle, N, (cuDoubleComplex**)AWorklist_d, rowStride, PivotArray, (cuDoubleComplex**)AinvWorklist_d, rowStride, infoArray+numMats, numMats), __LINE__ );
#endif
// (iii) convert results back to single precision
@ -301,22 +291,18 @@ cublas_inverse (cublasHandle_t handle,
{
// Call cublas to do matrix inversion
// LU decomposition
callAndCheckError( cublasCgetrfBatched( handle, N, (cuComplex**)Alist_d, rowStride, NULL,
callAndCheckError( cublasCgetrfBatched( handle, N, (cuComplex**)Alist_d, rowStride, PivotArray,
infoArray, numMats), __LINE__ );
// Inversion
#if (CUDA_VERSION >= 6050)
callAndCheckError( cublasCgetriBatched( handle, N, (const cuComplex**)Alist_d, rowStride, NULL, (cuComplex**)Ainvlist_d, rowStride, infoArray, numMats), __LINE__ );
callAndCheckError( cublasCgetriBatched( handle, N, (const cuComplex**)Alist_d, rowStride, PivotArray, (cuComplex**)Ainvlist_d, rowStride, infoArray+numMats, numMats), __LINE__ );
#else
callAndCheckError( cublasCgetriBatched( handle, N, (cuComplex**)Alist_d, rowStride, NULL, (cuComplex**)Ainvlist_d, rowStride, infoArray, numMats), __LINE__ );
callAndCheckError( cublasCgetriBatched( handle, N, (cuComplex**)Alist_d, rowStride, PivotArray, (cuComplex**)Ainvlist_d, rowStride, infoArray+numMats, numMats), __LINE__ );
#endif
}
cudaDeviceSynchronize();
// Free resources
cudaFree(infoArray);
}
// 4. for complex double matrices
@ -324,6 +310,7 @@ void
cublas_inverse (cublasHandle_t handle,
std::complex<double> *Alist_d[], std::complex<double> *Ainvlist_d[],
std::complex<double> *AWorklist_d[], std::complex<double> *AinvWorklist_d[],
int *PivotArray, int *infoArray,
int N, int rowStride, int numMats,
bool useHigherPrecision)
{
@ -331,8 +318,6 @@ cublas_inverse (cublasHandle_t handle,
// Info array tells if a matrix inversion is successful
// = 0 : successful
// = k : U(k,k) = 0; inversion failed
int *infoArray;
callAndCheckError( cudaMalloc((void**) &infoArray, numMats * sizeof(int)), __LINE__ );
// (i) copy all the elements of Alist to AWorklist
dim3 dimBlockConvert (CONVERT_BS);
@ -341,18 +326,15 @@ cublas_inverse (cublasHandle_t handle,
// (ii) call cublas to do matrix inversion
// LU decomposition
callAndCheckError( cublasZgetrfBatched( handle, N, (cuDoubleComplex**)AWorklist_d, rowStride, NULL, infoArray, numMats), __LINE__ );
callAndCheckError( cublasZgetrfBatched( handle, N, (cuDoubleComplex**)AWorklist_d, rowStride, PivotArray, infoArray, numMats), __LINE__ );
// Inversion
#if (CUDA_VERSION >= 6050)
callAndCheckError( cublasZgetriBatched( handle, N, (const cuDoubleComplex**)AWorklist_d, rowStride, NULL, (cuDoubleComplex**)Ainvlist_d, rowStride, infoArray, numMats), __LINE__ );
callAndCheckError( cublasZgetriBatched( handle, N, (const cuDoubleComplex**)AWorklist_d, rowStride, PivotArray, (cuDoubleComplex**)Ainvlist_d, rowStride, infoArray+numMats, numMats), __LINE__ );
#else
callAndCheckError( cublasZgetriBatched( handle, N, (cuDoubleComplex**)AWorklist_d, rowStride, NULL, (cuDoubleComplex**)Ainvlist_d, rowStride, infoArray, numMats), __LINE__ );
callAndCheckError( cublasZgetriBatched( handle, N, (cuDoubleComplex**)AWorklist_d, rowStride, PivotArray, (cuDoubleComplex**)Ainvlist_d, rowStride, infoArray+numMats, numMats), __LINE__ );
#endif
cudaDeviceSynchronize();
cudaFree(infoArray);
}

View File

@ -25,6 +25,7 @@ void
cublas_inverse (cublasHandle_t handle,
float *Alist_d[], float *Ainvlist_d[],
float *AWorkList_d[], float *AinvWorkList_d[],
int *PivotArray, int *infoArray,
int N, int rowStride, int numMats,
bool useHigherPrecision = true);
@ -35,6 +36,7 @@ void
cublas_inverse (cublasHandle_t handle,
double *Alist_d[], double *Ainvlist_d[],
double *AWorklist_d[], double *AinvWorklist_d[],
int *PivotArray, int *infoArray,
int N, int rowStride, int numMats,
bool useHigherPrecision = true);
@ -46,6 +48,7 @@ void
cublas_inverse (cublasHandle_t handle,
std::complex<float> *Alist_d[], std::complex<float> *Ainvlist_d[],
std::complex<float> *AWorkList_d[], std::complex<float> *AinvWorkList_d[],
int *PivotArray, int *infoArray,
int N, int rowStride, int numMats,
bool useHigherPrecision = true);
@ -56,6 +59,7 @@ void
cublas_inverse (cublasHandle_t handle,
std::complex<double> *Alist_d[], std::complex<double> *Ainvlist_d[],
std::complex<double> *AWorklist_d[], std::complex<double> *AinvWorklist_d[],
int *PivotArray, int *infoArray,
int N, int rowStride, int numMats,
bool useHigherPrecision = true);

View File

@ -30,7 +30,7 @@ void test_e2iphi()
T vcos[N];
T vsin[N];
for (int i = 0; i < N; i++) {
phi[0] = 0.2*i;
phi[i] = 0.2*i;
}
eval_e2iphi(N, phi, vcos, vsin);

View File

@ -43,9 +43,13 @@ public:
/** constructor with size n*/
explicit inline
Vector(size_t n=0): nLocal(n), nAllocated(0), X(nullptr)
Vector(size_t n=0, Type_t val = Type_t()): nLocal(n), nAllocated(0), X(nullptr)
{
if(n) resize_impl(n);
if(n)
{
resize_impl(n);
std::fill_n(X, n, val);
}
}
/** constructor with an initialized ref */
@ -113,12 +117,20 @@ public:
}
///resize
inline void resize(size_t n)
inline void resize(size_t n, Type_t val = Type_t())
{
if(nLocal>nAllocated)
throw std::runtime_error("Resize not allowed on Vector constructed by initialized memory.");
if(n>nAllocated)
{
resize_impl(n);
std::fill_n(X, n, val);
}
else if(n>nLocal)
{
std::fill_n(X+nLocal, n-nLocal, val);
nLocal=n;
}
else
nLocal=n;
return;
@ -204,7 +216,6 @@ private:
mAllocator.deallocate(X,nAllocated);
}
X=mAllocator.allocate(n);
std::fill_n(X, n, T());
nLocal=n;
nAllocated=n;
}

View File

@ -184,6 +184,7 @@ struct AsymmetricDTD
inline void evaluate(ParticleSet& P, int jat)
{
APP_ABORT(" No need to call AsymmetricDTD::evaluate(ParticleSet& P, int jat)");
//based on full evaluation. Only compute it if jat==0
if(jat==0) evaluate(P);
}

View File

@ -504,8 +504,9 @@ ParticleSet::makeMove(Index_t iat, const SingleParticlePos_t& displ)
void ParticleSet::setActive(int iat)
{
myTimers[3]->start();
for (size_t i=0,n=DistTables.size(); i< n; i++)
DistTables[i]->evaluate(*this,iat);
for (size_t i=0; i<DistTables.size(); i++)
if(DistTables[i]->DTType==DT_SOA)
DistTables[i]->evaluate(*this,iat);
myTimers[3]->stop();
}
@ -809,7 +810,8 @@ void ParticleSet::loadWalker(Walker_t& awalker, bool pbyp)
{
// in certain cases, full tables must be ready
for (int i=0; i< DistTables.size(); i++)
if(DistTables[i]->Need_full_table_loadWalker) DistTables[i]->evaluate(*this);
if(DistTables[i]->DTType==DT_AOS||DistTables[i]->Need_full_table_loadWalker)
DistTables[i]->evaluate(*this);
//computed so that other objects can use them, e.g., kSpaceJastrow
if(SK && SK->DoUpdate)
SK->UpdateAllPart(*this);

View File

@ -137,6 +137,7 @@ struct SymmetricDTD
inline void evaluate(ParticleSet& P, int jat)
{
APP_ABORT(" No need to call SymmetricDTD::evaluate(ParticleSet& P, int jat)");
//based on full evaluation. Only compute it if jat==0
if(jat==0) evaluate(P);
}

View File

@ -121,6 +121,8 @@ struct Walker
* When Multiplicity = 0, this walker will be destroyed.
*/
RealType Multiplicity;
////Number of copies sent only for MPI
int NumSentCopies;
/** The configuration vector (3N-dimensional vector to store
the positions of all the particles for a single walker)*/
@ -437,6 +439,7 @@ struct Walker
DataSet.add(Age);
DataSet.add(ReleasedNodeAge);
DataSet.add(ReleasedNodeWeight);
DataSet.add(NumSentCopies);
// vectors
DataSet.add(R.first_address(), R.last_address());
#if !defined(SOA_MEMORY_OPTIMIZED)
@ -466,7 +469,7 @@ struct Walker
void copyFromBuffer()
{
DataSet.rewind();
DataSet >> ID >> ParentID >> Generation >> Age >> ReleasedNodeAge >> ReleasedNodeWeight;
DataSet >> ID >> ParentID >> Generation >> Age >> ReleasedNodeAge >> ReleasedNodeWeight >> NumSentCopies;
// vectors
DataSet.get(R.first_address(), R.last_address());
#if !defined(SOA_MEMORY_OPTIMIZED)
@ -513,7 +516,7 @@ struct Walker
void updateBuffer()
{
DataSet.rewind();
DataSet << ID << ParentID << Generation << Age << ReleasedNodeAge << ReleasedNodeWeight;
DataSet << ID << ParentID << Generation << Age << ReleasedNodeAge << ReleasedNodeWeight << NumSentCopies;
// vectors
DataSet.put(R.first_address(), R.last_address());
#if !defined(SOA_MEMORY_OPTIMIZED)

View File

@ -10,8 +10,6 @@
//
// File created by: Jeongnim Kim, jeongnim.kim@gmail.com, University of Illinois at Urbana-Champaign
//////////////////////////////////////////////////////////////////////////////////////
#include <QMCDrivers/DMC/WalkerControlMPI.h>
@ -24,36 +22,30 @@
namespace qmcplusplus
{
//#if defined(PRINT_DEBUG)
//#define DMC_BRANCH_START(NOW) NOW
//#define DMC_BRANCH_STOP(TID,TM) TID=TM
//#define DMC_BRANCH_DUMP(IT,T1,T2,T3) \
// OhmmsInfo::Debug->getStream() << "BRANCH " \
//<< std::setw(8) << IT \
//<< " SORT" << std::setw(16) << T1 \
//<< " GSUM" << std::setw(16) << T2 \
//<< " SWAP" << std::setw(16) << T3 << std::endl
//#else
#define DMC_BRANCH_START(NOW)
#define DMC_BRANCH_STOP(TID,TM)
#define DMC_BRANCH_DUMP(IT,T1,T2,T3)
//#endif
//#define MCWALKERSET_MPI_DEBUG
enum DMC_MPI_Timers
{
DMC_MPI_branch,
DMC_MPI_imbalance,
DMC_MPI_prebalance,
DMC_MPI_loadbalance
DMC_MPI_copyWalkers,
DMC_MPI_allreduce,
DMC_MPI_loadbalance,
DMC_MPI_send,
DMC_MPI_recv,
};
TimerNameList_t<DMC_MPI_Timers> DMCMPITimerNames =
{
{DMC_MPI_branch, "WalkerControlMPI::branch"},
{DMC_MPI_imbalance, "WalkerControlMPI::imbalance"},
{DMC_MPI_prebalance, "WalkerControlMPI::pre-loadbalance"},
{DMC_MPI_loadbalance, "WalkerControlMPI::loadbalance"}
{DMC_MPI_copyWalkers, "WalkerControlMPI::copyWalkers"},
{DMC_MPI_allreduce, "WalkerControlMPI::allreduce"},
{DMC_MPI_loadbalance, "WalkerControlMPI::loadbalance"},
{DMC_MPI_send, "WalkerControlMPI::send"},
{DMC_MPI_recv, "WalkerControlMPI::recv"}
};
/** default constructor
@ -73,36 +65,37 @@ WalkerControlMPI::WalkerControlMPI(Communicate* c): WalkerControlBase(c)
setup_timers(myTimers, DMCMPITimerNames, timer_level_medium);
}
int
WalkerControlMPI::branch(int iter, MCWalkerConfiguration& W, RealType trigger)
/** Perform branch and swap walkers as required
*
* It takes 4 steps:
* 1. sortWalkers marks good and bad walkers.
* 2. allreduce and make the decision of load balancing.
* 3. send/recv walkers. Receiving side recycles bad walkers' memory first.
* 4. copyWalkers generates copies of good walkers.
* In order to minimize the memory footprint fluctuation
* the walker copying is placed as the last step.
* In order to reduce the time for allocating walker memory,
* this algorithm does not destroy the bad walkers in step 1.
* All the bad walkers are recycled as much as possible in step 3/4.
*/
int WalkerControlMPI::branch(int iter, MCWalkerConfiguration& W, RealType trigger)
{
DMC_BRANCH_START(Timer localTimer);
TinyVector<RealType,3> bTime(0.0);
myTimers[DMC_MPI_branch]->start();
myTimers[DMC_MPI_prebalance]->start();
std::fill(curData.begin(),curData.end(),0);
//std::fill(NumPerNode.begin(),NumPerNode.end(),0);
sortWalkers(W);
//use NumWalkersSent from the previous exchange
curData[SENTWALKERS_INDEX]=NumWalkersSent;
//update the number of walkers for this node
curData[LE_MAX+MyContext]=NumWalkers;
DMC_BRANCH_STOP(bTime[0],localTimer.elapsed());
DMC_BRANCH_START(localTimer.restart());
int nw = copyWalkers(W);
//myTimers[DMC_MPI_imbalance]->start();
//myComm->barrier();
//myTimers[DMC_MPI_imbalance]->stop();
myTimers[DMC_MPI_allreduce]->start();
myComm->allreduce(curData);
myTimers[DMC_MPI_allreduce]->stop();
measureProperties(iter);
W.EnsembleProperty=EnsembleProperty;
DMC_BRANCH_STOP(bTime[1],localTimer.elapsed());
DMC_BRANCH_START(localTimer.restart());
////update the samples and weights
//W.EnsembleProperty.NumSamples=curData[WALKERSIZE_INDEX];
//W.EnsembleProperty.Weight=curData[WEIGHT_INDEX];
//RealType wgtInv(1.0/curData[WEIGHT_INDEX]);
//accumData[ENERGY_INDEX] += curData[ENERGY_INDEX]*wgtInv;
//accumData[ENERGY_SQ_INDEX] += curData[ENERGY_SQ_INDEX]*wgtInv;
//accumData[WALKERSIZE_INDEX] += curData[WALKERSIZE_INDEX];
//accumData[WEIGHT_INDEX] += curData[WEIGHT_INDEX];
Cur_pop=0;
for(int i=0, j=LE_MAX; i<NumContexts; i++,j++)
{
@ -110,26 +103,11 @@ WalkerControlMPI::branch(int iter, MCWalkerConfiguration& W, RealType trigger)
}
myTimers[DMC_MPI_prebalance]->stop();
myTimers[DMC_MPI_loadbalance]->start();
if(qmc_common.async_swap)
swapWalkersAsync(W);
else
swapWalkersSimple(W);
swapWalkersSimple(W);
myTimers[DMC_MPI_loadbalance]->stop();
//Do not need to use a trigger.
//Cur_min=Nmax;
//Cur_max=0;
//Cur_pop=0;
//for(int i=0, j=LE_MAX; i<NumContexts; i++,j++) {
// Cur_pop+= NumPerNode[i]=static_cast<int>(curData[j]);
// Cur_min = std::min(Cur_min,NumPerNode[i]);
// Cur_max = std::max(Cur_max,NumPerNode[i]);
//}
//int max_diff = std::max(Cur_max*NumContexts-Cur_pop,Cur_pop-Cur_min*NumContexts);
//double diff_pop = static_cast<double>(max_diff)/static_cast<double>(Cur_pop);
//if(diff_pop > trigger) {
// swapWalkersSimple(W);
// //swapWalkersMap(W);
//}
myTimers[DMC_MPI_copyWalkers]->start();
copyWalkers(W);
myTimers[DMC_MPI_copyWalkers]->stop();
//set Weight and Multiplicity to default values
MCWalkerConfiguration::iterator it(W.begin()),it_end(W.end());
while(it != it_end)
@ -141,8 +119,7 @@ WalkerControlMPI::branch(int iter, MCWalkerConfiguration& W, RealType trigger)
//update the global number of walkers and offsets
W.setGlobalNumWalkers(Cur_pop);
W.setWalkerOffsets(FairOffSet);
DMC_BRANCH_STOP(bTime[2],localTimer.elapsed());
DMC_BRANCH_DUMP(iter,bTime[0],bTime[1],bTime[2]);
myTimers[DMC_MPI_branch]->stop();
return Cur_pop;
}
@ -176,19 +153,24 @@ void determineNewWalkerPopulation(int Cur_pop, int NumContexts, int MyContext, c
}
}
/** swap Walkers with Recv/Send
/** swap Walkers with Recv/Send or Irecv/Isend
*
* The algorithm ensures that the load per node can differ only by one walker.
* The communication is one-dimensional.
* Each MPI rank can only send or receive or be silent.
* The communication is one-dimensional and very local.
* If multiple copies of a walker need to be sent to the target rank,
* only one walker is sent and the number of copies is encoded in the message.
* The blocking send/recv may become serialized and worsen load imbalance.
* Non blocking send/recv algorithm avoids serialization completely.
*/
void WalkerControlMPI::swapWalkersSimple(MCWalkerConfiguration& W)
{
std::vector<int> minus, plus;
determineNewWalkerPopulation(Cur_pop, NumContexts, MyContext, NumPerNode, FairOffSet, minus, plus);
Walker_t& wRef(*W[0]);
Walker_t& wRef(*good_w[0]);
std::vector<Walker_t*> newW;
std::vector<Walker_t*> oldW;
std::vector<int> ncopy_newW;
#ifdef MCWALKERSET_MPI_DEBUG
char fname[128];
sprintf(fname,"test.%d",MyContext);
@ -209,166 +191,206 @@ void WalkerControlMPI::swapWalkersSimple(MCWalkerConfiguration& W)
}
fout << std::endl;
#endif
// From the class
// MyContext (read)
// myComm (read)
// NumWalkersSent (write)
int nswap=std::min(plus.size(), minus.size());
int last=W.getActiveWalkers()-1;
int nsend=0;
for(int ic=0; ic<nswap; ic++)
if(plus.size()!=minus.size())
{
if(plus[ic]==MyContext)
{
size_t byteSize = W[last]->byteSize();
W[last]->updateBuffer();
OOMPI_Message sendBuffer(W[last]->DataSet.data(), byteSize);
myComm->getComm()[minus[ic]].Send(sendBuffer);
--last;
++nsend;
}
if(minus[ic]==MyContext)
{
Walker_t *awalker= new Walker_t(wRef);
size_t byteSize = awalker->byteSize();
OOMPI_Message recvBuffer(awalker->DataSet.data(), byteSize);
myComm->getComm()[plus[ic]].Recv(recvBuffer);
awalker->copyFromBuffer();
newW.push_back(awalker);
}
app_error() << "Walker send/recv pattern doesn't match. "
<< "The send size " << plus.size()
<< " is not equal to the receive size " << minus.size()
<< " ." << std::endl;
APP_ABORT("WalkerControlMPI::swapWalkersSimple");
}
//save the number of walkers sent
NumWalkersSent=nsend;
if(nsend)
{
nsend=NumPerNode[MyContext]-nsend;
W.destroyWalkers(W.begin()+nsend, W.end());
}
//add walkers from other node
if(newW.size())
W.insert(W.end(),newW.begin(),newW.end());
}
int nswap=plus.size();
// sort good walkers by the number of copies
assert(good_w.size()==ncopy_w.size());
std::vector<std::pair<int,int> > ncopy_pairs, nrecv_pairs;
for(int iw=0; iw<ncopy_w.size(); iw++)
ncopy_pairs.push_back(std::make_pair(ncopy_w[iw],iw));
std::sort(ncopy_pairs.begin(), ncopy_pairs.end());
/** swap Walkers with Irecv/Send
*
* The algorithm ensures that the load per node can differ only by one walker.
* The communication is one-dimensional.
*/
void WalkerControlMPI::swapWalkersAsync(MCWalkerConfiguration& W)
{
std::vector<int> minus, plus;
determineNewWalkerPopulation(Cur_pop, NumContexts, MyContext, NumPerNode, FairOffSet, minus, plus);
Walker_t& wRef(*W[0]);
std::vector<Walker_t*> newW;
std::vector<Walker_t*> oldW;
#ifdef MCWALKERSET_MPI_DEBUG
char fname[128];
sprintf(fname,"test.%d",MyContext);
std::ofstream fout(fname, std::ios::app);
//fout << NumSwaps << " " << Cur_pop << " ";
//for(int ic=0; ic<NumContexts; ic++) fout << NumPerNode[ic] << " ";
//fout << " | ";
//for(int ic=0; ic<NumContexts; ic++) fout << FairOffSet[ic+1]-FairOffSet[ic] << " ";
//fout << " | ";
for(int ic=0; ic<plus.size(); ic++)
{
fout << plus[ic] << " ";
}
fout << " | ";
for(int ic=0; ic<minus.size(); ic++)
{
fout << minus[ic] << " ";
}
fout << std::endl;
#endif
int nswap=std::min(plus.size(), minus.size());
int last=W.getActiveWalkers()-1;
int nsend=0;
int countSend = 1;
OOMPI_Packed ** sendBuffers = new OOMPI_Packed*[NumContexts];
OOMPI_Packed ** recvBuffers = new OOMPI_Packed*[NumContexts];
std::vector<OOMPI_Request> requests(NumContexts);
std::vector<int> sendCounts(NumContexts,0);
for(int ip=0; ip < NumContexts; ++ip)
sendBuffers[ip] = 0;
for(int ip=0; ip < NumContexts; ++ip)
recvBuffers[ip] = 0;
std::vector<OOMPI_Request> requests;
for(int ic=0; ic<nswap; ic++)
{
if(plus[ic]==MyContext)
{
if((ic < nswap - 1) && (plus[ic] == plus[ic+1]) && (minus[ic] == minus[ic+1]))
{
countSend++;
}
else
{
//OOMPI_Packed sendBuffer(wRef.byteSize(),OOMPI_COMM_WORLD);
sendBuffers[minus[ic]] = new OOMPI_Packed(countSend * wRef.byteSize(),myComm->getComm());
for(int cs = 0; cs < countSend; ++cs)
{
W[last]->putMessage(*(sendBuffers[minus[ic]]));
--last;
// always send the last good walker
Walker_t* &awalker = good_w[ncopy_pairs.back().second];
// count the possible copies in one send
auto &nsentcopy = awalker->NumSentCopies;
nsentcopy = 0;
for(int id=ic+1; id<nswap; id++)
if(plus[ic]==plus[id]&&minus[ic]==minus[id]&&ncopy_pairs.back().first>0)
{ // increment copy counter
ncopy_pairs.back().first--;
nsentcopy++;
}
//OOMPI_COMM_WORLD[minus[ic]].Send(sendBuffer);
requests[minus[ic]] = myComm->getComm()[minus[ic]].Isend(*(sendBuffers[minus[ic]]), plus[ic]);
nsend += countSend;
countSend = 1;
else
{ // not enough copies to send or not the same send/recv pair
break;
}
// pack data and send
size_t byteSize = awalker->byteSize();
awalker->updateBuffer();
OOMPI_Message sendBuffer(awalker->DataSet.data(), byteSize);
if(use_nonblocking)
requests.push_back(myComm->getComm()[minus[ic]].Isend(sendBuffer));
else
{
myTimers[DMC_MPI_send]->start();
myComm->getComm()[minus[ic]].Send(sendBuffer);
myTimers[DMC_MPI_send]->stop();
}
// update counter and cursor
++nsend;
ic+=nsentcopy;
// update copy counter
if(ncopy_pairs.back().first>0)
{
ncopy_pairs.back().first--;
std::sort(ncopy_pairs.begin(), ncopy_pairs.end());
}
else
{
ncopy_pairs.pop_back();
bad_w.push_back(awalker);
}
}
if(minus[ic]==MyContext)
{
if((ic < nswap - 1) && (plus[ic] == plus[ic+1]) && (minus[ic] == minus[ic+1]))
// count receive pairs, (source,copy)
nrecv_pairs.push_back(std::make_pair(plus[ic],0));
for(int id=ic+1; id<nswap; id++)
if(plus[ic]==plus[id]&&minus[ic]==minus[id])
nrecv_pairs.back().second++;
else
break;
// update cursor
ic+=nrecv_pairs.back().second;
}
}
if(nsend>0)
{
if(use_nonblocking)
{
// wait all the isend
for(int im=0; im<requests.size(); im++)
{
countSend++;
myTimers[DMC_MPI_send]->start();
requests[im].Wait();
myTimers[DMC_MPI_send]->stop();
}
requests.clear();
}
}
else
{
struct job {
OOMPI_Request request;
int walkerID;
int queueID;
job(const OOMPI_Request &req, int wid, int qid): request(req), walkerID(wid), queueID(qid) {};
};
std::vector<job> job_list;
std::vector<bool> queue_status(nrecv_pairs.size(),true);
bool completed=false;
while(!completed)
{
// receive data
for(int ic=0; ic<nrecv_pairs.size(); ic++)
if(queue_status[ic]&&nrecv_pairs[ic].second>=0)
{
Walker_t* awalker;
if(bad_w.empty())
{
awalker=new Walker_t(wRef);
}
else
{
awalker=bad_w.back();
bad_w.pop_back();
}
size_t byteSize = awalker->byteSize();
myTimers[DMC_MPI_recv]->start();
OOMPI_Message recvBuffer(awalker->DataSet.data(), byteSize);
if(use_nonblocking)
{
job_list.push_back(job(myComm->getComm()[nrecv_pairs[ic].first].Irecv(recvBuffer),
newW.size(),ic));
queue_status[ic]=false;
}
else
{
myComm->getComm()[nrecv_pairs[ic].first].Recv(recvBuffer);
job_list.push_back(job(OOMPI_Request(),newW.size(),ic));
}
myTimers[DMC_MPI_recv]->stop();
newW.push_back(awalker);
ncopy_newW.push_back(0);
}
if(use_nonblocking)
{
OOMPI_Status status;
for(auto jobit=job_list.begin(); jobit!=job_list.end(); jobit++)
if(jobit->request.Test(status))
{
auto &awalker=newW[jobit->walkerID];
// unpack data
awalker->copyFromBuffer();
ncopy_newW[jobit->walkerID]=awalker->NumSentCopies;
// update counter
nrecv_pairs[jobit->queueID].second-=(awalker->NumSentCopies+1);
queue_status[jobit->queueID]=true;
job_list.erase(jobit);
break;
}
}
else
{
//OOMPI_Packed recvBuffer(wRef.byteSize(),OOMPI_COMM_WORLD);
recvBuffers[plus[ic]] = new OOMPI_Packed(countSend * wRef.byteSize(),myComm->getComm());
//OOMPI_COMM_WORLD[plus[ic]].Recv(recvBuffer);
requests[plus[ic]] = myComm->getComm()[plus[ic]].Irecv(*(recvBuffers[plus[ic]]), plus[ic]);
sendCounts[plus[ic]] = countSend;
countSend = 1;
for(auto jobit=job_list.begin(); jobit!=job_list.end(); jobit++)
{
auto &awalker=newW[jobit->walkerID];
// unpack data
awalker->copyFromBuffer();
ncopy_newW[jobit->walkerID]=awalker->NumSentCopies;
// update counter
nrecv_pairs[jobit->queueID].second-=(awalker->NumSentCopies+1);
}
job_list.clear();
}
// check the completion of queues
completed=true;
for(int ic=0; ic<nrecv_pairs.size(); ic++)
completed = completed && (nrecv_pairs[ic].second==-1);
}
}
for(int ip = 0; ip < NumContexts; ++ip)
{
if(recvBuffers[ip])
{
requests[ip].Wait();
for(int cs = 0; cs < sendCounts[ip]; ++cs)
{
Walker_t *awalker= new Walker_t(wRef);
awalker->getMessage(*(recvBuffers[ip]));
newW.push_back(awalker);
}
delete recvBuffers[ip];
}
}
for(int ip = 0; ip < NumContexts; ++ip)
{
if(sendBuffers[ip])
{
requests[ip].Wait();
delete sendBuffers[ip];
}
}
delete[] sendBuffers;
delete[] recvBuffers;
//save the number of walkers sent
NumWalkersSent=nsend;
if(nsend)
// rebuild good_w and ncopy_w
std::vector<Walker_t*> good_w_temp(good_w);
good_w.resize(ncopy_pairs.size());
ncopy_w.resize(ncopy_pairs.size());
for(int iw=0; iw<ncopy_pairs.size(); iw++)
{
nsend=NumPerNode[MyContext]-nsend;
W.destroyWalkers(W.begin()+nsend, W.end());
good_w[iw]=good_w_temp[ncopy_pairs[iw].second];
ncopy_w[iw]=ncopy_pairs[iw].first;
}
//add walkers from other node
if(newW.size())
W.insert(W.end(),newW.begin(),newW.end());
{
good_w.insert(good_w.end(),newW.begin(),newW.end());
ncopy_w.insert(ncopy_w.end(),ncopy_newW.begin(),ncopy_newW.end());
}
}
}

View File

@ -44,12 +44,9 @@ struct WalkerControlMPI: public WalkerControlBase
/** perform branch and swap walkers as required */
int branch(int iter, MCWalkerConfiguration& W, RealType trigger);
//current implementations
void swapWalkersSimple(MCWalkerConfiguration& W);
//old implementations
void swapWalkersAsync(MCWalkerConfiguration& W);
// Removed swapWalkersBlocked and swapWalkersMap (minimize size and number of messages)
};
}
#endif

View File

@ -225,6 +225,7 @@ int WalkerControlBase::doNotBranch(int iter, MCWalkerConfiguration& W)
curData[RNSIZE_INDEX]=nrn;
curData[B_ENERGY_INDEX]=besum;
curData[B_WGT_INDEX]=bwgtsum;
curData[SENTWALKERS_INDEX]=NumWalkersSent=0;
myComm->allreduce(curData);
measureProperties(iter);
trialEnergy=EnsembleProperty.Energy;
@ -283,12 +284,16 @@ void WalkerControlBase::Write2XYZ(MCWalkerConfiguration& W)
}
/** evaluate curData and mark the bad/good walkers.
*
* Each good walker has a counter registering the
* number of copies needed to be generated from this walker.
* Bad walkers will be either recycled or removed later.
*/
int WalkerControlBase::sortWalkers(MCWalkerConfiguration& W)
{
MCWalkerConfiguration::iterator it(W.begin());
std::vector<Walker_t*> bad,good_rn;
std::vector<Walker_t*> good_rn;
std::vector<int> ncopy_rn;
NumWalkers=0;
MCWalkerConfiguration::iterator it_end(W.end());
@ -353,7 +358,7 @@ int WalkerControlBase::sortWalkers(MCWalkerConfiguration& W)
}
else
{
bad.push_back(*it);
bad_w.push_back(*it);
}
++it;
}
@ -378,9 +383,6 @@ int WalkerControlBase::sortWalkers(MCWalkerConfiguration& W)
//W.EnsembleProperty.Energy=(esum/=wsum);
//W.EnsembleProperty.Variance=(e2sum/wsum-esum*esum);
//W.EnsembleProperty.Variance=(e2sum*wsum-esum*esum)/(wsum*wsum-w2sum);
//remove bad walkers empty the container
for(int i=0; i<bad.size(); i++)
delete bad[i];
if (!WriteRN)
{
if(good_w.empty())
@ -441,25 +443,58 @@ int WalkerControlBase::sortWalkers(MCWalkerConfiguration& W)
return NumWalkers;
}
/** copy good walkers to W
 *
 * Good walkers are copied based on the registered number of copies
 * (ncopy_w). Bad walkers are recycled as storage for the copies to
 * avoid memory allocation and deallocation.
 *
 * Fix: the merge left the superseded copy loop (new Walker_t per copy
 * after W.insert) in place alongside the new recycle-based logic, so
 * every copy was produced twice and added to W twice. The stale loop
 * is removed; the recycle-based implementation below is the sole path.
 */
int WalkerControlBase::copyWalkers(MCWalkerConfiguration& W)
{
  // save current good walker size; slots appended below hold the copies.
  const int size_good_w = good_w.size();
  // copy_list[k] = index of the original good walker that appended slot k clones.
  std::vector<int> copy_list;
  for(int i=0; i<size_good_w; i++)
  {
    for(int j=0; j<ncopy_w[i]; j++)
    {
      if(bad_w.empty())
      {
        // no recycled storage available; allocate inside the parallel loop
        good_w.push_back(nullptr);
      }
      else
      {
        // reuse a bad walker's storage instead of allocating fresh
        good_w.push_back(bad_w.back());
        bad_w.pop_back();
      }
      copy_list.push_back(i);
    }
  }
  // fill the appended slots in parallel: construct when empty, assign when recycled
  #pragma omp parallel for
  for(int i=size_good_w; i<good_w.size(); i++)
  {
    auto &wRef=good_w[copy_list[i-size_good_w]];
    auto &awalker=good_w[i];
    if(awalker==nullptr)
      awalker=new Walker_t(*wRef);
    else
      *awalker=*wRef;
    // not fully sure this is correct or even used
    awalker->ID=(i-size_good_w)*NumContexts+MyContext;
    awalker->ParentID=wRef->ParentID;
  }
  //clear the WalkerList to populate them with the good walkers
  W.clear();
  W.insert(W.begin(), good_w.begin(), good_w.end());
  //remove bad walkers if there is any left
  for(int i=0; i<bad_w.size(); i++)
    delete bad_w[i];
  //clear good_w and ncopy_w for the next branch
  good_w.clear();
  bad_w.clear();
  ncopy_w.clear();
  return W.getActiveWalkers();
}
@ -467,22 +502,26 @@ int WalkerControlBase::copyWalkers(MCWalkerConfiguration& W)
/** read walker-control options from an xml node
 *
 * Registers energyBound, sigmaBound, maxCopy, targetwalkers, max_walkers
 * and use_nonblocking with a ParameterSet, applies the target/max walker
 * bounds via setMinMax, and logs the resulting settings.
 *
 * \param cur xml node carrying the parameters
 * \return always true
 *         NOTE(review): the result of params.put(cur) is discarded — confirm
 *         whether a parse failure should propagate to the caller.
 */
bool WalkerControlBase::put(xmlNodePtr cur)
{
  int nw_target=0;
  int nw_max=0;
  std::string nonblocking="yes";
  ParameterSet params;
  params.add(targetEnergyBound,"energyBound","double");
  params.add(targetSigma,"sigmaBound","double");
  params.add(MaxCopy,"maxCopy","int");
  params.add(nw_target,"targetwalkers","int");
  params.add(nw_max,"max_walkers","int");
  params.add(nonblocking,"use_nonblocking","string");
  const bool success=params.put(cur);
  (void)success; // parse outcome intentionally not returned (see NOTE above)
  setMinMax(nw_target,nw_max);
  use_nonblocking = (nonblocking=="yes");
  app_log() << " WalkerControlBase parameters " << std::endl;
  //app_log() << " energyBound = " << targetEnergyBound << std::endl;
  //app_log() << " sigmaBound = " << targetSigma << std::endl;
  app_log() << " maxCopy = " << MaxCopy << std::endl;
  app_log() << " Max Walkers per node " << Nmax << std::endl;
  app_log() << " Min Walkers per node " << Nmin << std::endl;
  app_log() << " Using " << (use_nonblocking?"non-":"") << "blocking send/recv" << std::endl;
  return true;
}

View File

@ -125,12 +125,14 @@ public:
std::vector<RealType> accumData;
///any temporary data
std::vector<RealType> curData;
///temporary storage for good walkers
std::vector<Walker_t*> good_w;
///temporary storage for good and bad walkers
std::vector<Walker_t*> good_w, bad_w;
///temporary storage for copy counters
std::vector<int> ncopy_w;
///Add released-node fields to .dmc.dat file
bool WriteRN;
///Use non-blocking isend/irecv
bool use_nonblocking;
/** default constructor
*

View File

@ -41,6 +41,8 @@ DiracDeterminantCUDA::DiracDeterminantCUDA(SPOSetBasePtr const &spos, int first)
newGradLaplList_d("DiracDeterminantBase::newGradLaplList_d"),
AWorkList_d("DiracDeterminantBase::AWorkList_d"),
AinvWorkList_d("DiracDeterminantBase::AinvWorkList_d"),
PivotArray_d("DiracDeterminantBase::PivotArray_d"),
infoArray_d("DiracDeterminantBase::infoArray_d"),
GLList_d("DiracDeterminantBase::GLList_d"),
ratio_d("DiracDeterminantBase::ratio_d"),
gradLapl_d("DiracDeterminantBase::gradLapl_d"),
@ -58,39 +60,6 @@ DiracDeterminantCUDA::DiracDeterminantCUDA(SPOSetBasePtr const &spos, int first)
NLratios_d[i] = gpu::device_vector<CudaValueType>("DiracDeterminantBase::NLratios_d");
}
DiracDeterminantCUDA::DiracDeterminantCUDA(const DiracDeterminantCUDA& s) :
DiracDeterminantBase(s),
UpdateJobList_d("DiracDeterminantBase::UpdateJobList_d"),
srcList_d("DiracDeterminantBase::srcList_d"),
destList_d("DiracDeterminantBase::destList_d"),
AList_d("DiracDeterminantBase::AList_d"),
AinvList_d("DiracDeterminantBase::AinvList_d"),
newRowList_d("DiracDeterminantBase::newRowList_d"),
AinvDeltaList_d("DiracDeterminantBase::AinvDeltaList_d"),
AinvColkList_d("DiracDeterminantBase::AinvColkList_d"),
gradLaplList_d("DiracDeterminantBase::gradLaplList_d"),
newGradLaplList_d("DiracDeterminantBase::newGradLaplList_d"),
AWorkList_d("DiracDeterminantBase::AWorkList_d"),
AinvWorkList_d("DiracDeterminantBase::AinvWorkList_d"),
GLList_d("DiracDeterminantBase::GLList_d"),
ratio_d("DiracDeterminantBase::ratio_d"),
gradLapl_d("DiracDeterminantBase::gradLapl_d"),
iatList_d("DiracDeterminantBase::iatList_d"),
NLrowBuffer_d("DiracDeterminantBase::NLrowBuffer_d"),
SplineRowList_d("DiracDeterminantBase::SplineRowList_d"),
RatioRowList_d("DiracDeterminantBase::RatioRowList_d"),
NLposBuffer_d("DiracDeterminantBase::NLposBuffer_d"),
NLAinvList_d("DiracDeterminantBase::NLAinvList_d"),
NLnumRatioList_d("DiracDeterminantBase::NLnumRatioList_d"),
NLelecList_d("DiracDeterminantBase::NLelecList_d"),
NLratioList_d("DiracDeterminantBase::NLratioList_d")
{
for(int i = 0; i < 2; ++i)
NLratios_d[i] = gpu::device_vector<CudaValueType>("DiracDeterminantBase::NLratios_d");
}
/////////////////////////////////////
// Vectorized evaluation functions //
/////////////////////////////////////
@ -412,9 +381,28 @@ DiracDeterminantCUDA::recompute(MCWalkerConfiguration &W, bool firstTime)
// Invert
bool useDoublePrecision = true;
cublas_inverse (gpu::cublasHandle, AList_d.data(), AinvList_d.data(),
AWorkList_d.data(), AinvWorkList_d.data(),
AWorkList_d.data(), AinvWorkList_d.data(),
PivotArray_d.data(), infoArray_d.data(),
NumPtcls, RowStride, walkers.size(), useDoublePrecision);
// checking inversion status
infoArray_host = infoArray_d;
bool failed = false;
for(int iw=0; iw<walkers.size(); iw++)
if(infoArray_host[iw]!=0||infoArray_host[iw+walkers.size()]!=0)
{
failed = true;
fprintf(stderr, "cublas_inverse failed on walker %d, getrf error %d, getri error %d.\n",
iw, infoArray_host[iw], infoArray_host[iw+walkers.size()]);
char name[1000];
gethostname(name, 1000);
fprintf(stderr, "Offending hostname = %s\n", name);
int dev;
cudaGetDevice(&dev);
fprintf(stderr, "Offending device = %d\n", dev);
}
if(failed) abort();
#ifdef DEBUG_CUDA
CudaValueType Ainv[NumPtcls][RowStride], A[NumPtcls][RowStride];
//for (int iw=0; iw<walkers.size(); iw++)
@ -940,6 +928,7 @@ DiracDeterminantCUDA::gradLapl (MCWalkerConfiguration &W, GradMatrix_t &grads,
if (std::isnan(lapl(iw,iat+FirstIndex)))
#endif
{
fprintf (stderr, "Offending walker = %d\n", iw);
char name[1000];
gethostname(name, 1000);
fprintf (stderr, "Offending hostname = %s\n", name);
@ -961,13 +950,13 @@ DiracDeterminantCUDA::gradLapl (MCWalkerConfiguration &W, GradMatrix_t &grads,
fprintf (Ainv, "%14.8e+%14.8ei ", host_data[AinvOffset+i*RowStride+j].real(), host_data[AinvOffset+i*RowStride+j].imag());
fprintf (Lmat, "%14.8e+%14.8ei ", host_data[gradLaplOffset+(4*i+3)*RowStride+j].real(), host_data[gradLaplOffset+(4*i+3)*RowStride+j].imag());
for (int k=0; k<3; k++)
fprintf (Lmat, "%14.8e+%14.8ei ", host_data[gradLaplOffset+(4*i+k)*RowStride+j].real(), host_data[gradLaplOffset+(4*i+k)*RowStride+j].imag());
fprintf (Gmat, "%14.8e+%14.8ei ", host_data[gradLaplOffset+(4*i+k)*RowStride+j].real(), host_data[gradLaplOffset+(4*i+k)*RowStride+j].imag());
#else
fprintf (Amat, "%14.8e ", host_data[AOffset+i*RowStride+j]);
fprintf (Ainv, "%14.8e ", host_data[AinvOffset+i*RowStride+j]);
fprintf (Lmat, "%14.8e ", host_data[gradLaplOffset+(4*i+3)*RowStride+j]);
for (int k=0; k<3; k++)
fprintf (Lmat, "%14.8e ", host_data[gradLaplOffset+(4*i+k)*RowStride+j]);
fprintf (Gmat, "%14.8e ", host_data[gradLaplOffset+(4*i+k)*RowStride+j]);
#endif
}
fprintf (Amat, "\n");

View File

@ -40,7 +40,7 @@ public:
typedef ParticleSet::Walker_t Walker_t;
DiracDeterminantCUDA(SPOSetBasePtr const &spos, int first=0);
DiracDeterminantCUDA(const DiracDeterminantCUDA& s);
DiracDeterminantCUDA(const DiracDeterminantCUDA& s) = delete;
protected:
/////////////////////////////////////////////////////
@ -60,6 +60,9 @@ protected:
gpu::device_vector<CudaValueType*> srcList_d, destList_d, AList_d, AinvList_d, newRowList_d,
AinvDeltaList_d, AinvColkList_d, gradLaplList_d,
newGradLaplList_d, AWorkList_d, AinvWorkList_d, GLList_d;
gpu::device_vector<int> PivotArray_d;
gpu::device_vector<int> infoArray_d;
gpu::host_vector<int> infoArray_host;
gpu::device_vector<CudaValueType> ratio_d;
gpu::host_vector<CudaValueType> ratio_host;
gpu::device_vector<CudaValueType> gradLapl_d;
@ -118,6 +121,9 @@ protected:
// HACK HACK HACK
// gradLapl_d.resize (numWalkers*NumOrbitals*4);
// gradLapl_host.resize(numWalkers*NumOrbitals*4);
infoArray_d.resize(numWalkers*2);
infoArray_host.resize(numWalkers*2);
PivotArray_d.resize(numWalkers*NumOrbitals);
gradLapl_d.resize (numWalkers*RowStride*4);
gradLapl_host.resize(numWalkers*RowStride*4);
NLrowBuffer_d.resize(NLrowBufferRows*RowStride);

View File

@ -15,6 +15,10 @@
InfoStream::~InfoStream()
{
if (currStream != nullStream) {
delete nullStream;
}
if (ownStream && currStream) {
delete(currStream);
}

View File

@ -14,11 +14,10 @@
#ifndef QMCPLUSPLUS_POOLEDMEMORY_H
#define QMCPLUSPLUS_POOLEDMEMORY_H
#include <vector>
#include <complex>
#include <limits>
#include <cstring>
#include "simd/allocator.hpp"
#include "OhmmsPETE/OhmmsVector.h"
namespace qmcplusplus
{
@ -38,12 +37,12 @@ struct PooledMemory
{
typedef char T;
typedef T value_type;
typedef typename std::vector<T>::size_type size_type;
typedef typename Vector<T, Alloc>::size_type size_type;
const int scalar_multiplier;
size_type Current, Current_scalar;
T_scalar *Scalar_ptr;
std::vector<T, Alloc> myData;
Vector<T, Alloc> myData;
///default constructor
inline PooledMemory():

View File

@ -13,12 +13,17 @@
#include "catch.hpp"
#include "Utilities/PooledMemory.h"
#include "Utilities/PooledData.h"
#include "Utilities/Timer.h"
#include <iostream>
#include <stdio.h>
#include <string>
#include <vector>
#include <random>
#include <complex>
//#define CHECK_ALLOCATION_PERF
using std::string;
using std::vector;
using std::complex;
@ -91,6 +96,36 @@ TEST_CASE("pack scalar", "[utilities]")
REQUIRE(i6[0]==i6[0]);
REQUIRE(i6[1]==i6[1]);
REQUIRE(i6[2]==i6[2]);
#ifdef CHECK_ALLOCATION_PERF
// up to 512 MB.
for(size_t size_scale=15; size_scale<30; size_scale++)
{
const size_t size = (1<<size_scale)+17*8;
{
PooledMemory<double> pm_walker;
pm_walker.Current = size;
Timer PoolTimer;
pm_walker.allocate();
std::cout << "PooledMemory Allocate " << pm_walker.byteSize() << " bytes Time " << PoolTimer.elapsed() << std::endl;
PoolTimer.restart();
PooledMemory<double> pm_walker_copy(pm_walker);
std::cout << "PooledMemory Copy Time " << PoolTimer.elapsed() << std::endl;
}
{
PooledData<double> pd_walker;
Timer PoolTimer;
pd_walker.resize(size/8);
std::cout << "PooledData Allocate " << pd_walker.byteSize() << " bytes Time " << PoolTimer.elapsed() << std::endl;
PoolTimer.restart();
PooledData<double> pd_walker_copy(pd_walker);
std::cout << "PooledData Copy Time " << PoolTimer.elapsed() << std::endl;
}
std::cout << std::endl;
}
#endif
}
}

View File

@ -29,7 +29,6 @@ QMCState::QMCState()
use_density=false;
dryrun=false;
save_wfs=false;
async_swap=false;
io_node=true;
mpi_groups=1;
use_ewald=false;
@ -60,10 +59,6 @@ void QMCState::initialize(int argc, char **argv)
{
save_wfs=(c.find("no")>=c.size());
}
else if(c.find("--async_swap") < c.size())
{
async_swap=(c.find("no")>=c.size());
}
else if(c.find("--help")< c.size())
{
stopit=true;
@ -99,7 +94,7 @@ void QMCState::initialize(int argc, char **argv)
std::cerr << " git last commit date: " << QMCPACK_GIT_COMMIT_LAST_CHANGED << std::endl;
std::cerr << " git last commit subject: " << QMCPACK_GIT_COMMIT_SUBJECT << std::endl;
#endif
std::cerr << std::endl << "Usage: qmcpack input [--dryrun --save_wfs[=no] --async_swap[=no] --gpu]" << std::endl << std::endl;
std::cerr << std::endl << "Usage: qmcpack input [--dryrun --save_wfs[=no] --gpu]" << std::endl << std::endl;
}
if(stopit)
{
@ -115,10 +110,6 @@ void QMCState::print_options(std::ostream& os)
os << " dryrun : qmc sections will be ignored." << std::endl;
if(save_wfs)
os << " save_wfs=1 : save wavefunctions in hdf5. " << std::endl;
if(async_swap)
os << " async_swap=1 : using async isend/irecv for walker swaps " << std::endl;
else
os << " async_swap=0 : using blocking send/recv for walker swaps " << std::endl;
}
void QMCState::print_memory_change(const std::string& who, size_t before)

View File

@ -37,8 +37,6 @@ struct QMCState
bool dryrun;
///true, if wave functions are stored for next runs
bool save_wfs;
///true, if walker swap is done by async
bool async_swap;
///true, print out file
bool io_node;
///true, use Ewald instead of optimal breakup for the Coulomb

View File

@ -37,7 +37,7 @@ IF (NOT QMC_CUDA)
#
# H4 starting from perturbed orbitals, using optimizable determinants, and the adaptive linear method
#
LIST(APPEND H4_ORB_OPT_SCALARS "totenergy" "-1.93045653 0.00837934") # total energy
LIST(APPEND H4_ORB_OPT_SCALARS "totenergy" "-2.0889 0.001") # total energy
QMC_RUN_AND_CHECK(short-H4-orb-opt
"${CMAKE_SOURCE_DIR}/tests/molecules/H4_ae"

View File

@ -47,5 +47,20 @@
</sposet>
</determinantset>
<jastrow name="J2" type="Two-Body" function="Bspline" print="yes">
<correlation rcut="2" size="10" speciesA="u" speciesB="u">
<coefficients id="uu" type="Array" optimize="no"> 0.04661440939 0.0555644703 0.1286335481 0.1077457768 0.0811381484 0.0626555805 0.0436904683 0.02811397453 0.01572772511 0.006258753264</coefficients>
</correlation>
<correlation rcut="2" size="10" speciesA="u" speciesB="d">
<coefficients id="ud" type="Array" optimize="no"> 0.4465508582 0.3603308616 0.2893897103 0.2269436006 0.1717850082 0.1250275368 0.08538908372 0.05318480503 0.02806504868 0.01035981902</coefficients>
</correlation>
</jastrow>
<jastrow name="J1" type="One-Body" function="Bspline" source="ion0" print="yes">
<correlation rcut="2" size="10" cusp="1" elementType="H">
<coefficients id="eH" type="Array" optimize="no"> 0.0167926566 0.1405038909 0.1611311956 0.1198446618 0.06764720157 0.03602453588 0.02460661286 0.01952889643 0.01300432967 0.005621933949</coefficients>
</correlation>
</jastrow>
</wavefunction>
</qmcsystem>