mirror of https://github.com/QMCPACK/qmcpack.git
Merge branch 'develop' into opt-momentum-estimator
This commit is contained in:
commit
58fcddbb54
|
@ -7,21 +7,21 @@ FILE( WRITE "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp/src_mkl.cxx"
|
|||
"#include <iostream>\n #include <mkl.h>\n int main() { return 0; }\n" )
|
||||
try_compile(HAVE_MKL ${CMAKE_BINARY_DIR}
|
||||
${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp/src_mkl.cxx
|
||||
CMAKE_FLAGS "${CMAKE_CXX_FLAGS} -mkl" )
|
||||
COMPILE_DEFINITIONS "-mkl" )
|
||||
|
||||
# Check for mkl_vml_functions.h
|
||||
FILE( WRITE "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp/src_mkl_vml.cxx"
|
||||
"#include <iostream>\n #include <mkl_vml_functions.h>\n int main() { return 0; }\n" )
|
||||
try_compile(HAVE_MKL_VML ${CMAKE_BINARY_DIR}
|
||||
${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp/src_mkl_vml.cxx
|
||||
CMAKE_FLAGS "${CMAKE_CXX_FLAGS} -mkl" )
|
||||
COMPILE_DEFINITIONS "-mkl" )
|
||||
|
||||
# Check for fftw3
|
||||
FILE( WRITE "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp/src_mkl_fftw3.cxx"
|
||||
"#include <iostream>\n #include <fftw/fftw3.h>\n int main() { return 0; }\n" )
|
||||
try_compile(HAVE_MKL_FFTW3 ${CMAKE_BINARY_DIR}
|
||||
${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp/src_mkl_fftw3.cxx
|
||||
CMAKE_FLAGS "${CMAKE_CXX_FLAGS} -mkl" )
|
||||
COMPILE_DEFINITIONS "-mkl" )
|
||||
|
||||
IF ( HAVE_MKL )
|
||||
SET( MKL_FOUND 1 )
|
||||
|
|
|
@ -0,0 +1,38 @@
|
|||
#!/bin/bash
|
||||
|
||||
Compiler=Clang++11
|
||||
|
||||
for name in real_SoA real_SoA_MP cplx_SoA cplx_SoA_MP \
|
||||
real real_MP cplx cplx_MP
|
||||
do
|
||||
|
||||
CMAKE_FLAGS="-D CMAKE_TOOLCHAIN_FILE=../config/BGQ_${Compiler}_ToolChain.cmake"
|
||||
|
||||
if [[ $name == *"cplx"* ]]; then
|
||||
CMAKE_FLAGS="$CMAKE_FLAGS -D QMC_COMPLEX=1"
|
||||
fi
|
||||
|
||||
if [[ $name == *"_SoA"* ]]; then
|
||||
CMAKE_FLAGS="$CMAKE_FLAGS -D ENABLE_SOA=1"
|
||||
fi
|
||||
|
||||
if [[ $name == *"_MP"* ]]; then
|
||||
CMAKE_FLAGS="$CMAKE_FLAGS -D QMC_MIXED_PRECISION=1"
|
||||
fi
|
||||
|
||||
folder=build_${Compiler}_${name}
|
||||
echo "**********************************"
|
||||
echo "$folder"
|
||||
echo "$CMAKE_FLAGS"
|
||||
echo "**********************************"
|
||||
mkdir $folder
|
||||
cd $folder
|
||||
if [ ! -f CMakeCache.txt ] ; then
|
||||
cmake $CMAKE_FLAGS ..
|
||||
cmake $CMAKE_FLAGS ..
|
||||
fi
|
||||
make -j24
|
||||
cd ..
|
||||
|
||||
echo
|
||||
done
|
|
@ -0,0 +1,45 @@
|
|||
module unload cray-libsci
|
||||
module load cray-hdf5-parallel
|
||||
|
||||
export CC=cc
|
||||
export CXX=CC
|
||||
export BOOST_ROOT=/soft/libraries/boost/1.64.0/intel
|
||||
export CRAYPE_LINK_TYPE=dynamic
|
||||
|
||||
#TYPE=RelWithDebInfo
|
||||
TYPE=Release
|
||||
Compiler=Intel
|
||||
|
||||
for name in real_SoA real_SoA_MP cplx_SoA cplx_SoA_MP \
|
||||
real real_MP cplx cplx_MP
|
||||
do
|
||||
|
||||
CMAKE_FLAGS="-D CMAKE_BUILD_TYPE=$TYPE"
|
||||
|
||||
if [[ $name == *"cplx"* ]]; then
|
||||
CMAKE_FLAGS="$CMAKE_FLAGS -D QMC_COMPLEX=1"
|
||||
fi
|
||||
|
||||
if [[ $name == *"_SoA"* ]]; then
|
||||
CMAKE_FLAGS="$CMAKE_FLAGS -D ENABLE_SOA=1"
|
||||
fi
|
||||
|
||||
if [[ $name == *"_MP"* ]]; then
|
||||
CMAKE_FLAGS="$CMAKE_FLAGS -D QMC_MIXED_PRECISION=1"
|
||||
fi
|
||||
|
||||
folder=build_KNL_${Compiler}_${name}
|
||||
echo "**********************************"
|
||||
echo "$folder"
|
||||
echo "$CMAKE_FLAGS"
|
||||
echo "**********************************"
|
||||
mkdir $folder
|
||||
cd $folder
|
||||
if [ ! -f CMakeCache.txt ] ; then
|
||||
cmake $CMAKE_FLAGS ..
|
||||
fi
|
||||
make -j32
|
||||
cd ..
|
||||
|
||||
echo
|
||||
done
|
|
@ -0,0 +1,15 @@
|
|||
# QMCPACK Manual
|
||||
|
||||
This directory contains the LaTeX source for the QMCPACK manual. The
|
||||
PDF manual is not currently created by the build process and must be
|
||||
created manually. Note that due to the use of the bibtopic package to
|
||||
support two bibliographies, bibtex must be invoked twice:
|
||||
|
||||
```
|
||||
pdflatex qmcpack_manual.tex
|
||||
bibtex qmcpack_manual1
|
||||
bibtex qmcpack_manual2
|
||||
pdflatex qmcpack_manual.tex
|
||||
pdflatex qmcpack_manual.tex
|
||||
```
|
||||
|
|
@ -1,6 +1,3 @@
|
|||
|
||||
\appendix
|
||||
|
||||
\chapter{Derivation of twist averaging efficiency}
|
||||
\label{sec:app_ta_efficiency}
|
||||
In this appendix we derive the relative statistical efficiency of
|
||||
|
|
|
@ -0,0 +1,6 @@
|
|||
#!/bin/sh
|
||||
pdflatex qmcpack_manual.tex
|
||||
bibtex qmcpack_manual1
|
||||
bibtex qmcpack_manual2
|
||||
pdflatex qmcpack_manual.tex
|
||||
pdflatex qmcpack_manual.tex
|
|
@ -609,9 +609,8 @@ S =
|
|||
We note at the left and right extremes, the values and first two
|
||||
derivatives of the functions are zero, while at the center, $h_{j0}$
|
||||
has a value of 1, $h_{j1}$ has a first derivative of 1, and $h_{j2}$
|
||||
has a second derivative of 1.}
|
||||
has a second derivative of 1. \label{fig:LPQHI} }
|
||||
\end{center}
|
||||
\label{fig:LPQHI}
|
||||
\end{figure}
|
||||
Figure~\ref{fig:LPQHI} shows plots of these function shapes.
|
||||
|
||||
|
|
|
@ -46,6 +46,7 @@
|
|||
% & \texttt{wlen } & integer & $\ge 0$ & 0 & number of samples per thread \\
|
||||
& \texttt{maxDisplSq } & real & all values & -1 & maximum particle move \\
|
||||
& \texttt{storeconfigs } & integer & all values & 0 & store configurations \\
|
||||
& \texttt{use\_nonblocking } & string & yes/other & yes & using non-blocking send/recv \\
|
||||
& \texttt{blocks\_between\_recompute} & integer & $\ge 0$ & dep. & wavefunction recompute frequency \\
|
||||
\hline
|
||||
\end{tabularx}
|
||||
|
|
|
@ -1,4 +1,6 @@
|
|||
\documentclass[11pt,usletter]{report}
|
||||
\usepackage{bibtopic}
|
||||
\bibliographystyle{ieeetr}
|
||||
\usepackage{amsmath}
|
||||
\usepackage{amssymb}
|
||||
\usepackage{delarray}
|
||||
|
@ -147,6 +149,9 @@
|
|||
\tableofcontents
|
||||
\newpage
|
||||
|
||||
\begin{btUnit}
|
||||
\begin{btSect}{bibliography}
|
||||
|
||||
\input{introduction}
|
||||
\input{features}
|
||||
\input{installation}
|
||||
|
@ -198,15 +203,20 @@
|
|||
\input{design_features}
|
||||
\input{developing}
|
||||
|
||||
\input{appendices}
|
||||
|
||||
|
||||
\newpage
|
||||
\chapter{References}
|
||||
%\begingroup % Avoid "References" section title
|
||||
%\renewcommand{\section}[2]{}%
|
||||
\bibliographystyle{ieeetr}
|
||||
\bibliography{bibliography}
|
||||
%\bibliographystyle{ieeetr}
|
||||
%\bibliography{bibliography}
|
||||
%\endgroup
|
||||
|
||||
\btPrintCited
|
||||
\end{btSect}
|
||||
\end{btUnit}
|
||||
|
||||
\appendix
|
||||
\input{appendices}
|
||||
\begin{btUnit}
|
||||
\input{qmcpack_papers}
|
||||
\end{btUnit}
|
||||
\end{document}
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,15 @@
|
|||
\chapter{QMCPACK papers}
|
||||
% Uset the bibtopic package to generate a bibliography local to this btsection below.
|
||||
% btPrintAll creates the bibliography citing all entries in the files qmcpack_papers.bib
|
||||
\begin{btSect}{qmcpack_papers}
|
||||
|
||||
The following is a list of all papers, theses, and book chapters
|
||||
known to use QMCPACK. Please let the developers know if your paper
|
||||
is missing, if you know of other works, or an entry is incorrect. We
|
||||
list papers whether they cite QMCPACK directly or not. This list
|
||||
will be placed on the \url{http://www.qmcpack.org} website.
|
||||
|
||||
\btPrintAll
|
||||
|
||||
\end{btSect}
|
||||
|
|
@ -12,9 +12,6 @@ are performed. If the flag is absent, then the corresponding
|
|||
option is disabled.
|
||||
|
||||
\begin{description}
|
||||
\item[\texttt{-{}-async\_swap}]{ When enabled, this option will launch a new thread
|
||||
to handle the communication between walkers. If it is not enabled,
|
||||
then the communication happens on the current thread. }
|
||||
\item[\texttt{-{}-dryrun}]{ Validate the input file without performing the simulation.
|
||||
This is a good way to ensure that QMCPACK will do what you think it will. }
|
||||
\item[\texttt{-{}-help}]{ Print version information as well as a list of optional
|
||||
|
|
|
@ -153,6 +153,7 @@ void
|
|||
cublas_inverse (cublasHandle_t handle,
|
||||
float *Alist_d[], float *Ainvlist_d[],
|
||||
float *AWorklist_d[], float *AinvWorklist_d[],
|
||||
int *PivotArray, int *infoArray,
|
||||
int N, int rowStride, int numMats,
|
||||
bool useHigherPrecision)
|
||||
{
|
||||
|
@ -160,8 +161,6 @@ cublas_inverse (cublasHandle_t handle,
|
|||
// Info array tells if a matrix inversion is successful
|
||||
// = 0 : successful
|
||||
// = k : U(k,k) = 0; inversion failed
|
||||
int *infoArray;
|
||||
callAndCheckError( cudaMalloc((void**) &infoArray, numMats * sizeof(int)), __LINE__ );
|
||||
|
||||
// If double precision operations are desired...
|
||||
if (useHigherPrecision)
|
||||
|
@ -174,16 +173,16 @@ cublas_inverse (cublasHandle_t handle,
|
|||
|
||||
// (ii) call cublas to do matrix inversion
|
||||
// LU decomposition
|
||||
callAndCheckError( cublasDgetrfBatched( handle, N, (double**)AWorklist_d, rowStride, NULL,
|
||||
callAndCheckError( cublasDgetrfBatched( handle, N, (double**)AWorklist_d, rowStride, PivotArray,
|
||||
infoArray, numMats), __LINE__ );
|
||||
|
||||
|
||||
// Inversion
|
||||
#if (CUDA_VERSION >= 6050)
|
||||
callAndCheckError( cublasDgetriBatched( handle, N, (const double**)AWorklist_d, rowStride, NULL,
|
||||
(double**)AinvWorklist_d, rowStride, infoArray, numMats), __LINE__ );
|
||||
callAndCheckError( cublasDgetriBatched( handle, N, (const double**)AWorklist_d, rowStride, PivotArray,
|
||||
(double**)AinvWorklist_d, rowStride, infoArray+numMats, numMats), __LINE__ );
|
||||
#else
|
||||
callAndCheckError( cublasDgetriBatched( handle, N, (double**)AWorklist_d, rowStride, NULL,
|
||||
(double**)AinvWorklist_d, rowStride, infoArray, numMats), __LINE__ );
|
||||
callAndCheckError( cublasDgetriBatched( handle, N, (double**)AWorklist_d, rowStride, PivotArray,
|
||||
(double**)AinvWorklist_d, rowStride, infoArray+numMats, numMats), __LINE__ );
|
||||
#endif
|
||||
|
||||
// (iii) convert results back to single precision
|
||||
|
@ -195,24 +194,20 @@ cublas_inverse (cublasHandle_t handle,
|
|||
{
|
||||
// Call cublas to do matrix inversion
|
||||
// LU decomposition
|
||||
callAndCheckError( cublasSgetrfBatched( handle, N, Alist_d, rowStride, NULL,
|
||||
callAndCheckError( cublasSgetrfBatched( handle, N, Alist_d, rowStride, PivotArray,
|
||||
infoArray, numMats), __LINE__ );
|
||||
|
||||
// Inversion
|
||||
#if (CUDA_VERSION >= 6050)
|
||||
callAndCheckError( cublasSgetriBatched( handle, N, (const float**) Alist_d, rowStride, NULL,
|
||||
Ainvlist_d, rowStride, infoArray, numMats), __LINE__ );
|
||||
callAndCheckError( cublasSgetriBatched( handle, N, (const float**) Alist_d, rowStride, PivotArray,
|
||||
Ainvlist_d, rowStride, infoArray+numMats, numMats), __LINE__ );
|
||||
#else
|
||||
callAndCheckError( cublasSgetriBatched( handle, N, Alist_d, rowStride, NULL,
|
||||
Ainvlist_d, rowStride, infoArray, numMats), __LINE__ );
|
||||
callAndCheckError( cublasSgetriBatched( handle, N, Alist_d, rowStride, PivotArray,
|
||||
Ainvlist_d, rowStride, infoArray+numMats, numMats), __LINE__ );
|
||||
#endif
|
||||
}
|
||||
|
||||
cudaDeviceSynchronize();
|
||||
|
||||
// Free resources
|
||||
cudaFree(infoArray);
|
||||
|
||||
}
|
||||
|
||||
// 2. for double matrices
|
||||
|
@ -220,6 +215,7 @@ void
|
|||
cublas_inverse (cublasHandle_t handle,
|
||||
double *Alist_d[], double *Ainvlist_d[],
|
||||
double *AWorklist_d[], double *AinvWorklist_d[],
|
||||
int *PivotArray, int *infoArray,
|
||||
int N, int rowStride, int numMats,
|
||||
bool useHigherPrecision)
|
||||
{
|
||||
|
@ -227,8 +223,6 @@ cublas_inverse (cublasHandle_t handle,
|
|||
// Info array tells if a matrix inversion is successful
|
||||
// = 0 : successful
|
||||
// = k : U(k,k) = 0; inversion failed
|
||||
int *infoArray;
|
||||
callAndCheckError( cudaMalloc((void**) &infoArray, numMats * sizeof(int)), __LINE__ );
|
||||
|
||||
// (i) copy all the elements of Alist to AWorklist
|
||||
dim3 dimBlockConvert (CONVERT_BS);
|
||||
|
@ -237,22 +231,19 @@ cublas_inverse (cublasHandle_t handle,
|
|||
|
||||
// (ii) call cublas functions to do inversion
|
||||
// LU decomposition
|
||||
callAndCheckError( cublasDgetrfBatched( handle, N, AWorklist_d, rowStride, NULL,
|
||||
callAndCheckError( cublasDgetrfBatched( handle, N, AWorklist_d, rowStride, PivotArray,
|
||||
infoArray, numMats), __LINE__ );
|
||||
|
||||
// Inversion
|
||||
#if (CUDA_VERSION >= 6050)
|
||||
callAndCheckError( cublasDgetriBatched( handle, N, (const double**) AWorklist_d, rowStride, NULL,
|
||||
Ainvlist_d, rowStride, infoArray, numMats), __LINE__ );
|
||||
callAndCheckError( cublasDgetriBatched( handle, N, (const double**) AWorklist_d, rowStride, PivotArray,
|
||||
Ainvlist_d, rowStride, infoArray+numMats, numMats), __LINE__ );
|
||||
#else
|
||||
callAndCheckError( cublasDgetriBatched( handle, N, AWorklist_d, rowStride, NULL,
|
||||
Ainvlist_d, rowStride, infoArray, numMats), __LINE__ );
|
||||
callAndCheckError( cublasDgetriBatched( handle, N, AWorklist_d, rowStride, PivotArray,
|
||||
Ainvlist_d, rowStride, infoArray+numMats, numMats), __LINE__ );
|
||||
#endif
|
||||
|
||||
cudaDeviceSynchronize();
|
||||
|
||||
cudaFree(infoArray);
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
@ -263,6 +254,7 @@ void
|
|||
cublas_inverse (cublasHandle_t handle,
|
||||
std::complex<float> *Alist_d[], std::complex<float> *Ainvlist_d[],
|
||||
std::complex<float> *AWorklist_d[], std::complex<float> *AinvWorklist_d[],
|
||||
int *PivotArray, int *infoArray,
|
||||
int N, int rowStride, int numMats,
|
||||
bool useHigherPrecision)
|
||||
{
|
||||
|
@ -270,8 +262,6 @@ cublas_inverse (cublasHandle_t handle,
|
|||
// Info array tells if a matrix inversion is successful
|
||||
// = 0 : successful
|
||||
// = k : U(k,k) = 0; inversion failed
|
||||
int *infoArray;
|
||||
callAndCheckError( cudaMalloc((void**) &infoArray, numMats * sizeof(int)), __LINE__ );
|
||||
|
||||
// If double precision operations are desired...
|
||||
if (useHigherPrecision)
|
||||
|
@ -284,12 +274,12 @@ cublas_inverse (cublasHandle_t handle,
|
|||
|
||||
// (ii) call cublas to do matrix inversion
|
||||
// LU decomposition
|
||||
callAndCheckError( cublasZgetrfBatched( handle, N, (cuDoubleComplex**)AWorklist_d, rowStride, NULL, infoArray, numMats), __LINE__ );
|
||||
callAndCheckError( cublasZgetrfBatched( handle, N, (cuDoubleComplex**)AWorklist_d, rowStride, PivotArray, infoArray, numMats), __LINE__ );
|
||||
// Inversion
|
||||
#if (CUDA_VERSION >= 6050)
|
||||
callAndCheckError( cublasZgetriBatched( handle, N, (const cuDoubleComplex**)AWorklist_d, rowStride, NULL, (cuDoubleComplex**)AinvWorklist_d, rowStride, infoArray, numMats), __LINE__ );
|
||||
callAndCheckError( cublasZgetriBatched( handle, N, (const cuDoubleComplex**)AWorklist_d, rowStride, PivotArray, (cuDoubleComplex**)AinvWorklist_d, rowStride, infoArray+numMats, numMats), __LINE__ );
|
||||
#else
|
||||
callAndCheckError( cublasZgetriBatched( handle, N, (cuDoubleComplex**)AWorklist_d, rowStride, NULL, (cuDoubleComplex**)AinvWorklist_d, rowStride, infoArray, numMats), __LINE__ );
|
||||
callAndCheckError( cublasZgetriBatched( handle, N, (cuDoubleComplex**)AWorklist_d, rowStride, PivotArray, (cuDoubleComplex**)AinvWorklist_d, rowStride, infoArray+numMats, numMats), __LINE__ );
|
||||
#endif
|
||||
|
||||
// (iii) convert results back to single precision
|
||||
|
@ -301,22 +291,18 @@ cublas_inverse (cublasHandle_t handle,
|
|||
{
|
||||
// Call cublas to do matrix inversion
|
||||
// LU decomposition
|
||||
callAndCheckError( cublasCgetrfBatched( handle, N, (cuComplex**)Alist_d, rowStride, NULL,
|
||||
callAndCheckError( cublasCgetrfBatched( handle, N, (cuComplex**)Alist_d, rowStride, PivotArray,
|
||||
infoArray, numMats), __LINE__ );
|
||||
|
||||
// Inversion
|
||||
#if (CUDA_VERSION >= 6050)
|
||||
callAndCheckError( cublasCgetriBatched( handle, N, (const cuComplex**)Alist_d, rowStride, NULL, (cuComplex**)Ainvlist_d, rowStride, infoArray, numMats), __LINE__ );
|
||||
callAndCheckError( cublasCgetriBatched( handle, N, (const cuComplex**)Alist_d, rowStride, PivotArray, (cuComplex**)Ainvlist_d, rowStride, infoArray+numMats, numMats), __LINE__ );
|
||||
#else
|
||||
callAndCheckError( cublasCgetriBatched( handle, N, (cuComplex**)Alist_d, rowStride, NULL, (cuComplex**)Ainvlist_d, rowStride, infoArray, numMats), __LINE__ );
|
||||
callAndCheckError( cublasCgetriBatched( handle, N, (cuComplex**)Alist_d, rowStride, PivotArray, (cuComplex**)Ainvlist_d, rowStride, infoArray+numMats, numMats), __LINE__ );
|
||||
#endif
|
||||
}
|
||||
|
||||
cudaDeviceSynchronize();
|
||||
|
||||
// Free resources
|
||||
cudaFree(infoArray);
|
||||
|
||||
}
|
||||
|
||||
// 4. for complex double matrices
|
||||
|
@ -324,6 +310,7 @@ void
|
|||
cublas_inverse (cublasHandle_t handle,
|
||||
std::complex<double> *Alist_d[], std::complex<double> *Ainvlist_d[],
|
||||
std::complex<double> *AWorklist_d[], std::complex<double> *AinvWorklist_d[],
|
||||
int *PivotArray, int *infoArray,
|
||||
int N, int rowStride, int numMats,
|
||||
bool useHigherPrecision)
|
||||
{
|
||||
|
@ -331,8 +318,6 @@ cublas_inverse (cublasHandle_t handle,
|
|||
// Info array tells if a matrix inversion is successful
|
||||
// = 0 : successful
|
||||
// = k : U(k,k) = 0; inversion failed
|
||||
int *infoArray;
|
||||
callAndCheckError( cudaMalloc((void**) &infoArray, numMats * sizeof(int)), __LINE__ );
|
||||
|
||||
// (i) copy all the elements of Alist to AWorklist
|
||||
dim3 dimBlockConvert (CONVERT_BS);
|
||||
|
@ -341,18 +326,15 @@ cublas_inverse (cublasHandle_t handle,
|
|||
|
||||
// (ii) call cublas to do matrix inversion
|
||||
// LU decomposition
|
||||
callAndCheckError( cublasZgetrfBatched( handle, N, (cuDoubleComplex**)AWorklist_d, rowStride, NULL, infoArray, numMats), __LINE__ );
|
||||
callAndCheckError( cublasZgetrfBatched( handle, N, (cuDoubleComplex**)AWorklist_d, rowStride, PivotArray, infoArray, numMats), __LINE__ );
|
||||
// Inversion
|
||||
#if (CUDA_VERSION >= 6050)
|
||||
callAndCheckError( cublasZgetriBatched( handle, N, (const cuDoubleComplex**)AWorklist_d, rowStride, NULL, (cuDoubleComplex**)Ainvlist_d, rowStride, infoArray, numMats), __LINE__ );
|
||||
callAndCheckError( cublasZgetriBatched( handle, N, (const cuDoubleComplex**)AWorklist_d, rowStride, PivotArray, (cuDoubleComplex**)Ainvlist_d, rowStride, infoArray+numMats, numMats), __LINE__ );
|
||||
#else
|
||||
callAndCheckError( cublasZgetriBatched( handle, N, (cuDoubleComplex**)AWorklist_d, rowStride, NULL, (cuDoubleComplex**)Ainvlist_d, rowStride, infoArray, numMats), __LINE__ );
|
||||
callAndCheckError( cublasZgetriBatched( handle, N, (cuDoubleComplex**)AWorklist_d, rowStride, PivotArray, (cuDoubleComplex**)Ainvlist_d, rowStride, infoArray+numMats, numMats), __LINE__ );
|
||||
#endif
|
||||
|
||||
cudaDeviceSynchronize();
|
||||
|
||||
cudaFree(infoArray);
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@ -25,6 +25,7 @@ void
|
|||
cublas_inverse (cublasHandle_t handle,
|
||||
float *Alist_d[], float *Ainvlist_d[],
|
||||
float *AWorkList_d[], float *AinvWorkList_d[],
|
||||
int *PivotArray, int *infoArray,
|
||||
int N, int rowStride, int numMats,
|
||||
bool useHigherPrecision = true);
|
||||
|
||||
|
@ -35,6 +36,7 @@ void
|
|||
cublas_inverse (cublasHandle_t handle,
|
||||
double *Alist_d[], double *Ainvlist_d[],
|
||||
double *AWorklist_d[], double *AinvWorklist_d[],
|
||||
int *PivotArray, int *infoArray,
|
||||
int N, int rowStride, int numMats,
|
||||
bool useHigherPrecision = true);
|
||||
|
||||
|
@ -46,6 +48,7 @@ void
|
|||
cublas_inverse (cublasHandle_t handle,
|
||||
std::complex<float> *Alist_d[], std::complex<float> *Ainvlist_d[],
|
||||
std::complex<float> *AWorkList_d[], std::complex<float> *AinvWorkList_d[],
|
||||
int *PivotArray, int *infoArray,
|
||||
int N, int rowStride, int numMats,
|
||||
bool useHigherPrecision = true);
|
||||
|
||||
|
@ -56,6 +59,7 @@ void
|
|||
cublas_inverse (cublasHandle_t handle,
|
||||
std::complex<double> *Alist_d[], std::complex<double> *Ainvlist_d[],
|
||||
std::complex<double> *AWorklist_d[], std::complex<double> *AinvWorklist_d[],
|
||||
int *PivotArray, int *infoArray,
|
||||
int N, int rowStride, int numMats,
|
||||
bool useHigherPrecision = true);
|
||||
|
||||
|
|
|
@ -30,7 +30,7 @@ void test_e2iphi()
|
|||
T vcos[N];
|
||||
T vsin[N];
|
||||
for (int i = 0; i < N; i++) {
|
||||
phi[0] = 0.2*i;
|
||||
phi[i] = 0.2*i;
|
||||
}
|
||||
|
||||
eval_e2iphi(N, phi, vcos, vsin);
|
||||
|
|
|
@ -43,9 +43,13 @@ public:
|
|||
|
||||
/** constructor with size n*/
|
||||
explicit inline
|
||||
Vector(size_t n=0): nLocal(n), nAllocated(0), X(nullptr)
|
||||
Vector(size_t n=0, Type_t val = Type_t()): nLocal(n), nAllocated(0), X(nullptr)
|
||||
{
|
||||
if(n) resize_impl(n);
|
||||
if(n)
|
||||
{
|
||||
resize_impl(n);
|
||||
std::fill_n(X, n, val);
|
||||
}
|
||||
}
|
||||
|
||||
/** constructor with an initialized ref */
|
||||
|
@ -113,12 +117,20 @@ public:
|
|||
}
|
||||
|
||||
///resize
|
||||
inline void resize(size_t n)
|
||||
inline void resize(size_t n, Type_t val = Type_t())
|
||||
{
|
||||
if(nLocal>nAllocated)
|
||||
throw std::runtime_error("Resize not allowed on Vector constructed by initialized memory.");
|
||||
if(n>nAllocated)
|
||||
{
|
||||
resize_impl(n);
|
||||
std::fill_n(X, n, val);
|
||||
}
|
||||
else if(n>nLocal)
|
||||
{
|
||||
std::fill_n(X+nLocal, n-nLocal, val);
|
||||
nLocal=n;
|
||||
}
|
||||
else
|
||||
nLocal=n;
|
||||
return;
|
||||
|
@ -204,7 +216,6 @@ private:
|
|||
mAllocator.deallocate(X,nAllocated);
|
||||
}
|
||||
X=mAllocator.allocate(n);
|
||||
std::fill_n(X, n, T());
|
||||
nLocal=n;
|
||||
nAllocated=n;
|
||||
}
|
||||
|
|
|
@ -184,6 +184,7 @@ struct AsymmetricDTD
|
|||
|
||||
inline void evaluate(ParticleSet& P, int jat)
|
||||
{
|
||||
APP_ABORT(" No need to call AsymmetricDTD::evaluate(ParticleSet& P, int jat)");
|
||||
//based on full evaluation. Only compute it if jat==0
|
||||
if(jat==0) evaluate(P);
|
||||
}
|
||||
|
|
|
@ -504,8 +504,9 @@ ParticleSet::makeMove(Index_t iat, const SingleParticlePos_t& displ)
|
|||
void ParticleSet::setActive(int iat)
|
||||
{
|
||||
myTimers[3]->start();
|
||||
for (size_t i=0,n=DistTables.size(); i< n; i++)
|
||||
DistTables[i]->evaluate(*this,iat);
|
||||
for (size_t i=0; i<DistTables.size(); i++)
|
||||
if(DistTables[i]->DTType==DT_SOA)
|
||||
DistTables[i]->evaluate(*this,iat);
|
||||
myTimers[3]->stop();
|
||||
}
|
||||
|
||||
|
@ -809,7 +810,8 @@ void ParticleSet::loadWalker(Walker_t& awalker, bool pbyp)
|
|||
{
|
||||
// in certain cases, full tables must be ready
|
||||
for (int i=0; i< DistTables.size(); i++)
|
||||
if(DistTables[i]->Need_full_table_loadWalker) DistTables[i]->evaluate(*this);
|
||||
if(DistTables[i]->DTType==DT_AOS||DistTables[i]->Need_full_table_loadWalker)
|
||||
DistTables[i]->evaluate(*this);
|
||||
//computed so that other objects can use them, e.g., kSpaceJastrow
|
||||
if(SK && SK->DoUpdate)
|
||||
SK->UpdateAllPart(*this);
|
||||
|
|
|
@ -137,6 +137,7 @@ struct SymmetricDTD
|
|||
|
||||
inline void evaluate(ParticleSet& P, int jat)
|
||||
{
|
||||
APP_ABORT(" No need to call SymmetricDTD::evaluate(ParticleSet& P, int jat)");
|
||||
//based on full evaluation. Only compute it if jat==0
|
||||
if(jat==0) evaluate(P);
|
||||
}
|
||||
|
|
|
@ -121,6 +121,8 @@ struct Walker
|
|||
* When Multiplicity = 0, this walker will be destroyed.
|
||||
*/
|
||||
RealType Multiplicity;
|
||||
////Number of copies sent only for MPI
|
||||
int NumSentCopies;
|
||||
|
||||
/** The configuration vector (3N-dimensional vector to store
|
||||
the positions of all the particles for a single walker)*/
|
||||
|
@ -437,6 +439,7 @@ struct Walker
|
|||
DataSet.add(Age);
|
||||
DataSet.add(ReleasedNodeAge);
|
||||
DataSet.add(ReleasedNodeWeight);
|
||||
DataSet.add(NumSentCopies);
|
||||
// vectors
|
||||
DataSet.add(R.first_address(), R.last_address());
|
||||
#if !defined(SOA_MEMORY_OPTIMIZED)
|
||||
|
@ -466,7 +469,7 @@ struct Walker
|
|||
void copyFromBuffer()
|
||||
{
|
||||
DataSet.rewind();
|
||||
DataSet >> ID >> ParentID >> Generation >> Age >> ReleasedNodeAge >> ReleasedNodeWeight;
|
||||
DataSet >> ID >> ParentID >> Generation >> Age >> ReleasedNodeAge >> ReleasedNodeWeight >> NumSentCopies;
|
||||
// vectors
|
||||
DataSet.get(R.first_address(), R.last_address());
|
||||
#if !defined(SOA_MEMORY_OPTIMIZED)
|
||||
|
@ -513,7 +516,7 @@ struct Walker
|
|||
void updateBuffer()
|
||||
{
|
||||
DataSet.rewind();
|
||||
DataSet << ID << ParentID << Generation << Age << ReleasedNodeAge << ReleasedNodeWeight;
|
||||
DataSet << ID << ParentID << Generation << Age << ReleasedNodeAge << ReleasedNodeWeight << NumSentCopies;
|
||||
// vectors
|
||||
DataSet.put(R.first_address(), R.last_address());
|
||||
#if !defined(SOA_MEMORY_OPTIMIZED)
|
||||
|
|
|
@ -10,8 +10,6 @@
|
|||
//
|
||||
// File created by: Jeongnim Kim, jeongnim.kim@gmail.com, University of Illinois at Urbana-Champaign
|
||||
//////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
|
||||
|
||||
|
||||
#include <QMCDrivers/DMC/WalkerControlMPI.h>
|
||||
|
@ -24,36 +22,30 @@
|
|||
namespace qmcplusplus
|
||||
{
|
||||
|
||||
//#if defined(PRINT_DEBUG)
|
||||
//#define DMC_BRANCH_START(NOW) NOW
|
||||
//#define DMC_BRANCH_STOP(TID,TM) TID=TM
|
||||
//#define DMC_BRANCH_DUMP(IT,T1,T2,T3) \
|
||||
// OhmmsInfo::Debug->getStream() << "BRANCH " \
|
||||
//<< std::setw(8) << IT \
|
||||
//<< " SORT" << std::setw(16) << T1 \
|
||||
//<< " GSUM" << std::setw(16) << T2 \
|
||||
//<< " SWAP" << std::setw(16) << T3 << std::endl
|
||||
//#else
|
||||
#define DMC_BRANCH_START(NOW)
|
||||
#define DMC_BRANCH_STOP(TID,TM)
|
||||
#define DMC_BRANCH_DUMP(IT,T1,T2,T3)
|
||||
//#endif
|
||||
|
||||
|
||||
//#define MCWALKERSET_MPI_DEBUG
|
||||
|
||||
enum DMC_MPI_Timers
|
||||
{
|
||||
DMC_MPI_branch,
|
||||
DMC_MPI_imbalance,
|
||||
DMC_MPI_prebalance,
|
||||
DMC_MPI_loadbalance
|
||||
DMC_MPI_copyWalkers,
|
||||
DMC_MPI_allreduce,
|
||||
DMC_MPI_loadbalance,
|
||||
DMC_MPI_send,
|
||||
DMC_MPI_recv,
|
||||
};
|
||||
|
||||
TimerNameList_t<DMC_MPI_Timers> DMCMPITimerNames =
|
||||
{
|
||||
{DMC_MPI_branch, "WalkerControlMPI::branch"},
|
||||
{DMC_MPI_imbalance, "WalkerControlMPI::imbalance"},
|
||||
{DMC_MPI_prebalance, "WalkerControlMPI::pre-loadbalance"},
|
||||
{DMC_MPI_loadbalance, "WalkerControlMPI::loadbalance"}
|
||||
{DMC_MPI_copyWalkers, "WalkerControlMPI::copyWalkers"},
|
||||
{DMC_MPI_allreduce, "WalkerControlMPI::allreduce"},
|
||||
{DMC_MPI_loadbalance, "WalkerControlMPI::loadbalance"},
|
||||
{DMC_MPI_send, "WalkerControlMPI::send"},
|
||||
{DMC_MPI_recv, "WalkerControlMPI::recv"}
|
||||
};
|
||||
|
||||
/** default constructor
|
||||
|
@ -73,36 +65,37 @@ WalkerControlMPI::WalkerControlMPI(Communicate* c): WalkerControlBase(c)
|
|||
setup_timers(myTimers, DMCMPITimerNames, timer_level_medium);
|
||||
}
|
||||
|
||||
int
|
||||
WalkerControlMPI::branch(int iter, MCWalkerConfiguration& W, RealType trigger)
|
||||
/** Perform branch and swap walkers as required
|
||||
*
|
||||
* It takes 4 steps:
|
||||
* 1. sortWalkers marks good and bad walkers.
|
||||
* 2. allreduce and make the decision of load balancing.
|
||||
* 3. send/recv walkers. Receiving side recycles bad walkers' memory first.
|
||||
* 4. copyWalkers generates copies of good walkers.
|
||||
* In order to minimize the memory footprint fluctuation
|
||||
* the walker copying is placed as the last step.
|
||||
* In order to reduce the time for allocating walker memory,
|
||||
* this algorithm does not destroy the bad walkers in step 1.
|
||||
* All the bad walkers are recycled as much as possible in step 3/4.
|
||||
*/
|
||||
int WalkerControlMPI::branch(int iter, MCWalkerConfiguration& W, RealType trigger)
|
||||
{
|
||||
DMC_BRANCH_START(Timer localTimer);
|
||||
TinyVector<RealType,3> bTime(0.0);
|
||||
myTimers[DMC_MPI_branch]->start();
|
||||
myTimers[DMC_MPI_prebalance]->start();
|
||||
std::fill(curData.begin(),curData.end(),0);
|
||||
//std::fill(NumPerNode.begin(),NumPerNode.end(),0);
|
||||
sortWalkers(W);
|
||||
//use NumWalkersSent from the previous exchange
|
||||
curData[SENTWALKERS_INDEX]=NumWalkersSent;
|
||||
//update the number of walkers for this node
|
||||
curData[LE_MAX+MyContext]=NumWalkers;
|
||||
DMC_BRANCH_STOP(bTime[0],localTimer.elapsed());
|
||||
DMC_BRANCH_START(localTimer.restart());
|
||||
int nw = copyWalkers(W);
|
||||
//myTimers[DMC_MPI_imbalance]->start();
|
||||
//myComm->barrier();
|
||||
//myTimers[DMC_MPI_imbalance]->stop();
|
||||
myTimers[DMC_MPI_allreduce]->start();
|
||||
myComm->allreduce(curData);
|
||||
myTimers[DMC_MPI_allreduce]->stop();
|
||||
measureProperties(iter);
|
||||
W.EnsembleProperty=EnsembleProperty;
|
||||
DMC_BRANCH_STOP(bTime[1],localTimer.elapsed());
|
||||
DMC_BRANCH_START(localTimer.restart());
|
||||
////update the samples and weights
|
||||
//W.EnsembleProperty.NumSamples=curData[WALKERSIZE_INDEX];
|
||||
//W.EnsembleProperty.Weight=curData[WEIGHT_INDEX];
|
||||
//RealType wgtInv(1.0/curData[WEIGHT_INDEX]);
|
||||
//accumData[ENERGY_INDEX] += curData[ENERGY_INDEX]*wgtInv;
|
||||
//accumData[ENERGY_SQ_INDEX] += curData[ENERGY_SQ_INDEX]*wgtInv;
|
||||
//accumData[WALKERSIZE_INDEX] += curData[WALKERSIZE_INDEX];
|
||||
//accumData[WEIGHT_INDEX] += curData[WEIGHT_INDEX];
|
||||
Cur_pop=0;
|
||||
for(int i=0, j=LE_MAX; i<NumContexts; i++,j++)
|
||||
{
|
||||
|
@ -110,26 +103,11 @@ WalkerControlMPI::branch(int iter, MCWalkerConfiguration& W, RealType trigger)
|
|||
}
|
||||
myTimers[DMC_MPI_prebalance]->stop();
|
||||
myTimers[DMC_MPI_loadbalance]->start();
|
||||
if(qmc_common.async_swap)
|
||||
swapWalkersAsync(W);
|
||||
else
|
||||
swapWalkersSimple(W);
|
||||
swapWalkersSimple(W);
|
||||
myTimers[DMC_MPI_loadbalance]->stop();
|
||||
//Do not need to use a trigger.
|
||||
//Cur_min=Nmax;
|
||||
//Cur_max=0;
|
||||
//Cur_pop=0;
|
||||
//for(int i=0, j=LE_MAX; i<NumContexts; i++,j++) {
|
||||
// Cur_pop+= NumPerNode[i]=static_cast<int>(curData[j]);
|
||||
// Cur_min = std::min(Cur_min,NumPerNode[i]);
|
||||
// Cur_max = std::max(Cur_max,NumPerNode[i]);
|
||||
//}
|
||||
//int max_diff = std::max(Cur_max*NumContexts-Cur_pop,Cur_pop-Cur_min*NumContexts);
|
||||
//double diff_pop = static_cast<double>(max_diff)/static_cast<double>(Cur_pop);
|
||||
//if(diff_pop > trigger) {
|
||||
// swapWalkersSimple(W);
|
||||
// //swapWalkersMap(W);
|
||||
//}
|
||||
myTimers[DMC_MPI_copyWalkers]->start();
|
||||
copyWalkers(W);
|
||||
myTimers[DMC_MPI_copyWalkers]->stop();
|
||||
//set Weight and Multiplicity to default values
|
||||
MCWalkerConfiguration::iterator it(W.begin()),it_end(W.end());
|
||||
while(it != it_end)
|
||||
|
@ -141,8 +119,7 @@ WalkerControlMPI::branch(int iter, MCWalkerConfiguration& W, RealType trigger)
|
|||
//update the global number of walkers and offsets
|
||||
W.setGlobalNumWalkers(Cur_pop);
|
||||
W.setWalkerOffsets(FairOffSet);
|
||||
DMC_BRANCH_STOP(bTime[2],localTimer.elapsed());
|
||||
DMC_BRANCH_DUMP(iter,bTime[0],bTime[1],bTime[2]);
|
||||
|
||||
myTimers[DMC_MPI_branch]->stop();
|
||||
return Cur_pop;
|
||||
}
|
||||
|
@ -176,19 +153,24 @@ void determineNewWalkerPopulation(int Cur_pop, int NumContexts, int MyContext, c
|
|||
}
|
||||
}
|
||||
|
||||
/** swap Walkers with Recv/Send
|
||||
/** swap Walkers with Recv/Send or Irecv/Isend
|
||||
*
|
||||
* The algorithm ensures that the load per node can differ only by one walker.
|
||||
* The communication is one-dimensional.
|
||||
* Each MPI rank can only send or receive or be silent.
|
||||
* The communication is one-dimensional and very local.
|
||||
* If multiple copies of a walker need to be sent to the target rank,
|
||||
* only one walker is sent and the number of copies is encoded in the message.
|
||||
* The blocking send/recv may become serialized and worsen load imbalance.
|
||||
* Non blocking send/recv algorithm avoids serialization completely.
|
||||
*/
|
||||
void WalkerControlMPI::swapWalkersSimple(MCWalkerConfiguration& W)
|
||||
{
|
||||
std::vector<int> minus, plus;
|
||||
determineNewWalkerPopulation(Cur_pop, NumContexts, MyContext, NumPerNode, FairOffSet, minus, plus);
|
||||
|
||||
Walker_t& wRef(*W[0]);
|
||||
Walker_t& wRef(*good_w[0]);
|
||||
std::vector<Walker_t*> newW;
|
||||
std::vector<Walker_t*> oldW;
|
||||
std::vector<int> ncopy_newW;
|
||||
#ifdef MCWALKERSET_MPI_DEBUG
|
||||
char fname[128];
|
||||
sprintf(fname,"test.%d",MyContext);
|
||||
|
@ -209,166 +191,206 @@ void WalkerControlMPI::swapWalkersSimple(MCWalkerConfiguration& W)
|
|||
}
|
||||
fout << std::endl;
|
||||
#endif
|
||||
// From the class
|
||||
// MyContext (read)
|
||||
// myComm (read)
|
||||
// NumWalkersSent (write)
|
||||
int nswap=std::min(plus.size(), minus.size());
|
||||
int last=W.getActiveWalkers()-1;
|
||||
int nsend=0;
|
||||
for(int ic=0; ic<nswap; ic++)
|
||||
if(plus.size()!=minus.size())
|
||||
{
|
||||
if(plus[ic]==MyContext)
|
||||
{
|
||||
size_t byteSize = W[last]->byteSize();
|
||||
W[last]->updateBuffer();
|
||||
OOMPI_Message sendBuffer(W[last]->DataSet.data(), byteSize);
|
||||
myComm->getComm()[minus[ic]].Send(sendBuffer);
|
||||
--last;
|
||||
++nsend;
|
||||
}
|
||||
if(minus[ic]==MyContext)
|
||||
{
|
||||
Walker_t *awalker= new Walker_t(wRef);
|
||||
size_t byteSize = awalker->byteSize();
|
||||
OOMPI_Message recvBuffer(awalker->DataSet.data(), byteSize);
|
||||
myComm->getComm()[plus[ic]].Recv(recvBuffer);
|
||||
awalker->copyFromBuffer();
|
||||
newW.push_back(awalker);
|
||||
}
|
||||
app_error() << "Walker send/recv pattern doesn't match. "
|
||||
<< "The send size " << plus.size()
|
||||
<< " is not equal to the receive size " << minus.size()
|
||||
<< " ." << std::endl;
|
||||
APP_ABORT("WalkerControlMPI::swapWalkersSimple");
|
||||
}
|
||||
//save the number of walkers sent
|
||||
NumWalkersSent=nsend;
|
||||
if(nsend)
|
||||
{
|
||||
nsend=NumPerNode[MyContext]-nsend;
|
||||
W.destroyWalkers(W.begin()+nsend, W.end());
|
||||
}
|
||||
//add walkers from other node
|
||||
if(newW.size())
|
||||
W.insert(W.end(),newW.begin(),newW.end());
|
||||
}
|
||||
int nswap=plus.size();
|
||||
// sort good walkers by the number of copies
|
||||
assert(good_w.size()==ncopy_w.size());
|
||||
std::vector<std::pair<int,int> > ncopy_pairs, nrecv_pairs;
|
||||
for(int iw=0; iw<ncopy_w.size(); iw++)
|
||||
ncopy_pairs.push_back(std::make_pair(ncopy_w[iw],iw));
|
||||
std::sort(ncopy_pairs.begin(), ncopy_pairs.end());
|
||||
|
||||
/** swap Walkers with Irecv/Send
|
||||
*
|
||||
* The algorithm ensures that the load per node can differ only by one walker.
|
||||
* The communication is one-dimensional.
|
||||
*/
|
||||
void WalkerControlMPI::swapWalkersAsync(MCWalkerConfiguration& W)
|
||||
{
|
||||
std::vector<int> minus, plus;
|
||||
determineNewWalkerPopulation(Cur_pop, NumContexts, MyContext, NumPerNode, FairOffSet, minus, plus);
|
||||
Walker_t& wRef(*W[0]);
|
||||
std::vector<Walker_t*> newW;
|
||||
std::vector<Walker_t*> oldW;
|
||||
#ifdef MCWALKERSET_MPI_DEBUG
|
||||
char fname[128];
|
||||
sprintf(fname,"test.%d",MyContext);
|
||||
std::ofstream fout(fname, std::ios::app);
|
||||
//fout << NumSwaps << " " << Cur_pop << " ";
|
||||
//for(int ic=0; ic<NumContexts; ic++) fout << NumPerNode[ic] << " ";
|
||||
//fout << " | ";
|
||||
//for(int ic=0; ic<NumContexts; ic++) fout << FairOffSet[ic+1]-FairOffSet[ic] << " ";
|
||||
//fout << " | ";
|
||||
for(int ic=0; ic<plus.size(); ic++)
|
||||
{
|
||||
fout << plus[ic] << " ";
|
||||
}
|
||||
fout << " | ";
|
||||
for(int ic=0; ic<minus.size(); ic++)
|
||||
{
|
||||
fout << minus[ic] << " ";
|
||||
}
|
||||
fout << std::endl;
|
||||
#endif
|
||||
int nswap=std::min(plus.size(), minus.size());
|
||||
int last=W.getActiveWalkers()-1;
|
||||
int nsend=0;
|
||||
int countSend = 1;
|
||||
OOMPI_Packed ** sendBuffers = new OOMPI_Packed*[NumContexts];
|
||||
OOMPI_Packed ** recvBuffers = new OOMPI_Packed*[NumContexts];
|
||||
std::vector<OOMPI_Request> requests(NumContexts);
|
||||
std::vector<int> sendCounts(NumContexts,0);
|
||||
for(int ip=0; ip < NumContexts; ++ip)
|
||||
sendBuffers[ip] = 0;
|
||||
for(int ip=0; ip < NumContexts; ++ip)
|
||||
recvBuffers[ip] = 0;
|
||||
std::vector<OOMPI_Request> requests;
|
||||
for(int ic=0; ic<nswap; ic++)
|
||||
{
|
||||
if(plus[ic]==MyContext)
|
||||
{
|
||||
if((ic < nswap - 1) && (plus[ic] == plus[ic+1]) && (minus[ic] == minus[ic+1]))
|
||||
{
|
||||
countSend++;
|
||||
}
|
||||
else
|
||||
{
|
||||
//OOMPI_Packed sendBuffer(wRef.byteSize(),OOMPI_COMM_WORLD);
|
||||
sendBuffers[minus[ic]] = new OOMPI_Packed(countSend * wRef.byteSize(),myComm->getComm());
|
||||
for(int cs = 0; cs < countSend; ++cs)
|
||||
{
|
||||
W[last]->putMessage(*(sendBuffers[minus[ic]]));
|
||||
--last;
|
||||
// always send the last good walker
|
||||
Walker_t* &awalker = good_w[ncopy_pairs.back().second];
|
||||
|
||||
// count the possible copies in one send
|
||||
auto &nsentcopy = awalker->NumSentCopies;
|
||||
nsentcopy = 0;
|
||||
|
||||
for(int id=ic+1; id<nswap; id++)
|
||||
if(plus[ic]==plus[id]&&minus[ic]==minus[id]&&ncopy_pairs.back().first>0)
|
||||
{ // increment copy counter
|
||||
ncopy_pairs.back().first--;
|
||||
nsentcopy++;
|
||||
}
|
||||
//OOMPI_COMM_WORLD[minus[ic]].Send(sendBuffer);
|
||||
requests[minus[ic]] = myComm->getComm()[minus[ic]].Isend(*(sendBuffers[minus[ic]]), plus[ic]);
|
||||
nsend += countSend;
|
||||
countSend = 1;
|
||||
else
|
||||
{ // not enough copies to send or not the same send/recv pair
|
||||
break;
|
||||
}
|
||||
|
||||
// pack data and send
|
||||
size_t byteSize = awalker->byteSize();
|
||||
awalker->updateBuffer();
|
||||
OOMPI_Message sendBuffer(awalker->DataSet.data(), byteSize);
|
||||
if(use_nonblocking)
|
||||
requests.push_back(myComm->getComm()[minus[ic]].Isend(sendBuffer));
|
||||
else
|
||||
{
|
||||
myTimers[DMC_MPI_send]->start();
|
||||
myComm->getComm()[minus[ic]].Send(sendBuffer);
|
||||
myTimers[DMC_MPI_send]->stop();
|
||||
}
|
||||
// update counter and cursor
|
||||
++nsend;
|
||||
ic+=nsentcopy;
|
||||
|
||||
// update copy counter
|
||||
if(ncopy_pairs.back().first>0)
|
||||
{
|
||||
ncopy_pairs.back().first--;
|
||||
std::sort(ncopy_pairs.begin(), ncopy_pairs.end());
|
||||
}
|
||||
else
|
||||
{
|
||||
ncopy_pairs.pop_back();
|
||||
bad_w.push_back(awalker);
|
||||
}
|
||||
}
|
||||
if(minus[ic]==MyContext)
|
||||
{
|
||||
if((ic < nswap - 1) && (plus[ic] == plus[ic+1]) && (minus[ic] == minus[ic+1]))
|
||||
// count receive pairs, (source,copy)
|
||||
nrecv_pairs.push_back(std::make_pair(plus[ic],0));
|
||||
for(int id=ic+1; id<nswap; id++)
|
||||
if(plus[ic]==plus[id]&&minus[ic]==minus[id])
|
||||
nrecv_pairs.back().second++;
|
||||
else
|
||||
break;
|
||||
// update cursor
|
||||
ic+=nrecv_pairs.back().second;
|
||||
}
|
||||
}
|
||||
|
||||
if(nsend>0)
|
||||
{
|
||||
if(use_nonblocking)
|
||||
{
|
||||
// wait all the isend
|
||||
for(int im=0; im<requests.size(); im++)
|
||||
{
|
||||
countSend++;
|
||||
myTimers[DMC_MPI_send]->start();
|
||||
requests[im].Wait();
|
||||
myTimers[DMC_MPI_send]->stop();
|
||||
}
|
||||
requests.clear();
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
struct job {
|
||||
OOMPI_Request request;
|
||||
int walkerID;
|
||||
int queueID;
|
||||
job(const OOMPI_Request &req, int wid, int qid): request(req), walkerID(wid), queueID(qid) {};
|
||||
};
|
||||
std::vector<job> job_list;
|
||||
std::vector<bool> queue_status(nrecv_pairs.size(),true);
|
||||
|
||||
bool completed=false;
|
||||
while(!completed)
|
||||
{
|
||||
// receive data
|
||||
for(int ic=0; ic<nrecv_pairs.size(); ic++)
|
||||
if(queue_status[ic]&&nrecv_pairs[ic].second>=0)
|
||||
{
|
||||
Walker_t* awalker;
|
||||
if(bad_w.empty())
|
||||
{
|
||||
awalker=new Walker_t(wRef);
|
||||
}
|
||||
else
|
||||
{
|
||||
awalker=bad_w.back();
|
||||
bad_w.pop_back();
|
||||
}
|
||||
|
||||
size_t byteSize = awalker->byteSize();
|
||||
myTimers[DMC_MPI_recv]->start();
|
||||
OOMPI_Message recvBuffer(awalker->DataSet.data(), byteSize);
|
||||
if(use_nonblocking)
|
||||
{
|
||||
job_list.push_back(job(myComm->getComm()[nrecv_pairs[ic].first].Irecv(recvBuffer),
|
||||
newW.size(),ic));
|
||||
queue_status[ic]=false;
|
||||
}
|
||||
else
|
||||
{
|
||||
myComm->getComm()[nrecv_pairs[ic].first].Recv(recvBuffer);
|
||||
job_list.push_back(job(OOMPI_Request(),newW.size(),ic));
|
||||
}
|
||||
myTimers[DMC_MPI_recv]->stop();
|
||||
|
||||
newW.push_back(awalker);
|
||||
ncopy_newW.push_back(0);
|
||||
}
|
||||
|
||||
if(use_nonblocking)
|
||||
{
|
||||
OOMPI_Status status;
|
||||
for(auto jobit=job_list.begin(); jobit!=job_list.end(); jobit++)
|
||||
if(jobit->request.Test(status))
|
||||
{
|
||||
auto &awalker=newW[jobit->walkerID];
|
||||
// unpack data
|
||||
awalker->copyFromBuffer();
|
||||
ncopy_newW[jobit->walkerID]=awalker->NumSentCopies;
|
||||
// update counter
|
||||
nrecv_pairs[jobit->queueID].second-=(awalker->NumSentCopies+1);
|
||||
queue_status[jobit->queueID]=true;
|
||||
job_list.erase(jobit);
|
||||
break;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
//OOMPI_Packed recvBuffer(wRef.byteSize(),OOMPI_COMM_WORLD);
|
||||
recvBuffers[plus[ic]] = new OOMPI_Packed(countSend * wRef.byteSize(),myComm->getComm());
|
||||
//OOMPI_COMM_WORLD[plus[ic]].Recv(recvBuffer);
|
||||
requests[plus[ic]] = myComm->getComm()[plus[ic]].Irecv(*(recvBuffers[plus[ic]]), plus[ic]);
|
||||
sendCounts[plus[ic]] = countSend;
|
||||
countSend = 1;
|
||||
for(auto jobit=job_list.begin(); jobit!=job_list.end(); jobit++)
|
||||
{
|
||||
auto &awalker=newW[jobit->walkerID];
|
||||
// unpack data
|
||||
awalker->copyFromBuffer();
|
||||
ncopy_newW[jobit->walkerID]=awalker->NumSentCopies;
|
||||
// update counter
|
||||
nrecv_pairs[jobit->queueID].second-=(awalker->NumSentCopies+1);
|
||||
}
|
||||
job_list.clear();
|
||||
}
|
||||
|
||||
// check the completion of queues
|
||||
completed=true;
|
||||
for(int ic=0; ic<nrecv_pairs.size(); ic++)
|
||||
completed = completed && (nrecv_pairs[ic].second==-1);
|
||||
}
|
||||
}
|
||||
for(int ip = 0; ip < NumContexts; ++ip)
|
||||
{
|
||||
if(recvBuffers[ip])
|
||||
{
|
||||
requests[ip].Wait();
|
||||
for(int cs = 0; cs < sendCounts[ip]; ++cs)
|
||||
{
|
||||
Walker_t *awalker= new Walker_t(wRef);
|
||||
awalker->getMessage(*(recvBuffers[ip]));
|
||||
newW.push_back(awalker);
|
||||
}
|
||||
delete recvBuffers[ip];
|
||||
}
|
||||
}
|
||||
for(int ip = 0; ip < NumContexts; ++ip)
|
||||
{
|
||||
if(sendBuffers[ip])
|
||||
{
|
||||
requests[ip].Wait();
|
||||
delete sendBuffers[ip];
|
||||
}
|
||||
}
|
||||
delete[] sendBuffers;
|
||||
delete[] recvBuffers;
|
||||
//save the number of walkers sent
|
||||
NumWalkersSent=nsend;
|
||||
if(nsend)
|
||||
// rebuild good_w and ncopy_w
|
||||
std::vector<Walker_t*> good_w_temp(good_w);
|
||||
good_w.resize(ncopy_pairs.size());
|
||||
ncopy_w.resize(ncopy_pairs.size());
|
||||
for(int iw=0; iw<ncopy_pairs.size(); iw++)
|
||||
{
|
||||
nsend=NumPerNode[MyContext]-nsend;
|
||||
W.destroyWalkers(W.begin()+nsend, W.end());
|
||||
good_w[iw]=good_w_temp[ncopy_pairs[iw].second];
|
||||
ncopy_w[iw]=ncopy_pairs[iw].first;
|
||||
}
|
||||
//add walkers from other node
|
||||
if(newW.size())
|
||||
W.insert(W.end(),newW.begin(),newW.end());
|
||||
{
|
||||
good_w.insert(good_w.end(),newW.begin(),newW.end());
|
||||
ncopy_w.insert(ncopy_w.end(),ncopy_newW.begin(),ncopy_newW.end());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
|
|
@ -44,12 +44,9 @@ struct WalkerControlMPI: public WalkerControlBase
|
|||
/** perform branch and swap walkers as required */
|
||||
int branch(int iter, MCWalkerConfiguration& W, RealType trigger);
|
||||
|
||||
//current implementations
|
||||
void swapWalkersSimple(MCWalkerConfiguration& W);
|
||||
|
||||
//old implementations
|
||||
void swapWalkersAsync(MCWalkerConfiguration& W);
|
||||
// Removed swapWalkersBlocked and swapWalkersMap (minimize size and number of messages)
|
||||
|
||||
};
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -225,6 +225,7 @@ int WalkerControlBase::doNotBranch(int iter, MCWalkerConfiguration& W)
|
|||
curData[RNSIZE_INDEX]=nrn;
|
||||
curData[B_ENERGY_INDEX]=besum;
|
||||
curData[B_WGT_INDEX]=bwgtsum;
|
||||
curData[SENTWALKERS_INDEX]=NumWalkersSent=0;
|
||||
myComm->allreduce(curData);
|
||||
measureProperties(iter);
|
||||
trialEnergy=EnsembleProperty.Energy;
|
||||
|
@ -283,12 +284,16 @@ void WalkerControlBase::Write2XYZ(MCWalkerConfiguration& W)
|
|||
}
|
||||
|
||||
|
||||
/** evaluate curData and mark the bad/good walkers
|
||||
/** evaluate curData and mark the bad/good walkers.
|
||||
*
|
||||
* Each good walker has a counter registering the
|
||||
* number of copies needed to be generated from this walker.
|
||||
* Bad walkers will be either recycled or removed later.
|
||||
*/
|
||||
int WalkerControlBase::sortWalkers(MCWalkerConfiguration& W)
|
||||
{
|
||||
MCWalkerConfiguration::iterator it(W.begin());
|
||||
std::vector<Walker_t*> bad,good_rn;
|
||||
std::vector<Walker_t*> good_rn;
|
||||
std::vector<int> ncopy_rn;
|
||||
NumWalkers=0;
|
||||
MCWalkerConfiguration::iterator it_end(W.end());
|
||||
|
@ -353,7 +358,7 @@ int WalkerControlBase::sortWalkers(MCWalkerConfiguration& W)
|
|||
}
|
||||
else
|
||||
{
|
||||
bad.push_back(*it);
|
||||
bad_w.push_back(*it);
|
||||
}
|
||||
++it;
|
||||
}
|
||||
|
@ -378,9 +383,6 @@ int WalkerControlBase::sortWalkers(MCWalkerConfiguration& W)
|
|||
//W.EnsembleProperty.Energy=(esum/=wsum);
|
||||
//W.EnsembleProperty.Variance=(e2sum/wsum-esum*esum);
|
||||
//W.EnsembleProperty.Variance=(e2sum*wsum-esum*esum)/(wsum*wsum-w2sum);
|
||||
//remove bad walkers empty the container
|
||||
for(int i=0; i<bad.size(); i++)
|
||||
delete bad[i];
|
||||
if (!WriteRN)
|
||||
{
|
||||
if(good_w.empty())
|
||||
|
@ -441,25 +443,58 @@ int WalkerControlBase::sortWalkers(MCWalkerConfiguration& W)
|
|||
return NumWalkers;
|
||||
}
|
||||
|
||||
/** copy good walkers to W
|
||||
*
|
||||
* Good walkers are copied based on the registered number of copies
|
||||
* Bad walkers are recycled to avoid memory allocation and deallocation.
|
||||
*/
|
||||
int WalkerControlBase::copyWalkers(MCWalkerConfiguration& W)
|
||||
{
|
||||
// save current good walker size.
|
||||
const int size_good_w = good_w.size();
|
||||
std::vector<int> copy_list;
|
||||
for(int i=0; i<size_good_w; i++)
|
||||
{
|
||||
for(int j=0; j<ncopy_w[i]; j++)
|
||||
{
|
||||
if(bad_w.empty())
|
||||
{
|
||||
good_w.push_back(nullptr);
|
||||
}
|
||||
else
|
||||
{
|
||||
good_w.push_back(bad_w.back());
|
||||
bad_w.pop_back();
|
||||
}
|
||||
copy_list.push_back(i);
|
||||
}
|
||||
}
|
||||
|
||||
#pragma omp parallel for
|
||||
for(int i=size_good_w; i<good_w.size(); i++)
|
||||
{
|
||||
auto &wRef=good_w[copy_list[i-size_good_w]];
|
||||
auto &awalker=good_w[i];
|
||||
if(awalker==nullptr)
|
||||
awalker=new Walker_t(*wRef);
|
||||
else
|
||||
*awalker=*wRef;
|
||||
// not fully sure this is correct or even used
|
||||
awalker->ID=(i-size_good_w)*NumContexts+MyContext;
|
||||
awalker->ParentID=wRef->ParentID;
|
||||
}
|
||||
|
||||
//clear the WalkerList to populate them with the good walkers
|
||||
W.clear();
|
||||
W.insert(W.begin(), good_w.begin(), good_w.end());
|
||||
int cur_walker = good_w.size();
|
||||
for(int i=0; i<good_w.size(); i++)
|
||||
//,ie+=ncols) {
|
||||
{
|
||||
for(int j=0; j<ncopy_w[i]; j++, cur_walker++)
|
||||
{
|
||||
Walker_t* awalker=new Walker_t(*(good_w[i]));
|
||||
awalker->ID=(++NumWalkersCreated)*NumContexts+MyContext;
|
||||
awalker->ParentID=good_w[i]->ParentID;
|
||||
W.push_back(awalker);
|
||||
}
|
||||
}
|
||||
|
||||
//remove bad walkers if there is any left
|
||||
for(int i=0; i<bad_w.size(); i++)
|
||||
delete bad_w[i];
|
||||
|
||||
//clear good_w and ncopy_w for the next branch
|
||||
good_w.clear();
|
||||
bad_w.clear();
|
||||
ncopy_w.clear();
|
||||
return W.getActiveWalkers();
|
||||
}
|
||||
|
@ -467,22 +502,26 @@ int WalkerControlBase::copyWalkers(MCWalkerConfiguration& W)
|
|||
bool WalkerControlBase::put(xmlNodePtr cur)
|
||||
{
|
||||
int nw_target=0, nw_max=0;
|
||||
std::string nonblocking="yes";
|
||||
ParameterSet params;
|
||||
params.add(targetEnergyBound,"energyBound","double");
|
||||
params.add(targetSigma,"sigmaBound","double");
|
||||
params.add(MaxCopy,"maxCopy","int");
|
||||
params.add(nw_target,"targetwalkers","int");
|
||||
params.add(nw_max,"max_walkers","int");
|
||||
params.add(nonblocking,"use_nonblocking","string");
|
||||
|
||||
bool success=params.put(cur);
|
||||
|
||||
setMinMax(nw_target,nw_max);
|
||||
use_nonblocking = nonblocking=="yes";
|
||||
app_log() << " WalkerControlBase parameters " << std::endl;
|
||||
//app_log() << " energyBound = " << targetEnergyBound << std::endl;
|
||||
//app_log() << " sigmaBound = " << targetSigma << std::endl;
|
||||
app_log() << " maxCopy = " << MaxCopy << std::endl;
|
||||
app_log() << " Max Walkers per node " << Nmax << std::endl;
|
||||
app_log() << " Min Walkers per node " << Nmin << std::endl;
|
||||
app_log() << " Using " << (use_nonblocking?"non-":"") << "blocking send/recv" << std::endl;
|
||||
return true;
|
||||
}
|
||||
|
||||
|
|
|
@ -125,12 +125,14 @@ public:
|
|||
std::vector<RealType> accumData;
|
||||
///any temporary data
|
||||
std::vector<RealType> curData;
|
||||
///temporary storage for good walkers
|
||||
std::vector<Walker_t*> good_w;
|
||||
///temporary storage for good and bad walkers
|
||||
std::vector<Walker_t*> good_w, bad_w;
|
||||
///temporary storage for copy counters
|
||||
std::vector<int> ncopy_w;
|
||||
///Add released-node fields to .dmc.dat file
|
||||
bool WriteRN;
|
||||
///Use non-blocking isend/irecv
|
||||
bool use_nonblocking;
|
||||
|
||||
/** default constructor
|
||||
*
|
||||
|
|
|
@ -41,6 +41,8 @@ DiracDeterminantCUDA::DiracDeterminantCUDA(SPOSetBasePtr const &spos, int first)
|
|||
newGradLaplList_d("DiracDeterminantBase::newGradLaplList_d"),
|
||||
AWorkList_d("DiracDeterminantBase::AWorkList_d"),
|
||||
AinvWorkList_d("DiracDeterminantBase::AinvWorkList_d"),
|
||||
PivotArray_d("DiracDeterminantBase::PivotArray_d"),
|
||||
infoArray_d("DiracDeterminantBase::infoArray_d"),
|
||||
GLList_d("DiracDeterminantBase::GLList_d"),
|
||||
ratio_d("DiracDeterminantBase::ratio_d"),
|
||||
gradLapl_d("DiracDeterminantBase::gradLapl_d"),
|
||||
|
@ -58,39 +60,6 @@ DiracDeterminantCUDA::DiracDeterminantCUDA(SPOSetBasePtr const &spos, int first)
|
|||
NLratios_d[i] = gpu::device_vector<CudaValueType>("DiracDeterminantBase::NLratios_d");
|
||||
}
|
||||
|
||||
DiracDeterminantCUDA::DiracDeterminantCUDA(const DiracDeterminantCUDA& s) :
|
||||
DiracDeterminantBase(s),
|
||||
UpdateJobList_d("DiracDeterminantBase::UpdateJobList_d"),
|
||||
srcList_d("DiracDeterminantBase::srcList_d"),
|
||||
destList_d("DiracDeterminantBase::destList_d"),
|
||||
AList_d("DiracDeterminantBase::AList_d"),
|
||||
AinvList_d("DiracDeterminantBase::AinvList_d"),
|
||||
newRowList_d("DiracDeterminantBase::newRowList_d"),
|
||||
AinvDeltaList_d("DiracDeterminantBase::AinvDeltaList_d"),
|
||||
AinvColkList_d("DiracDeterminantBase::AinvColkList_d"),
|
||||
gradLaplList_d("DiracDeterminantBase::gradLaplList_d"),
|
||||
newGradLaplList_d("DiracDeterminantBase::newGradLaplList_d"),
|
||||
AWorkList_d("DiracDeterminantBase::AWorkList_d"),
|
||||
AinvWorkList_d("DiracDeterminantBase::AinvWorkList_d"),
|
||||
GLList_d("DiracDeterminantBase::GLList_d"),
|
||||
ratio_d("DiracDeterminantBase::ratio_d"),
|
||||
gradLapl_d("DiracDeterminantBase::gradLapl_d"),
|
||||
iatList_d("DiracDeterminantBase::iatList_d"),
|
||||
NLrowBuffer_d("DiracDeterminantBase::NLrowBuffer_d"),
|
||||
SplineRowList_d("DiracDeterminantBase::SplineRowList_d"),
|
||||
RatioRowList_d("DiracDeterminantBase::RatioRowList_d"),
|
||||
NLposBuffer_d("DiracDeterminantBase::NLposBuffer_d"),
|
||||
NLAinvList_d("DiracDeterminantBase::NLAinvList_d"),
|
||||
NLnumRatioList_d("DiracDeterminantBase::NLnumRatioList_d"),
|
||||
NLelecList_d("DiracDeterminantBase::NLelecList_d"),
|
||||
NLratioList_d("DiracDeterminantBase::NLratioList_d")
|
||||
{
|
||||
for(int i = 0; i < 2; ++i)
|
||||
NLratios_d[i] = gpu::device_vector<CudaValueType>("DiracDeterminantBase::NLratios_d");
|
||||
}
|
||||
|
||||
|
||||
|
||||
/////////////////////////////////////
|
||||
// Vectorized evaluation functions //
|
||||
/////////////////////////////////////
|
||||
|
@ -412,9 +381,28 @@ DiracDeterminantCUDA::recompute(MCWalkerConfiguration &W, bool firstTime)
|
|||
// Invert
|
||||
bool useDoublePrecision = true;
|
||||
cublas_inverse (gpu::cublasHandle, AList_d.data(), AinvList_d.data(),
|
||||
AWorkList_d.data(), AinvWorkList_d.data(),
|
||||
AWorkList_d.data(), AinvWorkList_d.data(),
|
||||
PivotArray_d.data(), infoArray_d.data(),
|
||||
NumPtcls, RowStride, walkers.size(), useDoublePrecision);
|
||||
|
||||
// checking inversion status
|
||||
infoArray_host = infoArray_d;
|
||||
bool failed = false;
|
||||
for(int iw=0; iw<walkers.size(); iw++)
|
||||
if(infoArray_host[iw]!=0||infoArray_host[iw+walkers.size()]!=0)
|
||||
{
|
||||
failed = true;
|
||||
fprintf(stderr, "cublas_inverse failed on walker %d, getrf error %d, getri error %d.\n",
|
||||
iw, infoArray_host[iw], infoArray_host[iw+walkers.size()]);
|
||||
char name[1000];
|
||||
gethostname(name, 1000);
|
||||
fprintf(stderr, "Offending hostname = %s\n", name);
|
||||
int dev;
|
||||
cudaGetDevice(&dev);
|
||||
fprintf(stderr, "Offending device = %d\n", dev);
|
||||
}
|
||||
if(failed) abort();
|
||||
|
||||
#ifdef DEBUG_CUDA
|
||||
CudaValueType Ainv[NumPtcls][RowStride], A[NumPtcls][RowStride];
|
||||
//for (int iw=0; iw<walkers.size(); iw++)
|
||||
|
@ -940,6 +928,7 @@ DiracDeterminantCUDA::gradLapl (MCWalkerConfiguration &W, GradMatrix_t &grads,
|
|||
if (std::isnan(lapl(iw,iat+FirstIndex)))
|
||||
#endif
|
||||
{
|
||||
fprintf (stderr, "Offending walker = %d\n", iw);
|
||||
char name[1000];
|
||||
gethostname(name, 1000);
|
||||
fprintf (stderr, "Offending hostname = %s\n", name);
|
||||
|
@ -961,13 +950,13 @@ DiracDeterminantCUDA::gradLapl (MCWalkerConfiguration &W, GradMatrix_t &grads,
|
|||
fprintf (Ainv, "%14.8e+%14.8ei ", host_data[AinvOffset+i*RowStride+j].real(), host_data[AinvOffset+i*RowStride+j].imag());
|
||||
fprintf (Lmat, "%14.8e+%14.8ei ", host_data[gradLaplOffset+(4*i+3)*RowStride+j].real(), host_data[gradLaplOffset+(4*i+3)*RowStride+j].imag());
|
||||
for (int k=0; k<3; k++)
|
||||
fprintf (Lmat, "%14.8e+%14.8ei ", host_data[gradLaplOffset+(4*i+k)*RowStride+j].real(), host_data[gradLaplOffset+(4*i+k)*RowStride+j].imag());
|
||||
fprintf (Gmat, "%14.8e+%14.8ei ", host_data[gradLaplOffset+(4*i+k)*RowStride+j].real(), host_data[gradLaplOffset+(4*i+k)*RowStride+j].imag());
|
||||
#else
|
||||
fprintf (Amat, "%14.8e ", host_data[AOffset+i*RowStride+j]);
|
||||
fprintf (Ainv, "%14.8e ", host_data[AinvOffset+i*RowStride+j]);
|
||||
fprintf (Lmat, "%14.8e ", host_data[gradLaplOffset+(4*i+3)*RowStride+j]);
|
||||
for (int k=0; k<3; k++)
|
||||
fprintf (Lmat, "%14.8e ", host_data[gradLaplOffset+(4*i+k)*RowStride+j]);
|
||||
fprintf (Gmat, "%14.8e ", host_data[gradLaplOffset+(4*i+k)*RowStride+j]);
|
||||
#endif
|
||||
}
|
||||
fprintf (Amat, "\n");
|
||||
|
|
|
@ -40,7 +40,7 @@ public:
|
|||
typedef ParticleSet::Walker_t Walker_t;
|
||||
|
||||
DiracDeterminantCUDA(SPOSetBasePtr const &spos, int first=0);
|
||||
DiracDeterminantCUDA(const DiracDeterminantCUDA& s);
|
||||
DiracDeterminantCUDA(const DiracDeterminantCUDA& s) = delete;
|
||||
|
||||
protected:
|
||||
/////////////////////////////////////////////////////
|
||||
|
@ -60,6 +60,9 @@ protected:
|
|||
gpu::device_vector<CudaValueType*> srcList_d, destList_d, AList_d, AinvList_d, newRowList_d,
|
||||
AinvDeltaList_d, AinvColkList_d, gradLaplList_d,
|
||||
newGradLaplList_d, AWorkList_d, AinvWorkList_d, GLList_d;
|
||||
gpu::device_vector<int> PivotArray_d;
|
||||
gpu::device_vector<int> infoArray_d;
|
||||
gpu::host_vector<int> infoArray_host;
|
||||
gpu::device_vector<CudaValueType> ratio_d;
|
||||
gpu::host_vector<CudaValueType> ratio_host;
|
||||
gpu::device_vector<CudaValueType> gradLapl_d;
|
||||
|
@ -118,6 +121,9 @@ protected:
|
|||
// HACK HACK HACK
|
||||
// gradLapl_d.resize (numWalkers*NumOrbitals*4);
|
||||
// gradLapl_host.resize(numWalkers*NumOrbitals*4);
|
||||
infoArray_d.resize(numWalkers*2);
|
||||
infoArray_host.resize(numWalkers*2);
|
||||
PivotArray_d.resize(numWalkers*NumOrbitals);
|
||||
gradLapl_d.resize (numWalkers*RowStride*4);
|
||||
gradLapl_host.resize(numWalkers*RowStride*4);
|
||||
NLrowBuffer_d.resize(NLrowBufferRows*RowStride);
|
||||
|
|
|
@ -15,6 +15,10 @@
|
|||
|
||||
InfoStream::~InfoStream()
|
||||
{
|
||||
if (currStream != nullStream) {
|
||||
delete nullStream;
|
||||
}
|
||||
|
||||
if (ownStream && currStream) {
|
||||
delete(currStream);
|
||||
}
|
||||
|
|
|
@ -14,11 +14,10 @@
|
|||
#ifndef QMCPLUSPLUS_POOLEDMEMORY_H
|
||||
#define QMCPLUSPLUS_POOLEDMEMORY_H
|
||||
|
||||
#include <vector>
|
||||
#include <complex>
|
||||
#include <limits>
|
||||
#include <cstring>
|
||||
#include "simd/allocator.hpp"
|
||||
#include "OhmmsPETE/OhmmsVector.h"
|
||||
|
||||
namespace qmcplusplus
|
||||
{
|
||||
|
@ -38,12 +37,12 @@ struct PooledMemory
|
|||
{
|
||||
typedef char T;
|
||||
typedef T value_type;
|
||||
typedef typename std::vector<T>::size_type size_type;
|
||||
typedef typename Vector<T, Alloc>::size_type size_type;
|
||||
|
||||
const int scalar_multiplier;
|
||||
size_type Current, Current_scalar;
|
||||
T_scalar *Scalar_ptr;
|
||||
std::vector<T, Alloc> myData;
|
||||
Vector<T, Alloc> myData;
|
||||
|
||||
///default constructor
|
||||
inline PooledMemory():
|
||||
|
|
|
@ -13,12 +13,17 @@
|
|||
#include "catch.hpp"
|
||||
|
||||
#include "Utilities/PooledMemory.h"
|
||||
#include "Utilities/PooledData.h"
|
||||
#include "Utilities/Timer.h"
|
||||
#include <iostream>
|
||||
#include <stdio.h>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <random>
|
||||
#include <complex>
|
||||
|
||||
//#define CHECK_ALLOCATION_PERF
|
||||
|
||||
using std::string;
|
||||
using std::vector;
|
||||
using std::complex;
|
||||
|
@ -91,6 +96,36 @@ TEST_CASE("pack scalar", "[utilities]")
|
|||
REQUIRE(i6[0]==i6[0]);
|
||||
REQUIRE(i6[1]==i6[1]);
|
||||
REQUIRE(i6[2]==i6[2]);
|
||||
|
||||
#ifdef CHECK_ALLOCATION_PERF
|
||||
// up to 512 MB.
|
||||
for(size_t size_scale=15; size_scale<30; size_scale++)
|
||||
{
|
||||
const size_t size = (1<<size_scale)+17*8;
|
||||
{
|
||||
PooledMemory<double> pm_walker;
|
||||
pm_walker.Current = size;
|
||||
Timer PoolTimer;
|
||||
pm_walker.allocate();
|
||||
std::cout << "PooledMemory Allocate " << pm_walker.byteSize() << " bytes Time " << PoolTimer.elapsed() << std::endl;
|
||||
PoolTimer.restart();
|
||||
PooledMemory<double> pm_walker_copy(pm_walker);
|
||||
std::cout << "PooledMemory Copy Time " << PoolTimer.elapsed() << std::endl;
|
||||
}
|
||||
|
||||
{
|
||||
PooledData<double> pd_walker;
|
||||
Timer PoolTimer;
|
||||
pd_walker.resize(size/8);
|
||||
std::cout << "PooledData Allocate " << pd_walker.byteSize() << " bytes Time " << PoolTimer.elapsed() << std::endl;
|
||||
PoolTimer.restart();
|
||||
PooledData<double> pd_walker_copy(pd_walker);
|
||||
std::cout << "PooledData Copy Time " << PoolTimer.elapsed() << std::endl;
|
||||
}
|
||||
|
||||
std::cout << std::endl;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -29,7 +29,6 @@ QMCState::QMCState()
|
|||
use_density=false;
|
||||
dryrun=false;
|
||||
save_wfs=false;
|
||||
async_swap=false;
|
||||
io_node=true;
|
||||
mpi_groups=1;
|
||||
use_ewald=false;
|
||||
|
@ -60,10 +59,6 @@ void QMCState::initialize(int argc, char **argv)
|
|||
{
|
||||
save_wfs=(c.find("no")>=c.size());
|
||||
}
|
||||
else if(c.find("--async_swap") < c.size())
|
||||
{
|
||||
async_swap=(c.find("no")>=c.size());
|
||||
}
|
||||
else if(c.find("--help")< c.size())
|
||||
{
|
||||
stopit=true;
|
||||
|
@ -99,7 +94,7 @@ void QMCState::initialize(int argc, char **argv)
|
|||
std::cerr << " git last commit date: " << QMCPACK_GIT_COMMIT_LAST_CHANGED << std::endl;
|
||||
std::cerr << " git last commit subject: " << QMCPACK_GIT_COMMIT_SUBJECT << std::endl;
|
||||
#endif
|
||||
std::cerr << std::endl << "Usage: qmcpack input [--dryrun --save_wfs[=no] --async_swap[=no] --gpu]" << std::endl << std::endl;
|
||||
std::cerr << std::endl << "Usage: qmcpack input [--dryrun --save_wfs[=no] --gpu]" << std::endl << std::endl;
|
||||
}
|
||||
if(stopit)
|
||||
{
|
||||
|
@ -115,10 +110,6 @@ void QMCState::print_options(std::ostream& os)
|
|||
os << " dryrun : qmc sections will be ignored." << std::endl;
|
||||
if(save_wfs)
|
||||
os << " save_wfs=1 : save wavefunctions in hdf5. " << std::endl;
|
||||
if(async_swap)
|
||||
os << " async_swap=1 : using async isend/irecv for walker swaps " << std::endl;
|
||||
else
|
||||
os << " async_swap=0 : using blocking send/recv for walker swaps " << std::endl;
|
||||
}
|
||||
|
||||
void QMCState::print_memory_change(const std::string& who, size_t before)
|
||||
|
|
|
@ -37,8 +37,6 @@ struct QMCState
|
|||
bool dryrun;
|
||||
///true, if wave functions are stored for next runs
|
||||
bool save_wfs;
|
||||
///true, if walker swap is done by async
|
||||
bool async_swap;
|
||||
///true, print out file
|
||||
bool io_node;
|
||||
///true, use Ewald instead of optimal breakup for the Coulomb
|
||||
|
|
|
@ -37,7 +37,7 @@ IF (NOT QMC_CUDA)
|
|||
#
|
||||
# H4 starting from perturbed orbitals, using optimizable determinants, and the adaptive linear method
|
||||
#
|
||||
LIST(APPEND H4_ORB_OPT_SCALARS "totenergy" "-1.93045653 0.00837934") # total energy
|
||||
LIST(APPEND H4_ORB_OPT_SCALARS "totenergy" "-2.0889 0.001") # total energy
|
||||
|
||||
QMC_RUN_AND_CHECK(short-H4-orb-opt
|
||||
"${CMAKE_SOURCE_DIR}/tests/molecules/H4_ae"
|
||||
|
|
|
@ -47,5 +47,20 @@
|
|||
</sposet>
|
||||
</determinantset>
|
||||
|
||||
<jastrow name="J2" type="Two-Body" function="Bspline" print="yes">
|
||||
<correlation rcut="2" size="10" speciesA="u" speciesB="u">
|
||||
<coefficients id="uu" type="Array" optimize="no"> 0.04661440939 0.0555644703 0.1286335481 0.1077457768 0.0811381484 0.0626555805 0.0436904683 0.02811397453 0.01572772511 0.006258753264</coefficients>
|
||||
</correlation>
|
||||
<correlation rcut="2" size="10" speciesA="u" speciesB="d">
|
||||
<coefficients id="ud" type="Array" optimize="no"> 0.4465508582 0.3603308616 0.2893897103 0.2269436006 0.1717850082 0.1250275368 0.08538908372 0.05318480503 0.02806504868 0.01035981902</coefficients>
|
||||
</correlation>
|
||||
</jastrow>
|
||||
|
||||
<jastrow name="J1" type="One-Body" function="Bspline" source="ion0" print="yes">
|
||||
<correlation rcut="2" size="10" cusp="1" elementType="H">
|
||||
<coefficients id="eH" type="Array" optimize="no"> 0.0167926566 0.1405038909 0.1611311956 0.1198446618 0.06764720157 0.03602453588 0.02460661286 0.01952889643 0.01300432967 0.005621933949</coefficients>
|
||||
</correlation>
|
||||
</jastrow>
|
||||
|
||||
</wavefunction>
|
||||
</qmcsystem>
|
||||
|
|
Loading…
Reference in New Issue