Merge pull request #84 from atillack/qmcpack_gpu_complex

GPU Complex Implementation
This commit is contained in:
Paul R. C. Kent 2017-01-25 16:50:11 -05:00 committed by GitHub
commit 63480e964e
35 changed files with 1978 additions and 600 deletions

View File

@ -645,6 +645,9 @@ IF(QMC_CUDA)
else(CUDA_NVCC_FLAGS MATCHES "arch")
# Automatically set the default NVCC flags
SET(CUDA_NVCC_FLAGS "-Drestrict=__restrict__;-DNO_CUDA_MAIN")
if (QMC_COMPLEX)
SET(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};-DQMC_COMPLEX")
endif()
if ( CMAKE_BUILD_TYPE STREQUAL "DEBUG" )
SET(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};-g;-G")
else()

View File

@ -22,9 +22,14 @@
//
// 1. Make sure libcublas.so is in the search path
// 2. cd to build/ directory
// 3. nvcc -o cuda_inverse -arch=sm_35 -lcublas -DCUDA_TEST_MAIN
// 3. For real numbers, compile with
// nvcc -o cuda_inverse -arch=sm_35 -lcublas -DCUDA_TEST_MAIN
// ../src/Numerics/CUDA/cuda_inverse.cu
//
// For complex numbers, compile with
// nvcc -o cuda_inverse -arch=sm_35 -lcublas -DCUDA_TEST_MAIN
// -DQMC_COMPLEX=1 ../src/Numerics/CUDA/cuda_inverse.cu
//
// =============================================================== //
#include <cstdio>
@ -32,8 +37,10 @@
#include <sstream>
#include <vector>
#include <iostream>
#include <complex>
#include <cuda.h>
#include <cublas_v2.h>
#include <cuComplex.h>
#define CONVERT_BS 256
#define INVERSE_BS 16
@ -117,10 +124,31 @@ convert (Tdest **dest_list, Tsrc **src_list, int len)
mydest[i] = (Tdest) mysrc[i];
}
// Two matrix inversion functions
// Convert for complex numbers
template <typename Tdest, typename Tdest2, typename Tsrc>
__global__ void
convert_complex (Tdest **dest_list, Tsrc **src_list, int len)
{
__shared__ Tsrc *mysrc;
__shared__ Tdest *mydest;
if (threadIdx.x == 0)
{
mysrc = src_list[blockIdx.y];
mydest = dest_list[blockIdx.y];
}
__syncthreads();
int i = blockIdx.x * CONVERT_BS + threadIdx.x;
if (i < len) {
mydest[i].x = (Tdest2) mysrc[i].x;
mydest[i].y = (Tdest2) mysrc[i].y;
}
}
// Four matrix inversion functions
// 1. for float matrices
// useHigherPrecision = false --> single precision operations
// useHigherPrecision = true --> double precision operations
// useHigherPrecision = true --> double precision operations (default)
void
cublas_inverse (cublasHandle_t handle,
float *Alist_d[], float *Ainvlist_d[],
@ -228,6 +256,107 @@ cublas_inverse (cublasHandle_t handle,
}
// 3. for complex float matrices
// useHigherPrecision = false --> single precision operations
// useHigherPrecision = true --> double precision operations (default)
void
cublas_inverse (cublasHandle_t handle,
std::complex<float> *Alist_d[], std::complex<float> *Ainvlist_d[],
std::complex<float> *AWorklist_d[], std::complex<float> *AinvWorklist_d[],
int N, int rowStride, int numMats,
bool useHigherPrecision)
{
// Info array tells if a matrix inversion is successful
// = 0 : successful
// = k : U(k,k) = 0; inversion failed
int *infoArray;
callAndCheckError( cudaMalloc((void**) &infoArray, numMats * sizeof(int)), __LINE__ );
// If double precision operations are desired...
if (useHigherPrecision)
{
// (i) convert elements in Alist from float to double, put them in AWorklist
dim3 dimBlockConvert (CONVERT_BS);
dim3 dimGridConvert ((N*rowStride + (CONVERT_BS-1)) / CONVERT_BS, numMats);
convert_complex<cuDoubleComplex, double, cuComplex> <<< dimGridConvert, dimBlockConvert >>> ((cuDoubleComplex**)AWorklist_d, (cuComplex**)Alist_d, N*rowStride);
// (ii) call cublas to do matrix inversion
// LU decomposition
callAndCheckError( cublasZgetrfBatched( handle, N, (cuDoubleComplex**)AWorklist_d, rowStride, NULL, infoArray, numMats), __LINE__ );
// Inversion
#if (CUDA_VERSION >= 6050)
callAndCheckError( cublasZgetriBatched( handle, N, (const cuDoubleComplex**)AWorklist_d, rowStride, NULL, (cuDoubleComplex**)AinvWorklist_d, rowStride, infoArray, numMats), __LINE__ );
#else
callAndCheckError( cublasZgetriBatched( handle, N, (cuDoubleComplex**)AWorklist_d, rowStride, NULL, (cuDoubleComplex**)AinvWorklist_d, rowStride, infoArray, numMats), __LINE__ );
#endif
// (iii) convert results back to single precision
convert_complex<cuComplex, float, cuDoubleComplex> <<< dimGridConvert, dimBlockConvert >>> ((cuComplex**)Ainvlist_d, (cuDoubleComplex**)AinvWorklist_d, N*rowStride);
}
// else, carry out single precision operations
else
{
// Call cublas to do matrix inversion
// LU decomposition
callAndCheckError( cublasCgetrfBatched( handle, N, (cuComplex**)Alist_d, rowStride, NULL,
infoArray, numMats), __LINE__ );
// Inversion
#if (CUDA_VERSION >= 6050)
callAndCheckError( cublasCgetriBatched( handle, N, (const cuComplex**)Alist_d, rowStride, NULL, (cuComplex**)Ainvlist_d, rowStride, infoArray, numMats), __LINE__ );
#else
callAndCheckError( cublasCgetriBatched( handle, N, (cuComplex**)Alist_d, rowStride, NULL, (cuComplex**)Ainvlist_d, rowStride, infoArray, numMats), __LINE__ );
#endif
}
cudaDeviceSynchronize();
// Free resources
cudaFree(infoArray);
}
// 4. for complex double matrices
void
cublas_inverse (cublasHandle_t handle,
std::complex<double> *Alist_d[], std::complex<double> *Ainvlist_d[],
std::complex<double> *AWorklist_d[], std::complex<double> *AinvWorklist_d[],
int N, int rowStride, int numMats,
bool useHigherPrecision)
{
// Info array tells if a matrix inversion is successful
// = 0 : successful
// = k : U(k,k) = 0; inversion failed
int *infoArray;
callAndCheckError( cudaMalloc((void**) &infoArray, numMats * sizeof(int)), __LINE__ );
// (i) copy all the elements of Alist to AWorklist
dim3 dimBlockConvert (CONVERT_BS);
dim3 dimGridConvert ((N*rowStride + (CONVERT_BS-1)) / CONVERT_BS, numMats);
convert_complex<cuDoubleComplex, double, cuDoubleComplex> <<< dimGridConvert, dimBlockConvert >>> ((cuDoubleComplex**)AWorklist_d, (cuDoubleComplex**)Alist_d, N*rowStride);
// (ii) call cublas to do matrix inversion
// LU decomposition
callAndCheckError( cublasZgetrfBatched( handle, N, (cuDoubleComplex**)AWorklist_d, rowStride, NULL, infoArray, numMats), __LINE__ );
// Inversion
#if (CUDA_VERSION >= 6050)
callAndCheckError( cublasZgetriBatched( handle, N, (const cuDoubleComplex**)AWorklist_d, rowStride, NULL, (cuDoubleComplex**)Ainvlist_d, rowStride, infoArray, numMats), __LINE__ );
#else
callAndCheckError( cublasZgetriBatched( handle, N, (cuDoubleComplex**)AWorklist_d, rowStride, NULL, (cuDoubleComplex**)Ainvlist_d, rowStride, infoArray, numMats), __LINE__ );
#endif
cudaDeviceSynchronize();
cudaFree(infoArray);
}
//////////////////////////////////////////////////////
// Test routines //
//////////////////////////////////////////////////////
@ -270,8 +399,13 @@ test_cublas_inverse(int matSize, int numMats)
T* A = (T*) malloc(sizeof(T) * numMats * N * row_stride);
for (int j=0; j<numMats; j++)
for (int i=0; i<N*row_stride; i++)
for (int i=0; i<N*row_stride; i++) {
#ifndef QMC_COMPLEX
A[j*N*row_stride+i] = 1.0 * (drand48() - 0.5);
#else
A[j*N*row_stride+i] = T(1.0 * (drand48() - 0.5), 1.0 * (drand48() - 0.5));
#endif
}
// Allocate memory on device for each matrix
for (int mat=0; mat<numMats; mat++)
@ -328,10 +462,10 @@ test_cublas_inverse(int matSize, int numMats)
for (int i=0; i<N; i++)
for (int j=0; j<N; j++)
{
double val = 0.0;
T val = 0.0;
for (int k=0; k<N; k++)
val += Ainv[i*row_stride+k] * A[mat*N*row_stride+k*row_stride+j];
double diff = (i==j) ? (1.0f - val) : val;
double diff = (i==j) ? (1.0f - std::real(val)) : std::real(val);
error += diff * diff;
}
fprintf (stderr, "error = %1.8e\n", sqrt(error/(double)(N*N)));
@ -378,8 +512,13 @@ int main(int argc, char** argv)
exit(1);
}
#ifndef QMC_COMPLEX
test_cublas_inverse<double>(matSize, numMats);
test_cublas_inverse<float>(matSize, numMats);
#else
test_cublas_inverse<std::complex<double> >(matSize, numMats);
test_cublas_inverse<std::complex<float> >(matSize, numMats);
#endif
return 0;
}

View File

@ -15,6 +15,7 @@
#ifndef CUDA_INVERSE_H
#define CUDA_INVERSE_H
#include <complex>
#include <cublas_v2.h>
//////////////////////////////
@ -37,4 +38,25 @@ cublas_inverse (cublasHandle_t handle,
int N, int rowStride, int numMats,
bool useHigherPrecision = true);
//////////////////////////////////////
// Complex single / mixed precision //
//////////////////////////////////////
void
cublas_inverse (cublasHandle_t handle,
std::complex<float> *Alist_d[], std::complex<float> *Ainvlist_d[],
std::complex<float> *AWorkList_d[], std::complex<float> *AinvWorkList_d[],
int N, int rowStride, int numMats,
bool useHigherPrecision = true);
//////////////////////////////
// Complex double precision //
//////////////////////////////
void
cublas_inverse (cublasHandle_t handle,
std::complex<double> *Alist_d[], std::complex<double> *Ainvlist_d[],
std::complex<double> *AWorklist_d[], std::complex<double> *AinvWorklist_d[],
int N, int rowStride, int numMats,
bool useHigherPrecision = true);
#endif

View File

@ -565,31 +565,37 @@ void MCWalkerConfiguration::updateLists_GPU()
DataList_GPU.resize(nw);
}
hostlist.resize(nw);
hostlist_valueType.resize(nw);
hostlist_AA.resize(nw);
for (int iw=0; iw<nw; iw++)
{
if (WalkerList[iw]->R_GPU.size() != R.size())
std::cerr << "Error in R_GPU size for iw = " << iw << "!\n";
hostlist[iw] = (CUDA_PRECISION*)WalkerList[iw]->R_GPU.data();
hostlist[iw] = (CudaRealType*)WalkerList[iw]->R_GPU.data();
}
RList_GPU = hostlist;
for (int iw=0; iw<nw; iw++)
{
if (WalkerList[iw]->Grad_GPU.size() != R.size())
std::cerr << "Error in Grad_GPU size for iw = " << iw << "!\n";
hostlist[iw] = (CUDA_PRECISION*)WalkerList[iw]->Grad_GPU.data();
hostlist_valueType[iw] = (CudaValueType*)WalkerList[iw]->Grad_GPU.data();
}
GradList_GPU = hostlist;
GradList_GPU = hostlist_valueType;
for (int iw=0; iw<nw; iw++)
{
if (WalkerList[iw]->Lap_GPU.size() != R.size())
std::cerr << "Error in Lap_GPU size for iw = " << iw << "!\n";
hostlist[iw] = (CUDA_PRECISION*)WalkerList[iw]->Lap_GPU.data();
hostlist_valueType[iw] = (CudaValueType*)WalkerList[iw]->Lap_GPU.data();
}
LapList_GPU = hostlist;
LapList_GPU = hostlist_valueType;
for (int iw=0; iw<nw; iw++)
hostlist[iw] = WalkerList[iw]->cuda_DataSet.data();
DataList_GPU = hostlist;
hostlist_valueType[iw] = WalkerList[iw]->cuda_DataSet.data();
DataList_GPU = hostlist_valueType;
for (int isp=0; isp<NumSpecies; isp++)
{
for (int iw=0; iw<nw; iw++)
@ -629,10 +635,9 @@ void MCWalkerConfiguration::copyWalkersToGPU(bool copyGrad)
copyWalkerGradToGPU();
}
void MCWalkerConfiguration::copyWalkerGradToGPU()
{
Grad_host.resize(WalkerList[0]->R.size());
Grad_host.resize(WalkerList[0]->G.size());
for (int iw=0; iw<WalkerList.size(); iw++)
{
for (int i=0; i<WalkerList[iw]->size(); i++)
@ -642,7 +647,6 @@ void MCWalkerConfiguration::copyWalkerGradToGPU()
}
}
void MCWalkerConfiguration::proposeMove_GPU
(std::vector<PosType> &newPos, int iat)
{
@ -713,7 +717,7 @@ void MCWalkerConfiguration::NLMove_GPU(std::vector<Walker_t*> &walkers,
}
/***************************************************************************
* $RCSfile$ $Author$
* $Revision$ $Date$
* $Id$
* $RCSfile$ $Author: yingwai $
* $Revision: 7279 $ $Date: 2016-11-23 19:21:16 -0500 (Wed, 23 Nov 2016) $
* $Id: MCWalkerConfiguration.cpp 7279 2016-11-24 00:21:16Z yingwai $
***************************************************************************/

View File

@ -91,23 +91,27 @@ public:
// laplacians for each walker. These vectors .data() is often
// passed to GPU kernels.
#ifdef QMC_CUDA
gpu::device_vector<CUDA_PRECISION*> RList_GPU, GradList_GPU, LapList_GPU;
gpu::device_vector<CudaRealType*> RList_GPU;
gpu::device_vector<CudaValueType*> GradList_GPU, LapList_GPU;
// First index is the species. The second index is the walker
std::vector<gpu::device_vector<CUDA_PRECISION_FULL*> > RhokLists_GPU;
gpu::device_vector<CUDA_PRECISION*> DataList_GPU;
gpu::device_vector<TinyVector<CUDA_PRECISION,OHMMS_DIM> > Rnew_GPU;
gpu::host_vector<TinyVector<CUDA_PRECISION,OHMMS_DIM> > Rnew_host;
gpu::device_vector<CudaValueType*> DataList_GPU;
gpu::device_vector<CudaPosType> Rnew_GPU;
gpu::host_vector<CudaPosType> Rnew_host;
std::vector<PosType> Rnew;
gpu::device_vector<CudaRealType*> NLlist_GPU;
gpu::host_vector<CudaRealType*> NLlist_host;
gpu::host_vector<CudaRealType*> hostlist;
gpu::host_vector<CudaValueType*> hostlist_valueType;
gpu::host_vector<CUDA_PRECISION_FULL*> hostlist_AA;
gpu::host_vector<CudaPosType> R_host;
gpu::host_vector<CudaGradType> Grad_host;
gpu::device_vector<int> iatList_GPU;
gpu::host_vector<int> iatList_host;
gpu::device_vector<CUDA_PRECISION*> NLlist_GPU;
gpu::host_vector<CUDA_PRECISION*> NLlist_host;
std::vector<PosType> Rnew;
gpu::device_vector<int> AcceptList_GPU;
gpu::host_vector<int> AcceptList_host;
gpu::host_vector<CUDA_PRECISION*> hostlist;
gpu::host_vector<CUDA_PRECISION_FULL*> hostlist_AA;
gpu::host_vector<TinyVector<CUDA_PRECISION, OHMMS_DIM> > R_host;
gpu::host_vector<TinyVector<CudaValueType, OHMMS_DIM> > Grad_host;
void allocateGPU(size_t buffersize);
void copyWalkersToGPU(bool copyGrad=false);
void copyWalkerGradToGPU();
@ -411,7 +415,7 @@ private:
}
#endif
/***************************************************************************
* $RCSfile$ $Author$
* $Revision$ $Date$
* $Id$
* $RCSfile$ $Author: yingwai $
* $Revision: 7383 $ $Date: 2016-12-28 12:15:25 -0500 (Wed, 28 Dec 2016) $
* $Id: MCWalkerConfiguration.h 7383 2016-12-28 17:15:25Z yingwai $
st***************************************************************************/

View File

@ -75,10 +75,16 @@ struct Walker
/** typedef for value data type. */
typedef typename t_traits::ValueType ValueType;
#ifdef QMC_CUDA
/** typedef for real data type */
/** typedef for CUDA real data type */
typedef typename t_traits::CudaRealType CudaRealType;
/** typedef for value data type. */
/** typedef for CUDA value data type. */
typedef typename t_traits::CudaValueType CudaValueType;
/** array of particles */
typedef typename t_traits::CudaPosType CudaPosType;
/** array of gradients */
typedef typename t_traits::CudaGradType CudaGradType;
/** array of laplacians */
typedef typename t_traits::CudaValueType CudaLapType;
#endif
/** array of particles */
typedef typename p_traits::ParticlePos_t ParticlePos_t;
@ -140,13 +146,13 @@ struct Walker
/// Data for GPU-vectorized versions
#ifdef QMC_CUDA
static int cuda_DataSize;
typedef gpu::device_vector<CUDA_PRECISION> cuda_Buffer_t;
typedef gpu::device_vector<CudaValueType> cuda_Buffer_t;
cuda_Buffer_t cuda_DataSet;
// Note that R_GPU has size N+1. The last element contains the
// proposed position for single-particle moves.
gpu::device_vector<TinyVector<CudaRealType,OHMMS_DIM> > R_GPU;
gpu::device_vector<TinyVector<CudaValueType,OHMMS_DIM> > Grad_GPU;
gpu::device_vector<CUDA_PRECISION> Lap_GPU;
gpu::device_vector<CudaPosType> R_GPU;
gpu::device_vector<CudaGradType> Grad_GPU;
gpu::device_vector<CudaLapType> Lap_GPU;
gpu::device_vector<CUDA_PRECISION_FULL> Rhok_GPU;
int k_species_stride;
inline void resizeCuda(int size, int num_species, int num_k)
@ -421,10 +427,10 @@ struct Walker
//+R.size()*(DIM*2*sizeof(RealType)+(DIM+1)*sizeof(ValueType));//R+Drift+G+L
#ifdef QMC_CUDA
bsize += 3 *sizeof (int); // size and N and M
bsize += cuda_DataSize * sizeof(CUDA_PRECISION); // cuda_DataSet
bsize += R.size() * OHMMS_DIM * sizeof(CUDA_PRECISION); // R_GPU
bsize += R.size() * OHMMS_DIM * sizeof(CudaValueType); // Grad_GPU
bsize += R.size() * 1 * sizeof(CUDA_PRECISION); // Lap_GPU
bsize += cuda_DataSize * sizeof(CudaValueType); // cuda_DataSet
bsize += R.size() * OHMMS_DIM * sizeof(CudaPosType); // R_GPU
bsize += G.size() * OHMMS_DIM * sizeof(CudaValueType); // Grad_GPU
bsize += L.size() * 1 * sizeof(CudaLapType); // Lap_GPU
bsize += Rhok_GPU.size() * sizeof(CUDA_PRECISION_FULL); // Rhok
#endif
return bsize;
@ -453,23 +459,30 @@ struct Walker
m.Pack(&(PHindex[0]),PHindex.size());
#ifdef QMC_CUDA
// Pack GPU data
std::vector<CUDA_PRECISION> host_data;
std::vector<CudaValueType> host_data;
std::vector<CUDA_PRECISION_FULL> host_rhok;
std::vector<TinyVector<CUDA_PRECISION,OHMMS_DIM> > R_host;
std::vector<TinyVector<CudaValueType,OHMMS_DIM> > Grad_host;
std::vector<CUDA_PRECISION> host_lapl;
std::vector<CudaPosType> R_host;
std::vector<CudaGradType> Grad_host;
std::vector<CudaLapType> host_lapl;
cuda_DataSet.copyFromGPU(host_data);
R_GPU.copyFromGPU(R_host);
Grad_GPU.copyFromGPU(Grad_host);
Lap_GPU.copyFromGPU(host_lapl);
int size = host_data.size();
int N = R_host.size();
m.Pack(size);
m.Pack(N);
m.Pack(&(host_data[0]), host_data.size());
m.Pack(&(R_host[0][0]), OHMMS_DIM*R_host.size());
Grad_GPU.copyFromGPU(Grad_host);
#ifdef QMC_COMPLEX
m.Pack(reinterpret_cast<CudaRealType*>(&(host_data[0])), host_data.size()*2);
m.Pack(reinterpret_cast<CudaRealType*>(&(Grad_host[0][0])),OHMMS_DIM*Grad_host.size()*2);
m.Pack(reinterpret_cast<CudaRealType*>(&(host_lapl[0])), host_lapl.size()*2);
#else
m.Pack(&(host_data[0]), host_data.size());
m.Pack(&(Grad_host[0][0]), OHMMS_DIM*Grad_host.size());
Lap_GPU.copyFromGPU(host_lapl);
m.Pack(&(host_lapl[0]), host_lapl.size());
#endif
Rhok_GPU.copyFromGPU(host_rhok);
int M = host_rhok.size();
m.Pack(M);
@ -500,24 +513,33 @@ struct Walker
m.Unpack(&(PropertyHistory[iat][0]),PropertyHistory[iat].size());
m.Unpack(&(PHindex[0]),PHindex.size());
#ifdef QMC_CUDA
// Pack GPU data
std::vector<CUDA_PRECISION> host_data;
// Unpack GPU data
std::vector<CudaValueType> host_data;
std::vector<CUDA_PRECISION_FULL> host_rhok;
std::vector<TinyVector<CUDA_PRECISION,OHMMS_DIM> > R_host;
std::vector<CUDA_PRECISION> host_lapl;
std::vector<CudaPosType> R_host;
std::vector<CudaGradType> Grad_host;
std::vector<CudaLapType> host_lapl;
int size, N;
m.Unpack(size);
m.Unpack(N);
host_data.resize(size);
R_host.resize(N);
Grad_host.resize(N);
host_lapl.resize(N);
m.Unpack(&(host_data[0]), size);
cuda_DataSet = host_data;
m.Unpack(&(R_host[0][0]), OHMMS_DIM*N);
R_GPU = R_host;
m.Unpack(&(R_host[0][0]), OHMMS_DIM*N);
Grad_GPU = R_host;
#ifdef QMC_COMPLEX
m.Unpack(reinterpret_cast<CudaRealType*>(&(host_data[0])), size*2);
m.Unpack(reinterpret_cast<CudaRealType*>(&(Grad_host[0][0])),OHMMS_DIM*N*2);
m.Unpack(reinterpret_cast<CudaRealType*>(&(host_lapl[0])), N*2);
#else
m.Unpack(&(host_data[0]), size);
m.Unpack(&(Grad_host[0][0]), OHMMS_DIM*N);
m.Unpack(&(host_lapl[0]), N);
#endif
cuda_DataSet = host_data;
Grad_GPU = Grad_host;
Lap_GPU = host_lapl;
int M;
m.Unpack(M);
@ -543,7 +565,7 @@ std::ostream& operator<<(std::ostream& out, const Walker<RealType,PA>& rhs)
#endif
/***************************************************************************
* $RCSfile$ $Author$
* $Revision$ $Date$
* $Id$
* $RCSfile$ $Author: yingwai $
* $Revision: 7405 $ $Date: 2017-01-09 15:46:55 -0500 (Mon, 09 Jan 2017) $
* $Id: Walker.h 7405 2017-01-09 20:46:55Z yingwai $
***************************************************************************/

View File

@ -23,6 +23,7 @@
#include "Utilities/RandomGenerator.h"
#include "ParticleBase/RandomSeqGenerator.h"
#include "QMCDrivers/DriftOperators.h"
#include "type_traits/scalar_traits.h"
namespace qmcplusplus
@ -91,8 +92,13 @@ bool DMCcuda::run()
std::vector<PosType> delpos(nw);
std::vector<PosType> dr(nw);
std::vector<PosType> newpos(nw);
std::vector<ValueType> ratios(nw), rplus(nw), rminus(nw), R2prop(nw), R2acc(nw);
std::vector<PosType> oldG(nw), newG(nw);
//std::vector<ValueType> ratios(nw), rplus(nw), rminus(nw), R2prop(nw), R2acc(nw);
std::vector<ValueType> ratios(nw), rplus(nw), rminus(nw);
std::vector<RealType> R2prop(nw), R2acc(nw);
#ifdef QMC_COMPLEX
std::vector<RealType> ratios_real(nw);
#endif
std::vector<GradType> oldG(nw), newG(nw);
std::vector<ValueType> oldL(nw), newL(nw);
std::vector<Walker_t*> accepted(nw);
Matrix<ValueType> lapl(nw, nat);
@ -119,6 +125,9 @@ bool DMCcuda::run()
dr.resize(nw);
newpos.resize(nw);
ratios.resize(nw);
#ifdef QMC_COMPLEX
ratios_real.resize(nw);
#endif
rplus.resize(nw);
rminus.resize(nw);
oldG.resize(nw);
@ -156,9 +165,17 @@ bool DMCcuda::run()
{
delpos[iw] *= m_sqrttau;
oldScale[iw] = getDriftScale(m_tauovermass,oldG[iw]);
#ifdef QMC_COMPLEX
convert(oldScale[iw] * oldG[iw], dr[iw]);
dr[iw] += delpos[iw];
#else
dr[iw] = delpos[iw] + (oldScale[iw]*oldG[iw]);
#endif
newpos[iw]=W[iw]->R[iat] + dr[iw];
ratios[iw] = 1.0;
#ifdef QMC_COMPLEX
ratios_real[iw] = 1.0;
#endif
R2prop[iw] += dot(delpos[iw], delpos[iw]);
}
W.proposeMove_GPU(newpos, iat);
@ -171,21 +188,42 @@ bool DMCcuda::run()
std::vector<RealType> rand_v(nw);
for(int iw=0; iw<nw; ++iw)
{
#ifdef QMC_COMPLEX
PosType drOld = 0.0;
convert(oldScale[iw] * oldG[iw], drOld);
drOld = newpos[iw] - (W[iw]->R[iat] + drOld);
#else
PosType drOld =
newpos[iw] - (W[iw]->R[iat] + oldScale[iw]*oldG[iw]);
#endif
logGf_v[iw] = -m_oneover2tau * dot(drOld, drOld);
rand_v[iw] = Random();
}
Psi.addRatio(W,iat,ratios, newG, newL);
Psi.addRatio(W, iat, ratios, newG, newL);
#ifdef QMC_COMPLEX
Psi.convertRatiosFromComplexToReal(ratios, ratios_real);
#endif
for(int iw=0; iw<nw; ++iw)
{
newScale[iw] = getDriftScale(m_tauovermass,newG[iw]);
#ifdef QMC_COMPLEX
PosType drNew = 0.0;
convert(newScale[iw] * newG[iw], drNew);
drNew += newpos[iw] - W[iw]->R[iat];
#else
PosType drNew =
(newpos[iw] + newScale[iw]*newG[iw]) - W[iw]->R[iat];
#endif
RealType logGb = -m_oneover2tau * dot(drNew, drNew);
RealType x = logGb - logGf_v[iw];
#ifdef QMC_COMPLEX
RealType prob = ratios_real[iw]*ratios_real[iw]*std::exp(x);
if(acc[iw] && rand_v[iw] < prob && ratios_real[iw] > 0.0)
#else
RealType prob = ratios[iw]*ratios[iw]*std::exp(x);
if(acc[iw] && rand_v[iw] < prob && ratios[iw] > 0.0)
#endif
{
accepted.push_back(W[iw]);
nAccept++;
@ -284,10 +322,20 @@ bool DMCcuda::run()
RealType v2=0.0, v2bar=0.0;
for(int iat=0; iat<nat; iat++)
{
#ifdef QMC_COMPLEX
v2 += dot_real(W.G[iat],W.G[iat]);
#else
// should be removed when things work fine
v2 += dot(W.G[iat],W.G[iat]);
#endif
RealType newscale = getDriftScale(m_tauovermass,newG[iw]);
#ifdef QMC_COMPLEX
v2 += m_tauovermass * m_tauovermass * dot_real(newG[iw],newG[iw]);
v2bar += newscale * newscale * dot_real(newG[iw],newG[iw]);
#else
v2 += m_tauovermass * m_tauovermass * dot(newG[iw],newG[iw]);
v2bar += newscale * newscale * dot(newG[iw],newG[iw]);
#endif
}
//RealType scNew = std::sqrt(V2bar[iw] / V2[iw]);
RealType scNew = std::sqrt(v2bar/v2);
@ -346,8 +394,13 @@ bool DMCcuda::runWithNonlocal()
std::vector<PosType> delpos(nw);
std::vector<PosType> dr(nw);
std::vector<PosType> newpos(nw);
std::vector<ValueType> ratios(nw), rplus(nw), rminus(nw), R2prop(nw), R2acc(nw);
std::vector<PosType> oldG(nw), newG(nw);
//std::vector<ValueType> ratios(nw), rplus(nw), rminus(nw), R2prop(nw), R2acc(nw);
std::vector<ValueType> ratios(nw), rplus(nw), rminus(nw);
std::vector<RealType> R2prop(nw), R2acc(nw);
#ifdef QMC_COMPLEX
std::vector<RealType> ratios_real(nw);
#endif
std::vector<GradType> oldG(nw), newG(nw);
std::vector<ValueType> oldL(nw), newL(nw);
std::vector<Walker_t*> accepted(nw);
Matrix<ValueType> lapl(nw, nat);
@ -372,6 +425,9 @@ bool DMCcuda::runWithNonlocal()
dr.resize(nw);
newpos.resize(nw);
ratios.resize(nw);
#ifdef QMC_COMPLEX
ratios_real.resize(nw);
#endif
rplus.resize(nw);
rminus.resize(nw);
oldG.resize(nw);
@ -401,7 +457,12 @@ bool DMCcuda::runWithNonlocal()
{
delpos[iw] *= m_sqrttau;
oldScale[iw] = getDriftScale(m_tauovermass,oldG[iw]);
#ifdef QMC_COMPLEX
convert(oldScale[iw] * oldG[iw], dr[iw]);
dr[iw] += delpos[iw];
#else
dr[iw] = delpos[iw] + (oldScale[iw]*oldG[iw]);
#endif
newpos[iw]=W[iw]->R[iat] + dr[iw];
ratios[iw] = 1.0;
R2prop[iw] += dot(delpos[iw], delpos[iw]);
@ -414,21 +475,41 @@ bool DMCcuda::runWithNonlocal()
std::vector<RealType> rand_v(nw);
for(int iw=0; iw<nw; ++iw)
{
#ifdef QMC_COMPLEX
PosType drOld = 0.0;
convert(oldScale[iw] * oldG[iw], drOld);
drOld = newpos[iw] - (W[iw]->R[iat] + drOld);
#else
PosType drOld =
newpos[iw] - (W[iw]->R[iat] + oldScale[iw]*oldG[iw]);
#endif
logGf_v[iw] = -m_oneover2tau * dot(drOld, drOld);
rand_v[iw] = Random();
}
Psi.addRatio(W,iat,ratios,newG, newL);
Psi.addRatio(W, iat, ratios, newG, newL);
#ifdef QMC_COMPLEX
Psi.convertRatiosFromComplexToReal(ratios, ratios_real);
#endif
for(int iw=0; iw<nw; ++iw)
{
newScale[iw] = getDriftScale(m_tauovermass,newG[iw]);
#ifdef QMC_COMPLEX
PosType drNew = 0.0;
convert(newScale[iw] * newG[iw], drNew);
drNew += newpos[iw] - W[iw]->R[iat];
#else
PosType drNew =
(newpos[iw] + newScale[iw]*newG[iw]) - W[iw]->R[iat];
#endif
RealType logGb = -m_oneover2tau * dot(drNew, drNew);
RealType x = logGb - logGf_v[iw];
#ifdef QMC_COMPLEX
RealType prob = ratios_real[iw]*ratios_real[iw]*std::exp(x);
if(rand_v[iw] < prob && ratios_real[iw] > 0.0)
#else
RealType prob = ratios[iw]*ratios[iw]*std::exp(x);
if(rand_v[iw] < prob && ratios[iw] > 0.0)
#endif
{
accepted.push_back(W[iw]);
nAccept++;
@ -558,7 +639,7 @@ void DMCcuda::resetRun()
PointerPool<Walker_t::cuda_Buffer_t > pool;
Psi.reserve (pool);
app_log() << "Each walker requires "
<< pool.getTotalSize() * sizeof(CudaRealType)
<< pool.getTotalSize() * sizeof(CudaValueType)
<< " bytes in GPU memory.\n";
app_log() << "Preparing to allocate " << W.WalkerList.size()
<< " walkers.\n";

View File

@ -65,11 +65,11 @@ QMCCostFunctionCUDA::Return_t QMCCostFunctionCUDA::correlatedSampling(bool needD
int numParams = OptVariablesForPsi.size();
int numPtcl = W.getTotalNum();
std::vector<RealType> logpsi_new(nw), logpsi_fixed(nw), KE(nw);
TrialWaveFunction::ValueMatrix_t d_logpsi_dalpha, d_hpsioverpsi_dalpha;
TrialWaveFunction::GradMatrix_t fixedG(nw, numPtcl);
TrialWaveFunction::ValueMatrix_t fixedL(nw, numPtcl);
TrialWaveFunction::GradMatrix_t newG(nw, numPtcl);
TrialWaveFunction::ValueMatrix_t newL(nw, numPtcl);
RealMatrix_t d_logpsi_dalpha, d_hpsioverpsi_dalpha;
GradMatrix_t fixedG(nw, numPtcl);
ValueMatrix_t fixedL(nw, numPtcl);
GradMatrix_t newG(nw, numPtcl);
ValueMatrix_t newL(nw, numPtcl);
if (needDerivs)
{
d_logpsi_dalpha.resize(nw, numParams);

View File

@ -54,13 +54,14 @@ public:
protected:
Matrix<Return_t> Records;
typedef TrialWaveFunction::RealMatrix_t RealMatrix_t;
typedef TrialWaveFunction::ValueMatrix_t ValueMatrix_t;
typedef TrialWaveFunction::GradMatrix_t GradMatrix_t;
/** Temp derivative properties and Hderivative properties of all the walkers
*/
std::vector<std::vector<Return_t> > TempDerivRecords;
std::vector<std::vector<Return_t> > TempHDerivRecords;
ValueMatrix_t LogPsi_Derivs, LocE_Derivs;
RealMatrix_t LogPsi_Derivs, LocE_Derivs;
ValueMatrix_t d2logPsi_opt, d2logPsi_fixed;
GradMatrix_t dlogPsi_opt, dlogPsi_fixed;

View File

@ -22,6 +22,7 @@
#include "ParticleBase/RandomSeqGenerator.h"
#include "Message/CommOperators.h"
#include "QMCDrivers/DriftOperators.h"
#include "type_traits/scalar_traits.h"
#include "qmc_common.h"
namespace qmcplusplus
@ -63,6 +64,9 @@ void VMCcuda::advanceWalkers()
std::vector<PosType> delpos(nw);
std::vector<PosType> newpos(nw);
std::vector<ValueType> ratios(nw);
#ifdef QMC_COMPLEX
std::vector<RealType> ratios_real(nw);
#endif
std::vector<GradType> newG(nw);
std::vector<ValueType> newL(nw);
std::vector<Walker_t*> accepted(nw);
@ -75,19 +79,29 @@ void VMCcuda::advanceWalkers()
makeGaussRandomWithEngine(delpos,Random);
for(int iw=0; iw<nw; ++iw)
{
PosType G = W[iw]->G[iat];
GradType G = W[iw]->G[iat];
newpos[iw]=W[iw]->R[iat] + m_sqrttau*delpos[iw];
ratios[iw] = 1.0;
#ifdef QMC_COMPLEX
ratios_real[iw] = 1.0;
#endif
}
W.proposeMove_GPU(newpos, iat);
Psi.ratio(W,iat,ratios,newG, newL);
#ifdef QMC_COMPLEX
Psi.convertRatiosFromComplexToReal(ratios, ratios_real);
#endif
accepted.clear();
std::vector<bool> acc(nw, true);
if (W.UseBoundBox)
checkBounds (newpos, acc);
for(int iw=0; iw<nw; ++iw)
{
#ifdef QMC_COMPLEX
if(acc[iw] && ratios_real[iw]*ratios_real[iw] > Random())
#else
if(acc[iw] && ratios[iw]*ratios[iw] > Random())
#endif
{
accepted.push_back(W[iw]);
nAccept++;
@ -224,7 +238,10 @@ void VMCcuda::advanceWalkersWithDrift()
std::vector<PosType> dr(nw);
std::vector<PosType> newpos(nw);
std::vector<ValueType> ratios(nw);
std::vector<PosType> oldG(nw), newG(nw);
#ifdef QMC_COMPLEX
std::vector<RealType> ratios_real(nw);
#endif
std::vector<GradType> oldG(nw), newG(nw);
std::vector<ValueType> oldL(nw), newL(nw);
std::vector<Walker_t*> accepted(nw);
@ -239,9 +256,17 @@ void VMCcuda::advanceWalkersWithDrift()
for(int iw=0; iw<nw; iw++)
{
oldScale[iw] = getDriftScale(m_tauovermass,oldG[iw]);
#ifdef QMC_COMPLEX
convert(oldScale[iw]*oldG[iw], dr[iw]);
dr[iw] += m_sqrttau * delpos[iw];
#else
dr[iw] = (m_sqrttau*delpos[iw]) + (oldScale[iw]*oldG[iw]);
#endif
newpos[iw]=W[iw]->R[iat] + dr[iw];
ratios[iw] = 1.0;
#ifdef QMC_COMPLEX
ratios_real[iw] = 1.0;
#endif
}
W.proposeMove_GPU(newpos, iat);
Psi.calcRatio(W,iat,ratios,newG, newL);
@ -253,22 +278,41 @@ void VMCcuda::advanceWalkersWithDrift()
std::vector<RealType> rand_v(nw);
for(int iw=0; iw<nw; ++iw)
{
#ifdef QMC_COMPLEX
PosType drOld = 0.0;
convert(oldScale[iw] * oldG[iw], drOld);
drOld = newpos[iw] - (W[iw]->R[iat] + drOld);
#else
PosType drOld =
newpos[iw] - (W[iw]->R[iat] + oldScale[iw]*oldG[iw]);
#endif
logGf_v[iw] = -m_oneover2tau * dot(drOld, drOld);
rand_v[iw] = Random();
}
Psi.addRatio(W, iat, ratios, newG, newL);
#ifdef QMC_COMPLEX
Psi.convertRatiosFromComplexToReal(ratios, ratios_real);
#endif
for(int iw=0; iw<nw; ++iw)
{
newScale[iw] = getDriftScale(m_tauovermass,newG[iw]);
#ifdef QMC_COMPLEX
PosType drNew = 0.0;
convert(newScale[iw] * newG[iw], drNew);
drNew += newpos[iw] - W[iw]->R[iat];
#else
PosType drNew =
(newpos[iw] + newScale[iw]*newG[iw]) - W[iw]->R[iat];
#endif
// if (dot(drNew, drNew) > 25.0)
// std::cerr << "Large drift encountered! Drift = " << drNew << std::endl;
RealType logGb = -m_oneover2tau * dot(drNew, drNew);
RealType x = logGb - logGf_v[iw];
#ifdef QMC_COMPLEX
RealType prob = ratios_real[iw]*ratios_real[iw]*std::exp(x);
#else
RealType prob = ratios[iw]*ratios[iw]*std::exp(x);
#endif
if(acc[iw] && rand_v[iw] < prob)
{
accepted.push_back(W[iw]);
@ -413,7 +457,7 @@ void VMCcuda::resetRun()
PointerPool<Walker_t::cuda_Buffer_t > pool;
app_log() << "Starting VMCcuda::resetRun() " << std::endl;
Psi.reserve (pool);
app_log() << "Each walker requires " << pool.getTotalSize() * sizeof(CudaRealType)
app_log() << "Each walker requires " << pool.getTotalSize() * sizeof(CudaValueType)
<< " bytes in GPU memory.\n";
// Now allocate memory on the GPU card for each walker
// for (int iw=0; iw<W.WalkerList.size(); iw++) {

View File

@ -73,7 +73,7 @@ private:
opt_variables_type dummy;
int numParams;
Matrix<ValueType> d_logpsi_dalpha, d_hpsioverpsi_dalpha;
Matrix<RealType> d_logpsi_dalpha, d_hpsioverpsi_dalpha;
RealType w_beta,w_alpha;
RealType E_avg, V_avg;
std::string GEVtype;

View File

@ -372,9 +372,7 @@ struct BareKineticEnergy: public QMCHamiltonianBase
for (int iw=0; iw<walkers.size(); iw++)
{
Walker_t &w = *(walkers[iw]);
RealType KE = 0.0;
for (int ptcl=0; ptcl<w.G.size(); ptcl++)
KE -= 0.5*(dot (w.G[ptcl],w.G[ptcl]) + w.L[ptcl]);
RealType KE = - OneOver2M * (Dot(w.G,w.G)+Sum(w.L));
w.getPropertyBase()[NUMPROPERTIES+myIndex] = KE;
LocalEnergy[iw] += KE;
}
@ -396,8 +394,8 @@ struct BareKineticEnergy: public QMCHamiltonianBase
#endif
/***************************************************************************
* $RCSfile$ $Author$
* $Revision$ $Date$
* $Id$
* $RCSfile$ $Author: yingwai $
* $Revision: 7279 $ $Date: 2016-11-23 19:21:16 -0500 (Wed, 23 Nov 2016) $
* $Id: BareKineticEnergy.h 7279 2016-11-24 00:21:16Z yingwai $
***************************************************************************/

View File

@ -126,9 +126,15 @@ struct ConservedEnergy: public QMCHamiltonianBase
for (int iw=0; iw<walkers.size(); iw++)
{
Walker_t &w = *(walkers[iw]);
RealType flux = 0.0;
for (int ptcl=0; ptcl<w.G.size(); ptcl++)
flux +=2.0 * dot(w.G[ptcl],w.G[ptcl]) + w.L[ptcl];
RealType flux;
RealType gradsq = Dot(w.G,w.G);
RealType lap = Sum(w.L);
#ifdef QMC_COMPLEX
RealType gradsq_cc = Dot_CC(w.G,w.G);
flux = lap + gradsq + gradsq_cc;
#else
flux = lap + 2*gradsq;
#endif
w.getPropertyBase()[NUMPROPERTIES+myIndex] = flux;
}
}
@ -139,8 +145,8 @@ struct ConservedEnergy: public QMCHamiltonianBase
#endif
/***************************************************************************
* $RCSfile$ $Author$
* $Revision$ $Date$
* $Id$
* $RCSfile$ $Author: tillackaf $
* $Revision: 7408 $ $Date: 2017-01-10 13:29:49 -0500 (Tue, 10 Jan 2017) $
* $Id: ConservedEnergy.h 7408 2017-01-10 18:29:49Z tillackaf $
***************************************************************************/

View File

@ -303,7 +303,11 @@ void NonLocalECPotential_CUDA::addEnergy(MCWalkerConfiguration &W,
for (int iq=0; iq<numQuad; iq++)
{
RealType costheta = *(cos_ptr++);
#ifdef QMC_COMPLEX
RealType ratio = RatioList[ratioIndex++].real() * pp.sgridweight_m[iq]; // Abs(complex number)*cosine(phase of complex number) = Real part of said complex number
#else
RealType ratio = RatioList[ratioIndex++] * pp.sgridweight_m[iq];
#endif
// if (std::isnan(ratio)) {
// std::cerr << "NAN from ratio number " << ratioIndex-1 << "\n";
// std::cerr << "RatioList.size() = " << RatioList.size() << std::endl;
@ -422,7 +426,11 @@ void NonLocalECPotential_CUDA::addEnergy
for (int iq=0; iq<numQuad; iq++)
{
RealType costheta = *(cos_ptr++);
#ifdef QMC_COMPLEX
RealType ratio = RatioList[ratioIndex++].real() * pp.sgridweight_m[iq]; // Abs(complex number)*cosine(phase of complex number) = Real part of said complex number
#else
RealType ratio = RatioList[ratioIndex++] * pp.sgridweight_m[iq];
#endif
RealType lpolprev=0.0;
lpol[0] = 1.0;
for (int l=0 ; l< pp.lmax ; l++)

View File

@ -21,45 +21,11 @@
#include <einspline/multi_bspline_eval_cuda.h>
#include "Configuration.h"
#include "AtomicOrbitalCuda.h"
#include "PhaseFactors.h"
#ifdef HAVE_MKL
#include <mkl_vml.h>
#endif
void apply_phase_factors(float kPoints[], int makeTwoCopies[],
float pos[], float *phi_in[], float *phi_out[],
int num_splines, int num_walkers);
#ifdef ALGO_CHRISTOS
void apply_phase_factors(float kPoints[], int makeTwoCopies[],
float pos[], float *phi_in[], float *phi_out[],
float *GL_in[], float *GL_out[],
int num_splines, int num_walkers, int row_stride,
bool dontMakeTwoCopies);
#else
void apply_phase_factors(float kPoints[], int makeTwoCopies[],
float pos[], float *phi_in[], float *phi_out[],
float *GL_in[], float *GL_out[],
int num_splines, int num_walkers, int row_stride);
#endif
void apply_phase_factors(float kPoints[], int makeTwoCopies[], int TwoCopiesIndex[],
float pos[], float *phi_in[], float *phi_out[],
float *GL_in[], float *GL_out[],
int num_splines, int num_walkers, int row_stride);
void apply_phase_factors(double kPoints[], int makeTwoCopies[],
double pos[], double *phi_in[], double *phi_out[],
int num_splines, int num_walkers);
void apply_phase_factors(double kPoints[], int makeTwoCopies[],
double pos[], double *phi_in[], double *phi_out[],
double *GL_in[], double *GL_out[],
int num_splines, int num_walkers, int row_stride);
void apply_phase_factors(double kPoints[], int makeTwoCopies[], int TwoCopiesIndex[],
double pos[], double *phi_in[], double *phi_out[],
double *GL_in[], double *GL_out[],
int num_splines, int num_walkers, int row_stride);
namespace qmcplusplus
{
inline void create_multi_UBspline_3d_cuda (multi_UBspline_3d_d *in,
@ -224,7 +190,6 @@ eval_multi_multi_UBspline_3d_cuda (multi_UBspline_3d_z_cuda *spline,
eval_multi_multi_UBspline_3d_z_cuda (spline, pos, phi, N);
}
inline void eval_multi_multi_UBspline_3d_vgl_cuda
(multi_UBspline_3d_c_cuda *spline, float *pos, float Linv[],
std::complex<float> *phi[], std::complex<float> *grad_lapl[], int N, int row_stride)
@ -241,7 +206,6 @@ inline void eval_multi_multi_UBspline_3d_vgl_cuda
(spline, pos, Linv, phi, grad_lapl, N, row_stride);
}
//////////////////////////////////////////////
// Vectorized evaluation routines using GPU //
//////////////////////////////////////////////
@ -323,6 +287,31 @@ EinsplineSetExtended<std::complex<double> >::evaluate
}
#ifdef QMC_COMPLEX
template<> void
EinsplineSetExtended<double>::evaluate
(std::vector<Walker_t*> &walkers, int iat,
gpu::device_vector<CudaComplexType*> &phi)
{
app_error() << "Code should not arrive at this point: "
<< "EinsplineSetExtended<double>::evaluate at line "
<< __LINE__ << " in file " << __FILE__ << "\n";
abort();
}
template<> void
EinsplineSetExtended<std::complex<double> >::evaluate
(std::vector<Walker_t*> &walkers, int iat,
gpu::device_vector<CudaComplexType*> &phi)
{
app_error() << "EinsplineSetExtended<std::complex<double> >::evaluate at line " << __LINE__
<< " in file " << __FILE__
<< " not yet implemented.\n";
abort();
}
#endif
template<> void
EinsplineSetExtended<double>::evaluate
(std::vector<Walker_t*> &walkers, std::vector<PosType> &newpos,
@ -454,6 +443,32 @@ EinsplineSetExtended<std::complex<double> >::evaluate
walkers.size());
}
#ifdef QMC_COMPLEX
template<> void
EinsplineSetExtended<double>::evaluate
(std::vector<Walker_t*> &walkers, std::vector<PosType> &newpos,
gpu::device_vector<CudaComplexType*> &phi)
{
app_error() << "Code should not arrive at this point: "
<< "EinsplineSetExtended<double>::evaluate at line "
<< __LINE__ << " in file " << __FILE__ << "\n";
abort();
}
template<> void
EinsplineSetExtended<std::complex<double> >::evaluate
(std::vector<Walker_t*> &walkers, std::vector<PosType> &newpos,
gpu::device_vector<CudaComplexType*> &phi)
{
app_error() << "EinsplineSetExtended<std::complex<double>>::evaluate at line " << __LINE__
<< " in file " << __FILE__
<< " not yet implemented.\n";
abort();
}
#endif
template<> void
EinsplineSetExtended<double>::evaluate
(std::vector<Walker_t*> &walkers, std::vector<PosType> &newpos,
@ -554,25 +569,6 @@ EinsplineSetExtended<std::complex<double> >::evaluate
hostPos[iw] = newpos[iw];
cudapos = hostPos;
#ifdef ALGO_CHRISTOS
// Slower for some runs
// christos: is MakeTwoCopies consistent with CudaMakeTwoCopies?
bool noTwoCopies = true;
for (int i=0 ; i<MakeTwoCopies.size() ; i++)
if (MakeTwoCopies[i]) {
noTwoCopies = false;
break;
}
// std::cout << "noTwoCopies=" << noTwoCopies << std::endl;
apply_phase_factors ((CUDA_PRECISION*) CudakPoints.data(),
CudaMakeTwoCopies.data(),
(CudaRealType*)cudapos.data(),
(CudaRealType**)CudaValuePointers.data(), phi.data(),
(CudaRealType**)CudaGradLaplPointers.data(), grad_lapl.data(),
CudaMultiSpline->num_splines, walkers.size(), row_stride,
noTwoCopies);
#else
/* Original implementation
apply_phase_factors ((CudaRealType*) CudakPoints.data(),
CudaMakeTwoCopies.data(),
@ -589,10 +585,67 @@ EinsplineSetExtended<std::complex<double> >::evaluate
(CudaRealType**)CudaValuePointers.data(), phi.data(),
(CudaRealType**)CudaGradLaplPointers.data(), grad_lapl.data(),
CudaMultiSpline->num_splines, walkers.size(), row_stride);
#endif
}
#ifdef QMC_COMPLEX
template<> void
EinsplineSetExtended<double>::evaluate
(std::vector<Walker_t*> &walkers, std::vector<PosType> &newpos,
gpu::device_vector<CudaComplexType*> &phi,
gpu::device_vector<CudaComplexType*> &grad_lapl,
int row_stride)
{
app_error() << "Code should not arrive at this point: "
<< "EinsplineSetExtended<double>::evaluate at line "
<< __LINE__ << " in file " << __FILE__ << "\n";
abort();
}
template<> void
EinsplineSetExtended<std::complex<double> >::evaluate
(std::vector<Walker_t*> &walkers, std::vector<PosType> &newpos,
gpu::device_vector<CudaComplexType*> &phi,
gpu::device_vector<CudaComplexType*> &grad_lapl,
int row_stride)
{
int N = walkers.size();
int M = CudaMultiSpline->num_splines;
if (CudaValuePointers.size() < N)
resize_cuda(N);
if (cudapos.size() < N)
{
hostPos.resize(N);
cudapos.resize(N);
}
for (int iw=0; iw < N; iw++)
{
PosType r = newpos[iw];
PosType ru(PrimLattice.toUnit(r));
ru[0] -= std::floor (ru[0]);
ru[1] -= std::floor (ru[1]);
ru[2] -= std::floor (ru[2]);
hostPos[iw] = ru;
}
cudapos = hostPos;
eval_multi_multi_UBspline_3d_vgl_cuda
(CudaMultiSpline, (CudaRealType*)cudapos.data(), Linv_cuda.data(), CudaValuePointers.data(),
CudaGradLaplPointers.data(), N, CudaMultiSpline->num_splines);
// Now, add on phases
for (int iw=0; iw < N; iw++)
hostPos[iw] = newpos[iw];
cudapos = hostPos;
apply_phase_factors ((CudaRealType*) CudakPoints.data(),
(CudaRealType*) cudapos.data(),
(CudaValueType**) CudaValuePointers.data(),
(CudaValueType**) phi.data(),
(CudaValueType**) CudaGradLaplPointers.data(),
(CudaValueType**) grad_lapl.data(),
CudaMultiSpline->num_splines, walkers.size(), row_stride);
}
#endif
template<> void
EinsplineSetExtended<double>::evaluate
(std::vector<PosType> &pos, gpu::device_vector<CudaRealType*> &phi)
@ -634,8 +687,9 @@ template<> void
EinsplineSetExtended<double>::evaluate
(std::vector<PosType> &pos, gpu::device_vector<CudaComplexType*> &phi)
{
app_error() << "EinsplineSetExtended<std::complex<double> >::evaluate "
<< "not yet implemented.\n";
app_error() << "EinsplineSetExtended<std::complex<double> >::evaluate at "
<< __LINE__ << " in file " << __FILE__
<< " not yet implemented.\n";
abort();
}
@ -682,9 +736,57 @@ template<> void
EinsplineSetExtended<std::complex<double> >::evaluate
(std::vector<PosType> &pos, gpu::device_vector<CudaComplexType*> &phi)
{
app_error() << "EinsplineSetExtended<std::complex<double> >::evaluate "
<< "not yet implemented.\n";
#ifdef QMC_COMPLEX
int N = pos.size();
if (CudaValuePointers.size() < N)
resize_cuda(N);
if (cudapos.size() < N)
{
hostPos.resize(N);
cudapos.resize(N);
}
for (int iw=0; iw < N; iw++)
{
PosType r = pos[iw];
PosType ru(PrimLattice.toUnit(r));
for (int i=0; i<OHMMS_DIM; i++)
ru[i] -= std::floor (ru[i]);
hostPos[iw] = ru;
}
cudapos = hostPos;
// AT debug:
// std::cout << "# splines: " << CudaMultiSpline->num_splines << "\n";
eval_multi_multi_UBspline_3d_cuda
(CudaMultiSpline, (CudaRealType*) cudapos.data(),
CudaValuePointers.data(), N);
// Now, add on phases
for (int iw=0; iw < N; iw++)
hostPos[iw] = pos[iw];
cudapos = hostPos;
apply_phase_factors((CudaRealType*) CudakPoints.data(),
(CudaRealType*) cudapos.data(),
CudaValuePointers.data(),
phi.data(),
CudaMultiSpline->num_splines, N);
// AT debug:
/* gpu::host_vector<CudaValueType*> pointers;
pointers = CudaValuePointers;
CudaValueType data[N], data_new[N];
cudaMemcpy (data, pointers[0], N*sizeof(CudaValueType), cudaMemcpyDeviceToHost);
pointers = phi;
cudaMemcpy (data_new, pointers[0], N*sizeof(CudaValueType), cudaMemcpyDeviceToHost);
std::cout << "CudaValuePointers -> phi (# splines: " << CudaMultiSpline->num_splines << "):\n";
for (int i=0; i<N; i++)
std::cout << i << ": " << data[i].real() << " + " << data[i].imag() << "i -> " << data_new[i].real() << " + " << data_new[i].imag() << "i\n";
std::cout.flush();
abort();*/
#else
app_error() << "EinsplineSetExtended<std::complex<double> >::evaluate at "
<< __LINE__ << " in file " << __FILE__
<< " not yet implemented.\n";
abort();
#endif
}
@ -1079,8 +1181,6 @@ EinsplineSetHybrid<double>::evaluate (std::vector<Walker_t*> &walkers, std::vect
fprintf (stderr, " N img dist ion lMax\n");
for (int i=0; i<HybridData_CPU.size(); i++)
{
//YingWai's trial fix
//HybridDataFloat &d = HybridData_CPU[i];
HybridData<CudaRealType> &d = HybridData_CPU[i];
fprintf (stderr, " %d %2.0f %2.0f %2.0f %8.5f %d %d\n",
i, d.img[0], d.img[1], d.img[2], d.dist, d.ion, d.lMax);
@ -1392,8 +1492,6 @@ EinsplineSetHybrid<std::complex<double> >::evaluate
gpu::host_vector<CudaRealType> vals_CPU(NumOrbitals), GL_CPU(4*row_stride);
gpu::host_vector<HybridJobType> HybridJobs_CPU(HybridJobs_GPU.size());
HybridJobs_CPU = HybridJobs_GPU;
//YingWai's trial fix
//gpu::host_vector<HybridDataFloat> HybridData_CPU(HybridData_GPU.size());
gpu::host_vector<HybridData<CudaRealType> > HybridData_CPU(HybridData_GPU.size());
HybridData_CPU = HybridData_GPU;
rhats_CPU = rhats_GPU;
@ -1409,8 +1507,6 @@ EinsplineSetHybrid<std::complex<double> >::evaluate
ComplexGradVector_t CPUzgrad(M);
ValueVector_t CPUvals(NumOrbitals), CPUlapl(NumOrbitals);
GradVector_t CPUgrad(NumOrbitals);
//YingWai's trial fix
//HybridDataFloat &d = HybridData_CPU[iw];
HybridData<CudaRealType> &d = HybridData_CPU[iw];
AtomicOrbital<std::complex<double> > &atom = AtomicOrbitals[d.ion];
atom.evaluate (newpos[iw], CPUzvals, CPUzgrad, CPUzlapl);
@ -1599,8 +1695,6 @@ EinsplineSetHybrid<std::complex<double> >::evaluate
fprintf (stderr, " N img dist ion lMax\n");
for (int i=0; i<HybridData_CPU.size(); i++)
{
//YingWai's trial fix
//HybridDataFloat &d = HybridData_CPU[i];
HybridData<CudaRealType> &d = HybridData_CPU[i];
fprintf (stderr, " %d %2.0f %2.0f %2.0f %8.5f %d %d\n",
i, d.img[0], d.img[1], d.img[2], d.dist, d.ion, d.lMax);

View File

@ -362,7 +362,7 @@ public:
}
virtual void
reserve (PointerPool<gpu::device_vector<CudaRealType> > &pool)
reserve (PointerPool<gpu::device_vector<CudaValueType> > &pool)
{
std::cerr << "Need specialization of DiracDetermiantBase::reserve.\n";
abort();
@ -464,7 +464,7 @@ public:
}
#endif
/***************************************************************************
* $RCSfile$ $Author$
* $Revision$ $Date$
* $Id$
* $RCSfile$ $Author: yingwai $
* $Revision: 7279 $ $Date: 2016-11-23 19:21:16 -0500 (Wed, 23 Nov 2016) $
* $Id: DiracDeterminantBase.h 7279 2016-11-24 00:21:16Z yingwai $
***************************************************************************/

View File

@ -55,7 +55,7 @@ DiracDeterminantCUDA::DiracDeterminantCUDA(SPOSetBasePtr const &spos, int first)
NLratioList_d("DiracDeterminantBase::NLratioList_d")
{
for(int i = 0; i < 2; ++i)
NLratios_d[i] = gpu::device_vector<CudaRealType>("DiracDeterminantBase::NLratios_d");
NLratios_d[i] = gpu::device_vector<CudaValueType>("DiracDeterminantBase::NLratios_d");
}
DiracDeterminantCUDA::DiracDeterminantCUDA(const DiracDeterminantCUDA& s) :
@ -86,7 +86,7 @@ DiracDeterminantCUDA::DiracDeterminantCUDA(const DiracDeterminantCUDA& s) :
NLratioList_d("DiracDeterminantBase::NLratioList_d")
{
for(int i = 0; i < 2; ++i)
NLratios_d[i] = gpu::device_vector<CudaRealType>("DiracDeterminantBase::NLratios_d");
NLratios_d[i] = gpu::device_vector<CudaValueType>("DiracDeterminantBase::NLratios_d");
}
@ -161,18 +161,22 @@ DiracDeterminantCUDA::update (std::vector<Walker_t*> &walkers, int iat)
// // Copy temporary gradients and laplacians into matrix
// multi_copy (destList_d.data(), srcList_d.data(),
// 4*RowStride, walkers.size());
// Check matrix inversion + fixing the CUDA debugging code
#ifdef DEBUG_CUDA
float Ainv[NumPtcls][NumPtcls], A[NumPtcls][NumPtcls];
float new_row[NumPtcls], Ainv_delta[NumPtcls];
for (int iw=0; iw<walkers.size(); iw++)
{
CudaValueType Ainv[NumPtcls][RowStride], A[NumPtcls][RowStride];
CudaValueType new_row[RowStride]; //Ainv_delta[NumPtcls];
//for (int iw=0; iw<walkers.size(); iw++)
//{
int iw = 0;
Walker_t::cuda_Buffer_t &data = walkers[iw]->cuda_DataSet;
cudaMemcpy (A, &(data.data()[AOffset]),
NumPtcls*NumPtcls*sizeof(CudaValueType), cudaMemcpyDeviceToHost);
NumPtcls*RowStride*sizeof(CudaValueType), cudaMemcpyDeviceToHost);
cudaMemcpy (Ainv, &(data.data()[AinvOffset]),
NumPtcls*NumPtcls*sizeof(CudaValueType), cudaMemcpyDeviceToHost);
NumPtcls*RowStride*sizeof(CudaValueType), cudaMemcpyDeviceToHost);
cudaMemcpy (new_row, &(data.data()[newRowOffset]),
NumPtcls*sizeof(CudaValueType), cudaMemcpyDeviceToHost);
RowStride*sizeof(CudaValueType), cudaMemcpyDeviceToHost);
// for (int i=0; i<NumPtcls; i++)
// cerr << "new_row(" << i << ") = " << new_row[i]
// << " old_row = " << A[iat-FirstIndex][i] << std::endl;
@ -192,10 +196,10 @@ DiracDeterminantCUDA::update (std::vector<Walker_t*> &walkers, int iat)
for (int i=0; i<NumPtcls; i++)
for (int j=0; j<NumPtcls; j++)
{
float val = 0.0;
CudaValueType val=0.0;
for (int k=0; k<NumPtcls; k++)
val += Ainv[i][k]*A[k][j];
if (i==j && (std::abs(val-1.0) > 1.0e-2))
if (i==j && (std::abs(std::real(val)-1.0) > 1.0e-2))
std::cerr << "Error in inverse at (i,j) = (" << i << ", " << j
<< ") val = " << val << " walker = " << iw
<< " of " << walkers.size() << std::endl;
@ -204,7 +208,7 @@ DiracDeterminantCUDA::update (std::vector<Walker_t*> &walkers, int iat)
<< ") val = " << val << " walker = " << iw
<< " of " << walkers.size() << std::endl;
}
}
//}
#endif
}
@ -278,17 +282,18 @@ DiracDeterminantCUDA::update (const std::vector<Walker_t*> &walkers,
// multi_copy (destList_d.data(), srcList_d.data(),
// 4*RowStride, walkers.size());
#ifdef DEBUG_CUDA
float Ainv[NumPtcls][NumPtcls], A[NumPtcls][NumPtcls];
float new_row[NumPtcls], Ainv_delta[NumPtcls];
for (int iw=0; iw<walkers.size(); iw++)
{
CudaValueType Ainv[NumPtcls][RowStride], A[NumPtcls][RowStride];
CudaValueType new_row[RowStride]; //Ainv_delta[NumPtcls];
//for (int iw=0; iw<walkers.size(); iw++)
//{
int iw = 0;
Walker_t::cuda_Buffer_t &data = walkers[iw]->cuda_DataSet;
cudaMemcpy (A, &(data.data()[AOffset]),
NumPtcls*NumPtcls*sizeof(CudaValueType), cudaMemcpyDeviceToHost);
NumPtcls*RowStride*sizeof(CudaValueType), cudaMemcpyDeviceToHost);
cudaMemcpy (Ainv, &(data.data()[AinvOffset]),
NumPtcls*NumPtcls*sizeof(CudaValueType), cudaMemcpyDeviceToHost);
NumPtcls*RowStride*sizeof(CudaValueType), cudaMemcpyDeviceToHost);
cudaMemcpy (new_row, &(data.data()[newRowOffset]),
NumPtcls*sizeof(CudaValueType), cudaMemcpyDeviceToHost);
RowStride*sizeof(CudaValueType), cudaMemcpyDeviceToHost);
// for (int i=0; i<NumPtcls; i++)
// cerr << "new_row(" << i << ") = " << new_row[i]
// << " old_row = " << A[iat-FirstIndex][i] << std::endl;
@ -308,10 +313,10 @@ DiracDeterminantCUDA::update (const std::vector<Walker_t*> &walkers,
for (int i=0; i<NumPtcls; i++)
for (int j=0; j<NumPtcls; j++)
{
float val = 0.0;
CudaValueType val=0.0;
for (int k=0; k<NumPtcls; k++)
val += Ainv[i][k]*A[k][j];
if (i==j && (std::abs(val-1.0) > 1.0e-2))
if (i==j && (std::abs(std::real(val)-1.0) > 1.0e-2))
std::cerr << "Error in inverse at (i,j) = (" << i << ", " << j
<< ") val = " << val << " walker = " << iw
<< " of " << walkers.size() << std::endl;
@ -320,7 +325,7 @@ DiracDeterminantCUDA::update (const std::vector<Walker_t*> &walkers,
<< ") val = " << val << " walker = " << iw
<< " of " << walkers.size() << std::endl;
}
}
//}
#endif
}
@ -410,6 +415,35 @@ DiracDeterminantCUDA::recompute(MCWalkerConfiguration &W, bool firstTime)
AWorkList_d.data(), AinvWorkList_d.data(),
NumPtcls, RowStride, walkers.size(), useDoublePrecision);
#ifdef DEBUG_CUDA
CudaValueType Ainv[NumPtcls][RowStride], A[NumPtcls][RowStride];
//for (int iw=0; iw<walkers.size(); iw++)
//{
int iw = 0;
Walker_t::cuda_Buffer_t &data = walkers[iw]->cuda_DataSet;
cudaMemcpy (A, &(data.data()[AOffset]),
NumPtcls*RowStride*sizeof(CudaValueType), cudaMemcpyDeviceToHost);
cudaMemcpy (Ainv, &(data.data()[AinvOffset]),
NumPtcls*RowStride*sizeof(CudaValueType), cudaMemcpyDeviceToHost);
FILE *f1, *f2;
f1 = fopen("A.dat", "a");
f2 = fopen("Ainv.dat", "a");
for (int i=0; i<NumPtcls; i++)
for (int j=0; j<NumPtcls; j++)
{
#ifdef QMC_COMPLEX
fprintf(f1, "%5d %5d %20.15e %20.15e\n", i, j, A[i][j].real(), A[i][j].imag());
fprintf(f2, "%5d %5d %20.15e %20.15e\n", i, j, Ainv[i][j].real(), Ainv[i][j].imag());
#else
fprintf(f1, "%5d %5d %20.15e\n", i, j, A[i][j]);
fprintf(f2, "%5d %5d %20.15e\n", i, j, Ainv[i][j]);
#endif
}
//}
fclose(f1);
fclose(f2);
#endif
// HACK HACK HACK
// app_log() << "After recompute:\n";
// double A[NumPtcls*NumPtcls], work[NumPtcls*NumPtcls];
@ -479,10 +513,9 @@ DiracDeterminantCUDA::addLog (MCWalkerConfiguration &W, std::vector<RealType> &l
for (int iw=0; iw<walkers.size(); iw++)
{
Walker_t::cuda_Buffer_t& data = walkers[iw]->cuda_DataSet;
gpu::host_vector<CUDA_PRECISION> host_data;
gpu::host_vector<CudaValueType> host_data;
Vector<CudaValueType> A(NumPtcls*NumOrbitals);
host_data = data;
//Vector<double> A(NumPtcls*NumOrbitals);
Vector<CUDA_PRECISION> A(NumPtcls*NumOrbitals);
for (int i=0; i<NumPtcls; i++)
for (int j=0; j<NumOrbitals; j++)
A[i*NumOrbitals+j] = host_data[AOffset+i*RowStride+j];
@ -668,8 +701,8 @@ void DiracDeterminantCUDA::ratio (MCWalkerConfiguration &W, int iat,
ratio_host = ratio_d;
#ifdef CUDA_DEBUG
// Now, check against CPU
gpu::host_vector<CudaRealType> host_data;
std::vector<CudaRealType> cpu_ratios(walkers.size(), 0.0f);
gpu::host_vector<CudaValueType> host_data;
std::vector<CudaValueType> cpu_ratios(walkers.size(), 0.0f);
for (int iw=0; iw<walkers.size(); iw++)
{
host_data = walkers[iw]->cuda_DataSet;
@ -697,8 +730,8 @@ void DiracDeterminantCUDA::ratio (MCWalkerConfiguration &W, int iat,
#ifdef CUDA_DEBUG
if (NumOrbitals == 31)
{
gpu::host_vector<CudaRealType> host_data;
std::vector<CudaRealType> cpu_ratios(walkers.size(), 0.0f);
gpu::host_vector<CudaValueType> host_data;
std::vector<CudaValueType> cpu_ratios(walkers.size(), 0.0f);
for (int iw=0; iw<walkers.size(); iw++)
{
host_data = walkers[iw]->cuda_DataSet;
@ -763,8 +796,8 @@ void DiracDeterminantCUDA::calcRatio (MCWalkerConfiguration &W, int iat,
cudaEventRecord(gpu::ratioSyncDiracEvent, gpu::memoryStream);
#ifdef CUDA_DEBUG
// Now, check against CPU
gpu::host_vector<CudaRealType> host_data;
std::vector<CudaRealType> cpu_ratios(walkers.size(), 0.0f);
gpu::host_vector<CudaValueType> host_data;
std::vector<CudaValueType> cpu_ratios(walkers.size(), 0.0f);
for (int iw=0; iw<walkers.size(); iw++)
{
host_data = walkers[iw]->cuda_DataSet;
@ -799,8 +832,8 @@ void DiracDeterminantCUDA::addRatio (MCWalkerConfiguration &W, int iat,
#ifdef CUDA_DEBUG
if (NumOrbitals == 31)
{
gpu::host_vector<CudaRealType> host_data;
std::vector<CudaRealType> cpu_ratios(walkers.size(), 0.0f);
gpu::host_vector<CudaValueType> host_data;
std::vector<CudaValueType> cpu_ratios(walkers.size(), 0.0f);
for (int iw=0; iw<walkers.size(); iw++)
{
host_data = walkers[iw]->cuda_DataSet;
@ -896,8 +929,16 @@ DiracDeterminantCUDA::gradLapl (MCWalkerConfiguration &W, GradMatrix_t &grads,
gradLapl_host[4*(iw*RowStride + iat)+1],
gradLapl_host[4*(iw*RowStride + iat)+2]);
grads(iw,iat+FirstIndex) += g;
#ifdef QMC_COMPLEX
//YingWai's trial fix; need to revise the dot product
// should be converted into "#if defined QMC_COMPLEX" later
lapl(iw,iat+FirstIndex) += gradLapl_host[4*(iw*RowStride + iat)+3] - (CudaValueType)dot(g,g);
ValueType lapl_temp = lapl(iw,iat+FirstIndex);
if ( std::isnan(std::real(lapl_temp)) || std::isnan(std::imag(lapl_temp)) )
#else
lapl(iw,iat+FirstIndex) += gradLapl_host[4*(iw*RowStride + iat)+3] - dot(g,g);
if (std::isnan(lapl(iw,iat+FirstIndex)))
#endif
{
char name[1000];
gethostname(name, 1000);
@ -905,7 +946,7 @@ DiracDeterminantCUDA::gradLapl (MCWalkerConfiguration &W, GradMatrix_t &grads,
int dev;
cudaGetDevice(&dev);
fprintf (stderr, "Offending device = %d\n", dev);
gpu::host_vector<CUDA_PRECISION> host_data;
gpu::host_vector<CudaValueType> host_data;
host_data = walkers[iw]->cuda_DataSet;
FILE *Amat = fopen ("Amat.dat", "w");
FILE *Ainv = fopen ("Ainv.dat", "w");
@ -915,11 +956,19 @@ DiracDeterminantCUDA::gradLapl (MCWalkerConfiguration &W, GradMatrix_t &grads,
{
for (int j=0; j<NumPtcls; j++)
{
#ifdef QMC_COMPLEX
fprintf (Amat, "%14.8e+%14.8ei ", host_data[AOffset+i*RowStride+j].real(), host_data[AOffset+i*RowStride+j].imag());
fprintf (Ainv, "%14.8e+%14.8ei ", host_data[AinvOffset+i*RowStride+j].real(), host_data[AinvOffset+i*RowStride+j].imag());
fprintf (Lmat, "%14.8e+%14.8ei ", host_data[gradLaplOffset+(4*i+3)*RowStride+j].real(), host_data[gradLaplOffset+(4*i+3)*RowStride+j].imag());
for (int k=0; k<3; k++)
fprintf (Lmat, "%14.8e+%14.8ei ", host_data[gradLaplOffset+(4*i+k)*RowStride+j].real(), host_data[gradLaplOffset+(4*i+k)*RowStride+j].imag());
#else
fprintf (Amat, "%14.8e ", host_data[AOffset+i*RowStride+j]);
fprintf (Ainv, "%14.8e ", host_data[AinvOffset+i*RowStride+j]);
fprintf (Lmat, "%14.8e ", host_data[gradLaplOffset+(4*i+3)*RowStride+j]);
for (int k=0; k<3; k++)
fprintf (Lmat, "%14.8e ", host_data[gradLaplOffset+(4*i+k)*RowStride+j]);
#endif
}
fprintf (Amat, "\n");
fprintf (Ainv, "\n");
@ -942,17 +991,29 @@ DiracDeterminantCUDA::gradLapl (MCWalkerConfiguration &W, GradMatrix_t &grads,
walkers[iw]->R[iat+FirstIndex][1],
walkers[iw]->R[iat+FirstIndex][2]);
for (int orb=0; orb<NumPtcls; orb++)
#ifdef QMC_COMPLEX
fprintf (stderr, "%1.10e+%1.10ei %1.10e+%1.10ei %1.10e+%1.10ei %1.10e+%1.10ei \n",
host_data[gradLaplOffset +(4*iat+0)*RowStride+orb].real(),
host_data[gradLaplOffset +(4*iat+0)*RowStride+orb].imag(),
host_data[gradLaplOffset +(4*iat+1)*RowStride+orb].real(),
host_data[gradLaplOffset +(4*iat+1)*RowStride+orb].imag(),
host_data[gradLaplOffset +(4*iat+2)*RowStride+orb].real(),
host_data[gradLaplOffset +(4*iat+2)*RowStride+orb].imag(),
host_data[gradLaplOffset +(4*iat+3)*RowStride+orb].real(),
host_data[gradLaplOffset +(4*iat+3)*RowStride+orb].imag());
#else
fprintf (stderr, "%1.10e %1.10e %1.10e %1.10e \n",
host_data[gradLaplOffset +(4*iat+0)*RowStride+orb],
host_data[gradLaplOffset +(4*iat+1)*RowStride+orb],
host_data[gradLaplOffset +(4*iat+2)*RowStride+orb],
host_data[gradLaplOffset +(4*iat+3)*RowStride+orb]);
#endif
}
}
}
#ifdef CUDA_DEBUG
// Now do it on the CPU
gpu::host_vector<CudaRealType> host_data;
gpu::host_vector<CudaValueType> host_data;
GradMatrix_t cpu_grads(grads.rows(), grads.cols());
ValueMatrix_t cpu_lapl(grads.rows(), grads.cols());
for (int iw=0; iw<walkers.size(); iw++)
@ -983,12 +1044,12 @@ DiracDeterminantCUDA::NLratios_CPU
std::vector<ValueMatrix_t> Ainv_host;
int nw = walkers.size();
Ainv_host.resize(nw);
int mat_size = NumOrbitals*NumOrbitals*sizeof(CudaRealType);
int mat_size = NumOrbitals*NumOrbitals*sizeof(CudaValueType);
for (int iw=0; iw<nw; iw++)
{
Ainv_host[iw].resize(NumOrbitals, NumOrbitals);
ValueType *dest = &(Ainv_host[iw](0,0));
CudaRealType *src = &(walkers[iw]->cuda_DataSet.data()[AinvOffset]);
CudaValueType *src = &(walkers[iw]->cuda_DataSet.data()[AinvOffset]);
cudaMemcpy(dest, src, mat_size, cudaMemcpyDeviceToHost);
}
std::vector<RealType> phi(NumOrbitals);

View File

@ -50,44 +50,44 @@ protected:
size_t AOffset, AinvOffset, newRowOffset, AinvDeltaOffset,
AinvColkOffset, gradLaplOffset, newGradLaplOffset,
AWorkOffset, AinvWorkOffset;
gpu::host_vector<CudaRealType*> UpdateList;
gpu::device_vector<CudaRealType*> UpdateList_d;
gpu::host_vector<CudaValueType*> UpdateList;
gpu::device_vector<CudaValueType*> UpdateList_d;
gpu::host_vector<updateJob> UpdateJobList;
gpu::device_vector<updateJob> UpdateJobList_d;
std::vector<CudaRealType*> srcList, destList, AList, AinvList, newRowList,
std::vector<CudaValueType*> srcList, destList, AList, AinvList, newRowList,
AinvDeltaList, AinvColkList, gradLaplList, newGradLaplList,
AWorkList, AinvWorkList, GLList;
gpu::device_vector<CudaRealType*> srcList_d, destList_d, AList_d, AinvList_d, newRowList_d,
gpu::device_vector<CudaValueType*> srcList_d, destList_d, AList_d, AinvList_d, newRowList_d,
AinvDeltaList_d, AinvColkList_d, gradLaplList_d,
newGradLaplList_d, AWorkList_d, AinvWorkList_d, GLList_d;
gpu::device_vector<CudaRealType> ratio_d;
gpu::host_vector<CudaRealType> ratio_host;
gpu::device_vector<CudaRealType> gradLapl_d;
gpu::host_vector<CudaRealType> gradLapl_host;
gpu::device_vector<CudaValueType> ratio_d;
gpu::host_vector<CudaValueType> ratio_host;
gpu::device_vector<CudaValueType> gradLapl_d;
gpu::host_vector<CudaValueType> gradLapl_host;
gpu::device_vector<int> iatList_d;
gpu::host_vector<int> iatList;
// Data members for nonlocal psuedopotential ratio evaluation
static const int NLrowBufferRows = 4800;
gpu::device_vector<CudaRealType> NLrowBuffer_d;
gpu::host_vector<CudaRealType> NLrowBuffer_host;
gpu::device_vector<CudaRealType*> SplineRowList_d;
gpu::host_vector<CudaRealType*> SplineRowList_host;
gpu::device_vector<CudaRealType*> RatioRowList_d;
gpu::host_vector<CudaRealType*> RatioRowList_host[2];
gpu::device_vector<CudaValueType> NLrowBuffer_d;
gpu::host_vector<CudaValueType> NLrowBuffer_host;
gpu::device_vector<CudaValueType*> SplineRowList_d;
gpu::host_vector<CudaValueType*> SplineRowList_host;
gpu::device_vector<CudaValueType*> RatioRowList_d;
gpu::host_vector<CudaValueType*> RatioRowList_host[2];
gpu::device_vector<CudaRealType> NLposBuffer_d;
gpu::host_vector<CudaRealType> NLposBuffer_host;
gpu::device_vector<CudaRealType*> NLAinvList_d;
gpu::host_vector<CudaRealType*> NLAinvList_host[2];
gpu::device_vector<CudaValueType*> NLAinvList_d;
gpu::host_vector<CudaValueType*> NLAinvList_host[2];
gpu::device_vector<int> NLnumRatioList_d;
gpu::host_vector<int> NLnumRatioList_host[2];
gpu::device_vector<int> NLelecList_d;
gpu::host_vector<int> NLelecList_host[2];
gpu::device_vector<CudaRealType> NLratios_d[2];
gpu::host_vector<CudaRealType> NLratios_host;
gpu::device_vector<CudaRealType*> NLratioList_d;
gpu::host_vector<CudaRealType*> NLratioList_host[2];
gpu::device_vector<CudaValueType> NLratios_d[2];
gpu::host_vector<CudaValueType> NLratios_host;
gpu::device_vector<CudaValueType*> NLratioList_d;
gpu::host_vector<CudaValueType*> NLratioList_host[2];
void resizeLists(int numWalkers)
{
@ -161,8 +161,7 @@ public:
void update (std::vector<Walker_t*> &walkers, int iat);
void update (const std::vector<Walker_t*> &walkers, const std::vector<int> &iatList);
void reserve (PointerPool<gpu::device_vector<CudaRealType> > &pool)
{
void reserve (PointerPool<gpu::device_vector<CudaValueType> > &pool) {
RowStride = ((NumOrbitals + 31)/32) * 32;
AOffset = pool.reserve((size_t) NumPtcls * RowStride);
AinvOffset = pool.reserve((size_t) NumPtcls * RowStride);

View File

@ -306,7 +306,7 @@ public:
Dets[id]->recompute(W, firstTime);
}
void reserve (PointerPool<gpu::device_vector<CudaRealType> > &pool)
void reserve (PointerPool<gpu::device_vector<CudaValueType> > &pool)
{
for (int id=0; id<Dets.size(); id++)
Dets[id]->reserve(pool);
@ -381,7 +381,7 @@ private:
}
#endif
/***************************************************************************
* $RCSfile$ $Author$
* $Revision$ $Date$
* $Id$
* $RCSfile$ $Author: yingwai $
* $Revision: 7279 $ $Date: 2016-11-23 19:21:16 -0500 (Wed, 23 Nov 2016) $
* $Id: SlaterDet.h 7279 2016-11-24 00:21:16Z yingwai $
***************************************************************************/

File diff suppressed because it is too large Load Diff

View File

@ -14,6 +14,8 @@
#ifndef CUDA_DETERMINANT_UPDATE_H
#define CUDA_DETERMINANT_UPDATE_H
#include <complex>
struct updateJob
{
void *A, *Ainv, *newRow, *AinvDelta, *AinvColk, *gradLapl, *newGradLapl, *dummy;
@ -57,6 +59,21 @@ update_inverse_cuda(double **data, int iat[],
int AinvDelta_off, int AinvColk_off,
int N, int rowstride, int numWalkers);
#ifdef QMC_COMPLEX
void
update_inverse_cuda(std::complex<float> **data, int iat[],
int A_off, int Ainv_off, int newRow_off,
int AinvDelta_off, int AinvColk_off,
int N, int rowstride, int numWalkers);
void
update_inverse_cuda(std::complex<double> **data, int iat[],
int A_off, int Ainv_off, int newRow_off,
int AinvDelta_off, int AinvColk_off,
int N, int rowstride, int numWalkers);
#endif
// These are for single-particle move updates. Each
// walker is moving the same electron.
void
@ -71,7 +88,19 @@ update_inverse_cuda(double **data, int iat,
int AinvDelta_off, int AinvColk_off,
int N, int rowstride, int numWalkers);
#ifdef QMC_COMPLEX
void
update_inverse_cuda(std::complex<float>** data, int iat,
int A_off, int Ainv_off, int newRow_off,
int AinvDelta_off, int AinvColk_off,
int N, int rowstride, int numWalkers);
void
update_inverse_cuda(std::complex<double>** data, int iat,
int A_off, int Ainv_off, int newRow_off,
int AinvDelta_off, int AinvColk_off,
int N, int rowstride, int numWalkers);
#endif
void
@ -84,6 +113,18 @@ determinant_ratios_cuda (double *Ainv_list[], double *new_row_list[],
double *ratios, int N, int row_stride, int iat,
int numWalkers);
#ifdef QMC_COMPLEX
void
determinant_ratios_cuda (std::complex<float> *Ainv_list[], std::complex<float> *new_row_list[],
std::complex<float> *ratios, int N, int row_stride, int iat,
int numWalkers);
void
determinant_ratios_cuda (std::complex<double> *Ainv_list[], std::complex<double> *new_row_list[],
std::complex<double> *ratios, int N, int row_stride, int iat,
int numWalkers);
#endif
void
calc_many_ratios (float *Ainv_list[], float *new_row_list[],
@ -97,6 +138,20 @@ calc_many_ratios (double *Ainv_list[], double *new_row_list[],
int N, int row_stride, int elec_list[],
int numWalkers);
#ifdef QMC_COMPLEX
void
calc_many_ratios (std::complex<float> *Ainv_list[], std::complex<float> *new_row_list[],
std::complex<float>* ratio_list[], int num_ratio_list[],
int N, int row_stride, int elec_list[],
int numWalkers);
void
calc_many_ratios (std::complex<double> *Ainv_list[], std::complex<double> *new_row_list[],
std::complex<double>* ratio_list[], int num_ratio_list[],
int N, int row_stride, int elec_list[],
int numWalkers);
#endif
void
scale_grad_lapl(float *grad_list[], float *hess_list[],
float *grad_lapl_list[], float Linv[], int N,
@ -111,18 +166,42 @@ calc_grad_lapl (double *Ainv_list[], double *grad_lapl_list[],
double *out_list[], int N, int row_stride, int num_mats);
#ifdef QMC_COMPLEX
void
calc_grad_lapl (std::complex<float> *Ainv_list[], std::complex<float> *grad_lapl_list[],
std::complex<float> *out_list[], int N, int row_stride, int num_mats);
void
calc_grad_lapl (std::complex<double> *Ainv_list[], std::complex<double> *grad_lapl_list[],
std::complex<double> *out_list[], int N, int row_stride, int num_mats);
#endif
void
multi_copy (float *dest[], float *src[], int len, int num);
void
multi_copy (double *dest[], double *src[], int len, int num);
void
multi_copy (float *buff[], int srcoff, int dest_off, int len, int num);
multi_copy (float *buff[], int dest_off, int src_off, int len, int num);
void
multi_copy (double *buff[], int src_off, int dest_off, int len, int num);
multi_copy (double *buff[], int dest_off, int src_off, int len, int num);
#ifdef QMC_COMPLEX
void
multi_copy (std::complex<float> *dest[], std::complex<float> *src[], int len, int num);
void
multi_copy (std::complex<double> *dest[], std::complex<double> *src[], int len, int num);
void
multi_copy (std::complex<float> *buff[], int dest_off, int src_off, int len, int num);
void
multi_copy (std::complex<double> *buff[], int dest_off, int src_off, int len, int num);
#endif
//YingWai: the following two are repeated (Dec 28, 15)
/*
void
calc_many_ratios (float *Ainv_list[], float *new_row_list[],
float* ratio_list[], int num_ratio_list[],
@ -134,6 +213,7 @@ calc_many_ratios (double *Ainv_list[], double *new_row_list[],
double* ratio_list[], int num_ratio_list[],
int N, int row_stride, int elec_list[],
int numWalkers);
*/
void
determinant_ratios_grad_lapl_cuda
@ -143,8 +223,23 @@ determinant_ratios_grad_lapl_cuda
void
determinant_ratios_grad_lapl_cuda
(double *Ainv_list[], double *new_row_list[], double *grad_lapl_list[],
double ratios_grad_lapl[], int N, int row_stride, int iat, int numWalkers);
(double *Ainv_list[], double *new_row_list[],
double *grad_lapl_list[], double ratios_grad_lapl[],
int N, int row_stride, int iat, int numWalkers);
#ifdef QMC_COMPLEX
void
determinant_ratios_grad_lapl_cuda
(std::complex<float> *Ainv_list[], std::complex<float> *new_row_list[],
std::complex<float> *grad_lapl_list[], std::complex<float> ratios_grad_lapl[],
int N, int row_stride, int iat, int numWalkers);
void
determinant_ratios_grad_lapl_cuda
(std::complex<double> *Ainv_list[], std::complex<double> *new_row_list[],
std::complex<double> *grad_lapl_list[], std::complex<double> ratios_grad_lapl[],
int N, int row_stride, int iat, int numWalkers);
#endif
void
determinant_ratios_grad_lapl_cuda
@ -158,6 +253,19 @@ determinant_ratios_grad_lapl_cuda
double ratios_grad_lapl[], int N, int row_stride,
int iat_list[], int numWalkers);
#ifdef QMC_COMPLEX
void
determinant_ratios_grad_lapl_cuda
(std::complex<float> *Ainv_list[], std::complex<float> *new_row_list[],
std::complex<float> *grad_lapl_list[], std::complex<float> ratios_grad_lapl[],
int N, int row_stride, int iat_list[], int numWalkers);
void
determinant_ratios_grad_lapl_cuda
(std::complex<double> *Ainv_list[], std::complex<double> *new_row_list[],
std::complex<double> *grad_lapl_list[], std::complex<double> ratios_grad_lapl[],
int N, int row_stride, int iat_list[], int numWalkers);
#endif
void
calc_gradient (float *Ainv_list[], float *grad_lapl_list[],
@ -169,5 +277,16 @@ calc_gradient (double *Ainv_list[], double *grad_lapl_list[],
double grad[], int N, int row_stride, int elec,
int numWalkers);
#ifdef QMC_COMPLEX
void
calc_gradient (std::complex<float> *Ainv_list[], std::complex<float> *grad_lapl_list[],
std::complex<float> grad[], int N, int row_stride, int elec,
int numWalkers);
void
calc_gradient (std::complex<double> *Ainv_list[], std::complex<double> *grad_lapl_list[],
std::complex<double> grad[], int N, int row_stride, int elec,
int numWalkers);
#endif
#endif

View File

@ -941,7 +941,7 @@ two_body_gradient (double *R[], int first, int last, int iat,
template<typename T, int BS>
template<typename T, int BS, unsigned COMPLEX>
__global__ void
two_body_derivs_kernel(T **R, T **gradLogPsi,
int e1_first, int e1_last,
@ -991,7 +991,7 @@ two_body_derivs_kernel(T **R, T **gradLogPsi,
int outoff = i*BS+tid;
int inoff = outoff + 3*e1_first + 3*b1*BS;
r1[0][outoff] = myR[inoff];//[3*e1_first + (3*b1+i)*BS + tid];
sGrad[0][outoff] = myGrad[inoff];
sGrad[0][outoff] = myGrad[inoff*COMPLEX];
}
__syncthreads();
int ptcl1 = e1_first+b1*BS + tid;
@ -1082,7 +1082,7 @@ two_body_derivs(float *R[], float *gradLogPsi[], int e1_first, int e1_last,
const int BS=32;
dim3 dimBlock(BS);
dim3 dimGrid(numWalkers);
two_body_derivs_kernel<float,BS><<<dimGrid,dimBlock>>>
two_body_derivs_kernel<float,BS,1><<<dimGrid,dimBlock>>>
(R, gradLogPsi, e1_first, e1_last, e2_first, e2_last, numCoefs,
rMax, derivs);
}
@ -1096,12 +1096,46 @@ two_body_derivs(double *R[], double *gradLogPsi[], int e1_first, int e1_last,
const int BS=32;
dim3 dimBlock(BS);
dim3 dimGrid(numWalkers);
two_body_derivs_kernel<double,BS><<<dimGrid,dimBlock>>>
two_body_derivs_kernel<double,BS,1><<<dimGrid,dimBlock>>>
(R, gradLogPsi, e1_first, e1_last, e2_first, e2_last, numCoefs,
rMax, derivs);
}
// Ye: use offset to recycle the old routines
// block size can be further optimized.
#ifdef QMC_COMPLEX
void
two_body_derivs(float *R[], std::complex<float> *gradLogPsi[], int e1_first, int e1_last,
int e2_first, int e2_last,
int numCoefs, float rMax,
float *derivs[], int numWalkers)
{
const int BS=32;
dim3 dimBlock(BS);
dim3 dimGrid(numWalkers);
two_body_derivs_kernel<float,BS,2><<<dimGrid,dimBlock>>>
(R, (float**)gradLogPsi, e1_first, e1_last, e2_first, e2_last, numCoefs,
rMax, derivs);
}
void
two_body_derivs(double *R[], std::complex<double> *gradLogPsi[], int e1_first, int e1_last,
int e2_first, int e2_last,
int numCoefs, double rMax,
double *derivs[], int numWalkers)
{
const int BS=32;
dim3 dimBlock(BS);
dim3 dimGrid(numWalkers);
two_body_derivs_kernel<double,BS,2><<<dimGrid,dimBlock>>>
(R, (double**)gradLogPsi, e1_first, e1_last, e2_first, e2_last, numCoefs,
rMax, derivs);
}
#endif
////////////////////////////////////////////////////////////////
// One-body routines //
@ -1892,7 +1926,7 @@ one_body_gradient (double *Rlist[], int iat, double C[], int first, int last,
template<typename T, int BS>
template<typename T, int BS, unsigned COMPLEX>
__global__ void
one_body_derivs_kernel(T* C, T **R, T **gradLogPsi,
int cfirst, int clast,
@ -1939,7 +1973,7 @@ one_body_derivs_kernel(T* C, T **R, T **gradLogPsi,
int outoff = i*BS+tid;
int inoff = outoff + 3*efirst + 3*be*BS;
r[0][outoff] = myR[inoff];
sGrad[0][outoff] = myGrad[inoff];
sGrad[0][outoff] = myGrad[inoff*COMPLEX];
}
__syncthreads();
int eptcl = efirst+be*BS + tid;
@ -2023,7 +2057,7 @@ one_body_derivs(float C[], float *R[], float *gradLogPsi[],
const int BS=32;
dim3 dimBlock(BS);
dim3 dimGrid(numWalkers);
one_body_derivs_kernel<float,BS><<<dimGrid,dimBlock>>>
one_body_derivs_kernel<float,BS,1><<<dimGrid,dimBlock>>>
(C, R, gradLogPsi, cfirst, clast, efirst, elast, numCoefs,
rMax, derivs);
}
@ -2040,13 +2074,48 @@ one_body_derivs(double C[], double *R[], double *gradLogPsi[],
const int BS=32;
dim3 dimBlock(BS);
dim3 dimGrid(numWalkers);
one_body_derivs_kernel<double,BS><<<dimGrid,dimBlock>>>
one_body_derivs_kernel<double,BS,1><<<dimGrid,dimBlock>>>
(C, R, gradLogPsi, cfirst, clast, efirst, elast, numCoefs,
rMax, derivs);
}
// Ye: use offset to recycle the old routines
// block size can be further optimized.
#ifdef QMC_COMPLEX
void
one_body_derivs(float C[], float *R[], std::complex<float> *gradLogPsi[],
int cfirst, int clast,
int efirst, int elast,
int numCoefs, float rMax,
float *derivs[], int numWalkers)
{
const int BS=32;
dim3 dimBlock(BS);
dim3 dimGrid(numWalkers);
one_body_derivs_kernel<float,BS,2><<<dimGrid,dimBlock>>>
(C, R, (float**)gradLogPsi, cfirst, clast, efirst, elast, numCoefs,
rMax, derivs);
}
void
one_body_derivs(double C[], double *R[], std::complex<double> *gradLogPsi[],
int cfirst, int clast,
int efirst, int elast,
int numCoefs, double rMax,
double *derivs[], int numWalkers)
{
const int BS=32;
dim3 dimBlock(BS);
dim3 dimGrid(numWalkers);
one_body_derivs_kernel<double,BS,2><<<dimGrid,dimBlock>>>
(C, R, (double**)gradLogPsi, cfirst, clast, efirst, elast, numCoefs,
rMax, derivs);
}
#endif
void test()
{

View File

@ -14,6 +14,7 @@
#ifndef BSPLINE_JASTROW_CUDA_H
#define BSPLINE_JASTROW_CUDA_H
#include <complex>
#include "NLjobGPU.h"
///////////////////////
@ -106,6 +107,19 @@ two_body_derivs(double *R[], double *gradLogPsi[], int e1_first, int e1_last,
double *derivs[], int numWalkers);
#ifdef QMC_COMPLEX
void
two_body_derivs(float *R[], std::complex<float> *gradLogPsi[], int e1_first, int e1_last,
int e2_first, int e2_last,
int numCoefs, float rMax,
float *derivs[], int numWalkers);
void
two_body_derivs(double *R[], std::complex<double> *gradLogPsi[], int e1_first, int e1_last,
int e2_first, int e2_last,
int numCoefs, double rMax,
double *derivs[], int numWalkers);
#endif
///////////////////////
// One-Body routines //
///////////////////////
@ -199,4 +213,22 @@ one_body_derivs(double C[], double *R[], double *gradLogPsi[],
int numCoefs, double rMax,
double *derivs[], int numWalkers);
#ifdef QMC_COMPLEX
void
one_body_derivs(float C[], float *R[], std::complex<float> *gradLogPsi[],
int cfirst, int clast,
int efirst, int elast,
int numCoefs, float rMax,
float *derivs[], int numWalkers);
void
one_body_derivs(double C[], double *R[], std::complex<double> *gradLogPsi[],
int cfirst, int clast,
int efirst, int elast,
int numCoefs, double rMax,
double *derivs[], int numWalkers);
#endif
#endif

View File

@ -17,9 +17,9 @@
#define MAX_SPLINES 100
#include <stdio.h>
#include <config.h>
#include "BsplineJastrowCudaPBC.h"
#include "../../CUDA/gpu_misc.h"
#include <config.h>
bool AisInitializedPBC = false;
@ -1791,7 +1791,7 @@ two_body_gradient_PBC (double *R[], int first, int last, int iat,
template<typename T, int BS>
template<typename T, int BS, unsigned COMPLEX>
__global__ void
two_body_derivs_PBC_kernel(T **R, T **gradLogPsi,
int e1_first, int e1_last,
@ -1848,7 +1848,7 @@ two_body_derivs_PBC_kernel(T **R, T **gradLogPsi,
int outoff = i*BS+tid;
int inoff = outoff + 3*e1_first + 3*b1*BS;
r1[0][outoff] = myR[inoff];//[3*e1_first + (3*b1+i)*BS + tid];
sGrad[0][outoff] = myGrad[inoff];
sGrad[0][outoff] = myGrad[inoff*COMPLEX];
}
__syncthreads();
int ptcl1 = e1_first+b1*BS + tid;
@ -1941,11 +1941,11 @@ two_body_derivs_PBC(float *R[], float *gradLogPsi[], int e1_first, int e1_last,
dim3 dimBlock(BS);
dim3 dimGrid(numWalkers);
if (sim_cell_radius >= rMax)
two_body_derivs_PBC_kernel<float,BS><<<dimGrid,dimBlock>>>
two_body_derivs_PBC_kernel<float,BS,1><<<dimGrid,dimBlock>>>
(R, gradLogPsi, e1_first, e1_last, e2_first, e2_last, numCoefs,
rMax, lattice, latticeInv, derivs);
else
two_body_derivs_PBC_kernel<float,BS><<<dimGrid,dimBlock>>>
two_body_derivs_PBC_kernel<float,BS,1><<<dimGrid,dimBlock>>>
(R, gradLogPsi, e1_first, e1_last, e2_first, e2_last, numCoefs,
rMax, lattice, latticeInv, derivs);
}
@ -1961,16 +1961,62 @@ two_body_derivs_PBC(double *R[], double *gradLogPsi[], int e1_first, int e1_last
dim3 dimBlock(BS);
dim3 dimGrid(numWalkers);
if (sim_cell_radius >= rMax)
two_body_derivs_PBC_kernel<double,BS><<<dimGrid,dimBlock>>>
two_body_derivs_PBC_kernel<double,BS,1><<<dimGrid,dimBlock>>>
(R, gradLogPsi, e1_first, e1_last, e2_first, e2_last, numCoefs,
rMax, lattice, latticeInv, derivs);
else
two_body_derivs_PBC_kernel<double,BS><<<dimGrid,dimBlock>>>
two_body_derivs_PBC_kernel<double,BS,1><<<dimGrid,dimBlock>>>
(R, gradLogPsi, e1_first, e1_last, e2_first, e2_last, numCoefs,
rMax, lattice, latticeInv, derivs);
}
// Ye: use offset to recycle the old routines
// block size can be further optimized.
#ifdef QMC_COMPLEX
void
two_body_derivs_PBC(float *R[], std::complex<float> *gradLogPsi[], int e1_first, int e1_last,
int e2_first, int e2_last,
int numCoefs, float rMax,
float lattice[], float latticeInv[], float sim_cell_radius,
float *derivs[], int numWalkers)
{
const int BS=32;
dim3 dimBlock(BS);
dim3 dimGrid(numWalkers);
if (sim_cell_radius >= rMax)
two_body_derivs_PBC_kernel<float,BS,2><<<dimGrid,dimBlock>>>
(R, (float**)gradLogPsi, e1_first, e1_last, e2_first, e2_last, numCoefs,
rMax, lattice, latticeInv, derivs);
else
two_body_derivs_PBC_kernel<float,BS,2><<<dimGrid,dimBlock>>>
(R, (float**)gradLogPsi, e1_first, e1_last, e2_first, e2_last, numCoefs,
rMax, lattice, latticeInv, derivs);
}
void
two_body_derivs_PBC(double *R[], std::complex<double> *gradLogPsi[], int e1_first, int e1_last,
int e2_first, int e2_last,
int numCoefs, double rMax,
double lattice[], double latticeInv[], double sim_cell_radius,
double *derivs[], int numWalkers)
{
const int BS=32;
dim3 dimBlock(BS);
dim3 dimGrid(numWalkers);
if (sim_cell_radius >= rMax)
two_body_derivs_PBC_kernel<double,BS,2><<<dimGrid,dimBlock>>>
(R, (double**)gradLogPsi, e1_first, e1_last, e2_first, e2_last, numCoefs,
rMax, lattice, latticeInv, derivs);
else
two_body_derivs_PBC_kernel<double,BS,2><<<dimGrid,dimBlock>>>
(R, (double**)gradLogPsi, e1_first, e1_last, e2_first, e2_last, numCoefs,
rMax, lattice, latticeInv, derivs);
}
#endif
////////////////////////////////////////////////////////////////
// One-body routines //
@ -3310,7 +3356,7 @@ one_body_gradient_PBC (double *Rlist[], int iat, double C[], int first, int last
template<typename T, int BS>
template<typename T, int BS, unsigned COMPLEX>
__global__ void
one_body_derivs_PBC_kernel(T* C, T **R, T **gradLogPsi,
int cfirst, int clast,
@ -3364,7 +3410,7 @@ one_body_derivs_PBC_kernel(T* C, T **R, T **gradLogPsi,
int outoff = i*BS+tid;
int inoff = outoff + 3*efirst + 3*be*BS;
r[0][outoff] = myR[inoff];
sGrad[0][outoff] = myGrad[inoff];
sGrad[0][outoff] = myGrad[inoff*COMPLEX];
}
__syncthreads();
int eptcl = efirst+be*BS + tid;
@ -3450,11 +3496,11 @@ one_body_derivs_PBC(float C[], float *R[], float *gradLogPsi[],
dim3 dimBlock(BS);
dim3 dimGrid(numWalkers);
if (sim_cell_radius >= rMax)
one_body_derivs_PBC_kernel<float,BS><<<dimGrid,dimBlock>>>
one_body_derivs_PBC_kernel<float,BS,1><<<dimGrid,dimBlock>>>
(C, R, gradLogPsi, cfirst, clast, efirst, elast, numCoefs,
rMax, lattice, latticeInv, derivs);
else
one_body_derivs_PBC_kernel<float,BS><<<dimGrid,dimBlock>>>
one_body_derivs_PBC_kernel<float,BS,1><<<dimGrid,dimBlock>>>
(C, R, gradLogPsi, cfirst, clast, efirst, elast, numCoefs,
rMax, lattice, latticeInv, derivs);
}
@ -3473,16 +3519,65 @@ one_body_derivs_PBC(double C[], double *R[], double *gradLogPsi[],
dim3 dimBlock(BS);
dim3 dimGrid(numWalkers);
if (sim_cell_radius >= rMax)
one_body_derivs_PBC_kernel<double,BS><<<dimGrid,dimBlock>>>
one_body_derivs_PBC_kernel<double,BS,1><<<dimGrid,dimBlock>>>
(C, R, gradLogPsi, cfirst, clast, efirst, elast, numCoefs,
rMax, lattice, latticeInv, derivs);
else
one_body_derivs_PBC_kernel<double,BS><<<dimGrid,dimBlock>>>
one_body_derivs_PBC_kernel<double,BS,1><<<dimGrid,dimBlock>>>
(C, R, gradLogPsi, cfirst, clast, efirst, elast, numCoefs,
rMax, lattice, latticeInv, derivs);
}
// Ye: use offset to recycle the old routines
// block size can be further optimized.
#ifdef QMC_COMPLEX
void
one_body_derivs_PBC(float C[], float *R[], std::complex<float> *gradLogPsi[],
int cfirst, int clast,
int efirst, int elast,
int numCoefs, float rMax,
float lattice[], float latticeInv[], float sim_cell_radius,
float *derivs[], int numWalkers)
{
const int BS=32;
dim3 dimBlock(BS);
dim3 dimGrid(numWalkers);
if (sim_cell_radius >= rMax)
one_body_derivs_PBC_kernel<float,BS,2><<<dimGrid,dimBlock>>>
(C, R, (float**)gradLogPsi, cfirst, clast, efirst, elast, numCoefs,
rMax, lattice, latticeInv, derivs);
else
one_body_derivs_PBC_kernel<float,BS,2><<<dimGrid,dimBlock>>>
(C, R, (float**)gradLogPsi, cfirst, clast, efirst, elast, numCoefs,
rMax, lattice, latticeInv, derivs);
}
void
one_body_derivs_PBC(double C[], double *R[], std::complex<double> *gradLogPsi[],
int cfirst, int clast,
int efirst, int elast,
int numCoefs, double rMax,
double lattice[], double latticeInv[], double sim_cell_radius,
double *derivs[], int numWalkers)
{
const int BS=32;
dim3 dimBlock(BS);
dim3 dimGrid(numWalkers);
if (sim_cell_radius >= rMax)
one_body_derivs_PBC_kernel<double,BS,2><<<dimGrid,dimBlock>>>
(C, R, (double**)gradLogPsi, cfirst, clast, efirst, elast, numCoefs,
rMax, lattice, latticeInv, derivs);
else
one_body_derivs_PBC_kernel<double,BS,2><<<dimGrid,dimBlock>>>
(C, R, (double**)gradLogPsi, cfirst, clast, efirst, elast, numCoefs,
rMax, lattice, latticeInv, derivs);
}
#endif
void testPBC()

View File

@ -15,6 +15,7 @@
#ifndef BSPLINE_JASTROW_CUDA_PBC_H
#define BSPLINE_JASTROW_CUDA_PBC_H
#include <complex>
#include "NLjobGPU.h"
///////////////////////
@ -117,6 +118,23 @@ two_body_derivs_PBC(double *R[], double *gradLogPsi[], int e1_first, int e1_last
double *derivs[], int numWalkers);
#ifdef QMC_COMPLEX
void
two_body_derivs_PBC(float *R[], std::complex<float> *gradLogPsi[], int e1_first, int e1_last,
int e2_first, int e2_last,
int numCoefs, float rMax,
float lattice[], float latticeInv[], float sim_cell_radius,
float *derivs[], int numWalkers);
void
two_body_derivs_PBC(double *R[], std::complex<double> *gradLogPsi[], int e1_first, int e1_last,
int e2_first, int e2_last,
int numCoefs, double rMax,
double lattice[], double latticeInv[], double sim_cell_radius,
double *derivs[], int numWalkers);
#endif
///////////////////////
// One-Body routines //
///////////////////////
@ -222,4 +240,25 @@ one_body_derivs_PBC(double C[], double *R[], double *gradLogPsi[],
double lattice[], double latticeInv[], double sim_cell_radius,
double *derivs[], int numWalkers);
#ifdef QMC_COMPLEX
void
one_body_derivs_PBC(float C[], float *R[], std::complex<float> *gradLogPsi[],
int cfirst, int clast,
int efirst, int elast,
int numCoefs, float rMax,
float lattice[], float latticeInv[], float sim_cell_radius,
float *derivs[], int numWalkers);
void
one_body_derivs_PBC(double C[], double *R[], std::complex<double> *gradLogPsi[],
int cfirst, int clast,
int efirst, int elast,
int numCoefs, double rMax,
double lattice[], double latticeInv[], double sim_cell_radius,
double *derivs[], int numWalkers);
#endif
#endif

View File

@ -495,7 +495,7 @@ OneBodyJastrowOrbitalBspline::resetParameters(const opt_variables_type& active)
void
OneBodyJastrowOrbitalBspline::evaluateDerivatives
(MCWalkerConfiguration &W, const opt_variables_type& optvars,
ValueMatrix_t &d_logpsi, ValueMatrix_t &dlapl_over_psi)
RealMatrix_t &d_logpsi, RealMatrix_t &dlapl_over_psi)
{
CudaReal sim_cell_radius = W.Lattice.SimulationCellRadius;
std::vector<Walker_t*> &walkers = W.WalkerList;

View File

@ -108,8 +108,8 @@ public:
std::vector<PosType> &quadPoints, std::vector<ValueType> &psi_ratios);
void evaluateDerivatives (MCWalkerConfiguration &W,
const opt_variables_type& optvars,
ValueMatrix_t &dlogpsi,
ValueMatrix_t &dlapl_over_psi);
RealMatrix_t &dlogpsi,
RealMatrix_t &dlapl_over_psi);
OneBodyJastrowOrbitalBspline(ParticleSet &centers, ParticleSet& elecs) :
OneBodyJastrowOrbital<BsplineFunctor<OrbitalBase::RealType> > (centers,elecs),
ElecRef(elecs),

View File

@ -568,7 +568,7 @@ TwoBodyJastrowOrbitalBspline::resetParameters(const opt_variables_type& active)
void
TwoBodyJastrowOrbitalBspline::evaluateDerivatives
(MCWalkerConfiguration &W, const opt_variables_type& optvars,
ValueMatrix_t &d_logpsi, ValueMatrix_t &dlapl_over_psi)
RealMatrix_t &d_logpsi, RealMatrix_t &dlapl_over_psi)
{
CudaReal sim_cell_radius = W.Lattice.SimulationCellRadius;
std::vector<Walker_t*> &walkers = W.WalkerList;

View File

@ -109,8 +109,8 @@ public:
void
evaluateDerivatives (MCWalkerConfiguration &W,
const opt_variables_type& optvars,
ValueMatrix_t &dlogpsi,
ValueMatrix_t &dlapl_over_psi);
RealMatrix_t &dlogpsi,
RealMatrix_t &dlapl_over_psi);
//TwoBodyJastrowOrbitalBspline(ParticleSet& pset, bool is_master) :
// TwoBodyJastrowOrbital<BsplineFunctor<OrbitalBase::RealType> > (pset, is_master),

View File

@ -95,6 +95,7 @@ struct OrbitalBase: public QMCTraits
typedef ParticleAttrib<GradType> GradVectorType;
typedef PooledData<RealType> BufferType;
typedef ParticleSet::Walker_t Walker_t;
typedef OrbitalSetTraits<RealType>::ValueMatrix_t RealMatrix_t;
typedef OrbitalSetTraits<ValueType>::ValueMatrix_t ValueMatrix_t;
typedef OrbitalSetTraits<ValueType>::GradMatrix_t GradMatrix_t;
typedef OrbitalSetTraits<ValueType>::HessType HessType;
@ -502,7 +503,7 @@ struct OrbitalBase: public QMCTraits
virtual void recompute(MCWalkerConfiguration &W, bool firstTime)
{ }
virtual void reserve (PointerPool<gpu::device_vector<CudaRealType> > &pool)
virtual void reserve (PointerPool<gpu::device_vector<CudaValueType> > &pool)
{ }
/** Evaluate the log of the WF for all walkers
@ -645,8 +646,8 @@ struct OrbitalBase: public QMCTraits
virtual void
evaluateDerivatives (MCWalkerConfiguration &W,
const opt_variables_type& optvars,
ValueMatrix_t &dgrad_logpsi,
ValueMatrix_t &dhpsi_over_psi)
RealMatrix_t &dgrad_logpsi,
RealMatrix_t &dhpsi_over_psi)
{
app_error() << "Need specialization of OrbitalBase::evaluateDerivatives.\n";
abort();
@ -656,8 +657,8 @@ struct OrbitalBase: public QMCTraits
}
#endif
/***************************************************************************
* $RCSfile$ $Author$
* $Revision$ $Date$
* $Id$
* $RCSfile$ $Author: yingwai $
* $Revision: 7279 $ $Date: 2016-11-23 19:21:16 -0500 (Wed, 23 Nov 2016) $
* $Id: OrbitalBase.h 7279 2016-11-24 00:21:16Z yingwai $
***************************************************************************/

View File

@ -15,6 +15,7 @@
#include <assert.h>
#include <thrust/complex.h>
/* !!!WARNING!!!
Kernels in this file strongly depend on warp-synchronous behavior
@ -550,6 +551,7 @@ void apply_phase_factors(double kPoints[], int makeTwoCopies[],
GL_in, GL_out, num_splines, num_walkers, row_stride);
}
void apply_phase_factors(double kPoints[], int makeTwoCopies[], int TwoCopiesIndex[],
double pos[], double *phi_in[], double *phi_out[],
double *GL_in[], double *GL_out[],
@ -563,23 +565,77 @@ void apply_phase_factors(double kPoints[], int makeTwoCopies[], int TwoCopiesInd
GL_in, GL_out, num_splines, num_walkers, row_stride);
}
#ifdef ALGO_CHRISTOS
// Slower for some runs
#define WARP_SIZE 32
// NO_TWO_COPIES can be set to true if the makeTwoCopies is guaranteed to
// only hold 0s.
template<typename T, int BS, bool NO_TWO_COPIES> __global__
void phase_factor_kernel (T *kPoints, int *makeTwoCopies,
T *pos, T **phi_in, T **phi_out,
T **grad_lapl_in, T **grad_lapl_out,
// YingWai's implementation for complex wavefunctions
// * The use of shared memory can be further optimized.
// * Thread count can be further optimized.
// Ye's optimization eliminates the shared memory buffer and potential race conditions.
// * BS>32 is safe but the optimal choice still needs further attempts.
#ifdef QMC_COMPLEX
// blockIdx.x = counter for walker
// BS threads per block
template<typename T, typename T2, int BS> __global__
void phase_factor_kernel (T *kPoints,
T *pos, T2 **phi_in, T2 **phi_out,
int num_splines)
{
__shared__ T pos_s[3];
__shared__ T2 *phi_in_ptr, *phi_out_ptr;
int tid = threadIdx.x;
// Copy electron position into shared memory
if (tid < 3)
pos_s[tid] = pos[3*blockIdx.x+tid];
// Set pointers to point to the address (of the 1st element) where phi for that walker is stored
if (tid == 0)
{
phi_in_ptr = phi_in[blockIdx.x];
phi_out_ptr = phi_out[blockIdx.x];
}
// BS>warpSize sync needed
//__syncthreads();
// "ib" is NOT the thread block! This is just a "block" of BS splines that the BS threads
// process together at one time, so it has to repeat NB times to take care of all the splines
int NB = (num_splines+BS-1)/BS;
for (int ib=0; ib<NB; ib++)
{
int off = ib*BS + tid;
if (off < num_splines)
{
// Now compute phase factors
T phase = -(kPoints[off*3 ]*pos_s[0] +
kPoints[off*3+1]*pos_s[1] +
kPoints[off*3+2]*pos_s[2]);
T s, c;
sincos (phase, &s, &c);
T2 e_mikr = T2(c,s);
// Write back to global memory directly
phi_out_ptr[ib*BS+tid] = phi_in_ptr[ib*BS+tid]*e_mikr;
}
}
}
template<typename T, typename T2, int BS> __global__
void phase_factor_kernel (T *kPoints,
T *pos, T2 **phi_in, T2 **phi_out,
T2 **grad_lapl_in, T2 **grad_lapl_out,
int num_splines, int num_walkers,
int row_stride)
{
volatile __shared__ T in_shared[5][2*BS+1], out_shared[5][BS+1], kPoints_s[BS][3];
// Dummy quantities to make use of shared memory
__shared__ T pos_s[3];
__shared__ T *my_phi_in, *my_phi_out, *my_GL_in, *my_GL_out;
__shared__ T2 *my_phi_in, *my_phi_out, *my_GL_in, *my_GL_out;
int tid = threadIdx.x;
// Set pointers to point to the address (of the 1st element) where phi for that walker is stored
if (tid == 0)
{
my_phi_in = phi_in[blockIdx.x];
@ -587,189 +643,122 @@ void phase_factor_kernel (T *kPoints, int *makeTwoCopies,
my_GL_in = grad_lapl_in[blockIdx.x];
my_GL_out = grad_lapl_out[blockIdx.x];
}
// Copy electron position into shared memory
if (tid < 3)
pos_s[tid] = pos[3*blockIdx.x+tid];
if (BS > WARP_SIZE) __syncthreads();
__syncthreads();
// "block" is NOT the thread block! This is just a "block" of BS splines that the BS threads
// process together at one time, so it has to repeat nb times to take care of all the splines
int nb = (num_splines + BS-1)/BS;
int outIndex=0;
int outBlock=0;
// 0 if NO_TWO_COPIES, but won't compile
__shared__ int m2c[NO_TWO_COPIES ? 1 : BS];
for (int block=0; block<nb; block++)
{
// Load kpoints into shared memory
for (int i=0; i<3; i++)
T kPoints_s[3];
int off = block*BS + tid;
if (off < num_splines)
{
int off = (3*block+i)*BS + tid;
if (off < 3*num_splines)
kPoints_s[0][i*BS+tid] = kPoints[off];
}
// Load phi_in with coallesced reads
if ((2*block+0)*BS+tid < 2*num_splines)
{
in_shared[0][tid+ 0] = my_phi_in[(2*block+0)*BS+tid];
for (int j=0; j<4; j++)
in_shared[j+1][tid+ 0] = my_GL_in[2*j*num_splines+(2*block+0)*BS+tid];
}
if ((2*block+1)*BS+tid < 2*num_splines)
{
in_shared[0][tid+BS] = my_phi_in[(2*block+1)*BS+tid];
for (int j=0; j<4; j++)
in_shared[j+1][tid+BS] = my_GL_in[2*j*num_splines+(2*block+1)*BS+tid];
}
if (BS > WARP_SIZE) __syncthreads();
// Now add on phase factors
T phase = -(pos_s[0]*kPoints_s[tid][0] +
pos_s[1]*kPoints_s[tid][1] +
pos_s[2]*kPoints_s[tid][2]);
// Load kpoints
kPoints_s[0] = kPoints[off*3 ];
kPoints_s[1] = kPoints[off*3+1];
kPoints_s[2] = kPoints[off*3+2];
// Now compute phase factors
T phase = -(kPoints_s[0]*pos_s[0] +
kPoints_s[1]*pos_s[1] +
kPoints_s[2]*pos_s[2]);
T s, c;
sincosf (phase, &s, &c);
T u_re, u_im, gradu_re[3], gradu_im[3], laplu_re, laplu_im;
u_re = in_shared[0][2*tid+0];
u_im = in_shared[0][2*tid+1];
gradu_re[0] = in_shared[1][2*tid+0];
gradu_im[0] = in_shared[1][2*tid+1];
gradu_re[1] = in_shared[2][2*tid+0];
gradu_im[1] = in_shared[2][2*tid+1];
gradu_re[2] = in_shared[3][2*tid+0];
gradu_im[2] = in_shared[3][2*tid+1];
laplu_re = in_shared[4][2*tid+0];
laplu_im = in_shared[4][2*tid+1];
in_shared[0][2*tid+0] = u_re*c - u_im*s;
in_shared[0][2*tid+1] = u_re*s + u_im*c;
sincos (phase, &s, &c);
T2 e_mikr = T2(c,s);
// Temp. storage for phi, grad, lapl in the registers
T2 u, gradlaplu[4];
u = my_phi_in[off];
for (int j=0; j<4; j++)
gradlaplu[j] = my_GL_in[j*num_splines+off];
// Apply e^(-ikr) to the wavefunction
my_phi_out[off] = u * e_mikr;
// Laplacian = e^(-ikr)*(laplu - u*k^2 - 2ik*gradu)
T k2 = (kPoints_s[0]*kPoints_s[0] +
kPoints_s[1]*kPoints_s[1] +
kPoints_s[2]*kPoints_s[2]);
gradlaplu[3] = e_mikr * ( gradlaplu[3] - k2*u
- T2(0.0,2.0)*(kPoints_s[0]*gradlaplu[0]+
kPoints_s[1]*gradlaplu[1]+
kPoints_s[2]*gradlaplu[2]) );
// Gradient = e^(-ikr)*(-i*u*k + gradu)
for (int dim=0; dim<3; dim++)
{
T gre, gim;
gre = gradu_re[dim] + kPoints_s[tid][dim]*u_im;
gim = gradu_im[dim] - kPoints_s[tid][dim]*u_re;
in_shared[dim+1][2*tid+0] = gre*c - gim*s;
in_shared[dim+1][2*tid+1] = gre*s + gim*c;
}
// Add phase contribution to laplacian
T k2 = (kPoints_s[tid][0]*kPoints_s[tid][0] +
kPoints_s[tid][1]*kPoints_s[tid][1] +
kPoints_s[tid][2]*kPoints_s[tid][2]);
T lre = laplu_re - k2*u_re + 2.0*(kPoints_s[tid][0]*gradu_im[0]+
kPoints_s[tid][1]*gradu_im[1]+
kPoints_s[tid][2]*gradu_im[2]);
T lim = laplu_im - k2*u_im - 2.0*(kPoints_s[tid][0]*gradu_re[0]+
kPoints_s[tid][1]*gradu_re[1]+
kPoints_s[tid][2]*gradu_re[2]);
in_shared[4][2*tid+0] = lre*c - lim*s;
in_shared[4][2*tid+1] = lre*s + lim*c;
// Load makeTwoCopies with coallesced reads
if (!NO_TWO_COPIES)
if (block*BS+tid < num_splines)
m2c[tid] = makeTwoCopies[block*BS + tid];
if (BS > WARP_SIZE) __syncthreads();
// Now, serialize to output buffer
int end = min (BS, num_splines - block*BS);
// christos: When NO_TWO_COPIES is on, the output buffer logic can be
// optimized further, cutting down on instruction level clutter,
// synchronization & overall divergence. This is because outIndex's
// updates follow the induction variable (i).
if (NO_TWO_COPIES && end==BS) {
// christos: note: it is possible to eliminate out_shared under
// certain conditions
for (int i=tid; i<end; i+=BS) {
for (int j=0 ; j<5 ; j++)
out_shared[j][outIndex+i] = in_shared[j][2*i+0];
T2 g;
g = gradlaplu[dim] - (T2(0.0,1.0) * u * kPoints_s[dim]);
gradlaplu[dim] = g * e_mikr;
}
if (BS > WARP_SIZE) __syncthreads();
my_phi_out[ outBlock*BS+tid] = out_shared[0][tid];
my_GL_out[0*row_stride +outBlock*BS+tid] = out_shared[1][tid];
my_GL_out[1*row_stride +outBlock*BS+tid] = out_shared[2][tid];
my_GL_out[2*row_stride +outBlock*BS+tid] = out_shared[3][tid];
my_GL_out[3*row_stride +outBlock*BS+tid] = out_shared[4][tid];
outIndex = 0;
outBlock++;
} else {
for (int i=0; i<end; i++)
{
if (tid < 5)
out_shared[tid][outIndex] = in_shared[tid][2*i+0];
outIndex++;
if (BS > WARP_SIZE) __syncthreads();
if (outIndex == BS)
{
// Write back to global memory
my_phi_out[ outBlock*BS+tid] = out_shared[0][tid];
my_GL_out[0*row_stride +outBlock*BS+tid] = out_shared[1][tid];
my_GL_out[1*row_stride +outBlock*BS+tid] = out_shared[2][tid];
my_GL_out[2*row_stride +outBlock*BS+tid] = out_shared[3][tid];
my_GL_out[3*row_stride +outBlock*BS+tid] = out_shared[4][tid];
outIndex = 0;
outBlock++;
// Write back to global memory directly
for (int j=0; j<4; j++)
my_GL_out[j*row_stride + off] = gradlaplu[j];
}
if (!NO_TWO_COPIES && m2c[i])
{
if (tid < 5)
out_shared[tid][outIndex] = in_shared[tid][2*i+1];
outIndex++;
if (BS > WARP_SIZE) __syncthreads();
if (outIndex == BS)
{
// Write back to global memory
my_phi_out[ outBlock*BS+tid] = out_shared[0][tid];
my_GL_out[0*row_stride +outBlock*BS+tid] = out_shared[1][tid];
my_GL_out[1*row_stride +outBlock*BS+tid] = out_shared[2][tid];
my_GL_out[2*row_stride +outBlock*BS+tid] = out_shared[3][tid];
my_GL_out[3*row_stride +outBlock*BS+tid] = out_shared[4][tid];
outIndex = 0;
outBlock++;
if (BS > WARP_SIZE) __syncthreads();
}
}
}
if (BS > WARP_SIZE) __syncthreads();
}
}
if (tid < outIndex)
{
my_phi_out[ outBlock*BS+tid] = out_shared[0][tid];
my_GL_out[0*row_stride +outBlock*BS+tid] = out_shared[1][tid];
my_GL_out[1*row_stride +outBlock*BS+tid] = out_shared[2][tid];
my_GL_out[2*row_stride +outBlock*BS+tid] = out_shared[3][tid];
my_GL_out[3*row_stride +outBlock*BS+tid] = out_shared[4][tid];
}
}
void apply_phase_factors(float kPoints[], int makeTwoCopies[],
float pos[], float *phi_in[], float *phi_out[],
float *GL_in[], float *GL_out[],
int num_splines, int num_walkers, int row_stride,
bool dontMakeTwoCopies)
// YingWai's implementation for complex wavefunction
void apply_phase_factors(float kPoints[], float pos[],
std::complex<float>* phi_in[], std::complex<float>* phi_out[],
int num_splines, int num_walkers)
{
const int BS = 64;
const int BS = 32;
dim3 dimBlock(BS);
dim3 dimGrid (num_walkers);
// std::cout << "num_splines=" << num_splines << ", nb=" << ((num_splines + BS-1)/BS) << std::endl;
if (dontMakeTwoCopies) {
phase_factor_kernel<float,BS,true><<<dimGrid,dimBlock, 0, gpu::kernelStream>>>
(kPoints, makeTwoCopies, pos, phi_in, phi_out,
GL_in, GL_out, num_splines, num_walkers, row_stride);
} else {
phase_factor_kernel<float,BS,false><<<dimGrid,dimBlock, 0, gpu::kernelStream>>>
(kPoints, makeTwoCopies, pos, phi_in, phi_out,
GL_in, GL_out, num_splines, num_walkers, row_stride);
}
phase_factor_kernel<float,thrust::complex<float>,BS><<<dimGrid,dimBlock>>>
(kPoints, pos, (thrust::complex<float>**)phi_in, (thrust::complex<float>**)phi_out, num_splines);
}
void apply_phase_factors(double kPoints[], double pos[],
std::complex<double>* phi_in[], std::complex<double>* phi_out[],
int num_splines, int num_walkers)
{
const int BS = 32;
dim3 dimBlock(BS);
dim3 dimGrid (num_walkers);
phase_factor_kernel<double,thrust::complex<double>,BS><<<dimGrid,dimBlock>>>
(kPoints, pos, (thrust::complex<double>**)phi_in, (thrust::complex<double>**)phi_out, num_splines);
}
void apply_phase_factors(float kPoints[], float pos[],
std::complex<float>* phi_in[], std::complex<float>* phi_out[],
std::complex<float>* GL_in[], std::complex<float>* GL_out[],
int num_splines, int num_walkers, int row_stride)
{
const int BS = 128;
dim3 dimBlock(BS);
dim3 dimGrid (num_walkers);
phase_factor_kernel<float,thrust::complex<float>,BS><<<dimGrid,dimBlock, 0, gpu::kernelStream>>>
(kPoints, pos, (thrust::complex<float>**)phi_in, (thrust::complex<float>**)phi_out, (thrust::complex<float>**)GL_in, (thrust::complex<float>**)GL_out,
num_splines, num_walkers, row_stride);
}
void apply_phase_factors(double kPoints[], double pos[],
std::complex<double>* phi_in[], std::complex<double>* phi_out[],
std::complex<double>* GL_in[], std::complex<double>* GL_out[],
int num_splines, int num_walkers, int row_stride)
{
const int BS = 128;
dim3 dimBlock(BS);
dim3 dimGrid (num_walkers);
phase_factor_kernel<double,thrust::complex<double>,BS><<<dimGrid,dimBlock, 0, gpu::kernelStream>>>
(kPoints, pos, (thrust::complex<double>**)phi_in, (thrust::complex<double>**)phi_out, (thrust::complex<double>**)GL_in, (thrust::complex<double>**)GL_out,
num_splines, num_walkers, row_stride);
}
#endif

View File

@ -0,0 +1,57 @@
#ifndef PHASE_FACTORS_H
#define PHASE_FACTORS_H
#include<complex>
void apply_phase_factors(float kPoints[], int makeTwoCopies[],
float pos[], float *phi_in[], float *phi_out[],
int num_splines, int num_walkers);
void apply_phase_factors(float kPoints[], int makeTwoCopies[],
float pos[], float *phi_in[], float *phi_out[],
float *GL_in[], float *GL_out[],
int num_splines, int num_walkers, int row_stride);
void apply_phase_factors(float kPoints[], int makeTwoCopies[], int TwoCopiesIndex[],
float pos[], float *phi_in[], float *phi_out[],
float *GL_in[], float *GL_out[],
int num_splines, int num_walkers, int row_stride);
void apply_phase_factors(double kPoints[], int makeTwoCopies[],
double pos[], double *phi_in[], double *phi_out[],
int num_splines, int num_walkers);
void apply_phase_factors(double kPoints[], int makeTwoCopies[],
double pos[], double *phi_in[], double *phi_out[],
double *GL_in[], double *GL_out[],
int num_splines, int num_walkers, int row_stride);
void apply_phase_factors(double kPoints[], int makeTwoCopies[], int TwoCopiesIndex[],
double pos[], double *phi_in[], double *phi_out[],
double *GL_in[], double *GL_out[],
int num_splines, int num_walkers, int row_stride);
#ifdef QMC_COMPLEX
// YingWai's complex implementation
void apply_phase_factors(float kPoints[], float pos[],
std::complex<float>* phi_in[], std::complex<float>* phi_out[],
int num_splines, int num_walkers);
void apply_phase_factors(double kPoints[], double pos[],
std::complex<double>* phi_in[], std::complex<double>* phi_out[],
int num_splines, int num_walkers);
void apply_phase_factors(float kPoints[], float pos[],
std::complex<float>* phi_in[], std::complex<float>* phi_out[],
std::complex<float>* GL_in[], std::complex<float>* GL_out[],
int num_splines, int num_walkers, int row_stride);
void apply_phase_factors(double kPoints[], double pos[],
std::complex<double>* phi_in[], std::complex<double>* phi_out[],
std::complex<double>* GL_in[], std::complex<double>* GL_out[],
int num_splines, int num_walkers, int row_stride);
#endif
#endif

View File

@ -313,7 +313,7 @@ public:
// Walker-parallel vectorized functions //
//////////////////////////////////////////
virtual void
reserve (PointerPool<gpu::device_vector<CudaRealType> > &pool) { }
reserve (PointerPool<gpu::device_vector<CudaValueType> > &pool) { }
virtual void
evaluate (std::vector<Walker_t*> &walkers, int iat, gpu::device_vector<CudaValueType*> &phi);
@ -350,8 +350,8 @@ typedef SPOSetBase* SPOSetBasePtr;
#endif
/***************************************************************************
* $RCSfile$ $Author$
* $Revision$ $Date$
* $Id$
* $RCSfile$ $Author: yingwai $
* $Revision: 7279 $ $Date: 2016-11-23 19:21:16 -0500 (Wed, 23 Nov 2016) $
* $Id: SPOSetBase.h 7279 2016-11-24 00:21:16Z yingwai $
***************************************************************************/

View File

@ -113,6 +113,7 @@ public:
typedef OrbitalBase::CudaValueType CudaValueType;
typedef OrbitalBase::CudaGradType CudaGradType;
typedef OrbitalBase::CudaRealType CudaRealType;
typedef OrbitalBase::RealMatrix_t RealMatrix_t;
typedef OrbitalBase::ValueMatrix_t ValueMatrix_t;
typedef OrbitalBase::GradMatrix_t GradMatrix_t;
typedef ParticleSet::Walker_t Walker_t;
@ -427,9 +428,8 @@ public:
void recompute (MCWalkerConfiguration &W, bool firstTime=true);
void reserve (PointerPool<gpu::device_vector<CudaRealType> > &pool,
void reserve (PointerPool<gpu::device_vector<CudaValueType> > &pool,
bool onlyOptimizable=false);
void getGradient (MCWalkerConfiguration &W, int iat,
std::vector<GradType> &grad);
void calcGradient (MCWalkerConfiguration &W, int iat,
@ -455,12 +455,16 @@ public:
std::vector<ValueType> &psi_ratios,
std::vector<GradType> &newG,
std::vector<ValueType> &newL);
#ifdef QMC_COMPLEX
void convertRatiosFromComplexToReal (std::vector<ValueType> &psi_ratios,
std::vector<RealType> &psi_ratios_real);
#endif
void ratio (std::vector<Walker_t*> &walkers, std::vector<int> &iatList,
std::vector<PosType> &rNew,
std::vector<ValueType> &psi_ratios,
std::vector<GradType> &newG,
std::vector<ValueType> &newL);
void NLratios (MCWalkerConfiguration &W,
gpu::device_vector<CUDA_PRECISION*> &Rlist,
gpu::device_vector<int*> &ElecList,
@ -468,6 +472,7 @@ public:
gpu::device_vector<CUDA_PRECISION*> &QuadPosList,
gpu::device_vector<CUDA_PRECISION*> &RatioList,
int numQuadPoints);
void NLratios (MCWalkerConfiguration &W, std::vector<NLjob> &jobList,
std::vector<PosType> &quadPoints, std::vector<ValueType> &psi_ratios);
@ -493,11 +498,10 @@ public:
GradMatrix_t& optG,
ValueMatrix_t& optL);
void evaluateDerivatives (MCWalkerConfiguration &W,
const opt_variables_type& optvars,
ValueMatrix_t &dlogpsi,
ValueMatrix_t &dhpsioverpsi);
RealMatrix_t &dlogpsi,
RealMatrix_t &dhpsioverpsi);
#endif
@ -507,7 +511,7 @@ public:
}
#endif
/***************************************************************************
* $RCSfile$ $Author$
* $Revision$ $Date$
* $Id$
* $RCSfile$ $Author: tillackaf $
* $Revision: 7408 $ $Date: 2017-01-10 13:29:49 -0500 (Tue, 10 Jan 2017) $
* $Id: TrialWaveFunction.h 7408 2017-01-10 18:29:49Z tillackaf $
***************************************************************************/

View File

@ -47,7 +47,7 @@ TrialWaveFunction::recompute
void
TrialWaveFunction::reserve
(PointerPool<gpu::device_vector<CudaRealType> > &pool,
(PointerPool<gpu::device_vector<CudaValueType> > &pool,
bool onlyOptimizable)
{
for(int i=0; i<Z.size(); i++)
@ -131,6 +131,8 @@ TrialWaveFunction::ratio (MCWalkerConfiguration &W, int iat,
myTimers[ii]->stop();
}
}
void
TrialWaveFunction::calcRatio (MCWalkerConfiguration &W, int iat,
std::vector<ValueType> &psi_ratios,
@ -149,6 +151,8 @@ TrialWaveFunction::calcRatio (MCWalkerConfiguration &W, int iat,
myTimers[ii]->stop();
}
}
void
TrialWaveFunction::addRatio (MCWalkerConfiguration &W, int iat,
std::vector<ValueType> &psi_ratios,
@ -166,7 +170,26 @@ TrialWaveFunction::addRatio (MCWalkerConfiguration &W, int iat,
myTimers[1+TIMER_SKIP*(Z.size()-1)]->stop();
}
#if defined (QMC_COMPLEX)
void
TrialWaveFunction::convertRatiosFromComplexToReal (std::vector<ValueType> &psi_ratios,
std::vector<RealType> &psi_ratios_real)
{
RealType PhaseValue;
if (psi_ratios.size() != psi_ratios_real.size() ) {
std::cerr << "Error: In " << __FILE__ << " , line " << __LINE__ << " , "
<< "input vector and output vector sizes unmatched." << std::endl;
abort();
}
for (int iw=0; iw<psi_ratios.size(); iw++)
{
psi_ratios_real[iw] = evaluateLogAndPhase(psi_ratios[iw], PhaseValue);
psi_ratios_real[iw] = std::exp(psi_ratios_real[iw]);
}
}
#endif
void
TrialWaveFunction::ratio (std::vector<Walker_t*> &walkers, std::vector<int> &iatList,
@ -378,8 +401,8 @@ TrialWaveFunction::evaluateOptimizableLog (MCWalkerConfiguration &W,
void
TrialWaveFunction::evaluateDerivatives (MCWalkerConfiguration &W,
const opt_variables_type& optvars,
ValueMatrix_t &dlogpsi,
ValueMatrix_t &dhpsioverpsi)
RealMatrix_t &dlogpsi,
RealMatrix_t &dhpsioverpsi)
{
for (int i=0,ii=DERIVS_TIMER; i<Z.size(); i++,ii+=TIMER_SKIP)
{