mirror of https://github.com/QMCPACK/qmcpack.git
Merge pull request #84 from atillack/qmcpack_gpu_complex
GPU Complex Implementation
This commit is contained in:
commit
63480e964e
|
@ -645,6 +645,9 @@ IF(QMC_CUDA)
|
|||
else(CUDA_NVCC_FLAGS MATCHES "arch")
|
||||
# Automatically set the default NVCC flags
|
||||
SET(CUDA_NVCC_FLAGS "-Drestrict=__restrict__;-DNO_CUDA_MAIN")
|
||||
if (QMC_COMPLEX)
|
||||
SET(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};-DQMC_COMPLEX")
|
||||
endif()
|
||||
if ( CMAKE_BUILD_TYPE STREQUAL "DEBUG" )
|
||||
SET(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};-g;-G")
|
||||
else()
|
||||
|
|
|
@ -22,9 +22,14 @@
|
|||
//
|
||||
// 1. Make sure libcublas.so is in the search path
|
||||
// 2. cd to build/ directory
|
||||
// 3. nvcc -o cuda_inverse -arch=sm_35 -lcublas -DCUDA_TEST_MAIN
|
||||
// 3. For real numbers, compile with
|
||||
// nvcc -o cuda_inverse -arch=sm_35 -lcublas -DCUDA_TEST_MAIN
|
||||
// ../src/Numerics/CUDA/cuda_inverse.cu
|
||||
//
|
||||
// For complex numbers, compile with
|
||||
// nvcc -o cuda_inverse -arch=sm_35 -lcublas -DCUDA_TEST_MAIN
|
||||
// -DQMC_COMPLEX=1 ../src/Numerics/CUDA/cuda_inverse.cu
|
||||
//
|
||||
// =============================================================== //
|
||||
|
||||
#include <cstdio>
|
||||
|
@ -32,8 +37,10 @@
|
|||
#include <sstream>
|
||||
#include <vector>
|
||||
#include <iostream>
|
||||
#include <complex>
|
||||
#include <cuda.h>
|
||||
#include <cublas_v2.h>
|
||||
#include <cuComplex.h>
|
||||
|
||||
#define CONVERT_BS 256
|
||||
#define INVERSE_BS 16
|
||||
|
@ -117,10 +124,31 @@ convert (Tdest **dest_list, Tsrc **src_list, int len)
|
|||
mydest[i] = (Tdest) mysrc[i];
|
||||
}
|
||||
|
||||
// Two matrix inversion functions
|
||||
// Convert for complex numbers
|
||||
template <typename Tdest, typename Tdest2, typename Tsrc>
|
||||
__global__ void
|
||||
convert_complex (Tdest **dest_list, Tsrc **src_list, int len)
|
||||
{
|
||||
__shared__ Tsrc *mysrc;
|
||||
__shared__ Tdest *mydest;
|
||||
if (threadIdx.x == 0)
|
||||
{
|
||||
mysrc = src_list[blockIdx.y];
|
||||
mydest = dest_list[blockIdx.y];
|
||||
}
|
||||
__syncthreads();
|
||||
int i = blockIdx.x * CONVERT_BS + threadIdx.x;
|
||||
if (i < len) {
|
||||
mydest[i].x = (Tdest2) mysrc[i].x;
|
||||
mydest[i].y = (Tdest2) mysrc[i].y;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Four matrix inversion functions
|
||||
// 1. for float matrices
|
||||
// useHigherPrecision = false --> single precision operations
|
||||
// useHigherPrecision = true --> double precision operations
|
||||
// useHigherPrecision = true --> double precision operations (default)
|
||||
void
|
||||
cublas_inverse (cublasHandle_t handle,
|
||||
float *Alist_d[], float *Ainvlist_d[],
|
||||
|
@ -228,6 +256,107 @@ cublas_inverse (cublasHandle_t handle,
|
|||
}
|
||||
|
||||
|
||||
// 3. for complex float matrices
|
||||
// useHigherPrecision = false --> single precision operations
|
||||
// useHigherPrecision = true --> double precision operations (default)
|
||||
void
|
||||
cublas_inverse (cublasHandle_t handle,
|
||||
std::complex<float> *Alist_d[], std::complex<float> *Ainvlist_d[],
|
||||
std::complex<float> *AWorklist_d[], std::complex<float> *AinvWorklist_d[],
|
||||
int N, int rowStride, int numMats,
|
||||
bool useHigherPrecision)
|
||||
{
|
||||
|
||||
// Info array tells if a matrix inversion is successful
|
||||
// = 0 : successful
|
||||
// = k : U(k,k) = 0; inversion failed
|
||||
int *infoArray;
|
||||
callAndCheckError( cudaMalloc((void**) &infoArray, numMats * sizeof(int)), __LINE__ );
|
||||
|
||||
// If double precision operations are desired...
|
||||
if (useHigherPrecision)
|
||||
{
|
||||
|
||||
// (i) convert elements in Alist from float to double, put them in AWorklist
|
||||
dim3 dimBlockConvert (CONVERT_BS);
|
||||
dim3 dimGridConvert ((N*rowStride + (CONVERT_BS-1)) / CONVERT_BS, numMats);
|
||||
convert_complex<cuDoubleComplex, double, cuComplex> <<< dimGridConvert, dimBlockConvert >>> ((cuDoubleComplex**)AWorklist_d, (cuComplex**)Alist_d, N*rowStride);
|
||||
|
||||
// (ii) call cublas to do matrix inversion
|
||||
// LU decomposition
|
||||
callAndCheckError( cublasZgetrfBatched( handle, N, (cuDoubleComplex**)AWorklist_d, rowStride, NULL, infoArray, numMats), __LINE__ );
|
||||
// Inversion
|
||||
#if (CUDA_VERSION >= 6050)
|
||||
callAndCheckError( cublasZgetriBatched( handle, N, (const cuDoubleComplex**)AWorklist_d, rowStride, NULL, (cuDoubleComplex**)AinvWorklist_d, rowStride, infoArray, numMats), __LINE__ );
|
||||
#else
|
||||
callAndCheckError( cublasZgetriBatched( handle, N, (cuDoubleComplex**)AWorklist_d, rowStride, NULL, (cuDoubleComplex**)AinvWorklist_d, rowStride, infoArray, numMats), __LINE__ );
|
||||
#endif
|
||||
|
||||
// (iii) convert results back to single precision
|
||||
convert_complex<cuComplex, float, cuDoubleComplex> <<< dimGridConvert, dimBlockConvert >>> ((cuComplex**)Ainvlist_d, (cuDoubleComplex**)AinvWorklist_d, N*rowStride);
|
||||
|
||||
}
|
||||
// else, carry out single precision operations
|
||||
else
|
||||
{
|
||||
// Call cublas to do matrix inversion
|
||||
// LU decomposition
|
||||
callAndCheckError( cublasCgetrfBatched( handle, N, (cuComplex**)Alist_d, rowStride, NULL,
|
||||
infoArray, numMats), __LINE__ );
|
||||
|
||||
// Inversion
|
||||
#if (CUDA_VERSION >= 6050)
|
||||
callAndCheckError( cublasCgetriBatched( handle, N, (const cuComplex**)Alist_d, rowStride, NULL, (cuComplex**)Ainvlist_d, rowStride, infoArray, numMats), __LINE__ );
|
||||
#else
|
||||
callAndCheckError( cublasCgetriBatched( handle, N, (cuComplex**)Alist_d, rowStride, NULL, (cuComplex**)Ainvlist_d, rowStride, infoArray, numMats), __LINE__ );
|
||||
#endif
|
||||
}
|
||||
|
||||
cudaDeviceSynchronize();
|
||||
|
||||
// Free resources
|
||||
cudaFree(infoArray);
|
||||
|
||||
}
|
||||
|
||||
// 4. for complex double matrices
|
||||
void
|
||||
cublas_inverse (cublasHandle_t handle,
|
||||
std::complex<double> *Alist_d[], std::complex<double> *Ainvlist_d[],
|
||||
std::complex<double> *AWorklist_d[], std::complex<double> *AinvWorklist_d[],
|
||||
int N, int rowStride, int numMats,
|
||||
bool useHigherPrecision)
|
||||
{
|
||||
|
||||
// Info array tells if a matrix inversion is successful
|
||||
// = 0 : successful
|
||||
// = k : U(k,k) = 0; inversion failed
|
||||
int *infoArray;
|
||||
callAndCheckError( cudaMalloc((void**) &infoArray, numMats * sizeof(int)), __LINE__ );
|
||||
|
||||
// (i) copy all the elements of Alist to AWorklist
|
||||
dim3 dimBlockConvert (CONVERT_BS);
|
||||
dim3 dimGridConvert ((N*rowStride + (CONVERT_BS-1)) / CONVERT_BS, numMats);
|
||||
convert_complex<cuDoubleComplex, double, cuDoubleComplex> <<< dimGridConvert, dimBlockConvert >>> ((cuDoubleComplex**)AWorklist_d, (cuDoubleComplex**)Alist_d, N*rowStride);
|
||||
|
||||
// (ii) call cublas to do matrix inversion
|
||||
// LU decomposition
|
||||
callAndCheckError( cublasZgetrfBatched( handle, N, (cuDoubleComplex**)AWorklist_d, rowStride, NULL, infoArray, numMats), __LINE__ );
|
||||
// Inversion
|
||||
#if (CUDA_VERSION >= 6050)
|
||||
callAndCheckError( cublasZgetriBatched( handle, N, (const cuDoubleComplex**)AWorklist_d, rowStride, NULL, (cuDoubleComplex**)Ainvlist_d, rowStride, infoArray, numMats), __LINE__ );
|
||||
#else
|
||||
callAndCheckError( cublasZgetriBatched( handle, N, (cuDoubleComplex**)AWorklist_d, rowStride, NULL, (cuDoubleComplex**)Ainvlist_d, rowStride, infoArray, numMats), __LINE__ );
|
||||
#endif
|
||||
|
||||
cudaDeviceSynchronize();
|
||||
|
||||
cudaFree(infoArray);
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
//////////////////////////////////////////////////////
|
||||
// Test routines //
|
||||
//////////////////////////////////////////////////////
|
||||
|
@ -270,8 +399,13 @@ test_cublas_inverse(int matSize, int numMats)
|
|||
T* A = (T*) malloc(sizeof(T) * numMats * N * row_stride);
|
||||
|
||||
for (int j=0; j<numMats; j++)
|
||||
for (int i=0; i<N*row_stride; i++)
|
||||
A[j*N*row_stride+i] = 1.0 * (drand48() - 0.5);
|
||||
for (int i=0; i<N*row_stride; i++) {
|
||||
#ifndef QMC_COMPLEX
|
||||
A[j*N*row_stride+i] = 1.0 * (drand48() - 0.5);
|
||||
#else
|
||||
A[j*N*row_stride+i] = T(1.0 * (drand48() - 0.5), 1.0 * (drand48() - 0.5));
|
||||
#endif
|
||||
}
|
||||
|
||||
// Allocate memory on device for each matrix
|
||||
for (int mat=0; mat<numMats; mat++)
|
||||
|
@ -328,10 +462,10 @@ test_cublas_inverse(int matSize, int numMats)
|
|||
for (int i=0; i<N; i++)
|
||||
for (int j=0; j<N; j++)
|
||||
{
|
||||
double val = 0.0;
|
||||
T val = 0.0;
|
||||
for (int k=0; k<N; k++)
|
||||
val += Ainv[i*row_stride+k] * A[mat*N*row_stride+k*row_stride+j];
|
||||
double diff = (i==j) ? (1.0f - val) : val;
|
||||
double diff = (i==j) ? (1.0f - std::real(val)) : std::real(val);
|
||||
error += diff * diff;
|
||||
}
|
||||
fprintf (stderr, "error = %1.8e\n", sqrt(error/(double)(N*N)));
|
||||
|
@ -378,8 +512,13 @@ int main(int argc, char** argv)
|
|||
exit(1);
|
||||
}
|
||||
|
||||
#ifndef QMC_COMPLEX
|
||||
test_cublas_inverse<double>(matSize, numMats);
|
||||
test_cublas_inverse<float>(matSize, numMats);
|
||||
#else
|
||||
test_cublas_inverse<std::complex<double> >(matSize, numMats);
|
||||
test_cublas_inverse<std::complex<float> >(matSize, numMats);
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
|
|
@ -5,7 +5,7 @@
|
|||
// Copyright (c) 2016 Jeongnim Kim and QMCPACK developers.
|
||||
//
|
||||
// File developed by: Ken Esler, kpesler@gmail.com, University of Illinois at Urbana-Champaign
|
||||
// Jeremy McMinnis, jmcminis@gmail.com, University of Illinois at Urbana-Champaign
|
||||
// Jeremy McMinnis, jmcminis@gmail.com, University of Illinois at Urbana-Champaign
|
||||
// Ying Wai Li, yingwaili@ornl.gov, Oak Ridge National Laboratory
|
||||
//
|
||||
// File created by: Ken Esler, kpesler@gmail.com, University of Illinois at Urbana-Champaign
|
||||
|
@ -15,6 +15,7 @@
|
|||
#ifndef CUDA_INVERSE_H
|
||||
#define CUDA_INVERSE_H
|
||||
|
||||
#include <complex>
|
||||
#include <cublas_v2.h>
|
||||
|
||||
//////////////////////////////
|
||||
|
@ -37,4 +38,25 @@ cublas_inverse (cublasHandle_t handle,
|
|||
int N, int rowStride, int numMats,
|
||||
bool useHigherPrecision = true);
|
||||
|
||||
|
||||
//////////////////////////////////////
|
||||
// Complex single / mixed precision //
|
||||
//////////////////////////////////////
|
||||
void
|
||||
cublas_inverse (cublasHandle_t handle,
|
||||
std::complex<float> *Alist_d[], std::complex<float> *Ainvlist_d[],
|
||||
std::complex<float> *AWorkList_d[], std::complex<float> *AinvWorkList_d[],
|
||||
int N, int rowStride, int numMats,
|
||||
bool useHigherPrecision = true);
|
||||
|
||||
//////////////////////////////
|
||||
// Complex double precision //
|
||||
//////////////////////////////
|
||||
void
|
||||
cublas_inverse (cublasHandle_t handle,
|
||||
std::complex<double> *Alist_d[], std::complex<double> *Ainvlist_d[],
|
||||
std::complex<double> *AWorklist_d[], std::complex<double> *AinvWorklist_d[],
|
||||
int N, int rowStride, int numMats,
|
||||
bool useHigherPrecision = true);
|
||||
|
||||
#endif
|
||||
|
|
|
@ -565,31 +565,37 @@ void MCWalkerConfiguration::updateLists_GPU()
|
|||
DataList_GPU.resize(nw);
|
||||
}
|
||||
hostlist.resize(nw);
|
||||
hostlist_valueType.resize(nw);
|
||||
hostlist_AA.resize(nw);
|
||||
|
||||
for (int iw=0; iw<nw; iw++)
|
||||
{
|
||||
if (WalkerList[iw]->R_GPU.size() != R.size())
|
||||
std::cerr << "Error in R_GPU size for iw = " << iw << "!\n";
|
||||
hostlist[iw] = (CUDA_PRECISION*)WalkerList[iw]->R_GPU.data();
|
||||
hostlist[iw] = (CudaRealType*)WalkerList[iw]->R_GPU.data();
|
||||
}
|
||||
RList_GPU = hostlist;
|
||||
|
||||
for (int iw=0; iw<nw; iw++)
|
||||
{
|
||||
if (WalkerList[iw]->Grad_GPU.size() != R.size())
|
||||
std::cerr << "Error in Grad_GPU size for iw = " << iw << "!\n";
|
||||
hostlist[iw] = (CUDA_PRECISION*)WalkerList[iw]->Grad_GPU.data();
|
||||
hostlist_valueType[iw] = (CudaValueType*)WalkerList[iw]->Grad_GPU.data();
|
||||
}
|
||||
GradList_GPU = hostlist;
|
||||
GradList_GPU = hostlist_valueType;
|
||||
|
||||
for (int iw=0; iw<nw; iw++)
|
||||
{
|
||||
if (WalkerList[iw]->Lap_GPU.size() != R.size())
|
||||
std::cerr << "Error in Lap_GPU size for iw = " << iw << "!\n";
|
||||
hostlist[iw] = (CUDA_PRECISION*)WalkerList[iw]->Lap_GPU.data();
|
||||
hostlist_valueType[iw] = (CudaValueType*)WalkerList[iw]->Lap_GPU.data();
|
||||
}
|
||||
LapList_GPU = hostlist;
|
||||
LapList_GPU = hostlist_valueType;
|
||||
|
||||
for (int iw=0; iw<nw; iw++)
|
||||
hostlist[iw] = WalkerList[iw]->cuda_DataSet.data();
|
||||
DataList_GPU = hostlist;
|
||||
hostlist_valueType[iw] = WalkerList[iw]->cuda_DataSet.data();
|
||||
DataList_GPU = hostlist_valueType;
|
||||
|
||||
for (int isp=0; isp<NumSpecies; isp++)
|
||||
{
|
||||
for (int iw=0; iw<nw; iw++)
|
||||
|
@ -629,10 +635,9 @@ void MCWalkerConfiguration::copyWalkersToGPU(bool copyGrad)
|
|||
copyWalkerGradToGPU();
|
||||
}
|
||||
|
||||
|
||||
void MCWalkerConfiguration::copyWalkerGradToGPU()
|
||||
{
|
||||
Grad_host.resize(WalkerList[0]->R.size());
|
||||
Grad_host.resize(WalkerList[0]->G.size());
|
||||
for (int iw=0; iw<WalkerList.size(); iw++)
|
||||
{
|
||||
for (int i=0; i<WalkerList[iw]->size(); i++)
|
||||
|
@ -642,7 +647,6 @@ void MCWalkerConfiguration::copyWalkerGradToGPU()
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
void MCWalkerConfiguration::proposeMove_GPU
|
||||
(std::vector<PosType> &newPos, int iat)
|
||||
{
|
||||
|
@ -713,7 +717,7 @@ void MCWalkerConfiguration::NLMove_GPU(std::vector<Walker_t*> &walkers,
|
|||
}
|
||||
|
||||
/***************************************************************************
|
||||
* $RCSfile$ $Author$
|
||||
* $Revision$ $Date$
|
||||
* $Id$
|
||||
* $RCSfile$ $Author: yingwai $
|
||||
* $Revision: 7279 $ $Date: 2016-11-23 19:21:16 -0500 (Wed, 23 Nov 2016) $
|
||||
* $Id: MCWalkerConfiguration.cpp 7279 2016-11-24 00:21:16Z yingwai $
|
||||
***************************************************************************/
|
||||
|
|
|
@ -91,23 +91,27 @@ public:
|
|||
// laplacians for each walker. These vectors .data() is often
|
||||
// passed to GPU kernels.
|
||||
#ifdef QMC_CUDA
|
||||
gpu::device_vector<CUDA_PRECISION*> RList_GPU, GradList_GPU, LapList_GPU;
|
||||
|
||||
gpu::device_vector<CudaRealType*> RList_GPU;
|
||||
gpu::device_vector<CudaValueType*> GradList_GPU, LapList_GPU;
|
||||
// First index is the species. The second index is the walker
|
||||
std::vector<gpu::device_vector<CUDA_PRECISION_FULL*> > RhokLists_GPU;
|
||||
gpu::device_vector<CUDA_PRECISION*> DataList_GPU;
|
||||
gpu::device_vector<TinyVector<CUDA_PRECISION,OHMMS_DIM> > Rnew_GPU;
|
||||
gpu::host_vector<TinyVector<CUDA_PRECISION,OHMMS_DIM> > Rnew_host;
|
||||
gpu::device_vector<CudaValueType*> DataList_GPU;
|
||||
gpu::device_vector<CudaPosType> Rnew_GPU;
|
||||
gpu::host_vector<CudaPosType> Rnew_host;
|
||||
std::vector<PosType> Rnew;
|
||||
gpu::device_vector<CudaRealType*> NLlist_GPU;
|
||||
gpu::host_vector<CudaRealType*> NLlist_host;
|
||||
gpu::host_vector<CudaRealType*> hostlist;
|
||||
gpu::host_vector<CudaValueType*> hostlist_valueType;
|
||||
gpu::host_vector<CUDA_PRECISION_FULL*> hostlist_AA;
|
||||
gpu::host_vector<CudaPosType> R_host;
|
||||
gpu::host_vector<CudaGradType> Grad_host;
|
||||
gpu::device_vector<int> iatList_GPU;
|
||||
gpu::host_vector<int> iatList_host;
|
||||
gpu::device_vector<CUDA_PRECISION*> NLlist_GPU;
|
||||
gpu::host_vector<CUDA_PRECISION*> NLlist_host;
|
||||
std::vector<PosType> Rnew;
|
||||
gpu::device_vector<int> AcceptList_GPU;
|
||||
gpu::host_vector<int> AcceptList_host;
|
||||
gpu::host_vector<CUDA_PRECISION*> hostlist;
|
||||
gpu::host_vector<CUDA_PRECISION_FULL*> hostlist_AA;
|
||||
gpu::host_vector<TinyVector<CUDA_PRECISION, OHMMS_DIM> > R_host;
|
||||
gpu::host_vector<TinyVector<CudaValueType, OHMMS_DIM> > Grad_host;
|
||||
|
||||
void allocateGPU(size_t buffersize);
|
||||
void copyWalkersToGPU(bool copyGrad=false);
|
||||
void copyWalkerGradToGPU();
|
||||
|
@ -411,7 +415,7 @@ private:
|
|||
}
|
||||
#endif
|
||||
/***************************************************************************
|
||||
* $RCSfile$ $Author$
|
||||
* $Revision$ $Date$
|
||||
* $Id$
|
||||
* $RCSfile$ $Author: yingwai $
|
||||
* $Revision: 7383 $ $Date: 2016-12-28 12:15:25 -0500 (Wed, 28 Dec 2016) $
|
||||
* $Id: MCWalkerConfiguration.h 7383 2016-12-28 17:15:25Z yingwai $
|
||||
st***************************************************************************/
|
||||
|
|
|
@ -75,10 +75,16 @@ struct Walker
|
|||
/** typedef for value data type. */
|
||||
typedef typename t_traits::ValueType ValueType;
|
||||
#ifdef QMC_CUDA
|
||||
/** typedef for real data type */
|
||||
/** typedef for CUDA real data type */
|
||||
typedef typename t_traits::CudaRealType CudaRealType;
|
||||
/** typedef for value data type. */
|
||||
/** typedef for CUDA value data type. */
|
||||
typedef typename t_traits::CudaValueType CudaValueType;
|
||||
/** array of particles */
|
||||
typedef typename t_traits::CudaPosType CudaPosType;
|
||||
/** array of gradients */
|
||||
typedef typename t_traits::CudaGradType CudaGradType;
|
||||
/** array of laplacians */
|
||||
typedef typename t_traits::CudaValueType CudaLapType;
|
||||
#endif
|
||||
/** array of particles */
|
||||
typedef typename p_traits::ParticlePos_t ParticlePos_t;
|
||||
|
@ -140,13 +146,13 @@ struct Walker
|
|||
/// Data for GPU-vectorized versions
|
||||
#ifdef QMC_CUDA
|
||||
static int cuda_DataSize;
|
||||
typedef gpu::device_vector<CUDA_PRECISION> cuda_Buffer_t;
|
||||
typedef gpu::device_vector<CudaValueType> cuda_Buffer_t;
|
||||
cuda_Buffer_t cuda_DataSet;
|
||||
// Note that R_GPU has size N+1. The last element contains the
|
||||
// proposed position for single-particle moves.
|
||||
gpu::device_vector<TinyVector<CudaRealType,OHMMS_DIM> > R_GPU;
|
||||
gpu::device_vector<TinyVector<CudaValueType,OHMMS_DIM> > Grad_GPU;
|
||||
gpu::device_vector<CUDA_PRECISION> Lap_GPU;
|
||||
gpu::device_vector<CudaPosType> R_GPU;
|
||||
gpu::device_vector<CudaGradType> Grad_GPU;
|
||||
gpu::device_vector<CudaLapType> Lap_GPU;
|
||||
gpu::device_vector<CUDA_PRECISION_FULL> Rhok_GPU;
|
||||
int k_species_stride;
|
||||
inline void resizeCuda(int size, int num_species, int num_k)
|
||||
|
@ -421,10 +427,10 @@ struct Walker
|
|||
//+R.size()*(DIM*2*sizeof(RealType)+(DIM+1)*sizeof(ValueType));//R+Drift+G+L
|
||||
#ifdef QMC_CUDA
|
||||
bsize += 3 *sizeof (int); // size and N and M
|
||||
bsize += cuda_DataSize * sizeof(CUDA_PRECISION); // cuda_DataSet
|
||||
bsize += R.size() * OHMMS_DIM * sizeof(CUDA_PRECISION); // R_GPU
|
||||
bsize += R.size() * OHMMS_DIM * sizeof(CudaValueType); // Grad_GPU
|
||||
bsize += R.size() * 1 * sizeof(CUDA_PRECISION); // Lap_GPU
|
||||
bsize += cuda_DataSize * sizeof(CudaValueType); // cuda_DataSet
|
||||
bsize += R.size() * OHMMS_DIM * sizeof(CudaPosType); // R_GPU
|
||||
bsize += G.size() * OHMMS_DIM * sizeof(CudaValueType); // Grad_GPU
|
||||
bsize += L.size() * 1 * sizeof(CudaLapType); // Lap_GPU
|
||||
bsize += Rhok_GPU.size() * sizeof(CUDA_PRECISION_FULL); // Rhok
|
||||
#endif
|
||||
return bsize;
|
||||
|
@ -453,23 +459,30 @@ struct Walker
|
|||
m.Pack(&(PHindex[0]),PHindex.size());
|
||||
#ifdef QMC_CUDA
|
||||
// Pack GPU data
|
||||
std::vector<CUDA_PRECISION> host_data;
|
||||
std::vector<CudaValueType> host_data;
|
||||
std::vector<CUDA_PRECISION_FULL> host_rhok;
|
||||
std::vector<TinyVector<CUDA_PRECISION,OHMMS_DIM> > R_host;
|
||||
std::vector<TinyVector<CudaValueType,OHMMS_DIM> > Grad_host;
|
||||
std::vector<CUDA_PRECISION> host_lapl;
|
||||
std::vector<CudaPosType> R_host;
|
||||
std::vector<CudaGradType> Grad_host;
|
||||
std::vector<CudaLapType> host_lapl;
|
||||
|
||||
cuda_DataSet.copyFromGPU(host_data);
|
||||
R_GPU.copyFromGPU(R_host);
|
||||
Grad_GPU.copyFromGPU(Grad_host);
|
||||
Lap_GPU.copyFromGPU(host_lapl);
|
||||
int size = host_data.size();
|
||||
int N = R_host.size();
|
||||
m.Pack(size);
|
||||
m.Pack(N);
|
||||
m.Pack(&(host_data[0]), host_data.size());
|
||||
m.Pack(&(R_host[0][0]), OHMMS_DIM*R_host.size());
|
||||
Grad_GPU.copyFromGPU(Grad_host);
|
||||
#ifdef QMC_COMPLEX
|
||||
m.Pack(reinterpret_cast<CudaRealType*>(&(host_data[0])), host_data.size()*2);
|
||||
m.Pack(reinterpret_cast<CudaRealType*>(&(Grad_host[0][0])),OHMMS_DIM*Grad_host.size()*2);
|
||||
m.Pack(reinterpret_cast<CudaRealType*>(&(host_lapl[0])), host_lapl.size()*2);
|
||||
#else
|
||||
m.Pack(&(host_data[0]), host_data.size());
|
||||
m.Pack(&(Grad_host[0][0]), OHMMS_DIM*Grad_host.size());
|
||||
Lap_GPU.copyFromGPU(host_lapl);
|
||||
m.Pack(&(host_lapl[0]), host_lapl.size());
|
||||
#endif
|
||||
Rhok_GPU.copyFromGPU(host_rhok);
|
||||
int M = host_rhok.size();
|
||||
m.Pack(M);
|
||||
|
@ -500,24 +513,33 @@ struct Walker
|
|||
m.Unpack(&(PropertyHistory[iat][0]),PropertyHistory[iat].size());
|
||||
m.Unpack(&(PHindex[0]),PHindex.size());
|
||||
#ifdef QMC_CUDA
|
||||
// Pack GPU data
|
||||
std::vector<CUDA_PRECISION> host_data;
|
||||
// Unpack GPU data
|
||||
std::vector<CudaValueType> host_data;
|
||||
std::vector<CUDA_PRECISION_FULL> host_rhok;
|
||||
std::vector<TinyVector<CUDA_PRECISION,OHMMS_DIM> > R_host;
|
||||
std::vector<CUDA_PRECISION> host_lapl;
|
||||
std::vector<CudaPosType> R_host;
|
||||
std::vector<CudaGradType> Grad_host;
|
||||
std::vector<CudaLapType> host_lapl;
|
||||
|
||||
int size, N;
|
||||
m.Unpack(size);
|
||||
m.Unpack(N);
|
||||
host_data.resize(size);
|
||||
R_host.resize(N);
|
||||
Grad_host.resize(N);
|
||||
host_lapl.resize(N);
|
||||
m.Unpack(&(host_data[0]), size);
|
||||
cuda_DataSet = host_data;
|
||||
m.Unpack(&(R_host[0][0]), OHMMS_DIM*N);
|
||||
R_GPU = R_host;
|
||||
m.Unpack(&(R_host[0][0]), OHMMS_DIM*N);
|
||||
Grad_GPU = R_host;
|
||||
#ifdef QMC_COMPLEX
|
||||
m.Unpack(reinterpret_cast<CudaRealType*>(&(host_data[0])), size*2);
|
||||
m.Unpack(reinterpret_cast<CudaRealType*>(&(Grad_host[0][0])),OHMMS_DIM*N*2);
|
||||
m.Unpack(reinterpret_cast<CudaRealType*>(&(host_lapl[0])), N*2);
|
||||
#else
|
||||
m.Unpack(&(host_data[0]), size);
|
||||
m.Unpack(&(Grad_host[0][0]), OHMMS_DIM*N);
|
||||
m.Unpack(&(host_lapl[0]), N);
|
||||
#endif
|
||||
cuda_DataSet = host_data;
|
||||
Grad_GPU = Grad_host;
|
||||
Lap_GPU = host_lapl;
|
||||
int M;
|
||||
m.Unpack(M);
|
||||
|
@ -543,7 +565,7 @@ std::ostream& operator<<(std::ostream& out, const Walker<RealType,PA>& rhs)
|
|||
|
||||
#endif
|
||||
/***************************************************************************
|
||||
* $RCSfile$ $Author$
|
||||
* $Revision$ $Date$
|
||||
* $Id$
|
||||
* $RCSfile$ $Author: yingwai $
|
||||
* $Revision: 7405 $ $Date: 2017-01-09 15:46:55 -0500 (Mon, 09 Jan 2017) $
|
||||
* $Id: Walker.h 7405 2017-01-09 20:46:55Z yingwai $
|
||||
***************************************************************************/
|
||||
|
|
|
@ -23,6 +23,7 @@
|
|||
#include "Utilities/RandomGenerator.h"
|
||||
#include "ParticleBase/RandomSeqGenerator.h"
|
||||
#include "QMCDrivers/DriftOperators.h"
|
||||
#include "type_traits/scalar_traits.h"
|
||||
|
||||
|
||||
namespace qmcplusplus
|
||||
|
@ -91,8 +92,13 @@ bool DMCcuda::run()
|
|||
std::vector<PosType> delpos(nw);
|
||||
std::vector<PosType> dr(nw);
|
||||
std::vector<PosType> newpos(nw);
|
||||
std::vector<ValueType> ratios(nw), rplus(nw), rminus(nw), R2prop(nw), R2acc(nw);
|
||||
std::vector<PosType> oldG(nw), newG(nw);
|
||||
//std::vector<ValueType> ratios(nw), rplus(nw), rminus(nw), R2prop(nw), R2acc(nw);
|
||||
std::vector<ValueType> ratios(nw), rplus(nw), rminus(nw);
|
||||
std::vector<RealType> R2prop(nw), R2acc(nw);
|
||||
#ifdef QMC_COMPLEX
|
||||
std::vector<RealType> ratios_real(nw);
|
||||
#endif
|
||||
std::vector<GradType> oldG(nw), newG(nw);
|
||||
std::vector<ValueType> oldL(nw), newL(nw);
|
||||
std::vector<Walker_t*> accepted(nw);
|
||||
Matrix<ValueType> lapl(nw, nat);
|
||||
|
@ -119,6 +125,9 @@ bool DMCcuda::run()
|
|||
dr.resize(nw);
|
||||
newpos.resize(nw);
|
||||
ratios.resize(nw);
|
||||
#ifdef QMC_COMPLEX
|
||||
ratios_real.resize(nw);
|
||||
#endif
|
||||
rplus.resize(nw);
|
||||
rminus.resize(nw);
|
||||
oldG.resize(nw);
|
||||
|
@ -156,9 +165,17 @@ bool DMCcuda::run()
|
|||
{
|
||||
delpos[iw] *= m_sqrttau;
|
||||
oldScale[iw] = getDriftScale(m_tauovermass,oldG[iw]);
|
||||
#ifdef QMC_COMPLEX
|
||||
convert(oldScale[iw] * oldG[iw], dr[iw]);
|
||||
dr[iw] += delpos[iw];
|
||||
#else
|
||||
dr[iw] = delpos[iw] + (oldScale[iw]*oldG[iw]);
|
||||
#endif
|
||||
newpos[iw]=W[iw]->R[iat] + dr[iw];
|
||||
ratios[iw] = 1.0;
|
||||
#ifdef QMC_COMPLEX
|
||||
ratios_real[iw] = 1.0;
|
||||
#endif
|
||||
R2prop[iw] += dot(delpos[iw], delpos[iw]);
|
||||
}
|
||||
W.proposeMove_GPU(newpos, iat);
|
||||
|
@ -171,21 +188,42 @@ bool DMCcuda::run()
|
|||
std::vector<RealType> rand_v(nw);
|
||||
for(int iw=0; iw<nw; ++iw)
|
||||
{
|
||||
#ifdef QMC_COMPLEX
|
||||
PosType drOld = 0.0;
|
||||
convert(oldScale[iw] * oldG[iw], drOld);
|
||||
drOld = newpos[iw] - (W[iw]->R[iat] + drOld);
|
||||
#else
|
||||
PosType drOld =
|
||||
newpos[iw] - (W[iw]->R[iat] + oldScale[iw]*oldG[iw]);
|
||||
#endif
|
||||
logGf_v[iw] = -m_oneover2tau * dot(drOld, drOld);
|
||||
rand_v[iw] = Random();
|
||||
}
|
||||
Psi.addRatio(W,iat,ratios, newG, newL);
|
||||
Psi.addRatio(W, iat, ratios, newG, newL);
|
||||
#ifdef QMC_COMPLEX
|
||||
Psi.convertRatiosFromComplexToReal(ratios, ratios_real);
|
||||
#endif
|
||||
for(int iw=0; iw<nw; ++iw)
|
||||
{
|
||||
newScale[iw] = getDriftScale(m_tauovermass,newG[iw]);
|
||||
#ifdef QMC_COMPLEX
|
||||
PosType drNew = 0.0;
|
||||
convert(newScale[iw] * newG[iw], drNew);
|
||||
drNew += newpos[iw] - W[iw]->R[iat];
|
||||
#else
|
||||
PosType drNew =
|
||||
(newpos[iw] + newScale[iw]*newG[iw]) - W[iw]->R[iat];
|
||||
#endif
|
||||
RealType logGb = -m_oneover2tau * dot(drNew, drNew);
|
||||
RealType x = logGb - logGf_v[iw];
|
||||
|
||||
#ifdef QMC_COMPLEX
|
||||
RealType prob = ratios_real[iw]*ratios_real[iw]*std::exp(x);
|
||||
if(acc[iw] && rand_v[iw] < prob && ratios_real[iw] > 0.0)
|
||||
#else
|
||||
RealType prob = ratios[iw]*ratios[iw]*std::exp(x);
|
||||
if(acc[iw] && rand_v[iw] < prob && ratios[iw] > 0.0)
|
||||
#endif
|
||||
{
|
||||
accepted.push_back(W[iw]);
|
||||
nAccept++;
|
||||
|
@ -284,10 +322,20 @@ bool DMCcuda::run()
|
|||
RealType v2=0.0, v2bar=0.0;
|
||||
for(int iat=0; iat<nat; iat++)
|
||||
{
|
||||
#ifdef QMC_COMPLEX
|
||||
v2 += dot_real(W.G[iat],W.G[iat]);
|
||||
#else
|
||||
// should be removed when things work fine
|
||||
v2 += dot(W.G[iat],W.G[iat]);
|
||||
#endif
|
||||
RealType newscale = getDriftScale(m_tauovermass,newG[iw]);
|
||||
#ifdef QMC_COMPLEX
|
||||
v2 += m_tauovermass * m_tauovermass * dot_real(newG[iw],newG[iw]);
|
||||
v2bar += newscale * newscale * dot_real(newG[iw],newG[iw]);
|
||||
#else
|
||||
v2 += m_tauovermass * m_tauovermass * dot(newG[iw],newG[iw]);
|
||||
v2bar += newscale * newscale * dot(newG[iw],newG[iw]);
|
||||
#endif
|
||||
}
|
||||
//RealType scNew = std::sqrt(V2bar[iw] / V2[iw]);
|
||||
RealType scNew = std::sqrt(v2bar/v2);
|
||||
|
@ -346,8 +394,13 @@ bool DMCcuda::runWithNonlocal()
|
|||
std::vector<PosType> delpos(nw);
|
||||
std::vector<PosType> dr(nw);
|
||||
std::vector<PosType> newpos(nw);
|
||||
std::vector<ValueType> ratios(nw), rplus(nw), rminus(nw), R2prop(nw), R2acc(nw);
|
||||
std::vector<PosType> oldG(nw), newG(nw);
|
||||
//std::vector<ValueType> ratios(nw), rplus(nw), rminus(nw), R2prop(nw), R2acc(nw);
|
||||
std::vector<ValueType> ratios(nw), rplus(nw), rminus(nw);
|
||||
std::vector<RealType> R2prop(nw), R2acc(nw);
|
||||
#ifdef QMC_COMPLEX
|
||||
std::vector<RealType> ratios_real(nw);
|
||||
#endif
|
||||
std::vector<GradType> oldG(nw), newG(nw);
|
||||
std::vector<ValueType> oldL(nw), newL(nw);
|
||||
std::vector<Walker_t*> accepted(nw);
|
||||
Matrix<ValueType> lapl(nw, nat);
|
||||
|
@ -372,6 +425,9 @@ bool DMCcuda::runWithNonlocal()
|
|||
dr.resize(nw);
|
||||
newpos.resize(nw);
|
||||
ratios.resize(nw);
|
||||
#ifdef QMC_COMPLEX
|
||||
ratios_real.resize(nw);
|
||||
#endif
|
||||
rplus.resize(nw);
|
||||
rminus.resize(nw);
|
||||
oldG.resize(nw);
|
||||
|
@ -401,7 +457,12 @@ bool DMCcuda::runWithNonlocal()
|
|||
{
|
||||
delpos[iw] *= m_sqrttau;
|
||||
oldScale[iw] = getDriftScale(m_tauovermass,oldG[iw]);
|
||||
#ifdef QMC_COMPLEX
|
||||
convert(oldScale[iw] * oldG[iw], dr[iw]);
|
||||
dr[iw] += delpos[iw];
|
||||
#else
|
||||
dr[iw] = delpos[iw] + (oldScale[iw]*oldG[iw]);
|
||||
#endif
|
||||
newpos[iw]=W[iw]->R[iat] + dr[iw];
|
||||
ratios[iw] = 1.0;
|
||||
R2prop[iw] += dot(delpos[iw], delpos[iw]);
|
||||
|
@ -414,21 +475,41 @@ bool DMCcuda::runWithNonlocal()
|
|||
std::vector<RealType> rand_v(nw);
|
||||
for(int iw=0; iw<nw; ++iw)
|
||||
{
|
||||
#ifdef QMC_COMPLEX
|
||||
PosType drOld = 0.0;
|
||||
convert(oldScale[iw] * oldG[iw], drOld);
|
||||
drOld = newpos[iw] - (W[iw]->R[iat] + drOld);
|
||||
#else
|
||||
PosType drOld =
|
||||
newpos[iw] - (W[iw]->R[iat] + oldScale[iw]*oldG[iw]);
|
||||
#endif
|
||||
logGf_v[iw] = -m_oneover2tau * dot(drOld, drOld);
|
||||
rand_v[iw] = Random();
|
||||
}
|
||||
Psi.addRatio(W,iat,ratios,newG, newL);
|
||||
Psi.addRatio(W, iat, ratios, newG, newL);
|
||||
#ifdef QMC_COMPLEX
|
||||
Psi.convertRatiosFromComplexToReal(ratios, ratios_real);
|
||||
#endif
|
||||
for(int iw=0; iw<nw; ++iw)
|
||||
{
|
||||
newScale[iw] = getDriftScale(m_tauovermass,newG[iw]);
|
||||
#ifdef QMC_COMPLEX
|
||||
PosType drNew = 0.0;
|
||||
convert(newScale[iw] * newG[iw], drNew);
|
||||
drNew += newpos[iw] - W[iw]->R[iat];
|
||||
#else
|
||||
PosType drNew =
|
||||
(newpos[iw] + newScale[iw]*newG[iw]) - W[iw]->R[iat];
|
||||
#endif
|
||||
RealType logGb = -m_oneover2tau * dot(drNew, drNew);
|
||||
RealType x = logGb - logGf_v[iw];
|
||||
#ifdef QMC_COMPLEX
|
||||
RealType prob = ratios_real[iw]*ratios_real[iw]*std::exp(x);
|
||||
if(rand_v[iw] < prob && ratios_real[iw] > 0.0)
|
||||
#else
|
||||
RealType prob = ratios[iw]*ratios[iw]*std::exp(x);
|
||||
if(rand_v[iw] < prob && ratios[iw] > 0.0)
|
||||
#endif
|
||||
{
|
||||
accepted.push_back(W[iw]);
|
||||
nAccept++;
|
||||
|
@ -558,7 +639,7 @@ void DMCcuda::resetRun()
|
|||
PointerPool<Walker_t::cuda_Buffer_t > pool;
|
||||
Psi.reserve (pool);
|
||||
app_log() << "Each walker requires "
|
||||
<< pool.getTotalSize() * sizeof(CudaRealType)
|
||||
<< pool.getTotalSize() * sizeof(CudaValueType)
|
||||
<< " bytes in GPU memory.\n";
|
||||
app_log() << "Preparing to allocate " << W.WalkerList.size()
|
||||
<< " walkers.\n";
|
||||
|
|
|
@ -65,11 +65,11 @@ QMCCostFunctionCUDA::Return_t QMCCostFunctionCUDA::correlatedSampling(bool needD
|
|||
int numParams = OptVariablesForPsi.size();
|
||||
int numPtcl = W.getTotalNum();
|
||||
std::vector<RealType> logpsi_new(nw), logpsi_fixed(nw), KE(nw);
|
||||
TrialWaveFunction::ValueMatrix_t d_logpsi_dalpha, d_hpsioverpsi_dalpha;
|
||||
TrialWaveFunction::GradMatrix_t fixedG(nw, numPtcl);
|
||||
TrialWaveFunction::ValueMatrix_t fixedL(nw, numPtcl);
|
||||
TrialWaveFunction::GradMatrix_t newG(nw, numPtcl);
|
||||
TrialWaveFunction::ValueMatrix_t newL(nw, numPtcl);
|
||||
RealMatrix_t d_logpsi_dalpha, d_hpsioverpsi_dalpha;
|
||||
GradMatrix_t fixedG(nw, numPtcl);
|
||||
ValueMatrix_t fixedL(nw, numPtcl);
|
||||
GradMatrix_t newG(nw, numPtcl);
|
||||
ValueMatrix_t newL(nw, numPtcl);
|
||||
if (needDerivs)
|
||||
{
|
||||
d_logpsi_dalpha.resize(nw, numParams);
|
||||
|
|
|
@ -54,13 +54,14 @@ public:
|
|||
|
||||
protected:
|
||||
Matrix<Return_t> Records;
|
||||
typedef TrialWaveFunction::RealMatrix_t RealMatrix_t;
|
||||
typedef TrialWaveFunction::ValueMatrix_t ValueMatrix_t;
|
||||
typedef TrialWaveFunction::GradMatrix_t GradMatrix_t;
|
||||
/** Temp derivative properties and Hderivative properties of all the walkers
|
||||
*/
|
||||
std::vector<std::vector<Return_t> > TempDerivRecords;
|
||||
std::vector<std::vector<Return_t> > TempHDerivRecords;
|
||||
ValueMatrix_t LogPsi_Derivs, LocE_Derivs;
|
||||
RealMatrix_t LogPsi_Derivs, LocE_Derivs;
|
||||
ValueMatrix_t d2logPsi_opt, d2logPsi_fixed;
|
||||
GradMatrix_t dlogPsi_opt, dlogPsi_fixed;
|
||||
|
||||
|
|
|
@ -22,6 +22,7 @@
|
|||
#include "ParticleBase/RandomSeqGenerator.h"
|
||||
#include "Message/CommOperators.h"
|
||||
#include "QMCDrivers/DriftOperators.h"
|
||||
#include "type_traits/scalar_traits.h"
|
||||
#include "qmc_common.h"
|
||||
|
||||
namespace qmcplusplus
|
||||
|
@ -63,6 +64,9 @@ void VMCcuda::advanceWalkers()
|
|||
std::vector<PosType> delpos(nw);
|
||||
std::vector<PosType> newpos(nw);
|
||||
std::vector<ValueType> ratios(nw);
|
||||
#ifdef QMC_COMPLEX
|
||||
std::vector<RealType> ratios_real(nw);
|
||||
#endif
|
||||
std::vector<GradType> newG(nw);
|
||||
std::vector<ValueType> newL(nw);
|
||||
std::vector<Walker_t*> accepted(nw);
|
||||
|
@ -75,19 +79,29 @@ void VMCcuda::advanceWalkers()
|
|||
makeGaussRandomWithEngine(delpos,Random);
|
||||
for(int iw=0; iw<nw; ++iw)
|
||||
{
|
||||
PosType G = W[iw]->G[iat];
|
||||
GradType G = W[iw]->G[iat];
|
||||
newpos[iw]=W[iw]->R[iat] + m_sqrttau*delpos[iw];
|
||||
ratios[iw] = 1.0;
|
||||
#ifdef QMC_COMPLEX
|
||||
ratios_real[iw] = 1.0;
|
||||
#endif
|
||||
}
|
||||
W.proposeMove_GPU(newpos, iat);
|
||||
Psi.ratio(W,iat,ratios,newG, newL);
|
||||
#ifdef QMC_COMPLEX
|
||||
Psi.convertRatiosFromComplexToReal(ratios, ratios_real);
|
||||
#endif
|
||||
accepted.clear();
|
||||
std::vector<bool> acc(nw, true);
|
||||
if (W.UseBoundBox)
|
||||
checkBounds (newpos, acc);
|
||||
for(int iw=0; iw<nw; ++iw)
|
||||
{
|
||||
#ifdef QMC_COMPLEX
|
||||
if(acc[iw] && ratios_real[iw]*ratios_real[iw] > Random())
|
||||
#else
|
||||
if(acc[iw] && ratios[iw]*ratios[iw] > Random())
|
||||
#endif
|
||||
{
|
||||
accepted.push_back(W[iw]);
|
||||
nAccept++;
|
||||
|
@ -224,7 +238,10 @@ void VMCcuda::advanceWalkersWithDrift()
|
|||
std::vector<PosType> dr(nw);
|
||||
std::vector<PosType> newpos(nw);
|
||||
std::vector<ValueType> ratios(nw);
|
||||
std::vector<PosType> oldG(nw), newG(nw);
|
||||
#ifdef QMC_COMPLEX
|
||||
std::vector<RealType> ratios_real(nw);
|
||||
#endif
|
||||
std::vector<GradType> oldG(nw), newG(nw);
|
||||
std::vector<ValueType> oldL(nw), newL(nw);
|
||||
std::vector<Walker_t*> accepted(nw);
|
||||
|
||||
|
@ -239,9 +256,17 @@ void VMCcuda::advanceWalkersWithDrift()
|
|||
for(int iw=0; iw<nw; iw++)
|
||||
{
|
||||
oldScale[iw] = getDriftScale(m_tauovermass,oldG[iw]);
|
||||
#ifdef QMC_COMPLEX
|
||||
convert(oldScale[iw]*oldG[iw], dr[iw]);
|
||||
dr[iw] += m_sqrttau * delpos[iw];
|
||||
#else
|
||||
dr[iw] = (m_sqrttau*delpos[iw]) + (oldScale[iw]*oldG[iw]);
|
||||
#endif
|
||||
newpos[iw]=W[iw]->R[iat] + dr[iw];
|
||||
ratios[iw] = 1.0;
|
||||
#ifdef QMC_COMPLEX
|
||||
ratios_real[iw] = 1.0;
|
||||
#endif
|
||||
}
|
||||
W.proposeMove_GPU(newpos, iat);
|
||||
Psi.calcRatio(W,iat,ratios,newG, newL);
|
||||
|
@ -253,22 +278,41 @@ void VMCcuda::advanceWalkersWithDrift()
|
|||
std::vector<RealType> rand_v(nw);
|
||||
for(int iw=0; iw<nw; ++iw)
|
||||
{
|
||||
#ifdef QMC_COMPLEX
|
||||
PosType drOld = 0.0;
|
||||
convert(oldScale[iw] * oldG[iw], drOld);
|
||||
drOld = newpos[iw] - (W[iw]->R[iat] + drOld);
|
||||
#else
|
||||
PosType drOld =
|
||||
newpos[iw] - (W[iw]->R[iat] + oldScale[iw]*oldG[iw]);
|
||||
#endif
|
||||
logGf_v[iw] = -m_oneover2tau * dot(drOld, drOld);
|
||||
rand_v[iw] = Random();
|
||||
}
|
||||
Psi.addRatio(W, iat, ratios, newG, newL);
|
||||
#ifdef QMC_COMPLEX
|
||||
Psi.convertRatiosFromComplexToReal(ratios, ratios_real);
|
||||
#endif
|
||||
for(int iw=0; iw<nw; ++iw)
|
||||
{
|
||||
newScale[iw] = getDriftScale(m_tauovermass,newG[iw]);
|
||||
#ifdef QMC_COMPLEX
|
||||
PosType drNew = 0.0;
|
||||
convert(newScale[iw] * newG[iw], drNew);
|
||||
drNew += newpos[iw] - W[iw]->R[iat];
|
||||
#else
|
||||
PosType drNew =
|
||||
(newpos[iw] + newScale[iw]*newG[iw]) - W[iw]->R[iat];
|
||||
#endif
|
||||
// if (dot(drNew, drNew) > 25.0)
|
||||
// std::cerr << "Large drift encountered! Drift = " << drNew << std::endl;
|
||||
RealType logGb = -m_oneover2tau * dot(drNew, drNew);
|
||||
RealType x = logGb - logGf_v[iw];
|
||||
#ifdef QMC_COMPLEX
|
||||
RealType prob = ratios_real[iw]*ratios_real[iw]*std::exp(x);
|
||||
#else
|
||||
RealType prob = ratios[iw]*ratios[iw]*std::exp(x);
|
||||
#endif
|
||||
if(acc[iw] && rand_v[iw] < prob)
|
||||
{
|
||||
accepted.push_back(W[iw]);
|
||||
|
@ -413,7 +457,7 @@ void VMCcuda::resetRun()
|
|||
PointerPool<Walker_t::cuda_Buffer_t > pool;
|
||||
app_log() << "Starting VMCcuda::resetRun() " << std::endl;
|
||||
Psi.reserve (pool);
|
||||
app_log() << "Each walker requires " << pool.getTotalSize() * sizeof(CudaRealType)
|
||||
app_log() << "Each walker requires " << pool.getTotalSize() * sizeof(CudaValueType)
|
||||
<< " bytes in GPU memory.\n";
|
||||
// Now allocate memory on the GPU card for each walker
|
||||
// for (int iw=0; iw<W.WalkerList.size(); iw++) {
|
||||
|
|
|
@ -73,7 +73,7 @@ private:
|
|||
|
||||
opt_variables_type dummy;
|
||||
int numParams;
|
||||
Matrix<ValueType> d_logpsi_dalpha, d_hpsioverpsi_dalpha;
|
||||
Matrix<RealType> d_logpsi_dalpha, d_hpsioverpsi_dalpha;
|
||||
RealType w_beta,w_alpha;
|
||||
RealType E_avg, V_avg;
|
||||
std::string GEVtype;
|
||||
|
|
|
@ -372,9 +372,7 @@ struct BareKineticEnergy: public QMCHamiltonianBase
|
|||
for (int iw=0; iw<walkers.size(); iw++)
|
||||
{
|
||||
Walker_t &w = *(walkers[iw]);
|
||||
RealType KE = 0.0;
|
||||
for (int ptcl=0; ptcl<w.G.size(); ptcl++)
|
||||
KE -= 0.5*(dot (w.G[ptcl],w.G[ptcl]) + w.L[ptcl]);
|
||||
RealType KE = - OneOver2M * (Dot(w.G,w.G)+Sum(w.L));
|
||||
w.getPropertyBase()[NUMPROPERTIES+myIndex] = KE;
|
||||
LocalEnergy[iw] += KE;
|
||||
}
|
||||
|
@ -396,8 +394,8 @@ struct BareKineticEnergy: public QMCHamiltonianBase
|
|||
#endif
|
||||
|
||||
/***************************************************************************
|
||||
* $RCSfile$ $Author$
|
||||
* $Revision$ $Date$
|
||||
* $Id$
|
||||
* $RCSfile$ $Author: yingwai $
|
||||
* $Revision: 7279 $ $Date: 2016-11-23 19:21:16 -0500 (Wed, 23 Nov 2016) $
|
||||
* $Id: BareKineticEnergy.h 7279 2016-11-24 00:21:16Z yingwai $
|
||||
***************************************************************************/
|
||||
|
||||
|
|
|
@ -126,9 +126,15 @@ struct ConservedEnergy: public QMCHamiltonianBase
|
|||
for (int iw=0; iw<walkers.size(); iw++)
|
||||
{
|
||||
Walker_t &w = *(walkers[iw]);
|
||||
RealType flux = 0.0;
|
||||
for (int ptcl=0; ptcl<w.G.size(); ptcl++)
|
||||
flux +=2.0 * dot(w.G[ptcl],w.G[ptcl]) + w.L[ptcl];
|
||||
RealType flux;
|
||||
RealType gradsq = Dot(w.G,w.G);
|
||||
RealType lap = Sum(w.L);
|
||||
#ifdef QMC_COMPLEX
|
||||
RealType gradsq_cc = Dot_CC(w.G,w.G);
|
||||
flux = lap + gradsq + gradsq_cc;
|
||||
#else
|
||||
flux = lap + 2*gradsq;
|
||||
#endif
|
||||
w.getPropertyBase()[NUMPROPERTIES+myIndex] = flux;
|
||||
}
|
||||
}
|
||||
|
@ -139,8 +145,8 @@ struct ConservedEnergy: public QMCHamiltonianBase
|
|||
#endif
|
||||
|
||||
/***************************************************************************
|
||||
* $RCSfile$ $Author$
|
||||
* $Revision$ $Date$
|
||||
* $Id$
|
||||
* $RCSfile$ $Author: tillackaf $
|
||||
* $Revision: 7408 $ $Date: 2017-01-10 13:29:49 -0500 (Tue, 10 Jan 2017) $
|
||||
* $Id: ConservedEnergy.h 7408 2017-01-10 18:29:49Z tillackaf $
|
||||
***************************************************************************/
|
||||
|
||||
|
|
|
@ -303,7 +303,11 @@ void NonLocalECPotential_CUDA::addEnergy(MCWalkerConfiguration &W,
|
|||
for (int iq=0; iq<numQuad; iq++)
|
||||
{
|
||||
RealType costheta = *(cos_ptr++);
|
||||
#ifdef QMC_COMPLEX
|
||||
RealType ratio = RatioList[ratioIndex++].real() * pp.sgridweight_m[iq]; // Abs(complex number)*cosine(phase of complex number) = Real part of said complex number
|
||||
#else
|
||||
RealType ratio = RatioList[ratioIndex++] * pp.sgridweight_m[iq];
|
||||
#endif
|
||||
// if (std::isnan(ratio)) {
|
||||
// std::cerr << "NAN from ratio number " << ratioIndex-1 << "\n";
|
||||
// std::cerr << "RatioList.size() = " << RatioList.size() << std::endl;
|
||||
|
@ -422,7 +426,11 @@ void NonLocalECPotential_CUDA::addEnergy
|
|||
for (int iq=0; iq<numQuad; iq++)
|
||||
{
|
||||
RealType costheta = *(cos_ptr++);
|
||||
#ifdef QMC_COMPLEX
|
||||
RealType ratio = RatioList[ratioIndex++].real() * pp.sgridweight_m[iq]; // Abs(complex number)*cosine(phase of complex number) = Real part of said complex number
|
||||
#else
|
||||
RealType ratio = RatioList[ratioIndex++] * pp.sgridweight_m[iq];
|
||||
#endif
|
||||
RealType lpolprev=0.0;
|
||||
lpol[0] = 1.0;
|
||||
for (int l=0 ; l< pp.lmax ; l++)
|
||||
|
|
|
@ -21,45 +21,11 @@
|
|||
#include <einspline/multi_bspline_eval_cuda.h>
|
||||
#include "Configuration.h"
|
||||
#include "AtomicOrbitalCuda.h"
|
||||
#include "PhaseFactors.h"
|
||||
#ifdef HAVE_MKL
|
||||
#include <mkl_vml.h>
|
||||
#endif
|
||||
|
||||
void apply_phase_factors(float kPoints[], int makeTwoCopies[],
|
||||
float pos[], float *phi_in[], float *phi_out[],
|
||||
int num_splines, int num_walkers);
|
||||
#ifdef ALGO_CHRISTOS
|
||||
void apply_phase_factors(float kPoints[], int makeTwoCopies[],
|
||||
float pos[], float *phi_in[], float *phi_out[],
|
||||
float *GL_in[], float *GL_out[],
|
||||
int num_splines, int num_walkers, int row_stride,
|
||||
bool dontMakeTwoCopies);
|
||||
#else
|
||||
void apply_phase_factors(float kPoints[], int makeTwoCopies[],
|
||||
float pos[], float *phi_in[], float *phi_out[],
|
||||
float *GL_in[], float *GL_out[],
|
||||
int num_splines, int num_walkers, int row_stride);
|
||||
#endif
|
||||
|
||||
void apply_phase_factors(float kPoints[], int makeTwoCopies[], int TwoCopiesIndex[],
|
||||
float pos[], float *phi_in[], float *phi_out[],
|
||||
float *GL_in[], float *GL_out[],
|
||||
int num_splines, int num_walkers, int row_stride);
|
||||
|
||||
void apply_phase_factors(double kPoints[], int makeTwoCopies[],
|
||||
double pos[], double *phi_in[], double *phi_out[],
|
||||
int num_splines, int num_walkers);
|
||||
|
||||
void apply_phase_factors(double kPoints[], int makeTwoCopies[],
|
||||
double pos[], double *phi_in[], double *phi_out[],
|
||||
double *GL_in[], double *GL_out[],
|
||||
int num_splines, int num_walkers, int row_stride);
|
||||
|
||||
void apply_phase_factors(double kPoints[], int makeTwoCopies[], int TwoCopiesIndex[],
|
||||
double pos[], double *phi_in[], double *phi_out[],
|
||||
double *GL_in[], double *GL_out[],
|
||||
int num_splines, int num_walkers, int row_stride);
|
||||
|
||||
namespace qmcplusplus
|
||||
{
|
||||
inline void create_multi_UBspline_3d_cuda (multi_UBspline_3d_d *in,
|
||||
|
@ -224,7 +190,6 @@ eval_multi_multi_UBspline_3d_cuda (multi_UBspline_3d_z_cuda *spline,
|
|||
eval_multi_multi_UBspline_3d_z_cuda (spline, pos, phi, N);
|
||||
}
|
||||
|
||||
|
||||
inline void eval_multi_multi_UBspline_3d_vgl_cuda
|
||||
(multi_UBspline_3d_c_cuda *spline, float *pos, float Linv[],
|
||||
std::complex<float> *phi[], std::complex<float> *grad_lapl[], int N, int row_stride)
|
||||
|
@ -241,7 +206,6 @@ inline void eval_multi_multi_UBspline_3d_vgl_cuda
|
|||
(spline, pos, Linv, phi, grad_lapl, N, row_stride);
|
||||
}
|
||||
|
||||
|
||||
//////////////////////////////////////////////
|
||||
// Vectorized evaluation routines using GPU //
|
||||
//////////////////////////////////////////////
|
||||
|
@ -323,6 +287,31 @@ EinsplineSetExtended<std::complex<double> >::evaluate
|
|||
}
|
||||
|
||||
|
||||
#ifdef QMC_COMPLEX
|
||||
template<> void
|
||||
EinsplineSetExtended<double>::evaluate
|
||||
(std::vector<Walker_t*> &walkers, int iat,
|
||||
gpu::device_vector<CudaComplexType*> &phi)
|
||||
{
|
||||
app_error() << "Code should not arrive at this point: "
|
||||
<< "EinsplineSetExtended<double>::evaluate at line "
|
||||
<< __LINE__ << " in file " << __FILE__ << "\n";
|
||||
abort();
|
||||
}
|
||||
|
||||
template<> void
|
||||
EinsplineSetExtended<std::complex<double> >::evaluate
|
||||
(std::vector<Walker_t*> &walkers, int iat,
|
||||
gpu::device_vector<CudaComplexType*> &phi)
|
||||
{
|
||||
app_error() << "EinsplineSetExtended<std::complex<double> >::evaluate at line " << __LINE__
|
||||
<< " in file " << __FILE__
|
||||
<< " not yet implemented.\n";
|
||||
abort();
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
template<> void
|
||||
EinsplineSetExtended<double>::evaluate
|
||||
(std::vector<Walker_t*> &walkers, std::vector<PosType> &newpos,
|
||||
|
@ -454,6 +443,32 @@ EinsplineSetExtended<std::complex<double> >::evaluate
|
|||
walkers.size());
|
||||
}
|
||||
|
||||
|
||||
#ifdef QMC_COMPLEX
|
||||
template<> void
|
||||
EinsplineSetExtended<double>::evaluate
|
||||
(std::vector<Walker_t*> &walkers, std::vector<PosType> &newpos,
|
||||
gpu::device_vector<CudaComplexType*> &phi)
|
||||
{
|
||||
app_error() << "Code should not arrive at this point: "
|
||||
<< "EinsplineSetExtended<double>::evaluate at line "
|
||||
<< __LINE__ << " in file " << __FILE__ << "\n";
|
||||
abort();
|
||||
}
|
||||
|
||||
template<> void
|
||||
EinsplineSetExtended<std::complex<double> >::evaluate
|
||||
(std::vector<Walker_t*> &walkers, std::vector<PosType> &newpos,
|
||||
gpu::device_vector<CudaComplexType*> &phi)
|
||||
{
|
||||
app_error() << "EinsplineSetExtended<std::complex<double>>::evaluate at line " << __LINE__
|
||||
<< " in file " << __FILE__
|
||||
<< " not yet implemented.\n";
|
||||
abort();
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
template<> void
|
||||
EinsplineSetExtended<double>::evaluate
|
||||
(std::vector<Walker_t*> &walkers, std::vector<PosType> &newpos,
|
||||
|
@ -554,25 +569,6 @@ EinsplineSetExtended<std::complex<double> >::evaluate
|
|||
hostPos[iw] = newpos[iw];
|
||||
cudapos = hostPos;
|
||||
|
||||
#ifdef ALGO_CHRISTOS
|
||||
// Slower for some runs
|
||||
// christos: is MakeTwoCopies consistent with CudaMakeTwoCopies?
|
||||
bool noTwoCopies = true;
|
||||
for (int i=0 ; i<MakeTwoCopies.size() ; i++)
|
||||
if (MakeTwoCopies[i]) {
|
||||
noTwoCopies = false;
|
||||
break;
|
||||
}
|
||||
// std::cout << "noTwoCopies=" << noTwoCopies << std::endl;
|
||||
|
||||
apply_phase_factors ((CUDA_PRECISION*) CudakPoints.data(),
|
||||
CudaMakeTwoCopies.data(),
|
||||
(CudaRealType*)cudapos.data(),
|
||||
(CudaRealType**)CudaValuePointers.data(), phi.data(),
|
||||
(CudaRealType**)CudaGradLaplPointers.data(), grad_lapl.data(),
|
||||
CudaMultiSpline->num_splines, walkers.size(), row_stride,
|
||||
noTwoCopies);
|
||||
#else
|
||||
/* Original implementation
|
||||
apply_phase_factors ((CudaRealType*) CudakPoints.data(),
|
||||
CudaMakeTwoCopies.data(),
|
||||
|
@ -589,10 +585,67 @@ EinsplineSetExtended<std::complex<double> >::evaluate
|
|||
(CudaRealType**)CudaValuePointers.data(), phi.data(),
|
||||
(CudaRealType**)CudaGradLaplPointers.data(), grad_lapl.data(),
|
||||
CudaMultiSpline->num_splines, walkers.size(), row_stride);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
#ifdef QMC_COMPLEX
|
||||
template<> void
|
||||
EinsplineSetExtended<double>::evaluate
|
||||
(std::vector<Walker_t*> &walkers, std::vector<PosType> &newpos,
|
||||
gpu::device_vector<CudaComplexType*> &phi,
|
||||
gpu::device_vector<CudaComplexType*> &grad_lapl,
|
||||
int row_stride)
|
||||
{
|
||||
app_error() << "Code should not arrive at this point: "
|
||||
<< "EinsplineSetExtended<double>::evaluate at line "
|
||||
<< __LINE__ << " in file " << __FILE__ << "\n";
|
||||
abort();
|
||||
}
|
||||
|
||||
template<> void
|
||||
EinsplineSetExtended<std::complex<double> >::evaluate
|
||||
(std::vector<Walker_t*> &walkers, std::vector<PosType> &newpos,
|
||||
gpu::device_vector<CudaComplexType*> &phi,
|
||||
gpu::device_vector<CudaComplexType*> &grad_lapl,
|
||||
int row_stride)
|
||||
{
|
||||
int N = walkers.size();
|
||||
int M = CudaMultiSpline->num_splines;
|
||||
if (CudaValuePointers.size() < N)
|
||||
resize_cuda(N);
|
||||
if (cudapos.size() < N)
|
||||
{
|
||||
hostPos.resize(N);
|
||||
cudapos.resize(N);
|
||||
}
|
||||
for (int iw=0; iw < N; iw++)
|
||||
{
|
||||
PosType r = newpos[iw];
|
||||
PosType ru(PrimLattice.toUnit(r));
|
||||
ru[0] -= std::floor (ru[0]);
|
||||
ru[1] -= std::floor (ru[1]);
|
||||
ru[2] -= std::floor (ru[2]);
|
||||
hostPos[iw] = ru;
|
||||
}
|
||||
cudapos = hostPos;
|
||||
eval_multi_multi_UBspline_3d_vgl_cuda
|
||||
(CudaMultiSpline, (CudaRealType*)cudapos.data(), Linv_cuda.data(), CudaValuePointers.data(),
|
||||
CudaGradLaplPointers.data(), N, CudaMultiSpline->num_splines);
|
||||
// Now, add on phases
|
||||
for (int iw=0; iw < N; iw++)
|
||||
hostPos[iw] = newpos[iw];
|
||||
cudapos = hostPos;
|
||||
apply_phase_factors ((CudaRealType*) CudakPoints.data(),
|
||||
(CudaRealType*) cudapos.data(),
|
||||
(CudaValueType**) CudaValuePointers.data(),
|
||||
(CudaValueType**) phi.data(),
|
||||
(CudaValueType**) CudaGradLaplPointers.data(),
|
||||
(CudaValueType**) grad_lapl.data(),
|
||||
CudaMultiSpline->num_splines, walkers.size(), row_stride);
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
template<> void
|
||||
EinsplineSetExtended<double>::evaluate
|
||||
(std::vector<PosType> &pos, gpu::device_vector<CudaRealType*> &phi)
|
||||
|
@ -634,8 +687,9 @@ template<> void
|
|||
EinsplineSetExtended<double>::evaluate
|
||||
(std::vector<PosType> &pos, gpu::device_vector<CudaComplexType*> &phi)
|
||||
{
|
||||
app_error() << "EinsplineSetExtended<std::complex<double> >::evaluate "
|
||||
<< "not yet implemented.\n";
|
||||
app_error() << "EinsplineSetExtended<std::complex<double> >::evaluate at "
|
||||
<< __LINE__ << " in file " << __FILE__
|
||||
<< " not yet implemented.\n";
|
||||
abort();
|
||||
}
|
||||
|
||||
|
@ -682,9 +736,57 @@ template<> void
|
|||
EinsplineSetExtended<std::complex<double> >::evaluate
|
||||
(std::vector<PosType> &pos, gpu::device_vector<CudaComplexType*> &phi)
|
||||
{
|
||||
app_error() << "EinsplineSetExtended<std::complex<double> >::evaluate "
|
||||
<< "not yet implemented.\n";
|
||||
#ifdef QMC_COMPLEX
|
||||
int N = pos.size();
|
||||
if (CudaValuePointers.size() < N)
|
||||
resize_cuda(N);
|
||||
if (cudapos.size() < N)
|
||||
{
|
||||
hostPos.resize(N);
|
||||
cudapos.resize(N);
|
||||
}
|
||||
for (int iw=0; iw < N; iw++)
|
||||
{
|
||||
PosType r = pos[iw];
|
||||
PosType ru(PrimLattice.toUnit(r));
|
||||
for (int i=0; i<OHMMS_DIM; i++)
|
||||
ru[i] -= std::floor (ru[i]);
|
||||
hostPos[iw] = ru;
|
||||
}
|
||||
cudapos = hostPos;
|
||||
// AT debug:
|
||||
// std::cout << "# splines: " << CudaMultiSpline->num_splines << "\n";
|
||||
eval_multi_multi_UBspline_3d_cuda
|
||||
(CudaMultiSpline, (CudaRealType*) cudapos.data(),
|
||||
CudaValuePointers.data(), N);
|
||||
// Now, add on phases
|
||||
for (int iw=0; iw < N; iw++)
|
||||
hostPos[iw] = pos[iw];
|
||||
cudapos = hostPos;
|
||||
apply_phase_factors((CudaRealType*) CudakPoints.data(),
|
||||
(CudaRealType*) cudapos.data(),
|
||||
CudaValuePointers.data(),
|
||||
phi.data(),
|
||||
CudaMultiSpline->num_splines, N);
|
||||
// AT debug:
|
||||
/* gpu::host_vector<CudaValueType*> pointers;
|
||||
pointers = CudaValuePointers;
|
||||
CudaValueType data[N], data_new[N];
|
||||
cudaMemcpy (data, pointers[0], N*sizeof(CudaValueType), cudaMemcpyDeviceToHost);
|
||||
pointers = phi;
|
||||
cudaMemcpy (data_new, pointers[0], N*sizeof(CudaValueType), cudaMemcpyDeviceToHost);
|
||||
std::cout << "CudaValuePointers -> phi (# splines: " << CudaMultiSpline->num_splines << "):\n";
|
||||
for (int i=0; i<N; i++)
|
||||
std::cout << i << ": " << data[i].real() << " + " << data[i].imag() << "i -> " << data_new[i].real() << " + " << data_new[i].imag() << "i\n";
|
||||
std::cout.flush();
|
||||
abort();*/
|
||||
#else
|
||||
app_error() << "EinsplineSetExtended<std::complex<double> >::evaluate at "
|
||||
<< __LINE__ << " in file " << __FILE__
|
||||
<< " not yet implemented.\n";
|
||||
abort();
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
@ -1079,8 +1181,6 @@ EinsplineSetHybrid<double>::evaluate (std::vector<Walker_t*> &walkers, std::vect
|
|||
fprintf (stderr, " N img dist ion lMax\n");
|
||||
for (int i=0; i<HybridData_CPU.size(); i++)
|
||||
{
|
||||
//YingWai's trial fix
|
||||
//HybridDataFloat &d = HybridData_CPU[i];
|
||||
HybridData<CudaRealType> &d = HybridData_CPU[i];
|
||||
fprintf (stderr, " %d %2.0f %2.0f %2.0f %8.5f %d %d\n",
|
||||
i, d.img[0], d.img[1], d.img[2], d.dist, d.ion, d.lMax);
|
||||
|
@ -1392,8 +1492,6 @@ EinsplineSetHybrid<std::complex<double> >::evaluate
|
|||
gpu::host_vector<CudaRealType> vals_CPU(NumOrbitals), GL_CPU(4*row_stride);
|
||||
gpu::host_vector<HybridJobType> HybridJobs_CPU(HybridJobs_GPU.size());
|
||||
HybridJobs_CPU = HybridJobs_GPU;
|
||||
//YingWai's trial fix
|
||||
//gpu::host_vector<HybridDataFloat> HybridData_CPU(HybridData_GPU.size());
|
||||
gpu::host_vector<HybridData<CudaRealType> > HybridData_CPU(HybridData_GPU.size());
|
||||
HybridData_CPU = HybridData_GPU;
|
||||
rhats_CPU = rhats_GPU;
|
||||
|
@ -1409,8 +1507,6 @@ EinsplineSetHybrid<std::complex<double> >::evaluate
|
|||
ComplexGradVector_t CPUzgrad(M);
|
||||
ValueVector_t CPUvals(NumOrbitals), CPUlapl(NumOrbitals);
|
||||
GradVector_t CPUgrad(NumOrbitals);
|
||||
//YingWai's trial fix
|
||||
//HybridDataFloat &d = HybridData_CPU[iw];
|
||||
HybridData<CudaRealType> &d = HybridData_CPU[iw];
|
||||
AtomicOrbital<std::complex<double> > &atom = AtomicOrbitals[d.ion];
|
||||
atom.evaluate (newpos[iw], CPUzvals, CPUzgrad, CPUzlapl);
|
||||
|
@ -1599,8 +1695,6 @@ EinsplineSetHybrid<std::complex<double> >::evaluate
|
|||
fprintf (stderr, " N img dist ion lMax\n");
|
||||
for (int i=0; i<HybridData_CPU.size(); i++)
|
||||
{
|
||||
//YingWai's trial fix
|
||||
//HybridDataFloat &d = HybridData_CPU[i];
|
||||
HybridData<CudaRealType> &d = HybridData_CPU[i];
|
||||
fprintf (stderr, " %d %2.0f %2.0f %2.0f %8.5f %d %d\n",
|
||||
i, d.img[0], d.img[1], d.img[2], d.dist, d.ion, d.lMax);
|
||||
|
|
|
@ -362,7 +362,7 @@ public:
|
|||
}
|
||||
|
||||
virtual void
|
||||
reserve (PointerPool<gpu::device_vector<CudaRealType> > &pool)
|
||||
reserve (PointerPool<gpu::device_vector<CudaValueType> > &pool)
|
||||
{
|
||||
std::cerr << "Need specialization of DiracDetermiantBase::reserve.\n";
|
||||
abort();
|
||||
|
@ -464,7 +464,7 @@ public:
|
|||
}
|
||||
#endif
|
||||
/***************************************************************************
|
||||
* $RCSfile$ $Author$
|
||||
* $Revision$ $Date$
|
||||
* $Id$
|
||||
* $RCSfile$ $Author: yingwai $
|
||||
* $Revision: 7279 $ $Date: 2016-11-23 19:21:16 -0500 (Wed, 23 Nov 2016) $
|
||||
* $Id: DiracDeterminantBase.h 7279 2016-11-24 00:21:16Z yingwai $
|
||||
***************************************************************************/
|
||||
|
|
|
@ -55,7 +55,7 @@ DiracDeterminantCUDA::DiracDeterminantCUDA(SPOSetBasePtr const &spos, int first)
|
|||
NLratioList_d("DiracDeterminantBase::NLratioList_d")
|
||||
{
|
||||
for(int i = 0; i < 2; ++i)
|
||||
NLratios_d[i] = gpu::device_vector<CudaRealType>("DiracDeterminantBase::NLratios_d");
|
||||
NLratios_d[i] = gpu::device_vector<CudaValueType>("DiracDeterminantBase::NLratios_d");
|
||||
}
|
||||
|
||||
DiracDeterminantCUDA::DiracDeterminantCUDA(const DiracDeterminantCUDA& s) :
|
||||
|
@ -86,7 +86,7 @@ DiracDeterminantCUDA::DiracDeterminantCUDA(const DiracDeterminantCUDA& s) :
|
|||
NLratioList_d("DiracDeterminantBase::NLratioList_d")
|
||||
{
|
||||
for(int i = 0; i < 2; ++i)
|
||||
NLratios_d[i] = gpu::device_vector<CudaRealType>("DiracDeterminantBase::NLratios_d");
|
||||
NLratios_d[i] = gpu::device_vector<CudaValueType>("DiracDeterminantBase::NLratios_d");
|
||||
}
|
||||
|
||||
|
||||
|
@ -161,18 +161,22 @@ DiracDeterminantCUDA::update (std::vector<Walker_t*> &walkers, int iat)
|
|||
// // Copy temporary gradients and laplacians into matrix
|
||||
// multi_copy (destList_d.data(), srcList_d.data(),
|
||||
// 4*RowStride, walkers.size());
|
||||
|
||||
|
||||
// Check matrix inversion + fixing the CUDA debugging code
|
||||
#ifdef DEBUG_CUDA
|
||||
float Ainv[NumPtcls][NumPtcls], A[NumPtcls][NumPtcls];
|
||||
float new_row[NumPtcls], Ainv_delta[NumPtcls];
|
||||
for (int iw=0; iw<walkers.size(); iw++)
|
||||
{
|
||||
CudaValueType Ainv[NumPtcls][RowStride], A[NumPtcls][RowStride];
|
||||
CudaValueType new_row[RowStride]; //Ainv_delta[NumPtcls];
|
||||
//for (int iw=0; iw<walkers.size(); iw++)
|
||||
//{
|
||||
int iw = 0;
|
||||
Walker_t::cuda_Buffer_t &data = walkers[iw]->cuda_DataSet;
|
||||
cudaMemcpy (A, &(data.data()[AOffset]),
|
||||
NumPtcls*NumPtcls*sizeof(CudaValueType), cudaMemcpyDeviceToHost);
|
||||
NumPtcls*RowStride*sizeof(CudaValueType), cudaMemcpyDeviceToHost);
|
||||
cudaMemcpy (Ainv, &(data.data()[AinvOffset]),
|
||||
NumPtcls*NumPtcls*sizeof(CudaValueType), cudaMemcpyDeviceToHost);
|
||||
NumPtcls*RowStride*sizeof(CudaValueType), cudaMemcpyDeviceToHost);
|
||||
cudaMemcpy (new_row, &(data.data()[newRowOffset]),
|
||||
NumPtcls*sizeof(CudaValueType), cudaMemcpyDeviceToHost);
|
||||
RowStride*sizeof(CudaValueType), cudaMemcpyDeviceToHost);
|
||||
// for (int i=0; i<NumPtcls; i++)
|
||||
// cerr << "new_row(" << i << ") = " << new_row[i]
|
||||
// << " old_row = " << A[iat-FirstIndex][i] << std::endl;
|
||||
|
@ -192,19 +196,19 @@ DiracDeterminantCUDA::update (std::vector<Walker_t*> &walkers, int iat)
|
|||
for (int i=0; i<NumPtcls; i++)
|
||||
for (int j=0; j<NumPtcls; j++)
|
||||
{
|
||||
float val = 0.0;
|
||||
CudaValueType val=0.0;
|
||||
for (int k=0; k<NumPtcls; k++)
|
||||
val += Ainv[i][k]*A[k][j];
|
||||
if (i==j && (std::abs(val-1.0) > 1.0e-2))
|
||||
if (i==j && (std::abs(std::real(val)-1.0) > 1.0e-2))
|
||||
std::cerr << "Error in inverse at (i,j) = (" << i << ", " << j
|
||||
<< ") val = " << val << " walker = " << iw
|
||||
<< " of " << walkers.size() << std::endl;
|
||||
<< ") val = " << val << " walker = " << iw
|
||||
<< " of " << walkers.size() << std::endl;
|
||||
else if ((i!=j) && (std::abs(val) > 1.0e-2))
|
||||
std::cerr << "Error in inverse at (i,j) = (" << i << ", " << j
|
||||
<< ") val = " << val << " walker = " << iw
|
||||
<< " of " << walkers.size() << std::endl;
|
||||
<< ") val = " << val << " walker = " << iw
|
||||
<< " of " << walkers.size() << std::endl;
|
||||
}
|
||||
}
|
||||
//}
|
||||
#endif
|
||||
}
|
||||
|
||||
|
@ -278,17 +282,18 @@ DiracDeterminantCUDA::update (const std::vector<Walker_t*> &walkers,
|
|||
// multi_copy (destList_d.data(), srcList_d.data(),
|
||||
// 4*RowStride, walkers.size());
|
||||
#ifdef DEBUG_CUDA
|
||||
float Ainv[NumPtcls][NumPtcls], A[NumPtcls][NumPtcls];
|
||||
float new_row[NumPtcls], Ainv_delta[NumPtcls];
|
||||
for (int iw=0; iw<walkers.size(); iw++)
|
||||
{
|
||||
CudaValueType Ainv[NumPtcls][RowStride], A[NumPtcls][RowStride];
|
||||
CudaValueType new_row[RowStride]; //Ainv_delta[NumPtcls];
|
||||
//for (int iw=0; iw<walkers.size(); iw++)
|
||||
//{
|
||||
int iw = 0;
|
||||
Walker_t::cuda_Buffer_t &data = walkers[iw]->cuda_DataSet;
|
||||
cudaMemcpy (A, &(data.data()[AOffset]),
|
||||
NumPtcls*NumPtcls*sizeof(CudaValueType), cudaMemcpyDeviceToHost);
|
||||
NumPtcls*RowStride*sizeof(CudaValueType), cudaMemcpyDeviceToHost);
|
||||
cudaMemcpy (Ainv, &(data.data()[AinvOffset]),
|
||||
NumPtcls*NumPtcls*sizeof(CudaValueType), cudaMemcpyDeviceToHost);
|
||||
NumPtcls*RowStride*sizeof(CudaValueType), cudaMemcpyDeviceToHost);
|
||||
cudaMemcpy (new_row, &(data.data()[newRowOffset]),
|
||||
NumPtcls*sizeof(CudaValueType), cudaMemcpyDeviceToHost);
|
||||
RowStride*sizeof(CudaValueType), cudaMemcpyDeviceToHost);
|
||||
// for (int i=0; i<NumPtcls; i++)
|
||||
// cerr << "new_row(" << i << ") = " << new_row[i]
|
||||
// << " old_row = " << A[iat-FirstIndex][i] << std::endl;
|
||||
|
@ -308,19 +313,19 @@ DiracDeterminantCUDA::update (const std::vector<Walker_t*> &walkers,
|
|||
for (int i=0; i<NumPtcls; i++)
|
||||
for (int j=0; j<NumPtcls; j++)
|
||||
{
|
||||
float val = 0.0;
|
||||
CudaValueType val=0.0;
|
||||
for (int k=0; k<NumPtcls; k++)
|
||||
val += Ainv[i][k]*A[k][j];
|
||||
if (i==j && (std::abs(val-1.0) > 1.0e-2))
|
||||
if (i==j && (std::abs(std::real(val)-1.0) > 1.0e-2))
|
||||
std::cerr << "Error in inverse at (i,j) = (" << i << ", " << j
|
||||
<< ") val = " << val << " walker = " << iw
|
||||
<< " of " << walkers.size() << std::endl;
|
||||
<< ") val = " << val << " walker = " << iw
|
||||
<< " of " << walkers.size() << std::endl;
|
||||
else if ((i!=j) && (std::abs(val) > 1.0e-2))
|
||||
std::cerr << "Error in inverse at (i,j) = (" << i << ", " << j
|
||||
<< ") val = " << val << " walker = " << iw
|
||||
<< " of " << walkers.size() << std::endl;
|
||||
<< ") val = " << val << " walker = " << iw
|
||||
<< " of " << walkers.size() << std::endl;
|
||||
}
|
||||
}
|
||||
//}
|
||||
#endif
|
||||
}
|
||||
|
||||
|
@ -410,6 +415,35 @@ DiracDeterminantCUDA::recompute(MCWalkerConfiguration &W, bool firstTime)
|
|||
AWorkList_d.data(), AinvWorkList_d.data(),
|
||||
NumPtcls, RowStride, walkers.size(), useDoublePrecision);
|
||||
|
||||
#ifdef DEBUG_CUDA
|
||||
CudaValueType Ainv[NumPtcls][RowStride], A[NumPtcls][RowStride];
|
||||
//for (int iw=0; iw<walkers.size(); iw++)
|
||||
//{
|
||||
int iw = 0;
|
||||
Walker_t::cuda_Buffer_t &data = walkers[iw]->cuda_DataSet;
|
||||
cudaMemcpy (A, &(data.data()[AOffset]),
|
||||
NumPtcls*RowStride*sizeof(CudaValueType), cudaMemcpyDeviceToHost);
|
||||
cudaMemcpy (Ainv, &(data.data()[AinvOffset]),
|
||||
NumPtcls*RowStride*sizeof(CudaValueType), cudaMemcpyDeviceToHost);
|
||||
|
||||
FILE *f1, *f2;
|
||||
f1 = fopen("A.dat", "a");
|
||||
f2 = fopen("Ainv.dat", "a");
|
||||
for (int i=0; i<NumPtcls; i++)
|
||||
for (int j=0; j<NumPtcls; j++)
|
||||
{
|
||||
#ifdef QMC_COMPLEX
|
||||
fprintf(f1, "%5d %5d %20.15e %20.15e\n", i, j, A[i][j].real(), A[i][j].imag());
|
||||
fprintf(f2, "%5d %5d %20.15e %20.15e\n", i, j, Ainv[i][j].real(), Ainv[i][j].imag());
|
||||
#else
|
||||
fprintf(f1, "%5d %5d %20.15e\n", i, j, A[i][j]);
|
||||
fprintf(f2, "%5d %5d %20.15e\n", i, j, Ainv[i][j]);
|
||||
#endif
|
||||
}
|
||||
//}
|
||||
fclose(f1);
|
||||
fclose(f2);
|
||||
#endif
|
||||
// HACK HACK HACK
|
||||
// app_log() << "After recompute:\n";
|
||||
// double A[NumPtcls*NumPtcls], work[NumPtcls*NumPtcls];
|
||||
|
@ -479,10 +513,9 @@ DiracDeterminantCUDA::addLog (MCWalkerConfiguration &W, std::vector<RealType> &l
|
|||
for (int iw=0; iw<walkers.size(); iw++)
|
||||
{
|
||||
Walker_t::cuda_Buffer_t& data = walkers[iw]->cuda_DataSet;
|
||||
gpu::host_vector<CUDA_PRECISION> host_data;
|
||||
gpu::host_vector<CudaValueType> host_data;
|
||||
Vector<CudaValueType> A(NumPtcls*NumOrbitals);
|
||||
host_data = data;
|
||||
//Vector<double> A(NumPtcls*NumOrbitals);
|
||||
Vector<CUDA_PRECISION> A(NumPtcls*NumOrbitals);
|
||||
for (int i=0; i<NumPtcls; i++)
|
||||
for (int j=0; j<NumOrbitals; j++)
|
||||
A[i*NumOrbitals+j] = host_data[AOffset+i*RowStride+j];
|
||||
|
@ -668,8 +701,8 @@ void DiracDeterminantCUDA::ratio (MCWalkerConfiguration &W, int iat,
|
|||
ratio_host = ratio_d;
|
||||
#ifdef CUDA_DEBUG
|
||||
// Now, check against CPU
|
||||
gpu::host_vector<CudaRealType> host_data;
|
||||
std::vector<CudaRealType> cpu_ratios(walkers.size(), 0.0f);
|
||||
gpu::host_vector<CudaValueType> host_data;
|
||||
std::vector<CudaValueType> cpu_ratios(walkers.size(), 0.0f);
|
||||
for (int iw=0; iw<walkers.size(); iw++)
|
||||
{
|
||||
host_data = walkers[iw]->cuda_DataSet;
|
||||
|
@ -697,8 +730,8 @@ void DiracDeterminantCUDA::ratio (MCWalkerConfiguration &W, int iat,
|
|||
#ifdef CUDA_DEBUG
|
||||
if (NumOrbitals == 31)
|
||||
{
|
||||
gpu::host_vector<CudaRealType> host_data;
|
||||
std::vector<CudaRealType> cpu_ratios(walkers.size(), 0.0f);
|
||||
gpu::host_vector<CudaValueType> host_data;
|
||||
std::vector<CudaValueType> cpu_ratios(walkers.size(), 0.0f);
|
||||
for (int iw=0; iw<walkers.size(); iw++)
|
||||
{
|
||||
host_data = walkers[iw]->cuda_DataSet;
|
||||
|
@ -763,8 +796,8 @@ void DiracDeterminantCUDA::calcRatio (MCWalkerConfiguration &W, int iat,
|
|||
cudaEventRecord(gpu::ratioSyncDiracEvent, gpu::memoryStream);
|
||||
#ifdef CUDA_DEBUG
|
||||
// Now, check against CPU
|
||||
gpu::host_vector<CudaRealType> host_data;
|
||||
std::vector<CudaRealType> cpu_ratios(walkers.size(), 0.0f);
|
||||
gpu::host_vector<CudaValueType> host_data;
|
||||
std::vector<CudaValueType> cpu_ratios(walkers.size(), 0.0f);
|
||||
for (int iw=0; iw<walkers.size(); iw++)
|
||||
{
|
||||
host_data = walkers[iw]->cuda_DataSet;
|
||||
|
@ -799,8 +832,8 @@ void DiracDeterminantCUDA::addRatio (MCWalkerConfiguration &W, int iat,
|
|||
#ifdef CUDA_DEBUG
|
||||
if (NumOrbitals == 31)
|
||||
{
|
||||
gpu::host_vector<CudaRealType> host_data;
|
||||
std::vector<CudaRealType> cpu_ratios(walkers.size(), 0.0f);
|
||||
gpu::host_vector<CudaValueType> host_data;
|
||||
std::vector<CudaValueType> cpu_ratios(walkers.size(), 0.0f);
|
||||
for (int iw=0; iw<walkers.size(); iw++)
|
||||
{
|
||||
host_data = walkers[iw]->cuda_DataSet;
|
||||
|
@ -896,8 +929,16 @@ DiracDeterminantCUDA::gradLapl (MCWalkerConfiguration &W, GradMatrix_t &grads,
|
|||
gradLapl_host[4*(iw*RowStride + iat)+1],
|
||||
gradLapl_host[4*(iw*RowStride + iat)+2]);
|
||||
grads(iw,iat+FirstIndex) += g;
|
||||
#ifdef QMC_COMPLEX
|
||||
//YingWai's trial fix; need to revise the dot product
|
||||
// should be converted into "#if defined QMC_COMPLEX" later
|
||||
lapl(iw,iat+FirstIndex) += gradLapl_host[4*(iw*RowStride + iat)+3] - (CudaValueType)dot(g,g);
|
||||
ValueType lapl_temp = lapl(iw,iat+FirstIndex);
|
||||
if ( std::isnan(std::real(lapl_temp)) || std::isnan(std::imag(lapl_temp)) )
|
||||
#else
|
||||
lapl(iw,iat+FirstIndex) += gradLapl_host[4*(iw*RowStride + iat)+3] - dot(g,g);
|
||||
if (std::isnan(lapl(iw,iat+FirstIndex)))
|
||||
#endif
|
||||
{
|
||||
char name[1000];
|
||||
gethostname(name, 1000);
|
||||
|
@ -905,7 +946,7 @@ DiracDeterminantCUDA::gradLapl (MCWalkerConfiguration &W, GradMatrix_t &grads,
|
|||
int dev;
|
||||
cudaGetDevice(&dev);
|
||||
fprintf (stderr, "Offending device = %d\n", dev);
|
||||
gpu::host_vector<CUDA_PRECISION> host_data;
|
||||
gpu::host_vector<CudaValueType> host_data;
|
||||
host_data = walkers[iw]->cuda_DataSet;
|
||||
FILE *Amat = fopen ("Amat.dat", "w");
|
||||
FILE *Ainv = fopen ("Ainv.dat", "w");
|
||||
|
@ -915,11 +956,19 @@ DiracDeterminantCUDA::gradLapl (MCWalkerConfiguration &W, GradMatrix_t &grads,
|
|||
{
|
||||
for (int j=0; j<NumPtcls; j++)
|
||||
{
|
||||
#ifdef QMC_COMPLEX
|
||||
fprintf (Amat, "%14.8e+%14.8ei ", host_data[AOffset+i*RowStride+j].real(), host_data[AOffset+i*RowStride+j].imag());
|
||||
fprintf (Ainv, "%14.8e+%14.8ei ", host_data[AinvOffset+i*RowStride+j].real(), host_data[AinvOffset+i*RowStride+j].imag());
|
||||
fprintf (Lmat, "%14.8e+%14.8ei ", host_data[gradLaplOffset+(4*i+3)*RowStride+j].real(), host_data[gradLaplOffset+(4*i+3)*RowStride+j].imag());
|
||||
for (int k=0; k<3; k++)
|
||||
fprintf (Lmat, "%14.8e+%14.8ei ", host_data[gradLaplOffset+(4*i+k)*RowStride+j].real(), host_data[gradLaplOffset+(4*i+k)*RowStride+j].imag());
|
||||
#else
|
||||
fprintf (Amat, "%14.8e ", host_data[AOffset+i*RowStride+j]);
|
||||
fprintf (Ainv, "%14.8e ", host_data[AinvOffset+i*RowStride+j]);
|
||||
fprintf (Lmat, "%14.8e ", host_data[gradLaplOffset+(4*i+3)*RowStride+j]);
|
||||
for (int k=0; k<3; k++)
|
||||
fprintf (Lmat, "%14.8e ", host_data[gradLaplOffset+(4*i+k)*RowStride+j]);
|
||||
#endif
|
||||
}
|
||||
fprintf (Amat, "\n");
|
||||
fprintf (Ainv, "\n");
|
||||
|
@ -942,17 +991,29 @@ DiracDeterminantCUDA::gradLapl (MCWalkerConfiguration &W, GradMatrix_t &grads,
|
|||
walkers[iw]->R[iat+FirstIndex][1],
|
||||
walkers[iw]->R[iat+FirstIndex][2]);
|
||||
for (int orb=0; orb<NumPtcls; orb++)
|
||||
#ifdef QMC_COMPLEX
|
||||
fprintf (stderr, "%1.10e+%1.10ei %1.10e+%1.10ei %1.10e+%1.10ei %1.10e+%1.10ei \n",
|
||||
host_data[gradLaplOffset +(4*iat+0)*RowStride+orb].real(),
|
||||
host_data[gradLaplOffset +(4*iat+0)*RowStride+orb].imag(),
|
||||
host_data[gradLaplOffset +(4*iat+1)*RowStride+orb].real(),
|
||||
host_data[gradLaplOffset +(4*iat+1)*RowStride+orb].imag(),
|
||||
host_data[gradLaplOffset +(4*iat+2)*RowStride+orb].real(),
|
||||
host_data[gradLaplOffset +(4*iat+2)*RowStride+orb].imag(),
|
||||
host_data[gradLaplOffset +(4*iat+3)*RowStride+orb].real(),
|
||||
host_data[gradLaplOffset +(4*iat+3)*RowStride+orb].imag());
|
||||
#else
|
||||
fprintf (stderr, "%1.10e %1.10e %1.10e %1.10e \n",
|
||||
host_data[gradLaplOffset +(4*iat+0)*RowStride+orb],
|
||||
host_data[gradLaplOffset +(4*iat+1)*RowStride+orb],
|
||||
host_data[gradLaplOffset +(4*iat+2)*RowStride+orb],
|
||||
host_data[gradLaplOffset +(4*iat+3)*RowStride+orb]);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
}
|
||||
#ifdef CUDA_DEBUG
|
||||
// Now do it on the CPU
|
||||
gpu::host_vector<CudaRealType> host_data;
|
||||
gpu::host_vector<CudaValueType> host_data;
|
||||
GradMatrix_t cpu_grads(grads.rows(), grads.cols());
|
||||
ValueMatrix_t cpu_lapl(grads.rows(), grads.cols());
|
||||
for (int iw=0; iw<walkers.size(); iw++)
|
||||
|
@ -983,12 +1044,12 @@ DiracDeterminantCUDA::NLratios_CPU
|
|||
std::vector<ValueMatrix_t> Ainv_host;
|
||||
int nw = walkers.size();
|
||||
Ainv_host.resize(nw);
|
||||
int mat_size = NumOrbitals*NumOrbitals*sizeof(CudaRealType);
|
||||
int mat_size = NumOrbitals*NumOrbitals*sizeof(CudaValueType);
|
||||
for (int iw=0; iw<nw; iw++)
|
||||
{
|
||||
Ainv_host[iw].resize(NumOrbitals, NumOrbitals);
|
||||
ValueType *dest = &(Ainv_host[iw](0,0));
|
||||
CudaRealType *src = &(walkers[iw]->cuda_DataSet.data()[AinvOffset]);
|
||||
CudaValueType *src = &(walkers[iw]->cuda_DataSet.data()[AinvOffset]);
|
||||
cudaMemcpy(dest, src, mat_size, cudaMemcpyDeviceToHost);
|
||||
}
|
||||
std::vector<RealType> phi(NumOrbitals);
|
||||
|
|
|
@ -50,44 +50,44 @@ protected:
|
|||
size_t AOffset, AinvOffset, newRowOffset, AinvDeltaOffset,
|
||||
AinvColkOffset, gradLaplOffset, newGradLaplOffset,
|
||||
AWorkOffset, AinvWorkOffset;
|
||||
gpu::host_vector<CudaRealType*> UpdateList;
|
||||
gpu::device_vector<CudaRealType*> UpdateList_d;
|
||||
gpu::host_vector<CudaValueType*> UpdateList;
|
||||
gpu::device_vector<CudaValueType*> UpdateList_d;
|
||||
gpu::host_vector<updateJob> UpdateJobList;
|
||||
gpu::device_vector<updateJob> UpdateJobList_d;
|
||||
std::vector<CudaRealType*> srcList, destList, AList, AinvList, newRowList,
|
||||
AinvDeltaList, AinvColkList, gradLaplList, newGradLaplList,
|
||||
AWorkList, AinvWorkList, GLList;
|
||||
gpu::device_vector<CudaRealType*> srcList_d, destList_d, AList_d, AinvList_d, newRowList_d,
|
||||
std::vector<CudaValueType*> srcList, destList, AList, AinvList, newRowList,
|
||||
AinvDeltaList, AinvColkList, gradLaplList, newGradLaplList,
|
||||
AWorkList, AinvWorkList, GLList;
|
||||
gpu::device_vector<CudaValueType*> srcList_d, destList_d, AList_d, AinvList_d, newRowList_d,
|
||||
AinvDeltaList_d, AinvColkList_d, gradLaplList_d,
|
||||
newGradLaplList_d, AWorkList_d, AinvWorkList_d, GLList_d;
|
||||
gpu::device_vector<CudaRealType> ratio_d;
|
||||
gpu::host_vector<CudaRealType> ratio_host;
|
||||
gpu::device_vector<CudaRealType> gradLapl_d;
|
||||
gpu::host_vector<CudaRealType> gradLapl_host;
|
||||
gpu::device_vector<CudaValueType> ratio_d;
|
||||
gpu::host_vector<CudaValueType> ratio_host;
|
||||
gpu::device_vector<CudaValueType> gradLapl_d;
|
||||
gpu::host_vector<CudaValueType> gradLapl_host;
|
||||
gpu::device_vector<int> iatList_d;
|
||||
gpu::host_vector<int> iatList;
|
||||
|
||||
// Data members for nonlocal psuedopotential ratio evaluation
|
||||
static const int NLrowBufferRows = 4800;
|
||||
gpu::device_vector<CudaRealType> NLrowBuffer_d;
|
||||
gpu::host_vector<CudaRealType> NLrowBuffer_host;
|
||||
|
||||
gpu::device_vector<CudaRealType*> SplineRowList_d;
|
||||
gpu::host_vector<CudaRealType*> SplineRowList_host;
|
||||
gpu::device_vector<CudaRealType*> RatioRowList_d;
|
||||
gpu::host_vector<CudaRealType*> RatioRowList_host[2];
|
||||
gpu::device_vector<CudaValueType> NLrowBuffer_d;
|
||||
gpu::host_vector<CudaValueType> NLrowBuffer_host;
|
||||
gpu::device_vector<CudaValueType*> SplineRowList_d;
|
||||
gpu::host_vector<CudaValueType*> SplineRowList_host;
|
||||
gpu::device_vector<CudaValueType*> RatioRowList_d;
|
||||
gpu::host_vector<CudaValueType*> RatioRowList_host[2];
|
||||
gpu::device_vector<CudaRealType> NLposBuffer_d;
|
||||
gpu::host_vector<CudaRealType> NLposBuffer_host;
|
||||
gpu::device_vector<CudaRealType*> NLAinvList_d;
|
||||
gpu::host_vector<CudaRealType*> NLAinvList_host[2];
|
||||
gpu::device_vector<CudaValueType*> NLAinvList_d;
|
||||
gpu::host_vector<CudaValueType*> NLAinvList_host[2];
|
||||
gpu::device_vector<int> NLnumRatioList_d;
|
||||
gpu::host_vector<int> NLnumRatioList_host[2];
|
||||
gpu::device_vector<int> NLelecList_d;
|
||||
gpu::host_vector<int> NLelecList_host[2];
|
||||
gpu::device_vector<CudaRealType> NLratios_d[2];
|
||||
gpu::host_vector<CudaRealType> NLratios_host;
|
||||
gpu::device_vector<CudaRealType*> NLratioList_d;
|
||||
gpu::host_vector<CudaRealType*> NLratioList_host[2];
|
||||
gpu::device_vector<CudaValueType> NLratios_d[2];
|
||||
gpu::host_vector<CudaValueType> NLratios_host;
|
||||
gpu::device_vector<CudaValueType*> NLratioList_d;
|
||||
gpu::host_vector<CudaValueType*> NLratioList_host[2];
|
||||
|
||||
void resizeLists(int numWalkers)
|
||||
{
|
||||
|
@ -161,8 +161,7 @@ public:
|
|||
void update (std::vector<Walker_t*> &walkers, int iat);
|
||||
void update (const std::vector<Walker_t*> &walkers, const std::vector<int> &iatList);
|
||||
|
||||
void reserve (PointerPool<gpu::device_vector<CudaRealType> > &pool)
|
||||
{
|
||||
void reserve (PointerPool<gpu::device_vector<CudaValueType> > &pool) {
|
||||
RowStride = ((NumOrbitals + 31)/32) * 32;
|
||||
AOffset = pool.reserve((size_t) NumPtcls * RowStride);
|
||||
AinvOffset = pool.reserve((size_t) NumPtcls * RowStride);
|
||||
|
|
|
@ -306,7 +306,7 @@ public:
|
|||
Dets[id]->recompute(W, firstTime);
|
||||
}
|
||||
|
||||
void reserve (PointerPool<gpu::device_vector<CudaRealType> > &pool)
|
||||
void reserve (PointerPool<gpu::device_vector<CudaValueType> > &pool)
|
||||
{
|
||||
for (int id=0; id<Dets.size(); id++)
|
||||
Dets[id]->reserve(pool);
|
||||
|
@ -381,7 +381,7 @@ private:
|
|||
}
|
||||
#endif
|
||||
/***************************************************************************
|
||||
* $RCSfile$ $Author$
|
||||
* $Revision$ $Date$
|
||||
* $Id$
|
||||
* $RCSfile$ $Author: yingwai $
|
||||
* $Revision: 7279 $ $Date: 2016-11-23 19:21:16 -0500 (Wed, 23 Nov 2016) $
|
||||
* $Id: SlaterDet.h 7279 2016-11-24 00:21:16Z yingwai $
|
||||
***************************************************************************/
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -14,6 +14,8 @@
|
|||
#ifndef CUDA_DETERMINANT_UPDATE_H
|
||||
#define CUDA_DETERMINANT_UPDATE_H
|
||||
|
||||
#include <complex>
|
||||
|
||||
struct updateJob
|
||||
{
|
||||
void *A, *Ainv, *newRow, *AinvDelta, *AinvColk, *gradLapl, *newGradLapl, *dummy;
|
||||
|
@ -57,6 +59,21 @@ update_inverse_cuda(double **data, int iat[],
|
|||
int AinvDelta_off, int AinvColk_off,
|
||||
int N, int rowstride, int numWalkers);
|
||||
|
||||
#ifdef QMC_COMPLEX
|
||||
void
|
||||
update_inverse_cuda(std::complex<float> **data, int iat[],
|
||||
int A_off, int Ainv_off, int newRow_off,
|
||||
int AinvDelta_off, int AinvColk_off,
|
||||
int N, int rowstride, int numWalkers);
|
||||
|
||||
void
|
||||
update_inverse_cuda(std::complex<double> **data, int iat[],
|
||||
int A_off, int Ainv_off, int newRow_off,
|
||||
int AinvDelta_off, int AinvColk_off,
|
||||
int N, int rowstride, int numWalkers);
|
||||
#endif
|
||||
|
||||
|
||||
// These are for single-particle move updates. Each
|
||||
// walker is moving the same electron.
|
||||
void
|
||||
|
@ -71,7 +88,19 @@ update_inverse_cuda(double **data, int iat,
|
|||
int AinvDelta_off, int AinvColk_off,
|
||||
int N, int rowstride, int numWalkers);
|
||||
|
||||
#ifdef QMC_COMPLEX
|
||||
void
|
||||
update_inverse_cuda(std::complex<float>** data, int iat,
|
||||
int A_off, int Ainv_off, int newRow_off,
|
||||
int AinvDelta_off, int AinvColk_off,
|
||||
int N, int rowstride, int numWalkers);
|
||||
|
||||
void
|
||||
update_inverse_cuda(std::complex<double>** data, int iat,
|
||||
int A_off, int Ainv_off, int newRow_off,
|
||||
int AinvDelta_off, int AinvColk_off,
|
||||
int N, int rowstride, int numWalkers);
|
||||
#endif
|
||||
|
||||
|
||||
void
|
||||
|
@ -84,6 +113,18 @@ determinant_ratios_cuda (double *Ainv_list[], double *new_row_list[],
|
|||
double *ratios, int N, int row_stride, int iat,
|
||||
int numWalkers);
|
||||
|
||||
#ifdef QMC_COMPLEX
|
||||
void
|
||||
determinant_ratios_cuda (std::complex<float> *Ainv_list[], std::complex<float> *new_row_list[],
|
||||
std::complex<float> *ratios, int N, int row_stride, int iat,
|
||||
int numWalkers);
|
||||
|
||||
void
|
||||
determinant_ratios_cuda (std::complex<double> *Ainv_list[], std::complex<double> *new_row_list[],
|
||||
std::complex<double> *ratios, int N, int row_stride, int iat,
|
||||
int numWalkers);
|
||||
#endif
|
||||
|
||||
|
||||
void
|
||||
calc_many_ratios (float *Ainv_list[], float *new_row_list[],
|
||||
|
@ -97,6 +138,20 @@ calc_many_ratios (double *Ainv_list[], double *new_row_list[],
|
|||
int N, int row_stride, int elec_list[],
|
||||
int numWalkers);
|
||||
|
||||
#ifdef QMC_COMPLEX
|
||||
void
|
||||
calc_many_ratios (std::complex<float> *Ainv_list[], std::complex<float> *new_row_list[],
|
||||
std::complex<float>* ratio_list[], int num_ratio_list[],
|
||||
int N, int row_stride, int elec_list[],
|
||||
int numWalkers);
|
||||
|
||||
void
|
||||
calc_many_ratios (std::complex<double> *Ainv_list[], std::complex<double> *new_row_list[],
|
||||
std::complex<double>* ratio_list[], int num_ratio_list[],
|
||||
int N, int row_stride, int elec_list[],
|
||||
int numWalkers);
|
||||
#endif
|
||||
|
||||
void
|
||||
scale_grad_lapl(float *grad_list[], float *hess_list[],
|
||||
float *grad_lapl_list[], float Linv[], int N,
|
||||
|
@ -111,18 +166,42 @@ calc_grad_lapl (double *Ainv_list[], double *grad_lapl_list[],
|
|||
double *out_list[], int N, int row_stride, int num_mats);
|
||||
|
||||
|
||||
#ifdef QMC_COMPLEX
|
||||
void
|
||||
calc_grad_lapl (std::complex<float> *Ainv_list[], std::complex<float> *grad_lapl_list[],
|
||||
std::complex<float> *out_list[], int N, int row_stride, int num_mats);
|
||||
|
||||
void
|
||||
calc_grad_lapl (std::complex<double> *Ainv_list[], std::complex<double> *grad_lapl_list[],
|
||||
std::complex<double> *out_list[], int N, int row_stride, int num_mats);
|
||||
#endif
|
||||
|
||||
|
||||
void
|
||||
multi_copy (float *dest[], float *src[], int len, int num);
|
||||
void
|
||||
multi_copy (double *dest[], double *src[], int len, int num);
|
||||
|
||||
void
|
||||
multi_copy (float *buff[], int srcoff, int dest_off, int len, int num);
|
||||
multi_copy (float *buff[], int dest_off, int src_off, int len, int num);
|
||||
void
|
||||
multi_copy (double *buff[], int src_off, int dest_off, int len, int num);
|
||||
multi_copy (double *buff[], int dest_off, int src_off, int len, int num);
|
||||
|
||||
|
||||
#ifdef QMC_COMPLEX
|
||||
void
|
||||
multi_copy (std::complex<float> *dest[], std::complex<float> *src[], int len, int num);
|
||||
void
|
||||
multi_copy (std::complex<double> *dest[], std::complex<double> *src[], int len, int num);
|
||||
|
||||
void
|
||||
multi_copy (std::complex<float> *buff[], int dest_off, int src_off, int len, int num);
|
||||
void
|
||||
multi_copy (std::complex<double> *buff[], int dest_off, int src_off, int len, int num);
|
||||
#endif
|
||||
|
||||
//YingWai: the following two are repeated (Dec 28, 15)
|
||||
/*
|
||||
void
|
||||
calc_many_ratios (float *Ainv_list[], float *new_row_list[],
|
||||
float* ratio_list[], int num_ratio_list[],
|
||||
|
@ -134,6 +213,7 @@ calc_many_ratios (double *Ainv_list[], double *new_row_list[],
|
|||
double* ratio_list[], int num_ratio_list[],
|
||||
int N, int row_stride, int elec_list[],
|
||||
int numWalkers);
|
||||
*/
|
||||
|
||||
void
|
||||
determinant_ratios_grad_lapl_cuda
|
||||
|
@ -143,8 +223,23 @@ determinant_ratios_grad_lapl_cuda
|
|||
|
||||
void
|
||||
determinant_ratios_grad_lapl_cuda
|
||||
(double *Ainv_list[], double *new_row_list[], double *grad_lapl_list[],
|
||||
double ratios_grad_lapl[], int N, int row_stride, int iat, int numWalkers);
|
||||
(double *Ainv_list[], double *new_row_list[],
|
||||
double *grad_lapl_list[], double ratios_grad_lapl[],
|
||||
int N, int row_stride, int iat, int numWalkers);
|
||||
|
||||
#ifdef QMC_COMPLEX
|
||||
void
|
||||
determinant_ratios_grad_lapl_cuda
|
||||
(std::complex<float> *Ainv_list[], std::complex<float> *new_row_list[],
|
||||
std::complex<float> *grad_lapl_list[], std::complex<float> ratios_grad_lapl[],
|
||||
int N, int row_stride, int iat, int numWalkers);
|
||||
|
||||
void
|
||||
determinant_ratios_grad_lapl_cuda
|
||||
(std::complex<double> *Ainv_list[], std::complex<double> *new_row_list[],
|
||||
std::complex<double> *grad_lapl_list[], std::complex<double> ratios_grad_lapl[],
|
||||
int N, int row_stride, int iat, int numWalkers);
|
||||
#endif
|
||||
|
||||
void
|
||||
determinant_ratios_grad_lapl_cuda
|
||||
|
@ -158,6 +253,19 @@ determinant_ratios_grad_lapl_cuda
|
|||
double ratios_grad_lapl[], int N, int row_stride,
|
||||
int iat_list[], int numWalkers);
|
||||
|
||||
#ifdef QMC_COMPLEX
|
||||
void
|
||||
determinant_ratios_grad_lapl_cuda
|
||||
(std::complex<float> *Ainv_list[], std::complex<float> *new_row_list[],
|
||||
std::complex<float> *grad_lapl_list[], std::complex<float> ratios_grad_lapl[],
|
||||
int N, int row_stride, int iat_list[], int numWalkers);
|
||||
|
||||
void
|
||||
determinant_ratios_grad_lapl_cuda
|
||||
(std::complex<double> *Ainv_list[], std::complex<double> *new_row_list[],
|
||||
std::complex<double> *grad_lapl_list[], std::complex<double> ratios_grad_lapl[],
|
||||
int N, int row_stride, int iat_list[], int numWalkers);
|
||||
#endif
|
||||
|
||||
void
|
||||
calc_gradient (float *Ainv_list[], float *grad_lapl_list[],
|
||||
|
@ -169,5 +277,16 @@ calc_gradient (double *Ainv_list[], double *grad_lapl_list[],
|
|||
double grad[], int N, int row_stride, int elec,
|
||||
int numWalkers);
|
||||
|
||||
#ifdef QMC_COMPLEX
|
||||
void
|
||||
calc_gradient (std::complex<float> *Ainv_list[], std::complex<float> *grad_lapl_list[],
|
||||
std::complex<float> grad[], int N, int row_stride, int elec,
|
||||
int numWalkers);
|
||||
|
||||
void
|
||||
calc_gradient (std::complex<double> *Ainv_list[], std::complex<double> *grad_lapl_list[],
|
||||
std::complex<double> grad[], int N, int row_stride, int elec,
|
||||
int numWalkers);
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
|
|
@ -941,7 +941,7 @@ two_body_gradient (double *R[], int first, int last, int iat,
|
|||
|
||||
|
||||
|
||||
template<typename T, int BS>
|
||||
template<typename T, int BS, unsigned COMPLEX>
|
||||
__global__ void
|
||||
two_body_derivs_kernel(T **R, T **gradLogPsi,
|
||||
int e1_first, int e1_last,
|
||||
|
@ -991,7 +991,7 @@ two_body_derivs_kernel(T **R, T **gradLogPsi,
|
|||
int outoff = i*BS+tid;
|
||||
int inoff = outoff + 3*e1_first + 3*b1*BS;
|
||||
r1[0][outoff] = myR[inoff];//[3*e1_first + (3*b1+i)*BS + tid];
|
||||
sGrad[0][outoff] = myGrad[inoff];
|
||||
sGrad[0][outoff] = myGrad[inoff*COMPLEX];
|
||||
}
|
||||
__syncthreads();
|
||||
int ptcl1 = e1_first+b1*BS + tid;
|
||||
|
@ -1082,7 +1082,7 @@ two_body_derivs(float *R[], float *gradLogPsi[], int e1_first, int e1_last,
|
|||
const int BS=32;
|
||||
dim3 dimBlock(BS);
|
||||
dim3 dimGrid(numWalkers);
|
||||
two_body_derivs_kernel<float,BS><<<dimGrid,dimBlock>>>
|
||||
two_body_derivs_kernel<float,BS,1><<<dimGrid,dimBlock>>>
|
||||
(R, gradLogPsi, e1_first, e1_last, e2_first, e2_last, numCoefs,
|
||||
rMax, derivs);
|
||||
}
|
||||
|
@ -1096,12 +1096,46 @@ two_body_derivs(double *R[], double *gradLogPsi[], int e1_first, int e1_last,
|
|||
const int BS=32;
|
||||
dim3 dimBlock(BS);
|
||||
dim3 dimGrid(numWalkers);
|
||||
two_body_derivs_kernel<double,BS><<<dimGrid,dimBlock>>>
|
||||
two_body_derivs_kernel<double,BS,1><<<dimGrid,dimBlock>>>
|
||||
(R, gradLogPsi, e1_first, e1_last, e2_first, e2_last, numCoefs,
|
||||
rMax, derivs);
|
||||
}
|
||||
|
||||
|
||||
// Ye: use offset to recycle the old routines
|
||||
// block size can be further optimized.
|
||||
#ifdef QMC_COMPLEX
|
||||
void
|
||||
two_body_derivs(float *R[], std::complex<float> *gradLogPsi[], int e1_first, int e1_last,
|
||||
int e2_first, int e2_last,
|
||||
int numCoefs, float rMax,
|
||||
float *derivs[], int numWalkers)
|
||||
{
|
||||
const int BS=32;
|
||||
dim3 dimBlock(BS);
|
||||
dim3 dimGrid(numWalkers);
|
||||
|
||||
two_body_derivs_kernel<float,BS,2><<<dimGrid,dimBlock>>>
|
||||
(R, (float**)gradLogPsi, e1_first, e1_last, e2_first, e2_last, numCoefs,
|
||||
rMax, derivs);
|
||||
}
|
||||
|
||||
void
|
||||
two_body_derivs(double *R[], std::complex<double> *gradLogPsi[], int e1_first, int e1_last,
|
||||
int e2_first, int e2_last,
|
||||
int numCoefs, double rMax,
|
||||
double *derivs[], int numWalkers)
|
||||
{
|
||||
const int BS=32;
|
||||
dim3 dimBlock(BS);
|
||||
dim3 dimGrid(numWalkers);
|
||||
|
||||
two_body_derivs_kernel<double,BS,2><<<dimGrid,dimBlock>>>
|
||||
(R, (double**)gradLogPsi, e1_first, e1_last, e2_first, e2_last, numCoefs,
|
||||
rMax, derivs);
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
////////////////////////////////////////////////////////////////
|
||||
// One-body routines //
|
||||
|
@ -1892,7 +1926,7 @@ one_body_gradient (double *Rlist[], int iat, double C[], int first, int last,
|
|||
|
||||
|
||||
|
||||
template<typename T, int BS>
|
||||
template<typename T, int BS, unsigned COMPLEX>
|
||||
__global__ void
|
||||
one_body_derivs_kernel(T* C, T **R, T **gradLogPsi,
|
||||
int cfirst, int clast,
|
||||
|
@ -1939,7 +1973,7 @@ one_body_derivs_kernel(T* C, T **R, T **gradLogPsi,
|
|||
int outoff = i*BS+tid;
|
||||
int inoff = outoff + 3*efirst + 3*be*BS;
|
||||
r[0][outoff] = myR[inoff];
|
||||
sGrad[0][outoff] = myGrad[inoff];
|
||||
sGrad[0][outoff] = myGrad[inoff*COMPLEX];
|
||||
}
|
||||
__syncthreads();
|
||||
int eptcl = efirst+be*BS + tid;
|
||||
|
@ -2023,7 +2057,7 @@ one_body_derivs(float C[], float *R[], float *gradLogPsi[],
|
|||
const int BS=32;
|
||||
dim3 dimBlock(BS);
|
||||
dim3 dimGrid(numWalkers);
|
||||
one_body_derivs_kernel<float,BS><<<dimGrid,dimBlock>>>
|
||||
one_body_derivs_kernel<float,BS,1><<<dimGrid,dimBlock>>>
|
||||
(C, R, gradLogPsi, cfirst, clast, efirst, elast, numCoefs,
|
||||
rMax, derivs);
|
||||
}
|
||||
|
@ -2040,13 +2074,48 @@ one_body_derivs(double C[], double *R[], double *gradLogPsi[],
|
|||
const int BS=32;
|
||||
dim3 dimBlock(BS);
|
||||
dim3 dimGrid(numWalkers);
|
||||
one_body_derivs_kernel<double,BS><<<dimGrid,dimBlock>>>
|
||||
one_body_derivs_kernel<double,BS,1><<<dimGrid,dimBlock>>>
|
||||
(C, R, gradLogPsi, cfirst, clast, efirst, elast, numCoefs,
|
||||
rMax, derivs);
|
||||
}
|
||||
|
||||
// Ye: use offset to recycle the old routines
|
||||
// block size can be further optimized.
|
||||
#ifdef QMC_COMPLEX
|
||||
void
|
||||
one_body_derivs(float C[], float *R[], std::complex<float> *gradLogPsi[],
|
||||
int cfirst, int clast,
|
||||
int efirst, int elast,
|
||||
int numCoefs, float rMax,
|
||||
float *derivs[], int numWalkers)
|
||||
{
|
||||
const int BS=32;
|
||||
dim3 dimBlock(BS);
|
||||
dim3 dimGrid(numWalkers);
|
||||
|
||||
one_body_derivs_kernel<float,BS,2><<<dimGrid,dimBlock>>>
|
||||
(C, R, (float**)gradLogPsi, cfirst, clast, efirst, elast, numCoefs,
|
||||
rMax, derivs);
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
one_body_derivs(double C[], double *R[], std::complex<double> *gradLogPsi[],
|
||||
int cfirst, int clast,
|
||||
int efirst, int elast,
|
||||
int numCoefs, double rMax,
|
||||
double *derivs[], int numWalkers)
|
||||
{
|
||||
const int BS=32;
|
||||
dim3 dimBlock(BS);
|
||||
dim3 dimGrid(numWalkers);
|
||||
|
||||
one_body_derivs_kernel<double,BS,2><<<dimGrid,dimBlock>>>
|
||||
(C, R, (double**)gradLogPsi, cfirst, clast, efirst, elast, numCoefs,
|
||||
rMax, derivs);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
void test()
|
||||
{
|
||||
|
|
|
@ -14,6 +14,7 @@
|
|||
#ifndef BSPLINE_JASTROW_CUDA_H
|
||||
#define BSPLINE_JASTROW_CUDA_H
|
||||
|
||||
#include <complex>
|
||||
#include "NLjobGPU.h"
|
||||
|
||||
///////////////////////
|
||||
|
@ -106,6 +107,19 @@ two_body_derivs(double *R[], double *gradLogPsi[], int e1_first, int e1_last,
|
|||
double *derivs[], int numWalkers);
|
||||
|
||||
|
||||
#ifdef QMC_COMPLEX
|
||||
void
|
||||
two_body_derivs(float *R[], std::complex<float> *gradLogPsi[], int e1_first, int e1_last,
|
||||
int e2_first, int e2_last,
|
||||
int numCoefs, float rMax,
|
||||
float *derivs[], int numWalkers);
|
||||
void
|
||||
two_body_derivs(double *R[], std::complex<double> *gradLogPsi[], int e1_first, int e1_last,
|
||||
int e2_first, int e2_last,
|
||||
int numCoefs, double rMax,
|
||||
double *derivs[], int numWalkers);
|
||||
#endif
|
||||
|
||||
///////////////////////
|
||||
// One-Body routines //
|
||||
///////////////////////
|
||||
|
@ -199,4 +213,22 @@ one_body_derivs(double C[], double *R[], double *gradLogPsi[],
|
|||
int numCoefs, double rMax,
|
||||
double *derivs[], int numWalkers);
|
||||
|
||||
|
||||
#ifdef QMC_COMPLEX
|
||||
void
|
||||
one_body_derivs(float C[], float *R[], std::complex<float> *gradLogPsi[],
|
||||
int cfirst, int clast,
|
||||
int efirst, int elast,
|
||||
int numCoefs, float rMax,
|
||||
float *derivs[], int numWalkers);
|
||||
|
||||
void
|
||||
one_body_derivs(double C[], double *R[], std::complex<double> *gradLogPsi[],
|
||||
int cfirst, int clast,
|
||||
int efirst, int elast,
|
||||
int numCoefs, double rMax,
|
||||
double *derivs[], int numWalkers);
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
|
|
@ -17,9 +17,9 @@
|
|||
|
||||
#define MAX_SPLINES 100
|
||||
#include <stdio.h>
|
||||
#include <config.h>
|
||||
#include "BsplineJastrowCudaPBC.h"
|
||||
#include "../../CUDA/gpu_misc.h"
|
||||
#include <config.h>
|
||||
|
||||
|
||||
bool AisInitializedPBC = false;
|
||||
|
@ -1791,7 +1791,7 @@ two_body_gradient_PBC (double *R[], int first, int last, int iat,
|
|||
|
||||
|
||||
|
||||
template<typename T, int BS>
|
||||
template<typename T, int BS, unsigned COMPLEX>
|
||||
__global__ void
|
||||
two_body_derivs_PBC_kernel(T **R, T **gradLogPsi,
|
||||
int e1_first, int e1_last,
|
||||
|
@ -1848,7 +1848,7 @@ two_body_derivs_PBC_kernel(T **R, T **gradLogPsi,
|
|||
int outoff = i*BS+tid;
|
||||
int inoff = outoff + 3*e1_first + 3*b1*BS;
|
||||
r1[0][outoff] = myR[inoff];//[3*e1_first + (3*b1+i)*BS + tid];
|
||||
sGrad[0][outoff] = myGrad[inoff];
|
||||
sGrad[0][outoff] = myGrad[inoff*COMPLEX];
|
||||
}
|
||||
__syncthreads();
|
||||
int ptcl1 = e1_first+b1*BS + tid;
|
||||
|
@ -1941,11 +1941,11 @@ two_body_derivs_PBC(float *R[], float *gradLogPsi[], int e1_first, int e1_last,
|
|||
dim3 dimBlock(BS);
|
||||
dim3 dimGrid(numWalkers);
|
||||
if (sim_cell_radius >= rMax)
|
||||
two_body_derivs_PBC_kernel<float,BS><<<dimGrid,dimBlock>>>
|
||||
two_body_derivs_PBC_kernel<float,BS,1><<<dimGrid,dimBlock>>>
|
||||
(R, gradLogPsi, e1_first, e1_last, e2_first, e2_last, numCoefs,
|
||||
rMax, lattice, latticeInv, derivs);
|
||||
else
|
||||
two_body_derivs_PBC_kernel<float,BS><<<dimGrid,dimBlock>>>
|
||||
two_body_derivs_PBC_kernel<float,BS,1><<<dimGrid,dimBlock>>>
|
||||
(R, gradLogPsi, e1_first, e1_last, e2_first, e2_last, numCoefs,
|
||||
rMax, lattice, latticeInv, derivs);
|
||||
}
|
||||
|
@ -1961,16 +1961,62 @@ two_body_derivs_PBC(double *R[], double *gradLogPsi[], int e1_first, int e1_last
|
|||
dim3 dimBlock(BS);
|
||||
dim3 dimGrid(numWalkers);
|
||||
if (sim_cell_radius >= rMax)
|
||||
two_body_derivs_PBC_kernel<double,BS><<<dimGrid,dimBlock>>>
|
||||
two_body_derivs_PBC_kernel<double,BS,1><<<dimGrid,dimBlock>>>
|
||||
(R, gradLogPsi, e1_first, e1_last, e2_first, e2_last, numCoefs,
|
||||
rMax, lattice, latticeInv, derivs);
|
||||
else
|
||||
two_body_derivs_PBC_kernel<double,BS><<<dimGrid,dimBlock>>>
|
||||
two_body_derivs_PBC_kernel<double,BS,1><<<dimGrid,dimBlock>>>
|
||||
(R, gradLogPsi, e1_first, e1_last, e2_first, e2_last, numCoefs,
|
||||
rMax, lattice, latticeInv, derivs);
|
||||
}
|
||||
|
||||
|
||||
// Ye: use offset to recycle the old routines
|
||||
// block size can be further optimized.
|
||||
#ifdef QMC_COMPLEX
|
||||
void
|
||||
two_body_derivs_PBC(float *R[], std::complex<float> *gradLogPsi[], int e1_first, int e1_last,
|
||||
int e2_first, int e2_last,
|
||||
int numCoefs, float rMax,
|
||||
float lattice[], float latticeInv[], float sim_cell_radius,
|
||||
float *derivs[], int numWalkers)
|
||||
{
|
||||
const int BS=32;
|
||||
dim3 dimBlock(BS);
|
||||
dim3 dimGrid(numWalkers);
|
||||
|
||||
if (sim_cell_radius >= rMax)
|
||||
two_body_derivs_PBC_kernel<float,BS,2><<<dimGrid,dimBlock>>>
|
||||
(R, (float**)gradLogPsi, e1_first, e1_last, e2_first, e2_last, numCoefs,
|
||||
rMax, lattice, latticeInv, derivs);
|
||||
else
|
||||
two_body_derivs_PBC_kernel<float,BS,2><<<dimGrid,dimBlock>>>
|
||||
(R, (float**)gradLogPsi, e1_first, e1_last, e2_first, e2_last, numCoefs,
|
||||
rMax, lattice, latticeInv, derivs);
|
||||
}
|
||||
|
||||
void
|
||||
two_body_derivs_PBC(double *R[], std::complex<double> *gradLogPsi[], int e1_first, int e1_last,
|
||||
int e2_first, int e2_last,
|
||||
int numCoefs, double rMax,
|
||||
double lattice[], double latticeInv[], double sim_cell_radius,
|
||||
double *derivs[], int numWalkers)
|
||||
{
|
||||
const int BS=32;
|
||||
dim3 dimBlock(BS);
|
||||
dim3 dimGrid(numWalkers);
|
||||
|
||||
if (sim_cell_radius >= rMax)
|
||||
two_body_derivs_PBC_kernel<double,BS,2><<<dimGrid,dimBlock>>>
|
||||
(R, (double**)gradLogPsi, e1_first, e1_last, e2_first, e2_last, numCoefs,
|
||||
rMax, lattice, latticeInv, derivs);
|
||||
else
|
||||
two_body_derivs_PBC_kernel<double,BS,2><<<dimGrid,dimBlock>>>
|
||||
(R, (double**)gradLogPsi, e1_first, e1_last, e2_first, e2_last, numCoefs,
|
||||
rMax, lattice, latticeInv, derivs);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
////////////////////////////////////////////////////////////////
|
||||
// One-body routines //
|
||||
|
@ -3310,7 +3356,7 @@ one_body_gradient_PBC (double *Rlist[], int iat, double C[], int first, int last
|
|||
|
||||
|
||||
|
||||
template<typename T, int BS>
|
||||
template<typename T, int BS, unsigned COMPLEX>
|
||||
__global__ void
|
||||
one_body_derivs_PBC_kernel(T* C, T **R, T **gradLogPsi,
|
||||
int cfirst, int clast,
|
||||
|
@ -3364,7 +3410,7 @@ one_body_derivs_PBC_kernel(T* C, T **R, T **gradLogPsi,
|
|||
int outoff = i*BS+tid;
|
||||
int inoff = outoff + 3*efirst + 3*be*BS;
|
||||
r[0][outoff] = myR[inoff];
|
||||
sGrad[0][outoff] = myGrad[inoff];
|
||||
sGrad[0][outoff] = myGrad[inoff*COMPLEX];
|
||||
}
|
||||
__syncthreads();
|
||||
int eptcl = efirst+be*BS + tid;
|
||||
|
@ -3450,11 +3496,11 @@ one_body_derivs_PBC(float C[], float *R[], float *gradLogPsi[],
|
|||
dim3 dimBlock(BS);
|
||||
dim3 dimGrid(numWalkers);
|
||||
if (sim_cell_radius >= rMax)
|
||||
one_body_derivs_PBC_kernel<float,BS><<<dimGrid,dimBlock>>>
|
||||
one_body_derivs_PBC_kernel<float,BS,1><<<dimGrid,dimBlock>>>
|
||||
(C, R, gradLogPsi, cfirst, clast, efirst, elast, numCoefs,
|
||||
rMax, lattice, latticeInv, derivs);
|
||||
else
|
||||
one_body_derivs_PBC_kernel<float,BS><<<dimGrid,dimBlock>>>
|
||||
one_body_derivs_PBC_kernel<float,BS,1><<<dimGrid,dimBlock>>>
|
||||
(C, R, gradLogPsi, cfirst, clast, efirst, elast, numCoefs,
|
||||
rMax, lattice, latticeInv, derivs);
|
||||
}
|
||||
|
@ -3473,16 +3519,65 @@ one_body_derivs_PBC(double C[], double *R[], double *gradLogPsi[],
|
|||
dim3 dimBlock(BS);
|
||||
dim3 dimGrid(numWalkers);
|
||||
if (sim_cell_radius >= rMax)
|
||||
one_body_derivs_PBC_kernel<double,BS><<<dimGrid,dimBlock>>>
|
||||
one_body_derivs_PBC_kernel<double,BS,1><<<dimGrid,dimBlock>>>
|
||||
(C, R, gradLogPsi, cfirst, clast, efirst, elast, numCoefs,
|
||||
rMax, lattice, latticeInv, derivs);
|
||||
else
|
||||
one_body_derivs_PBC_kernel<double,BS><<<dimGrid,dimBlock>>>
|
||||
one_body_derivs_PBC_kernel<double,BS,1><<<dimGrid,dimBlock>>>
|
||||
(C, R, gradLogPsi, cfirst, clast, efirst, elast, numCoefs,
|
||||
rMax, lattice, latticeInv, derivs);
|
||||
}
|
||||
|
||||
|
||||
// Ye: use offset to recycle the old routines
|
||||
// block size can be further optimized.
|
||||
#ifdef QMC_COMPLEX
|
||||
void
|
||||
one_body_derivs_PBC(float C[], float *R[], std::complex<float> *gradLogPsi[],
|
||||
int cfirst, int clast,
|
||||
int efirst, int elast,
|
||||
int numCoefs, float rMax,
|
||||
float lattice[], float latticeInv[], float sim_cell_radius,
|
||||
float *derivs[], int numWalkers)
|
||||
{
|
||||
const int BS=32;
|
||||
dim3 dimBlock(BS);
|
||||
dim3 dimGrid(numWalkers);
|
||||
|
||||
if (sim_cell_radius >= rMax)
|
||||
one_body_derivs_PBC_kernel<float,BS,2><<<dimGrid,dimBlock>>>
|
||||
(C, R, (float**)gradLogPsi, cfirst, clast, efirst, elast, numCoefs,
|
||||
rMax, lattice, latticeInv, derivs);
|
||||
else
|
||||
one_body_derivs_PBC_kernel<float,BS,2><<<dimGrid,dimBlock>>>
|
||||
(C, R, (float**)gradLogPsi, cfirst, clast, efirst, elast, numCoefs,
|
||||
rMax, lattice, latticeInv, derivs);
|
||||
}
|
||||
|
||||
|
||||
|
||||
void
|
||||
one_body_derivs_PBC(double C[], double *R[], std::complex<double> *gradLogPsi[],
|
||||
int cfirst, int clast,
|
||||
int efirst, int elast,
|
||||
int numCoefs, double rMax,
|
||||
double lattice[], double latticeInv[], double sim_cell_radius,
|
||||
double *derivs[], int numWalkers)
|
||||
{
|
||||
const int BS=32;
|
||||
dim3 dimBlock(BS);
|
||||
dim3 dimGrid(numWalkers);
|
||||
|
||||
if (sim_cell_radius >= rMax)
|
||||
one_body_derivs_PBC_kernel<double,BS,2><<<dimGrid,dimBlock>>>
|
||||
(C, R, (double**)gradLogPsi, cfirst, clast, efirst, elast, numCoefs,
|
||||
rMax, lattice, latticeInv, derivs);
|
||||
else
|
||||
one_body_derivs_PBC_kernel<double,BS,2><<<dimGrid,dimBlock>>>
|
||||
(C, R, (double**)gradLogPsi, cfirst, clast, efirst, elast, numCoefs,
|
||||
rMax, lattice, latticeInv, derivs);
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
void testPBC()
|
||||
|
|
|
@ -15,6 +15,7 @@
|
|||
#ifndef BSPLINE_JASTROW_CUDA_PBC_H
|
||||
#define BSPLINE_JASTROW_CUDA_PBC_H
|
||||
|
||||
#include <complex>
|
||||
#include "NLjobGPU.h"
|
||||
|
||||
///////////////////////
|
||||
|
@ -117,6 +118,23 @@ two_body_derivs_PBC(double *R[], double *gradLogPsi[], int e1_first, int e1_last
|
|||
double *derivs[], int numWalkers);
|
||||
|
||||
|
||||
|
||||
#ifdef QMC_COMPLEX
|
||||
void
|
||||
two_body_derivs_PBC(float *R[], std::complex<float> *gradLogPsi[], int e1_first, int e1_last,
|
||||
int e2_first, int e2_last,
|
||||
int numCoefs, float rMax,
|
||||
float lattice[], float latticeInv[], float sim_cell_radius,
|
||||
float *derivs[], int numWalkers);
|
||||
void
|
||||
two_body_derivs_PBC(double *R[], std::complex<double> *gradLogPsi[], int e1_first, int e1_last,
|
||||
int e2_first, int e2_last,
|
||||
int numCoefs, double rMax,
|
||||
double lattice[], double latticeInv[], double sim_cell_radius,
|
||||
double *derivs[], int numWalkers);
|
||||
#endif
|
||||
|
||||
|
||||
///////////////////////
|
||||
// One-Body routines //
|
||||
///////////////////////
|
||||
|
@ -222,4 +240,25 @@ one_body_derivs_PBC(double C[], double *R[], double *gradLogPsi[],
|
|||
double lattice[], double latticeInv[], double sim_cell_radius,
|
||||
double *derivs[], int numWalkers);
|
||||
|
||||
|
||||
#ifdef QMC_COMPLEX
|
||||
|
||||
void
|
||||
one_body_derivs_PBC(float C[], float *R[], std::complex<float> *gradLogPsi[],
|
||||
int cfirst, int clast,
|
||||
int efirst, int elast,
|
||||
int numCoefs, float rMax,
|
||||
float lattice[], float latticeInv[], float sim_cell_radius,
|
||||
float *derivs[], int numWalkers);
|
||||
|
||||
void
|
||||
one_body_derivs_PBC(double C[], double *R[], std::complex<double> *gradLogPsi[],
|
||||
int cfirst, int clast,
|
||||
int efirst, int elast,
|
||||
int numCoefs, double rMax,
|
||||
double lattice[], double latticeInv[], double sim_cell_radius,
|
||||
double *derivs[], int numWalkers);
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
|
|
@ -495,7 +495,7 @@ OneBodyJastrowOrbitalBspline::resetParameters(const opt_variables_type& active)
|
|||
void
|
||||
OneBodyJastrowOrbitalBspline::evaluateDerivatives
|
||||
(MCWalkerConfiguration &W, const opt_variables_type& optvars,
|
||||
ValueMatrix_t &d_logpsi, ValueMatrix_t &dlapl_over_psi)
|
||||
RealMatrix_t &d_logpsi, RealMatrix_t &dlapl_over_psi)
|
||||
{
|
||||
CudaReal sim_cell_radius = W.Lattice.SimulationCellRadius;
|
||||
std::vector<Walker_t*> &walkers = W.WalkerList;
|
||||
|
|
|
@ -108,8 +108,8 @@ public:
|
|||
std::vector<PosType> &quadPoints, std::vector<ValueType> &psi_ratios);
|
||||
void evaluateDerivatives (MCWalkerConfiguration &W,
|
||||
const opt_variables_type& optvars,
|
||||
ValueMatrix_t &dlogpsi,
|
||||
ValueMatrix_t &dlapl_over_psi);
|
||||
RealMatrix_t &dlogpsi,
|
||||
RealMatrix_t &dlapl_over_psi);
|
||||
OneBodyJastrowOrbitalBspline(ParticleSet ¢ers, ParticleSet& elecs) :
|
||||
OneBodyJastrowOrbital<BsplineFunctor<OrbitalBase::RealType> > (centers,elecs),
|
||||
ElecRef(elecs),
|
||||
|
|
|
@ -568,7 +568,7 @@ TwoBodyJastrowOrbitalBspline::resetParameters(const opt_variables_type& active)
|
|||
void
|
||||
TwoBodyJastrowOrbitalBspline::evaluateDerivatives
|
||||
(MCWalkerConfiguration &W, const opt_variables_type& optvars,
|
||||
ValueMatrix_t &d_logpsi, ValueMatrix_t &dlapl_over_psi)
|
||||
RealMatrix_t &d_logpsi, RealMatrix_t &dlapl_over_psi)
|
||||
{
|
||||
CudaReal sim_cell_radius = W.Lattice.SimulationCellRadius;
|
||||
std::vector<Walker_t*> &walkers = W.WalkerList;
|
||||
|
|
|
@ -109,8 +109,8 @@ public:
|
|||
void
|
||||
evaluateDerivatives (MCWalkerConfiguration &W,
|
||||
const opt_variables_type& optvars,
|
||||
ValueMatrix_t &dlogpsi,
|
||||
ValueMatrix_t &dlapl_over_psi);
|
||||
RealMatrix_t &dlogpsi,
|
||||
RealMatrix_t &dlapl_over_psi);
|
||||
|
||||
//TwoBodyJastrowOrbitalBspline(ParticleSet& pset, bool is_master) :
|
||||
// TwoBodyJastrowOrbital<BsplineFunctor<OrbitalBase::RealType> > (pset, is_master),
|
||||
|
|
|
@ -95,8 +95,9 @@ struct OrbitalBase: public QMCTraits
|
|||
typedef ParticleAttrib<GradType> GradVectorType;
|
||||
typedef PooledData<RealType> BufferType;
|
||||
typedef ParticleSet::Walker_t Walker_t;
|
||||
typedef OrbitalSetTraits<ValueType>::ValueMatrix_t ValueMatrix_t;
|
||||
typedef OrbitalSetTraits<ValueType>::GradMatrix_t GradMatrix_t;
|
||||
typedef OrbitalSetTraits<RealType>::ValueMatrix_t RealMatrix_t;
|
||||
typedef OrbitalSetTraits<ValueType>::ValueMatrix_t ValueMatrix_t;
|
||||
typedef OrbitalSetTraits<ValueType>::GradMatrix_t GradMatrix_t;
|
||||
typedef OrbitalSetTraits<ValueType>::HessType HessType;
|
||||
typedef OrbitalSetTraits<ValueType>::HessVector_t HessVector_t;
|
||||
|
||||
|
@ -502,7 +503,7 @@ struct OrbitalBase: public QMCTraits
|
|||
virtual void recompute(MCWalkerConfiguration &W, bool firstTime)
|
||||
{ }
|
||||
|
||||
virtual void reserve (PointerPool<gpu::device_vector<CudaRealType> > &pool)
|
||||
virtual void reserve (PointerPool<gpu::device_vector<CudaValueType> > &pool)
|
||||
{ }
|
||||
|
||||
/** Evaluate the log of the WF for all walkers
|
||||
|
@ -645,8 +646,8 @@ struct OrbitalBase: public QMCTraits
|
|||
virtual void
|
||||
evaluateDerivatives (MCWalkerConfiguration &W,
|
||||
const opt_variables_type& optvars,
|
||||
ValueMatrix_t &dgrad_logpsi,
|
||||
ValueMatrix_t &dhpsi_over_psi)
|
||||
RealMatrix_t &dgrad_logpsi,
|
||||
RealMatrix_t &dhpsi_over_psi)
|
||||
{
|
||||
app_error() << "Need specialization of OrbitalBase::evaluateDerivatives.\n";
|
||||
abort();
|
||||
|
@ -656,8 +657,8 @@ struct OrbitalBase: public QMCTraits
|
|||
}
|
||||
#endif
|
||||
/***************************************************************************
|
||||
* $RCSfile$ $Author$
|
||||
* $Revision$ $Date$
|
||||
* $Id$
|
||||
* $RCSfile$ $Author: yingwai $
|
||||
* $Revision: 7279 $ $Date: 2016-11-23 19:21:16 -0500 (Wed, 23 Nov 2016) $
|
||||
* $Id: OrbitalBase.h 7279 2016-11-24 00:21:16Z yingwai $
|
||||
***************************************************************************/
|
||||
|
||||
|
|
|
@ -15,6 +15,7 @@
|
|||
|
||||
|
||||
#include <assert.h>
|
||||
#include <thrust/complex.h>
|
||||
|
||||
/* !!!WARNING!!!
|
||||
Kernels in this file strongly depend on warp-synchronous behavior
|
||||
|
@ -550,6 +551,7 @@ void apply_phase_factors(double kPoints[], int makeTwoCopies[],
|
|||
GL_in, GL_out, num_splines, num_walkers, row_stride);
|
||||
}
|
||||
|
||||
|
||||
void apply_phase_factors(double kPoints[], int makeTwoCopies[], int TwoCopiesIndex[],
|
||||
double pos[], double *phi_in[], double *phi_out[],
|
||||
double *GL_in[], double *GL_out[],
|
||||
|
@ -563,23 +565,77 @@ void apply_phase_factors(double kPoints[], int makeTwoCopies[], int TwoCopiesInd
|
|||
GL_in, GL_out, num_splines, num_walkers, row_stride);
|
||||
}
|
||||
|
||||
#ifdef ALGO_CHRISTOS
|
||||
// Slower for some runs
|
||||
#define WARP_SIZE 32
|
||||
|
||||
// NO_TWO_COPIES can be set to true if the makeTwoCopies is guaranteed to
|
||||
// only hold 0s.
|
||||
template<typename T, int BS, bool NO_TWO_COPIES> __global__
|
||||
void phase_factor_kernel (T *kPoints, int *makeTwoCopies,
|
||||
T *pos, T **phi_in, T **phi_out,
|
||||
T **grad_lapl_in, T **grad_lapl_out,
|
||||
// YingWai's implementation for complex wavefunctions
|
||||
// * The use of shared memory can be further optimized.
|
||||
// * Thread count can be further optimized.
|
||||
// Ye's optimization eliminates the shared memory buffer and potential race conditions.
|
||||
// * BS>32 is safe but the optimal choice still needs further attempts.
|
||||
|
||||
#ifdef QMC_COMPLEX
|
||||
// blockIdx.x = counter for walker
|
||||
// BS threads per block
|
||||
template<typename T, typename T2, int BS> __global__
|
||||
void phase_factor_kernel (T *kPoints,
|
||||
T *pos, T2 **phi_in, T2 **phi_out,
|
||||
int num_splines)
|
||||
{
|
||||
__shared__ T pos_s[3];
|
||||
__shared__ T2 *phi_in_ptr, *phi_out_ptr;
|
||||
|
||||
int tid = threadIdx.x;
|
||||
|
||||
// Copy electron position into shared memory
|
||||
if (tid < 3)
|
||||
pos_s[tid] = pos[3*blockIdx.x+tid];
|
||||
|
||||
// Set pointers to point to the address (of the 1st element) where phi for that walker is stored
|
||||
if (tid == 0)
|
||||
{
|
||||
phi_in_ptr = phi_in[blockIdx.x];
|
||||
phi_out_ptr = phi_out[blockIdx.x];
|
||||
}
|
||||
|
||||
// BS>warpSize sync needed
|
||||
//__syncthreads();
|
||||
|
||||
// "ib" is NOT the thread block! This is just a "block" of BS splines that the BS threads
|
||||
// process together at one time, so it has to repeat NB times to take care of all the splines
|
||||
int NB = (num_splines+BS-1)/BS;
|
||||
for (int ib=0; ib<NB; ib++)
|
||||
{
|
||||
int off = ib*BS + tid;
|
||||
if (off < num_splines)
|
||||
{
|
||||
// Now compute phase factors
|
||||
T phase = -(kPoints[off*3 ]*pos_s[0] +
|
||||
kPoints[off*3+1]*pos_s[1] +
|
||||
kPoints[off*3+2]*pos_s[2]);
|
||||
T s, c;
|
||||
sincos (phase, &s, &c);
|
||||
T2 e_mikr = T2(c,s);
|
||||
|
||||
// Write back to global memory directly
|
||||
phi_out_ptr[ib*BS+tid] = phi_in_ptr[ib*BS+tid]*e_mikr;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template<typename T, typename T2, int BS> __global__
|
||||
void phase_factor_kernel (T *kPoints,
|
||||
T *pos, T2 **phi_in, T2 **phi_out,
|
||||
T2 **grad_lapl_in, T2 **grad_lapl_out,
|
||||
int num_splines, int num_walkers,
|
||||
int row_stride)
|
||||
{
|
||||
volatile __shared__ T in_shared[5][2*BS+1], out_shared[5][BS+1], kPoints_s[BS][3];
|
||||
// Dummy quantities to make use of shared memory
|
||||
__shared__ T pos_s[3];
|
||||
__shared__ T *my_phi_in, *my_phi_out, *my_GL_in, *my_GL_out;
|
||||
__shared__ T2 *my_phi_in, *my_phi_out, *my_GL_in, *my_GL_out;
|
||||
|
||||
int tid = threadIdx.x;
|
||||
|
||||
// Set pointers to point to the address (of the 1st element) where phi for that walker is stored
|
||||
if (tid == 0)
|
||||
{
|
||||
my_phi_in = phi_in[blockIdx.x];
|
||||
|
@ -587,189 +643,122 @@ void phase_factor_kernel (T *kPoints, int *makeTwoCopies,
|
|||
my_GL_in = grad_lapl_in[blockIdx.x];
|
||||
my_GL_out = grad_lapl_out[blockIdx.x];
|
||||
}
|
||||
// Copy electron position into shared memory
|
||||
if (tid < 3)
|
||||
pos_s[tid] = pos[3*blockIdx.x+tid];
|
||||
|
||||
if (BS > WARP_SIZE) __syncthreads();
|
||||
__syncthreads();
|
||||
|
||||
// "block" is NOT the thread block! This is just a "block" of BS splines that the BS threads
|
||||
// process together at one time, so it has to repeat nb times to take care of all the splines
|
||||
int nb = (num_splines + BS-1)/BS;
|
||||
int outIndex=0;
|
||||
int outBlock=0;
|
||||
|
||||
// 0 if NO_TWO_COPIES, but won't compile
|
||||
__shared__ int m2c[NO_TWO_COPIES ? 1 : BS];
|
||||
|
||||
for (int block=0; block<nb; block++)
|
||||
{
|
||||
// Load kpoints into shared memory
|
||||
for (int i=0; i<3; i++)
|
||||
T kPoints_s[3];
|
||||
int off = block*BS + tid;
|
||||
if (off < num_splines)
|
||||
{
|
||||
int off = (3*block+i)*BS + tid;
|
||||
if (off < 3*num_splines)
|
||||
kPoints_s[0][i*BS+tid] = kPoints[off];
|
||||
}
|
||||
// Load phi_in with coallesced reads
|
||||
if ((2*block+0)*BS+tid < 2*num_splines)
|
||||
{
|
||||
in_shared[0][tid+ 0] = my_phi_in[(2*block+0)*BS+tid];
|
||||
// Load kpoints
|
||||
kPoints_s[0] = kPoints[off*3 ];
|
||||
kPoints_s[1] = kPoints[off*3+1];
|
||||
kPoints_s[2] = kPoints[off*3+2];
|
||||
// Now compute phase factors
|
||||
T phase = -(kPoints_s[0]*pos_s[0] +
|
||||
kPoints_s[1]*pos_s[1] +
|
||||
kPoints_s[2]*pos_s[2]);
|
||||
T s, c;
|
||||
sincos (phase, &s, &c);
|
||||
T2 e_mikr = T2(c,s);
|
||||
|
||||
// Temp. storage for phi, grad, lapl in the registers
|
||||
T2 u, gradlaplu[4];
|
||||
u = my_phi_in[off];
|
||||
for (int j=0; j<4; j++)
|
||||
in_shared[j+1][tid+ 0] = my_GL_in[2*j*num_splines+(2*block+0)*BS+tid];
|
||||
}
|
||||
if ((2*block+1)*BS+tid < 2*num_splines)
|
||||
{
|
||||
in_shared[0][tid+BS] = my_phi_in[(2*block+1)*BS+tid];
|
||||
gradlaplu[j] = my_GL_in[j*num_splines+off];
|
||||
|
||||
// Apply e^(-ikr) to the wavefunction
|
||||
my_phi_out[off] = u * e_mikr;
|
||||
|
||||
// Laplacian = e^(-ikr)*(laplu - u*k^2 - 2ik*gradu)
|
||||
T k2 = (kPoints_s[0]*kPoints_s[0] +
|
||||
kPoints_s[1]*kPoints_s[1] +
|
||||
kPoints_s[2]*kPoints_s[2]);
|
||||
gradlaplu[3] = e_mikr * ( gradlaplu[3] - k2*u
|
||||
- T2(0.0,2.0)*(kPoints_s[0]*gradlaplu[0]+
|
||||
kPoints_s[1]*gradlaplu[1]+
|
||||
kPoints_s[2]*gradlaplu[2]) );
|
||||
|
||||
// Gradient = e^(-ikr)*(-i*u*k + gradu)
|
||||
for (int dim=0; dim<3; dim++)
|
||||
{
|
||||
T2 g;
|
||||
g = gradlaplu[dim] - (T2(0.0,1.0) * u * kPoints_s[dim]);
|
||||
gradlaplu[dim] = g * e_mikr;
|
||||
}
|
||||
|
||||
// Write back to global memory directly
|
||||
for (int j=0; j<4; j++)
|
||||
in_shared[j+1][tid+BS] = my_GL_in[2*j*num_splines+(2*block+1)*BS+tid];
|
||||
my_GL_out[j*row_stride + off] = gradlaplu[j];
|
||||
}
|
||||
|
||||
if (BS > WARP_SIZE) __syncthreads();
|
||||
|
||||
// Now add on phase factors
|
||||
T phase = -(pos_s[0]*kPoints_s[tid][0] +
|
||||
pos_s[1]*kPoints_s[tid][1] +
|
||||
pos_s[2]*kPoints_s[tid][2]);
|
||||
T s, c;
|
||||
sincosf (phase, &s, &c);
|
||||
T u_re, u_im, gradu_re[3], gradu_im[3], laplu_re, laplu_im;
|
||||
u_re = in_shared[0][2*tid+0];
|
||||
u_im = in_shared[0][2*tid+1];
|
||||
gradu_re[0] = in_shared[1][2*tid+0];
|
||||
gradu_im[0] = in_shared[1][2*tid+1];
|
||||
gradu_re[1] = in_shared[2][2*tid+0];
|
||||
gradu_im[1] = in_shared[2][2*tid+1];
|
||||
gradu_re[2] = in_shared[3][2*tid+0];
|
||||
gradu_im[2] = in_shared[3][2*tid+1];
|
||||
laplu_re = in_shared[4][2*tid+0];
|
||||
laplu_im = in_shared[4][2*tid+1];
|
||||
in_shared[0][2*tid+0] = u_re*c - u_im*s;
|
||||
in_shared[0][2*tid+1] = u_re*s + u_im*c;
|
||||
// Gradient = e^(-ikr)*(-i*u*k + gradu)
|
||||
for (int dim=0; dim<3; dim++)
|
||||
{
|
||||
T gre, gim;
|
||||
gre = gradu_re[dim] + kPoints_s[tid][dim]*u_im;
|
||||
gim = gradu_im[dim] - kPoints_s[tid][dim]*u_re;
|
||||
in_shared[dim+1][2*tid+0] = gre*c - gim*s;
|
||||
in_shared[dim+1][2*tid+1] = gre*s + gim*c;
|
||||
}
|
||||
// Add phase contribution to laplacian
|
||||
T k2 = (kPoints_s[tid][0]*kPoints_s[tid][0] +
|
||||
kPoints_s[tid][1]*kPoints_s[tid][1] +
|
||||
kPoints_s[tid][2]*kPoints_s[tid][2]);
|
||||
T lre = laplu_re - k2*u_re + 2.0*(kPoints_s[tid][0]*gradu_im[0]+
|
||||
kPoints_s[tid][1]*gradu_im[1]+
|
||||
kPoints_s[tid][2]*gradu_im[2]);
|
||||
T lim = laplu_im - k2*u_im - 2.0*(kPoints_s[tid][0]*gradu_re[0]+
|
||||
kPoints_s[tid][1]*gradu_re[1]+
|
||||
kPoints_s[tid][2]*gradu_re[2]);
|
||||
in_shared[4][2*tid+0] = lre*c - lim*s;
|
||||
in_shared[4][2*tid+1] = lre*s + lim*c;
|
||||
// Load makeTwoCopies with coallesced reads
|
||||
|
||||
if (!NO_TWO_COPIES)
|
||||
if (block*BS+tid < num_splines)
|
||||
m2c[tid] = makeTwoCopies[block*BS + tid];
|
||||
|
||||
if (BS > WARP_SIZE) __syncthreads();
|
||||
|
||||
// Now, serialize to output buffer
|
||||
int end = min (BS, num_splines - block*BS);
|
||||
|
||||
// christos: When NO_TWO_COPIES is on, the output buffer logic can be
|
||||
// optimized further, cutting down on instruction level clutter,
|
||||
// synchronization & overall divergence. This is because outIndex's
|
||||
// updates follow the induction variable (i).
|
||||
|
||||
if (NO_TWO_COPIES && end==BS) {
|
||||
// christos: note: it is possible to eliminate out_shared under
|
||||
// certain conditions
|
||||
for (int i=tid; i<end; i+=BS) {
|
||||
for (int j=0 ; j<5 ; j++)
|
||||
out_shared[j][outIndex+i] = in_shared[j][2*i+0];
|
||||
}
|
||||
|
||||
if (BS > WARP_SIZE) __syncthreads();
|
||||
my_phi_out[ outBlock*BS+tid] = out_shared[0][tid];
|
||||
my_GL_out[0*row_stride +outBlock*BS+tid] = out_shared[1][tid];
|
||||
my_GL_out[1*row_stride +outBlock*BS+tid] = out_shared[2][tid];
|
||||
my_GL_out[2*row_stride +outBlock*BS+tid] = out_shared[3][tid];
|
||||
my_GL_out[3*row_stride +outBlock*BS+tid] = out_shared[4][tid];
|
||||
|
||||
outIndex = 0;
|
||||
outBlock++;
|
||||
} else {
|
||||
|
||||
for (int i=0; i<end; i++)
|
||||
{
|
||||
if (tid < 5)
|
||||
out_shared[tid][outIndex] = in_shared[tid][2*i+0];
|
||||
outIndex++;
|
||||
if (BS > WARP_SIZE) __syncthreads();
|
||||
if (outIndex == BS)
|
||||
{
|
||||
// Write back to global memory
|
||||
my_phi_out[ outBlock*BS+tid] = out_shared[0][tid];
|
||||
my_GL_out[0*row_stride +outBlock*BS+tid] = out_shared[1][tid];
|
||||
my_GL_out[1*row_stride +outBlock*BS+tid] = out_shared[2][tid];
|
||||
my_GL_out[2*row_stride +outBlock*BS+tid] = out_shared[3][tid];
|
||||
my_GL_out[3*row_stride +outBlock*BS+tid] = out_shared[4][tid];
|
||||
outIndex = 0;
|
||||
outBlock++;
|
||||
}
|
||||
if (!NO_TWO_COPIES && m2c[i])
|
||||
{
|
||||
if (tid < 5)
|
||||
out_shared[tid][outIndex] = in_shared[tid][2*i+1];
|
||||
outIndex++;
|
||||
if (BS > WARP_SIZE) __syncthreads();
|
||||
if (outIndex == BS)
|
||||
{
|
||||
// Write back to global memory
|
||||
my_phi_out[ outBlock*BS+tid] = out_shared[0][tid];
|
||||
my_GL_out[0*row_stride +outBlock*BS+tid] = out_shared[1][tid];
|
||||
my_GL_out[1*row_stride +outBlock*BS+tid] = out_shared[2][tid];
|
||||
my_GL_out[2*row_stride +outBlock*BS+tid] = out_shared[3][tid];
|
||||
my_GL_out[3*row_stride +outBlock*BS+tid] = out_shared[4][tid];
|
||||
outIndex = 0;
|
||||
outBlock++;
|
||||
if (BS > WARP_SIZE) __syncthreads();
|
||||
}
|
||||
}
|
||||
}
|
||||
if (BS > WARP_SIZE) __syncthreads();
|
||||
}
|
||||
}
|
||||
if (tid < outIndex)
|
||||
{
|
||||
my_phi_out[ outBlock*BS+tid] = out_shared[0][tid];
|
||||
my_GL_out[0*row_stride +outBlock*BS+tid] = out_shared[1][tid];
|
||||
my_GL_out[1*row_stride +outBlock*BS+tid] = out_shared[2][tid];
|
||||
my_GL_out[2*row_stride +outBlock*BS+tid] = out_shared[3][tid];
|
||||
my_GL_out[3*row_stride +outBlock*BS+tid] = out_shared[4][tid];
|
||||
}
|
||||
}
|
||||
|
||||
void apply_phase_factors(float kPoints[], int makeTwoCopies[],
|
||||
float pos[], float *phi_in[], float *phi_out[],
|
||||
float *GL_in[], float *GL_out[],
|
||||
int num_splines, int num_walkers, int row_stride,
|
||||
bool dontMakeTwoCopies)
|
||||
|
||||
// YingWai's implementation for complex wavefunction
|
||||
|
||||
void apply_phase_factors(float kPoints[], float pos[],
|
||||
std::complex<float>* phi_in[], std::complex<float>* phi_out[],
|
||||
int num_splines, int num_walkers)
|
||||
{
|
||||
const int BS = 64;
|
||||
const int BS = 32;
|
||||
dim3 dimBlock(BS);
|
||||
dim3 dimGrid (num_walkers);
|
||||
|
||||
// std::cout << "num_splines=" << num_splines << ", nb=" << ((num_splines + BS-1)/BS) << std::endl;
|
||||
|
||||
if (dontMakeTwoCopies) {
|
||||
phase_factor_kernel<float,BS,true><<<dimGrid,dimBlock, 0, gpu::kernelStream>>>
|
||||
(kPoints, makeTwoCopies, pos, phi_in, phi_out,
|
||||
GL_in, GL_out, num_splines, num_walkers, row_stride);
|
||||
} else {
|
||||
phase_factor_kernel<float,BS,false><<<dimGrid,dimBlock, 0, gpu::kernelStream>>>
|
||||
(kPoints, makeTwoCopies, pos, phi_in, phi_out,
|
||||
GL_in, GL_out, num_splines, num_walkers, row_stride);
|
||||
}
|
||||
phase_factor_kernel<float,thrust::complex<float>,BS><<<dimGrid,dimBlock>>>
|
||||
(kPoints, pos, (thrust::complex<float>**)phi_in, (thrust::complex<float>**)phi_out, num_splines);
|
||||
}
|
||||
|
||||
void apply_phase_factors(double kPoints[], double pos[],
|
||||
std::complex<double>* phi_in[], std::complex<double>* phi_out[],
|
||||
int num_splines, int num_walkers)
|
||||
{
|
||||
const int BS = 32;
|
||||
dim3 dimBlock(BS);
|
||||
dim3 dimGrid (num_walkers);
|
||||
|
||||
phase_factor_kernel<double,thrust::complex<double>,BS><<<dimGrid,dimBlock>>>
|
||||
(kPoints, pos, (thrust::complex<double>**)phi_in, (thrust::complex<double>**)phi_out, num_splines);
|
||||
}
|
||||
|
||||
void apply_phase_factors(float kPoints[], float pos[],
|
||||
std::complex<float>* phi_in[], std::complex<float>* phi_out[],
|
||||
std::complex<float>* GL_in[], std::complex<float>* GL_out[],
|
||||
int num_splines, int num_walkers, int row_stride)
|
||||
{
|
||||
|
||||
const int BS = 128;
|
||||
dim3 dimBlock(BS);
|
||||
dim3 dimGrid (num_walkers);
|
||||
|
||||
phase_factor_kernel<float,thrust::complex<float>,BS><<<dimGrid,dimBlock, 0, gpu::kernelStream>>>
|
||||
(kPoints, pos, (thrust::complex<float>**)phi_in, (thrust::complex<float>**)phi_out, (thrust::complex<float>**)GL_in, (thrust::complex<float>**)GL_out,
|
||||
num_splines, num_walkers, row_stride);
|
||||
}
|
||||
|
||||
|
||||
void apply_phase_factors(double kPoints[], double pos[],
|
||||
std::complex<double>* phi_in[], std::complex<double>* phi_out[],
|
||||
std::complex<double>* GL_in[], std::complex<double>* GL_out[],
|
||||
int num_splines, int num_walkers, int row_stride)
|
||||
{
|
||||
const int BS = 128;
|
||||
dim3 dimBlock(BS);
|
||||
dim3 dimGrid (num_walkers);
|
||||
|
||||
phase_factor_kernel<double,thrust::complex<double>,BS><<<dimGrid,dimBlock, 0, gpu::kernelStream>>>
|
||||
(kPoints, pos, (thrust::complex<double>**)phi_in, (thrust::complex<double>**)phi_out, (thrust::complex<double>**)GL_in, (thrust::complex<double>**)GL_out,
|
||||
num_splines, num_walkers, row_stride);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
|
|
|
@ -0,0 +1,57 @@
|
|||
#ifndef PHASE_FACTORS_H
|
||||
#define PHASE_FACTORS_H
|
||||
|
||||
#include<complex>
|
||||
|
||||
void apply_phase_factors(float kPoints[], int makeTwoCopies[],
|
||||
float pos[], float *phi_in[], float *phi_out[],
|
||||
int num_splines, int num_walkers);
|
||||
|
||||
void apply_phase_factors(float kPoints[], int makeTwoCopies[],
|
||||
float pos[], float *phi_in[], float *phi_out[],
|
||||
float *GL_in[], float *GL_out[],
|
||||
int num_splines, int num_walkers, int row_stride);
|
||||
|
||||
void apply_phase_factors(float kPoints[], int makeTwoCopies[], int TwoCopiesIndex[],
|
||||
float pos[], float *phi_in[], float *phi_out[],
|
||||
float *GL_in[], float *GL_out[],
|
||||
int num_splines, int num_walkers, int row_stride);
|
||||
|
||||
void apply_phase_factors(double kPoints[], int makeTwoCopies[],
|
||||
double pos[], double *phi_in[], double *phi_out[],
|
||||
int num_splines, int num_walkers);
|
||||
|
||||
void apply_phase_factors(double kPoints[], int makeTwoCopies[],
|
||||
double pos[], double *phi_in[], double *phi_out[],
|
||||
double *GL_in[], double *GL_out[],
|
||||
int num_splines, int num_walkers, int row_stride);
|
||||
|
||||
void apply_phase_factors(double kPoints[], int makeTwoCopies[], int TwoCopiesIndex[],
|
||||
double pos[], double *phi_in[], double *phi_out[],
|
||||
double *GL_in[], double *GL_out[],
|
||||
int num_splines, int num_walkers, int row_stride);
|
||||
|
||||
|
||||
#ifdef QMC_COMPLEX
|
||||
// YingWai's complex implementation
|
||||
void apply_phase_factors(float kPoints[], float pos[],
|
||||
std::complex<float>* phi_in[], std::complex<float>* phi_out[],
|
||||
int num_splines, int num_walkers);
|
||||
|
||||
void apply_phase_factors(double kPoints[], double pos[],
|
||||
std::complex<double>* phi_in[], std::complex<double>* phi_out[],
|
||||
int num_splines, int num_walkers);
|
||||
|
||||
void apply_phase_factors(float kPoints[], float pos[],
|
||||
std::complex<float>* phi_in[], std::complex<float>* phi_out[],
|
||||
std::complex<float>* GL_in[], std::complex<float>* GL_out[],
|
||||
int num_splines, int num_walkers, int row_stride);
|
||||
|
||||
void apply_phase_factors(double kPoints[], double pos[],
|
||||
std::complex<double>* phi_in[], std::complex<double>* phi_out[],
|
||||
std::complex<double>* GL_in[], std::complex<double>* GL_out[],
|
||||
int num_splines, int num_walkers, int row_stride);
|
||||
#endif
|
||||
|
||||
|
||||
#endif
|
|
@ -313,7 +313,7 @@ public:
|
|||
// Walker-parallel vectorized functions //
|
||||
//////////////////////////////////////////
|
||||
virtual void
|
||||
reserve (PointerPool<gpu::device_vector<CudaRealType> > &pool) { }
|
||||
reserve (PointerPool<gpu::device_vector<CudaValueType> > &pool) { }
|
||||
|
||||
virtual void
|
||||
evaluate (std::vector<Walker_t*> &walkers, int iat, gpu::device_vector<CudaValueType*> &phi);
|
||||
|
@ -350,8 +350,8 @@ typedef SPOSetBase* SPOSetBasePtr;
|
|||
#endif
|
||||
|
||||
/***************************************************************************
|
||||
* $RCSfile$ $Author$
|
||||
* $Revision$ $Date$
|
||||
* $Id$
|
||||
* $RCSfile$ $Author: yingwai $
|
||||
* $Revision: 7279 $ $Date: 2016-11-23 19:21:16 -0500 (Wed, 23 Nov 2016) $
|
||||
* $Id: SPOSetBase.h 7279 2016-11-24 00:21:16Z yingwai $
|
||||
***************************************************************************/
|
||||
|
||||
|
|
|
@ -113,6 +113,7 @@ public:
|
|||
typedef OrbitalBase::CudaValueType CudaValueType;
|
||||
typedef OrbitalBase::CudaGradType CudaGradType;
|
||||
typedef OrbitalBase::CudaRealType CudaRealType;
|
||||
typedef OrbitalBase::RealMatrix_t RealMatrix_t;
|
||||
typedef OrbitalBase::ValueMatrix_t ValueMatrix_t;
|
||||
typedef OrbitalBase::GradMatrix_t GradMatrix_t;
|
||||
typedef ParticleSet::Walker_t Walker_t;
|
||||
|
@ -427,9 +428,8 @@ public:
|
|||
|
||||
void recompute (MCWalkerConfiguration &W, bool firstTime=true);
|
||||
|
||||
void reserve (PointerPool<gpu::device_vector<CudaRealType> > &pool,
|
||||
void reserve (PointerPool<gpu::device_vector<CudaValueType> > &pool,
|
||||
bool onlyOptimizable=false);
|
||||
|
||||
void getGradient (MCWalkerConfiguration &W, int iat,
|
||||
std::vector<GradType> &grad);
|
||||
void calcGradient (MCWalkerConfiguration &W, int iat,
|
||||
|
@ -455,12 +455,16 @@ public:
|
|||
std::vector<ValueType> &psi_ratios,
|
||||
std::vector<GradType> &newG,
|
||||
std::vector<ValueType> &newL);
|
||||
|
||||
#ifdef QMC_COMPLEX
|
||||
void convertRatiosFromComplexToReal (std::vector<ValueType> &psi_ratios,
|
||||
std::vector<RealType> &psi_ratios_real);
|
||||
#endif
|
||||
void ratio (std::vector<Walker_t*> &walkers, std::vector<int> &iatList,
|
||||
std::vector<PosType> &rNew,
|
||||
std::vector<ValueType> &psi_ratios,
|
||||
std::vector<GradType> &newG,
|
||||
std::vector<ValueType> &newL);
|
||||
|
||||
void NLratios (MCWalkerConfiguration &W,
|
||||
gpu::device_vector<CUDA_PRECISION*> &Rlist,
|
||||
gpu::device_vector<int*> &ElecList,
|
||||
|
@ -468,6 +472,7 @@ public:
|
|||
gpu::device_vector<CUDA_PRECISION*> &QuadPosList,
|
||||
gpu::device_vector<CUDA_PRECISION*> &RatioList,
|
||||
int numQuadPoints);
|
||||
|
||||
void NLratios (MCWalkerConfiguration &W, std::vector<NLjob> &jobList,
|
||||
std::vector<PosType> &quadPoints, std::vector<ValueType> &psi_ratios);
|
||||
|
||||
|
@ -493,11 +498,10 @@ public:
|
|||
GradMatrix_t& optG,
|
||||
ValueMatrix_t& optL);
|
||||
|
||||
|
||||
void evaluateDerivatives (MCWalkerConfiguration &W,
|
||||
const opt_variables_type& optvars,
|
||||
ValueMatrix_t &dlogpsi,
|
||||
ValueMatrix_t &dhpsioverpsi);
|
||||
RealMatrix_t &dlogpsi,
|
||||
RealMatrix_t &dhpsioverpsi);
|
||||
|
||||
#endif
|
||||
|
||||
|
@ -507,7 +511,7 @@ public:
|
|||
}
|
||||
#endif
|
||||
/***************************************************************************
|
||||
* $RCSfile$ $Author$
|
||||
* $Revision$ $Date$
|
||||
* $Id$
|
||||
* $RCSfile$ $Author: tillackaf $
|
||||
* $Revision: 7408 $ $Date: 2017-01-10 13:29:49 -0500 (Tue, 10 Jan 2017) $
|
||||
* $Id: TrialWaveFunction.h 7408 2017-01-10 18:29:49Z tillackaf $
|
||||
***************************************************************************/
|
||||
|
|
|
@ -47,7 +47,7 @@ TrialWaveFunction::recompute
|
|||
|
||||
void
|
||||
TrialWaveFunction::reserve
|
||||
(PointerPool<gpu::device_vector<CudaRealType> > &pool,
|
||||
(PointerPool<gpu::device_vector<CudaValueType> > &pool,
|
||||
bool onlyOptimizable)
|
||||
{
|
||||
for(int i=0; i<Z.size(); i++)
|
||||
|
@ -131,6 +131,8 @@ TrialWaveFunction::ratio (MCWalkerConfiguration &W, int iat,
|
|||
myTimers[ii]->stop();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
TrialWaveFunction::calcRatio (MCWalkerConfiguration &W, int iat,
|
||||
std::vector<ValueType> &psi_ratios,
|
||||
|
@ -149,6 +151,8 @@ TrialWaveFunction::calcRatio (MCWalkerConfiguration &W, int iat,
|
|||
myTimers[ii]->stop();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
TrialWaveFunction::addRatio (MCWalkerConfiguration &W, int iat,
|
||||
std::vector<ValueType> &psi_ratios,
|
||||
|
@ -166,7 +170,26 @@ TrialWaveFunction::addRatio (MCWalkerConfiguration &W, int iat,
|
|||
myTimers[1+TIMER_SKIP*(Z.size()-1)]->stop();
|
||||
}
|
||||
|
||||
#if defined (QMC_COMPLEX)
|
||||
void
|
||||
TrialWaveFunction::convertRatiosFromComplexToReal (std::vector<ValueType> &psi_ratios,
|
||||
std::vector<RealType> &psi_ratios_real)
|
||||
{
|
||||
RealType PhaseValue;
|
||||
|
||||
if (psi_ratios.size() != psi_ratios_real.size() ) {
|
||||
std::cerr << "Error: In " << __FILE__ << " , line " << __LINE__ << " , "
|
||||
<< "input vector and output vector sizes unmatched." << std::endl;
|
||||
abort();
|
||||
}
|
||||
|
||||
for (int iw=0; iw<psi_ratios.size(); iw++)
|
||||
{
|
||||
psi_ratios_real[iw] = evaluateLogAndPhase(psi_ratios[iw], PhaseValue);
|
||||
psi_ratios_real[iw] = std::exp(psi_ratios_real[iw]);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
void
|
||||
TrialWaveFunction::ratio (std::vector<Walker_t*> &walkers, std::vector<int> &iatList,
|
||||
|
@ -378,8 +401,8 @@ TrialWaveFunction::evaluateOptimizableLog (MCWalkerConfiguration &W,
|
|||
void
|
||||
TrialWaveFunction::evaluateDerivatives (MCWalkerConfiguration &W,
|
||||
const opt_variables_type& optvars,
|
||||
ValueMatrix_t &dlogpsi,
|
||||
ValueMatrix_t &dhpsioverpsi)
|
||||
RealMatrix_t &dlogpsi,
|
||||
RealMatrix_t &dhpsioverpsi)
|
||||
{
|
||||
for (int i=0,ii=DERIVS_TIMER; i<Z.size(); i++,ii+=TIMER_SKIP)
|
||||
{
|
||||
|
|
Loading…
Reference in New Issue