mirror of https://github.com/QMCPACK/qmcpack.git
Merge branch 'develop' into default-nlpp-deriv-on
This commit is contained in:
commit
a932157c0c
|
@ -2,6 +2,10 @@
|
|||
|
||||
Notable changes to QMCPACK are documented in this file.
|
||||
|
||||
## [Unreleased]
|
||||
|
||||
- Minimum CUDA version increased to 11.0 [\#3957](https://github.com/QMCPACK/qmcpack/pull/3957)
|
||||
|
||||
## [3.14.0] - 2022-04-06
|
||||
|
||||
This release focuses on performance improvements to the OpenMP target offload version for GPUs as well as ongoing minor
|
||||
|
|
|
@ -78,3 +78,18 @@ else(HAVE_MKL)
|
|||
set(MKL_FOUND FALSE)
|
||||
message(STATUS "MKL header files not found")
|
||||
endif(HAVE_MKL)
|
||||
|
||||
# check for mkl_sycl
|
||||
if(HAVE_MKL AND ENABLE_SYCL)
|
||||
find_library(MKL_SYCL mkl_sycl
|
||||
HINTS ${MKL_ROOT} $ENV{MKLROOT} $ENV{MKL_ROOT} $ENV{MKL_HOME}
|
||||
PATH_SUFFIXES lib/intel64
|
||||
REQUIRED
|
||||
)
|
||||
|
||||
if(MKL_SYCL)
|
||||
add_library(MKL::sycl INTERFACE IMPORTED)
|
||||
target_include_directories(MKL::sycl INTERFACE "${MKL_INCLUDE}")
|
||||
target_link_libraries(MKL::sycl INTERFACE ${MKL_SYCL})
|
||||
endif()
|
||||
endif()
|
||||
|
|
|
@ -126,8 +126,11 @@ if(NOT CMAKE_SYSTEM_NAME STREQUAL "CrayLinuxEnvironment")
|
|||
endif() #(CMAKE_CXX_FLAGS MATCHES "-march=" AND CMAKE_C_FLAGS MATCHES "-march=")
|
||||
else() #(CMAKE_CXX_FLAGS MATCHES "-march=" OR CMAKE_C_FLAGS MATCHES "-march=")
|
||||
# use -march=native
|
||||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=native")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native")
|
||||
# skipped in OneAPI 2022.0 when using SYCL which caused linking failure.
|
||||
if (NOT (CMAKE_CXX_COMPILER_VERSION VERSION_EQUAL 2022.0 AND ENABLE_SYCL))
|
||||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=native")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native")
|
||||
endif()
|
||||
endif() #(CMAKE_CXX_FLAGS MATCHES "-march=" OR CMAKE_C_FLAGS MATCHES "-march=")
|
||||
endif()
|
||||
|
||||
|
|
|
@ -715,8 +715,12 @@ if(QMC_CUDA OR ENABLE_CUDA)
|
|||
endif()
|
||||
set(CMAKE_CUDA_STANDARD_REQUIRED TRUE)
|
||||
set(CMAKE_CUDA_EXTENSIONS OFF)
|
||||
set(CMAKE_CUDA_HOST_COMPILER ${CMAKE_CXX_COMPILER} CACHE STRING "nvcc host compiler passed via -ccbin")
|
||||
if(NOT CMAKE_CUDA_FLAGS MATCHES "allow-unsupported-compiler")
|
||||
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --allow-unsupported-compiler")
|
||||
endif()
|
||||
enable_language(CUDA)
|
||||
find_package(CUDAToolkit REQUIRED)
|
||||
find_package(CUDAToolkit 11.0 REQUIRED)
|
||||
if(NOT TARGET CUDA::cublas)
|
||||
message(
|
||||
FATAL_ERROR
|
||||
|
@ -839,14 +843,21 @@ endif(ENABLE_HIP)
|
|||
# set up SYCL compiler options and libraries
|
||||
#-------------------------------------------------------------------
|
||||
if(ENABLE_SYCL)
|
||||
if(NOT (CMAKE_CXX_COMPILER_ID MATCHES "IntelLLVM" OR INTEL_ONEAPI_COMPILER_FOUND))
|
||||
message(FATAL_ERROR "Only LLVM-based Intel compiler supports SYCL.")
|
||||
# require 3.20 to recognize IntelLLVM compiler ID and check accurate version numbers.
|
||||
if(CMAKE_VERSION VERSION_LESS 3.20.0)
|
||||
message(FATAL_ERROR "ENABLE_SYCL require CMake 3.20.0 or later")
|
||||
endif()
|
||||
if(NOT CMAKE_CXX_COMPILER_ID MATCHES "IntelLLVM")
|
||||
message(FATAL_ERROR "QMCPACK only supports SYCL with LLVM-based Intel compiler (icpx).")
|
||||
endif()
|
||||
add_library(SYCL::host INTERFACE IMPORTED)
|
||||
add_library(SYCL::device INTERFACE IMPORTED)
|
||||
find_package(IntelDPCPP REQUIRED CONFIGS IntelDPCPPConfig-modified.cmake PATHS ${PROJECT_CMAKE})
|
||||
target_link_libraries(SYCL::host INTERFACE OneAPI::DPCPP-host)
|
||||
target_link_libraries(SYCL::device INTERFACE OneAPI::DPCPP-device)
|
||||
if(TARGET MKL::sycl)
|
||||
target_link_libraries(MKL::sycl INTERFACE OneAPI::DPCPP-host)
|
||||
endif()
|
||||
endif(ENABLE_SYCL)
|
||||
|
||||
#-------------------------------------------------------------------
|
||||
|
|
|
@ -24,13 +24,14 @@ particular emphasis is placed on code quality and reproducibility.
|
|||
|
||||
* C++ 17 and C99 capable compilers.
|
||||
* CMake v3.15.0 or later, build utility, http://www.cmake.org
|
||||
* BLAS/LAPACK, numerical library. Use platform-optimized libraries.
|
||||
* BLAS/LAPACK, numerical library. Use vendor and platform-optimized libraries.
|
||||
* LibXml2, XML parser, http://xmlsoft.org/
|
||||
* HDF5, portable I/O library, http://www.hdfgroup.org/HDF5/
|
||||
* BOOST v1.61.0 or newer, peer-reviewed portable C++ source libraries, http://www.boost.org
|
||||
* FFTW, FFT library, http://www.fftw.org/
|
||||
* MPI, parallel library. Optional, but a near requirement for production calculations.
|
||||
* Python3. Older versions are not supported as of January 2020.
|
||||
* CUDA v11.0 or later. Optional, but required for builds with NVIDIA GPU support.
|
||||
|
||||
We aim to support open source compilers and libraries released within two years of each QMCPACK release. Use of software versions
|
||||
over two years old may work but is discouraged and untested. Proprietary compilers (Intel, NVHPC) are generally supported over the
|
||||
|
|
|
@ -18,6 +18,7 @@ User's Guide and Developer's Manual
|
|||
|
||||
introduction
|
||||
features
|
||||
performance_portable
|
||||
installation
|
||||
running
|
||||
units
|
||||
|
|
|
@ -118,6 +118,8 @@ Batched drivers check against ``max_seconds`` and make efforts to stop the execu
|
|||
In addition, a file named ``id`` plus ``.STOP``, in this case ``vmc.STOP``, stops QMCPACK execution on the fly cleanly once being found in the working directory.
|
||||
|
||||
|
||||
.. _driver-version-parameter:
|
||||
|
||||
Driver version
|
||||
~~~~~~~~~~~~~~
|
||||
The ``driver_version`` parameter selects between the new performance-portable batched drivers and the previous drivers (now referred to as the 'legacy drivers').
|
||||
|
|
|
@ -401,6 +401,9 @@ the path to the source directory.
|
|||
|
||||
See :ref:`Sanitizer-Libraries` for more information.
|
||||
|
||||
|
||||
.. _offloadbuild:
|
||||
|
||||
Notes for OpenMP target offload to accelerators (experimental)
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
QMCPACK is currently being updated to support OpenMP target offload and obtain performance
|
||||
|
@ -1777,8 +1780,8 @@ for the creation of projectors in UPF can introduce severe errors and inaccuraci
|
|||
|
||||
.. _buildqe:
|
||||
|
||||
Installing and patching Quantum ESPRESSO
|
||||
----------------------------------------
|
||||
Installing Quantum ESPRESSO and pw2qmcpack
|
||||
------------------------------------------
|
||||
|
||||
For trial wavefunctions obtained in a plane-wave basis, we mainly
|
||||
support QE. Note that ABINIT and QBox were supported historically
|
||||
|
@ -1789,6 +1792,10 @@ QE stores wavefunctions in a nonstandard internal
|
|||
we have developed a converter---pw2qmcpack---which is an add-on to the
|
||||
QE distribution.
|
||||
|
||||
|
||||
Quantum ESPRESSO (<=6.8)
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
To simplify the process of patching QE we have developed
|
||||
a script that will automatically download and patch the source
|
||||
code. The patches are specific to each version. For example, to download and
|
||||
|
@ -1821,15 +1828,36 @@ the HDF5 capability enabled in either way:
|
|||
|
||||
The complete process is described in external\_codes/quantum\_espresso/README.
|
||||
|
||||
The tests involving pw.x and pw2qmcpack.x have been integrated into the test suite of QMCPACK.
|
||||
By adding ``-D QE_BIN=your_QE_binary_path`` in the CMake command line when building your QMCPACK,
|
||||
tests named with the "qe-" prefix will be included in the test set of your build.
|
||||
You can test the whole ``pw > pw2qmcpack > qmcpack workflow`` by
|
||||
Quantum ESPRESSO (6.7, 6.8 and 7.0)
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
After patching the QE source code like above, users may use CMake instead of configure to build QE with pw2qmcpack.
|
||||
Options needed to enable pw2qmcpack have been set ON by default.
|
||||
A HDF5 library installation with Fortran support is required.
|
||||
|
||||
::
|
||||
::
|
||||
|
||||
ctest -R qe
|
||||
mkdir build_mpi
|
||||
cd build_mpi
|
||||
cmake -DCMAKE_C_COMPILER=mpicc -DCMAKE_Fortran_COMPILER=mpif90 ..
|
||||
make -j 16
|
||||
|
||||
Quantum ESPRESSO (>7.0)
|
||||
~~~~~~~~~~~~~~~~~~~~~~~
|
||||
Due to incorporation of pw2qmcpack as a plugin, there is no longer any need to patch QE.
|
||||
Users may use upstream QE and activate the plugin by specifying ``-DQE_ENABLE_PLUGINS=pw2qmcpack`` at the CMake configure step.
|
||||
Full QE CMake documentation can be found at
|
||||
https://gitlab.com/QEF/q-e/-/wikis/Developers/CMake-build-system .
|
||||
|
||||
::
|
||||
|
||||
mkdir build_mpi
|
||||
cd build_mpi
|
||||
cmake -DCMAKE_C_COMPILER=mpicc -DCMAKE_Fortran_COMPILER=mpif90 -DQE_ENABLE_PLUGINS=pw2qmcpack ..
|
||||
make -j 16
|
||||
|
||||
Testing QE after installation
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
Testing the QE to QMCPACK workflow after building QE and QMCPACK is highly recommended.
|
||||
See :ref:`integtestqe` and the testing section for more details.
|
||||
|
||||
.. _buildperformance:
|
||||
|
|
|
@ -114,34 +114,22 @@ To continue a run, specify the ``mcwalkerset`` element before your VMC/DMC block
|
|||
|
||||
In the project id section, make sure that the series number is different from any existing ones to avoid overwriting them.
|
||||
|
||||
|
||||
.. _batched_drivers:
|
||||
|
||||
Batched drivers
|
||||
---------------
|
||||
|
||||
Under the Exascale Computing Project effort a new set of QMC drivers was developed
|
||||
to eliminate the divergence of legacy CPU and GPU code paths at the QMC driver level and make the drivers CPU/GPU agnostic.
|
||||
The divergence came from the the fact that the CPU code path favors executing all the compute tasks within a step
|
||||
for one walker and then advance walker by walker. Multiple CPU threads process their own assigned walkers in parallel.
|
||||
In this way, walkers are not synchronized with each other and maximal throughout can be achieved on CPU.
|
||||
The GPU code path favors executing the same compute task over all the walkers together to maximize GPU throughput.
|
||||
This compute dispatch pattern minimizes the overhead of dispatching computation and host-device data transfer.
|
||||
However, the legacy GPU code path only leverages the OpenMP main host thread for handling
|
||||
all the interaction between the host and GPUs and limit the kernel dispatch capability.
|
||||
In brief, the CPU code path handles computation with a walker batch size of one and many batches
|
||||
while the GPU code path uses only one batch containing all the walkers.
|
||||
The new drivers that implement this flexible batching scheme are called "batched drivers".
|
||||
|
||||
The batched drivers introduce a new concept, "crowd", as a sub-organization of walker population.
|
||||
A crowd is a subset of the walkers that are operated on as as single batch.
|
||||
Walkers within a crowd operate their computation in lock-step, which helps the GPU efficiency.
|
||||
Walkers between crowds remain fully asynchronous unless operations involving the full population are needed.
|
||||
Walkers in different crowds remain fully asynchronous unless operations involving the full population are needed.
|
||||
With this flexible batching capability the new drivers are capable of delivering maximal performance on given hardware.
|
||||
In the new driver design, all the batched API calls may fallback to an existing single walker implementation.
|
||||
Consequently, batched drivers allow mixing and matching CPU-only and GPU-accelerated features
|
||||
in a way that is not feasible with the legacy GPU implementation.
|
||||
|
||||
For OpenMP GPU offload users, batched drivers are essential to effectively use GPUs.
|
||||
|
||||
|
||||
.. _transition_guide:
|
||||
|
||||
|
|
|
@ -0,0 +1,64 @@
|
|||
.. _performance_portable:
|
||||
|
||||
Performance Portable Implementation
|
||||
===================================
|
||||
|
||||
The so-called performance portable implementation was developed to present a unified way to run QMC on CPU and GPU
|
||||
systems, and eliminate the divergence between CPU and GPU code paths that had been introduced in the past, while still
|
||||
maintaining high performance. This required generalizing all the driver inputs to potentially drive larger batches of
|
||||
walkers and also eliminating ambiguities in the various input blocks of QMCPACK. Internally many new code paths have
|
||||
been created, including new QMC drivers for VMC, DMC, and the wavefunction optimizer.
|
||||
|
||||
Once this implementation is sufficiently matured and enough features are available, the old non-performance portable
|
||||
drivers will be deprecated and eventually deleted. The number of changes required to old input files is usually very
|
||||
small, so use of the new performance portable implementation is encouraged, particularly for new projects.
|
||||
|
||||
The performance portable implementation load balances the total number of walkers onto MPI tasks, as per the old
|
||||
drivers. The new implementation is then able to subdivide the walkers of each MPI task into multiple similarly-sized
|
||||
crowds. The walkers in each crowd can then be updated simultaneously. This structure enables the walkers to be
|
||||
efficiently mapped to both CPUs and GPUs. On CPU systems, they then are mapped to OpenMP threads where a single walker
|
||||
can be computed efficiently by even a single thread. On GPU systems, large numbers of GPU threads must be used
|
||||
concurrently for high efficiency: Each crowd is first owned by a distinct CPU thread, which in turn executes batched
|
||||
operations over all the walkers in its crowd on the GPU. Provided the batches are sufficiently large, this facilitates
|
||||
efficient GPU execution, while the use of multiple crowds can reduce synchronization and allow higher performance to be
|
||||
obtained. For these reasons the new performance portable drivers are also referred to as batched drivers, since this is
|
||||
the largest change from the older code.
|
||||
|
||||
The new implementation largely uses OpenMP offload for portability, although other technologies are also used and the
|
||||
implementation has flexible dispatch to help obtain high performance on every platform.
|
||||
|
||||
This implementation was designed and implemented as part of the Exascale Computing Project, with a view to bringing
|
||||
QMCPACK to GPUs from multiple vendors with high-efficiency while creating a more maintainable and easy to contribute to
|
||||
codebase.
|
||||
|
||||
Links to more information in other sections of the manual:
|
||||
|
||||
- **Build instructions:** :ref:`OpenMP target offload <offloadbuild>` section of the :ref:`obtaininginstalling` chapter.
|
||||
|
||||
- **Supported features:** :ref:`gpufeatures` section of the :ref:`chap:features` chapter.
|
||||
|
||||
- **Enabling batch drivers** :ref:`driver-version-parameter` section of the :ref:`input-overview` chapter.
|
||||
|
||||
- **Driver Inputs:** :ref:`batched_drivers` section of the :ref:`qmcmethods` chapter.
|
||||
|
||||
|
||||
Input files for batched drivers
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
Use the following changes to update input files to use the batched drivers.
|
||||
|
||||
1. Update the project block with the ``driver_version`` parameter. For example:
|
||||
|
||||
::
|
||||
|
||||
<project id="vmc" series="0">
|
||||
<parameter name="driver_version">batch</parameter>
|
||||
</project>
|
||||
|
||||
See :ref:`driver-version-parameter` for more.
|
||||
|
||||
2. Modify the QMC algorithm blocks
|
||||
|
||||
The most significant change is the ``walkers`` parameter has been replaced with ``walkers_per_rank`` or ``total_walkers``.
|
||||
|
||||
See :ref:`batched_drivers` for details.
|
|
@ -18,6 +18,9 @@
|
|||
#if defined(ENABLE_CUDA)
|
||||
#include "DualAllocator.hpp"
|
||||
#include "CUDA/CUDAallocator.hpp"
|
||||
#elif defined(ENABLE_SYCL)
|
||||
#include "DualAllocator.hpp"
|
||||
#include "SYCL/SYCLallocator.hpp"
|
||||
#endif
|
||||
#include "OhmmsPETE/OhmmsMatrix.h"
|
||||
#include "OhmmsSoA/VectorSoaContainer.h"
|
||||
|
@ -33,7 +36,10 @@ template<typename T>
|
|||
using OffloadPinnedAllocator = OMPallocator<T, PinnedAlignedAllocator<T>>;
|
||||
#if defined(ENABLE_CUDA)
|
||||
template<typename T>
|
||||
using CUDAPinnedAllocator = DualAllocator<T, CUDAAllocator<T>, PinnedAlignedAllocator<T>>;
|
||||
using VendorDualPinnedAllocator = DualAllocator<T, CUDAAllocator<T>, PinnedAlignedAllocator<T>>;
|
||||
#elif defined(ENABLE_SYCL)
|
||||
template<typename T>
|
||||
using VendorDualPinnedAllocator = DualAllocator<T, SYCLAllocator<T>, PinnedAlignedAllocator<T>>;
|
||||
#endif
|
||||
|
||||
template<class OPA>
|
||||
|
@ -109,9 +115,9 @@ TEST_CASE("OhmmsMatrix_VectorSoaContainer_View", "[Integration][Allocators]")
|
|||
{
|
||||
testDualAllocator<OffloadPinnedAllocator<double>>();
|
||||
testDualAllocator<OffloadPinnedAllocator<std::complex<double>>>();
|
||||
#if defined(ENABLE_CUDA)
|
||||
testDualAllocator<CUDAPinnedAllocator<double>>();
|
||||
testDualAllocator<CUDAPinnedAllocator<std::complex<double>>>();
|
||||
#if defined(ENABLE_CUDA) || defined(ENABLE_SYCL)
|
||||
testDualAllocator<VendorDualPinnedAllocator<double>>();
|
||||
testDualAllocator<VendorDualPinnedAllocator<std::complex<double>>>();
|
||||
#endif
|
||||
}
|
||||
} // namespace qmcplusplus
|
||||
|
|
|
@ -63,6 +63,7 @@ endif(ENABLE_ROCM)
|
|||
if(ENABLE_SYCL)
|
||||
add_subdirectory(SYCL)
|
||||
target_link_libraries(platform_runtime PUBLIC platform_sycl_runtime)
|
||||
target_link_libraries(platform_LA INTERFACE platform_sycl_LA)
|
||||
endif(ENABLE_SYCL)
|
||||
|
||||
if(BUILD_UNIT_TESTS)
|
||||
|
|
|
@ -188,17 +188,6 @@ struct qmc_allocator_traits<qmcplusplus::CUDAAllocator<T>>
|
|||
static const bool is_host_accessible = false;
|
||||
static const bool is_dual_space = false;
|
||||
static void fill_n(T* ptr, size_t n, const T& value) { qmcplusplus::CUDAfill_n(ptr, n, value); }
|
||||
static void updateTo(CUDAAllocator<T>& alloc, T* host_ptr, size_t n)
|
||||
{
|
||||
T* device_ptr = alloc.getDevicePtr(host_ptr);
|
||||
copyToDevice(device_ptr, host_ptr, n);
|
||||
}
|
||||
|
||||
static void updateFrom(CUDAAllocator<T>& alloc, T* host_ptr, size_t n)
|
||||
{
|
||||
T* device_ptr = alloc.getDevicePtr(host_ptr);
|
||||
copyFromDevice(host_ptr, device_ptr, n);
|
||||
}
|
||||
};
|
||||
|
||||
/** allocator for CUDA host pinned memory
|
||||
|
|
|
@ -57,7 +57,7 @@ void DeviceManager::initializeGlobalDeviceManager(int local_rank, int local_size
|
|||
const DeviceManager& DeviceManager::getGlobal()
|
||||
{
|
||||
if (!global)
|
||||
throw std::runtime_error("DeviceManager::getGlobal cannot access initialized the global instance.");
|
||||
throw std::runtime_error("DeviceManager::getGlobal the global instance was not initialized.");
|
||||
return *global;
|
||||
}
|
||||
} // namespace qmcplusplus
|
||||
|
|
|
@ -23,6 +23,8 @@
|
|||
#include "PinnedAllocator.h"
|
||||
#if defined(ENABLE_CUDA)
|
||||
#include "CUDA/CUDAallocator.hpp"
|
||||
#elif defined(ENABLE_SYCL)
|
||||
#include "SYCL/SYCLallocator.hpp"
|
||||
#endif
|
||||
|
||||
namespace qmcplusplus
|
||||
|
|
|
@ -22,9 +22,9 @@
|
|||
#define QMCPLUSPLUS_DUAL_ALLOCATOR_ALIASES_HPP
|
||||
|
||||
#include "PinnedAllocator.h"
|
||||
|
||||
#if defined(ENABLE_CUDA) && !defined(ENABLE_OFFLOAD)
|
||||
#if (defined(ENABLE_CUDA) || defined(ENABLE_SYCL)) && !defined(ENABLE_OFFLOAD)
|
||||
#include "DualAllocator.hpp"
|
||||
#if defined(ENABLE_CUDA)
|
||||
namespace qmcplusplus
|
||||
{
|
||||
template<typename T>
|
||||
|
@ -32,7 +32,19 @@ namespace qmcplusplus
|
|||
template<typename T>
|
||||
using PinnedDualAllocator = DualAllocator<T, CUDAAllocator<T>, PinnedAlignedAllocator<T>>;
|
||||
}
|
||||
#elif defined(ENABLE_SYCL)
|
||||
namespace qmcplusplus
|
||||
{
|
||||
template<typename T>
|
||||
using UnpinnedDualAllocator = DualAllocator<T, SYCLAllocator<T>, aligned_allocator<T>>;
|
||||
template<typename T>
|
||||
using PinnedDualAllocator = DualAllocator<T, SYCLAllocator<T>, PinnedAlignedAllocator<T>>;
|
||||
}
|
||||
#else
|
||||
#error unhandled platform
|
||||
#endif
|
||||
|
||||
#else // ENABLE_OFFLOAD or no CUDA or SYCL
|
||||
#include "OMPTarget/OffloadAlignedAllocators.hpp"
|
||||
namespace qmcplusplus
|
||||
{
|
||||
|
|
|
@ -15,8 +15,10 @@
|
|||
|
||||
#include <memory>
|
||||
#include "CPU/SIMD/aligned_allocator.hpp"
|
||||
#ifdef ENABLE_CUDA
|
||||
#if defined(ENABLE_CUDA)
|
||||
#include "CUDA/CUDAallocator.hpp"
|
||||
#elif defined(ENABLE_SYCL)
|
||||
#include "SYCL/SYCLallocator.hpp"
|
||||
#endif
|
||||
|
||||
namespace qmcplusplus
|
||||
|
@ -24,15 +26,19 @@ namespace qmcplusplus
|
|||
|
||||
/** The fact that the pinned allocators are not always pinned hurts readability elsewhere. */
|
||||
template<typename T>
|
||||
#ifdef ENABLE_CUDA
|
||||
#if defined(ENABLE_CUDA)
|
||||
using PinnedAllocator = CUDALockedPageAllocator<T>;
|
||||
#elif defined(ENABLE_SYCL)
|
||||
using PinnedAllocator = SYCLHostAllocator<T>;
|
||||
#else
|
||||
using PinnedAllocator = std::allocator<T>;
|
||||
#endif
|
||||
|
||||
template<typename T, size_t ALIGN = QMC_SIMD_ALIGNMENT>
|
||||
#ifdef ENABLE_CUDA
|
||||
#if defined(ENABLE_CUDA)
|
||||
using PinnedAlignedAllocator = CUDALockedPageAllocator<T, aligned_allocator<T, ALIGN>>;
|
||||
#elif defined(ENABLE_SYCL)
|
||||
using PinnedAlignedAllocator = SYCLHostAllocator<T, ALIGN>;
|
||||
#else
|
||||
using PinnedAlignedAllocator = aligned_allocator<T, ALIGN>;
|
||||
#endif
|
||||
|
|
|
@ -10,8 +10,13 @@
|
|||
#//////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
|
||||
set(SYCL_RT_SRCS SYCLDeviceManager.cpp)
|
||||
set(SYCL_RT_SRCS SYCLDeviceManager.cpp SYCLallocator.cpp SYCLruntime.cpp)
|
||||
set(SYCL_LA_SRCS syclBLAS.cpp)
|
||||
|
||||
add_library(platform_sycl_runtime ${SYCL_RT_SRCS})
|
||||
target_link_libraries(platform_sycl_runtime PUBLIC SYCL::host
|
||||
PRIVATE platform_host_runtime)
|
||||
|
||||
add_library(platform_sycl_LA ${SYCL_LA_SRCS})
|
||||
target_link_libraries(platform_sycl_LA PUBLIC platform_sycl_runtime MKL::sycl
|
||||
PRIVATE SYCL::device platform_sycl_runtime)
|
||||
|
|
|
@ -16,6 +16,7 @@
|
|||
#include <stdexcept>
|
||||
#include <string>
|
||||
#include <algorithm>
|
||||
#include "config.h"
|
||||
#include "OutputManager.h"
|
||||
#include "determineDefaultDeviceNum.h"
|
||||
#if defined(_OPENMP)
|
||||
|
@ -27,7 +28,6 @@
|
|||
|
||||
namespace qmcplusplus
|
||||
{
|
||||
|
||||
#if defined(_OPENMP)
|
||||
/** create SYCL device/contexts from OpenMP owned ones to ensure interoperability.
|
||||
* CUDA has the notion of primary context while SYCL requires explicitly sharing context.
|
||||
|
@ -86,11 +86,20 @@ SYCLDeviceManager::SYCLDeviceManager(int& default_device_num, int& num_devices,
|
|||
else if (default_device_num != sycl_default_device_num)
|
||||
throw std::runtime_error("Inconsistent assigned SYCL devices with the previous record!");
|
||||
|
||||
default_device_queue =
|
||||
sycl::queue(visible_devices[sycl_default_device_num].context, visible_devices[sycl_default_device_num].device);
|
||||
default_device_queue = std::make_unique<sycl::queue>(visible_devices[sycl_default_device_num].context,
|
||||
visible_devices[sycl_default_device_num].device);
|
||||
}
|
||||
}
|
||||
|
||||
std::unique_ptr<sycl::queue> SYCLDeviceManager::default_device_queue;
|
||||
|
||||
sycl::queue& SYCLDeviceManager::getDefaultDeviceQueue()
|
||||
{
|
||||
if (!default_device_queue)
|
||||
throw std::runtime_error("SYCLDeviceManager::getDefaultDeviceQueue() the global instance not initialized.");
|
||||
return *default_device_queue;
|
||||
}
|
||||
|
||||
#if defined(_OPENMP)
|
||||
static std::vector<struct syclDeviceInfo> xomp_get_sycl_devices()
|
||||
{
|
||||
|
@ -130,10 +139,9 @@ static std::vector<struct syclDeviceInfo> xomp_get_sycl_devices()
|
|||
devices[id].device =
|
||||
sycl::ext::oneapi::level_zero::make_device(sycl_platform, reinterpret_cast<pi_native_handle>(hDevice));
|
||||
|
||||
const sycl::context sycl_context =
|
||||
sycl::ext::oneapi::level_zero::make_context({devices[id].device},
|
||||
reinterpret_cast<pi_native_handle>(hContext),
|
||||
true /* keep the ownership, no transfer */);
|
||||
devices[id].context = sycl::ext::oneapi::level_zero::make_context({devices[id].device},
|
||||
reinterpret_cast<pi_native_handle>(hContext),
|
||||
true /* keep the ownership, no transfer */);
|
||||
}
|
||||
else if (omp_backend.find("opencl") == 0)
|
||||
{
|
||||
|
|
|
@ -15,13 +15,12 @@
|
|||
#ifndef QMCPLUSPLUS_SYCLDEVICEMANAGER_H
|
||||
#define QMCPLUSPLUS_SYCLDEVICEMANAGER_H
|
||||
|
||||
#include <CL/sycl.hpp>
|
||||
#include <vector>
|
||||
#include "config.h"
|
||||
#include <memory>
|
||||
#include <CL/sycl.hpp>
|
||||
|
||||
namespace qmcplusplus
|
||||
{
|
||||
|
||||
struct syclDeviceInfo
|
||||
{
|
||||
sycl::context context;
|
||||
|
@ -34,7 +33,9 @@ class SYCLDeviceManager
|
|||
{
|
||||
int sycl_default_device_num;
|
||||
std::vector<syclDeviceInfo> visible_devices;
|
||||
sycl::queue default_device_queue;
|
||||
|
||||
/// the global singleton which can be used to access the default queue of the default device.
|
||||
static std::unique_ptr<sycl::queue> default_device_queue;
|
||||
|
||||
public:
|
||||
SYCLDeviceManager(int& default_device_num, int& num_devices, int local_rank, int local_size);
|
||||
|
@ -42,10 +43,8 @@ public:
|
|||
/** access the the DeviceManager owned default queue.
|
||||
* Restrict the use of it to performance non-critical operations.
|
||||
* Note: CUDA has a default queue but all the SYCL queues are explicit.
|
||||
* Right now we return a copy of the default queue. Queues hold contexts and devices by referece.
|
||||
* So making a copy is expected to be cheap. If this is not the case, we will find a cheap solution.
|
||||
*/
|
||||
sycl::queue getDefaultDeviceQueue() const { return default_device_queue; }
|
||||
static sycl::queue& getDefaultDeviceQueue();
|
||||
};
|
||||
} // namespace qmcplusplus
|
||||
|
||||
|
|
|
@ -0,0 +1,19 @@
|
|||
//////////////////////////////////////////////////////////////////////////////////////
|
||||
// This file is distributed under the University of Illinois/NCSA Open Source License.
|
||||
// See LICENSE file in top directory for details.
|
||||
//
|
||||
// Copyright (c) 2022 QMCPACK developers.
|
||||
//
|
||||
// File developed by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
|
||||
//
|
||||
// File created by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
|
||||
//////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
|
||||
#include <cstddef>
|
||||
#include <atomic>
|
||||
|
||||
namespace qmcplusplus
|
||||
{
|
||||
std::atomic<size_t> SYCLallocator_device_mem_allocated(0);
|
||||
}
|
|
@ -0,0 +1,252 @@
|
|||
//////////////////////////////////////////////////////////////////////////////////////
|
||||
// This file is distributed under the University of Illinois/NCSA Open Source License.
|
||||
// See LICENSE file in top directory for details.
|
||||
//
|
||||
// Copyright (c) 2022 QMCPACK developers.
|
||||
//
|
||||
// File developed by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
|
||||
//
|
||||
// File created by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
|
||||
//////////////////////////////////////////////////////////////////////////////////////
|
||||
// -*- C++ -*-
|
||||
/** @file SYCLallocator.hpp
|
||||
* this file provides three C++ memory allocators using SYCL specific memory allocation functions.
|
||||
*
|
||||
* SYCLManagedAllocator allocates SYCL shared memory
|
||||
* SYCLAllocator allocates SYCL device memory
|
||||
* SYCLHostAllocator allocates SYCL host memory
|
||||
* They are based on CUDA*Allocator implementation
|
||||
*/
|
||||
#ifndef QMCPLUSPLUS_SYCL_ALLOCATOR_H
|
||||
#define QMCPLUSPLUS_SYCL_ALLOCATOR_H
|
||||
|
||||
#include <memory>
|
||||
#include <cstdlib>
|
||||
#include <stdexcept>
|
||||
#include <atomic>
|
||||
#include <limits>
|
||||
#include <CL/sycl.hpp>
|
||||
#include "config.h"
|
||||
#include "allocator_traits.hpp"
|
||||
#include "SYCLruntime.hpp"
|
||||
|
||||
namespace qmcplusplus
|
||||
{
|
||||
extern std::atomic<size_t> SYCLallocator_device_mem_allocated;
|
||||
|
||||
inline size_t getSYCLdeviceMemAllocated() { return SYCLallocator_device_mem_allocated; }
|
||||
|
||||
/** allocator for SYCL shared memory
|
||||
* @tparm T data type
|
||||
* @tparm ALIGN alignment in bytes
|
||||
*/
|
||||
template<typename T, size_t ALIGN = QMC_SIMD_ALIGNMENT>
|
||||
struct SYCLSharedAllocator
|
||||
{
|
||||
typedef T value_type;
|
||||
typedef size_t size_type;
|
||||
typedef T* pointer;
|
||||
typedef const T* const_pointer;
|
||||
|
||||
static constexpr size_t alignment = ALIGN;
|
||||
|
||||
SYCLSharedAllocator() = default;
|
||||
template<class U>
|
||||
SYCLSharedAllocator(const SYCLSharedAllocator<U>&)
|
||||
{}
|
||||
|
||||
template<class U>
|
||||
struct rebind
|
||||
{
|
||||
typedef SYCLSharedAllocator<U> other;
|
||||
};
|
||||
|
||||
T* allocate(std::size_t n)
|
||||
{
|
||||
T* pt = sycl::aligned_alloc_shared<T>(ALIGN, n, getSYCLDefaultDeviceDefaultQueue());
|
||||
return pt;
|
||||
}
|
||||
void deallocate(T* p, std::size_t) { sycl::free(p, getSYCLDefaultDeviceDefaultQueue()); }
|
||||
};
|
||||
|
||||
template<class T1, class T2>
|
||||
bool operator==(const SYCLSharedAllocator<T1>&, const SYCLSharedAllocator<T2>&)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
template<class T1, class T2>
|
||||
bool operator!=(const SYCLSharedAllocator<T1>&, const SYCLSharedAllocator<T2>&)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
/** allocator for SYCL device memory
|
||||
* @tparm T data type
|
||||
* @tparm ALIGN alignment in bytes
|
||||
*
|
||||
* using this with something other than Ohmms containers?
|
||||
* -- use caution, write unit tests! --
|
||||
* It's not tested beyond use in some unit tests using std::vector with constant size.
|
||||
* SYCLAllocator appears to meet all the nonoptional requirements of a c++ Allocator.
|
||||
*
|
||||
* Some of the default implementations in std::allocator_traits
|
||||
* of optional Allocator requirements may cause runtime or compilation failures.
|
||||
* They assume there is only one memory space and that the host has access to it.
|
||||
*/
|
||||
template<typename T, size_t ALIGN = 64>
|
||||
class SYCLAllocator
|
||||
{
|
||||
public:
|
||||
typedef T value_type;
|
||||
typedef size_t size_type;
|
||||
typedef T* pointer;
|
||||
typedef const T* const_pointer;
|
||||
|
||||
static constexpr size_t alignment = ALIGN;
|
||||
|
||||
SYCLAllocator() = default;
|
||||
template<class U>
|
||||
SYCLAllocator(const SYCLAllocator<U>&)
|
||||
{}
|
||||
|
||||
template<class U>
|
||||
struct rebind
|
||||
{
|
||||
typedef SYCLAllocator<U> other;
|
||||
};
|
||||
|
||||
T* allocate(std::size_t n)
|
||||
{
|
||||
T* pt = sycl::aligned_alloc_device<T>(ALIGN, n, getSYCLDefaultDeviceDefaultQueue());
|
||||
SYCLallocator_device_mem_allocated += n * sizeof(T);
|
||||
return pt;
|
||||
}
|
||||
|
||||
void deallocate(T* p, std::size_t n)
|
||||
{
|
||||
sycl::free(p, getSYCLDefaultDeviceDefaultQueue());
|
||||
SYCLallocator_device_mem_allocated -= n * sizeof(T);
|
||||
}
|
||||
|
||||
/** Provide a construct for std::allocator_traits::contruct to call.
|
||||
* Don't do anything on construct, pointer p is on the device!
|
||||
*
|
||||
* For example std::vector calls this to default initialize each element. You'll segfault
|
||||
* if std::allocator_traits::construct tries doing that at p.
|
||||
*
|
||||
* The standard is a bit confusing on this point. Implementing this is an optional requirement
|
||||
* of Allocator from C++11 on, its not slated to be removed.
|
||||
*
|
||||
* Its deprecated for the std::allocator in c++17 and will be removed in c++20. But we are not implementing
|
||||
* std::allocator.
|
||||
*
|
||||
* STL containers only use Allocators through allocator_traits and std::allocator_traits handles the case
|
||||
* where no construct method is present in the Allocator.
|
||||
* But std::allocator_traits will call the Allocators construct method if present.
|
||||
*/
|
||||
template<class U, class... Args>
|
||||
static void construct(U* p, Args&&... args)
|
||||
{}
|
||||
|
||||
/** Give std::allocator_traits something to call.
|
||||
* The default if this isn't present is to call p->~T() which
|
||||
* we can't do on device memory.
|
||||
*/
|
||||
template<class U>
|
||||
static void destroy(U* p)
|
||||
{}
|
||||
|
||||
void copyToDevice(T* device_ptr, T* host_ptr, size_t n)
|
||||
{
|
||||
getSYCLDefaultDeviceDefaultQueue().memcpy(device_ptr, host_ptr, n * sizeof(T)).wait();
|
||||
}
|
||||
|
||||
void copyFromDevice(T* host_ptr, T* device_ptr, size_t n)
|
||||
{
|
||||
getSYCLDefaultDeviceDefaultQueue().memcpy(host_ptr, device_ptr, n * sizeof(T)).wait();
|
||||
}
|
||||
|
||||
void copyDeviceToDevice(T* to_ptr, size_t n, T* from_ptr)
|
||||
{
|
||||
getSYCLDefaultDeviceDefaultQueue().memcpy(to_ptr, from_ptr, n * sizeof(T)).wait();
|
||||
}
|
||||
};
|
||||
|
||||
template<class T1, class T2>
|
||||
bool operator==(const SYCLAllocator<T1>&, const SYCLAllocator<T2>&)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
template<class T1, class T2>
|
||||
bool operator!=(const SYCLAllocator<T1>&, const SYCLAllocator<T2>&)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
/** Traits specialization telling generic containers how SYCLAllocator memory
 * behaves: device-only USM, not dereferenceable from the host.
 */
template<typename T>
struct qmc_allocator_traits<qmcplusplus::SYCLAllocator<T>>
{
  // Device USM pointers cannot be read or written from host code.
  static const bool is_host_accessible = false;
  // Single (device) memory space only; no mirrored host buffer.
  static const bool is_dual_space = false;
  // NOTE(review): currently a silent no-op — the device fill is not
  // implemented yet (see the commented-out SYCLfill_n call). Callers that
  // rely on fill_n initializing device memory will get unchanged contents.
  static void fill_n(T* ptr, size_t n, const T& value)
  {
    //THINK
    //qmcplusplus::SYCLfill_n(ptr, n, value);
  }
  // NOTE(review): SYCLAllocator as defined in this file has no getDevicePtr()
  // member, and with is_dual_space == false there is no host/device mapping to
  // translate — confirm these two functions compile/are ever instantiated.
  static void updateTo(SYCLAllocator<T>& alloc, T* host_ptr, size_t n)
  {
    T* device_ptr = alloc.getDevicePtr(host_ptr);
    alloc.copyToDevice(device_ptr, host_ptr, n);
  }

  static void updateFrom(SYCLAllocator<T>& alloc, T* host_ptr, size_t n)
  {
    T* device_ptr = alloc.getDevicePtr(host_ptr);
    alloc.copyFromDevice(host_ptr, device_ptr, n);
  }
};
|
||||
|
||||
/** allocator for SYCL host pinned memory
|
||||
* @tparm T data type
|
||||
* @tparm ALIGN alignment in bytes
|
||||
*/
|
||||
template<typename T, size_t ALIGN = QMC_SIMD_ALIGNMENT>
|
||||
struct SYCLHostAllocator
|
||||
{
|
||||
typedef T value_type;
|
||||
typedef size_t size_type;
|
||||
typedef T* pointer;
|
||||
typedef const T* const_pointer;
|
||||
|
||||
static constexpr size_t alignment = ALIGN;
|
||||
|
||||
SYCLHostAllocator() = default;
|
||||
template<class U>
|
||||
SYCLHostAllocator(const SYCLHostAllocator<U>&)
|
||||
{}
|
||||
|
||||
template<class U>
|
||||
struct rebind
|
||||
{
|
||||
typedef SYCLHostAllocator<U> other;
|
||||
};
|
||||
|
||||
T* allocate(std::size_t n) { return sycl::aligned_alloc_host<T>(ALIGN, n, getSYCLDefaultDeviceDefaultQueue()); }
|
||||
void deallocate(T* p, std::size_t) { sycl::free(p, getSYCLDefaultDeviceDefaultQueue()); }
|
||||
};
|
||||
|
||||
template<class T1, class T2>
|
||||
bool operator==(const SYCLHostAllocator<T1>&, const SYCLHostAllocator<T2>&)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
||||
template<class T1, class T2>
|
||||
bool operator!=(const SYCLHostAllocator<T1>&, const SYCLHostAllocator<T2>&)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
} // namespace qmcplusplus
|
||||
|
||||
#endif
|
|
@ -0,0 +1,19 @@
|
|||
//////////////////////////////////////////////////////////////////////////////////////
|
||||
// This file is distributed under the University of Illinois/NCSA Open Source License.
|
||||
// See LICENSE file in top directory for details.
|
||||
//
|
||||
// Copyright (c) 2022 QMCPACK developers.
|
||||
//
|
||||
// File developed by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
|
||||
//
|
||||
// File created by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
|
||||
//////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#include <CL/sycl.hpp>
#include "SYCLDeviceManager.h"
#include "SYCLruntime.hpp"

namespace qmcplusplus
{
// Returns (by value) the default queue of the default SYCL device.
// sycl::queue has reference semantics, so the returned copy shares the same
// underlying queue as the one held by SYCLDeviceManager.
sycl::queue getSYCLDefaultDeviceDefaultQueue() { return SYCLDeviceManager::getDefaultDeviceQueue(); }
} // namespace qmcplusplus
|
|
@ -0,0 +1,22 @@
|
|||
//////////////////////////////////////////////////////////////////////////////////////
|
||||
// This file is distributed under the University of Illinois/NCSA Open Source License.
|
||||
// See LICENSE file in top directory for details.
|
||||
//
|
||||
// Copyright (c) 2022 QMCPACK developers.
|
||||
//
|
||||
// File developed by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
|
||||
//
|
||||
// File created by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
|
||||
//////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#ifndef QMCPLUSPLUS_SYCL_RUNTIME_H
|
||||
#define QMCPLUSPLUS_SYCL_RUNTIME_H
|
||||
|
||||
#include <CL/sycl.hpp>
|
||||
|
||||
namespace qmcplusplus
|
||||
{
|
||||
sycl::queue getSYCLDefaultDeviceDefaultQueue();
|
||||
} // namespace qmcplusplus
|
||||
|
||||
#endif
|
|
@ -0,0 +1,100 @@
|
|||
//////////////////////////////////////////////////////////////////////////////////////
|
||||
// This file is distributed under the University of Illinois/NCSA Open Source License.
|
||||
// See LICENSE file in top directory for details.
|
||||
//
|
||||
// Copyright (c) 2022 QMCPACK developers.
|
||||
//
|
||||
// File developed by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
|
||||
//
|
||||
// File created by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
|
||||
//////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
|
||||
#include "syclBLAS.hpp"

#include <stdexcept>

#include "oneapi/mkl/blas.hpp"
|
||||
|
||||
namespace qmcplusplus
|
||||
{
|
||||
namespace syclBLAS
|
||||
{
|
||||
/** Map a BLAS transpose character to the oneMKL transpose enum.
 *
 * Accepts the conventional BLAS characters in either case:
 * 'N'/'n' -> nontrans, 'T'/'t' -> trans, 'C'/'c' -> conjtrans.
 *
 * Previously any character other than uppercase 'T' — including 'C'
 * (conjugate transpose) and lowercase 't' — was silently mapped to
 * nontrans, producing wrong results instead of an error.
 *
 * @param trans BLAS transpose character
 * @throw std::runtime_error on an unrecognized character
 */
inline oneapi::mkl::transpose convertTransEnum(char trans)
{
  switch (trans)
  {
  case 'N':
  case 'n':
    return oneapi::mkl::transpose::nontrans;
  case 'T':
  case 't':
    return oneapi::mkl::transpose::trans;
  case 'C':
  case 'c':
    return oneapi::mkl::transpose::conjtrans;
  default:
    throw std::runtime_error("syclBLAS::convertTransEnum unknown trans character!");
  }
}
|
||||
|
||||
/** Thin wrapper over oneapi::mkl::blas::gemv (column-major convention):
 * y = alpha * op(A) * x + beta * y, with op chosen by @p trans via
 * convertTransEnum.
 *
 * @param handle SYCL queue the operation is submitted to
 * @param trans  BLAS transpose character for A
 * @param m,n    dimensions of the column-major matrix A
 * @param lda    leading dimension of A
 * @param events dependencies that must complete before the gemv runs
 * @return event signaling completion of the asynchronous gemv
 */
template<typename T>
sycl::event gemv(sycl::queue& handle,
                 const char trans,
                 const int m,
                 const int n,
                 const T alpha,
                 const T* const A,
                 const int lda,
                 const T* const x,
                 const int incx,
                 const T beta,
                 T* const y,
                 const int incy,
                 const std::vector<sycl::event>& events)
{
  return oneapi::mkl::blas::gemv(handle, convertTransEnum(trans), m, n, alpha, A, lda, x, incx, beta, y, incy, events);
}

// Explicit instantiations for the four standard BLAS value types.
template sycl::event gemv(sycl::queue& handle,
                          const char trans,
                          const int m,
                          const int n,
                          const double alpha,
                          const double* const A,
                          const int lda,
                          const double* const x,
                          const int incx,
                          const double beta,
                          double* const y,
                          const int incy,
                          const std::vector<sycl::event>& events);

template sycl::event gemv(sycl::queue& handle,
                          const char trans,
                          const int m,
                          const int n,
                          const float alpha,
                          const float* const A,
                          const int lda,
                          const float* const x,
                          const int incx,
                          const float beta,
                          float* const y,
                          const int incy,
                          const std::vector<sycl::event>& events);

template sycl::event gemv(sycl::queue& handle,
                          const char trans,
                          const int m,
                          const int n,
                          const std::complex<double> alpha,
                          const std::complex<double>* const A,
                          const int lda,
                          const std::complex<double>* const x,
                          const int incx,
                          const std::complex<double> beta,
                          std::complex<double>* const y,
                          const int incy,
                          const std::vector<sycl::event>& events);

template sycl::event gemv(sycl::queue& handle,
                          const char trans,
                          const int m,
                          const int n,
                          const std::complex<float> alpha,
                          const std::complex<float>* const A,
                          const int lda,
                          const std::complex<float>* const x,
                          const int incx,
                          const std::complex<float> beta,
                          std::complex<float>* const y,
                          const int incy,
                          const std::vector<sycl::event>& events);
|
||||
} // namespace syclBLAS
|
||||
|
||||
} // namespace qmcplusplus
|
|
@ -0,0 +1,44 @@
|
|||
//////////////////////////////////////////////////////////////////////////////////////
|
||||
// This file is distributed under the University of Illinois/NCSA Open Source License.
|
||||
// See LICENSE file in top directory for details.
|
||||
//
|
||||
// Copyright (c) 2022 QMCPACK developers.
|
||||
//
|
||||
// File developed by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
|
||||
//
|
||||
// File created by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
|
||||
//////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
|
||||
#ifndef QMCPLUSPLUS_SYCL_BLAS_H
#define QMCPLUSPLUS_SYCL_BLAS_H

// NOTE(review): this header uses std::vector and std::int64_t but includes
// neither <vector> nor <cstdint>; it currently relies on <complex> or
// <CL/sycl.hpp> pulling them in transitively — confirm on all compilers.
#include <complex>
#include <CL/sycl.hpp>

namespace qmcplusplus
{
namespace syclBLAS
{
using syclBLAS_int = std::int64_t;
using syclBLAS_status = sycl::event;
using syclBLAS_handle = sycl::queue;

/** Asynchronous gemv (y = alpha * op(A) * x + beta * y) on a SYCL queue.
 * Column-major convention; see the definition in syclBLAS.cpp.
 * @param events optional dependencies that must complete first
 * @return event signaling completion
 */
template<typename T>
sycl::event gemv(sycl::queue& handle,
                 const char trans,
                 const int m,
                 const int n,
                 const T alpha,
                 const T* const A,
                 const int lda,
                 const T* const x,
                 const int incx,
                 const T beta,
                 T* const y,
                 const int incy,
                 const std::vector<sycl::event>& events = {});
} // namespace syclBLAS

} // namespace qmcplusplus
#endif // QMCPLUSPLUS_SYCL_BLAS_H
|
|
@ -15,6 +15,10 @@ if(ENABLE_CUDA)
|
|||
add_subdirectory(CUDA)
|
||||
endif()
|
||||
|
||||
if(ENABLE_SYCL)
|
||||
add_subdirectory(SYCL)
|
||||
endif()
|
||||
|
||||
if(ENABLE_OFFLOAD)
|
||||
add_subdirectory(OMPTarget)
|
||||
endif(ENABLE_OFFLOAD)
|
||||
|
|
|
@ -13,13 +13,6 @@ set(SRC_DIR CUDA)
|
|||
set(UTEST_EXE test_${SRC_DIR})
|
||||
set(UTEST_NAME deterministic-unit_test_${SRC_DIR})
|
||||
|
||||
if(NOT QMC_CUDA2HIP)
|
||||
add_library(cuda_device_value_test_kernels test_device_value_kernels.cu)
|
||||
else()
|
||||
hip_add_library(cuda_device_value_test_kernels test_device_value_kernels.cu)
|
||||
endif()
|
||||
target_link_libraries(cuda_device_value_test_kernels PUBLIC platform_runtime)
|
||||
|
||||
add_executable(${UTEST_EXE} test_CUDAallocator.cpp)
|
||||
target_link_libraries(${UTEST_EXE} platform_runtime containers catch_main)
|
||||
|
||||
|
|
|
@ -1,85 +0,0 @@
|
|||
//////////////////////////////////////////////////////////////////////////////////////
|
||||
// This file is distributed under the University of Illinois/NCSA Open Source License.
|
||||
// See LICENSE file in top directory for details.
|
||||
//
|
||||
// Copyright (c) 2021 QMCPACK developers.
|
||||
//
|
||||
// File developed by: Peter Doak, doakpw@ornl.gov, Oak Ridge National Laboratory
|
||||
//
|
||||
// File created by: Peter Doak, doakpw@ornl.gov, Oak Ridge National Laboratory
|
||||
//////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#include "test_device_value_kernels.hpp"
|
||||
#include "CUDA/CUDAallocator.hpp"
|
||||
|
||||
namespace qmcplusplus
|
||||
{
|
||||
namespace testing
|
||||
{
|
||||
|
||||
/** CUDA kernel: compare the value stored at device_value_ptr with the
 *  passed-by-value `value` and write the boolean outcome to `result`
 *  (a device pointer). Intended to be launched with a single thread.
 */
template<typename T>
__global__ void checkValue_kernel(T* device_value_ptr, const T value, bool* result)
{
  if (*device_value_ptr == value)
    *result = true;
  else
    *result = false;
}
|
||||
|
||||
/** Host-side check of an on-device value against a passed value.
 *
 * Launches a one-thread kernel on `hstream`, synchronizes, copies the boolean
 * outcome back to pinned host memory, and reports it via `result`.
 * @return the kernel launch/run status captured right after the first sync
 */
template<typename T>
cudaError_t checkValueCUDA(cudaStream_t hstream, T* device_value_ptr, T value, bool& result)
{
  CUDAAllocator<bool> bool_allocator;
  CUDAHostAllocator<bool> host_bool_allocator;
  bool* device_result = bool_allocator.allocate(1);
  bool* host_result = host_bool_allocator.allocate(1);
  dim3 dim_block(1);
  dim3 dim_grid(1);
  checkValue_kernel<<<dim_grid, dim_block, 0, hstream>>>(device_value_ptr, value, device_result);
  cudaCheck(cudaStreamSynchronize(hstream));
  // Capture the kernel's status before issuing further work on the stream.
  cudaError_t kernel_error = cudaPeekAtLastError();
  cudaCheck(cudaMemcpyAsync(host_result, device_result, sizeof(bool), cudaMemcpyDeviceToHost, hstream));
  cudaCheck(cudaStreamSynchronize(hstream));
  result = *host_result;
  bool_allocator.deallocate(device_result, 1);
  host_bool_allocator.deallocate(host_result, 1);
  return kernel_error;
}
|
||||
|
||||
/** CUDA kernel: field-by-field comparison of the DualStruct at
 *  device_struct_ptr against the passed-by-value dual_struct; writes the
 *  boolean outcome to `result` (a device pointer).
 */
__global__ void checkDualStruct_kernel(DualStruct* device_struct_ptr, const DualStruct dual_struct, bool* result)
{
  if (device_struct_ptr->index == dual_struct.index && device_struct_ptr->value == dual_struct.value)
    *result = true;
  else
    *result = false;
}
|
||||
|
||||
/** Check the test structure at a device pointer against a struct passed by value.
 *
 * Same pattern as checkValueCUDA: one-thread kernel on `hstream`, sync,
 * async copy of the boolean outcome to pinned host memory, second sync.
 * @return the kernel launch/run status captured right after the first sync
 */
cudaError_t checkDualStruct(cudaStream_t hstream, DualStruct* device_struct_ptr, DualStruct dual_struct, bool& result)
{
  CUDAAllocator<bool> bool_allocator;
  CUDAHostAllocator<bool> host_bool_allocator;
  bool* device_result = bool_allocator.allocate(1);
  bool* host_result = host_bool_allocator.allocate(1);
  dim3 dim_block(1);
  dim3 dim_grid(1);
  checkDualStruct_kernel<<<dim_grid, dim_block, 0, hstream>>>(device_struct_ptr, dual_struct, device_result);
  cudaCheck(cudaStreamSynchronize(hstream));
  // Capture the kernel's status before issuing further work on the stream.
  cudaError_t kernel_error = cudaPeekAtLastError();
  cudaCheck(cudaMemcpyAsync(host_result, device_result, sizeof(bool), cudaMemcpyDeviceToHost, hstream));
  cudaCheck(cudaStreamSynchronize(hstream));
  result = *host_result;
  bool_allocator.deallocate(device_result, 1);
  host_bool_allocator.deallocate(host_result, 1);
  return kernel_error;
}
|
||||
|
||||
template cudaError_t checkValueCUDA(cudaStream_t hstream, double* device_value_ptr, double value, bool& result);
|
||||
|
||||
} // namespace testing
|
||||
} // namespace qmcplusplus
|
|
@ -1,38 +0,0 @@
|
|||
//////////////////////////////////////////////////////////////////////////////////////
|
||||
// This file is distributed under the University of Illinois/NCSA Open Source License.
|
||||
// See LICENSE file in top directory for details.
|
||||
//
|
||||
// Copyright (c) 2021 QMCPACK developers.
|
||||
//
|
||||
// File developed by: Peter Doak, doakpw@ornl.gov, Oak Ridge National Laboratory
|
||||
//
|
||||
// File created by: Peter Doak, doakpw@ornl.gov, Oak Ridge National Laboratory
|
||||
//////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#ifndef QMCPLUSPLUS_TEST_DEVICE_VALUES_KERNELS_HPP
|
||||
#define QMCPLUSPLUS_TEST_DEVICE_VALUES_KERNELS_HPP
|
||||
|
||||
#include "CUDA/CUDAruntime.hpp"
|
||||
|
||||
namespace qmcplusplus
|
||||
{
|
||||
namespace testing
|
||||
{
|
||||
|
||||
/** launch a one-thread kernel to compare the value at device_value_ptr with
 *  `value`; outcome reported through `result`, kernel status returned */
template<typename T>
cudaError_t checkValueCUDA(cudaStream_t hstream, T* device_value_ptr, T value, bool& result);

/** just an arbitrary struct for testing */
struct DualStruct
{
  int index;
  double value;
};
/** compare the DualStruct at device_struct_ptr field-by-field with dual_struct */
cudaError_t checkDualStruct(cudaStream_t hstream, DualStruct* device_struct_ptr, DualStruct dual_struct, bool& result);

// double instantiation provided by test_device_value_kernels.cu
extern template cudaError_t checkValueCUDA(cudaStream_t hstream, double* device_value_ptr, double value, bool& result);
|
||||
|
||||
} // namespace testing
|
||||
} // namespace qmcplusplus
|
||||
|
||||
#endif
|
|
@ -0,0 +1,29 @@
|
|||
#//////////////////////////////////////////////////////////////////////////////////////
|
||||
#// This file is distributed under the University of Illinois/NCSA Open Source License.
|
||||
#// See LICENSE file in top directory for details.
|
||||
#//
|
||||
#// Copyright (c) 2022 QMCPACK developers.
|
||||
#//
|
||||
#// File developed by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
|
||||
#//
|
||||
#// File created by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
|
||||
#//////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
|
||||
## runtime
# Unit test exercising the SYCL allocators (SYCLallocator.hpp).
# NOTE(review): sibling test dirs (e.g. CUDA) name tests
# deterministic-unit_test_${SRC_DIR}; here the prefix is
# deterministic-unit_${UTEST_EXE} — confirm the naming is intentional.
set(UTEST_EXE test_sycl)
set(UTEST_NAME deterministic-unit_${UTEST_EXE})

add_executable(${UTEST_EXE} test_SYCLallocator.cpp)
target_link_libraries(${UTEST_EXE} SYCL::device platform_runtime containers catch_main)

add_unit_test(${UTEST_NAME} 1 1 $<TARGET_FILE:${UTEST_EXE}>)

## gemv/ger
# Unit test exercising the syclBLAS wrappers against reference CPU BLAS.
set(UTEST_EXE test_sycl_blas)
set(UTEST_NAME deterministic-unit_${UTEST_EXE})

add_executable(${UTEST_EXE} test_syclBLAS.cpp)
target_link_libraries(${UTEST_EXE} catch_main containers platform_LA)

add_unit_test(${UTEST_NAME} 1 1 $<TARGET_FILE:${UTEST_EXE}>)
|
|
@ -0,0 +1,74 @@
|
|||
//////////////////////////////////////////////////////////////////////////////////////
|
||||
// This file is distributed under the University of Illinois/NCSA Open Source License.
|
||||
// See LICENSE file in top directory for details.
|
||||
//
|
||||
// Copyright (c) 2019 QMCPACK developers.
|
||||
//
|
||||
// File developed by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
|
||||
//
|
||||
// File created by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
|
||||
//////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
|
||||
#include "catch.hpp"
|
||||
|
||||
#include <memory>
|
||||
#include <iostream>
|
||||
#include "SYCL/SYCLallocator.hpp"
|
||||
#include "OhmmsPETE/OhmmsVector.h"
|
||||
|
||||
namespace qmcplusplus
|
||||
{
|
||||
TEST_CASE("SYCL_allocator", "[SYCL]")
{
  // SYCLAllocator provides device-only USM: fill on the device,
  // then copy back to a host vector to verify the contents.
  sycl::queue m_queue = getSYCLDefaultDeviceDefaultQueue();
  Vector<double, SYCLAllocator<double>> vec(1024);
  Vector<double> vec_h(1024);

  sycl::event e;
  {
    double* V = vec.data();

    // write 1..1024 into the device buffer
    e = m_queue.parallel_for(sycl::range<1>{1024}, [=](sycl::id<1> item) { V[item] = item + 1; });
  }

  //copy to host, depending on the fill kernel's event
  m_queue.memcpy(vec_h.data(), vec.data(), 1024 * sizeof(double), {e}).wait();

  CHECK(vec_h[0] == 1);
  CHECK(vec_h[77] == 78);
}
|
||||
|
||||
TEST_CASE("SYCL_host_allocator", "[SYCL]")
{
  sycl::queue m_queue = getSYCLDefaultDeviceDefaultQueue();
  // SYCLHostAllocator: host USM is directly usable by both a device kernel
  // and host code, so no explicit copies are needed. Start with all ones.
  Vector<double, SYCLHostAllocator<double>> vec(1024, 1);

  {
    double* V = vec.data();
    m_queue.parallel_for(sycl::range<1>{1024}, [=](sycl::id<1> item) { V[item] += item + 1; }).wait();
  }

  // 1 (initial) + (index + 1) added on the device
  CHECK(vec[0] == 2);
  CHECK(vec[77] == 79);
}
|
||||
|
||||
/*
|
||||
TEST_CASE("SYCL_shared_allocator", "[SYCL]")
|
||||
{
|
||||
sycl::queue m_queue = getSYCLDefaultDeviceDefaultQueue();
|
||||
Vector<double, SYCLSharedAllocator<double>> vec(1024);
|
||||
|
||||
std::cout << "Size " << vec.size() << std::endl;
|
||||
{
|
||||
double* V = vec.data();
|
||||
m_queue.parallel_for(sycl::range<1>{1024}, [=](sycl::id<1> item) { V[item] = item + 1; }).wait();
|
||||
}
|
||||
CHECK(vec[0] == 1);
|
||||
CHECK(vec[77] == 78);
|
||||
}
|
||||
*/
|
||||
|
||||
} // namespace qmcplusplus
|
|
@ -0,0 +1,94 @@
|
|||
//////////////////////////////////////////////////////////////////////////////////////
|
||||
// This file is distributed under the University of Illinois/NCSA Open Source License.
|
||||
// See LICENSE file in top directory for details.
|
||||
//
|
||||
// Copyright (c) 2021 QMCPACK developers.
|
||||
//
|
||||
// File developed by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
|
||||
//
|
||||
// File created by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
|
||||
//////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#include "catch.hpp"
|
||||
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
#include <iostream>
|
||||
#include "OMPTarget/OMPallocator.hpp"
|
||||
#include "SYCL/SYCLruntime.hpp"
|
||||
#include "SYCL/SYCLallocator.hpp"
|
||||
#include "SYCL/syclBLAS.hpp"
|
||||
#include <OhmmsPETE/OhmmsVector.h>
|
||||
#include <OhmmsPETE/OhmmsMatrix.h>
|
||||
#include "CPU/BLAS.hpp"
|
||||
|
||||
namespace qmcplusplus
|
||||
{
|
||||
template<typename T, typename Alloc>
|
||||
void test_gemv(const int M_b, const int N_b, const char trans)
|
||||
{
|
||||
const int M = trans == 'T' ? M_b : N_b;
|
||||
const int N = trans == 'T' ? N_b : M_b;
|
||||
|
||||
using vec_t = Vector<T, Alloc>;
|
||||
using mat_t = Matrix<T, Alloc>;
|
||||
|
||||
sycl::queue handle = getSYCLDefaultDeviceDefaultQueue();
|
||||
|
||||
vec_t A(N); // Input vector
|
||||
mat_t B(M_b, N_b); // Input matrix
|
||||
vec_t C(M); // Result vector ompBLAS
|
||||
vec_t D(M); // Result vector BLAS
|
||||
|
||||
// Fill data
|
||||
for (int i = 0; i < N; i++)
|
||||
A[i] = i;
|
||||
|
||||
for (int j = 0; j < M_b; j++)
|
||||
for (int i = 0; i < N_b; i++)
|
||||
B[j][i] = i + j * 2;
|
||||
|
||||
// Fill C and D with 0
|
||||
for (int i = 0; i < M; i++)
|
||||
C[i] = D[i] = T(-0.1);
|
||||
|
||||
A.updateTo();
|
||||
B.updateTo();
|
||||
|
||||
T alpha(1);
|
||||
T beta(0);
|
||||
|
||||
// in Fortran, B[M][N] is viewed as B^T
|
||||
// when trans == 'T', the actual calculation is B * A[N] = C[M]
|
||||
// when trans == 'N', the actual calculation is B^T * A[M] = C[N]
|
||||
//ompBLAS::gemv(handle, trans, N_b, M_b, alpha, B.device_data(), N_b, A.device_data(), 1, beta, C.device_data(), 1);
|
||||
|
||||
syclBLAS::gemv(handle, trans, M_b, M_b, alpha, B.device_data(), N_b, A.device_data(), 1, beta, C.device_data(), 1)
|
||||
.wait();
|
||||
|
||||
C.updateFrom();
|
||||
|
||||
if (trans == 'T')
|
||||
BLAS::gemv_trans(M_b, N_b, B.data(), A.data(), D.data());
|
||||
else
|
||||
BLAS::gemv(M_b, N_b, B.data(), A.data(), D.data());
|
||||
|
||||
for (int index = 0; index < M; index++)
|
||||
CHECK(C[index] == D[index]);
|
||||
}
|
||||
|
||||
/** Drive test_gemv for the transpose case in single and double precision.
 * Uses OMPallocator so the vectors/matrices are dual-space (host mirror plus
 * device buffer), hence the ENABLE_OFFLOAD guard.
 */
TEST_CASE("OmpSYCL gemv", "[SYCL]")
{
  const int M = 137;
  const int N = 79;
  // removed unused `batch_count` — no batched path is exercised here yet

  // Non-batched test
  std::cout << "Testing TRANS gemv" << std::endl;
#if defined(ENABLE_OFFLOAD)
  test_gemv<float, OMPallocator<float>>(M, N, 'T');
  test_gemv<double, OMPallocator<double>>(M, N, 'T');
#endif
}
|
||||
|
||||
} // namespace qmcplusplus
|
|
@ -419,6 +419,32 @@ void SpinorSet::evaluate_spin(const ParticleSet& P, int iat, ValueVector& psi, V
|
|||
dpsi = eye * (eis * psi_work_up - emis * psi_work_down);
|
||||
}
|
||||
|
||||
/** Gradients of the spinor orbitals w.r.t. the source (ion) particle iat_src:
 * evaluate the up and down spatial orbital-set gradients separately, then
 * combine them per electron with the spin phases e^{+is} and e^{-is}.
 */
void SpinorSet::evaluateGradSource(const ParticleSet& P,
                                   int first,
                                   int last,
                                   const ParticleSet& source,
                                   int iat_src,
                                   GradMatrix& gradphi)
{
  IndexType nelec = P.getTotalNum();

  GradMatrix gradphi_up(nelec, OrbitalSetSize);
  GradMatrix gradphi_dn(nelec, OrbitalSetSize);
  spo_up->evaluateGradSource(P, first, last, source, iat_src, gradphi_up);
  spo_dn->evaluateGradSource(P, first, last, source, iat_src, gradphi_dn);

  // NOTE(review): the combine loop runs over ALL electrons, while the
  // component sets above are asked only for [first,last) — rows outside that
  // range combine whatever the freshly constructed matrices contain. Confirm
  // callers always pass first=0, last=nelec, or restrict this loop.
  for (int iat = 0; iat < nelec; iat++)
  {
    // spin phase factors for this electron's current spin coordinate
    ParticleSet::Scalar_t s = P.activeSpin(iat);
    RealType coss = std::cos(s);
    RealType sins = std::sin(s);
    ValueType eis(coss, sins);
    ValueType emis(coss, -sins);
    for (int imo = 0; imo < OrbitalSetSize; imo++)
      gradphi(iat, imo) = gradphi_up(iat, imo) * eis + gradphi_dn(iat, imo) * emis;
  }
}
|
||||
|
||||
std::unique_ptr<SPOSet> SpinorSet::makeClone() const
|
||||
{
|
||||
auto myclone = std::make_unique<SpinorSet>();
|
||||
|
|
|
@ -129,6 +129,22 @@ public:
|
|||
*/
|
||||
void evaluate_spin(const ParticleSet& P, int iat, ValueVector& psi, ValueVector& dpsi) override;
|
||||
|
||||
/** evaluate the gradients of this single-particle orbital set
 * for [first,last) target particles with respect to the given source particle
 * @param P current ParticleSet (target electrons, including spin coordinates)
 * @param first starting index of the particles
 * @param last ending index of the particles
 * @param source ParticleSet holding the source (e.g. ion) positions
 * @param iat_src source particle index
 * @param gradphi gradients
 *
 */
virtual void evaluateGradSource(const ParticleSet& P,
                                int first,
                                int last,
                                const ParticleSet& source,
                                int iat_src,
                                GradMatrix& gradphi) override;
|
||||
|
||||
std::unique_ptr<SPOSet> makeClone() const override;
|
||||
|
||||
private:
|
||||
|
|
|
@ -23,6 +23,7 @@ set(UTEST_HDF_INPUT6 ${qmcpack_SOURCE_DIR}/src/QMCWaveFunctions/tests/lcao_spino
|
|||
set(UTEST_HDF_INPUT7 ${qmcpack_SOURCE_DIR}/tests/molecules/LiH_ae_MSD/LiH.orbs.h5)
|
||||
set(UTEST_HDF_INPUT8 ${qmcpack_SOURCE_DIR}/tests/molecules/LiH_ae_MSD/LiH.Multidet.h5)
|
||||
set(UTEST_HDF_INPUT9 ${qmcpack_SOURCE_DIR}/tests/converter/test_Bi_dirac/gold.orbs.h5)
|
||||
set(UTEST_HDF_INPUT10 ${qmcpack_SOURCE_DIR}/src/QMCWaveFunctions/tests/lcao_spinor_molecule.h5)
|
||||
|
||||
maybe_symlink(${UTEST_HDF_INPUT0} ${UTEST_DIR}/diamondC_1x1x1.pwscf.h5)
|
||||
maybe_symlink(${UTEST_HDF_INPUT1} ${UTEST_DIR}/diamondC_2x1x1.pwscf.h5)
|
||||
|
@ -34,6 +35,7 @@ maybe_symlink(${UTEST_HDF_INPUT6} ${UTEST_DIR}/lcao_spinor.h5)
|
|||
maybe_symlink(${UTEST_HDF_INPUT7} ${UTEST_DIR}/LiH.orbs.h5)
|
||||
maybe_symlink(${UTEST_HDF_INPUT8} ${UTEST_DIR}/LiH.Multidet.h5)
|
||||
maybe_symlink(${UTEST_HDF_INPUT9} ${UTEST_DIR}/Bi.orbs.h5)
|
||||
maybe_symlink(${UTEST_HDF_INPUT10} ${UTEST_DIR}/lcao_spinor_molecule.h5)
|
||||
|
||||
set(FILES_TO_COPY
|
||||
he_sto3g.wfj.xml
|
||||
|
|
Binary file not shown.
|
@ -0,0 +1,231 @@
|
|||
import h5py
|
||||
import numpy as np
|
||||
from scipy.special import sph_harm, factorial2
|
||||
|
||||
def write_h5_file():
    """Write lcao_spinor_molecule.h5: a minimal two-hydrogen LCAO spinor input
    with a single s-type Gaussian per atom and hard-coded spinor coefficients.
    """
    hf = h5py.File('lcao_spinor_molecule.h5', 'w')

    def put_str(group, name, text):
        # Fixed-length ASCII string dataset, the layout the QMCPACK reader expects.
        group.create_dataset(name, (1,), "S" + str(len(text)),
                             [text.encode("ascii", "ignore")])

    # atoms
    atoms = hf.create_group('atoms')
    atoms.create_dataset('number_of_atoms', data=np.array([2]))
    atoms.create_dataset('number_of_species', data=np.array([1]))
    atoms.create_dataset('positions', data=np.array([[0.1, 0.2, 0.3], [-0.3, -0.2, -0.1]]))
    atoms.create_dataset('species_ids', data=np.array([0, 0]))
    sp = atoms.create_group('species_0')

    sp.create_dataset("atomic_number", data=np.array([1]))
    sp.create_dataset("charge", data=np.array([1]))
    sp.create_dataset("core", data=np.array([1]))
    put_str(sp, "name", "H")

    # PBC — open boundary conditions
    pbc = hf.create_group("PBC")
    pbc.create_dataset("PBC", (1,), dtype="b1", data=False)

    # application
    app = hf.create_group("application")
    put_str(app, "code", "generic")

    # basisset
    bs = hf.create_group("basisset")
    bs.create_dataset("NbElements", data=np.array([1]))
    put_str(bs, "name", "LCAOBSet")
    atbs = bs.create_group("atomicBasisSet0")

    atbs.create_dataset("NbBasisGroups", data=np.array([1]))
    put_str(atbs, "angular", "cartesian")
    put_str(atbs, "elementType", "H")
    put_str(atbs, "expandYlm", "Gamess")
    atbs.create_dataset("grid_npts", data=np.array([1001]))
    atbs.create_dataset("grid_rf", data=np.array([100]))
    atbs.create_dataset("grid_ri", data=np.array([1e-06]))
    put_str(atbs, "grid_type", "log")
    put_str(atbs, "name", "Gaussian")
    put_str(atbs, "normalized", "no")

    # one basis group: a single s-type (l=0) Gaussian primitive
    bg = atbs.create_group("basisGroup0")
    bg.create_dataset("NbRadFunc", data=np.array([1]))
    bg.create_dataset("l", data=np.array([0]))
    bg.create_dataset("n", data=np.array([0]))
    put_str(bg, "rid", "H00")
    put_str(bg, "type", "Gaussian")
    rf = bg.create_group("radfunctions")
    dr = rf.create_group("DataRad0")
    dr.create_dataset("contraction", data=np.array([1.0]))
    dr.create_dataset("exponent", data=np.array([2.5]))

    # spinor MO coefficients: eigenset_0(_imag) = up channel, eigenset_1(_imag) = down channel
    kpts = hf.create_group("Super_Twist")
    kpts.create_dataset("eigenset_0", data=np.array([[0.075, 0.15]]))
    kpts.create_dataset("eigenset_0_imag", data=np.array([[0.225, 0.45]]))
    kpts.create_dataset("eigenset_1", data=np.array([[-0.12, -0.06]]))
    kpts.create_dataset("eigenset_1_imag", data=np.array([[0.48, 0.24]]))
    hf.close()
||||
|
||||
class cartGauss:
|
||||
def __init__(self,expt,l=0,i=0,j=0,k=0):
|
||||
self.expt = expt
|
||||
self.l = l
|
||||
self.i = i
|
||||
self.j = j
|
||||
self.k = k
|
||||
assert(i+j+k == l)
|
||||
def norm(self):
|
||||
n = (2*self.expt / np.pi)**(3./4.)
|
||||
n *= np.sqrt(2.**(self.l) / factorial2(2*self.i - 1) / factorial2(2*self.j - 1) / factorial2(2*self.k - 1)) * np.sqrt(2*self.expt)**self.l
|
||||
return n
|
||||
def val(self,pos):
|
||||
r = np.linalg.norm(pos)
|
||||
norm = self.norm()
|
||||
return norm *pos[0]**self.i * pos[1]**self.j * pos[2]**self.k * np.exp(-self.expt * r * r)
|
||||
|
||||
def get_reference_values(pos, s):
    """Print central-finite-difference reference gradients of the spinor at
    electron position `pos` and spin `s`, with respect to each atomic center.

    The up-channel MO is c0*g0 + c1*g1 and the down-channel MO is c1*g0 + c0*g1
    (g0, g1 centered on atoms 0 and 1). The spinor combines them with the spin
    phases e^{+is} (up) and e^{-is} (down) and coefficients upcoef/dncoef.

    Replaces ~80 lines of copy-pasted per-atom/per-axis finite-difference code
    with a single loop; the arithmetic and printed output are unchanged.
    """
    cs = np.cos(s)
    ss = np.sin(s)
    eis = cs + 1.j * ss
    emis = cs - 1.j * ss

    print("Position: {}".format(pos))
    print("Spin: {}".format(s))

    # two identical s-type primitives on the two hydrogen centers
    gaussians = [cartGauss(2.5, 0, 0, 0, 0), cartGauss(2.5, 0, 0, 0, 0)]
    centers = [np.array([0.1, 0.2, 0.3]), np.array([-0.3, -0.2, -0.1])]

    c0 = 0.3  # MO coefficient on the "own" center
    c1 = 0.6  # MO coefficient on the other center

    upcoef = (0.25 + 0.75j)
    dncoef = (-0.2 + 0.8j)

    dr = 1e-7  # finite-difference step

    def mo_pair(shifted_centers):
        """(up, dn) MO values with the atoms placed at shifted_centers."""
        v0 = gaussians[0].val(pos - shifted_centers[0])
        v1 = gaussians[1].val(pos - shifted_centers[1])
        return c0 * v0 + c1 * v1, c1 * v0 + c0 * v1

    for iat in range(2):
        grad = []
        for axis in range(3):
            step = np.zeros(3)
            step[axis] = dr
            plus = list(centers)
            minus = list(centers)
            plus[iat] = centers[iat] + step
            minus[iat] = centers[iat] - step
            up_p, dn_p = mo_pair(plus)
            up_m, dn_m = mo_pair(minus)
            updg = (up_p - up_m) / (2 * dr)
            dndg = (dn_p - dn_m) / (2 * dr)
            grad.append(upcoef * updg * eis + dncoef * dndg * emis)
        print("grad atom {}: {}, {}, {}".format(iat, grad[0], grad[1], grad[2]))
|
||||
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
write_h5_file()
|
||||
pos = np.array([0.01, -0.02, 0.03])
|
||||
s = 0.6
|
||||
get_reference_values(pos, s)
|
||||
|
|
@ -21,7 +21,6 @@
|
|||
#include "Utilities/for_testing/RandomForTest.h"
|
||||
#include "Platforms/DualAllocatorAliases.hpp"
|
||||
#include "Platforms/CUDA/CUDALinearAlgebraHandles.h"
|
||||
#include "Platforms/tests/CUDA/test_device_value_kernels.hpp"
|
||||
|
||||
// Legacy CPU inversion for temporary testing
|
||||
#include "QMCWaveFunctions/Fermion/DiracMatrix.h"
|
||||
|
|
|
@ -67,13 +67,12 @@ void test_lcao_spinor()
|
|||
elec_.addTable(ions_);
|
||||
elec_.update();
|
||||
|
||||
const char* particles = "<tmp> \
|
||||
<sposet_builder name=\"spinorbuilder\" type=\"molecularorbital\" href=\"lcao_spinor.h5\" source=\"ion\" precision=\"float\"> \
|
||||
<basisset transform=\"yes\"/> \
|
||||
<sposet name=\"myspo\" size=\"1\"/> \
|
||||
</sposet_builder> \
|
||||
</tmp> \
|
||||
";
|
||||
const char* particles = R"XML(<tmp>
|
||||
<sposet_builder name="spinorbuilder" type="molecularorbital" href="lcao_spinor.h5" source="ion" precision="float">
|
||||
<basisset transform="yes"/>
|
||||
<sposet name="myspo" size="1"/>
|
||||
</sposet_builder>
|
||||
</tmp>)XML";
|
||||
|
||||
Libxml2Document doc;
|
||||
bool okay = doc.parseFromString(particles);
|
||||
|
@ -118,6 +117,20 @@ void test_lcao_spinor()
|
|||
CHECK(d2psiM[iat][0] == ComplexApprox(vlp).epsilon(eps));
|
||||
}
|
||||
|
||||
/** this is a somewhat simple example. We have an ion at the origin
|
||||
* and a gaussian basis function centered on the ion as a orbital.
|
||||
* In this case, the ion derivative is actually just the negative of
|
||||
* the electron gradient.
|
||||
*/
|
||||
SPOSet::GradMatrix gradIon(elec_.R.size(), spo->getOrbitalSetSize());
|
||||
spo->evaluateGradSource(elec_, 0, elec_.R.size(), ions_, 0, gradIon);
|
||||
for (int iat = 0; iat < 1; iat++)
|
||||
{
|
||||
CHECK(gradIon[iat][0][0] == ComplexApprox(-vdx).epsilon(eps));
|
||||
CHECK(gradIon[iat][0][1] == ComplexApprox(-vdy).epsilon(eps));
|
||||
CHECK(gradIon[iat][0][2] == ComplexApprox(-vdz).epsilon(eps));
|
||||
}
|
||||
|
||||
int OrbitalSetSize = spo->getOrbitalSetSize();
|
||||
//temporary arrays for holding the values of the up and down channels respectively.
|
||||
SPOSet::ValueVector psi_work;
|
||||
|
@ -376,17 +389,16 @@ void test_lcao_spinor_excited()
|
|||
elec_.addTable(ions_);
|
||||
elec_.update();
|
||||
|
||||
const char* particles = "<tmp> \
|
||||
<sposet_builder name=\"spinorbuilder\" type=\"molecularorbital\" href=\"lcao_spinor.h5\" source=\"ion\" precision=\"float\"> \
|
||||
<basisset name=\"myset\" transform=\"yes\"/> \
|
||||
<sposet name=\"myspo\" basisset=\"myset\" size=\"1\"> \
|
||||
<occupation mode=\"excited\"> \
|
||||
-1 2 \
|
||||
</occupation> \
|
||||
</sposet> \
|
||||
</sposet_builder> \
|
||||
</tmp> \
|
||||
";
|
||||
const char* particles = R"XML(<tmp>
|
||||
<sposet_builder name="spinorbuilder" type="molecularorbital" href="lcao_spinor.h5" source="ion" precision="float">
|
||||
<basisset name="myset" transform="yes"/>
|
||||
<sposet name="myspo" basisset="myset" size="1">
|
||||
<occupation mode="excited">
|
||||
-1 2
|
||||
</occupation>
|
||||
</sposet>
|
||||
</sposet_builder>
|
||||
</tmp>)XML";
|
||||
|
||||
Libxml2Document doc;
|
||||
bool okay = doc.parseFromString(particles);
|
||||
|
@ -434,6 +446,20 @@ void test_lcao_spinor_excited()
|
|||
CHECK(d2psiM[iat][0] == ComplexApprox(vlp).epsilon(eps));
|
||||
}
|
||||
|
||||
/** this is a somewhat simple example. We have an ion at the origin
|
||||
* and a gaussian basis function centered on the ion as a orbital.
|
||||
* In this case, the ion derivative is actually just the negative of
|
||||
* the electron gradient.
|
||||
*/
|
||||
SPOSet::GradMatrix gradIon(elec_.R.size(), spo->getOrbitalSetSize());
|
||||
spo->evaluateGradSource(elec_, 0, elec_.R.size(), ions_, 0, gradIon);
|
||||
for (int iat = 0; iat < 1; iat++)
|
||||
{
|
||||
CHECK(gradIon[iat][0][0] == ComplexApprox(-vdx).epsilon(eps));
|
||||
CHECK(gradIon[iat][0][1] == ComplexApprox(-vdy).epsilon(eps));
|
||||
CHECK(gradIon[iat][0][2] == ComplexApprox(-vdz).epsilon(eps));
|
||||
}
|
||||
|
||||
//temporary arrays for holding the values of the up and down channels respectively.
|
||||
SPOSet::ValueVector psi_work;
|
||||
|
||||
|
@ -645,8 +671,102 @@ void test_lcao_spinor_excited()
|
|||
}
|
||||
}
|
||||
|
||||
void test_lcao_spinor_ion_derivs()
|
||||
{
|
||||
app_log() << "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n";
|
||||
app_log() << "!!!! LCAO SpinorSet from HDF (ion derivatives) !!!!\n";
|
||||
app_log() << "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n";
|
||||
|
||||
using ValueType = SPOSet::ValueType;
|
||||
using RealType = SPOSet::RealType;
|
||||
Communicate* c = OHMMS::Controller;
|
||||
|
||||
ParticleSetPool ptcl = ParticleSetPool(c);
|
||||
auto ions_uptr = std::make_unique<ParticleSet>(ptcl.getSimulationCell());
|
||||
auto elec_uptr = std::make_unique<ParticleSet>(ptcl.getSimulationCell());
|
||||
ParticleSet& ions_(*ions_uptr);
|
||||
ParticleSet& elec_(*elec_uptr);
|
||||
|
||||
ions_.setName("ion");
|
||||
ptcl.addParticleSet(std::move(ions_uptr));
|
||||
ions_.create({2});
|
||||
|
||||
ions_.R[0][0] = 0.10000000;
|
||||
ions_.R[0][1] = 0.20000000;
|
||||
ions_.R[0][2] = 0.30000000;
|
||||
ions_.R[1][0] = -0.30000000;
|
||||
ions_.R[1][1] = -0.20000000;
|
||||
ions_.R[1][2] = -0.10000000;
|
||||
SpeciesSet& ispecies = ions_.getSpeciesSet();
|
||||
int hIdx = ispecies.addSpecies("H");
|
||||
ions_.update();
|
||||
|
||||
elec_.setName("elec");
|
||||
ptcl.addParticleSet(std::move(elec_uptr));
|
||||
elec_.create({1});
|
||||
elec_.R[0][0] = 0.01;
|
||||
elec_.R[0][1] = -0.02;
|
||||
elec_.R[0][2] = 0.03;
|
||||
elec_.spins[0] = 0.6;
|
||||
elec_.setSpinor(true);
|
||||
|
||||
SpeciesSet& tspecies = elec_.getSpeciesSet();
|
||||
int upIdx = tspecies.addSpecies("u");
|
||||
int chargeIdx = tspecies.addAttribute("charge");
|
||||
tspecies(chargeIdx, upIdx) = -1;
|
||||
|
||||
|
||||
elec_.addTable(ions_);
|
||||
elec_.update();
|
||||
|
||||
const char* particles = R"XML(<tmp>
|
||||
<sposet_builder name="spinorbuilder" type="molecularorbital" href="lcao_spinor_molecule.h5" source="ion" precision="float">
|
||||
<basisset transform="yes"/>
|
||||
<sposet name="myspo" size="1"/>
|
||||
</sposet_builder>
|
||||
</tmp>)XML";
|
||||
|
||||
Libxml2Document doc;
|
||||
bool okay = doc.parseFromString(particles);
|
||||
REQUIRE(okay);
|
||||
|
||||
xmlNodePtr root = doc.getRoot();
|
||||
|
||||
xmlNodePtr bnode = xmlFirstElementChild(root);
|
||||
SPOSetBuilderFactory fac(c, elec_, ptcl.getPool());
|
||||
const auto spo_builder_ptr = fac.createSPOSetBuilder(bnode);
|
||||
auto& bb = *spo_builder_ptr;
|
||||
|
||||
// only pick up the last sposet
|
||||
std::unique_ptr<SPOSet> spo;
|
||||
processChildren(bnode, [&](const std::string& cname, const xmlNodePtr element) {
|
||||
if (cname == "sposet")
|
||||
spo = bb.createSPOSet(element);
|
||||
});
|
||||
REQUIRE(spo);
|
||||
|
||||
//reference values from finite difference in lcao_spinor_molecule_test.py
|
||||
ValueType dx0(-0.0492983, -0.3192778);
|
||||
ValueType dy0(-0.1205071, -0.7804567);
|
||||
ValueType dz0(-0.1478950, -0.9578333);
|
||||
ValueType dx1(-0.0676367, 1.0506422);
|
||||
ValueType dy1(-0.0392729, 0.6100503);
|
||||
ValueType dz1(-0.0283638, 0.4405919);
|
||||
|
||||
const RealType eps = 1e-4;
|
||||
SPOSet::GradMatrix gradIon(elec_.R.size(), spo->getOrbitalSetSize());
|
||||
spo->evaluateGradSource(elec_, 0, elec_.R.size(), ions_, 0, gradIon);
|
||||
CHECK(gradIon[0][0][0] == ComplexApprox(dx0).epsilon(eps));
|
||||
CHECK(gradIon[0][0][1] == ComplexApprox(dy0).epsilon(eps));
|
||||
CHECK(gradIon[0][0][2] == ComplexApprox(dz0).epsilon(eps));
|
||||
spo->evaluateGradSource(elec_, 0, elec_.R.size(), ions_, 1, gradIon);
|
||||
CHECK(gradIon[0][0][0] == ComplexApprox(dx1).epsilon(eps));
|
||||
CHECK(gradIon[0][0][1] == ComplexApprox(dy1).epsilon(eps));
|
||||
CHECK(gradIon[0][0][2] == ComplexApprox(dz1).epsilon(eps));
|
||||
}
|
||||
|
||||
TEST_CASE("ReadMolecularOrbital GTO spinor", "[wavefunction]") { test_lcao_spinor(); }
|
||||
TEST_CASE("ReadMolecularOrbital GTO spinor with excited", "[wavefunction]") { test_lcao_spinor_excited(); }
|
||||
TEST_CASE("spinor ion derivatives for molecule", "[wavefunction]") { test_lcao_spinor_ion_derivs(); }
|
||||
|
||||
} // namespace qmcplusplus
|
||||
|
|
|
@ -41,7 +41,7 @@ case "$1" in
|
|||
export CUDACXX=/usr/local/cuda-11.2/bin/nvcc
|
||||
|
||||
# Make current environment variables available to subsequent steps
|
||||
echo "PATH=/usr/local/cuda-11.2/bin:$PATH" >> $GITHUB_ENV
|
||||
echo "PATH=$PATH" >> $GITHUB_ENV
|
||||
echo "CUDACXX=/usr/local/cuda-11.2/bin/nvcc" >> $GITHUB_ENV
|
||||
|
||||
else
|
||||
|
@ -286,6 +286,12 @@ case "$1" in
|
|||
|
||||
# Build using ninja (~ 25 minutes on GitHub-hosted runner)
|
||||
build)
|
||||
# CUDA toolchain can be used implicitly by the compiler. Double check the location.
|
||||
if [[ "${GH_JOBNAME}" =~ (CUDA) ]]
|
||||
then
|
||||
which nvcc
|
||||
fi
|
||||
|
||||
cd ${GITHUB_WORKSPACE}/../qmcpack-build
|
||||
ninja
|
||||
;;
|
||||
|
|
Loading…
Reference in New Issue