Merge branch 'develop' into default-nlpp-deriv-on

This commit is contained in:
Ye Luo 2022-04-27 12:31:57 -05:00 committed by GitHub
commit a932157c0c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
41 changed files with 1287 additions and 215 deletions

View File

@ -2,6 +2,10 @@
Notable changes to QMCPACK are documented in this file.
## [Unreleased]
- Minimum CUDA version increased to 11.0 [\#3957](https://github.com/QMCPACK/qmcpack/pull/3957)
## [3.14.0] - 2022-04-06
This release focuses on performance improvements to the OpenMP target offload version for GPUs as well as ongoing minor

View File

@ -78,3 +78,18 @@ else(HAVE_MKL)
set(MKL_FOUND FALSE)
message(STATUS "MKL header files not found")
endif(HAVE_MKL)
# check for mkl_sycl
if(HAVE_MKL AND ENABLE_SYCL)
find_library(MKL_SYCL mkl_sycl
HINTS ${MKL_ROOT} $ENV{MKLROOT} $ENV{MKL_ROOT} $ENV{MKL_HOME}
PATH_SUFFIXES lib/intel64
REQUIRED
)
if(MKL_SYCL)
add_library(MKL::sycl INTERFACE IMPORTED)
target_include_directories(MKL::sycl INTERFACE "${MKL_INCLUDE}")
target_link_libraries(MKL::sycl INTERFACE ${MKL_SYCL})
endif()
endif()

View File

@ -126,8 +126,11 @@ if(NOT CMAKE_SYSTEM_NAME STREQUAL "CrayLinuxEnvironment")
endif() #(CMAKE_CXX_FLAGS MATCHES "-march=" AND CMAKE_C_FLAGS MATCHES "-march=")
else() #(CMAKE_CXX_FLAGS MATCHES "-march=" OR CMAKE_C_FLAGS MATCHES "-march=")
# use -march=native
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=native")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native")
# skipped in OneAPI 2022.0 when using SYCL which caused linking failure.
if (NOT (CMAKE_CXX_COMPILER_VERSION VERSION_EQUAL 2022.0 AND ENABLE_SYCL))
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=native")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native")
endif()
endif() #(CMAKE_CXX_FLAGS MATCHES "-march=" OR CMAKE_C_FLAGS MATCHES "-march=")
endif()

View File

@ -715,8 +715,12 @@ if(QMC_CUDA OR ENABLE_CUDA)
endif()
set(CMAKE_CUDA_STANDARD_REQUIRED TRUE)
set(CMAKE_CUDA_EXTENSIONS OFF)
set(CMAKE_CUDA_HOST_COMPILER ${CMAKE_CXX_COMPILER} CACHE STRING "nvcc host compiler passed via -ccbin")
if(NOT CMAKE_CUDA_FLAGS MATCHES "allow-unsupported-compiler")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --allow-unsupported-compiler")
endif()
enable_language(CUDA)
find_package(CUDAToolkit REQUIRED)
find_package(CUDAToolkit 11.0 REQUIRED)
if(NOT TARGET CUDA::cublas)
message(
FATAL_ERROR
@ -839,14 +843,21 @@ endif(ENABLE_HIP)
# set up SYCL compiler options and libraries
#-------------------------------------------------------------------
if(ENABLE_SYCL)
if(NOT (CMAKE_CXX_COMPILER_ID MATCHES "IntelLLVM" OR INTEL_ONEAPI_COMPILER_FOUND))
message(FATAL_ERROR "Only LLVM-based Intel compiler supports SYCL.")
# require 3.20 to recognize IntelLLVM compiler ID and check accurate version numbers.
if(CMAKE_VERSION VERSION_LESS 3.20.0)
message(FATAL_ERROR "ENABLE_SYCL require CMake 3.20.0 or later")
endif()
if(NOT CMAKE_CXX_COMPILER_ID MATCHES "IntelLLVM")
message(FATAL_ERROR "QMCPACK only supports SYCL with LLVM-based Intel compiler (icpx).")
endif()
add_library(SYCL::host INTERFACE IMPORTED)
add_library(SYCL::device INTERFACE IMPORTED)
find_package(IntelDPCPP REQUIRED CONFIGS IntelDPCPPConfig-modified.cmake PATHS ${PROJECT_CMAKE})
target_link_libraries(SYCL::host INTERFACE OneAPI::DPCPP-host)
target_link_libraries(SYCL::device INTERFACE OneAPI::DPCPP-device)
if(TARGET MKL::sycl)
target_link_libraries(MKL::sycl INTERFACE OneAPI::DPCPP-host)
endif()
endif(ENABLE_SYCL)
#-------------------------------------------------------------------

View File

@ -24,13 +24,14 @@ particular emphasis is placed on code quality and reproducibility.
* C++ 17 and C99 capable compilers.
* CMake v3.15.0 or later, build utility, http://www.cmake.org
* BLAS/LAPACK, numerical library. Use platform-optimized libraries.
* BLAS/LAPACK, numerical library. Use vendor and platform-optimized libraries.
* LibXml2, XML parser, http://xmlsoft.org/
* HDF5, portable I/O library, http://www.hdfgroup.org/HDF5/
* BOOST v1.61.0 or newer, peer-reviewed portable C++ source libraries, http://www.boost.org
* FFTW, FFT library, http://www.fftw.org/
* MPI, parallel library. Optional, but a near requirement for production calculations.
* Python3. Older versions are not supported as of January 2020.
* CUDA v11.0 or later. Optional, but required for builds with NVIDIA GPU support.
We aim to support open source compilers and libraries released within two years of each QMCPACK release. Use of software versions
over two years old may work but is discouraged and untested. Proprietary compilers (Intel, NVHPC) are generally supported over the

View File

@ -18,6 +18,7 @@ User's Guide and Developer's Manual
introduction
features
performance_portable
installation
running
units

View File

@ -118,6 +118,8 @@ Batched drivers check against ``max_seconds`` and make efforts to stop the execu
In addition, a file named ``id`` plus ``.STOP``, in this case ``vmc.STOP``, stops QMCPACK execution on the fly cleanly once being found in the working directory.
.. _driver-version-parameter:
Driver version
~~~~~~~~~~~~~~
The ``driver_version`` parameter selects between the new performance-portable batched drivers and the previous drivers (now referred to as the 'legacy drivers').

View File

@ -401,6 +401,9 @@ the path to the source directory.
See :ref:`Sanitizer-Libraries` for more information.
.. _offloadbuild:
Notes for OpenMP target offload to accelerators (experimental)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
QMCPACK is currently being updated to support OpenMP target offload and obtain performance
@ -1777,8 +1780,8 @@ for the creation of projectors in UPF can introduce severe errors and inaccuraci
.. _buildqe:
Installing and patching Quantum ESPRESSO
----------------------------------------
Installing Quantum ESPRESSO and pw2qmcpack
------------------------------------------
For trial wavefunctions obtained in a plane-wave basis, we mainly
support QE. Note that ABINIT and QBox were supported historically
@ -1789,6 +1792,10 @@ QE stores wavefunctions in a nonstandard internal
we have developed a converter---pw2qmcpack---which is an add-on to the
QE distribution.
Quantum ESPRESSO (<=6.8)
~~~~~~~~~~~~~~~~~~~~~~~~
To simplify the process of patching QE we have developed
a script that will automatically download and patch the source
code. The patches are specific to each version. For example, to download and
@ -1821,15 +1828,36 @@ the HDF5 capability enabled in either way:
The complete process is described in external\_codes/quantum\_espresso/README.
The tests involving pw.x and pw2qmcpack.x have been integrated into the test suite of QMCPACK.
By adding ``-D QE_BIN=your_QE_binary_path`` in the CMake command line when building your QMCPACK,
tests named with the "qe-" prefix will be included in the test set of your build.
You can test the whole ``pw > pw2qmcpack > qmcpack workflow`` by
Quantum ESPRESSO (6.7, 6.8 and 7.0)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
After patching the QE source code like above, users may use CMake instead of configure to build QE with pw2qmcpack.
Options needed to enable pw2qmcpack have been set ON by default.
A HDF5 library installation with Fortran support is required.
::
::
ctest -R qe
mkdir build_mpi
cd build_mpi
cmake -DCMAKE_C_COMPILER=mpicc -DCMAKE_Fortran_COMPILER=mpif90 ..
make -j 16
Quantum ESPRESSO (>7.0)
~~~~~~~~~~~~~~~~~~~~~~~
Due to incorporation of pw2qmcpack as a plugin, there is no longer any need to patch QE.
Users may use upstream QE and activate the plugin by specifying ``-DQE_ENABLE_PLUGINS=pw2qmcpack`` at the CMake configure step.
Full QE CMake documentation can be found at
https://gitlab.com/QEF/q-e/-/wikis/Developers/CMake-build-system .
::
mkdir build_mpi
cd build_mpi
cmake -DCMAKE_C_COMPILER=mpicc -DCMAKE_Fortran_COMPILER=mpif90 -DQE_ENABLE_PLUGINS=pw2qmcpack ..
make -j 16
Testing QE after installation
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Testing the QE to QMCPACK workflow after building QE and QMCPACK is highly recommended.
See :ref:`integtestqe` and the testing section for more details.
.. _buildperformance:

View File

@ -114,34 +114,22 @@ To continue a run, specify the ``mcwalkerset`` element before your VMC/DMC block
In the project id section, make sure that the series number is different from any existing ones to avoid overwriting them.
.. _batched_drivers:
Batched drivers
---------------
Under the Exascale Computing Project effort a new set of QMC drivers was developed
to eliminate the divergence of legacy CPU and GPU code paths at the QMC driver level and make the drivers CPU/GPU agnostic.
The divergence came from the fact that the CPU code path favors executing all the compute tasks within a step
for one walker and then advancing walker by walker. Multiple CPU threads process their own assigned walkers in parallel.
In this way, walkers are not synchronized with each other and maximal throughput can be achieved on CPU.
The GPU code path favors executing the same compute task over all the walkers together to maximize GPU throughput.
This compute dispatch pattern minimizes the overhead of dispatching computation and host-device data transfer.
However, the legacy GPU code path only leverages the OpenMP main host thread for handling
all the interaction between the host and GPUs, which limits the kernel dispatch capability.
In brief, the CPU code path handles computation with a walker batch size of one and many batches
while the GPU code path uses only one batch containing all the walkers.
The new drivers that implement this flexible batching scheme are called "batched drivers".
The batched drivers introduce a new concept, "crowd", as a sub-organization of walker population.
A crowd is a subset of the walkers that are operated on as a single batch.
Walkers within a crowd operate their computation in lock-step, which helps the GPU efficiency.
Walkers between crowds remain fully asynchronous unless operations involving the full population are needed.
Walkers in different crowds remain fully asynchronous unless operations involving the full population are needed.
With this flexible batching capability the new drivers are capable of delivering maximal performance on given hardware.
In the new driver design, all the batched API calls may fallback to an existing single walker implementation.
Consequently, batched drivers allow mixing and matching CPU-only and GPU-accelerated features
in a way that is not feasible with the legacy GPU implementation.
For OpenMP GPU offload users, batched drivers are essential to effectively use GPUs.
.. _transition_guide:

View File

@ -0,0 +1,64 @@
.. _performance_portable:
Performance Portable Implementation
===================================
The so-called performance portable implementation was developed to present a unified way to run QMC on CPU and GPU
systems, and eliminate the divergence between CPU and GPU code paths that had been introduced in the past, while still
maintaining high performance. This required generalizing all the driver inputs to potentially drive larger batches of
walkers and also eliminating ambiguities in the various input blocks of QMCPACK. Internally many new code paths have
been created, including new QMC drivers for VMC, DMC, and the wavefunction optimizer.
Once this implementation is sufficiently matured and enough features are available, the old non-performance portable
drivers will be deprecated and eventually deleted. The number of changes required to old input files is usually very
small, so use of the new performance portable implementation is encouraged, particularly for new projects.
The performance portable implementation load balances the total number of walkers onto MPI tasks, as per the old
drivers. The new implementation is then able to subdivide the walkers of each MPI task into multiple similarly-sized
crowds. The walkers in each crowd can then be updated simultaneously. This structure enables the walkers to be
efficiently mapped to both CPUs and GPUs. On CPU systems, they then are mapped to OpenMP threads where a single walker
can be computed efficiently by even a single thread. On GPU systems, large numbers of GPU threads must be used
concurrently for high efficiency: Each crowd is first owned by a distinct CPU thread, which in turn executes batched
operations over all the walkers in its crowd on the GPU. Provided the batches are sufficiently large, this facilitates
efficient GPU execution, while the use of multiple crowds can reduce synchronization and allow higher performance to be
obtained. For these reasons the new performance portable drivers are also referred to as batched drivers, since this is
the largest change from the older code.
The new implementation largely uses OpenMP offload for portability, although other technologies are also used and the
implementation has flexible dispatch to help obtain high performance on every platform.
This implementation was designed and implemented as part of the Exascale Computing Project, with a view to bringing
QMCPACK to GPUs from multiple vendors with high efficiency, while creating a codebase that is more maintainable and
easier to contribute to.
Links to more information in other sections of the manual:
- **Build instructions:** :ref:`OpenMP target offload <offloadbuild>` section of the :ref:`obtaininginstalling` chapter.
- **Supported features:** :ref:`gpufeatures` section of the :ref:`chap:features` chapter.
- **Enabling batch drivers** :ref:`driver-version-parameter` section of the :ref:`input-overview` chapter.
- **Driver Inputs:** :ref:`batched_drivers` section of the :ref:`qmcmethods` chapter.
Input files for batched drivers
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Use the following changes to update input files to use the batched drivers.
1. Update the project block with the ``driver_version`` parameter. For example:
::
<project id="vmc" series="0">
<parameter name="driver_version">batch</parameter>
</project>
See :ref:`driver-version-parameter` for more.
2. Modify the QMC algorithm blocks
The most significant change is the ``walkers`` parameter has been replaced with ``walkers_per_rank`` or ``total_walkers``.
See :ref:`batched_drivers` for details.

View File

@ -18,6 +18,9 @@
#if defined(ENABLE_CUDA)
#include "DualAllocator.hpp"
#include "CUDA/CUDAallocator.hpp"
#elif defined(ENABLE_SYCL)
#include "DualAllocator.hpp"
#include "SYCL/SYCLallocator.hpp"
#endif
#include "OhmmsPETE/OhmmsMatrix.h"
#include "OhmmsSoA/VectorSoaContainer.h"
@ -33,7 +36,10 @@ template<typename T>
using OffloadPinnedAllocator = OMPallocator<T, PinnedAlignedAllocator<T>>;
#if defined(ENABLE_CUDA)
template<typename T>
using CUDAPinnedAllocator = DualAllocator<T, CUDAAllocator<T>, PinnedAlignedAllocator<T>>;
using VendorDualPinnedAllocator = DualAllocator<T, CUDAAllocator<T>, PinnedAlignedAllocator<T>>;
#elif defined(ENABLE_SYCL)
template<typename T>
using VendorDualPinnedAllocator = DualAllocator<T, SYCLAllocator<T>, PinnedAlignedAllocator<T>>;
#endif
template<class OPA>
@ -109,9 +115,9 @@ TEST_CASE("OhmmsMatrix_VectorSoaContainer_View", "[Integration][Allocators]")
{
testDualAllocator<OffloadPinnedAllocator<double>>();
testDualAllocator<OffloadPinnedAllocator<std::complex<double>>>();
#if defined(ENABLE_CUDA)
testDualAllocator<CUDAPinnedAllocator<double>>();
testDualAllocator<CUDAPinnedAllocator<std::complex<double>>>();
#if defined(ENABLE_CUDA) || defined(ENABLE_SYCL)
testDualAllocator<VendorDualPinnedAllocator<double>>();
testDualAllocator<VendorDualPinnedAllocator<std::complex<double>>>();
#endif
}
} // namespace qmcplusplus

View File

@ -63,6 +63,7 @@ endif(ENABLE_ROCM)
if(ENABLE_SYCL)
add_subdirectory(SYCL)
target_link_libraries(platform_runtime PUBLIC platform_sycl_runtime)
target_link_libraries(platform_LA INTERFACE platform_sycl_LA)
endif(ENABLE_SYCL)
if(BUILD_UNIT_TESTS)

View File

@ -188,17 +188,6 @@ struct qmc_allocator_traits<qmcplusplus::CUDAAllocator<T>>
static const bool is_host_accessible = false;
static const bool is_dual_space = false;
static void fill_n(T* ptr, size_t n, const T& value) { qmcplusplus::CUDAfill_n(ptr, n, value); }
static void updateTo(CUDAAllocator<T>& alloc, T* host_ptr, size_t n)
{
T* device_ptr = alloc.getDevicePtr(host_ptr);
copyToDevice(device_ptr, host_ptr, n);
}
static void updateFrom(CUDAAllocator<T>& alloc, T* host_ptr, size_t n)
{
T* device_ptr = alloc.getDevicePtr(host_ptr);
copyFromDevice(host_ptr, device_ptr, n);
}
};
/** allocator for CUDA host pinned memory

View File

@ -57,7 +57,7 @@ void DeviceManager::initializeGlobalDeviceManager(int local_rank, int local_size
const DeviceManager& DeviceManager::getGlobal()
{
if (!global)
throw std::runtime_error("DeviceManager::getGlobal cannot access initialized the global instance.");
throw std::runtime_error("DeviceManager::getGlobal the global instance was not initialized.");
return *global;
}
} // namespace qmcplusplus

View File

@ -23,6 +23,8 @@
#include "PinnedAllocator.h"
#if defined(ENABLE_CUDA)
#include "CUDA/CUDAallocator.hpp"
#elif defined(ENABLE_SYCL)
#include "SYCL/SYCLallocator.hpp"
#endif
namespace qmcplusplus

View File

@ -22,9 +22,9 @@
#define QMCPLUSPLUS_DUAL_ALLOCATOR_ALIASES_HPP
#include "PinnedAllocator.h"
#if defined(ENABLE_CUDA) && !defined(ENABLE_OFFLOAD)
#if (defined(ENABLE_CUDA) || defined(ENABLE_SYCL)) && !defined(ENABLE_OFFLOAD)
#include "DualAllocator.hpp"
#if defined(ENABLE_CUDA)
namespace qmcplusplus
{
template<typename T>
@ -32,7 +32,19 @@ namespace qmcplusplus
template<typename T>
using PinnedDualAllocator = DualAllocator<T, CUDAAllocator<T>, PinnedAlignedAllocator<T>>;
}
#elif defined(ENABLE_SYCL)
namespace qmcplusplus
{
template<typename T>
using UnpinnedDualAllocator = DualAllocator<T, SYCLAllocator<T>, aligned_allocator<T>>;
template<typename T>
using PinnedDualAllocator = DualAllocator<T, SYCLAllocator<T>, PinnedAlignedAllocator<T>>;
}
#else
#error unhandled platform
#endif
#else // ENABLE_OFFLOAD or no CUDA or SYCL
#include "OMPTarget/OffloadAlignedAllocators.hpp"
namespace qmcplusplus
{

View File

@ -15,8 +15,10 @@
#include <memory>
#include "CPU/SIMD/aligned_allocator.hpp"
#ifdef ENABLE_CUDA
#if defined(ENABLE_CUDA)
#include "CUDA/CUDAallocator.hpp"
#elif defined(ENABLE_SYCL)
#include "SYCL/SYCLallocator.hpp"
#endif
namespace qmcplusplus
@ -24,15 +26,19 @@ namespace qmcplusplus
/** The fact that the pinned allocators are not always pinned hurts readability elsewhere. */
template<typename T>
#ifdef ENABLE_CUDA
#if defined(ENABLE_CUDA)
using PinnedAllocator = CUDALockedPageAllocator<T>;
#elif defined(ENABLE_SYCL)
using PinnedAllocator = SYCLHostAllocator<T>;
#else
using PinnedAllocator = std::allocator<T>;
#endif
template<typename T, size_t ALIGN = QMC_SIMD_ALIGNMENT>
#ifdef ENABLE_CUDA
#if defined(ENABLE_CUDA)
using PinnedAlignedAllocator = CUDALockedPageAllocator<T, aligned_allocator<T, ALIGN>>;
#elif defined(ENABLE_SYCL)
using PinnedAlignedAllocator = SYCLHostAllocator<T, ALIGN>;
#else
using PinnedAlignedAllocator = aligned_allocator<T, ALIGN>;
#endif

View File

@ -10,8 +10,13 @@
#//////////////////////////////////////////////////////////////////////////////////////
set(SYCL_RT_SRCS SYCLDeviceManager.cpp)
set(SYCL_RT_SRCS SYCLDeviceManager.cpp SYCLallocator.cpp SYCLruntime.cpp)
set(SYCL_LA_SRCS syclBLAS.cpp)
add_library(platform_sycl_runtime ${SYCL_RT_SRCS})
target_link_libraries(platform_sycl_runtime PUBLIC SYCL::host
PRIVATE platform_host_runtime)
add_library(platform_sycl_LA ${SYCL_LA_SRCS})
target_link_libraries(platform_sycl_LA PUBLIC platform_sycl_runtime MKL::sycl
PRIVATE SYCL::device platform_sycl_runtime)

View File

@ -16,6 +16,7 @@
#include <stdexcept>
#include <string>
#include <algorithm>
#include "config.h"
#include "OutputManager.h"
#include "determineDefaultDeviceNum.h"
#if defined(_OPENMP)
@ -27,7 +28,6 @@
namespace qmcplusplus
{
#if defined(_OPENMP)
/** create SYCL device/contexts from OpenMP owned ones to ensure interoperability.
* CUDA has the notion of primary context while SYCL requires explicitly sharing context.
@ -86,11 +86,20 @@ SYCLDeviceManager::SYCLDeviceManager(int& default_device_num, int& num_devices,
else if (default_device_num != sycl_default_device_num)
throw std::runtime_error("Inconsistent assigned SYCL devices with the previous record!");
default_device_queue =
sycl::queue(visible_devices[sycl_default_device_num].context, visible_devices[sycl_default_device_num].device);
default_device_queue = std::make_unique<sycl::queue>(visible_devices[sycl_default_device_num].context,
visible_devices[sycl_default_device_num].device);
}
}
std::unique_ptr<sycl::queue> SYCLDeviceManager::default_device_queue;
sycl::queue& SYCLDeviceManager::getDefaultDeviceQueue()
{
if (!default_device_queue)
throw std::runtime_error("SYCLDeviceManager::getDefaultDeviceQueue() the global instance not initialized.");
return *default_device_queue;
}
#if defined(_OPENMP)
static std::vector<struct syclDeviceInfo> xomp_get_sycl_devices()
{
@ -130,10 +139,9 @@ static std::vector<struct syclDeviceInfo> xomp_get_sycl_devices()
devices[id].device =
sycl::ext::oneapi::level_zero::make_device(sycl_platform, reinterpret_cast<pi_native_handle>(hDevice));
const sycl::context sycl_context =
sycl::ext::oneapi::level_zero::make_context({devices[id].device},
reinterpret_cast<pi_native_handle>(hContext),
true /* keep the ownership, no transfer */);
devices[id].context = sycl::ext::oneapi::level_zero::make_context({devices[id].device},
reinterpret_cast<pi_native_handle>(hContext),
true /* keep the ownership, no transfer */);
}
else if (omp_backend.find("opencl") == 0)
{

View File

@ -15,13 +15,12 @@
#ifndef QMCPLUSPLUS_SYCLDEVICEMANAGER_H
#define QMCPLUSPLUS_SYCLDEVICEMANAGER_H
#include <CL/sycl.hpp>
#include <vector>
#include "config.h"
#include <memory>
#include <CL/sycl.hpp>
namespace qmcplusplus
{
struct syclDeviceInfo
{
sycl::context context;
@ -34,7 +33,9 @@ class SYCLDeviceManager
{
int sycl_default_device_num;
std::vector<syclDeviceInfo> visible_devices;
sycl::queue default_device_queue;
/// the global singleton which can be used to access the default queue of the default device.
static std::unique_ptr<sycl::queue> default_device_queue;
public:
SYCLDeviceManager(int& default_device_num, int& num_devices, int local_rank, int local_size);
@ -42,10 +43,8 @@ public:
/** access the DeviceManager-owned default queue.
* Restrict the use of it to performance non-critical operations.
* Note: CUDA has a default queue but all the SYCL queues are explicit.
* Right now we return a copy of the default queue. Queues hold contexts and devices by referece.
* So making a copy is expected to be cheap. If this is not the case, we will find a cheap solution.
*/
sycl::queue getDefaultDeviceQueue() const { return default_device_queue; }
static sycl::queue& getDefaultDeviceQueue();
};
} // namespace qmcplusplus

View File

@ -0,0 +1,19 @@
//////////////////////////////////////////////////////////////////////////////////////
// This file is distributed under the University of Illinois/NCSA Open Source License.
// See LICENSE file in top directory for details.
//
// Copyright (c) 2022 QMCPACK developers.
//
// File developed by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
//
// File created by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
//////////////////////////////////////////////////////////////////////////////////////
#include <cstddef>
#include <atomic>
namespace qmcplusplus
{
// Global running tally (in bytes) of SYCL device memory currently allocated.
// Incremented/decremented by SYCLAllocator::allocate/deallocate (see SYCLallocator.hpp);
// atomic because allocations may happen concurrently from multiple host threads.
std::atomic<size_t> SYCLallocator_device_mem_allocated(0);
} // namespace qmcplusplus

View File

@ -0,0 +1,252 @@
//////////////////////////////////////////////////////////////////////////////////////
// This file is distributed under the University of Illinois/NCSA Open Source License.
// See LICENSE file in top directory for details.
//
// Copyright (c) 2022 QMCPACK developers.
//
// File developed by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
//
// File created by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
//////////////////////////////////////////////////////////////////////////////////////
// -*- C++ -*-
/** @file SYCLallocator.hpp
* this file provides three C++ memory allocators using SYCL specific memory allocation functions.
*
* SYCLManagedAllocator allocates SYCL shared memory
* SYCLAllocator allocates SYCL device memory
* SYCLHostAllocator allocates SYCL host memory
* They are based on CUDA*Allocator implementation
*/
#ifndef QMCPLUSPLUS_SYCL_ALLOCATOR_H
#define QMCPLUSPLUS_SYCL_ALLOCATOR_H
#include <memory>
#include <cstdlib>
#include <stdexcept>
#include <atomic>
#include <limits>
#include <CL/sycl.hpp>
#include "config.h"
#include "allocator_traits.hpp"
#include "SYCLruntime.hpp"
namespace qmcplusplus
{
extern std::atomic<size_t> SYCLallocator_device_mem_allocated;
inline size_t getSYCLdeviceMemAllocated() { return SYCLallocator_device_mem_allocated; }
/** allocator for SYCL shared (unified) memory
 * @tparam T data type
 * @tparam ALIGN alignment in bytes
 *
 * Allocations come from sycl::aligned_alloc_shared on the default device's
 * default queue, so the memory is accessible from both host and device.
 */
template<typename T, size_t ALIGN = QMC_SIMD_ALIGNMENT>
struct SYCLSharedAllocator
{
  using value_type    = T;
  using size_type     = size_t;
  using pointer       = T*;
  using const_pointer = const T*;

  static constexpr size_t alignment = ALIGN;

  SYCLSharedAllocator() = default;
  template<class U>
  SYCLSharedAllocator(const SYCLSharedAllocator<U>&)
  {}

  template<class U>
  struct rebind
  {
    using other = SYCLSharedAllocator<U>;
  };

  T* allocate(std::size_t n) { return sycl::aligned_alloc_shared<T>(ALIGN, n, getSYCLDefaultDeviceDefaultQueue()); }

  void deallocate(T* p, std::size_t) { sycl::free(p, getSYCLDefaultDeviceDefaultQueue()); }
};

// The allocator carries no state, so any two instances are interchangeable.
template<class T1, class T2>
bool operator==(const SYCLSharedAllocator<T1>&, const SYCLSharedAllocator<T2>&)
{
  return true;
}

template<class T1, class T2>
bool operator!=(const SYCLSharedAllocator<T1>& lhs, const SYCLSharedAllocator<T2>& rhs)
{
  return !(lhs == rhs);
}
/** allocator for SYCL device memory
 * @tparam T data type
 * @tparam ALIGN alignment in bytes
 *
 * using this with something other than Ohmms containers?
 * -- use caution, write unit tests! --
 * It's not tested beyond use in some unit tests using std::vector with constant size.
 * SYCLAllocator appears to meet all the nonoptional requirements of a c++ Allocator.
 *
 * Some of the default implementations in std::allocator_traits
 * of optional Allocator requirements may cause runtime or compilation failures.
 * They assume there is only one memory space and that the host has access to it.
 */
template<typename T, size_t ALIGN = 64>
class SYCLAllocator
{
public:
  typedef T value_type;
  typedef size_t size_type;
  typedef T* pointer;
  typedef const T* const_pointer;

  static constexpr size_t alignment = ALIGN;

  SYCLAllocator() = default;
  template<class U>
  SYCLAllocator(const SYCLAllocator<U>&)
  {}
  template<class U>
  struct rebind
  {
    typedef SYCLAllocator<U> other;
  };

  /// allocate n objects of device memory and add the byte count to the global tally
  T* allocate(std::size_t n)
  {
    T* pt = sycl::aligned_alloc_device<T>(ALIGN, n, getSYCLDefaultDeviceDefaultQueue());
    SYCLallocator_device_mem_allocated += n * sizeof(T);
    return pt;
  }
  /// release device memory and subtract the byte count from the global tally
  void deallocate(T* p, std::size_t n)
  {
    sycl::free(p, getSYCLDefaultDeviceDefaultQueue());
    SYCLallocator_device_mem_allocated -= n * sizeof(T);
  }

  /** Provide a construct for std::allocator_traits::construct to call.
   * Don't do anything on construct, pointer p is on the device!
   *
   * For example std::vector calls this to default initialize each element. You'll segfault
   * if std::allocator_traits::construct tries doing that at p.
   *
   * The standard is a bit confusing on this point. Implementing this is an optional requirement
   * of Allocator from C++11 on, it's not slated to be removed.
   *
   * It's deprecated for the std::allocator in c++17 and will be removed in c++20. But we are not implementing
   * std::allocator.
   *
   * STL containers only use Allocators through allocator_traits and std::allocator_traits handles the case
   * where no construct method is present in the Allocator.
   * But std::allocator_traits will call the Allocators construct method if present.
   */
  template<class U, class... Args>
  static void construct(U* p, Args&&... args)
  {}

  /** Give std::allocator_traits something to call.
   * The default if this isn't present is to call p->~T() which
   * we can't do on device memory.
   */
  template<class U>
  static void destroy(U* p)
  {}

  /// blocking host-to-device copy of n objects via the default queue (waits for completion)
  void copyToDevice(T* device_ptr, T* host_ptr, size_t n)
  {
    getSYCLDefaultDeviceDefaultQueue().memcpy(device_ptr, host_ptr, n * sizeof(T)).wait();
  }
  /// blocking device-to-host copy of n objects via the default queue (waits for completion)
  void copyFromDevice(T* host_ptr, T* device_ptr, size_t n)
  {
    getSYCLDefaultDeviceDefaultQueue().memcpy(host_ptr, device_ptr, n * sizeof(T)).wait();
  }
  /// blocking device-to-device copy of n objects via the default queue (waits for completion)
  void copyDeviceToDevice(T* to_ptr, size_t n, T* from_ptr)
  {
    getSYCLDefaultDeviceDefaultQueue().memcpy(to_ptr, from_ptr, n * sizeof(T)).wait();
  }
};
// SYCLAllocator is stateless, so equality is unconditional and inequality
// is defined in terms of equality.
template<class T1, class T2>
bool operator==(const SYCLAllocator<T1>&, const SYCLAllocator<T2>&)
{
  return true;
}

template<class T1, class T2>
bool operator!=(const SYCLAllocator<T1>& lhs, const SYCLAllocator<T2>& rhs)
{
  return !(lhs == rhs);
}
/** Partial specialization of the allocator traits for SYCLAllocator.
 * Device memory is not host accessible, and this allocator by itself is
 * not a dual-space allocator.
 */
template<typename T>
struct qmc_allocator_traits<qmcplusplus::SYCLAllocator<T>>
{
  static const bool is_host_accessible = false;
  static const bool is_dual_space      = false;
  // TODO: needs a real device-side fill (e.g. a SYCLfill_n built on queue::fill);
  // currently a silent no-op, so freshly "filled" device memory stays uninitialized.
  static void fill_n(T* ptr, size_t n, const T& value)
  {
    //THINK
    //qmcplusplus::SYCLfill_n(ptr, n, value);
  }
  // NOTE(review): updateTo/updateFrom helpers removed — they called
  // SYCLAllocator::getDevicePtr(), which SYCLAllocator does not declare, so any
  // instantiation would fail to compile. The analogous helpers were removed from
  // the CUDAAllocator traits in this same change.
};
/** allocator for SYCL host pinned memory
 * @tparam T data type
 * @tparam ALIGN alignment in bytes
 *
 * Allocations come from sycl::aligned_alloc_host on the default device's
 * default queue; the memory lives on the host but is pinned for fast transfers.
 */
template<typename T, size_t ALIGN = QMC_SIMD_ALIGNMENT>
struct SYCLHostAllocator
{
  using value_type    = T;
  using size_type     = size_t;
  using pointer       = T*;
  using const_pointer = const T*;

  static constexpr size_t alignment = ALIGN;

  SYCLHostAllocator() = default;
  template<class U>
  SYCLHostAllocator(const SYCLHostAllocator<U>&)
  {}

  template<class U>
  struct rebind
  {
    using other = SYCLHostAllocator<U>;
  };

  T* allocate(std::size_t n) { return sycl::aligned_alloc_host<T>(ALIGN, n, getSYCLDefaultDeviceDefaultQueue()); }

  void deallocate(T* p, std::size_t) { sycl::free(p, getSYCLDefaultDeviceDefaultQueue()); }
};

// Stateless allocator: all instances are interchangeable.
template<class T1, class T2>
bool operator==(const SYCLHostAllocator<T1>&, const SYCLHostAllocator<T2>&)
{
  return true;
}

template<class T1, class T2>
bool operator!=(const SYCLHostAllocator<T1>& lhs, const SYCLHostAllocator<T2>& rhs)
{
  return !(lhs == rhs);
}
} // namespace qmcplusplus
#endif

View File

@ -0,0 +1,19 @@
//////////////////////////////////////////////////////////////////////////////////////
// This file is distributed under the University of Illinois/NCSA Open Source License.
// See LICENSE file in top directory for details.
//
// Copyright (c) 2022 QMCPACK developers.
//
// File developed by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
//
// File created by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
//////////////////////////////////////////////////////////////////////////////////////
#include <CL/sycl.hpp>
#include "SYCLDeviceManager.h"
#include "SYCLruntime.hpp"
namespace qmcplusplus
{
// Free-function access to the DeviceManager-owned default queue, so callers can
// avoid a direct dependency on SYCLDeviceManager. Throws (via getDefaultDeviceQueue)
// if the global device manager has not been initialized.
// NOTE(review): returns the queue by value; sycl::queue is presumably a cheap
// shared handle — confirm before use in performance-critical paths.
sycl::queue getSYCLDefaultDeviceDefaultQueue() { return SYCLDeviceManager::getDefaultDeviceQueue(); }
} // namespace qmcplusplus

View File

@ -0,0 +1,22 @@
//////////////////////////////////////////////////////////////////////////////////////
// This file is distributed under the University of Illinois/NCSA Open Source License.
// See LICENSE file in top directory for details.
//
// Copyright (c) 2022 QMCPACK developers.
//
// File developed by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
//
// File created by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
//////////////////////////////////////////////////////////////////////////////////////
#ifndef QMCPLUSPLUS_SYCL_RUNTIME_H
#define QMCPLUSPLUS_SYCL_RUNTIME_H
#include <CL/sycl.hpp>
namespace qmcplusplus
{
/// return (by value) the default queue of the default SYCL device
sycl::queue getSYCLDefaultDeviceDefaultQueue();
} // namespace qmcplusplus
#endif

View File

@ -0,0 +1,100 @@
//////////////////////////////////////////////////////////////////////////////////////
// This file is distributed under the University of Illinois/NCSA Open Source License.
// See LICENSE file in top directory for details.
//
// Copyright (c) 2022 QMCPACK developers.
//
// File developed by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
//
// File created by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
//////////////////////////////////////////////////////////////////////////////////////
#include "syclBLAS.hpp"
#include "oneapi/mkl/blas.hpp"
namespace qmcplusplus
{
namespace syclBLAS
{
/** Translate a BLAS-style character transpose flag into the oneMKL enum.
 *
 *  Accepts upper or lower case. 'C'/'c' maps to conjugate transpose, which
 *  matters for the complex-valued instantiations below (the previous mapping
 *  silently treated 'C' as no-transpose). Any other character is treated as
 *  no transpose, matching the old behavior for 'N'.
 */
inline oneapi::mkl::transpose convertTransEnum(char trans)
{
  if (trans == 'T' || trans == 't')
    return oneapi::mkl::transpose::trans;
  if (trans == 'C' || trans == 'c')
    return oneapi::mkl::transpose::conjtrans;
  return oneapi::mkl::transpose::nontrans;
}
/** Enqueue a general matrix-vector multiply, y = alpha * op(A) * x + beta * y,
 *  on the given SYCL queue via oneMKL.
 *
 *  Dimensions follow the oneMKL BLAS (column-major) convention.
 *  @param events dependencies the computation must wait on
 *  @return the event of the enqueued gemv task
 */
template<typename T>
sycl::event gemv(sycl::queue& handle,
                 const char trans,
                 const int m,
                 const int n,
                 const T alpha,
                 const T* const A,
                 const int lda,
                 const T* const x,
                 const int incx,
                 const T beta,
                 T* const y,
                 const int incy,
                 const std::vector<sycl::event>& events)
{
  const oneapi::mkl::transpose mkl_trans = convertTransEnum(trans);
  return oneapi::mkl::blas::gemv(handle, mkl_trans, m, n, alpha, A, lda, x, incx, beta, y, incy, events);
}
// Explicit template instantiations for the four supported element types:
// double, float, std::complex<double>, std::complex<float>.
template sycl::event gemv(sycl::queue& handle,
                          const char trans,
                          const int m,
                          const int n,
                          const double alpha,
                          const double* const A,
                          const int lda,
                          const double* const x,
                          const int incx,
                          const double beta,
                          double* const y,
                          const int incy,
                          const std::vector<sycl::event>& events);
template sycl::event gemv(sycl::queue& handle,
                          const char trans,
                          const int m,
                          const int n,
                          const float alpha,
                          const float* const A,
                          const int lda,
                          const float* const x,
                          const int incx,
                          const float beta,
                          float* const y,
                          const int incy,
                          const std::vector<sycl::event>& events);
template sycl::event gemv(sycl::queue& handle,
                          const char trans,
                          const int m,
                          const int n,
                          const std::complex<double> alpha,
                          const std::complex<double>* const A,
                          const int lda,
                          const std::complex<double>* const x,
                          const int incx,
                          const std::complex<double> beta,
                          std::complex<double>* const y,
                          const int incy,
                          const std::vector<sycl::event>& events);
template sycl::event gemv(sycl::queue& handle,
                          const char trans,
                          const int m,
                          const int n,
                          const std::complex<float> alpha,
                          const std::complex<float>* const A,
                          const int lda,
                          const std::complex<float>* const x,
                          const int incx,
                          const std::complex<float> beta,
                          std::complex<float>* const y,
                          const int incy,
                          const std::vector<sycl::event>& events);
} // namespace syclBLAS
} // namespace qmcplusplus

View File

@ -0,0 +1,44 @@
//////////////////////////////////////////////////////////////////////////////////////
// This file is distributed under the University of Illinois/NCSA Open Source License.
// See LICENSE file in top directory for details.
//
// Copyright (c) 2022 QMCPACK developers.
//
// File developed by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
//
// File created by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
//////////////////////////////////////////////////////////////////////////////////////
#ifndef QMCPLUSPLUS_SYCL_BLAS_H
#define QMCPLUSPLUS_SYCL_BLAS_H

#include <complex>
#include <cstdint> // std::int64_t (was relied on transitively)
#include <vector>  // std::vector in the gemv signature (was relied on transitively)
#include <CL/sycl.hpp>

namespace qmcplusplus
{
namespace syclBLAS
{
using syclBLAS_int    = std::int64_t;
using syclBLAS_status = sycl::event;
using syclBLAS_handle = sycl::queue;

/** Enqueue y = alpha * op(A) * x + beta * y on the given SYCL queue.
 *  Dimensions follow the column-major BLAS convention (see syclBLAS.cpp).
 *  @param events dependencies to wait on before executing
 *  @return the event of the enqueued computation
 */
template<typename T>
sycl::event gemv(sycl::queue& handle,
                 const char trans,
                 const int m,
                 const int n,
                 const T alpha,
                 const T* const A,
                 const int lda,
                 const T* const x,
                 const int incx,
                 const T beta,
                 T* const y,
                 const int incy,
                 const std::vector<sycl::event>& events = {});
} // namespace syclBLAS
} // namespace qmcplusplus
#endif // QMCPLUSPLUS_SYCL_BLAS_H

View File

@ -15,6 +15,10 @@ if(ENABLE_CUDA)
add_subdirectory(CUDA)
endif()
if(ENABLE_SYCL)
add_subdirectory(SYCL)
endif()
if(ENABLE_OFFLOAD)
add_subdirectory(OMPTarget)
endif(ENABLE_OFFLOAD)

View File

@ -13,13 +13,6 @@ set(SRC_DIR CUDA)
set(UTEST_EXE test_${SRC_DIR})
set(UTEST_NAME deterministic-unit_test_${SRC_DIR})
if(NOT QMC_CUDA2HIP)
add_library(cuda_device_value_test_kernels test_device_value_kernels.cu)
else()
hip_add_library(cuda_device_value_test_kernels test_device_value_kernels.cu)
endif()
target_link_libraries(cuda_device_value_test_kernels PUBLIC platform_runtime)
add_executable(${UTEST_EXE} test_CUDAallocator.cpp)
target_link_libraries(${UTEST_EXE} platform_runtime containers catch_main)

View File

@ -1,85 +0,0 @@
//////////////////////////////////////////////////////////////////////////////////////
// This file is distributed under the University of Illinois/NCSA Open Source License.
// See LICENSE file in top directory for details.
//
// Copyright (c) 2021 QMCPACK developers.
//
// File developed by: Peter Doak, doakpw@ornl.gov, Oak Ridge National Laboratory
//
// File created by: Peter Doak, doakpw@ornl.gov, Oak Ridge National Laboratory
//////////////////////////////////////////////////////////////////////////////////////
#include "test_device_value_kernels.hpp"
#include "CUDA/CUDAallocator.hpp"
namespace qmcplusplus
{
namespace testing
{
/** checking an on device pointer's value against a passed value
*/
template<typename T>
__global__ void checkValue_kernel(T* device_value_ptr, const T value, bool* result)
{
if (*device_value_ptr == value)
*result = true;
else
*result = false;
}
/** checking an on device pointer's value against a passed value
*/
template<typename T>
cudaError_t checkValueCUDA(cudaStream_t hstream, T* device_value_ptr, T value, bool& result)
{
CUDAAllocator<bool> bool_allocator;
CUDAHostAllocator<bool> host_bool_allocator;
bool* device_result = bool_allocator.allocate(1);
bool* host_result = host_bool_allocator.allocate(1);
dim3 dim_block(1);
dim3 dim_grid(1);
checkValue_kernel<<<dim_grid, dim_block, 0, hstream>>>(device_value_ptr, value, device_result);
cudaCheck(cudaStreamSynchronize(hstream));
cudaError_t kernel_error = cudaPeekAtLastError();
cudaCheck(cudaMemcpyAsync(host_result, device_result, sizeof(bool), cudaMemcpyDeviceToHost, hstream));
cudaCheck(cudaStreamSynchronize(hstream));
result = *host_result;
bool_allocator.deallocate(device_result, 1);
host_bool_allocator.deallocate(host_result, 1);
return kernel_error;
}
__global__ void checkDualStruct_kernel(DualStruct* device_struct_ptr, const DualStruct dual_struct, bool* result)
{
if (device_struct_ptr->index == dual_struct.index && device_struct_ptr->value == dual_struct.value)
*result = true;
else
*result = false;
}
/** check a particular test structure at device pointer against passed by value struct
*/
cudaError_t checkDualStruct(cudaStream_t hstream, DualStruct* device_struct_ptr, DualStruct dual_struct, bool& result)
{
CUDAAllocator<bool> bool_allocator;
CUDAHostAllocator<bool> host_bool_allocator;
bool* device_result = bool_allocator.allocate(1);
bool* host_result = host_bool_allocator.allocate(1);
dim3 dim_block(1);
dim3 dim_grid(1);
checkDualStruct_kernel<<<dim_grid, dim_block, 0, hstream>>>(device_struct_ptr, dual_struct, device_result);
cudaCheck(cudaStreamSynchronize(hstream));
cudaError_t kernel_error = cudaPeekAtLastError();
cudaCheck(cudaMemcpyAsync(host_result, device_result, sizeof(bool), cudaMemcpyDeviceToHost, hstream));
cudaCheck(cudaStreamSynchronize(hstream));
result = *host_result;
bool_allocator.deallocate(device_result, 1);
host_bool_allocator.deallocate(host_result, 1);
return kernel_error;
}
template cudaError_t checkValueCUDA(cudaStream_t hstream, double* device_value_ptr, double value, bool& result);
} // namespace testing
} // namespace qmcplusplus

View File

@ -1,38 +0,0 @@
//////////////////////////////////////////////////////////////////////////////////////
// This file is distributed under the University of Illinois/NCSA Open Source License.
// See LICENSE file in top directory for details.
//
// Copyright (c) 2021 QMCPACK developers.
//
// File developed by: Peter Doak, doakpw@ornl.gov, Oak Ridge National Laboratory
//
// File created by: Peter Doak, doakpw@ornl.gov, Oak Ridge National Laboratory
//////////////////////////////////////////////////////////////////////////////////////
#ifndef QMCPLUSPLUS_TEST_DEVICE_VALUES_KERNELS_HPP
#define QMCPLUSPLUS_TEST_DEVICE_VALUES_KERNELS_HPP
#include "CUDA/CUDAruntime.hpp"
namespace qmcplusplus
{
namespace testing
{
template<typename T>
cudaError_t checkValueCUDA(cudaStream_t hstream, T* device_value_ptr, T value, bool& result);
/** just an arbitrary struct for testing */
struct DualStruct
{
int index;
double value;
};
cudaError_t checkDualStruct(cudaStream_t hstream, DualStruct* device_struct_ptr, DualStruct dual_struct, bool& result);
extern template cudaError_t checkValueCUDA(cudaStream_t hstream, double* device_value_ptr, double value, bool& result);
} // namespace testing
} // namespace qmcplusplus
#endif

View File

@ -0,0 +1,29 @@
#//////////////////////////////////////////////////////////////////////////////////////
#// This file is distributed under the University of Illinois/NCSA Open Source License.
#// See LICENSE file in top directory for details.
#//
#// Copyright (c) 2022 QMCPACK developers.
#//
#// File developed by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
#//
#// File created by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
#//////////////////////////////////////////////////////////////////////////////////////
## SYCL runtime / allocator unit test (exercises SYCLAllocator and SYCLHostAllocator)
set(UTEST_EXE test_sycl)
set(UTEST_NAME deterministic-unit_${UTEST_EXE})
add_executable(${UTEST_EXE} test_SYCLallocator.cpp)
target_link_libraries(${UTEST_EXE} SYCL::device platform_runtime containers catch_main)
add_unit_test(${UTEST_NAME} 1 1 $<TARGET_FILE:${UTEST_EXE}>)
## SYCL BLAS unit test (gemv against the CPU reference implementation)
set(UTEST_EXE test_sycl_blas)
set(UTEST_NAME deterministic-unit_${UTEST_EXE})
add_executable(${UTEST_EXE} test_syclBLAS.cpp)
target_link_libraries(${UTEST_EXE} catch_main containers platform_LA)
add_unit_test(${UTEST_NAME} 1 1 $<TARGET_FILE:${UTEST_EXE}>)

View File

@ -0,0 +1,74 @@
//////////////////////////////////////////////////////////////////////////////////////
// This file is distributed under the University of Illinois/NCSA Open Source License.
// See LICENSE file in top directory for details.
//
// Copyright (c) 2019 QMCPACK developers.
//
// File developed by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
//
// File created by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
//////////////////////////////////////////////////////////////////////////////////////
#include "catch.hpp"
#include <memory>
#include <iostream>
#include "SYCL/SYCLallocator.hpp"
#include "OhmmsPETE/OhmmsVector.h"
namespace qmcplusplus
{
TEST_CASE("SYCL_allocator", "[SYCL]")
{
  // Fill a device-resident vector (SYCLAllocator) inside a kernel, then copy
  // the data back and verify it on the host.
  sycl::queue queue = getSYCLDefaultDeviceDefaultQueue();
  Vector<double, SYCLAllocator<double>> device_vec(1024);
  Vector<double> host_vec(1024);
  double* device_ptr = device_vec.data();
  sycl::event fill_done =
      queue.parallel_for(sycl::range<1>{1024}, [=](sycl::id<1> idx) { device_ptr[idx] = idx + 1; });
  // copy to host, ordered after the fill kernel
  queue.memcpy(host_vec.data(), device_vec.data(), 1024 * sizeof(double), {fill_done}).wait();
  CHECK(host_vec[0] == 1);
  CHECK(host_vec[77] == 78);
}
TEST_CASE("SYCL_host_allocator", "[SYCL]")
{
  // Memory from SYCLHostAllocator is addressable from a kernel: increment it
  // on the device and check the result directly on the host.
  sycl::queue queue = getSYCLDefaultDeviceDefaultQueue();
  Vector<double, SYCLHostAllocator<double>> vec(1024, 1); // initialized to 1
  double* ptr = vec.data();
  queue.parallel_for(sycl::range<1>{1024}, [=](sycl::id<1> idx) { ptr[idx] += idx + 1; }).wait();
  CHECK(vec[0] == 2);
  CHECK(vec[77] == 79);
}
/*
TEST_CASE("SYCL_shared_allocator", "[SYCL]")
{
sycl::queue m_queue = getSYCLDefaultDeviceDefaultQueue();
Vector<double, SYCLSharedAllocator<double>> vec(1024);
std::cout << "Size " << vec.size() << std::endl;
{
double* V = vec.data();
m_queue.parallel_for(sycl::range<1>{1024}, [=](sycl::id<1> item) { V[item] = item + 1; }).wait();
}
CHECK(vec[0] == 1);
CHECK(vec[77] == 78);
}
*/
} // namespace qmcplusplus

View File

@ -0,0 +1,94 @@
//////////////////////////////////////////////////////////////////////////////////////
// This file is distributed under the University of Illinois/NCSA Open Source License.
// See LICENSE file in top directory for details.
//
// Copyright (c) 2021 QMCPACK developers.
//
// File developed by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
//
// File created by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
//////////////////////////////////////////////////////////////////////////////////////
#include "catch.hpp"
#include <memory>
#include <vector>
#include <iostream>
#include "OMPTarget/OMPallocator.hpp"
#include "SYCL/SYCLruntime.hpp"
#include "SYCL/SYCLallocator.hpp"
#include "SYCL/syclBLAS.hpp"
#include <OhmmsPETE/OhmmsVector.h>
#include <OhmmsPETE/OhmmsMatrix.h>
#include "CPU/BLAS.hpp"
namespace qmcplusplus
{
/** Compare syclBLAS::gemv against the CPU BLAS reference.
 *  @param M_b   rows of the row-major input matrix B
 *  @param N_b   columns of B
 *  @param trans 'T' computes B * A (result length M_b); otherwise B^T * A
 */
template<typename T, typename Alloc>
void test_gemv(const int M_b, const int N_b, const char trans)
{
  const int M = trans == 'T' ? M_b : N_b;
  const int N = trans == 'T' ? N_b : M_b;
  using vec_t = Vector<T, Alloc>;
  using mat_t = Matrix<T, Alloc>;
  sycl::queue handle = getSYCLDefaultDeviceDefaultQueue();
  vec_t A(N);        // Input vector
  mat_t B(M_b, N_b); // Input matrix
  vec_t C(M);        // Result vector from syclBLAS
  vec_t D(M);        // Result vector from reference CPU BLAS
  // Fill data
  for (int i = 0; i < N; i++)
    A[i] = i;
  for (int j = 0; j < M_b; j++)
    for (int i = 0; i < N_b; i++)
      B[j][i] = i + j * 2;
  // Poison C and D so a no-op implementation cannot pass (beta = 0 must overwrite).
  for (int i = 0; i < M; i++)
    C[i] = D[i] = T(-0.1);
  A.updateTo();
  B.updateTo();
  T alpha(1);
  T beta(0);
  // in Fortran, B[M][N] is viewed as B^T
  // when trans == 'T', the actual calculation is B * A[N] = C[M]
  // when trans == 'N', the actual calculation is B^T * A[M] = C[N]
  // Column-major dimensions of B are therefore N_b x M_b with lda = N_b;
  // the first dimension argument must be N_b (was incorrectly M_b).
  syclBLAS::gemv(handle, trans, N_b, M_b, alpha, B.device_data(), N_b, A.device_data(), 1, beta, C.device_data(), 1)
      .wait();
  C.updateFrom();
  if (trans == 'T')
    BLAS::gemv_trans(M_b, N_b, B.data(), A.data(), D.data());
  else
    BLAS::gemv(M_b, N_b, B.data(), A.data(), D.data());
  for (int index = 0; index < M; index++)
    CHECK(C[index] == D[index]);
}
TEST_CASE("OmpSYCL gemv", "[SYCL]")
{
  const int M = 137;
  const int N = 79;
  // Non-batched test; only the 'T' (transpose) case is exercised here.
  // (Removed unused batch_count local.)
  std::cout << "Testing TRANS gemv" << std::endl;
#if defined(ENABLE_OFFLOAD)
  test_gemv<float, OMPallocator<float>>(M, N, 'T');
  test_gemv<double, OMPallocator<double>>(M, N, 'T');
#endif
}
} // namespace qmcplusplus

View File

@ -419,6 +419,32 @@ void SpinorSet::evaluate_spin(const ParticleSet& P, int iat, ValueVector& psi, V
dpsi = eye * (eis * psi_work_up - emis * psi_work_down);
}
void SpinorSet::evaluateGradSource(const ParticleSet& P,
                                   int first,
                                   int last,
                                   const ParticleSet& source,
                                   int iat_src,
                                   GradMatrix& gradphi)
{
  IndexType nelec = P.getTotalNum();
  // Gradients of the up and down orbital channels w.r.t. the source (ion)
  // position iat_src, evaluated by the underlying SPOSets.
  GradMatrix gradphi_up(nelec, OrbitalSetSize);
  GradMatrix gradphi_dn(nelec, OrbitalSetSize);
  spo_up->evaluateGradSource(P, first, last, source, iat_src, gradphi_up);
  spo_dn->evaluateGradSource(P, first, last, source, iat_src, gradphi_dn);
  // Combine the two channels with each target particle's spin phase factors
  // e^{+is} (up) and e^{-is} (down).
  for (int iat = 0; iat < nelec; iat++)
  {
    ParticleSet::Scalar_t s = P.activeSpin(iat);
    RealType coss = std::cos(s);
    RealType sins = std::sin(s);
    ValueType eis(coss, sins);
    ValueType emis(coss, -sins);
    for (int imo = 0; imo < OrbitalSetSize; imo++)
      gradphi(iat, imo) = gradphi_up(iat, imo) * eis + gradphi_dn(iat, imo) * emis;
  }
}
std::unique_ptr<SPOSet> SpinorSet::makeClone() const
{
auto myclone = std::make_unique<SpinorSet>();

View File

@ -129,6 +129,22 @@ public:
*/
void evaluate_spin(const ParticleSet& P, int iat, ValueVector& psi, ValueVector& dpsi) override;
/** evaluate the gradients of this single-particle orbital
* for [first,last) target particles with respect to the given source particle
* @param P current ParticleSet
* @param first starting index of the particles
* @param last ending index of the particles
* @param iat_src source particle index
* @param gradphi gradients
*
*/
virtual void evaluateGradSource(const ParticleSet& P,
int first,
int last,
const ParticleSet& source,
int iat_src,
GradMatrix& gradphi) override;
std::unique_ptr<SPOSet> makeClone() const override;
private:

View File

@ -23,6 +23,7 @@ set(UTEST_HDF_INPUT6 ${qmcpack_SOURCE_DIR}/src/QMCWaveFunctions/tests/lcao_spino
set(UTEST_HDF_INPUT7 ${qmcpack_SOURCE_DIR}/tests/molecules/LiH_ae_MSD/LiH.orbs.h5)
set(UTEST_HDF_INPUT8 ${qmcpack_SOURCE_DIR}/tests/molecules/LiH_ae_MSD/LiH.Multidet.h5)
set(UTEST_HDF_INPUT9 ${qmcpack_SOURCE_DIR}/tests/converter/test_Bi_dirac/gold.orbs.h5)
set(UTEST_HDF_INPUT10 ${qmcpack_SOURCE_DIR}/src/QMCWaveFunctions/tests/lcao_spinor_molecule.h5)
maybe_symlink(${UTEST_HDF_INPUT0} ${UTEST_DIR}/diamondC_1x1x1.pwscf.h5)
maybe_symlink(${UTEST_HDF_INPUT1} ${UTEST_DIR}/diamondC_2x1x1.pwscf.h5)
@ -34,6 +35,7 @@ maybe_symlink(${UTEST_HDF_INPUT6} ${UTEST_DIR}/lcao_spinor.h5)
maybe_symlink(${UTEST_HDF_INPUT7} ${UTEST_DIR}/LiH.orbs.h5)
maybe_symlink(${UTEST_HDF_INPUT8} ${UTEST_DIR}/LiH.Multidet.h5)
maybe_symlink(${UTEST_HDF_INPUT9} ${UTEST_DIR}/Bi.orbs.h5)
maybe_symlink(${UTEST_HDF_INPUT10} ${UTEST_DIR}/lcao_spinor_molecule.h5)
set(FILES_TO_COPY
he_sto3g.wfj.xml

Binary file not shown.

View File

@ -0,0 +1,231 @@
import h5py
import numpy as np
from scipy.special import sph_harm, factorial2
def write_h5_file():
    """Write lcao_spinor_molecule.h5: two H centers, one gaussian basis
    function per center, and a single two-component (spinor) orbital."""

    def write_string(group, name, value):
        # QMCPACK reads fixed-length ascii string datasets of shape (1,)
        group.create_dataset(name, (1,), "S" + str(len(value)),
                             [value.encode("ascii", "ignore")])

    hf = h5py.File('lcao_spinor_molecule.h5', 'w')

    # atoms
    atoms = hf.create_group('atoms')
    atoms.create_dataset('number_of_atoms', data=np.array([2]))
    atoms.create_dataset('number_of_species', data=np.array([1]))
    atoms.create_dataset('positions', data=np.array([[0.1, 0.2, 0.3], [-0.3, -0.2, -0.1]]))
    atoms.create_dataset('species_ids', data=np.array([0, 0]))
    sp = atoms.create_group('species_0')
    sp.create_dataset("atomic_number", data=np.array([1]))
    sp.create_dataset("charge", data=np.array([1]))
    sp.create_dataset("core", data=np.array([1]))
    write_string(sp, "name", "H")

    # PBC (open boundary conditions)
    pbc = hf.create_group("PBC")
    pbc.create_dataset("PBC", (1,), dtype="b1", data=False)

    # application
    app = hf.create_group("application")
    write_string(app, "code", "generic")

    # basisset
    bs = hf.create_group("basisset")
    bs.create_dataset("NbElements", data=np.array([1]))
    write_string(bs, "name", "LCAOBSet")
    atbs = bs.create_group("atomicBasisSet0")
    atbs.create_dataset("NbBasisGroups", data=np.array([1]))
    write_string(atbs, "angular", "cartesian")
    write_string(atbs, "elementType", "H")
    write_string(atbs, "expandYlm", "Gamess")
    atbs.create_dataset("grid_npts", data=np.array([1001]))
    atbs.create_dataset("grid_rf", data=np.array([100]))
    atbs.create_dataset("grid_ri", data=np.array([1e-06]))
    write_string(atbs, "grid_type", "log")
    write_string(atbs, "name", "Gaussian")
    write_string(atbs, "normalized", "no")

    # one s-type basis group with a single radial primitive
    bg = atbs.create_group("basisGroup0")
    bg.create_dataset("NbRadFunc", data=np.array([1]))
    bg.create_dataset("l", data=np.array([0]))
    bg.create_dataset("n", data=np.array([0]))
    write_string(bg, "rid", "H00")
    write_string(bg, "type", "Gaussian")
    rf = bg.create_group("radfunctions")
    dr = rf.create_group("DataRad0")
    dr.create_dataset("contraction", data=np.array([1.0]))
    dr.create_dataset("exponent", data=np.array([2.5]))

    # orbital coefficients (real and imaginary parts for the two eigensets)
    kpts = hf.create_group("Super_Twist")
    kpts.create_dataset("eigenset_0", data=np.array([[0.075, 0.15]]))
    kpts.create_dataset("eigenset_0_imag", data=np.array([[0.225, 0.45]]))
    kpts.create_dataset("eigenset_1", data=np.array([[-0.12, -0.06]]))
    kpts.create_dataset("eigenset_1_imag", data=np.array([[0.48, 0.24]]))
    hf.close()
class cartGauss:
    """Normalized primitive cartesian gaussian x^i y^j z^k exp(-a r^2),
    centered at the origin, with i + j + k = l."""

    def __init__(self, expt, l=0, i=0, j=0, k=0):
        self.expt = expt  # gaussian exponent a
        self.l = l        # total cartesian angular momentum
        self.i = i        # power of x
        self.j = j        # power of y
        self.k = k        # power of z
        assert (i + j + k == l)

    def norm(self):
        """Normalization constant of this primitive."""
        prefactor = (2 * self.expt / np.pi) ** (3. / 4.)
        # double-factorial angular factor; (-1)!! is taken as 1
        angular = np.sqrt(2. ** self.l
                          / factorial2(2 * self.i - 1)
                          / factorial2(2 * self.j - 1)
                          / factorial2(2 * self.k - 1))
        return prefactor * angular * np.sqrt(2 * self.expt) ** self.l

    def val(self, pos):
        """Value of the normalized primitive at cartesian point pos."""
        r = np.linalg.norm(pos)
        polynomial = pos[0] ** self.i * pos[1] ** self.j * pos[2] ** self.k
        return self.norm() * polynomial * np.exp(-self.expt * r * r)
def get_reference_values(pos, s):
    """Print reference gradients of the spinor orbital w.r.t. each ion
    position, computed by central finite differences.

    The spinor is upcoef * phi_up * e^{+is} + dncoef * phi_dn * e^{-is},
    with phi_up = c0*g0 + c1*g1 and phi_dn = c1*g0 + c0*g1, where g0, g1
    are gaussians centered on atoms 0 and 1. The printed values serve as
    references for the C++ unit tests.
    """
    cs = np.cos(s)
    ss = np.sin(s)
    eis = cs + 1.j * ss
    emis = cs - 1.j * ss
    print("Position: {}".format(pos))
    print("Spin: {}".format(s))

    g0 = cartGauss(2.5, 0, 0, 0, 0)
    g1 = cartGauss(2.5, 0, 0, 0, 0)
    centers = [np.array([0.1, 0.2, 0.3]), np.array([-0.3, -0.2, -0.1])]
    c0 = 0.3
    c1 = 0.6
    upcoef = (0.25 + 0.75j)
    dncoef = (-0.2 + 0.8j)
    dr = 1e-7  # finite-difference step for the ion displacement

    def spinor_grad(iat):
        # d(spinor)/dR_iat, one cartesian component at a time
        components = []
        for axis in range(3):
            shift = np.zeros(3)
            shift[axis] = dr
            Rp = [R.copy() for R in centers]
            Rm = [R.copy() for R in centers]
            Rp[iat] = centers[iat] + shift
            Rm[iat] = centers[iat] - shift
            up_p = c0 * g0.val(pos - Rp[0]) + c1 * g1.val(pos - Rp[1])
            up_m = c0 * g0.val(pos - Rm[0]) + c1 * g1.val(pos - Rm[1])
            dn_p = c1 * g0.val(pos - Rp[0]) + c0 * g1.val(pos - Rp[1])
            dn_m = c1 * g0.val(pos - Rm[0]) + c0 * g1.val(pos - Rm[1])
            d_up = (up_p - up_m) / (2 * dr)
            d_dn = (dn_p - dn_m) / (2 * dr)
            components.append(upcoef * d_up * eis + dncoef * d_dn * emis)
        return components

    for iat in range(2):
        gx, gy, gz = spinor_grad(iat)
        print("grad atom {}: {}, {}, {}".format(iat, gx, gy, gz))
if __name__ == "__main__":
    # Regenerate the HDF5 test input, then print the reference gradients at
    # the electron position and spin used by the C++ unit test.
    write_h5_file()
    pos = np.array([0.01, -0.02, 0.03])
    s = 0.6
    get_reference_values(pos, s)

View File

@ -21,7 +21,6 @@
#include "Utilities/for_testing/RandomForTest.h"
#include "Platforms/DualAllocatorAliases.hpp"
#include "Platforms/CUDA/CUDALinearAlgebraHandles.h"
#include "Platforms/tests/CUDA/test_device_value_kernels.hpp"
// Legacy CPU inversion for temporary testing
#include "QMCWaveFunctions/Fermion/DiracMatrix.h"

View File

@ -67,13 +67,12 @@ void test_lcao_spinor()
elec_.addTable(ions_);
elec_.update();
const char* particles = "<tmp> \
<sposet_builder name=\"spinorbuilder\" type=\"molecularorbital\" href=\"lcao_spinor.h5\" source=\"ion\" precision=\"float\"> \
<basisset transform=\"yes\"/> \
<sposet name=\"myspo\" size=\"1\"/> \
</sposet_builder> \
</tmp> \
";
const char* particles = R"XML(<tmp>
<sposet_builder name="spinorbuilder" type="molecularorbital" href="lcao_spinor.h5" source="ion" precision="float">
<basisset transform="yes"/>
<sposet name="myspo" size="1"/>
</sposet_builder>
</tmp>)XML";
Libxml2Document doc;
bool okay = doc.parseFromString(particles);
@ -118,6 +117,20 @@ void test_lcao_spinor()
CHECK(d2psiM[iat][0] == ComplexApprox(vlp).epsilon(eps));
}
/** this is a somewhat simple example. We have an ion at the origin
* and a gaussian basis function centered on the ion as a orbital.
* In this case, the ion derivative is actually just the negative of
* the electron gradient.
*/
SPOSet::GradMatrix gradIon(elec_.R.size(), spo->getOrbitalSetSize());
spo->evaluateGradSource(elec_, 0, elec_.R.size(), ions_, 0, gradIon);
for (int iat = 0; iat < 1; iat++)
{
CHECK(gradIon[iat][0][0] == ComplexApprox(-vdx).epsilon(eps));
CHECK(gradIon[iat][0][1] == ComplexApprox(-vdy).epsilon(eps));
CHECK(gradIon[iat][0][2] == ComplexApprox(-vdz).epsilon(eps));
}
int OrbitalSetSize = spo->getOrbitalSetSize();
//temporary arrays for holding the values of the up and down channels respectively.
SPOSet::ValueVector psi_work;
@ -376,17 +389,16 @@ void test_lcao_spinor_excited()
elec_.addTable(ions_);
elec_.update();
const char* particles = "<tmp> \
<sposet_builder name=\"spinorbuilder\" type=\"molecularorbital\" href=\"lcao_spinor.h5\" source=\"ion\" precision=\"float\"> \
<basisset name=\"myset\" transform=\"yes\"/> \
<sposet name=\"myspo\" basisset=\"myset\" size=\"1\"> \
<occupation mode=\"excited\"> \
-1 2 \
</occupation> \
</sposet> \
</sposet_builder> \
</tmp> \
";
const char* particles = R"XML(<tmp>
<sposet_builder name="spinorbuilder" type="molecularorbital" href="lcao_spinor.h5" source="ion" precision="float">
<basisset name="myset" transform="yes"/>
<sposet name="myspo" basisset="myset" size="1">
<occupation mode="excited">
-1 2
</occupation>
</sposet>
</sposet_builder>
</tmp>)XML";
Libxml2Document doc;
bool okay = doc.parseFromString(particles);
@ -434,6 +446,20 @@ void test_lcao_spinor_excited()
CHECK(d2psiM[iat][0] == ComplexApprox(vlp).epsilon(eps));
}
/** this is a somewhat simple example. We have an ion at the origin
* and a gaussian basis function centered on the ion as a orbital.
* In this case, the ion derivative is actually just the negative of
* the electron gradient.
*/
SPOSet::GradMatrix gradIon(elec_.R.size(), spo->getOrbitalSetSize());
spo->evaluateGradSource(elec_, 0, elec_.R.size(), ions_, 0, gradIon);
for (int iat = 0; iat < 1; iat++)
{
CHECK(gradIon[iat][0][0] == ComplexApprox(-vdx).epsilon(eps));
CHECK(gradIon[iat][0][1] == ComplexApprox(-vdy).epsilon(eps));
CHECK(gradIon[iat][0][2] == ComplexApprox(-vdz).epsilon(eps));
}
//temporary arrays for holding the values of the up and down channels respectively.
SPOSet::ValueVector psi_work;
@ -645,8 +671,102 @@ void test_lcao_spinor_excited()
}
}
/** Check SpinorSet ion-position gradients (evaluateGradSource) for a
 *  two-center molecule against finite-difference references produced by
 *  lcao_spinor_molecule_test.py (which also writes the HDF input file).
 */
void test_lcao_spinor_ion_derivs()
{
  app_log() << "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n";
  app_log() << "!!!! LCAO SpinorSet from HDF (ion derivatives) !!!!\n";
  app_log() << "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n";
  using ValueType = SPOSet::ValueType;
  using RealType  = SPOSet::RealType;
  Communicate* c = OHMMS::Controller;
  ParticleSetPool ptcl = ParticleSetPool(c);
  auto ions_uptr = std::make_unique<ParticleSet>(ptcl.getSimulationCell());
  auto elec_uptr = std::make_unique<ParticleSet>(ptcl.getSimulationCell());
  ParticleSet& ions_(*ions_uptr);
  ParticleSet& elec_(*elec_uptr);

  // Two H centers; positions must match those written into lcao_spinor_molecule.h5.
  ions_.setName("ion");
  ptcl.addParticleSet(std::move(ions_uptr));
  ions_.create({2});
  ions_.R[0][0] = 0.10000000;
  ions_.R[0][1] = 0.20000000;
  ions_.R[0][2] = 0.30000000;
  ions_.R[1][0] = -0.30000000;
  ions_.R[1][1] = -0.20000000;
  ions_.R[1][2] = -0.10000000;
  SpeciesSet& ispecies = ions_.getSpeciesSet();
  int hIdx = ispecies.addSpecies("H");
  ions_.update();

  // Single spinor electron at the position/spin used by the python reference script.
  elec_.setName("elec");
  ptcl.addParticleSet(std::move(elec_uptr));
  elec_.create({1});
  elec_.R[0][0] = 0.01;
  elec_.R[0][1] = -0.02;
  elec_.R[0][2] = 0.03;
  elec_.spins[0] = 0.6;
  elec_.setSpinor(true);
  SpeciesSet& tspecies = elec_.getSpeciesSet();
  int upIdx = tspecies.addSpecies("u");
  int chargeIdx = tspecies.addAttribute("charge");
  tspecies(chargeIdx, upIdx) = -1;
  elec_.addTable(ions_);
  elec_.update();

  // Build the spinor SPOSet from the molecular-orbital HDF input.
  const char* particles = R"XML(<tmp>
   <sposet_builder name="spinorbuilder" type="molecularorbital" href="lcao_spinor_molecule.h5" source="ion" precision="float">
     <basisset transform="yes"/>
     <sposet name="myspo" size="1"/>
   </sposet_builder>
   </tmp>)XML";
  Libxml2Document doc;
  bool okay = doc.parseFromString(particles);
  REQUIRE(okay);
  xmlNodePtr root = doc.getRoot();
  xmlNodePtr bnode = xmlFirstElementChild(root);
  SPOSetBuilderFactory fac(c, elec_, ptcl.getPool());
  const auto spo_builder_ptr = fac.createSPOSetBuilder(bnode);
  auto& bb = *spo_builder_ptr;
  // only pick up the last sposet
  std::unique_ptr<SPOSet> spo;
  processChildren(bnode, [&](const std::string& cname, const xmlNodePtr element) {
    if (cname == "sposet")
      spo = bb.createSPOSet(element);
  });
  REQUIRE(spo);

  //reference values from finite difference in lcao_spinor_molecule_test.py
  ValueType dx0(-0.0492983, -0.3192778);
  ValueType dy0(-0.1205071, -0.7804567);
  ValueType dz0(-0.1478950, -0.9578333);
  ValueType dx1(-0.0676367, 1.0506422);
  ValueType dy1(-0.0392729, 0.6100503);
  ValueType dz1(-0.0283638, 0.4405919);

  const RealType eps = 1e-4;
  SPOSet::GradMatrix gradIon(elec_.R.size(), spo->getOrbitalSetSize());
  // gradient w.r.t. ion 0
  spo->evaluateGradSource(elec_, 0, elec_.R.size(), ions_, 0, gradIon);
  CHECK(gradIon[0][0][0] == ComplexApprox(dx0).epsilon(eps));
  CHECK(gradIon[0][0][1] == ComplexApprox(dy0).epsilon(eps));
  CHECK(gradIon[0][0][2] == ComplexApprox(dz0).epsilon(eps));
  // gradient w.r.t. ion 1
  spo->evaluateGradSource(elec_, 0, elec_.R.size(), ions_, 1, gradIon);
  CHECK(gradIon[0][0][0] == ComplexApprox(dx1).epsilon(eps));
  CHECK(gradIon[0][0][1] == ComplexApprox(dy1).epsilon(eps));
  CHECK(gradIon[0][0][2] == ComplexApprox(dz1).epsilon(eps));
}
TEST_CASE("ReadMolecularOrbital GTO spinor", "[wavefunction]") { test_lcao_spinor(); }
TEST_CASE("ReadMolecularOrbital GTO spinor with excited", "[wavefunction]") { test_lcao_spinor_excited(); }
TEST_CASE("spinor ion derivatives for molecule", "[wavefunction]") { test_lcao_spinor_ion_derivs(); }
} // namespace qmcplusplus

View File

@ -41,7 +41,7 @@ case "$1" in
export CUDACXX=/usr/local/cuda-11.2/bin/nvcc
# Make current environment variables available to subsequent steps
echo "PATH=/usr/local/cuda-11.2/bin:$PATH" >> $GITHUB_ENV
echo "PATH=$PATH" >> $GITHUB_ENV
echo "CUDACXX=/usr/local/cuda-11.2/bin/nvcc" >> $GITHUB_ENV
else
@ -286,6 +286,12 @@ case "$1" in
# Build using ninja (~ 25 minutes on GitHub-hosted runner)
build)
# CUDA toolchain can be used implicitly by the compiler. Double check the location.
if [[ "${GH_JOBNAME}" =~ (CUDA) ]]
then
which nvcc
fi
cd ${GITHUB_WORKSPACE}/../qmcpack-build
ninja
;;