Merge pull request #3086 from QMCPACK/rc_3110

Rc 3110
Paul R. C. Kent 2021-04-09 08:53:27 -04:00 committed by GitHub
commit 129b7d8849
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1682 changed files with 103728 additions and 115264 deletions


@@ -14,7 +14,8 @@ _Delete the items that do not apply_
- Code style update (formatting, renaming)
- Refactoring (no functional changes, no api changes)
- Build related changes
- Documentation content changes
- Testing changes (e.g. new unit/integration/performance tests)
- Documentation changes
- Other (please describe):
### Does this introduce a breaking change?


@@ -0,0 +1,74 @@
name: GitHub Actions CI
on:
push:
branches:
- develop
pull_request:
branches:
- develop
jobs:
linux:
runs-on: ubuntu-latest
container: ${{ matrix.container }}
env:
GH_JOBNAME: ${{ matrix.jobname }}
GH_OS: Linux
strategy:
fail-fast: false
matrix:
jobname: [
gcc-openmpi-real-coverage,
gcc-openmpi-complex-coverage,
clang-openmpi-real-asan,
clang-openmpi-real-ubsan
]
include:
- jobname: gcc-openmpi-real-coverage
container:
image: williamfgc/qmcpack-ci:ubuntu20-openmpi
options: -u 1001
- jobname: gcc-openmpi-complex-coverage
container:
image: williamfgc/qmcpack-ci:ubuntu20-openmpi
options: -u 1001
- jobname: clang-openmpi-real-asan
container:
image: williamfgc/qmcpack-ci:ubuntu20-openmpi
options: -u 1001
- jobname: clang-openmpi-real-ubsan
container:
image: williamfgc/qmcpack-ci:ubuntu20-openmpi
options: -u 1001
steps:
- name: Checkout Action
uses: actions/checkout@v1
- name: Configure
run: tests/test_automation/github-actions/ci/run_step.sh configure
- name: Build
run: tests/test_automation/github-actions/ci/run_step.sh build
- name: Test
run: tests/test_automation/github-actions/ci/run_step.sh test
- name: Coverage
if: contains(matrix.jobname, 'coverage')
run: tests/test_automation/github-actions/ci/run_step.sh coverage
- name: Upload Coverage
if: contains(matrix.jobname, 'coverage')
uses: codecov/codecov-action@v1
with:
file: ../qmcpack-build/coverage.xml
flags: tests-deterministic # optional
name: codecov-QMCPACK # optional
fail_ci_if_error: true # optional (default = false)
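
The workflow drives every stage through `tests/test_automation/github-actions/ci/run_step.sh` with a stage name argument. Below is a hypothetical shell sketch of that dispatch pattern; the actual script's contents are not part of this diff, and the commands here are placeholders.

```shell
# Hypothetical stage dispatcher mirroring the configure/build/test/coverage
# steps invoked by the workflow. Echo lines stand in for the real commands.
run_stage() {
  case "$1" in
    configure) echo "running cmake configure" ;;
    build)     echo "running build" ;;
    test)      echo "running ctest" ;;
    coverage)  echo "collecting coverage" ;;
    *)         echo "unknown stage: $1" >&2; return 1 ;;
  esac
}

run_stage configure
```

Keeping each CI step a single-argument call to one script makes the workflow file short and lets the same stages be reproduced locally.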

.gitignore

@@ -9,3 +9,11 @@ qmcpack_manual.pdf
.DS_Store
/build_*/
docs/_build
nexus/tests/unit/*output
tests/solids/NiO_a4_e48_pp/NiO-fcc-supertwist111-supershift000-S1.h5
# Eclipse IDE
.cproject
.project
.settings/
.pydevproject


@@ -2,6 +2,51 @@
Notable changes to QMCPACK are documented in this file.
## [3.11.0] - 2021-04-09
### Notes
This release includes a large number of refinements to QMCPACK and the supporting ecosystem. These include support for the latest version of
Quantum ESPRESSO, new capabilities in AFQMC, space-warp transformation for forces, numerous bug fixes, user-requested feature improvements,
and further upgrades to the test system.
* Quantum ESPRESSO (QE) v6.7 support. [\#2927](https://github.com/QMCPACK/qmcpack/pull/2927).
* Detect and automatically use patched version of QE found on the PATH. [\#2974](https://github.com/QMCPACK/qmcpack/pull/2974).
* Support for global max\_seconds and STOP file to cleanly halt QMCPACK during a run. [\#3028](https://github.com/QMCPACK/qmcpack/pull/3028).
* Freezing of two-body Jastrow parameters in optimization works. [\#2814](https://github.com/QMCPACK/qmcpack/issues/2814).
* Multideterminant code now works with only alpha determinants \(no down electrons\). [\#2698](https://github.com/QMCPACK/qmcpack/issues/2698).
* High l-momentum channels as local channels in ECPs work. [\#2920](https://github.com/QMCPACK/qmcpack/pull/2920).
* Space Warp Transformation for ZVZB Forces. [\#2828](https://github.com/QMCPACK/qmcpack/pull/2828).
* Important bug fixes in legacy CUDA implementation causing incorrect energies. [\#2883](https://github.com/QMCPACK/qmcpack/pull/2883).
* Implemented DLA in legacy CUDA. [\#2887](https://github.com/QMCPACK/qmcpack/pull/2887).
* Updates to support CUDA 11.2.1 e.g. [\#2950](https://github.com/QMCPACK/qmcpack/pull/2950).
* AFQMC supports energy estimator with different Hamiltonian \(from propagation\). [\#2795](https://github.com/QMCPACK/qmcpack/pull/2795).
* Trial wavefunction optimization with spin-orbit supported. [\#3034](https://github.com/QMCPACK/qmcpack/pull/3034).
* ppconvert executable automatically built when configured. [\#2904](https://github.com/QMCPACK/qmcpack/pull/2904).
* Tests added for ppconvert. [\#2929](https://github.com/QMCPACK/qmcpack/issues/2929).
* Fixed SIMD alignment for AVX512 on some systems. [\#2981](https://github.com/QMCPACK/qmcpack/pull/2981).
* Improved wavefunction restart logic in AFQMC. [\#2942](https://github.com/QMCPACK/qmcpack/pull/2942).
* Spin-density supported in batched code. [\#2840](https://github.com/QMCPACK/qmcpack/pull/2840).
* Reduced I/O operations during cmake. [\#2808](https://github.com/QMCPACK/qmcpack/pull/2808).
* Improved detection of unsupported-by-Intel combinations of Intel compilers and libstdc++. [\#2794](https://github.com/QMCPACK/qmcpack/pull/2794).
* Initial support for Andes at OLCF. [\#3073](https://github.com/QMCPACK/qmcpack/pull/3073).
* Deterministic tests expanded in scope and made reliable for more build types and compilers.
* Various minor bug fixes and feature improvements based on user requests for both real-space and AFQMC.
* Improved error handling throughout.
* Numerous performance improvements, expansion of tests, and bug fixes to the batched VMC and DMC codes. Reasonable but not optimal GPU acceleration can now be achieved for spline-based wavefunctions.
### NEXUS
* Support AMD nodes on Cori. [\#2809](https://github.com/QMCPACK/qmcpack/pull/2809).
* Interface for RMG code. [\#2932](https://github.com/QMCPACK/qmcpack/pull/2932).
* Added h-channel to list of possible local channels in pseudopotential. [\#2915](https://github.com/QMCPACK/qmcpack/pull/2915).
* Allow non spin-specific occupations in case of noncollinear. [\#2957](https://github.com/QMCPACK/qmcpack/pull/2957).
* More robust handling of QE output when printed eigenvalues touch. [\#3042](https://github.com/QMCPACK/qmcpack/pull/3042).
* Fixed type check for reblock\_factors in qmc-fit. [\#2830](https://github.com/QMCPACK/qmcpack/pull/2830).
* Fixed a Jastrow read error/warning, add several QE inputs. [\#2819](https://github.com/QMCPACK/qmcpack/pull/2819).
* Fixed tests on Summit. [\#2983](https://github.com/QMCPACK/qmcpack/pull/2983).
* Fixed module overwrite bug in qmca. [\#2802](https://github.com/QMCPACK/qmcpack/pull/2802).
## [3.10.0] - 2020-11-10
### Notes


@@ -0,0 +1,12 @@
# Check if AVX512 is activated in the compilation
# Since cross-compiling is not unusual on HPC systems (Cray),
# try_compile is used because it is robust against cross-compilation.
try_compile(CXX_COMPILER_HAVE_AVX512_MACRO ${CMAKE_BINARY_DIR}
${PROJECT_CMAKE}/try_compile_sources/check_AVX512.cpp
CMAKE_FLAGS "${CMAKE_CXX_FLAGS}")
if (CXX_COMPILER_HAVE_AVX512_MACRO)
set(default_alignment 64)
else()
set(default_alignment 32)
endif()


@@ -4,6 +4,10 @@ IF ( CMAKE_CXX_COMPILER_VERSION VERSION_LESS 7.0 )
MESSAGE(FATAL_ERROR "Requires clang 7.0 or higher ")
ENDIF()
IF ( CMAKE_CXX_COMPILER_VERSION VERSION_EQUAL 11.0.0 AND QMC_CXX_STANDARD EQUAL 17 AND BUILD_AFQMC )
MESSAGE(FATAL_ERROR "Avoid Clang 11.0.0 which cannot compile AFQMC properly with C++17!")
ENDIF()
# Set the std
SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -std=c99")
@@ -12,10 +16,14 @@ IF(QMC_OMP)
SET(ENABLE_OPENMP 1)
IF(ENABLE_OFFLOAD AND NOT CMAKE_SYSTEM_NAME STREQUAL "CrayLinuxEnvironment")
SET(OFFLOAD_TARGET "nvptx64-nvidia-cuda" CACHE STRING "Offload target architecture")
SET(OPENMP_OFFLOAD_COMPILE_OPTIONS "-fopenmp-targets=${OFFLOAD_TARGET}")
IF(DEFINED OFFLOAD_ARCH)
SET(CLANG_OPENMP_OFFLOAD_FLAGS "-fopenmp-targets=${OFFLOAD_TARGET} -Xopenmp-target=${OFFLOAD_TARGET} -march=${OFFLOAD_ARCH}")
ELSE()
SET(CLANG_OPENMP_OFFLOAD_FLAGS "-fopenmp-targets=${OFFLOAD_TARGET}")
SET(OPENMP_OFFLOAD_COMPILE_OPTIONS "${OPENMP_OFFLOAD_COMPILE_OPTIONS} -Xopenmp-target=${OFFLOAD_TARGET} -march=${OFFLOAD_ARCH}")
ENDIF()
IF(OFFLOAD_TARGET MATCHES "nvptx64")
SET(OPENMP_OFFLOAD_COMPILE_OPTIONS "${OPENMP_OFFLOAD_COMPILE_OPTIONS} -Wno-unknown-cuda-version")
ENDIF()
# Intel clang compiler needs a different flag for the host side OpenMP library when offload is used.
@@ -110,12 +118,6 @@ ELSEIF(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc64" OR CMAKE_SYSTEM_PROCESSOR MATCHES
ENDIF()
ENDIF()
# Add OpenMP offload flags
# This step is intentionally put after the -march parsing for CPUs.
IF(DEFINED CLANG_OPENMP_OFFLOAD_FLAGS)
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CLANG_OPENMP_OFFLOAD_FLAGS}")
ENDIF()
# Add static flags if necessary
IF(QMC_BUILD_STATIC)
SET(CMAKE_CXX_LINK_FLAGS " -static")
@@ -139,23 +141,3 @@ IF(XRAY_PROFILE)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${XRAY_FLAGS}")
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${XRAY_FLAGS}")
ENDIF(XRAY_PROFILE)
SET(LLVM_SANITIZE_ADDRESS FALSE CACHE BOOL "Use llvm address sanitizer library")
MARK_AS_ADVANCED(LLVM_SANITIZE_ADDRESS)
IF(LLVM_SANITIZE_ADDRESS)
SET(CMAKE_C_FLAGS "-fno-omit-frame-pointer -fsanitize=address -fsanitize-address-use-after-scope ${CMAKE_C_FLAGS}")
SET(CMAKE_CXX_FLAGS "-fno-omit-frame-pointer -fsanitize=address -fsanitize-address-use-after-scope ${CMAKE_CXX_FLAGS}")
SET(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fno-omit-frame-pointer -fsanitize=address -fsanitize-address-use-after-scope")
SET(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -fno-omit-frame-pointer -fsanitize=address -fsanitize-address-use-after-scope")
ENDIF(LLVM_SANITIZE_ADDRESS)
# Don't expect this to be useful unless all libraries have been instrumented with MSan
SET(LLVM_SANITIZE_MEMORY FALSE CACHE BOOL "Use llvm memory sanitizer library")
MARK_AS_ADVANCED(LLVM_SANITIZE_MEMORY)
IF(LLVM_SANITIZE_MEMORY)
SET(LLVM_BLACKLIST_SANITIZE_MEMORY "-fsanitize-blacklist=${PROJECT_SOURCE_DIR}/llvm_misc/memory_sanitizer_blacklist.txt")
SET(CMAKE_C_FLAGS_DEBUG "-fsanitize=memory ${LLVM_BLACKLIST_SANITIZE_MEMORY} ${CMAKE_C_FLAGS_DEBUG}")
SET(CMAKE_CXX_FLAGS_DEBUG "-fsanitize=memory ${LLVM_BLACKLIST_SANITIZE_MEMORY} ${CMAKE_CXX_FLAGS_DEBUG}")
SET(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fsanitize=memory ${LLVM_BLACKLIST_SANITIZE_MEMORY}")
SET(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -fsanitize=memory ${LLVM_BLACKLIST_SANITIZE_MEMORY}")
ENDIF(LLVM_SANITIZE_MEMORY)

CMake/FindQE.cmake

@@ -0,0 +1,22 @@
# Locate QE via the patched pw2qmcpack.x
# Require both to be present to consider QE_FOUND
# Take QE_BIN as hint for location
FIND_PATH(QE_PW_DIR pw.x HINTS ${QE_BIN})
FIND_PATH(QE_PW2Q_DIR pw2qmcpack.x HINTS ${QE_BIN})
SET(QE_FOUND FALSE)
IF(QE_PW2Q_DIR AND QE_PW_DIR)
IF ( NOT (QE_PW2Q_DIR STREQUAL QE_PW_DIR) )
MESSAGE(WARNING "Found pw.x and pw2qmcpack.x in different locations, ${QE_PW_DIR} and ${QE_PW2Q_DIR}, verify this is intentional.")
ENDIF()
#MESSAGE(STATUS "QE_PW2Q_DIR=${QE_PW2Q_DIR}")
#MESSAGE(STATUS "QE_PW_DIR=${QE_PW_DIR}")
SET(QE_FOUND TRUE)
ENDIF()
MARK_AS_ADVANCED(
QE_PW2Q_DIR
QE_PW_DIR
QE_FOUND
)


@@ -1,6 +1,6 @@
# Check compiler version
IF ( CMAKE_CXX_COMPILER_VERSION VERSION_LESS 5.0 )
MESSAGE(FATAL_ERROR "Requires gcc 5.0 or higher ")
IF ( CMAKE_CXX_COMPILER_VERSION VERSION_LESS 7.0 )
MESSAGE(FATAL_ERROR "Requires gcc 7.0 or higher ")
ENDIF()
# Set the std
@@ -11,6 +11,10 @@ IF(QMC_OMP)
SET(ENABLE_OPENMP 1)
SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fopenmp")
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp")
IF(ENABLE_OFFLOAD AND NOT CMAKE_SYSTEM_NAME STREQUAL "CrayLinuxEnvironment")
SET(OFFLOAD_TARGET "nvptx-none" CACHE STRING "Offload target architecture")
SET(OPENMP_OFFLOAD_COMPILE_OPTIONS "-foffload=${OFFLOAD_TARGET} -foffload=\"-lm -latomic\"")
ENDIF()
ENDIF(QMC_OMP)
# Set gnu specific flags (which we always want)


@@ -26,10 +26,10 @@ SET( CMAKE_CXX_FLAGS_RELWITHDEBINFO "-g -O3" )
IF(QMC_OMP)
SET(ENABLE_OPENMP 1)
SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -qsmp=omp")
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -qsmp=omp")
IF(ENABLE_OFFLOAD)
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -qsmp=omp -qoffload")
ELSE(ENABLE_OFFLOAD)
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -qsmp=omp")
set(OPENMP_OFFLOAD_COMPILE_OPTIONS "-qoffload")
ENDIF(ENABLE_OFFLOAD)
ELSE(QMC_OMP)
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -qnothreaded")


@@ -9,7 +9,16 @@ SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -c99")
IF(QMC_OMP)
SET(ENABLE_OPENMP 1)
SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mp=allcores")
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mp=allcores")
IF(ENABLE_OFFLOAD AND NOT CMAKE_SYSTEM_NAME STREQUAL "CrayLinuxEnvironment")
MESSAGE(WARNING "QMCPACK OpenMP offload is not ready for NVIDIA HPC compiler.")
IF(NOT DEFINED OFFLOAD_ARCH)
MESSAGE(FATAL_ERROR "NVIDIA HPC compiler requires -gpu=ccXX option set based on the target GPU architecture! "
"Please add -DOFFLOAD_ARCH=ccXX to cmake. For example, cc70 is for Volta.")
ENDIF()
SET(OPENMP_OFFLOAD_COMPILE_OPTIONS "-mp=gpu -gpu=${OFFLOAD_ARCH}")
ELSE()
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mp=allcores")
ENDIF()
ENDIF(QMC_OMP)
ADD_DEFINITIONS( -Drestrict=__restrict__ )


@@ -27,7 +27,7 @@ if (NOT CXX14_LIBRARY_OKAY)
set(COMPILE_FAIL_OUTPUT cpp14_compile_fail.txt)
file(WRITE "${CMAKE_BINARY_DIR}/${COMPILE_FAIL_OUTPUT}" "${COMPILE_OUTPUT}")
message(STATUS "C++ 14 standard library support not found")
message(STATUS "C++14 standard library support not found")
message("compiler is ${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}")
if (CMAKE_CXX_COMPILER_ID MATCHES "GNU")
message("Compiler detected is g++.\n Use version 5.0 or newer for a C++14 standard library")
@@ -39,6 +39,5 @@ if (NOT CXX14_LIBRARY_OKAY)
message(" Output of test compile is in ${COMPILE_FAIL_OUTPUT}")
message(FATAL_ERROR "stopping")
else()
message(STATUS "C++ 14 standard library supported")
message(STATUS "C++14 standard library supported")
endif()


@@ -0,0 +1,42 @@
# Test that the compiler is configured with a C++17 standard library
set(TEST_CXX17_SOURCE ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp/try_cxx17_library.cpp)
file(WRITE ${TEST_CXX17_SOURCE}
"// Test for C++17 standard library support
#include <variant>
#include <string>
int main(int argc, char **argv)
{
std::variant<int, float, std::string> intFloatString;
return 0;
}
")
try_compile(CXX17_LIBRARY_OKAY ${CMAKE_BINARY_DIR}
${TEST_CXX17_SOURCE}
CXX_STANDARD 17
CXX_STANDARD_REQUIRED ON
OUTPUT_VARIABLE COMPILE_OUTPUT)
if (NOT CXX17_LIBRARY_OKAY)
set(COMPILE_FAIL_OUTPUT cpp17_compile_fail.txt)
file(WRITE "${CMAKE_BINARY_DIR}/${COMPILE_FAIL_OUTPUT}" "${COMPILE_OUTPUT}")
message(STATUS "C++17 standard library support not found")
message("compiler is ${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}")
if (CMAKE_CXX_COMPILER_ID MATCHES "GNU")
message("Compiler detected is g++.\n Use version 7.0 or newer for C++17 standard library support.")
elseif(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
message("Compiler detected is clang++.\n If not using libcxx, ensure a g++ version greater than 7.0 is also on the path so that its C++17 library can be used.")
elseif(CMAKE_CXX_COMPILER_ID MATCHES "Intel")
message("Compiler detected is icpc.\n Ensure a gcc version greater than 7.0 is also on the path so that its C++17 library can be used. Or use the -cxxlib switch to point to a newer gcc install.")
endif()
message(" Output of test compile is in ${COMPILE_FAIL_OUTPUT}")
message(FATAL_ERROR "stopping")
else()
message(STATUS "C++17 standard library supported")
endif()


@@ -1,23 +1,8 @@
# Test whether the C++ compiler is compatible with the libstdc++ in use
set(TEST_LIBSTDCXX_SOURCE ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp/try_libstdcxx.cpp)
file(WRITE ${TEST_LIBSTDCXX_SOURCE}
"// Test the compatibility between the compiler and the libstdc++ from GNU
#include <cstdio>
int main(int argc, char **argv)
{
#if ( ( __INTEL_COMPILER == 1800 ) && ( _GLIBCXX_RELEASE > 7 ) )
#error too new libstdc++ from GNU for Intel 18, use GNU version <= 7
#endif
return 0;
}
")
try_compile(LIBSTDCXX_OKAY ${CMAKE_BINARY_DIR}
${TEST_LIBSTDCXX_SOURCE}
${PROJECT_CMAKE}/try_compile_sources/check_libstdcxx.cpp
CXX_STANDARD 14
CXX_STANDARD_REQUIRED ON
OUTPUT_VARIABLE COMPILE_OUTPUT)


@@ -199,10 +199,6 @@ IF ( DEFINED QMC_MIXED_PRECISION )
SET( CTEST_OPTIONS "${CTEST_OPTIONS};-DQMC_MIXED_PRECISION=${QMC_MIXED_PRECISION}" )
ENDIF()
IF ( DEFINED ENABLE_SOA )
SET( CTEST_OPTIONS "${CTEST_OPTIONS};-DENABLE_SOA=${ENABLE_SOA}" )
ENDIF()
IF ( DEFINED CUDA_ARCH )
SET( CTEST_OPTIONS "${CTEST_OPTIONS};-DCUDA_ARCH='${CUDA_ARCH}'" )
ENDIF()


@@ -37,29 +37,31 @@ FUNCTION( COPY_DIRECTORY_USING_SYMLINK SRC_DIR DST_DIR )
ENDFOREACH()
ENDFUNCTION()
# Copy files, but symlink the *.h5 files (which are the large ones)
FUNCTION( COPY_DIRECTORY_SYMLINK_H5 SRC_DIR DST_DIR)
# Copy everything but *.h5 files and pseudopotential files
FILE(COPY "${SRC_DIR}/" DESTINATION "${DST_DIR}"
PATTERN "*.h5" EXCLUDE
PATTERN "*.opt.xml" EXCLUDE
PATTERN "*.ncpp.xml" EXCLUDE
PATTERN "*.BFD.xml" EXCLUDE)
# Now find and symlink the *.h5 files and pseudopotential files
FILE(GLOB_RECURSE H5 "${SRC_DIR}/*.h5" "${SRC_DIR}/*.opt.xml" "${SRC_DIR}/*.ncpp.xml" "${SRC_DIR}/*.BFD.xml")
FOREACH(F IN LISTS H5)
FILE(RELATIVE_PATH R "${SRC_DIR}" "${F}")
#MESSAGE("Creating symlink from ${SRC_DIR}/${R} to ${DST_DIR}/${R}")
EXECUTE_PROCESS(COMMAND ${CMAKE_COMMAND} -E create_symlink "${SRC_DIR}/${R}" "${DST_DIR}/${R}")
# Copy selected files only: h5 files, pseudopotentials, wavefunctions, structures, and the one input file used are copied.
FUNCTION( COPY_DIRECTORY_USING_SYMLINK_LIMITED SRC_DIR DST_DIR ${ARGN})
FILE(MAKE_DIRECTORY "${DST_DIR}")
# Find the selected files and folders at the top level (no recursion into subdirectories)
FILE(GLOB FILE_FOLDER_NAMES LIST_DIRECTORIES TRUE
"${SRC_DIR}/qmc_ref" "${SRC_DIR}/qmc-ref" "${SRC_DIR}/*.h5"
"${SRC_DIR}/*.opt.xml" "${SRC_DIR}/*.ncpp.xml" "${SRC_DIR}/*.BFD.xml"
"${SRC_DIR}/*.ccECP.xml"
"${SRC_DIR}/*.py" "${SRC_DIR}/*.sh" "${SRC_DIR}/*.restart.xml"
"${SRC_DIR}/Li.xml" "${SRC_DIR}/H.xml" "${SRC_DIR}/*.L2_test.xml" "${SRC_DIR}/*.opt_L2.xml"
"${SRC_DIR}/*.wfnoj.xml" "${SRC_DIR}/*.wfj*.xml" "${SRC_DIR}/*.wfs*.xml"
"${SRC_DIR}/*.wfn*.xml" "${SRC_DIR}/*.cuspInfo.xml" "${SRC_DIR}/*.H*.xml"
"${SRC_DIR}/*.structure.xml" "${SRC_DIR}/*ptcl.xml")
FOREACH(F IN LISTS FILE_FOLDER_NAMES)
EXECUTE_PROCESS( COMMAND ln -sf "${F}" "." WORKING_DIRECTORY ${DST_DIR})
ENDFOREACH()
FOREACH(F IN LISTS ARGN)
EXECUTE_PROCESS( COMMAND ln -sf "${SRC_DIR}/${F}" "." WORKING_DIRECTORY ${DST_DIR})
ENDFOREACH()
ENDFUNCTION()
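
The selective-symlink strategy above (glob a whitelist of patterns, then `ln -sf` each match into the destination) can be sketched in Python. This is an illustration of the pattern only; the patterns below are a small subset of the list the CMake function uses.

```python
import glob
import os

def symlink_selected(src_dir, dst_dir, patterns, extra_files=()):
    """Symlink only whitelisted files from src_dir into dst_dir (mimics ln -sf)."""
    os.makedirs(dst_dir, exist_ok=True)
    selected = []
    for pattern in patterns:
        selected.extend(glob.glob(os.path.join(src_dir, pattern)))
    # extra_files plays the role of the function's ARGN (e.g. the one input file).
    selected.extend(os.path.join(src_dir, name) for name in extra_files)
    for path in selected:
        link = os.path.join(dst_dir, os.path.basename(path))
        if os.path.lexists(link):
            os.remove(link)  # -f: replace an existing link
        os.symlink(path, link)
```

Linking only the large and required files keeps test working directories small while leaving unrelated files out of the build tree.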
# Control copy vs. symlink with top-level variable
FUNCTION( COPY_DIRECTORY_MAYBE_USING_SYMLINK SRC_DIR DST_DIR )
FUNCTION( COPY_DIRECTORY_MAYBE_USING_SYMLINK SRC_DIR DST_DIR ${ARGN})
IF (QMC_SYMLINK_TEST_FILES)
COPY_DIRECTORY_USING_SYMLINK("${SRC_DIR}" "${DST_DIR}")
#COPY_DIRECTORY_SYMLINK_H5("${SRC_DIR}" "${DST_DIR}" )
COPY_DIRECTORY_USING_SYMLINK_LIMITED("${SRC_DIR}" "${DST_DIR}" ${ARGN})
ELSE()
COPY_DIRECTORY("${SRC_DIR}" "${DST_DIR}")
ENDIF()
@@ -95,8 +97,9 @@ FUNCTION( RUN_QMC_APP_NO_COPY TESTNAME WORKDIR PROCS THREADS TEST_ADDED TEST_LAB
IF ( ${TOT_PROCS} GREATER ${TEST_MAX_PROCS} )
MESSAGE_VERBOSE("Disabling test ${TESTNAME} (exceeds maximum number of processors ${TEST_MAX_PROCS})")
ELSE()
ADD_TEST( ${TESTNAME} ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} ${PROCS} ${MPIEXEC_PREFLAGS} ${QMC_APP} ${ARGN} )
ADD_TEST( NAME ${TESTNAME} COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} ${PROCS} ${MPIEXEC_PREFLAGS} ${QMC_APP} ${ARGN} )
SET_TESTS_PROPERTIES( ${TESTNAME} PROPERTIES FAIL_REGULAR_EXPRESSION "${TEST_FAIL_REGULAR_EXPRESSION}"
PASS_REGULAR_EXPRESSION "QMCPACK execution completed successfully"
PROCESSORS ${TOT_PROCS} PROCESSOR_AFFINITY TRUE WORKING_DIRECTORY ${WORKDIR}
ENVIRONMENT OMP_NUM_THREADS=${THREADS} )
SET( TEST_ADDED_TEMP TRUE )
@@ -105,6 +108,7 @@ FUNCTION( RUN_QMC_APP_NO_COPY TESTNAME WORKDIR PROCS THREADS TEST_ADDED TEST_LAB
IF ( ( ${PROCS} STREQUAL "1" ) )
ADD_TEST( ${TESTNAME} ${QMC_APP} ${ARGN} )
SET_TESTS_PROPERTIES( ${TESTNAME} PROPERTIES FAIL_REGULAR_EXPRESSION "${TEST_FAIL_REGULAR_EXPRESSION}"
PASS_REGULAR_EXPRESSION "QMCPACK execution completed successfully"
PROCESSORS ${TOT_PROCS} PROCESSOR_AFFINITY TRUE WORKING_DIRECTORY ${WORKDIR}
ENVIRONMENT OMP_NUM_THREADS=${THREADS} )
SET( TEST_ADDED_TEMP TRUE )
@@ -112,6 +116,11 @@ FUNCTION( RUN_QMC_APP_NO_COPY TESTNAME WORKDIR PROCS THREADS TEST_ADDED TEST_LAB
MESSAGE_VERBOSE("Disabling test ${TESTNAME} (building without MPI)")
ENDIF()
ENDIF()
if (TEST_ADDED_TEMP AND (QMC_CUDA OR ENABLE_CUDA OR ENABLE_OFFLOAD))
set_tests_properties(${TESTNAME} PROPERTIES RESOURCE_LOCK exclusively_owned_gpus)
endif()
SET(TEST_LABELS_TEMP "")
IF ( TEST_ADDED_TEMP )
ADD_TEST_LABELS( ${TESTNAME} TEST_LABELS_TEMP )
@@ -123,7 +132,13 @@ ENDFUNCTION()
# Runs qmcpack
# Note that TEST_ADDED is an output variable
FUNCTION( RUN_QMC_APP TESTNAME SRC_DIR PROCS THREADS TEST_ADDED TEST_LABELS ${ARGN} )
COPY_DIRECTORY_MAYBE_USING_SYMLINK( "${SRC_DIR}" "${CMAKE_CURRENT_BINARY_DIR}/${TESTNAME}" )
# restrict ARGN to only one file or empty
LIST(LENGTH ARGN INPUT_FILE_LENGTH)
IF(INPUT_FILE_LENGTH GREATER 1)
MESSAGE(FATAL_ERROR "Incorrect invocation of RUN_QMC_APP by ${TESTNAME}. ARGN value is \"${ARGN}\"")
ENDIF()
COPY_DIRECTORY_MAYBE_USING_SYMLINK( "${SRC_DIR}" "${CMAKE_CURRENT_BINARY_DIR}/${TESTNAME}" "${ARGN}")
SET( TEST_ADDED_TEMP FALSE )
SET( TEST_LABELS_TEMP "" )
RUN_QMC_APP_NO_COPY( ${TESTNAME} ${CMAKE_CURRENT_BINARY_DIR}/${TESTNAME} ${PROCS} ${THREADS} TEST_ADDED_TEMP TEST_LABELS_TEMP ${ARGN} )
@@ -251,7 +266,7 @@ ENDFUNCTION()
function(QMC_RUN_AND_CHECK_CUSTOM_SCALAR)
set(OPTIONS SHOULD_FAIL)
set(ONE_VALUE_ARGS BASE_NAME BASE_DIR PREFIX INPUT_FILE PROCS THREADS SERIES SCALAR_VALUES)
set(ONE_VALUE_ARGS BASE_NAME BASE_DIR PREFIX INPUT_FILE PROCS THREADS SERIES SCALAR_VALUES EQUILIBRATION)
# Eventually may want to support multiple SERIES/SCALAR_VALUES pairs
#SET(MULTI_VALUE_ARGS SERIES SCALAR_VALUES)
@@ -272,6 +287,11 @@ function(QMC_RUN_AND_CHECK_CUSTOM_SCALAR)
set(PREFIX ${QRC_PREFIX})
set(INPUT_FILE ${QRC_INPUT_FILE})
set(EQUIL 2)
if (DEFINED QRC_EQUILIBRATION)
set(EQUIL ${QRC_EQUILIBRATION})
endif()
set( TEST_ADDED FALSE )
set( TEST_LABELS "")
set( FULL_NAME "${BASE_NAME}-${PROCS}-${THREADS}" )
@@ -307,7 +327,7 @@ function(QMC_RUN_AND_CHECK_CUSTOM_SCALAR)
else()
set( TEST_NAME "${FULL_NAME}-${SCALAR_NAME}" )
endif()
set(CHECK_CMD ${CMAKE_SOURCE_DIR}/tests/scripts/check_scalars.py --ns 3 --series ${SERIES} -p ${PREFIX} -e 2 --name ${SCALAR_NAME} --ref-value ${SCALAR_VALUE} --ref-error ${SCALAR_ERROR})
set(CHECK_CMD ${CMAKE_SOURCE_DIR}/tests/scripts/check_scalars.py --ns 3 --series ${SERIES} -p ${PREFIX} -e ${EQUIL} --name ${SCALAR_NAME} --ref-value ${SCALAR_VALUE} --ref-error ${SCALAR_ERROR})
add_test( NAME ${TEST_NAME}
COMMAND ${CHECK_CMD}
WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/${FULL_NAME}"
@@ -394,6 +414,6 @@ ENDFUNCTION()
# Print THE_MESSAGE if verbose configuration is enabled
FUNCTION ( MESSAGE_VERBOSE THE_MESSAGE )
IF ( QMC_VERBOSE_CONFIGURATION )
MESSAGE( ${THE_MESSAGE} )
MESSAGE(STATUS ${THE_MESSAGE})
ENDIF()
ENDFUNCTION()
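
The new `EQUILIBRATION` argument above is forwarded to `check_scalars.py` as `-e ${EQUIL}`, controlling how many early blocks are discarded before the statistical comparison against a reference value. A minimal Python sketch of that kind of check follows; it is a deliberate simplification (the real script also folds in the run's own statistical error), and the trace values are invented for illustration.

```python
def passes_scalar_check(samples, ref_value, ref_error, equil=2, nsigma=3):
    """Discard the first `equil` samples, then compare the mean to a reference.

    Passes when the post-equilibration mean lies within nsigma reference
    error bars of ref_value.
    """
    data = samples[equil:]
    mean = sum(data) / len(data)
    return abs(mean - ref_value) <= nsigma * ref_error

# An energy trace whose first two points are not yet equilibrated.
trace = [-10.0, -12.5, -13.1, -13.0, -13.2, -12.9]
```

Dropping the early blocks matters because VMC/DMC estimators are biased until the walkers equilibrate; including them would shift the mean away from the converged value.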


@@ -4,6 +4,7 @@
# MODULE_NAME - input, name of module to test for
# MODULE_PRESENT - output - True/False based on success of the import
FUNCTION (TEST_PYTHON_MODULE MODULE_NAME MODULE_PRESENT)
MESSAGE_VERBOSE("Checking import python module ${MODULE_NAME}")
EXECUTE_PROCESS(
COMMAND ${qmcpack_SOURCE_DIR}/tests/scripts/test_import.py ${MODULE_NAME}
OUTPUT_VARIABLE TMP_OUTPUT_VAR


@@ -14,7 +14,7 @@ FUNCTION( ADD_QE_TEST TESTNAME PROCS TEST_BINARY NPOOL WORKDIR TEST_INPUT)
IF ( HAVE_MPI )
ADD_TEST( NAME ${TESTNAME} COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} ${PROCS} ${MPIEXEC_PREFLAGS} ${TEST_BINARY} -npool ${NPOOL} -inp ${TEST_INPUT} )
ELSE()
ADD_TEST( NAME ${TESTNAME} COMMAND ${TEST_BINARY} -npool 1 ${TEST_INPUT} )
ADD_TEST( NAME ${TESTNAME} COMMAND ${TEST_BINARY} -npool 1 -inp ${TEST_INPUT} )
ENDIF()
SET_TESTS_PROPERTIES( ${TESTNAME} PROPERTIES ENVIRONMENT OMP_NUM_THREADS=1 PROCESSORS ${PROCS} PROCESSOR_AFFINITY TRUE WORKING_DIRECTORY ${WORKDIR} )
SET_PROPERTY( TEST ${TESTNAME} APPEND PROPERTY LABELS "converter" )
@@ -26,14 +26,14 @@ FUNCTION( RUN_QE_TEST BASE_NAME SRC_DIR PROCS1 PROCS2 PROCS3 NPOOL1 NPOOL2 NPOOL
SET( MY_WORKDIR ${CMAKE_CURRENT_BINARY_DIR}/${FULL_NAME} )
MESSAGE_VERBOSE("Adding test ${FULL_NAME}")
COPY_DIRECTORY( "${SRC_DIR}" "${MY_WORKDIR}" )
ADD_QE_TEST(${FULL_NAME}-scf ${PROCS1} ${QE_BIN}/pw.x ${NPOOL1} ${MY_WORKDIR} ${TEST_INPUT_PREFIX}-scf.in )
ADD_QE_TEST(${FULL_NAME}-scf ${PROCS1} ${QE_PW_DIR}/pw.x ${NPOOL1} ${MY_WORKDIR} ${TEST_INPUT_PREFIX}-scf.in )
IF(PROCS2 EQUAL 0)
ADD_QE_TEST(${FULL_NAME}-pw2x ${PROCS3} ${QE_BIN}/pw2qmcpack.x ${NPOOL3} ${MY_WORKDIR} ${TEST_INPUT_PREFIX}-pw2x.in )
ADD_QE_TEST(${FULL_NAME}-pw2x ${PROCS3} ${QE_PW2Q_DIR}/pw2qmcpack.x ${NPOOL3} ${MY_WORKDIR} ${TEST_INPUT_PREFIX}-pw2x.in )
SET_TESTS_PROPERTIES(${FULL_NAME}-pw2x PROPERTIES DEPENDS ${FULL_NAME}-scf)
ELSE(PROCS2 EQUAL 0)
ADD_QE_TEST(${FULL_NAME}-nscf ${PROCS2} ${QE_BIN}/pw.x ${NPOOL2} ${MY_WORKDIR} ${TEST_INPUT_PREFIX}-nscf.in )
ADD_QE_TEST(${FULL_NAME}-nscf ${PROCS2} ${QE_PW_DIR}/pw.x ${NPOOL2} ${MY_WORKDIR} ${TEST_INPUT_PREFIX}-nscf.in )
SET_TESTS_PROPERTIES(${FULL_NAME}-nscf PROPERTIES DEPENDS ${FULL_NAME}-scf)
ADD_QE_TEST(${FULL_NAME}-pw2x ${PROCS3} ${QE_BIN}/pw2qmcpack.x ${NPOOL3} ${MY_WORKDIR} ${TEST_INPUT_PREFIX}-pw2x.in )
ADD_QE_TEST(${FULL_NAME}-pw2x ${PROCS3} ${QE_PW2Q_DIR}/pw2qmcpack.x ${NPOOL3} ${MY_WORKDIR} ${TEST_INPUT_PREFIX}-pw2x.in )
SET_TESTS_PROPERTIES(${FULL_NAME}-pw2x PROPERTIES DEPENDS ${FULL_NAME}-nscf)
ENDIF(PROCS2 EQUAL 0)
ENDFUNCTION()
@@ -48,18 +48,4 @@ FUNCTION( SOFTLINK_H5 SOURCE TARGET PREFIX FILENAME TEST_NAME)
SET_PROPERTY( TEST LINK_${SOURCE}_TO_${TARGET} APPEND PROPERTY LABELS "converter" )
ENDFUNCTION()
FUNCTION( VERIFY_QE_PRESENT QE_BIN )
IF ( EXISTS "${QE_BIN}/pw.x" )
MESSAGE( STATUS "Found pw.x at ${QE_BIN}/pw.x")
ELSE()
MESSAGE( "QE_BIN was specified but could not find ${QE_BIN}/pw.x" )
MESSAGE( FATAL_ERROR "QE should be built ahead of QMCPACK. See qmcpack/external_codes/quantum_espresso/README for details on building patched version." )
ENDIF()
IF ( EXISTS "${QE_BIN}/pw2qmcpack.x" )
MESSAGE( STATUS "Found pw2qmcpack.x at ${QE_BIN}/pw2qmcpack.x")
ELSE()
MESSAGE( "QE_BIN was specified but could not find ${QE_BIN}/pw2qmcpack.x" )
MESSAGE( FATAL_ERROR "See qmcpack/external_codes/quantum_espresso/README for details on building patched version" )
ENDIF()
ENDFUNCTION()


@@ -2,7 +2,7 @@
FUNCTION( ADD_TEST_LABELS TEST_NAME TEST_LABELS )
SET(SUCCESS FALSE)
SET(TEST_LABELS_TEMP "")
EXECUTE_PROCESS(COMMAND ${PROJECT_SOURCE_DIR}/tests/scripts/test_labels.py ${TEST_NAME} ${QMC_CUDA} ${ENABLE_SOA} ${QMC_COMPLEX} ${QMC_MIXED_PRECISION} OUTPUT_VARIABLE TEST_LABELS_TEMP RESULT_VARIABLE SUCCESS)
EXECUTE_PROCESS(COMMAND ${PROJECT_SOURCE_DIR}/tests/scripts/test_labels.py ${TEST_NAME} ${QMC_CUDA} ${QMC_COMPLEX} ${QMC_MIXED_PRECISION} OUTPUT_VARIABLE TEST_LABELS_TEMP RESULT_VARIABLE SUCCESS)
#MESSAGE(" Label script return value: ${SUCCESS}")
IF( NOT ${SUCCESS} STREQUAL "0" )
MESSAGE("Warning: test labeling failed. Test labeling error output:\n${TEST_LABELS_TEMP}")


@@ -0,0 +1,8 @@
// Check if AVX512 is activated by the compiler
int main(int argc, char **argv)
{
#if !defined(__AVX512F__)
#error "AVX512 not found"
#endif
return 0;
}


@@ -0,0 +1,19 @@
// Test the compatibility between the compiler and the libstdc++ from GNU
#include <cstdio>
int main(int argc, char **argv)
{
// Unfortunately this check doesn't work for compilers <=v7.0 because _GLIBCXX_RELEASE appeared in the GCC 7.1 release.
// It is kept here as an example for the future.
#if ( defined(__INTEL_COMPILER) && ( _GLIBCXX_RELEASE < 7 ) )
#error You are using an Intel compiler. They obtain libstdc++ from a GNU compiler installation. For Intel compilers, you must use a GNU version >= 7. Found version <7.
#endif
// libstdc++ from GCC 8 is bad for Intel 19 in both C++14 and C++17
#if ( ( __INTEL_COMPILER == 1900 ) && ( _GLIBCXX_RELEASE > 7 ) )
#error You are using the Intel compiler v19 which obtains libstdc++ from a GNU compiler installation. You must use GNU version 7 with this Intel compiler. Found version >7. Alternatively (preferred route), use a more recent Intel compiler.
#endif
#if ( ( __INTEL_COMPILER == 1910 ) && ( _GLIBCXX_RELEASE > 9 ) )
#error You are using the Intel compiler v19.1 ("20") which obtains libstdc++ from a GNU compiler installation. Due to incompatibilities, you must use a GNU version <= 9 with this Intel compiler version. Found version >9.
#endif
return 0;
}


@@ -2,30 +2,33 @@
INCLUDE("${PROJECT_SOURCE_DIR}/CMake/test_labels.cmake")
# Runs unit tests
FUNCTION( ADD_UNIT_TEST TESTNAME TEST_BINARY )
MESSAGE_VERBOSE("Adding test ${TESTNAME}")
IF ( HAVE_MPI )
ADD_TEST(NAME ${TESTNAME} COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} 1 ${MPIEXEC_PREFLAGS} ${TEST_BINARY} ${ARGN})
#SET_TESTS_PROPERTIES( ${TESTNAME} PROPERTIES ENVIRONMENT OMP_NUM_THREADS=1 )
ELSE()
ADD_TEST(NAME ${TESTNAME} COMMAND ${TEST_BINARY} ${ARGN})
#SET_TESTS_PROPERTIES( ${TESTNAME} PROPERTIES ENVIRONMENT OMP_NUM_THREADS=1 )
ENDIF()
SET(TEST_LABELS_TEMP "")
ADD_TEST_LABELS( ${TESTNAME} TEST_LABELS_TEMP )
SET_PROPERTY(TEST ${TESTNAME} APPEND PROPERTY LABELS "unit")
ENDFUNCTION()
FUNCTION( ADD_UNIT_TEST TESTNAME PROCS THREADS TEST_BINARY )
message_verbose("Adding test ${TESTNAME}")
math( EXPR TOT_PROCS "${PROCS} * ${THREADS}" )
if ( HAVE_MPI )
add_test(NAME ${TESTNAME} COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} ${PROCS} ${MPIEXEC_PREFLAGS} ${TEST_BINARY} ${ARGN})
set( TEST_ADDED TRUE )
else()
if ( ( ${PROCS} STREQUAL "1" ) )
add_test(NAME ${TESTNAME} COMMAND ${TEST_BINARY} ${ARGN})
set( TEST_ADDED TRUE )
else()
message_verbose("Disabling test ${TESTNAME} (building without MPI)")
endif()
endif()
FUNCTION( ADD_MPI_UNIT_TEST TESTNAME TEST_BINARY PROC_COUNT )
MESSAGE_VERBOSE("Adding test ${TESTNAME}")
IF ( HAVE_MPI )
ADD_TEST(NAME ${TESTNAME} COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} ${PROC_COUNT} ${MPIEXEC_PREFLAGS} ${TEST_BINARY} ${ARGN})
# Tests should be able to deal with any number of threads, but MPI-aware unit tests
# are not guaranteed to handle that yet.
SET_TESTS_PROPERTIES( ${TESTNAME} PROPERTIES ENVIRONMENT OMP_NUM_THREADS=1 )
SET(TEST_LABELS_TEMP "")
ADD_TEST_LABELS( ${TESTNAME} TEST_LABELS_TEMP )
SET_PROPERTY(TEST ${TESTNAME} APPEND PROPERTY LABELS "unit")
ENDIF()
ENDFUNCTION()
if (TEST_ADDED)
set_tests_properties( ${TESTNAME} PROPERTIES
PROCESSORS ${TOT_PROCS}
ENVIRONMENT OMP_NUM_THREADS=${THREADS}
PROCESSOR_AFFINITY TRUE )
if (QMC_CUDA OR ENABLE_CUDA OR ENABLE_OFFLOAD)
set_tests_properties(${TESTNAME} PROPERTIES RESOURCE_LOCK exclusively_owned_gpus)
endif()
endif()
set(TEST_LABELS_TEMP "")
add_test_labels( ${TESTNAME} TEST_LABELS_TEMP )
set_property(TEST ${TESTNAME} APPEND PROPERTY LABELS "unit")
ENDFUNCTION()


@@ -18,7 +18,7 @@ include(CMakeDependentOption)
PROJECT(qmcpack)
SET(QMCPACK_VERSION_MAJOR 3)
SET(QMCPACK_VERSION_MINOR 10)
SET(QMCPACK_VERSION_MINOR 11)
SET(QMCPACK_VERSION_PATCH 0)
SET(QMCPACK_VERSION "${QMCPACK_VERSION_MAJOR}.${QMCPACK_VERSION_MINOR}.${QMCPACK_VERSION_PATCH}")
@@ -29,57 +29,11 @@ SET (PROJECT_CMAKE ${qmcpack_SOURCE_DIR}/CMake)
SET(QMCPACK_UNIT_TEST_DIR ${qmcpack_BINARY_DIR}/tests/bin)
######################################################################
# Verify Python3 available
######################################################################
INCLUDE(CMake/python.cmake)
find_package(Python3)
IF ( NOT Python3_FOUND )
MESSAGE( FATAL_ERROR "Could not find required python3" )
ENDIF ( NOT Python3_FOUND )
######################################################################
# Check if PySCF is available within Python
######################################################################
TEST_PYTHON_MODULE(pyscf HAVE_PYSCF)
IF (NOT HAVE_PYSCF)
MESSAGE( STATUS "Unable to import PySCF python module. PySCF tests will not be run." )
ELSE()
MESSAGE( STATUS "Successfully imported PySCF python module." )
ENDIF()
######################################################################
# Verify QE executables present if QE_BIN specified
######################################################################
IF ( DEFINED QE_BIN )
INCLUDE(CMake/run_qe.cmake)
VERIFY_QE_PRESENT( ${QE_BIN} )
ENDIF()
INCLUDE(CMake/macros.cmake)
######################################################################
# CTest
######################################################################
INCLUDE( "${qmcpack_SOURCE_DIR}/CMake/macros.cmake" )
SET( DROP_METHOD "http" )
SET( DROP_SITE "cdash.qmcpack.org" )
SET( DROP_LOCATION "/CDash/submit.php?project=QMCPACK" )
SET( TRIGGER_SITE "" )
SET( DROP_SITE_CDASH TRUE )
# Increase timeout per test over the default of 1500 seconds (25 minutes)
SET( DART_TESTING_TIMEOUT 3600 CACHE STRING "Maximum time for one test")
ENABLE_TESTING()
INCLUDE( CTest )
OPTION(QMC_SYMLINK_TEST_FILES "Use symbolic links for test files to save space. Set to false to copy files instead." ON)
IF (QMC_SYMLINK_TEST_FILES)
SET(SYMLINK_MSG "Using symbolic links for large test files may cause test failures if the build is installed on a separate filesystem from the source. For example, Titan at OLCF.")
ELSE()
SET(SYMLINK_MSG "Copying large test files uses more disk space than using symbolic links.")
ENDIF()
MESSAGE(STATUS "QMC_SYMLINK_TEST_FILES = ${QMC_SYMLINK_TEST_FILES}. ${SYMLINK_MSG}")
# The following option disables the extremely slow setup of full system run tests
# This is a developer option
# It offers a more reasonable compile debug loop if CMakeLists.txt files are being changed
@ -92,6 +46,57 @@ MESSAGE(STATUS "QMC_SYMLINK_TEST_FILES = ${QMC_SYMLINK_TEST_FILES}. ${SYMLINK_M
OPTION(QMC_NO_SLOW_CUSTOM_TESTING_COMMANDS "Disable the slow cmake custom commands for integration tests." OFF)
MARK_AS_ADVANCED(QMC_NO_SLOW_CUSTOM_TESTING_COMMANDS)
SET( DROP_METHOD "http" )
SET( DROP_SITE "cdash.qmcpack.org" )
SET( DROP_LOCATION "/CDash/submit.php?project=QMCPACK" )
SET( TRIGGER_SITE "" )
SET( DROP_SITE_CDASH TRUE )
# Increase timeout per test over the default of 1500 seconds (25 minutes)
SET( DART_TESTING_TIMEOUT 3600 CACHE STRING "Maximum time for one test")
ENABLE_TESTING()
INCLUDE( CTest )
OPTION(QMC_SYMLINK_TEST_FILES "Use symbolic links for test files to save space. Set to false to copy files instead." ON)
IF (QMC_SYMLINK_TEST_FILES)
SET(SYMLINK_MSG "Using symbolic links for large test files may cause test failures if the build is installed on a separate filesystem from the source.")
ELSE()
SET(SYMLINK_MSG "Copying large test files uses more disk space than using symbolic links.")
ENDIF()
MESSAGE(STATUS "QMC_SYMLINK_TEST_FILES = ${QMC_SYMLINK_TEST_FILES}. ${SYMLINK_MSG}")
######################################################################
# Verify Python3 available
######################################################################
INCLUDE(CMake/python.cmake)
find_package(Python3)
IF ( NOT Python3_FOUND )
MESSAGE( FATAL_ERROR "Could not find required python3" )
ENDIF ( NOT Python3_FOUND )
######################################################################
# Check if PySCF is available
######################################################################
IF( NOT QMC_NO_SLOW_CUSTOM_TESTING_COMMANDS )
TEST_PYTHON_MODULE(pyscf HAVE_PYSCF)
IF (NOT HAVE_PYSCF)
MESSAGE( STATUS "Unable to import PySCF python module. PySCF tests will not be run." )
ELSE()
MESSAGE( STATUS "Successfully imported PySCF python module." )
ENDIF()
ENDIF()
######################################################################
# Check if QE executables available at QE_BIN or on the PATH
######################################################################
IF (NOT QMC_NO_SLOW_CUSTOM_TESTING_COMMANDS)
INCLUDE("CMake/FindQE.cmake")
IF (QE_FOUND)
MESSAGE( STATUS "Found and using patched Quantum ESPRESSO (QE) with pw2qmcpack.x at ${QE_PW_DIR}" )
ELSE(QE_FOUND)
MESSAGE( STATUS "Did not find a patched Quantum ESPRESSO (QE) distribution with pw2qmcpack.x. QE tests will not be run." )
ENDIF(QE_FOUND)
ENDIF()
######################################################################
# Build level
@ -110,20 +115,14 @@ MESSAGE( STATUS "CMAKE_BUILD_TYPE is ${CMAKE_BUILD_TYPE}")
SET (LIBRARY_OUTPUT_PATH ${qmcpack_BINARY_DIR}/lib CACHE PATH "Single output directory for building all libraries.")
SET (EXECUTABLE_OUTPUT_PATH ${qmcpack_BINARY_DIR}/bin CACHE PATH "Single output directory for building all executables.")
######################################################################
# build include and library paths required for the projects
# For XYZ library, set an environment as
# export XYZ_HOME='root-of-the-library'
# XYZ_HOME/include
# XYZ_HOME/lib
# IF XYZ library is found, XYZ_FOUND is set to true
#
# find_package is used with cmake 2.8 and higher
######################################################################
#if(APPLE)
# set(QMC_INCLUDE_PATHS ${QMC_INCLUDE_PATHS} /sw/include)
# set(QMC_LIBRARY_PATHS ${QMC_LIBRARY_PATHS} /sw/lib)
#endif(APPLE)
##################################
# Set C++ standard used by QMCPACK
##################################
SET(QMC_CXX_STANDARD 14 CACHE STRING "QMCPACK C++ language standard")
IF(NOT QMC_CXX_STANDARD EQUAL 14)
MESSAGE(WARNING "C++14 is the only language standard officially supported by this QMCPACK version. "
"Using other versions of the C++ standard is unsupported and done entirely at user's own risk.")
ENDIF()
######################################################################
# Set the compiler-time parameters
@ -145,6 +144,7 @@ SET(OHMMS_DIM 3 CACHE STRING "Select physical dimension")
SET(OHMMS_INDEXTYPE int)
MESSAGE(STATUS "defining the floating point precision")
SET(OHMMS_PRECISION_FULL double)
IF(QMC_CUDA)
SET(QMC_MIXED_PRECISION 1 CACHE BOOL "Enable/disable mixed precision")
SET(OHMMS_PRECISION double)
@ -180,7 +180,6 @@ OPTION(ENABLE_GCOV "Enable code coverage" OFF)
OPTION(QMC_MPI "Enable/disable MPI" ON)
OPTION(QMC_OMP "Enable/disable OpenMP" ON)
OPTION(QMC_COMPLEX "Build for complex binary" OFF)
SET(ENABLE_SOA 1 CACHE BOOL "Enable/disable SoA optimization")
SET(QMC_CUDA 0 CACHE BOOL "Build with GPU support through CUDA")
OPTION(ENABLE_CUDA "Build with the second generation of GPU support through CUDA (production quality for AFQMC, experimental for real space)" OFF)
OPTION(ENABLE_HIP "Build with GPU support through HIP" OFF)
@ -243,30 +242,46 @@ CHECK_INCLUDE_FILE(unistd.h HAVE_UNISTD_H)
# BUILD_QMCTOOLS build utility programs
# MPIP_PROFILE profile mpi performance
######################################################################
SET(BUILD_UNIT_TESTS 1 CACHE BOOL "Build unit tests")
SET(BUILD_LMYENGINE_INTERFACE 1 CACHE BOOL "Build LMY engine")
OPTION(BUILD_UNIT_TESTS "Build unit tests" ON)
OPTION(BUILD_LMYENGINE_INTERFACE "Build LMY engine" ON)
IF (QMC_CUDA AND BUILD_LMYENGINE_INTERFACE)
MESSAGE(STATUS "LMY engine is not compatible with CUDA build! Disabling LMY engine")
SET(BUILD_LMYENGINE_INTERFACE 0)
SET(BUILD_LMYENGINE_INTERFACE OFF)
ENDIF()
IF(MIXED_PRECISION AND BUILD_LMYENGINE_INTERFACE)
MESSAGE(STATUS "LMY engine is not compatible with CPU mixed precision build! Disabling LMY engine")
SET(BUILD_LMYENGINE_INTERFACE 0)
SET(BUILD_LMYENGINE_INTERFACE OFF)
ENDIF()
SET(BUILD_AFQMC 0 CACHE BOOL "Build with AFQMC")
SET(BUILD_AFQMC_WITH_NCCL 0 CACHE BOOL "Build AFQMC with NCCL library.")
OPTION(BUILD_AFQMC "Build with AFQMC" OFF)
OPTION(BUILD_AFQMC_WITH_NCCL "Build AFQMC with NCCL library." OFF)
# AFQMC requires MPI.
If (BUILD_AFQMC AND NOT QMC_MPI)
MESSAGE(FATAL_ERROR "AFQMC requires building with MPI (QMC_MPI=1). Set BUILD_AFQMC=0 or configure MPI.")
ENDIF()
SET(BUILD_FCIQMC 0 CACHE BOOL "Build with FCIQMC")
#SET(BUILD_QMCTOOLS 1 CACHE BOOL "Build tools for QMCPACK")
#SET(MPIP_PROFILE 0 CACHE BOOL "Build with mpip for mpi profile")
#SET(BUILD_QMCTOOLS 1)
#SET(MPIP_PROFILE 0)
OPTION(BUILD_FCIQMC "Build with FCIQMC" OFF)
OPTION(QMC_BUILD_STATIC "Link to static libraries" OFF)
OPTION(ENABLE_TIMERS "Enable internal timers" ON)
OPTION(ENABLE_STACKTRACE "Enable use of boost::stacktrace" OFF)
OPTION(USE_VTUNE_API "Enable use of VTune ittnotify APIs" OFF)
CMAKE_DEPENDENT_OPTION(USE_VTUNE_TASKS "USE VTune ittnotify task annotation" OFF "ENABLE_TIMERS AND USE_VTUNE_API" OFF)
# CMake note - complex conditionals in cmake_dependent_option must have spaces around parentheses
CMAKE_DEPENDENT_OPTION(USE_NVTX_API "Enable/disable NVTX regions in CUDA code." OFF "ENABLE_TIMERS AND ( QMC_CUDA OR ENABLE_CUDA )" OFF)
######################################################################
# Sanitizer options
######################################################################
# Add optional sanitizers ASAN, UBSAN, MSAN
SET(VALID_SANITIZERS "none" "asan" "ubsan" "tsan" "msan")
SET(ENABLE_SANITIZER "none" CACHE STRING "none,asan,ubsan,tsan,msan")
SET_PROPERTY(CACHE ENABLE_SANITIZER PROPERTY STRINGS ${VALID_SANITIZERS})
# Perform sanitizer option check, only works in debug mode
IF(NOT ENABLE_SANITIZER IN_LIST VALID_SANITIZERS)
MESSAGE( FATAL_ERROR "Invalid -DENABLE_SANITIZER=${ENABLE_SANITIZER}, value must be one of ${VALID_SANITIZERS}")
ELSE()
MESSAGE( STATUS "Enable sanitizer ENABLE_SANITIZER=${ENABLE_SANITIZER}" )
ENDIF()
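The IN_LIST check above rejects any sanitizer name outside VALID_SANITIZERS at configure time. A minimal shell sketch of the same validation (illustrative only, not QMCPACK's own code):

```shell
# Mirror the CMake ENABLE_SANITIZER validation: accept only known names.
VALID_SANITIZERS="none asan ubsan tsan msan"

check_sanitizer() {
  case " $VALID_SANITIZERS " in
    *" $1 "*) echo "Enable sanitizer ENABLE_SANITIZER=$1" ;;
    *) echo "Invalid -DENABLE_SANITIZER=$1, value must be one of: $VALID_SANITIZERS" >&2
       return 1 ;;
  esac
}
```

For example, `check_sanitizer asan` prints the status line, while `check_sanitizer bogus` reports an error and returns nonzero.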
######################################################################
# Install options
@ -318,12 +333,6 @@ ELSE(CMAKE_TOOLCHAIN_FILE)
SET(ENABLE_OPENMP 0)
SET(HAVE_ACML 0)
######################################################
# Try to identify CPU identity
######################################################
SET(CPU_IDENTITY "generic")
#INCLUDE(${PROJECT_CMAKE}/CheckProcessorID.cmake)
#------------------------
# On Cray's machine
#------------------------
@ -355,13 +364,6 @@ ELSE(CMAKE_TOOLCHAIN_FILE)
ENDIF()
ENDIF()
#--------------------------------
# C++ 14 is the minimum standard
#--------------------------------
set(CMAKE_CXX_STANDARD 14)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)
#------------------------------------
# Identify the compiler -- This serves only to deal with case where only C or CXX is set, refactor?
#------------------------------------
@ -370,10 +372,14 @@ ELSE(CMAKE_TOOLCHAIN_FILE)
SET( COMPILER GNU )
ELSEIF( (CMAKE_CXX_COMPILER_ID MATCHES "XL") OR ( CMAKE_C_COMPILER_ID MATCHES "XL") )
SET( COMPILER IBM )
ELSEIF( (CMAKE_CXX_COMPILER_ID MATCHES "IntelLLVM") OR (CMAKE_C_COMPILER_ID MATCHES "IntelLLVM") )
SET( COMPILER Clang )
ELSEIF( (CMAKE_CXX_COMPILER_ID MATCHES "Intel") OR (CMAKE_C_COMPILER_ID MATCHES "Intel") )
SET( COMPILER Intel )
ELSEIF( (CMAKE_CXX_COMPILER_ID MATCHES "PGI") OR (CMAKE_C_COMPILER_ID MATCHES "PGI") )
SET( COMPILER PGI )
ELSEIF( (CMAKE_CXX_COMPILER_ID MATCHES "NVHPC") OR (CMAKE_C_COMPILER_ID MATCHES "NVHPC") )
SET( COMPILER PGI )
ELSEIF( (CMAKE_CXX_COMPILER_ID MATCHES "Cray") OR (CMAKE_C_COMPILER_ID MATCHES "Cray") )
SET( COMPILER Cray )
ELSEIF( (CMAKE_CXX_COMPILER_ID MATCHES "Clang") OR (CMAKE_C_COMPILER_ID MATCHES "Clang") )
@ -405,19 +411,29 @@ ELSE(CMAKE_TOOLCHAIN_FILE)
MESSAGE(WARNING "No default file for compiler (${COMPILER})")
ENDIF()
#--------------------------------------------------------------------------
# Check that a C++ compiler is compatible with the underlying libstdc++
#--------------------------------------------------------------------------
include(CMake/Testlibstdc++.cmake)
#---------------------------------------------------
# Check that a C++ 14 standard library is configured
#---------------------------------------------------
include(CMake/TestCxx14Library.cmake)
ENDIF(CMAKE_TOOLCHAIN_FILE)
###############################################
# Set C++ minimum standard and run basic checks
###############################################
set(CMAKE_CXX_STANDARD ${QMC_CXX_STANDARD})
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)
#-----------------------------------------------------------------------
# Check that a C++ compiler is compatible with the underlying libstdc++
#-----------------------------------------------------------------------
include(CMake/Testlibstdc++.cmake)
#---------------------------------------------------------
# Check that a C++ standard library is configured properly
#---------------------------------------------------------
IF(QMC_CXX_STANDARD GREATER_EQUAL 17)
include(CMake/TestCxx17Library.cmake)
ELSE()
include(CMake/TestCxx14Library.cmake)
ENDIF()
IF (ENABLE_GCOV)
IF (NOT GCOV_SUPPORTED)
MESSAGE(FATAL_ERROR "ENABLE_GCOV was specified but compiler does not support GCC coverage flag")
@ -428,6 +444,31 @@ IF (BUILD_AFQMC AND NOT APPLE)
LINK_LIBRARIES("rt")
ENDIF()
#-------------------------------------------------------------------------------
# Check SIMD alignment for CPU only
#-------------------------------------------------------------------------------
# This is intentionally placed before adding OpenMP offload compile options
# to avoid contamination from device compilation pass.
# When '-march=skylake-avx512 -fopenmp-targets=nvptx64 -march=sm_70' is added
# for Clang, the source code is parsed twice for both host and offload targets.
# A trap for macro __AVX512F__ always fails because the offload pass doesn't
# carry '-march=skylake-avx512' but only takes '-march=sm_70'.
#-------------------------------------------------------------------------------
include(CMake/CheckSIMDAlignment.cmake)
set(QMC_SIMD_ALIGNMENT ${default_alignment} CACHE STRING "Cache/SIMD alignment in bytes")
math(EXPR alignment_remainder "${QMC_SIMD_ALIGNMENT} % ${default_alignment}")
if (alignment_remainder)
message(FATAL_ERROR "QMC_SIMD_ALIGNMENT must be a multiple of ${default_alignment}. Bad cached value is ${QMC_SIMD_ALIGNMENT}")
endif()
message(STATUS "Setting QMC_SIMD_ALIGNMENT to ${QMC_SIMD_ALIGNMENT}")
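The alignment check above requires the cached QMC_SIMD_ALIGNMENT to be a multiple of the platform default detected by CheckSIMDAlignment.cmake. A shell sketch of the same arithmetic, with 16 as a hypothetical stand-in for the detected default:

```shell
# Hypothetical default; in CMake this comes from CheckSIMDAlignment.cmake.
default_alignment=16

check_alignment() {
  if [ $(( $1 % default_alignment )) -ne 0 ]; then
    echo "QMC_SIMD_ALIGNMENT must be a multiple of ${default_alignment}. Bad cached value is $1" >&2
    return 1
  fi
  echo "Setting QMC_SIMD_ALIGNMENT to $1"
}
```

So `check_alignment 64` succeeds, while `check_alignment 24` fails because 24 is not a multiple of 16.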
#---------------------------------------------------------
# Set up OpenMP offload compile options
#---------------------------------------------------------
if (ENABLE_OFFLOAD AND DEFINED OPENMP_OFFLOAD_COMPILE_OPTIONS)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OPENMP_OFFLOAD_COMPILE_OPTIONS}")
endif()
#-------------------------------------------------------------------
# check MPI installation
#-------------------------------------------------------------------
@ -508,7 +549,7 @@ ENDIF(QMC_MPI)
# list of libraries to be linked with the main applications using I/O
#-------------------------------------------------------------------
# check lapack/blas
# Check LAPACK/BLAS
#-------------------------------------------------------------------
# Add Math::BLAS_LAPACK target
ADD_LIBRARY(Math::BLAS_LAPACK INTERFACE IMPORTED)
@ -648,7 +689,7 @@ IF(HDF5_FOUND)
ENDIF(ENABLE_PHDF5)
IF(HAVE_MPI AND NOT ENABLE_PHDF5)
MESSAGE(WARNING "MPI builds may have potential performance loss not using parallel HDF5!")
MESSAGE(WARNING "MPI builds may have performance loss by not using parallel HDF5! (Safe to ignore for workstation builds).")
ENDIF()
IF ( CMAKE_BUILD_TYPE AND HDF5_LIBRARIES_DEBUG )
@ -718,22 +759,22 @@ IF(QMC_CUDA OR ENABLE_CUDA)
SET(HAVE_CUDA 1)
MESSAGE(" CUDA_NVCC_FLAGS=${CUDA_NVCC_FLAGS}")
ELSE(QMC_CUDA OR ENABLE_CUDA)
MESSAGE(STATUS "Disabling CUDA")
MESSAGE(STATUS "CUDA disabled")
ENDIF(QMC_CUDA OR ENABLE_CUDA)
OPTION(USE_NVTX_API "Enable/disable NVTX regions in CUDA code." OFF)
IF(USE_NVTX_API)
IF(HAVE_CUDA)
FIND_LIBRARY(NVTX_API_LIB
NAME nvToolsExt
HINTS ${CUDA_TOOLKIT_ROOT_DIR}
PATH_SUFFIXES lib lib64)
IF(NOT NVTX_API_LIB)
MESSAGE(FATAL_ERROR "USE_NVTX_API set but NVTX_API_LIB not found")
ENDIF(NOT NVTX_API_LIB)
MESSAGE("CUDA nvToolsExt library: ${NVTX_API_LIB}")
LINK_LIBRARIES(${NVTX_API_LIB})
ENDIF(HAVE_CUDA)
MESSAGE(STATUS "Enabling use of CUDA NVTX APIs")
FIND_LIBRARY(NVTX_API_LIB
NAME nvToolsExt
HINTS ${CUDA_TOOLKIT_ROOT_DIR}
PATH_SUFFIXES lib lib64)
IF(NOT NVTX_API_LIB)
MESSAGE(FATAL_ERROR "USE_NVTX_API set but NVTX_API_LIB not found")
ENDIF(NOT NVTX_API_LIB)
MESSAGE("CUDA nvToolsExt library: ${NVTX_API_LIB}")
LINK_LIBRARIES(${NVTX_API_LIB})
ELSE()
MESSAGE(STATUS "CUDA NVTX APIs disabled")
ENDIF(USE_NVTX_API)
#-------------------------------------------------------------------
@ -771,25 +812,20 @@ ENDIF(ENABLE_HIP)
#include qmcpack/src build/src
INCLUDE_DIRECTORIES( ${PROJECT_SOURCE_DIR}/src ${PROJECT_BINARY_DIR}/src)
IF (USE_VTUNE_TASKS)
IF (NOT ENABLE_TIMERS)
MESSAGE(FATAL_ERROR "USE_VTUNE_TASKS is set, but timers are not enabled. Set ENABLE_TIMERS=ON.")
ENDIF()
SET(USE_VTUNE_API 1)
ENDIF()
IF (USE_VTUNE_API)
include(CheckIncludeFileCXX)
CHECK_INCLUDE_FILE_CXX(ittnotify.h HAVE_ITTNOTIFY_H)
IF (NOT HAVE_ITTNOTIFY_H)
MESSAGE(FATAL_ERROR "USE_VTUNE_API is defined, but the ittnotify.h include file is not found. Check that the correct include directory is present in CMAKE_CXX_FLAGS.")
ENDIF()
MESSAGE(STATUS "Enabling use of VTune ittnotify APIs")
FIND_PATH(VTUNE_ITTNOTIFY_INCLUDE_DIR ittnotify.h HINTS ${VTUNE_ROOT} $ENV{VTUNE_ROOT} PATH_SUFFIXES include REQUIRED)
MESSAGE(STATUS "Found VTUNE_ITTNOTIFY_INCLUDE_DIR ${VTUNE_ITTNOTIFY_INCLUDE_DIR}")
FIND_LIBRARY(VTUNE_ITTNOTIFY_LIBRARY ittnotify HINTS ${VTUNE_ROOT} $ENV{VTUNE_ROOT} PATH_SUFFIXES lib64 lib REQUIRED)
MESSAGE(STATUS "Found VTUNE_ITTNOTIFY_LIBRARY ${VTUNE_ITTNOTIFY_LIBRARY}")
FIND_LIBRARY(VTUNE_ITTNOTIFY_LIBRARY ittnotify)
IF (NOT VTUNE_ITTNOTIFY_LIBRARY)
MESSAGE(FATAL_ERROR "USE_VTUNE_API is defined, but the ittnotify library is not found. Check that correct library path is present in CMAKE_LIBRARY_PATH.")
INCLUDE_DIRECTORIES(${VTUNE_ITTNOTIFY_INCLUDE_DIR})
LINK_LIBRARIES(${VTUNE_ITTNOTIFY_LIBRARY})
IF (USE_VTUNE_TASKS)
MESSAGE(STATUS "VTune ittnotify tasks enabled")
ENDIF()
LINK_LIBRARIES("${VTUNE_ITTNOTIFY_LIBRARY}")
ELSE()
MESSAGE(STATUS "VTune ittnotify APIs disabled")
ENDIF()
OPTION(QMC_EXP_THREADING "Experimental non openmp threading models" OFF)
@ -849,6 +885,30 @@ IF(ENABLE_GCOV)
MESSAGE(STATUS "GCOV is enabled")
ENDIF()
# SETUP SANITIZERS FLAGS
IF( NOT "${ENABLE_SANITIZER}" STREQUAL "none")
IF( NOT ${COMPILER} MATCHES "GNU" AND NOT ${COMPILER} MATCHES "Clang")
MESSAGE(FATAL_ERROR "-DENABLE_SANITIZER=${ENABLE_SANITIZER} only works with GNU or Clang compilers")
ENDIF()
IF( "${ENABLE_SANITIZER}" STREQUAL "asan" )
SET(CMAKE_CXX_FLAGS_SAN "-fsanitize=address -fno-optimize-sibling-calls -fsanitize-address-use-after-scope -fno-omit-frame-pointer"
CACHE STRING "AddressSanitizer C++ compiler builds." FORCE)
ELSEIF( "${ENABLE_SANITIZER}" STREQUAL "ubsan" )
SET(CMAKE_CXX_FLAGS_SAN "-fsanitize=undefined"
CACHE STRING "UndefinedBehaviorSanitizer C++ compiler builds." FORCE)
ELSEIF( "${ENABLE_SANITIZER}" STREQUAL "msan" )
SET(CMAKE_CXX_FLAGS_SAN "-fsanitize=memory"
CACHE STRING "MemorySanitizer C++ compiler builds." FORCE)
ELSEIF( "${ENABLE_SANITIZER}" STREQUAL "tsan" )
SET(CMAKE_CXX_FLAGS_SAN "-fsanitize=thread"
CACHE STRING "ThreadSanitizer C++ compiler builds." FORCE)
ENDIF()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_SAN}")
ENDIF()
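The flag selection above can be mirrored outside CMake, e.g. when reproducing a sanitizer build by hand. A sketch (flags copied from the CMake branches above; the function name is illustrative):

```shell
# Map a sanitizer name to the compiler flags used by the CMake logic above.
sanitizer_flags() {
  case "$1" in
    asan)  echo "-fsanitize=address -fno-optimize-sibling-calls -fsanitize-address-use-after-scope -fno-omit-frame-pointer" ;;
    ubsan) echo "-fsanitize=undefined" ;;
    msan)  echo "-fsanitize=memory" ;;
    tsan)  echo "-fsanitize=thread" ;;
    none)  echo "" ;;
    *)     echo "unknown sanitizer: $1" >&2; return 1 ;;
  esac
}
```

A typical use would be appending to the compile flags, e.g. `CXXFLAGS="$CXXFLAGS $(sanitizer_flags ubsan)"`.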
############################################################
# FLAGS at the project level
############################################################

LICENSE

@ -1,17 +1,3 @@
Important Notice for qmcpack developers and end-users:
i) The original programs developed by Jeongnim Kim and her collaborators
are distributed under UIUC/NCSA Open Source License (below).
ii) Some packages/features are not available under open source license and require
separate license agreements with the authors. Contact the responsible authors.
iii) autoconf/automake scripts are distributed under GPL (see COPYING).
iv) The sources derived from any package distributed under GPL contain
explicit acknowledgments of the original works and are distributed under GPL.
-------------
University of Illinois/NCSA Open Source License
Copyright (c) 2003, University of Illinois Board of Trustees.

README.md

@ -1,7 +1,24 @@
# Getting and building QMCPACK
![QMCPACK Logo](docs/figs/QMCPACK_logo.png)
[![License](https://img.shields.io/badge/License-UIUC/NCSA-blue.svg)](https://opensource.org/licenses/NCSA)
[![Documentation Status](https://readthedocs.org/projects/qmcpack/badge/?version=develop)](https://qmcpack.readthedocs.io/en/develop/?badge=develop)
[![GitHub release](https://img.shields.io/github/release/QMCPACK/qmcpack/all.svg)](https://github.com/QMCPACK/qmcpack/releases)
[![Spack Version](https://img.shields.io/spack/v/qmcpack.svg)](https://spack.readthedocs.io/en/latest/package_list.html#qmcpack)
[![GitHub Actions CI](https://github.com/QMCPACK/qmcpack/actions/workflows/ci-github-actions.yaml/badge.svg)](https://github.com/QMCPACK/qmcpack/actions/workflows/ci-github-actions.yaml)
[![codecov-deterministic](https://codecov.io/gh/QMCPACK/qmcpack/branch/develop/graph/badge.svg?token=35D0u6GlBm)](https://codecov.io/gh/QMCPACK/qmcpack)
QMCPACK is an open-source production-level many-body ab initio Quantum Monte Carlo code for computing the electronic structure of
atoms, molecules, 2D nanomaterials and solids. The solid-state capabilities include metallic systems as well as insulators.
QMCPACK is expected to run well on workstations through to the latest generation supercomputers. Besides high performance,
particular emphasis is placed on code quality and reproducibility.
# Obtaining and installing QMCPACK
Obtain the latest release from https://github.com/QMCPACK/qmcpack/releases or clone the development source from
https://github.com/QMCPACK/qmcpack.
https://github.com/QMCPACK/qmcpack. A full installation guide and steps to perform an initial QMC calculation are given in the
[extensive online documentation for QMCPACK](https://qmcpack.readthedocs.io/en/develop/index.html).
# Prerequisites
@ -15,47 +32,42 @@
* MPI, parallel library. Optional, but a near requirement for production calculations.
* Python3. Older versions are not supported as of January 2020.
We aim to support open source compilers and libraries released within two years of each QMCPACK release. Use of software versions over
two years old may work but is discouraged and untested. Proprietary compilers (Intel, PGI) are generally supported over the same
period but may require use of an exact version. We also aim to support the standard software environments on Summit at OLCF, Theta
at ALCF, and Cori at NERSC. Use of the most recently released compilers and library versions is particularly encouraged for highest
performance and easiest configuration.
We aim to support open source compilers and libraries released within two years of each QMCPACK release. Use of software versions
over two years old may work but is discouraged and untested. Proprietary compilers (Intel, PGI) are generally supported over the
same period but may require use of an exact version. We also aim to support the standard software environments on machines such as
Summit at OLCF, Theta at ALCF, and Cori at NERSC. Use of the most recently released compilers and library versions is particularly
encouraged for highest performance and easiest configuration.
Nightly testing currently includes the following software versions on x86:
* Compilers
* GCC 10.2.0, 7.3.0
* Clang/LLVM 10.0.1, 6.0.1
* GCC 10.2.0, 8.3.0
* Clang/LLVM 11.0.1
* Intel 19.1.1.217 configured to use C++ library from GCC 8.3.0
* PGI 19.4 configured to use C++ library from GCC 8.3.0
* Boost 1.74.0, 1.68.0
* HDF5 1.10.5, 1.8.19
* PGI/NVIDIA HPC SDK 20.9 configured to use C++ library from GCC 8.3.0
* Boost 1.75.0, 1.68.0
* HDF5 1.12.0, 1.8.19
* FFTW 3.3.8, 3.3.4
* CMake 3.18.2, 3.12.1
* CMake 3.19.5, 3.13.2
* MPI
* OpenMPI 4.0.4, 3.1.2
* OpenMPI 4.1.0, 3.1.6
* Intel MPI 19.1.1.217
* CUDA 10.2.89
* CUDA 11.2.1
Workflow tests are performed with Quantum Espresso v6.4.1 and PySCF v1.7.4. These check trial wavefunction generation and
Workflow tests are performed with Quantum Espresso v6.7.0 and PySCF v1.7.5. These check trial wavefunction generation and
conversion through to actual QMC runs.
On a developmental basis we also check the latest Clang development version, AMD AOMP and Intel OneAPI compilers.
On a developmental basis we also check the latest Clang and GCC development versions, AMD AOMP and Intel OneAPI compilers.
# Building with CMake
The build system for QMCPACK is based on CMake. It will auto-configure
based on the detected compilers and libraries. Previously QMCPACK made
extensive use of toolchains, but the system has since been updated to
eliminate the use of toolchain files for most cases. The build
system works with GNU, Intel, and IBM XLC compilers. Specific compile options
can be specified either through specific environment or CMake
variables. When the libraries are installed in standard locations,
e.g., /usr, /usr/local, there is no need to set environment or CMake
variables for the packages.
The build system for QMCPACK is based on CMake. It will auto-configure based on the detected compilers and libraries. Previously
QMCPACK made extensive use of toolchains, but the system has since been updated to eliminate the use of toolchain files for most
cases. Specific compile options can be specified either through specific environment or CMake variables. When the libraries are
installed in standard locations, e.g., /usr, /usr/local, there is no need to set environment or CMake variables for the packages.
See the manual linked at https://qmcpack.readthedocs.io/en/develop/ and https://www.qmcpack.org/documentation or buildable using
sphinx from the sources in docs/.
sphinx from the sources in docs/. A PDF version is still available at https://qmcpack.readthedocs.io/_/downloads/en/develop/pdf/
## Quick build
@ -94,9 +106,9 @@ make -j 8
## Set the environment
A number of environment variables affect the build. In particular
A number of environment variables affect the build. In particular,
they can control the default paths for libraries, the default
compilers, etc. The list of environment variables is given below:
compilers, etc. The list of environment variables is given below:
| Environment variable | Description |
|----------------------|-------------|
@ -266,31 +278,22 @@ Add extra include directories:
# Testing and validation of QMCPACK
Before using QMCPACK we highly encourage tests to be run.
QMCPACK includes extensive validation tests to ensure the correctness of the
code, compilers, tools, and runtime. The tests should ideally be run
each compilation, and certainly before any research use. The tests include
checks of the output against known mean-field, quantum chemistry, and
other QMC results.
We highly encourage tests to be run before using QMCPACK. Details are given in the [QMCPACK
manual](https://qmcpack.readthedocs.io/en/develop/index.html). QMCPACK includes extensive validation tests to ensure the
correctness of the code, compilers, tools, and runtime. The tests should ideally be run each compilation, and certainly before any
research use. The tests include checks of the output against known mean-field, quantum chemistry, and other QMC results.
While some tests are fully deterministic, due to QMCPACK's stochastic
nature some tests are statistical and can occasionally fail. We employ
a range of test names and labeling to differentiate between these, as
well as developmental tests that are known to fail. In particular,
"deterministic" tests include this in their ctest test name, while
tests known to be unstable (stochastically or otherwise) are labeled
unstable using ctest labels.
While some tests are fully deterministic, due to QMCPACK's stochastic nature some tests are statistical and can occasionally fail.
We employ a range of test names and labeling to differentiate between these, as well as developmental tests that are known to
fail. In particular, "deterministic" tests include this in their ctest test name, while tests known to be unstable (stochastically
or otherwise) are labeled unstable using ctest labels.
For more informaton, consult http://www.qmcpack.org and the manual.
The tests currently use up to 16 cores in various combinations of MPI
tasks and OpenMP threads. Current status for many systems can be
checked at https://cdash.qmcpack.org
The tests currently use up to 16 cores in various combinations of MPI tasks and OpenMP threads. Current status for many
combinations of systems, compilers, and libraries can be checked at https://cdash.qmcpack.org
Note that due to the small electron and walker counts used in the
tests, they should not be used for any performance measurements. These
should be made on problem sizes that are representative of actual
research calculations. As described in the manual, performance tests
are provided to aid in monitoring performance.
Note that due to the small electron and walker counts used in the tests, they should not be used for any performance measurements.
These should be made on problem sizes that are representative of actual research calculations. As described in the manual,
performance tests are provided to aid in monitoring performance.
## Run the unit tests
@ -308,11 +311,9 @@ that are deterministic and known to be reliable.
ctest -R deterministic -LE unstable
```
These tests currently take a few seconds to run, and include all the
unit tests. All tests should pass. Failing tests likely indicate a
significant problem that should be solved before using QMCPACK
further. This ctest invocation can be used as part of an automated
installation verification process.
These tests currently take a few seconds to run, and include all the unit tests. All tests should pass. Failing tests likely
indicate a significant problem that should be solved before using QMCPACK further. This ctest invocation can be used as part of an
automated installation verification process.
## Run the short (quick) tests
@ -335,23 +336,20 @@ ctest -R name-of-test-to-run
# Documentation and support
For more informaton, consult QMCPACK pages at http://www.qmcpack.org,
the manual PDF at https://docs.qmcpack.org/qmcpack_manual.pdf,
or its sources in the manual directory.
For more information, consult QMCPACK pages at http://www.qmcpack.org, the manual at
https://qmcpack.readthedocs.io/en/develop/index.html, or its sources in the docs directory.
If you have trouble using or building QMCPACK, or have questions about
its use, please post to the [Google QMCPACK group](https://groups.google.com/forum/#!forum/qmcpack) or contact a developer.
If you have trouble using or building QMCPACK, or have questions about its use, please post to the [Google QMCPACK
group](https://groups.google.com/forum/#!forum/qmcpack), create a GitHub issue at https://github.com/QMCPACK/qmcpack/issues or
contact a developer.
# Contributing
Contributions of any size are very welcome. Guidance for contributing
to QMCPACK is included in Chapter 1 of the manual
https://docs.qmcpack.org/qmcpack_manual.pdf . We use a git flow model
including pull request reviews. A continuous integration system runs
on pull requests. See https://github.com/QMCPACK/qmcpack/wiki for
details. For an extensive contribution, it can be helpful to discuss
on the [Google QMCPACK group](https://groups.google.com/forum/#!forum/qmcpack), to create a GitHub issue, or to talk
directly with a developer.
Contributions of any size are very welcome. Guidance for contributing to QMCPACK is included in Chapter 1 of the manual
https://qmcpack.readthedocs.io/en/develop/introduction.html#contributing-to-qmcpack. We use a git flow model including pull
request reviews. A continuous integration system runs on pull requests. See https://github.com/QMCPACK/qmcpack/wiki for details.
For an extensive contribution, it can be helpful to discuss on the [Google QMCPACK
group](https://groups.google.com/forum/#!forum/qmcpack), to create a GitHub issue, or to talk directly with a developer in
advance.
Contributions are made under the same UIUC/NCSA open source license
that covers QMCPACK. Please contact us if this is problematic.
Contributions are made under the same UIUC/NCSA open source license that covers QMCPACK. Please contact us if this is problematic.

codecov.yaml Normal file

@ -0,0 +1,14 @@
# Configuration file for codecov reporting code coverage
# Disable codecov comments in every PR
comment: off
# Ignore external codes and the testing directories
ignore:
- "external_codes"
- "tests"
# Fixes report prefix paths from CI dynamic coverage action
# from https://docs.codecov.io/docs/fixing-paths
fixes:
- "/__w/::"

42
config/build_olcf_andes.sh Executable file
View File

@ -0,0 +1,42 @@
#!/bin/bash
BUILD_MODULES=config/load_olcf_andes_modules.sh
module purge
echo "Purging current module set"
echo "Sourcing file: $BUILD_MODULES to build QMCPACK"
. $BUILD_MODULES
echo "Either source $BUILD_MODULES or load these same modules to run QMCPACK"
export BLAS_LIBS="-L$OLCF_OPENBLAS_ROOT/lib -lopenblas"
export LAPACK_LIBS="$BLAS_LIBS $OLCF_NETLIB_LAPACK_ROOT/lib64/liblapack.a"
declare -A builds=( ["cpu"]="-DBUILD_PPCONVERT=1" \
["complex_cpu"]="-DQMC_COMPLEX=1" \
# ["legacy_gpu"]="-DQMC_CUDA=1 " \
# ["complex_legacy_gpu"]="-DQMC_CUDA=1 -DQMC_COMPLEX=1 " \
)
mkdir -p bin_andes
for build in "${!builds[@]}"
do
echo "building: $build with ${builds[$build]}"
rm -f bin_andes/qmcpack_${build}
mkdir build_andes_${build}
cd build_andes_${build}
cmake -DCMAKE_C_COMPILER="mpicc" \
-DCMAKE_CXX_COMPILER="mpicxx" \
-DBUILD_LMYENGINE_INTERFACE=0 \
${builds[$build]} \
..
make -j 20
if [ $? -eq 0 ]; then
build_dir=$(pwd)
ln -sf ${build_dir}/bin/qmcpack ${build_dir}/../bin_andes/qmcpack_${build}
fi
cd ..
done

View File

@ -18,6 +18,11 @@ module load netlib-lapack
module load hdf5
module load python/3.6.6-anaconda3-5.3.0
# private module until OLCF provides a new llvm build
if [[ ! -d /ccs/proj/mat151/opt/modules ]] ; then
echo "Required module folder /ccs/proj/mat151/opt/modules not found!"
exit 1
fi
module use /ccs/proj/mat151/opt/modules
module load llvm/master-latest
#the XL built fftw is buggy, use the gcc version
@ -28,10 +33,11 @@ export BOOST_ROOT=/autofs/nccs-svm1_sw/summit/.swci/1-compute/opt/spack/20180914
TYPE=Release
Compiler=Clang
for name in offload_cuda_real offload_cuda_real_MP offload_cuda_cplx offload_cuda_cplx_MP \
cpu_real cpu_real_MP cpu_cplx cpu_cplx_MP
do
CMAKE_FLAGS="-D CMAKE_BUILD_TYPE=$TYPE -D ENABLE_MASS=1 -D MASS_ROOT=/sw/summit/xl/16.1.1-5/xlmass/9.1.1 -D MPIEXEC_EXECUTABLE=`which jsrun` -D MPIEXEC_NUMPROC_FLAG='-n' -D MPIEXEC_PREFLAGS='-c;16;-g;1;-b;packed:16'"
if [[ $name == *"cplx"* ]]; then
CMAKE_FLAGS="$CMAKE_FLAGS -D QMC_COMPLEX=1"
fi
@ -41,7 +47,11 @@ if [[ $name == *"_MP"* ]]; then
fi
if [[ $name == *"offload"* ]]; then
CMAKE_FLAGS="$CMAKE_FLAGS -D ENABLE_OFFLOAD=ON -D USE_OBJECT_TARGET=ON -DOFFLOAD_ARCH=sm_70"
fi
if [[ $name == *"cuda"* ]]; then
CMAKE_FLAGS="$CMAKE_FLAGS -D ENABLE_CUDA=1 -D CUDA_ARCH=sm_70 -D CUDA_HOST_COMPILER=`which gcc`"
fi
folder=build_summit_${Compiler}_${name}
@ -55,7 +65,7 @@ if [ ! -f CMakeCache.txt ] ; then
cmake $CMAKE_FLAGS -D CMAKE_C_COMPILER=mpicc -D CMAKE_CXX_COMPILER=mpicxx ..
cmake ..
fi
make -j16
cd ..
echo

View File

@ -7,24 +7,24 @@
## * Execute this script in trunk/ ##
## ./config/build_ornl_cades.sh ##
## ##
## Last verified: Nov 12, 2020 ##
################################################################
# module files resulting from module imports below:
# Currently Loaded Modulefiles:
# 1) python/3.6.3 3) openmpi/3.1.5 5) gcc/7.2.0 7) fftw/3.3.5 9) boost/1.70.0
# 2) intel/19.0.3 4) PE-intel/3.0 6) hdf5-parallel/1.8.21 8) cmake/3.12.0 10) libxml2/2.9.9
source $MODULESHOME/init/bash
module purge
module load python
module load PE-intel/3.0
module swap intel intel/19.0.3
module load gcc/7.2.0
module load hdf5-parallel/1.8.21
module load fftw/3.3.5
module load cmake/3.12.0
module load boost/1.70.0
module load libxml2/2.9.9
module list
@ -37,24 +37,24 @@ CMAKE_FLAGS="-DCMAKE_C_COMPILER=mpicc \
-DCMAKE_C_FLAGS=-xCOMMON-AVX512 \
-DCMAKE_CXX_FLAGS=-xCOMMON-AVX512"
# Configure and build cpu real. Build targets skylake nodes.
echo ""
echo ""
echo "building QMCPACK for cpu real for CADES SHPC Condo -- Using AVX512 for Skylake nodes"
mkdir -p build_cades_cpu_real_skylake
cd build_cades_cpu_real_skylake
cmake $CMAKE_FLAGS ..
make -j 16
cd ..
ln -sf ./build_cades_cpu_real_skylake/bin/qmcpack ./qmcpack_cades_cpu_real_skylake
# Configure and build cpu complex. Build targets skylake nodes.
echo ""
echo ""
echo "building QMCPACK for cpu complex for CADES SHPC Condo -- Using AVX512 for Skylake nodes"
mkdir -p build_cades_cpu_comp_skylake
cd build_cades_cpu_comp_skylake
cmake -DQMC_COMPLEX=1 $CMAKE_FLAGS ..
make -j 16
cd ..
ln -sf ./build_cades_cpu_comp_skylake/bin/qmcpack ./qmcpack_cades_cpu_comp_skylake
@ -62,25 +62,24 @@ ln -sf ./build_cades_cpu_comp_skylake/bin/qmcpack ./qmcpack_cades_cpu_comp_skyla
CMAKE_FLAGS="-DCMAKE_C_COMPILER=mpicc \
-DCMAKE_CXX_COMPILER=mpicxx"
# Configure and build cpu real
echo ""
echo ""
echo "building QMCPACK for cpu real for CADES SHPC Condo"
mkdir -p build_cades_cpu_real
cd build_cades_cpu_real
cmake $CMAKE_FLAGS ..
make -j 16
cd ..
ln -sf ./build_cades_cpu_real/bin/qmcpack ./qmcpack_cades_cpu_real
# Configure and build cpu complex
echo ""
echo ""
echo "building QMCPACK for cpu complex for CADES SHPC Condo"
mkdir -p build_cades_cpu_comp
cd build_cades_cpu_comp
cmake -DQMC_COMPLEX=1 $CMAKE_FLAGS ..
make -j 16
cd ..
ln -sf ./build_cades_cpu_comp/bin/qmcpack ./qmcpack_cades_cpu_comp

View File

@ -1,75 +0,0 @@
#!/bin/bash
################################################################
## * This script builds available configurations of QMCPACK ##
## on OIC5, at Oak Ridge National Lab. ##
## ##
## * Execute this script in trunk/ ##
## ./config/build_ornl_oic.sh ##
## ##
## Last modified: Dec 7, 2017 ##
################################################################
module ()
{
eval `/opt/modules/3.1.6/bin/modulecmd bash $*`
}
module purge
module load gcc/4.9.3
module load mpi/openmpi-1.4.5-gcc4
module load hdf5/1.8.8-gcc4-parallel
export LIBXML2_HOME=/home/j1k/share/oic5_gcc4/libxml2-2.7.6/build
CMAKE_FLAGS="-DCMAKE_C_COMPILER=mpicc \
-DCMAKE_CXX_COMPILER=mpicxx"
# Configure and build cpu real AoS
echo ""
echo ""
echo "building qmcpack for cpu AoS real for oic5"
mkdir -p build_oic_cpu_real
cd build_oic_cpu_real
cmake -DENABLE_SOA=0 $CMAKE_FLAGS ..
make -j 32
cd ..
ln -sf ./build_oic_cpu_real/bin/qmcpack ./qmcpack_oic_cpu_real
# Configure and build cpu complex AoS
echo ""
echo ""
echo "building qmcpack for cpu AoS complex for oic5"
mkdir -p build_oic_cpu_comp
cd build_oic_cpu_comp
cmake -DENABLE_SOA=0 -DQMC_COMPLEX=1 $CMAKE_FLAGS ..
make -j 32
cd ..
ln -sf ./build_oic_cpu_comp/bin/qmcpack ./qmcpack_oic_cpu_comp
# Configure and build cpu real SoA
echo ""
echo ""
echo "building qmcpack for cpu SoA real for oic5"
mkdir -p build_oic_cpu_real_SoA
cd build_oic_cpu_real_SoA
cmake -DENABLE_SOA=1 $CMAKE_FLAGS ..
make -j 32
cd ..
ln -sf ./build_oic_cpu_real_SoA/bin/qmcpack ./qmcpack_oic_cpu_real_SoA
# Configure and build cpu complex SoA
echo ""
echo ""
echo "building qmcpack for cpu SoA complex for oic5"
mkdir -p build_oic_cpu_comp_SoA
cd build_oic_cpu_comp_SoA
cmake -DQMC_COMPLEX=1 -DENABLE_SOA=1 $CMAKE_FLAGS ..
make -j 32
cd ..
ln -sf ./build_oic_cpu_comp_SoA/bin/qmcpack ./qmcpack_oic_cpu_comp_SoA

5
config/docker/README.md Normal file
View File

@ -0,0 +1,5 @@
# Docker containers for QMCPACK
This directory contains a Dockerfile for:
- dependencies: pre-packaged dependencies, used for CI (GitHub Actions)

View File

@ -0,0 +1,42 @@
FROM ubuntu:20.04
MAINTAINER William F Godoy williamfgc@yahoo.com
RUN export DEBIAN_FRONTEND=noninteractive &&\
apt-get update -y &&\
apt-get upgrade -y apt-utils
# Dependencies
RUN export DEBIAN_FRONTEND=noninteractive &&\
apt-get install gcc g++ \
clang \
clang-format \
gcovr \
python3 \
cmake \
ninja-build \
libboost-all-dev \
git \
libopenmpi-dev \
libhdf5-openmpi-dev \
hdf5-tools \
libfftw3-dev \
libopenblas-dev \
libxml2-dev \
sudo \
curl \
-y
# Python packages for tests
RUN export DEBIAN_FRONTEND=noninteractive &&\
apt-get install python3-numpy \
python3-h5py \
python3-pandas \
-y
# must add a user different from root
# to run MPI executables
RUN useradd -ms /bin/bash user
RUN adduser user sudo
USER user
WORKDIR /home/user

View File

@ -0,0 +1,19 @@
#!/bin/bash
echo "Loading QMCPACK dependency modules for andes"
echo "https://docs.olcf.ornl.gov/systems/andes_user_guide.html"
echo
module load gcc/9.3.0
#module load intel/19.0.3
module load openmpi/4.0.4
#module load essl
module load openblas/0.3.12
module load netlib-lapack
#module load netlib-scalapack
module load hdf5
module load fftw
export FFTW_ROOT=$OLCF_FFTW_ROOT
module load cmake/3.18.4
module load boost/1.74.0
#module load cuda
module load python/3.7-anaconda3

View File

@ -0,0 +1,7 @@
leak:opal_free_list_grow_st
leak:malloc
leak:opal_hash_table_init2
leak:strdup
leak:calloc

View File

@ -694,11 +694,11 @@ Periodic boundary conditions with Gaussian orbitals from PySCF is fully supporte
pw2qmcpack.x
~~~~~~~~~~~~
``pw2qmcpack.x`` is an executable that converts PWSCF wavefunctions from the Quantum ESPRESSO (QE) package to QMCPACK-readable
HDF5 format. This utility is built alongside the QE postprocessing utilities. It is written in Fortran90 and is
distributed as a patch of the QE source code. The patch, as well as automated QE download and patch scripts, can be found in
``qmcpack/external_codes/quantum_espresso``. Once built, we recommend also building QMCPACK with the QE_BIN option pointing to the
directory containing the built pw.x and pw2qmcpack.x. This will enable the workflow tests to be run.
pw2qmcpack can be used in serial for small systems and should be used in parallel for large systems for best performance. The K_POINT gamma optimization is not supported.
@ -757,14 +757,14 @@ After the wavefunction file is written (basename.sample in this case) one can us
This reads the Qbox wavefunction and performs the Fourier transform before saving to a QMCPACK eshdf format wavefunction. Currently multiple k-points are supported, but due to difficulties with the qbox wavefunction file format, the single particle orbitals do not have their proper energies associated with them. This means that when tiling from a primitive cell to a supercell, the lowest n single particle orbitals from all necessary k-points will be used. This can be problematic in the case of a metal and this feature should be used with EXTREME caution.
In the case of Quantum ESPRESSO, QE must be compiled with HDF5 support. If this is the case, then an eshdf file can be generated by targeting the data-file-schema.xml file
generated in the output of Quantum ESPRESSO. For example, if one is running a calculation with outdir = 'out' and prefix='Pt', then the converter can be invoked as:
::
convertpw4qmc out/Pt.save/data-file-schema.xml -o qmcpackWavefunction.h5
Note that this method is insensitive to parallelization options given to Quantum ESPRESSO. Additionally, it supports noncollinear magnetism and can be used to generate
wavefunctions suitable for qmcpack calculations with spin-orbit coupling.
.. _ppconvert:
@ -772,9 +772,9 @@ wavefunctions suitable for qmcpack calculations with spin-orbit coupling.
ppconvert
~~~~~~~~~
``ppconvert`` is a utility to convert PPs between different commonly used formats. As with all operations on pseudopotentials, great care should be exercised when using this tool.
It is a stand-alone executable that is not built by default but that is accessible by adding
``-DBUILD_PPCONVERT=1`` to CMake.
Currently it converts CASINO, FHI, UPF (generated by OPIUM), BFD, and GAMESS formats to several other formats
including XML (QMCPACK) and UPF (QE). See all the formats via ``ppconvert -h``.
For output formats requiring Kleinman-Bylander projectors, the atom will be solved with DFT

View File

@ -9,23 +9,9 @@ This section briefly describes how to contribute to the manual and is primarily
- Use the following table templates when describing XML input.
- Instead of ``\texttt`` or ``\verb`` use
- ``\ishell`` for shell text
- ``\ixml`` for xml text
- ``\icode`` for C++ text
**Except within tabularx or math environments**
- Instead of ``\begin{verbatim}`` environments, use the appropriate ``\begin{lstlisting}[style=<see qmcpack_listings.sty>]``.
- ``\begin{shade}`` can be used in place of ``\begin{lstlisting}[style=SHELL]``.
- Unicode rules
- Do not use characters for which well-established idioms
exist, especially dashes, quotes, and apostrophes.
- Use math mode markup instead of unicode characters for equations.
@ -37,36 +23,17 @@ This section briefly describes how to contribute to the manual and is primarily
(emacs and esc-x toggle-enable-multibyte-characters)—see any
unicode you did not intend?
- Place unformatted text targeted at developers working on the LaTeX in
comments. Include generously.
- Encapsulate formatted text aimed at developers (like this entire
chapter), in ``\dev{}``. Text encapsulated in this way will be removed from the
user version of the manual by editing the definition of ``\dev{}`` in ``qmcpack_manual.tex``. Existing
but deprecated or partially functioning features fall in this
category.
- Newly added entries to a Bib file should be as complete as possible.
Use a tool such as JabRef or Zotero that can automate creation of
these entries from just a DOI.
**Forbidden:**
- Including images instead of using lstlisting sections for text.
- Including images instead of text tables.
- Using packages the LaTeX community considers `deprecated`_.
- Using packages, features, or fonts not included in texlive 2017
unless you ensure they degrade reasonably for 2017.
- Adding packages unless they bring great value and are supported by
tex4ht (unless you are willing to add the support).
- Saving files in encodings other than UTF8. Some may
report being ASCII encoded since they contain no unicode characters.
.. _deprecated: https://latex.org/forum/viewtopic.php?f=37&t=6637
**Missing sections (these are opinions, not decided priorities):**
- Description of XML input in general. Discuss XML format, use of

View File

@ -3,22 +3,33 @@
Development Guide
=================
This section gives guidance on how to extend the functionality of QMCPACK. Future examples will likely include topics such as the
addition of a Jastrow function or a new QMC method.
QMCPACK coding standards
------------------------
This chapter presents what we collectively have agreed are best practices for the code. This includes formatting style, naming
conventions, documentation conventions, and certain prescriptions for C++ language use. At the moment only the formatting can be
enforced in an objective fashion.
New development should follow these guidelines, and contributors are expected to adhere to them as they represent an integral part
of our effort to continue QMCPACK as a world-class, sustainable QMC code. Although some of the source code has a ways to go to
live up to these ideals, new code, even in old files, should follow the new conventions, not the local conventions of the file,
whenever possible. Work on the code with continuous improvement in mind rather than a commitment to stasis.
The `current workflow conventions`_ for the project are described in the wiki on the GitHub repository. It will save you and all
the maintainers considerable time if you read these and ask questions up front.
A PR should follow these standards before inclusion in the mainline. You can be sure of properly following the formatting
conventions if you use clang-format. The mechanics of clang-format setup and use can be found at
https://github.com/QMCPACK/qmcpack/wiki/Source-formatting.
The clang-format file found at ``qmcpack/src/.clang-format`` should be run over all code touched in a PR before a pull request is
prepared. We also encourage developers to run clang-tidy with the ``qmcpack/src/.clang-tidy`` configuration over all new code.
As much as possible, try to break up refactoring, reformatting, features, and bug fixes into separate, small PRs. Aim for something
that would take a reviewer no more than an hour. In this way we can maintain a good collective development velocity.
.. _current workflow conventions: https://github.com/QMCPACK/qmcpack/wiki/Development-workflow
@ -33,7 +44,7 @@ Each file should start with the header.
// This file is distributed under the University of Illinois/NCSA Open Source License.
// See LICENSE file in top directory for details.
//
// Copyright (c) 2020 QMCPACK developers
// Copyright (c) 2021 QMCPACK developers
//
// File developed by: Name, email, affiliation
//
@ -45,37 +56,78 @@ If you make significant changes to an existing file, add yourself to the list of
File organization
~~~~~~~~~~~~~~~~~
Header files should be placed in the same directory as their implementations. Unit tests should be written for all new
functionality. These tests should be placed in a ``tests`` subdirectory below the implementations.
File names
~~~~~~~~~~
Each class should be defined in a separate file with the same name as the class name. Use separate ``.cpp`` implementation files
whenever possible to aid in incremental compilation.
The filenames of tests are composed of the prefix ``test_`` and the filename of the object tested. The filenames of *fake* and
*mock* objects used in tests are composed of the prefixes ``fake_`` and ``mock_``, respectively, and the filename of the object
that is imitated.
Header files
~~~~~~~~~~~~
All header files should be self-contained (i.e., not dependent on following any other header when it is included). Nor should they
include files that are not necessary for their use (i.e., headers needed only by the implementation). Implementation files should
not include files only for the benefit of files they include.
There are many header files that currently violate this. Each header must use ``#define`` guards to prevent multiple inclusion.
The symbol name of the ``#define`` guards should be ``NAMESPACE(s)_CLASSNAME_H``.
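As a concrete sketch of the guard convention (the file, namespace, and class names here are hypothetical, chosen only to illustrate the ``NAMESPACE(s)_CLASSNAME_H`` pattern):

```cpp
// Hypothetical file: src/Particle/WalkerCache.h
#ifndef QMCPLUSPLUS_WALKERCACHE_H
#define QMCPLUSPLUS_WALKERCACHE_H

namespace qmcplusplus
{
class WalkerCache
{
public:
  int size() const { return size_; }

private:
  int size_ = 0;
};
} // namespace qmcplusplus

#endif // QMCPLUSPLUS_WALKERCACHE_H
```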
Includes
~~~~~~~~
Related header files should be included without any path. Header files from external projects and standard libraries should be
included using the ``<iostream>`` convention, while headers that are part of the QMCPACK project should be included using the
``"our_header.h"`` convention.

We are now using a new header file inclusion style following the modern CMake transition in QMCPACK, while the legacy code may
still use the legacy style. Newly written code and refactored code should be transitioned to the new style.
New style for modern CMake
^^^^^^^^^^^^^^^^^^^^^^^^^^
In QMCPACK, include paths are handled by modern CMake target dependencies. Every top level folder is at least one target. For
example, ``src/Particle/CMakeLists.txt`` defines the `qmcparticle` target. It propagates the include path ``qmcpack/src/Particle``
to compile command lines in CMake via
::
TARGET_INCLUDE_DIRECTORIES(qmcparticle PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}")
For this reason, the file ``qmcpack/src/Particle/Lattice/ParticleBConds3DSoa.h`` should be included as
::
#include "Lattice/ParticleBConds3DSoa.h"
If the compiled file is not part of the same target as `qmcparticle`, the target it belongs to should have a dependency on
`qmcparticle`. For example, test source files under ``qmcpack/src/Particle/tests`` are not part of `qmcparticle` and thus require
the following additional CMake setting
::
TARGET_LINK_LIBRARIES(${UTEST_EXE} qmcparticle)
Legacy style
^^^^^^^^^^^^
Header files should be included with the full path based on the ``src`` directory. For example, the file
``qmcpack/src/QMCWaveFunctions/SPOSet.h`` should be included as
::
#include "QMCWaveFunctions/SPOSet.h"
Even if the included file is located in the same directory as the including file, this rule should be obeyed.
Ordering
^^^^^^^^
For readability, we suggest using the following standard order of includes:
@ -94,7 +146,10 @@ In each section the included files should be sorted in alphabetical order.
Naming
------
The balance between description and ease of implementation should be struck such that the code remains self-documenting within a
single terminal window. If an extremely short variable name is used, its scope must be shorter than :math:`\sim 40` lines. An
exception is made for template parameters, which must be in all CAPS. Legacy code contains a great variety of hard-to-read code
styles; read this section and do not imitate existing code that violates it.
Namespace names
~~~~~~~~~~~~~~~
@ -104,13 +159,19 @@ Namespace names should be one word, lowercase.
Type and class names
~~~~~~~~~~~~~~~~~~~~
Type and class names should start with a capital letter and have a capital letter for each new word. Underscores (``_``) are not
allowed. It's redundant to end these names with ``Type`` or ``_t``.
::
// no
using ValueMatrix_t = Matrix<Value>;
using RealType = double;
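For contrast, a conforming sketch (``Matrix`` and ``Value`` here are stand-in aliases for illustration, not the real QMCPACK types):

```cpp
#include <type_traits>
#include <vector>

// stand-in for the real Matrix container, for illustration only
template<typename T>
using Matrix = std::vector<std::vector<T>>;

using Value = double;

// yes: the name says what it holds, without a redundant Type/_t suffix
using ValueMatrix = Matrix<Value>;
```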
Variable names
~~~~~~~~~~~~~~
Variable names should not begin with a capital letter, which is reserved for type and class names. Underscores (``_``) should be
used to separate words.
Class data members
~~~~~~~~~~~~~~~~~~
@ -122,6 +183,11 @@ Class private/protected data members names should follow the convention of varia
Function names should start with a lowercase character and have a capital letter for each new word.
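Putting the variable- and function-naming rules together in one hypothetical snippet:

```cpp
// Function names: start lowercase, capitalize each new word.
// Variable names: lowercase, with underscores separating words.
constexpr double addSquaredValues(double first_value, double second_value)
{
  const double sum_of_squares = first_value * first_value + second_value * second_value;
  return sum_of_squares;
}
```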
Template Parameters
~~~~~~~~~~~~~~~~~~~
Template parameter names should be in all caps with underscores (``_``) separating words. It's redundant to end these names with ``_TYPE``.
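For example (a hypothetical class, not part of the QMCPACK API):

```cpp
#include <cstddef>

// Template parameter names in all caps, words separated by underscores,
// and no redundant suffix (MAX_SIZE rather than MAX_SIZE_TYPE).
template<typename VALUE, std::size_t MAX_SIZE>
struct FixedBuffer
{
  VALUE data[MAX_SIZE];
  static constexpr std::size_t capacity() { return MAX_SIZE; }
};
```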
Lambda expressions
~~~~~~~~~~~~~~~~~~
@ -134,8 +200,7 @@ Named lambda expressions follow the naming convention for functions:
Macro names
~~~~~~~~~~~
Macro names should be all uppercase and can include underscores (``_``). The underscore is not allowed as first or last character.
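A short sketch of the rule (the macro names here are invented for illustration):

```cpp
// yes: all uppercase, underscores between words, none at either end
#define QMC_MAX_WALKERS 256

// no: leading or trailing underscores are not allowed
// #define _QMC_MAX_WALKERS 256
// #define QMC_MAX_WALKERS_ 256
```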
Test case and test names
~~~~~~~~~~~~~~~~~~~~~~~~
@ -197,15 +262,15 @@ Do not put the file name after the ``\file`` Doxygen command. Doxygen will fill
Class docs
^^^^^^^^^^
Every class should have a short description (in the header of the file) of what it is and what it does. Comments for public class
member functions follow the same rules as general function comments. Comments for private members are allowed but are not
mandatory.
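A minimal sketch of these documentation rules, using a hypothetical class:

```cpp
/** Accumulates scalar samples and reports their running total.
 *
 *  The short description above states what the class is and does; public
 *  member functions are documented like free functions.
 */
class SampleAccumulator
{
public:
  /** add one sample to the running total
   * @param sample In: value added to the total
   */
  void addSample(double sample) { total_ += sample; }

  /// current running total
  double getTotal() const { return total_; }

private:
  double total_ = 0.0; // comments on private members are optional
};
```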
Function docs
^^^^^^^^^^^^^
For function parameters whose type is non-const reference or pointer to non-const memory, it should be specified if they are input
(In:), output (Out:) or input-output parameters (InOut:).
Example:
@ -238,7 +303,9 @@ Formatting and "style"
Use the provided clang-format style in ``src/.clang-format`` to format ``.h``, ``.hpp``, ``.cu``, and ``.cpp`` files. Many of the following rules will be applied to the code by clang-format, which should allow you to ignore most of them if you always run it on your modified code.
You should use clang-format support and the ``.clang-format`` file with your editor, use a Git precommit hook to run clang-format
or run clang-format manually on every file you modify. However, if you see numerous formatting updates outside of the code you
have modified, first commit the formatting changes in a separate PR.
Indentation
~~~~~~~~~~~
@ -253,8 +320,8 @@ The length of each line of your code should be at most *120* characters.
Horizontal spacing
~~~~~~~~~~~~~~~~~~
No trailing white spaces should be added to any line. Use no space before a comma (``,``) and a semicolon (``;``), and add a space
after them if they are not at the end of a line.
Preprocessor directives
~~~~~~~~~~~~~~~~~~~~~~~
@ -275,8 +342,8 @@ Do not put any space between an unary operator and its argument.
Types
~~~~~
The ``using`` syntax is preferred to ``typedef`` for type aliases. If the actual type is not excessively long or complex, simply
use it; renaming simple types makes code less understandable.
Pointers and references
~~~~~~~~~~~~~~~~~~~~~~~
@ -306,10 +373,8 @@ The angle brackets of templates should not have any external or internal padding
Vertical spacing
~~~~~~~~~~~~~~~~
Use empty lines when it helps to improve the readability of the code, but do not use too many. Do not use empty lines after a
brace that opens a scope or before a brace that closes a scope. Each file should contain an empty line at the end of the file.
Some editors add an empty line automatically, some do not.
Variable declarations and definitions
@@ -352,9 +417,8 @@ Variable declarations and definitions
Function declarations and definitions
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The return type should be on the same line as the function name. Parameters should also be on the same line unless they do not fit
on it, in which case one parameter per line aligned with the first parameter should be used.
Also include the parameter names in the declaration of a function, that is,
@@ -366,11 +430,11 @@ Also include the parameter names in the declaration of a function, that is,
// avoid
double function(double, double, double);
// don't do this
double function(BigTemplatedSomething<double> a, BigTemplatedSomething<double> b,
BigTemplatedSomething<double> c);
// do this
double function(BigTemplatedSomething<double> a,
BigTemplatedSomething<double> b,
BigTemplatedSomething<double> c);
@@ -530,9 +594,9 @@ Examples:
Namespace formatting
~~~~~~~~~~~~~~~~~~~~
The content of namespaces is not indented. A comment should indicate when a namespace is closed. (clang-format will add these if
absent). If nested namespaces are used, a comment with the full namespace is required after opening a set of namespaces or an
inner namespace.
Examples:
@@ -572,22 +636,28 @@ Examples:
QMCPACK C++ guidance
--------------------
The guidance here, like any advice on how to program, should not be treated as a set of rules but rather the hard-won wisdom of
many hours of suffering development. In the past, many rules were ignored, and the absolute worst results of that will affect
whatever code you need to work with. Your PR should go much smoother if you do not ignore them.
Encapsulation
~~~~~~~~~~~~~
A class is not just a naming scheme for a set of variables and functions. It should provide a logical set of methods, could
contain the state of a logical object, and might allow access to object data through a well-defined interface to related
variables, while maximally preserving the ability to change the internal implementation of the class.
Do not use ``struct`` as a way to avoid controlling access to the class. Only in the rare case where a class is a fully public
data structure is ``struct`` appropriate. Ignore (or fix one of) the many examples of this in QMCPACK.
Do not use inheritance primarily as a means to break encapsulation. If your class could aggregate or compose another class, do
that, and access it solely through its public interface. This will reduce dependencies.
Casting
~~~~~~~
In C++ source, avoid C style casts; they are difficult to search for and imprecise in function. An exception is made for
controlling implicit conversion of simple numerical types.
Explicit C++ style casts make it clear what the safety of the cast is and what sort of conversion is expected to be possible.
@@ -609,8 +679,8 @@ Explicit C++ style casts make it clear what the safety of the cast is and what s
Pre-increment and pre-decrement
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Use the pre-increment (pre-decrement) operator when a variable is incremented (decremented) and the value of the expression is not
used. In particular, use the pre-increment (pre-decrement) operator for loop counters where i is not used:
::
@@ -624,7 +694,8 @@ In particular, use the pre-increment (pre-decrement) operator for loop counters
doSomething(i);
}
The post-increment and post-decrement operators create an unnecessary copy that the compiler cannot optimize away in the case of
iterators or other classes with overloaded increment and decrement operators.
Alternative operator representations
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@ -664,23 +735,47 @@ Use of const
int getCount() const { return count_;}
}
Smart pointers
~~~~~~~~~~~~~~
Use of smart pointers is being adopted to help make QMCPACK memory-leak free. Prior to C++11, C++ used C-style pointers. A
C-style pointer can have several meanings, and the ownership of a piece of heap memory may not be clear. This leads to confusion
and causes memory leaks if pointers are not managed properly. C++11 introduced smart pointers to resolve this issue. In addition,
they require developers to think about the ownership and lifetime of declared pointer objects.
std::unique_ptr
^^^^^^^^^^^^^^^
A unique pointer is the unique owner of a piece of allocated memory. Pointers in per-walker data structure with distinct contents
should be unique pointers. For example, every walker has a trial wavefunction object which contains an SPO object pointer. Because
the SPO object has a vector to store SPO evaluation results, it cannot be shared between two trial wavefunction objects. For this
reason, the SPO object pointer should be a unique pointer.
In QMCPACK, most raw pointers can be directly replaced with ``std::unique_ptr``.
Corresponding uses of the ``new`` operator can be replaced with ``std::make_unique``.
std::shared_ptr
^^^^^^^^^^^^^^^
A shared pointer is the shared owner of a piece of allocated memory. Moving pointer ownership from one place to another should
use C++ move semantics rather than shared pointers. Shared contents between walkers may be candidates for shared pointers. For example,
although the Jastrow factor object must be unique per walker, the pointer to the parameter data structure can be a shared pointer.
During Jastrow optimization, any update to the parameter data managed by the shared pointer will be effective immediately in all
the Jastrow objects. In another example, spline coefficients are managed by a shared pointer which achieves a single copy in
memory shared by an SPOSet and all of its clones.
Scalar estimator implementation
-------------------------------
Introduction: Life of a specialized OperatorBase
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Almost all observables in QMCPACK are implemented as specialized derived classes of the OperatorBase base class. Each observable
is instantiated in HamiltonianFactory and added to QMCHamiltonian for tracking. QMCHamiltonian tracks two types of observables:
main and auxiliary. Main observables contribute to the local energy. These observables are elements of the simulated Hamiltonian
such as kinetic or potential energy. Auxiliary observables are expectation values of matrix elements that do not contribute to the
local energy. These Hamiltonians do not affect the dynamics of the simulation. In the code, the main observables are labeled by
the “physical” flag; the auxiliary observables have “physical” set to false.
Initialization
^^^^^^^^^^^^^^
@@ -1999,7 +2094,7 @@ Distance tables
Distance tables store distances between particles. There are symmetric
(AA) tables for distance between like particles (electron-electron or
ion-ion) and asymmetric (AB) tables for distance between unlike
particles (electron-ion)
The ``Distances`` and ``Displacements`` members contain the data. The
@@ -5,21 +5,26 @@ External Tools
This chapter provides some information on using QMCPACK with external tools.
.. _Sanitizer-Libraries:
Sanitizer Libraries
-------------------
Using CMake, set one of these flags for using the clang sanitizer libraries with or without lldb.
::
-DENABLE_SANITIZER link with the GNU or Clang sanitizer library for asan, ubsan, tsan or msan (default=none)
In general:
- address sanitizer (asan): catches most pointer-based errors and memory leaks (via lsan) by default.
- undefined behavior sanitizer (ubsan): low overhead; catches undefined behavior such as misaligned memory accesses and signed or float-to-integer overflows.
- thread sanitizer (tsan): catches potential race conditions in threaded code.
- memory sanitizer (msan): catches using uninitialized memory errors, but is difficult to use without a full set of msan-instrumented libraries.
These set the basic flags required to build with any one of these sanitizer libraries, which are mutually exclusive. Depending on your system and linker, these may be incompatible with the "Release" build, so set ``-DCMAKE_BUILD_TYPE=Debug`` or ``-DCMAKE_BUILD_TYPE=RelWithDebInfo``. They are tested on GitHub Actions CI using the deterministic tests ``ctest -L deterministic`` (currently ubsan). See the following links for additional information on the use, run time, and build options of the sanitizers: https://clang.llvm.org/docs/AddressSanitizer.html & https://clang.llvm.org/docs/MemorySanitizer.html.
In general, the address sanitizer libraries will catch most pointer-based errors. ASAN can also catch memory leaks but requires that additional options be set. MSAN will catch more subtle memory management errors but is difficult to use without a full set of MSAN-instrumented libraries.
Intel VTune
-----------
@@ -30,29 +35,15 @@ VTune API
~~~~~~~~~
If the variable ``USE_VTUNE_API`` is set, QMCPACK will check that the
include file (``ittnotify.h``) and the library (``libittnotify.a``) can be found.
To provide CMake with the VTune search paths, add ``VTUNE_ROOT`` which contains ``include`` and ``lib64`` sub-directories.
An example of options to be passed to CMake:
::
-DUSE_VTUNE_API=ON \
-DVTUNE_ROOT=/opt/intel/vtune_amplifier_xe
Timers as Tasks
~~~~~~~~~~~~~~~
@@ -71,6 +62,19 @@ For the command line, set the ``enable-user-tasks`` knob to ``true``. For exampl
Collection with the timers set at "fine" can generate too much task data in the profile.
Collection with the timers at "medium" collects a more reasonable amount of task data.
NVIDIA Tools Extensions
-----------------------
NVIDIA's Tools Extensions (NVTX) API enables programmers to annotate their source code when used with the NVIDIA profilers.
NVTX API
~~~~~~~~
If the variable ``USE_NVTX_API`` is set, QMCPACK will add the library (``libnvToolsExt.so``) to the QMCPACK target. To add NVTX annotations
to a function, it is necessary to include the ``nvToolsExt.h`` header file and then make the appropriate calls into the NVTX API. For more information
about the NVTX API, see https://docs.nvidia.com/cuda/profiler-users-guide/index.html#nvtx. Any additional calls to the NVTX API should be guarded by
the ``USE_NVTX_API`` compiler define.
Scitools Understand
-------------------
@@ -51,7 +51,7 @@ feature that you are interested in, check the remainder of this manual or ask if
orbitals.
- Interface and conversion utilities for plane-wave wavefunctions from
Quantum ESPRESSO (Plane-Wave Self-Consistent Field package [PWSCF]).
- Interface and conversion utilities for Gaussian-basis wavefunctions
from GAMESS, PySCF, and QP2. Many more are supported via the molden format and molden2qmc.
@@ -43,6 +43,7 @@ User's Guide and Developer's Manual
external_tools
contributing
unit_testing
integration_tests
design_features
developing
appendices
@@ -109,6 +109,10 @@ Output file names also contain the series number, starting at the value given by
For the input file shown previously, the output files will start with ``vmc.s000``, for example, ``vmc.s000.scalar.dat``.
If there were another ``<qmc>`` section in the input file, the corresponding output files would use the prefix ``vmc.s001``.
``<project>`` tag accepts additional control parameters ``<parameters/>``. Batched drivers check against ``max_seconds`` and make efforts to stop the execution cleanly at the end of a block before reaching the maximum time. Classic drivers can also take the now-deprecated ``maxcpusecs`` parameter for the same effect in the per driver XML section.
In addition, a file named with the project id plus ``.STOP``, in this case ``vmc.STOP``, cleanly stops QMCPACK execution on the fly once it is found in the working directory.
Random number initialization
----------------------------
@@ -28,9 +28,9 @@ are given in the referenced sections.
(:ref:`buildqe`).
#. Run the cmake configure step and build with make
(:ref:`cmake` and :ref:`cmakequick`). Examples for common systems are given in :ref:`installexamples`. To activate workflow
tests for Quantum ESPRESSO or PySCF, be sure to specify QE_BIN or ensure that the Python modules are available when cmake is
run.
#. Run the tests to verify QMCPACK
(:ref:`testing`).
@@ -326,7 +326,7 @@ the path to the source directory.
::
QE_BIN Location of Quantum ESPRESSO binaries including pw2qmcpack.x
QMC_DATA Specify data directory for QMCPACK performance and integration tests
QMC_INCLUDE Add extra include paths
QMC_EXTRA_LIBS Add extra link libraries
@@ -377,18 +377,22 @@ the path to the source directory.
e.g. "-n", "-np", etc.
MPIEXEC_PREFLAGS Flags to pass to MPIEXEC_EXECUTABLE directly before the executable to run.
- Sanitizers Developer Options
::
ENABLE_SANITIZER link with the GNU or Clang sanitizer library for asan, ubsan, tsan or msan (default=none)
`Clang address sanitizer library asan <https://clang.llvm.org/docs/AddressSanitizer.html>`_
`Clang undefined behavior sanitizer library ubsan <https://clang.llvm.org/docs/UndefinedBehaviorSanitizer.html>`_
`Clang thread sanitizer library tsan <https://clang.llvm.org/docs/ThreadSanitizer.html>`_
`Clang memory sanitizer library msan <https://clang.llvm.org/docs/MemorySanitizer.html>`_
See :ref:`Sanitizer-Libraries` for more information.
Notes for OpenMP target offload to accelerators (experimental)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -885,9 +889,9 @@ For ease of reproducibility we provide build scripts for Summit.
./config/build_olcf_summit.sh
ls bin
Building Quantum ESPRESSO
^^^^^^^^^^^^^^^^^^^^^^^^^
We provide a build script for the v6.4.1 release of Quantum ESPRESSO (QE).
The following can be used to build a CPU version of QE on Summit,
placing the script in the external\_codes/quantum\_espresso directory.
@@ -1282,10 +1286,6 @@ options and different versions of the application. A full list can be displayed
converter.
qe [on] True, False Install with patched Quantum
Espresso 6.4.0
soa [on] True, False Build with Structure-of-Array
instead of Array-of-Structure
code. Only for CPU code and
only in mixed precision
timers [off] True, False Build with support for timers
Installation Phases:
@@ -1626,7 +1626,7 @@ See :ref:`unit-testing` for more details about unit tests.
.. _integtestqe:
Integration tests with Quantum ESPRESSO
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
As described in :ref:`buildqe`, it is possible to test entire
@@ -1634,9 +1634,9 @@ workflows of trial wavefunction generation, conversion, and eventual
QMC calculation. A patched QE must be installed so that the
pw2qmcpack converter is available.
By adding ``-D QE_BIN=your_QE_binary_path`` in the CMake command line when building your QMCPACK, tests named with the "qe-"
prefix will be included in the test set of your build. If CMake finds pw2qmcpack.x and pw.x in the same location on the PATH,
these tests will also be activated. You can test the whole ``pw > pw2qmcpack > qmcpack`` workflow by
::
@@ -0,0 +1,76 @@
.. _integration_tests:
Integration Tests
=================
Unlike unit tests, which require only a specific part of QMCPACK to be built, integration tests require the full qmcpack
executable. Tests in this category are based on realistic simulations, although the amount of statistics collected depends on the
sub-category:
* Deterministic integration tests must be 100% reliable, quick to run, and always pass. They usually run one or a few walkers for
a very small number of steps and complete within seconds. They are used to rapidly identify changes as part of continuous
integration testing, to verify installations, and for development work.
* Short integration tests mostly run 16 walkers for a few hundred steps within a minute. These are usually stochastic and should
pass with very high reliability.
* Long integration tests mostly run 16 walkers for a few thousand steps within 10 minutes. These are usually stochastic and
should pass with very high reliability.
To keep overall testing costs down, electron counts are usually kept small while still being large enough to comprehensively test
the code, e.g., 3-10 electrons. The complete test set, except for the long tests, must be runnable on a laptop or modest
workstation in a reasonable amount of time.
Integration test organization
-----------------------------
Integration tests are placed under directories such as ``tests/heg``, ``tests/solids`` and ``tests/molecules`` from the top
directory and one sub-directory for each simulation system. Each test source directory contains input XML files, orbital h5 files,
pseudo-potential files and reference data (qmc_ref). These files may be shared by a few tests to minimize duplicated files. When
cmake is invoked in the build directory, one directory per test is created and the necessary files corresponding to a given test
are soft-linked. This directory serves as the working directory when that test is executed. To minimize the number of file
operations and keep the cmake execution fast, there are limitations on the file names used by tests. The allowed filenames are
given below and implemented in the COPY_DIRECTORY_USING_SYMLINK_LIMITED function in Cmake/macros.cmake.
::
qmc-ref/qmc_ref for reference data folder.
*.opt.xml/*.ncpp.xml/*.BFD.xml/*.ccECP.xml for pseudo-potential files.
*.py/*.sh for result checking helper scripts.
*.wfj.xml/*.wfnoj.xml/*.wfs.xml for standalone wavefunction input files.
*.structure.xml/*.ptcl.xml for standalone structure/particleset input files.
How to add an integration test
------------------------------
#. Generate reference data using a very long (many blocks >=2000) and possibly wide run (many nodes). This reduces both the
error bar and the error bar of the error bar (10x more samples than the long test, 100x more than the short test). A folder named
qmc-ref containing input.xml, scalar.dat, and the output file is required with the commit. The number of blocks should be about
200 to avoid large text files (a simple way to obtain these files is to repeat the reference run with 10x fewer blocks and 10x
more steps).
#. Generate the short/long run input files. Use the reference error bar to appropriately estimate the error bar for the long and
short tests. These error bars are sqrt(10+1) and sqrt(100+1) times larger than that of the very long reference run. The 10x
grading is not a hard requirement, but ref >= 10x long and long >= 10x short are required.
#. Short tests must be less than 20 sec VMC, 1 min OPT/DMC on a 16-core Xeon processor. Long tests are preferably in the 5-10 min
range. For systems containing more than just a few electrons, submitting only a long test may be appropriate.
#. Deterministic tests require a different approach: use of a fixed seed value, and for example, 3 blocks of 2 steps and a single
walker. The intent of these tests is to exercise the code paths but keep the run short enough that the numerical deviations do
not build up. Different reference data may be needed for mixed precision vs full precision runs.
Suggested procedure to add a test
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#. Study some of the existing tests and their CMakeLists.txt configuration file to see what is required and the typical system
sizes and run lengths used.
#. Perform a run ~30s in length on a 16-core machine (200 blocks min) using the CPU version of the code with 16 MPI and 1 thread
per MPI. Decide if the resulting error bar is meaningful for a test. If so, short and long tests should be created. If not,
possibly only a long test is appropriate.
#. Perform a reference run by increasing steps and blocks by 10x each (2000 blocks) and obtain reference mean and error bars.
Long and short test error bars are then sqrt(100+1) and sqrt(10+1) of the reference.
#. Generate reference scalar data by redoing the reference run with 200 blocks and 100x steps. These data should be committed in
a qmc-ref directory with the test.
#. Create short (1x blocks, 1x steps) and long (1x blocks, 10x steps) input files (200 blocks each). Make one set of input files
for CPU runs (walkers=1) and another for GPU runs (walkers=16).
#. Create CMakeLists.txt by following the example in other tests. CPU runs should include at least a 4 MPI, 4 thread test since
this tests OpenMP, MPI, and any possible interactions between them. A GPU test should have 1 MPI and 16 threads.
#. Create a README file with information describing the tests and the reference data.
#. Check that the tests run properly with ctest on your local machine.
#. Submit a pull request with the final tests.
@@ -222,6 +222,8 @@ attribute:
+-----------------------------+------------+--------------------------+---------+-------------------------------------------+
| ``source``                  | Text       | Any                      | Ion0    | Particle set with atomic positions.       |
+-----------------------------+------------+--------------------------+---------+-------------------------------------------+
| ``skip_checks``             | Text       | Yes/no                   | No      | Skips checks for ion information in h5    |
+-----------------------------+------------+--------------------------+---------+-------------------------------------------+
.. centered:: Table 3 Options for the ``determinantset`` xml-block associated with B-spline single particle orbital sets.
@@ -272,6 +274,10 @@ Additional information:
access host memory via zero-copy. Although the performance penalty
introduced by it is significant, it allows large calculations to go
through.
- ``skip_checks``. When converting the wave function from convertpw4qmc instead
of pw2qmcpack, there is missing ionic information. This flag bypasses the requirement
that the ionic information in the eshdf.h5 file match the input xml.
.. _gaussianbasis:
@@ -278,9 +278,10 @@ Please note the following guidelines for contributions:
QMCPACK Roadmap
---------------
A general outline of the QMCPACK roadmap is given in the following sections. Suggestions for improvements from current and
potential users are very welcome, particularly those that would facilitate new uses or new users. For example, if an interface to
a particular quantum chemical or density functional code, or an improved tutorial would be helpful, these would be given strong
consideration.
Code
~~~~
@@ -290,22 +291,21 @@ improved workflow, integration with more quantum chemical and density functional
are very welcome, both from new users of QMC and from those experienced with other QMC codes.
A main development focus is the creation of a single performance portable version of the code. All features will consequently be
available on all platforms, including accelerators (GPUs) from NVIDIA, AMD, and Intel. These new implementations are currently
referred to as the *batched code*. As the initial batched implementation is matured, observables and other functionality will be
prioritized based on feedback received.
Documentation and examples
~~~~~~~~~~~~~~~~~~~~~~~~~~
This manual describes the core features of QMCPACK that are required for routine research calculations and standard QMC workflows,
i.e., the VMC and DMC methods, auxiliary field QMC, how to obtain and optimize trial wavefunctions, and simple observables. This
covers at least 95% of use cases, and nearly all production research calculations.
Because of its history as an academically developed research code, QMCPACK also contains a variety of additional QMC methods,
trial wavefunction forms, potentials, etc., that, although far from critical, might be very useful for specialized calculations or
particular material or chemical systems. If you are interested in these please ask - generally the features are immature, but we
might have historical inputs available. New descriptions will be added over time but can also be prioritized and added on request
(e.g., if a specialized Jastrow factor would help or a historical Jastrow form is needed for benchmarking).
.. bibliography:: /bibs/introduction.bib
@@ -36,6 +36,8 @@ Quantum Monte Carlo Methods
+----------------+--------------+--------------+-------------+---------------------------------+
| ``trace`` | text | | no | ??? |
+----------------+--------------+--------------+-------------+---------------------------------+
| ``profiling`` | text | yes/no | no | Activate resume/pause control |
+----------------+--------------+--------------+-------------+---------------------------------+
| ``checkpoint`` | integer | -1, 0, n | -1 | Checkpoint frequency |
+----------------+--------------+--------------+-------------+---------------------------------+
| ``record``     | integer      | n            | 0           | Save configuration every n steps|
@ -59,14 +61,25 @@ Additional information:
computing device can be chosen by this switch. With a regular
CPU-only compilation, this option is not effective.
- ``profiling``: Performance profiling tools by default profile complete application executions.
This is largely unnecessary if the focus is a particular QMC section rather than the initialization
or additional QMC sections used to equilibrate walkers.
Setting this flag to ``yes`` for the QMC sections of interest and starting the tool with
data collection paused helps reduce the profiling workload
and the amount of collected data. Additional restrictions may be imposed by the profiling tools.
For example, NVIDIA profilers can be turned on and off only once, so only the first QMC
section with ``profiling="yes"`` will be profiled.
VTune instead allows pausing and resuming an unlimited number of times, so multiple selected QMC sections
can be profiled in a single run.
- ``checkpoint``: This enables and disables checkpointing and
specifying the frequency of output. Possible values are:
- **[-1]** No checkpoint (default setting).
- **[0]** Write the checkpoint files after the completion of the QMC section.
- **[n]** Write the checkpoint files after every :math:`n` blocks, and also at the end of the QMC section.
The particle configurations are written to a ``.config.h5`` file.
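For instance, a DMC section that writes checkpoint files every 20 blocks could look like the sketch below. All parameter values are illustrative placeholders; check the exact placement of ``checkpoint`` against the examples shipped with your QMCPACK version.

```xml
<qmc method="dmc" move="pbyp" checkpoint="20">
  <!-- checkpoint="20": write checkpoint files every 20 blocks and at the end -->
  <parameter name="timestep"> 0.005 </parameter>
  <parameter name="blocks"> 200 </parameter>
  <parameter name="steps"> 50 </parameter>
</qmc>
```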
@ -106,7 +119,8 @@ In the project id section, make sure that the series number is different from an
Variational Monte Carlo
-----------------------
``vmc`` method:
``vmc`` driver
~~~~~~~~~~~~~~
parameters:
@ -133,7 +147,7 @@ Variational Monte Carlo
+--------------------------------+--------------+-------------------------+-------------+-----------------------------------------------+
| ``samplesperthread`` | integer | :math:`\geq 0` | 0 | Number of samples per thread |
+--------------------------------+--------------+-------------------------+-------------+-----------------------------------------------+
| ``storeconfigs`` | integer | all values | 0 | Write configurations to files |
+--------------------------------+--------------+-------------------------+-------------+-----------------------------------------------+
| ``blocks_between_recompute`` | integer | :math:`\geq 0` | dep. | Wavefunction recompute frequency |
+--------------------------------+--------------+-------------------------+-------------+-----------------------------------------------+
@ -214,11 +228,11 @@ Additional information:
recompute) by default when not using mixed precision. Recomputing
introduces a performance penalty dependent on system size.
- ``spinMoves`` Determines whether or not the spin variables are sampled following
:cite:`Melton2016-1` and :cite:`Melton2016-2`. If a relativistic calculation is desired using pseudopotentials,
spin variable sampling is required.
- ``spinMass`` If spin sampling is on using ``spinMoves`` == yes, the spin mass determines the rate
of spin sampling, resulting in an effective spin timestep :math:`\tau_s = \frac{\tau}{\mu_s}`.
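As a quick numerical illustration of this relation (a sketch in plain Python; the function name is ours, not part of QMCPACK):

```python
def effective_spin_timestep(tau, spin_mass):
    """Effective spin timestep tau_s = tau / mu_s.

    tau       -- the timestep of the QMC section
    spin_mass -- the ``spinMass`` parameter mu_s
    """
    return tau / spin_mass

# A spin mass of 10 makes spin sampling ten times slower than spatial sampling.
print(effective_spin_timestep(0.1, 10.0))  # 0.01
```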
An example VMC section for a simple VMC run:
@ -254,6 +268,110 @@ The following is an example of VMC section storing configurations (walker sample
<parameter name="usedrift"> no </parameter>
</qmc>
``vmc_batch`` driver (experimental)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
parameters:
+--------------------------------+--------------+-------------------------+-------------+-----------------------------------------------+
| **Name** | **Datatype** | **Values** | **Default** | **Description** |
+================================+==============+=========================+=============+===============================================+
| ``total_walkers`` | integer | :math:`> 0` | 1 | Total number of walkers over all MPI ranks |
+--------------------------------+--------------+-------------------------+-------------+-----------------------------------------------+
| ``walkers_per_rank`` | integer | :math:`> 0` | 1 | Number of walkers per MPI rank |
+--------------------------------+--------------+-------------------------+-------------+-----------------------------------------------+
| ``crowds``                     | integer      | :math:`> 0`             | dep.        | Number of desynchronized walker crowds        |
+--------------------------------+--------------+-------------------------+-------------+-----------------------------------------------+
| ``blocks`` | integer | :math:`\geq 0` | 1 | Number of blocks |
+--------------------------------+--------------+-------------------------+-------------+-----------------------------------------------+
| ``steps`` | integer | :math:`\geq 0` | 1 | Number of steps per block |
+--------------------------------+--------------+-------------------------+-------------+-----------------------------------------------+
| ``warmupsteps`` | integer | :math:`\geq 0` | 0 | Number of steps for warming up |
+--------------------------------+--------------+-------------------------+-------------+-----------------------------------------------+
| ``substeps`` | integer | :math:`\geq 0` | 1 | Number of substeps per step |
+--------------------------------+--------------+-------------------------+-------------+-----------------------------------------------+
| ``usedrift`` | text | yes,no | yes | Use the algorithm with drift |
+--------------------------------+--------------+-------------------------+-------------+-----------------------------------------------+
| ``timestep`` | real | :math:`> 0` | 0.1 | Time step for each electron move |
+--------------------------------+--------------+-------------------------+-------------+-----------------------------------------------+
| ``samples`` (not ready)        | integer      | :math:`\geq 0`          | 0           | Number of walker samples in this VMC run      |
+--------------------------------+--------------+-------------------------+-------------+-----------------------------------------------+
| ``storeconfigs`` (not ready) | integer | all values | 0 | Write configurations to files |
+--------------------------------+--------------+-------------------------+-------------+-----------------------------------------------+
| ``blocks_between_recompute`` | integer | :math:`\geq 0` | dep. | Wavefunction recompute frequency |
+--------------------------------+--------------+-------------------------+-------------+-----------------------------------------------+
| ``crowd_serialize_walkers``    | text         | yes, no                 | no          | Force use of single walker APIs (for testing) |
+--------------------------------+--------------+-------------------------+-------------+-----------------------------------------------+
Additional information:
- ``crowds`` The number of crowds that the walkers are subdivided into on each MPI rank. If not provided, it is set equal to the number of OpenMP threads.
- ``walkers_per_rank`` The number of walkers per MPI rank. The exact number of walkers is generated before the random walk begins.
It is not required to be a multiple of the number of OpenMP threads. However, to avoid idle resources, it should be at
least the number of OpenMP threads for pure CPU runs. For GPU runs, a scan of this parameter is necessary to reach reasonable single-rank
efficiency and also get a balanced time to solution.
If neither ``total_walkers`` nor ``walkers_per_rank`` is provided, ``walkers_per_rank`` is set equal to ``crowds``.
- ``total_walkers`` Total number of walkers over all MPI ranks. If not provided, it is computed as ``walkers_per_rank`` times the number of MPI ranks. If both ``total_walkers`` and ``walkers_per_rank`` are provided, ``total_walkers`` must be equal to ``walkers_per_rank`` times the number of MPI ranks.
- ``blocks`` This parameter is universal for all the QMC methods. The MC processes are divided into a number of
``blocks``, each containing a number of steps. At the end of each block, the statistics accumulated in the block are dumped into files,
e.g., ``scalar.dat``. Typically, each block should have a sufficient number of steps that the I/O at the end of each block is negligible
compared with the computational cost. Each block should not take so long that monitoring its progress is difficult. There should be a
sufficient number of ``blocks`` to perform statistical analysis.
- ``warmupsteps`` - ``warmupsteps`` are used only for
equilibration. Property measurements are not performed during
warm-up steps.
- ``steps`` - ``steps`` are the number of energy and other property measurements to perform per block.
- ``substeps`` For each substep, an attempt is made to move each of the electrons once, by either a particle-by-particle or an
all-electron move. Because the local energy is evaluated only at
each full step and not at each substep, ``substeps`` are computationally cheap
and can be used to decorrelate samples at low computational cost.
- ``usedrift`` The VMC is implemented in two algorithms with
or without drift. In the no-drift algorithm, the move of each
electron is proposed with a Gaussian distribution. The standard
deviation is chosen as the time step input. In the drift algorithm,
electrons are moved by Langevin dynamics.
- ``timestep`` The meaning of time step depends on whether or not
the drift is used. In general, larger time steps reduce the
time correlation but might also reduce the acceptance ratio,
reducing overall statistical efficiency. For VMC, typically the
acceptance ratio should be close to 50% for an efficient
simulation.
- ``samples`` (not ready)
- ``storeconfigs`` If ``storeconfigs`` is set to a nonzero value, then electron configurations during the VMC run are saved to
files.
- ``blocks_between_recompute`` Recompute the accuracy critical determinant part of the wavefunction
from scratch: =1 by default when using mixed precision. =0 (no
recompute) by default when not using mixed precision. Recomputing
introduces a performance penalty dependent on system size.
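The walker-count bookkeeping described above can be sketched in plain Python as a sanity check. This is our own illustration of the stated rules, not QMCPACK code, and the function name and the handling of non-divisible ``total_walkers`` are assumptions:

```python
def resolve_walker_counts(mpi_ranks, walkers_per_rank=None,
                          total_walkers=None, crowds=1):
    """Resolve vmc_batch walker counts following the rules above (a sketch)."""
    if walkers_per_rank is None and total_walkers is None:
        # Neither given: walkers_per_rank defaults to the number of crowds.
        walkers_per_rank = crowds
    if walkers_per_rank is None:
        # Only total_walkers given; assume an even split over ranks.
        walkers_per_rank = total_walkers // mpi_ranks
    if total_walkers is None:
        total_walkers = walkers_per_rank * mpi_ranks
    if total_walkers != walkers_per_rank * mpi_ranks:
        raise ValueError("total_walkers must equal walkers_per_rank * mpi_ranks")
    return walkers_per_rank, total_walkers

# 4 MPI ranks with 256 walkers each -> 1024 walkers in total
print(resolve_walker_counts(4, walkers_per_rank=256))  # (256, 1024)
```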
An example VMC section for a simple ``vmc_batch`` run:
::
<qmc method="vmc_batch" move="pbyp">
<estimator name="LocalEnergy" hdf5="no"/>
<parameter name="walkers_per_rank"> 256 </parameter>
<parameter name="warmupSteps"> 100 </parameter>
<parameter name="substeps"> 5 </parameter>
<parameter name="blocks"> 20 </parameter>
<parameter name="steps"> 100 </parameter>
<parameter name="timestep"> 1.0 </parameter>
<parameter name="usedrift"> yes </parameter>
</qmc>
Here we set 256 walkers per MPI rank, have a brief initial equilibration of 100 ``steps``, and then have 20 ``blocks`` of 100 ``steps`` with 5 ``substeps`` each.
.. _optimization:
Wavefunction optimization
@ -490,7 +608,7 @@ optimization strategy. To track the progress of optimization, use the
command ``qmca -q ev *.scalar.dat`` to look at the VMC energy and
variance for each optimization step.
Adaptive Optimizer
~~~~~~~~~~~~~~~~~~
The default setting of the adaptive optimizer is to construct the linear
@ -765,7 +883,7 @@ Additional information and recommendations:
- For reporting quantities such as a final energy and associated uncertainty,
an average over many descent steps can be taken. The parameters for
``collection_step`` and ``compute_step`` help automate this task.
After the descent iteration specified by ``collection_step``, a
history of local energy values will be kept for determining a final
error and average, which will be computed and given in the output
@ -1014,7 +1132,7 @@ the tag is not added coefficients will not be saved.
The rest of the optimization block remains the same.
When running the optimization, the new coefficients will be stored in a ``*.sXXX.opt.h5`` file, where XXX corresponds to the series number. The H5 file contains only the optimized coefficients. The corresponding ``*.sXXX.opt.xml`` will be updated for each optimization block as follows:
::
@ -1036,9 +1154,10 @@ inconsistencies.
Diffusion Monte Carlo
---------------------
``dmc`` driver
~~~~~~~~~~~~~~
Main input parameters are given in :numref:`table9`, additional in :numref:`table10`.
parameters:
@ -1064,7 +1183,9 @@ parameters:
| | | | | |
| | string | classic/DRV/ZSGMA/YL | classic | Branch cutoff scheme |
+--------------------------------+--------------+-------------------------+-------------+-----------------------------------------------+
| ``maxcpusecs`` | real | :math:`\geq 0` | 3.6e5 | Deprecated. Superseded by ``max_seconds`` |
+--------------------------------+--------------+-------------------------+-------------+-----------------------------------------------+
| ``max_seconds`` | real | :math:`\geq 0` | 3.6e5 | Maximum allowed walltime in seconds |
+--------------------------------+--------------+-------------------------+-------------+-----------------------------------------------+
| ``blocks_between_recompute`` | integer | :math:`\geq 0` | dep. | Wavefunction recompute frequency |
+--------------------------------+--------------+-------------------------+-------------+-----------------------------------------------+
@ -1153,7 +1274,9 @@ Additional information:
variable specifies how often to reset all the variables kept in the
buffer.
- ``maxcpusecs``: Deprecated. Superseded by ``max_seconds``.
- ``max_seconds``: The default is 100 hours. Once the specified time has
elapsed, the program will finalize the simulation even if all blocks
are not completed.
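For example, on a machine with a hypothetical 6-hour queue limit, a run might set ``max_seconds`` slightly below that limit so the program finalizes cleanly before the scheduler kills it. All values below are illustrative placeholders:

```xml
<qmc method="dmc" move="pbyp">
  <parameter name="timestep"> 0.005 </parameter>
  <parameter name="blocks"> 400 </parameter>
  <parameter name="steps"> 100 </parameter>
  <!-- finalize after ~5.5 hours, leaving a margin before the 6-hour limit -->
  <parameter name="max_seconds"> 19800 </parameter>
</qmc>
```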
@ -1363,6 +1486,74 @@ Combining VMC and DMC in a single run (wavefunction optimization can be combined
<parameter name="timestep">0.005</parameter>
</qmc>
``dmc_batch`` driver (experimental)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
parameters:
+--------------------------------+--------------+-------------------------+-------------+-----------------------------------------------+
| **Name** | **Datatype** | **Values** | **Default** | **Description** |
+================================+==============+=========================+=============+===============================================+
| ``total_walkers`` | integer | :math:`> 0` | 1 | Total number of walkers over all MPI ranks |
+--------------------------------+--------------+-------------------------+-------------+-----------------------------------------------+
| ``walkers_per_rank`` | integer | :math:`> 0` | 1 | Number of walkers per MPI rank |
+--------------------------------+--------------+-------------------------+-------------+-----------------------------------------------+
| ``crowds``                     | integer      | :math:`> 0`             | dep.        | Number of desynchronized walker crowds        |
+--------------------------------+--------------+-------------------------+-------------+-----------------------------------------------+
| ``blocks`` | integer | :math:`\geq 0` | 1 | Number of blocks |
+--------------------------------+--------------+-------------------------+-------------+-----------------------------------------------+
| ``steps`` | integer | :math:`\geq 0` | 1 | Number of steps per block |
+--------------------------------+--------------+-------------------------+-------------+-----------------------------------------------+
| ``warmupsteps`` | integer | :math:`\geq 0` | 0 | Number of steps for warming up |
+--------------------------------+--------------+-------------------------+-------------+-----------------------------------------------+
| ``timestep`` | real | :math:`> 0` | 0.1 | Time step for each electron move |
+--------------------------------+--------------+-------------------------+-------------+-----------------------------------------------+
| ``nonlocalmoves`` | string | yes, no, v0, v1, v3 | no | Run with T-moves |
+--------------------------------+--------------+-------------------------+-------------+-----------------------------------------------+
| ``branching_cutoff_scheme`` | string | classic/DRV/ZSGMA/YL | classic | Branch cutoff scheme |
+--------------------------------+--------------+-------------------------+-------------+-----------------------------------------------+
| ``blocks_between_recompute`` | integer | :math:`\geq 0` | dep. | Wavefunction recompute frequency |
+--------------------------------+--------------+-------------------------+-------------+-----------------------------------------------+
| ``feedback`` | double | :math:`\geq 0` | 1.0 | Population feedback on the trial energy |
+--------------------------------+--------------+-------------------------+-------------+-----------------------------------------------+
| ``sigmaBound``                 | double       | :math:`\geq 0`          | 10          | Parameter to cutoff large weights             |
+--------------------------------+--------------+-------------------------+-------------+-----------------------------------------------+
| ``reconfiguration`` | string | yes/pure/other | no | Fixed population technique |
+--------------------------------+--------------+-------------------------+-------------+-----------------------------------------------+
| ``storeconfigs`` | integer | all values | 0 | Store configurations |
+--------------------------------+--------------+-------------------------+-------------+-----------------------------------------------+
| ``use_nonblocking`` | string | yes/no | yes | Using nonblocking send/recv |
+--------------------------------+--------------+-------------------------+-------------+-----------------------------------------------+
| ``debug_disable_branching`` | string | yes/no | no | Disable branching for debugging |
+--------------------------------+--------------+-------------------------+-------------+-----------------------------------------------+
| ``crowd_serialize_walkers``    | text         | yes, no                 | no          | Force use of single walker APIs (for testing) |
+--------------------------------+--------------+-------------------------+-------------+-----------------------------------------------+
- ``crowds`` The number of crowds that the walkers are subdivided into on each MPI rank. If not provided, it is set equal to the number of OpenMP threads.
- ``walkers_per_rank`` The number of walkers per MPI rank. This number does not have to be a multiple of the number of OpenMP
threads. However, to avoid any idle resources, it is recommended to be at least the number of OpenMP threads for pure CPU runs.
For GPU runs, a scan of this parameter is necessary to reach reasonable single rank efficiency and also get a balanced time to
solution. For highest throughput on GPUs, expect to use hundreds of walkers_per_rank, or the largest number that will fit in GPU
memory. If neither ``total_walkers`` nor ``walkers_per_rank`` is provided, ``walkers_per_rank`` is set equal to ``crowds``.
- ``total_walkers`` Total number of walkers summed over all MPI ranks, or equivalently the total number of walkers in the DMC
calculation. If not provided, it is computed as ``walkers_per_rank`` times the number of MPI ranks. If both ``total_walkers``
and ``walkers_per_rank`` are provided, which is not recommended, ``total_walkers`` must be consistently set equal to
``walkers_per_rank`` times the number of MPI ranks.
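The ``feedback`` parameter listed above controls how strongly the trial energy reacts to population fluctuations. A textbook form of this population control is sketched below; this is our own illustration, and QMCPACK's exact update formula may differ:

```python
import math

def update_trial_energy(e_est, population, target_population, feedback=1.0):
    """Textbook DMC population control: E_T = E_est - feedback * ln(N / N_target).

    A population above the target lowers the trial energy so excess walkers
    die off; a population below the target raises it so walkers multiply.
    Larger feedback pulls the population back to the target faster."""
    return e_est - feedback * math.log(population / target_population)

# Population 10% above target -> trial energy is lowered slightly.
print(update_trial_energy(-10.0, 1100, 1000, feedback=1.0))
```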
.. code-block::
:caption: The following is an example of a minimal DMC section using the ``dmc_batch`` driver
:name: Listing 48
<qmc method="dmc_batch" move="pbyp" target="e">
<parameter name="walkers_per_rank">256</parameter>
<parameter name="blocks">100</parameter>
<parameter name="steps">400</parameter>
<parameter name="timestep">0.010</parameter>
<parameter name="warmupsteps">100</parameter>
</qmc>
.. _rmc:
Reptation Monte Carlo


@ -1 +1,4 @@
sphinx>=3.0
sphinx_rtd_theme
sphinxcontrib-bibtex<2.0


@ -45,6 +45,13 @@ Output files
QMCPACK generates multiple files documented in :ref:`output-overview`.
Stopping a running simulation
-----------------------------
As detailed in :ref:`input-overview`, QMCPACK will cleanly stop execution at the end of the current block if it finds a file named
``project_id.STOP``, where ``project_id`` is the name of the project given in the input XML. You can also set the ``max_seconds``
parameter to establish an overall time limit.
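For example, with a hypothetical project id of ``vmc_test``, creating the stop file from Python (or equivalently ``touch vmc_test.STOP`` in the shell) triggers a clean stop at the end of the current block:

```python
from pathlib import Path

# "vmc_test" is a placeholder project id; use the one from your input XML.
# QMCPACK checks for this file and stops cleanly at the end of the block.
Path("vmc_test.STOP").touch()
print(Path("vmc_test.STOP").exists())  # True
```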
.. _parallelrunning:
Running in parallel with MPI


@ -29,7 +29,7 @@ The single particle spinors used in QMCPACK take the form
where :math:`s` is the spin variable, written in the complex spin representation.
In order to carry out spin-orbit calculations in solids, the single-particle spinors
can be obtained using Quantum ESPRESSO. After carrying out the spin-orbit calculation in QE
(with flags ``noncolin`` = .true., ``lspinorb`` = .true., and a relativistic ``.UPF`` pseudopotential),
the spinors can be obtained by using the converter *convertpw4qmc*:
@ -65,7 +65,7 @@ where we now utilize determinants of spinors, as opposed to the usual product of
</sposet_builder>
<determinantset>
<slaterdeterminant>
<determinant id="det" group="u" sposet="myspo" size="10"/>
</slaterdeterminant>
</determinantset>
<jastrow type="One-Body" name="J1" function="bspline" source="ion0" print="yes">


@ -61,7 +61,7 @@ def plot_convergence(filename):
plt.errorbar(tau_bps, energies, yerr=errs, fmt='o')
plt.xlabel(r'$\tau_{BP}$')
plt.ylabel(r'$E_{1B}$ (Ha)')
plt.savefig('h1e_conv.pdf', bbox_inches='tight')
if __name__ == '__main__':
plot_convergence('qmc.s000.stat.h5')


@ -26,7 +26,7 @@ from afqmctools.utils.linalg import get_ortho_ao
hcore = mf.get_hcore()
fock = (hcore + mf.get_veff())
X, nmo_per_kpt = get_ortho_ao(cell,kpts,1e-14)
with h5py.File(mf.chkfile, 'r+') as fh5:
fh5['scf/hcore'] = hcore
fh5['scf/fock'] = fock
fh5['scf/orthoAORot'] = X


@ -26,7 +26,7 @@ from afqmctools.utils.linalg import get_ortho_ao
hcore = mf.get_hcore()
fock = (hcore + mf.get_veff())
X, nmo_per_kpt = get_ortho_ao(cell,kpts,1e-14)
with h5py.File(mf.chkfile, 'r+') as fh5:
fh5['scf/hcore'] = hcore
fh5['scf/fock'] = fock
fh5['scf/orthoAORot'] = X


@ -55,7 +55,7 @@ mf.exxdiv = 'ewald'
mf.with_df = mydf
e_scf=mf.kernel()
assert mf.converged
ener = open('e_scf','w')
ener.write('%s\n' % (e_scf))


@ -54,7 +54,7 @@ mf.exxdiv = 'ewald'
mf.with_df = mydf
e_scf=mf.kernel()
assert mf.converged
ener = open('e_scf','w')
ener.write('%s\n' % (e_scf))


@ -0,0 +1,3 @@
build*
.build*


@ -1,121 +1,617 @@
# -*-indent-tabs-mode:nil;c-basic-offset:2;tab-width:4;-*-
# This file is a template, and might need editing before it works on your project.
#image: golang:latest
#variables:
# # Please edit to your GitLab project
# REPO_NAME: gitlab.com/namespace/project
# The problem is that to be able to use go get, one needs to put
# the repository in the $GOPATH. So for example if your gitlab domain
# is gitlab.com, and that your repository is namespace/project, and
# the default GOPATH being /go, then you'd need to have your
# repository in /go/src/gitlab.com/namespace/project
# Thus, making a symbolic link corrects this.
#before_script:
# - mkdir -p $GOPATH/src/$(dirname $REPO_NAME)
# - ln -svf $CI_PROJECT_DIR $GOPATH/src/$REPO_NAME
# - cd $GOPATH/src/$REPO_NAME
# - apt upgrade -y --quiet
# nvidia-cuda-dev nvidia-smi
# - sh ./array_ptr.cpp
# - sh ./array_ref.cpp
# - sh ./sort.cpp
# - sh ./comparisons.cpp
# - sh ./constructors.cpp
# - sh ./initializer_list.cpp
# - sh ./layout.cpp
# - sh ./fill.cpp
# - sh ./rotated.cpp
# - sh ./allocator.cpp
# - sh ./reextent.cpp
# - DEBIAN_FRONTEND=noninteractive apt install --assume-yes build-essential
# - DEBIAN_FRONTEND=noninteractive apt-get update
# - DEBIAN_FRONTEND=noninteractive apt install --no-install-recommends --assume-yes --quiet g++-7
# © Alfredo A. Correa 2020
image: debian:testing
variables:
CXX: "c++"
DOCKER_DRIVER: overlay2
g++-latest:
stage: test
script:
- export CXX="g++"
- apt-get -qq update && apt-get -qq install --no-install-recommends -y --quiet $CXX cmake make libboost-test-dev libboost-timer-dev libblas-dev libfftw3-dev
- ln --symbolic --force . ../multi
- $CXX --version
- mkdir build && cd build
- cmake ..
- cmake --build .
- ctest --output-on-failure
before_script:
- perl -pi -e 's/main/main\ contrib\ non-free/g' /etc/apt/sources.list
- apt update --quiet
- DEBIAN_FRONTEND=noninteractive apt install --no-install-recommends --assume-yes --quiet libboost-test-dev libboost-timer-dev libtbb-dev libboost-serialization-dev libboost-iostreams-dev librange-v3-dev valgrind
g++-latest-std20:
stage: test
script:
- export CXX="g++"
- apt-get update && apt-get install --no-install-recommends -y --quiet $CXX cmake make libboost-test-dev libboost-timer-dev libblas-dev libfftw3-dev
- ln --symbolic --force . ../multi
- $CXX --version
- mkdir build && cd build
- cmake .. -DCMAKE_CXX_STANDARD=20
- cmake --build . --verbose
- ctest --output-on-failure
stages:
- test
# - build
g++-current-debug:
stage: test
script:
- export CXX="g++"
- apt-get update && apt-get install --no-install-recommends -y --quiet $CXX cmake make libboost-test-dev libboost-timer-dev libblas-dev libfftw3-dev
- ln --symbolic --force . ../multi
- $CXX --version
- mkdir build && cd build
- cmake .. -DCMAKE_BUILD_TYPE=Debug
- cmake --build . --verbose
- ctest --output-on-failure
g++-current-release:
stage: test
script:
- export CXX="g++"
- apt-get update && apt-get install --no-install-recommends -y --quiet $CXX cmake make libboost-test-dev libboost-timer-dev libblas-dev libfftw3-dev
- ln --symbolic --force . ../multi
- $CXX --version
- mkdir build && cd build
- cmake .. -DCMAKE_BUILD_TYPE=Release
- cmake --build . --verbose
- ctest --output-on-failure
g++-7-std17:
stage: test
image: debian:stable-backports
script:
- export CXX="g++-7"
- apt-get update && apt-get install --no-install-recommends -y --quiet $CXX cmake make libboost-test-dev libboost-timer-dev libblas-dev libfftw3-dev
- ln --symbolic --force . ../multi
- $CXX --version
- mkdir build && cd build
- cmake .. -DCMAKE_CXX_STANDARD=17
- cmake --build .
- ctest --output-on-failure
g++-8:
stage: test
image: debian:stable-backports
script:
- export CXX="g++-8"
- apt-get update && apt-get install --no-install-recommends -y --quiet $CXX cmake make libboost-test-dev libboost-timer-dev libblas-dev libfftw3-dev
- ln --symbolic --force . ../multi
- $CXX --version
- mkdir build && cd build
- cmake ..
- cmake --build .
- ctest --output-on-failure
g++-9:
stage: test
script:
- export CXX="g++-9"
- apt-get update && apt-get install --no-install-recommends -y --quiet $CXX cmake make libboost-test-dev libboost-timer-dev libblas-dev libfftw3-dev
- ln --symbolic --force . ../multi
- $CXX --version
- mkdir build && cd build
- cmake ..
- cmake --build . --verbose
- ctest --output-on-failure
g++-9-std17:
stage: test
script:
- export CXX="g++-9"
- apt-get update && apt-get install --no-install-recommends -y --quiet $CXX cmake make libboost-test-dev libboost-timer-dev libblas-dev libfftw3-dev
- ln --symbolic --force . ../multi
- $CXX --version
- mkdir build && cd build
- cmake .. -DCMAKE_CXX_STANDARD=17
- cmake --build . --verbose
- ctest --output-on-failure
g++-9-openblas:
stage: test
script:
- export CXX="g++-9"
- apt-get update && apt-get install --no-install-recommends -y --quiet $CXX cmake make libboost-test-dev libboost-timer-dev libopenblas-dev libfftw3-dev
- ln --symbolic --force . ../multi
- $CXX --version
- mkdir build && cd build
- cmake ..
- cmake --build . --verbose
- ctest --output-on-failure
memcheck:
stage: test
script:
- export CXX="g++-9"
- apt-get update && apt-get install --no-install-recommends -y --quiet $CXX valgrind cmake make libboost-test-dev libboost-timer-dev libblas-dev libfftw3-dev
- ln --symbolic --force . ../multi
- $CXX --version
- mkdir build && cd build
- cmake ..
- cmake --build . --verbose
- ctest -T memcheck --output-on-failure
clang++-9:
stage: test
script:
- export CXX="clang++-9"
- apt-get update && apt-get install --no-install-recommends -y --quiet clang-9 cmake make libboost-test-dev libboost-timer-dev libblas-dev libfftw3-dev
- ln --symbolic --force . ../multi
- $CXX --version
- mkdir build && cd build
- cmake ..
- cmake --build . --verbose
- ctest --output-on-failure
clang++-9-std17:
stage: test
script:
- export CXX="clang++-9"
- apt-get update && apt-get install --no-install-recommends -y --quiet clang-9 cmake make libboost-test-dev libboost-timer-dev libblas-dev libfftw3-dev
- ln --symbolic --force . ../multi
- $CXX --version
- mkdir build && cd build
- cmake .. -DCMAKE_CXX_STANDARD=17
- cmake --build . --verbose
- ctest --output-on-failure
clang++-9-asan:
stage: test
script:
- export CXX="clang++-9"
- export CXXFLAGS="-fsanitize=undefined -fsanitize=address -fno-omit-frame-pointer"
- apt-get update && apt-get install --no-install-recommends -y --quiet clang-9 cmake make libboost-test-dev libboost-timer-dev libblas-dev libfftw3-dev llvm
- ln --symbolic --force . ../multi
- $CXX --version
- mkdir build && cd build
- cmake ..
- cmake --build . --verbose
- export ASAN_OPTIONS="halt_on_error=1 detect_leaks=1"
- ctest --output-on-failure
# clang 10 and clang 11 have a bug when compiling in c++17 mode
clang++:
stage: test
script:
- export CXX="clang++"
- apt-get update && apt-get install --no-install-recommends -y --quiet clang cmake make libboost-test-dev libblas-dev
- ln --symbolic --force . ../multi
- $CXX --version
- mkdir build && cd build
- cmake ..
- cmake --build . --verbose -- --quiet --no-print-directory
- ctest --output-on-failure
clang++-tidy:
stage: test
script:
- export CXX="clang++"
- apt-get -qq update && apt-get -qq install --no-install-recommends -y --quiet clang clang-tidy cmake make libboost-test-dev libblas-dev libblas-dev liblapack-dev libfftw3-dev
- ln --symbolic --force . ../multi
- $CXX --version
- clang-tidy --version
- mkdir build && cd build
- clang-tidy -checks=*,-fuchsia-default-arguments-calls,-fuchsia-statically-constructed-objects,-cppcoreguidelines-pro-type-vararg,-hicpp-vararg,-cppcoreguidelines-avoid-magic-numbers,-readability-magic-numbers,-cppcoreguidelines-macro-usage,-cppcoreguidelines-avoid-non-const-global-variables,-llvmlibc-implementation-in-namespace,-llvmlibc-callee-namespace,-llvmlibc-restrict-system-libc-headers,-cert-err58-cpp --warnings-as-errors=* --list-checks
- cmake .. -DCMAKE_CXX_CLANG_TIDY="clang-tidy;-checks=*,-fuchsia-default-arguments-calls,-fuchsia-statically-constructed-objects,-fuchsia-overloaded-operator,-cppcoreguidelines-pro-type-vararg,-hicpp-vararg,-cppcoreguidelines-avoid-magic-numbers,-readability-magic-numbers,-cppcoreguidelines-macro-usage,-cppcoreguidelines-avoid-non-const-global-variables,-llvmlibc-implementation-in-namespace,-llvmlibc-callee-namespace,-llvmlibc-restrict-system-libc-headers,-cert-err58-cpp;--warnings-as-errors=*"
- cmake --build . --verbose -- --quiet --no-print-directory
- ctest --output-on-failure
clang++-std17:
stage: test
script:
- export CXX="clang++"
- apt-get update && apt-get install --no-install-recommends -y --quiet clang cmake make libboost-test-dev libblas-dev
- ln --symbolic --force . ../multi
- $CXX --version
- mkdir build && cd build
- cmake .. -DCMAKE_CXX_STANDARD=17
- cmake --build . --verbose -- --quiet --no-print-directory
- ctest --output-on-failure
clang++-11:
stage: test
script:
- export CXX="clang++-11"
- apt-get update && apt-get install --no-install-recommends -y --quiet clang-11 cmake make libboost-test-dev libblas-dev
- ln --symbolic --force . ../multi
- $CXX --version
- mkdir build && cd build
- cmake ..
- cmake --build . --verbose -- --quiet --no-print-directory
- ctest --output-on-failure
clang++-11-gcc9:
stage: test
image: vistart/cuda:10.2-ubuntu20.04
script:
- export CXX="clang++-11"
- apt-get update
- DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends -y --quiet gcc-9 clang-11 cmake make libboost-test-dev libblas-dev
- ln --symbolic --force . ../multi
- $CXX -v
- mkdir build && cd build
- cmake ..
- cmake --build . --verbose -- --quiet --no-print-directory
- ctest --output-on-failure
clang++-11-std17:
stage: test
script:
- export CXX="clang++-11"
- apt-get update && apt-get install --no-install-recommends -y --quiet clang-11 cmake make libboost-test-dev libblas-dev
- ln --symbolic --force . ../multi
- $CXX --version
- mkdir build && cd build
- cmake .. -DCMAKE_CXX_STANDARD=17
- cmake --build . --verbose -- --quiet --no-print-directory
- ctest --output-on-failure
icc:
image: meteocima/dkr-intel
stage: test
script:
- . /opt/intel/bin/compilervars.sh intel64
- export CXX="icpc"
- apt-get update && apt-get install --no-install-recommends -y --quiet cmake make libboost-test-dev libblas-dev liblapack-dev libfftw3-dev
- ln --symbolic --force . ../multi
- $CXX -v
- mkdir build && cd build
- cmake .. -DCMAKE_VERBOSE_MAKEFILE:BOOL=ON
- cmake --build . -- --quiet --no-print-directory
- export MKL_VERBOSE=1
- ctest --output-on-failure
icc-nomkl:
image: meteocima/dkr-intel
stage: test
script:
- export CXX="/opt/intel/bin/icpc"
- apt-get update && apt-get install --no-install-recommends -y --quiet cmake make libboost-test-dev libblas-dev liblapack-dev libfftw3-dev
- ln --symbolic --force . ../multi
- $CXX -v
- mkdir build && cd build
- cmake .. -DCMAKE_VERBOSE_MAKEFILE:BOOL=ON
- cmake --build . -- --quiet --no-print-directory
- export MKL_VERBOSE=1
- ctest --output-on-failure
icc-memcheck:
image: meteocima/dkr-intel
stage: test
script:
- . /opt/intel/bin/compilervars.sh intel64
- export CXX="icpc"
- apt-get update && apt-get install --no-install-recommends -y --quiet cmake make valgrind libboost-test-dev libblas-dev liblapack-dev libfftw3-dev
- ln --symbolic --force . ../multi
- $CXX -v
- mkdir build && cd build
- cmake .. -DCMAKE_VERBOSE_MAKEFILE:BOOL=ON -DMEMORYCHECK_COMMAND_OPTIONS="--leak-check=full --show-reachable=yes --track-origins=yes --malloc-fill=0xEE --free-fill=0xFF --leak-check-heuristics=none -v --track-fds=yes --error-limit=no --show-below-main=yes --read-var-info=yes --gen-suppressions=all"
- cmake --build . -- --quiet --no-print-directory
- export MKL_VERBOSE=1
- ctest -T memcheck --output-on-failure || (cat Testing/Temporary/MemoryChecker.*.log && exit 0)
icc-std17:
image: meteocima/dkr-intel
stage: test
script:
- . /opt/intel/bin/compilervars.sh intel64
- export CXX="icpc"
- apt-get update && apt-get install --no-install-recommends -y --quiet cmake make libboost-test-dev libblas-dev liblapack-dev libfftw3-dev
- ln --symbolic --force . ../multi
- $CXX -v
- mkdir build && cd build
- cmake .. -DCMAKE_CXX_STANDARD=17 -DCMAKE_VERBOSE_MAKEFILE:BOOL=ON
- cmake --build . -- --quiet --no-print-directory
- export MKL_VERBOSE=1
- ctest --output-on-failure
cuda-10.0:
image: vistart/cuda:10.2-ubuntu20.04
stage: test
script:
- export DEBIAN_FRONTEND=noninteractive
- apt-get update && apt-get install --no-install-recommends -y --quiet gcc-8 g++-8 cmake make libboost-test-dev libboost-timer-dev libblas-dev libfftw3-dev
- ln --symbolic --force . ../multi
- nvcc --version
- g++-8 --version
- cd test
- mkdir build && cd build
- cmake -DENABLE_CUDA=1 -DCMAKE_CUDA_FLAGS="-ccbin=g++-8" ..
- cmake --build . --verbose
- ctest --output-on-failure
cuda-11.0:
image: nvidia/cuda:11.0-devel
stage: test
script:
- export DEBIAN_FRONTEND=noninteractive
- apt-get update && apt-get install --no-install-recommends -y --quiet cmake make libboost-test-dev libboost-timer-dev libblas-dev libfftw3-dev
- ln --symbolic --force . ../multi
- nvcc --version
- mkdir build && cd build
- cmake .. -DENABLE_CUDA=1
- cmake --build . --verbose
- ctest --output-on-failure
cuda-11.1:
image: nvidia/cuda:11.1-devel
stage: test
script:
- export DEBIAN_FRONTEND=noninteractive
- apt-get update && apt-get install --no-install-recommends -y --quiet cmake make libboost-test-dev libboost-timer-dev libblas-dev libfftw3-dev
- ln --symbolic --force . ../multi
- nvcc --version
- mkdir build && cd build
- cmake .. -DENABLE_CUDA=1
- cmake --build . --verbose
- ctest --output-on-failure
cuda-11.0-std17:
image: nvidia/cuda:11.0-devel
stage: test
script:
- export DEBIAN_FRONTEND=noninteractive
- apt-get update && apt-get install --no-install-recommends -y --quiet cmake make libboost-test-dev libboost-timer-dev libblas-dev libfftw3-dev
- ln --symbolic --force . ../multi
- nvcc --version
- mkdir build && cd build
- cmake .. -DCMAKE_CXX_STANDARD=17 -DENABLE_CUDA=1
- cmake --build . --verbose
- ctest --output-on-failure
cuda-11.1-std17:
image: nvidia/cuda:11.1-devel
stage: test
script:
- export DEBIAN_FRONTEND=noninteractive
- apt-get update && apt-get install --no-install-recommends -y --quiet cmake make libboost-test-dev libboost-timer-dev libblas-dev libfftw3-dev
- ln --symbolic --force . ../multi
- nvcc --version
- mkdir build && cd build
- cmake .. -DCMAKE_CXX_STANDARD=17 -DENABLE_CUDA=1
- cmake --build . --verbose
- ctest --output-on-failure
cuda-11.2-std17:
image: nvidia/cuda:11.2.0-devel
stage: test
script:
- export DEBIAN_FRONTEND=noninteractive
- apt-get update && apt-get install --no-install-recommends -y --quiet cmake make libboost-test-dev libboost-timer-dev libblas-dev libfftw3-dev
- ln --symbolic --force . ../multi
- nvcc --version
- mkdir build && cd build
- cmake .. -DCMAKE_CXX_STANDARD=17 -DENABLE_CUDA=1
- cmake --build . --verbose
- ctest --output-on-failure
g++-cppcheck:
stage: test
script:
- export CXX="g++"
- apt-get -qq update && apt-get -qq install --no-install-recommends -y --quiet $CXX cmake make libboost-test-dev libboost-timer-dev libblas-dev liblapack-dev libfftw3-dev cppcheck
- ln --symbolic --force . ../multi
- $CXX --version
- cppcheck --version
- mkdir build && cd build
- cmake -DCMAKE_CXX_CPPCHECK="cppcheck;--enable=all;--suppress=missingIncludeSystem;--suppress=unmatchedSuppression;--suppress=missingInclude;--inline-suppr;-D__align__;-DCUDARTAPI;--language=c++;--std=c++17;--error-exitcode=666" ..
- cmake --build .
- ctest --output-on-failure
qmcpack-g++:
stage: test
script:
- apt-get -qq update && apt-get -qq install --no-install-recommends -y libblas-dev liblapack-dev libfftw3-dev libboost-serialization-dev libopenmpi-dev gfortran g++ cmake make git ca-certificates numdiff python3 python3-numpy python3-h5py python3-mpi4py python3-scipy libxml2-dev libhdf5-dev
- git clone https://github.com/QMCPACK/qmcpack.git
- cd qmcpack
- git config --global user.email "alfredo.correa@gmail.com" && git config --global user.name "Alfredo Correa"
- git rm -r external_codes/boost_multi/multi && git commit -m "remove multi subtree"
- git subtree add --squash -P external_codes/boost_multi/multi https://gitlab.com/correaa/boost-multi.git $CI_COMMIT_BRANCH
- cd build
- cmake -DCMAKE_C_COMPILER=mpicc -DCMAKE_CXX_COMPILER=mpicxx -DBUILD_AFQMC=1 -DBUILD_PPCONVERT=1 -DQMC_MIXED_PRECISION=1 -DCMAKE_BUILD_TYPE=Debug -DCMAKE_CXX_FLAGS="-Werror" -DMPIEXEC_PREFLAGS="--allow-run-as-root;--bind-to;none" ..
- make ppconvert afqmc test_afqmc_matrix test_afqmc_numerics test_afqmc_slaterdeterminantoperations test_afqmc_walkers test_afqmc_hamiltonians test_afqmc_hamiltonian_operations test_afqmc_phmsd test_afqmc_wfn_factory test_afqmc_prop_factory test_afqmc_estimators qmc-afqmc-performance
- ctest -R ppconvert --output-on-failure
- ctest -R afqmc --output-on-failure
qmcpack-cuda-11.2-compileonly:
image: nvidia/cuda:11.2.0-devel
stage: test
script:
- export DEBIAN_FRONTEND=noninteractive
- apt-get -qq update && apt-get -qq install --no-install-recommends -y libblas-dev liblapack-dev libfftw3-dev libboost-serialization-dev libopenmpi-dev gfortran g++ cmake make git ca-certificates numdiff python3 python3-numpy python3-h5py python3-mpi4py python3-scipy libxml2-dev libhdf5-dev
- git clone https://github.com/QMCPACK/qmcpack.git
- cd qmcpack
- git config --global user.email "alfredo.correa@gmail.com" && git config --global user.name "Alfredo Correa"
- git rm -r external_codes/boost_multi/multi && git commit -m "remove multi subtree"
- git subtree add --squash -P external_codes/boost_multi/multi https://gitlab.com/correaa/boost-multi.git $CI_COMMIT_BRANCH
- cd build
- cmake -DCMAKE_C_COMPILER=mpicc -DCMAKE_CXX_COMPILER=mpicxx -DBUILD_AFQMC=1 -DBUILD_PPCONVERT=1 -DQMC_CXX_STANDARD=17 -DENABLE_CUDA=1 ..
- make ppconvert afqmc test_afqmc_matrix test_afqmc_numerics test_afqmc_slaterdeterminantoperations test_afqmc_walkers test_afqmc_hamiltonians test_afqmc_hamiltonian_operations test_afqmc_phmsd test_afqmc_wfn_factory test_afqmc_prop_factory test_afqmc_estimators qmc-afqmc-performance
- ctest -R ppconvert --output-on-failure
inq-g++-latest:
stage: test
script:
- apt-get update && apt-get install --no-install-recommends -y --quiet libblas-dev liblapack-dev libfftw3-dev libboost-serialization-dev libopenmpi-dev gfortran g++ cmake make git ca-certificates
- git clone --recurse-submodules --remote-submodules https://gitlab.com/npneq/inq.git
- cd inq
- cd external_libs/multi
- git checkout $CI_COMMIT_BRANCH
- cd ../..
- mkdir build && cd build
- CXX=mpic++ ../configure --prefix=$HOME
- make
- make install
- ctest --output-on-failure
inq-cuda-11.2-compileonly:
image: nvidia/cuda:11.2.0-devel
stage: test
script:
- export DEBIAN_FRONTEND=noninteractive
- apt-get update && apt-get install --no-install-recommends -y --quiet libblas-dev liblapack-dev libfftw3-dev libboost-serialization-dev libopenmpi-dev gfortran g++ cmake make git ca-certificates
- git clone --recurse-submodules https://gitlab.com/npneq/inq.git
- cd inq
- cd external_libs/multi
- git checkout $CI_COMMIT_BRANCH
- cd ../..
- mkdir build && cd build
- export CUDACXX=/usr/local/cuda/bin/nvcc
- export CUDAFLAGS="$(for x in `mpic++ --showme:incdirs`; do echo -n -I$x" " ; done) -std=c++17 -DFMT_USE_UDL_TEMPLATE=0 -D_DISABLE_CUDA_SLOW -O0 --gpu-architecture sm_70 --expt-relaxed-constexpr --expt-extended-lambda --Werror=cross-execution-space-call --compiler-options -std=c++17,-O0,-Wall,-Wfatal-errors"
- export LDFLAGS=$(for x in `mpic++ --showme:libdirs`; do echo -n -L$x" " ; done)
- export LIBS=$(for x in `mpic++ --showme:libs`; do echo -n -l$x" " ; done)
- $CUDACXX -V
- ../configure --prefix=$PREFIX --enable-cuda --with-cuda-prefix=/usr/local/cuda
- make silicon
g++-runner:
stage: test
tags:
- intel_compiler
script:
- export CXX="g++"
- $CXX --version
- mkdir build && cd build
- cmake --version
- cmake .. -DCMAKE_VERBOSE_MAKEFILE:BOOL=ON
- cmake --build . -j 12
- ctest --output-on-failure
#icpc-nomkl-runner:
# stage: test
# tags:
# - intel_compiler
# script:
# - export CXX="/opt/intel/system_studio_2020/bin/icpc"
# - $CXX --version
# - mkdir build && cd build
# - cmake .. -DCMAKE_VERBOSE_MAKEFILE:BOOL=ON
# - cmake --build . -j 12
# - export MKL_VERBOSE=1
# - ctest --output-on-failure
#icpc-runner:
# stage: test
# tags:
# - intel_compiler
# script:
# - . /opt/intel/system_studio_2020/bin/compilervars.sh intel64
# - export CXX="icpc"
# - $CXX --version
# - mkdir build && cd build
# - cmake .. -DCMAKE_VERBOSE_MAKEFILE:BOOL=ON
# - cmake --build . -j 12
# - export MKL_VERBOSE=1
# - ctest --output-on-failure
#icpc-memcheck-runner:
# stage: test
# tags:
# - intel_compiler
# script:
# - . /opt/intel/system_studio_2020/bin/compilervars.sh intel64
# - export CXX="icpc"
# - $CXX --version
# - mkdir build && cd build
# - cmake .. -DCMAKE_VERBOSE_MAKEFILE:BOOL=ON
# - cmake --build . -j 12
# - ctest -T memcheck --output-on-failure || (cat Testing/Temporary/MemoryChecker.*.log && exit 0)
#icpc-std17-runner:
# stage: test
# tags:
# - intel_compiler
# script:
# - . /opt/intel/system_studio_2020/bin/compilervars.sh intel64
# - export CXX="icpc"
# - $CXX --version
# - mkdir build && cd build
# - cmake .. -DCMAKE_CXX_STANDARD=17 -DCMAKE_VERBOSE_MAKEFILE:BOOL=ON
# - cmake --build . -j 12
# - export MKL_VERBOSE=1
# - ctest --output-on-failure
inq-nvcc-ompi:
stage: test
tags:
- intel_compiler
script:
- export PREFIX=`mktemp -d`
- git clone --recurse-submodules https://gitlab.com/npneq/inq.git
- cd inq
- cd external_libs/multi
- git checkout $CI_COMMIT_BRANCH
- cd ../..
- mkdir build && cd build
- export CUDACXX=/usr/local/cuda/bin/nvcc
- export CUDAFLAGS="$(for x in `mpic++ --showme:incdirs`; do echo -n -I$x" " ; done) -std=c++17 -DFMT_USE_UDL_TEMPLATE=0 -D_DISABLE_CUDA_SLOW -O3 --gpu-architecture sm_70 --expt-relaxed-constexpr --expt-extended-lambda --Werror=cross-execution-space-call --compiler-options -Ofast,-std=c++17,-Wall,-Wfatal-errors"
- export LDFLAGS=$(for x in `mpic++ --showme:libdirs`; do echo -n -L$x" " ; done)
- export LIBS=$(for x in `mpic++ --showme:libs`; do echo -n -l$x" " ; done)
- $CUDACXX -V
- ../configure --prefix=$PREFIX --enable-cuda --with-cuda-prefix=/usr/local/cuda
- make -j8
- make -j8 install
- ctest --output-on-failure --timeout 600
- cd src; INQ_EXEC_ENV="mpirun --oversubscribe -np 4" ctest --output-on-failure --timeout 600; cd ..
- rm -rf $PREFIX
#blas&fft:
# stage: test
# script:
# - perl -pi -e 's/main/main\ contrib\ non-free/g' /etc/apt/sources.list
# - apt update --quiet
# - DEBIAN_FRONTEND=noninteractive apt install --no-install-recommends -f-assume-yes --quiet libboost-test-dev libboost-timer-dev libtbb-dev libboost-serialization-dev libboost-iostreams-dev librange-v3-dev valgrind
# - DEBIAN_FRONTEND=noninteractive apt install --no-install-recommends --assume-yes --quiet clang pkg-config libblas-dev libblas64-dev libfftw3-dev nvidia-cuda-toolkit
# - ln --symbolic --force . ../multi
# - export CXXX="clang++ -x c++"
# - export CXXFLAGS="-Wall -Wextra -Wpedantic -O3 -lcudart -lfftw3 -lcublas -lcufft -lboost_timer -lboost_unit_test_framework `pkg-config --libs blas`"
# - export CXX="${CXXX} ${CXXFLAGS}"
# - $CXX --version
# - cd adaptors/blas
# - for a in ./*.hpp; do echo $a; $CXX $a || exit; done;
# - cd tests
# - for a in ./*.cpp; do echo $a; $CXX $a || exit; done;
# - cd ..
# - cd ../..
# - cd adaptors
# - sh ./fftw.hpp
# - $CXX fft.hpp
#blas&fftGPU-11:
# stage: build
# tags:
# - cuda_gpu
# stage: test
# script:
# - export PATH=/usr/local/cuda-11.0/bin:$PATH #export PATH=/usr/local/cuda/bin:$PATH
# - export LD_LIBRARY_PATH=/usr/local/cuda-11.0/lib64:$LD_LIBRARY_PATH
# - export CXXX="clang++ -x c++"
# - export CXXFLAGS="`#-Wall -Wextra -Wpedantic` -Ofast -Wl,-rpath=/usr/local/cuda/lib64 -L/usr/local/cuda-11.0/lib64 -I/usr/local/cuda-11.0/include -lcudart -lfftw3 -lcublas -lcufft -lboost_timer -lboost_unit_test_framework `pkg-config --libs blas` "
# - export CXX="${CXXX} ${CXXFLAGS}"
# - $CXX --version
# - cd adaptors/blas
# - for a in ./*.hpp; do echo $a; sh $a || exit; echo "\n"; done;
# - cd tests
# - for a in ./*.cpp; do echo $a; sh $a || exit; echo "\n"; done;
# - cd ..
# - cd ../..
# - cd adaptors
# - sh ./fftw.hpp
# - sh ./fft.hpp
#blas&fftGPU:
# stage: build
# tags:
# - cuda_gpu
# stage: test
# script:
# - export PATH=/usr/local/cuda/bin:$PATH #export PATH=/usr/local/cuda/bin:$PATH
# - export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
# - export CXXX="clang++ -x c++"
# - export CXXFLAGS="`#-Wall -Wextra -Wpedantic` -Ofast -Wl,-rpath=/usr/local/cuda/lib64 -L/usr/local/cuda/lib64 -I/usr/local/cuda/include -lcudart -lfftw3 -lcublas -lcufft -lboost_timer -lboost_unit_test_framework `pkg-config --libs blas` "
# - export CXX="${CXXX} ${CXXFLAGS}"
# - $CXX --version
# - cd adaptors/blas
# - for a in ./*.hpp; do echo $a; sh $a || exit; echo "\n"; done;
# - cd tests
# - for a in ./*.cpp; do echo $a; sh $a || exit; echo "\n"; done;
# - cd ..
# - cd ../..
# - cd adaptors
# - sh ./fftw.hpp
# - sh ./fft.hpp
@ -0,0 +1,24 @@
cmake_minimum_required(VERSION 3.11)
project(boost-multi CXX)
set(CMAKE_CXX_STANDARD 14)
add_library(${PROJECT_NAME} INTERFACE)
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake")
#configure_file(config.h.in config.h)
set(CMAKE_CXX_STANDARD_REQUIRED True)
set(CMAKE_CXX_EXTENSIONS OFF)
set(CMAKE_VERBOSE_MAKEFILE ON)
set(CMAKE_RULE_MESSAGES OFF)
enable_testing()
find_program(MEMORYCHECK_COMMAND valgrind)
set(MEMORYCHECK_COMMAND_OPTIONS "--leak-check=full --error-exitcode=1")
include(CTest)
add_subdirectory(test)
add_subdirectory(adaptors/blas)
@ -5,7 +5,7 @@
(not an official Boost library)
_© Alfredo A. Correa, 2018-2021_
`Multi` provides multidimensional array access to contiguous or regularly contiguous memory (or ranges).
It shares the goals of [Boost.MultiArray](https://www.boost.org/doc/libs/1_69_0/libs/multi_array/doc/index.html),
@ -14,10 +14,6 @@ although the code is completely independent and the syntax has slight difference
Multi aims to simplify the semantics of Boost.MultiArray and make it more compatible with the Standard (STL) Algorithms and special memory.
It requires C++14.
Some features:
@ -30,6 +26,38 @@ Some features:
(Do not confuse this library with Boost.MultiArray or Boost.MultiIndex.)
## Contents
[[_TOC_]]
## Installation and Tests
`Multi` doesn't require installation; a single `#include <multi/array.hpp>` is enough to use the full core library.
`Multi`'s _only_ dependency is the standard C++ library.

It is important to compile programs that use the library with a decent level of optimization (e.g. `-O2`) to avoid slowdowns if individual element access is used intensively.
For example, when testing speed, please make sure that you are compiling in release mode (`-DNDEBUG`) and with optimizations (`-O3`);
if your test involves mathematical operations, add arithmetic optimizations (`-Ofast`) to compare with Fortran code.

A CMake build system is provided to automatically run basic tests.
The tests depend on Boost.Test.
```bash
git clone https://gitlab.com/correaa/boost-multi.git multi
cd multi
```
```bash
#export CXX="nvcc -DBOOST_PP_VARIADICS=1 -x cu -O3" #optional spec. compiler
mkdir -p test/build
cd test/build
cmake ..
make -j
make test -j
```
The code is developed on `clang` (10.0), `gcc` (9.3) and `nvcc` (11) compilers, and [tested regularly](https://gitlab.com/correaa/boost-multi/pipelines) with clang 9.0, NVCC 10.1, Intel (19.1), and PGI (nvc++ 20.7) compilers.
For detailed compilation instructions for the tests, see the Continuous Integration (CI) definition file https://gitlab.com/correaa/boost-multi/-/blob/master/.gitlab-ci.yml
## Types
* `multi::array<T, D, A>`: array of dimension `D`; it has value semantics if `T` has value semantics. Memory is requested through an allocator of type `A`, which may be stateful.
@ -42,42 +70,50 @@ Some features:
Declare an array specifying the element type and the dimension.
Elements can be input with nested braced notation.
```cpp
multi::array<double, 2> A = {
	{1, 2, 3},
	{4, 5, 6}
};
```
The size is automatically deduced; the first dimension corresponds to the (two) "rows" above.
```cpp
assert( A.size()==2 );
assert( std::get<1>(A.sizes()) == 3 );
```
The value of an array can be copied, moved, and compared.
Copies are equal but independent.
```cpp
multi::array<double, 2> B = A;
assert( extensions(B) == extensions(A) );
assert( B[0][1] == A[0][1] );
assert( &B[0][1] != &A[0][1] );
assert( B == A );
```
An array can be initialized from its sizes alone, in which case the element values are default-constructed:
```cpp
multi::array<double, 3> C({3, 4, 5}); // 3*4*5 = 60 elements
```
Arrays can be passed by value or by reference; most of the time, they should be passed through generic parameters.
Most useful functions work on the concept of an array rather than on a concrete type.
```cpp
template<class ArrayDouble2D> // instead of the over specific argument std::array<double, 2>
double const& element_1_1(ArrayDouble2D const& m){return m[1][1];}
...
assert( element_1_1(A) == A[1][1] );
```
These generic function arguments that are not intended to be modified are passed by `const&`; otherwise pass by forward-reference `&&`.
In this way the functions can be called on subblocks of larger matrices.
```cpp
multi::array<double, 3> C3D({3, 3, 3});
assert( &element_1_1(C3D[0]) == &C3D[0][1][1] );
```
@ -85,7 +121,7 @@ assert( &element_1_1(C3D[0]) == &C3D[0][1][1] );
We create a static C-array of `double`s, and refer to it via a bidimensional array `multi::array_ref<double, 2>`.
```cpp
#include "../array_ref.hpp"
#include "../array.hpp"
@ -103,38 +139,44 @@ We create a static C-array of `double`s, and refer to it via a bidimensional arr
{ 50, 6, 7, 8, 9}
};
multi::array_ref<double, 2> d2D_ref{&d2D[0][0], {4, 5}};
...
...
```
Note that the syntax of creating a reference array involves passing the pointer to a memory block (20 elements here) and the logical dimensions of that memory block (4 by 5 here).
Next we print the elements in a way that corresponds to the logical arrangement:
```cpp
...
for(auto i : d2D_ref.extension(0)){
for(auto j : d2D_ref.extension(1))
cout << d2D_ref[i][j] <<' ';
cout <<'\n';
}
...
```
This will output:
> ```cpp
> 150 16 17 18 19
> 30 1 2 3 4
> 100 11 12 13 14
> 50 6 7 8 9
> ```
It is sometimes said (by Sean Parent) that the whole of the STL algorithms can be seen as intermediate pieces to implement `std::stable_sort`.
Presumably, if one can sort over a range, one can perform any other standard algorithm.
```cpp
...
std::stable_sort( begin(d2D_ref), end(d2D_ref) );
...
```
If we print this we will get
> ```cpp
> 30 1 2 3 4
> 50 6 7 8 9
> 100 11 12 13 14
@ -145,24 +187,30 @@ If we print this we will get
The array has been changed to be in row-based lexicographical order.
Since the sorted array is a reference to the original data, the original array has changed.
```cpp
...
assert( d2D[1][1] == 6 );
...
```
(Note that `std::*sort` cannot be applied directly to a multidimensional C-array or to Boost.MultiArray types.)
If we want to order the matrix on a per-column basis, we need to "view" the matrix as a range of columns. In the bidimensional case, this is done by accessing the matrix as a range of columns:
```cpp
...
std::stable_sort( d2D_ref.begin(1), d2D_ref.end(1) );
```
This will transform the matrix into:
> ```cpp
> 1 2 3 4 30
> 6 7 8 9 50
> 11 12 13 14 100
> 16 17 18 19 150
> ```
In other words, a matrix of dimension `D` can be viewed simultaneously as `D` different ranges of different "transpositions" by passing an integer value to `begin` and `end` indicating the preferred dimension.
`begin(0)` is equivalent to `begin()`.
@ -171,7 +219,7 @@ In other words, a matrix of dimension `D` can be viewed simultaneously as `D` di
`array_ref` is initialized from a preexisting contiguous range; the index extensions should be compatible with the total number of elements.
```cpp
double* dp = new double[12];
multi::array_ref<double, 2> A({3,4}, dp);
multi::array_ref<double, 2> B({2,6}, dp);
@ -180,7 +228,7 @@ delete[] dp;
```
`array` is initialized by specifying the index extensions (and optionally a default value) or alternatively from a rectangular list.
```cpp
/*In C++17 the element-type and the dimensionality can be omitted*/
multi::array/*<double, 1>*/ A1 = {1.,2.,3.};
assert(A1.dimensionality==1 and A1.num_elements()==3);
@ -212,7 +260,7 @@ For example in three dimensional array,
As an example, this function allows printing arrays of arbitrary dimension into a linear comma-separated form.
```cpp
void print(double const& d){cout<<d;};
template<class MultiArray>
void print(MultiArray const& ma){
@ -261,7 +309,7 @@ Transpositions are also multi-dimensional arrays views in which the index are *l
As an illustration of an algorithm based on index access (as opposed to iterators),
this example code implements Gauss Jordan Elimination without pivoting:
```cpp
template<class Matrix, class Vector>
auto gj_solve(Matrix&& A, Vector&& y)->decltype(y[0]/=A[0][0], y){
std::ptrdiff_t Asize = size(A);
@ -287,7 +335,7 @@ auto gj_solve(Matrix&& A, Vector&& y)->decltype(y[0]/=A[0][0], y){
This function can be applied to a `multi::array` container:
```cpp
multi::array<double, 2> A = {{-3., 2., -4.},{0., 1., 2.},{2., 4., 5.}};
multi::array<double, 1> y = {12.,5.,2.}; //(M); assert(y.size() == M); iota(y.begin(), y.end(), 3.1);
gj_solve(A, y);
@ -295,7 +343,7 @@ gj_solve(A, y);
and also to a combination of `MultiArrayView`-type objects:
```cpp
multi::array<double, 2> A({6000, 7000}); std::iota(A.data(), A.data() + A.num_elements(), 0.1);
std::vector<double> y(3000); std::iota(y.begin(), y.end(), 0.2);
gj_solve(A({1000, 4000}, {0, 3000}), y);
@ -305,7 +353,7 @@ gj_solve(A({1000, 4000}, {0, 3000}), y);
Given an array, a slice in the first dimension can be taken with the `sliced` function. `sliced` takes two arguments, the first index of the slice and the last index (not included) of the slice. For example,
```cpp
multi::array<double, 2> d2D({4, 5});
assert( d2D.size(0) == 4 and d2D.size(1) == 5 );
@ -317,7 +365,7 @@ The number of rows in the sliced matrix is 2 because we took only two rows, row
In the same way a strided view of the original array can be taken with the `strided` function.
```cpp
auto&& d2D_strided = d2D.strided(2); // {{ d2D[0], d2D[1] }};
assert( d2D_strided.size(0) == 2 and d2D_strided.size(1) == 5 );
```
@ -326,7 +374,7 @@ In this case the number of rows is 2 because, out of the 4 original rows we took
Operations can be combined in a single line:
```cpp
auto&& d2D_slicedstrided = d2D.sliced(1, 3).strided(2); // {{ d2D[1] }};
assert( d2D_slicedstrided.size(0) == 1 and d2D_slicedstrided.size(1) == 5 );
```
@ -336,7 +384,7 @@ For convenience, `A.sliced(a, b, c)` is the same as `A.sliced(a, b).strided(c)`.
By combining `rotated`, `sliced` and `strided` one can take sub arrays at any dimension.
For example in a two dimensional array one can take a subset of columns by defining.
```cpp
auto&& subA = A.rotated(1).strided(1, 3).sliced(2).rotated(-1);
```
@ -345,7 +393,7 @@ Other notations are available, but when in doubt the `rotated/strided/sliced/rot
Blocks (slices) in multiple dimensions can be obtained by pure index notation using `.operator()`:
```cpp
multi::array<double, 2> A({6, 7}); // 6x7 array
A({1, 4}, {2, 4}) // 3x2 view, containing indices 1, 2, 3 in the first dimension and 2, 3 in the second dimension.
```
@ -355,7 +403,7 @@ A({1, 4}, {2, 4}) // 3x2 array, containing indices 1 to 4 in the first dimension
The design tries to impose the minimum possible requirements over the used referred types.
Pointer-like random access types can be used as substitutes of built-in pointers.
```cpp
namespace minimal{
template<class T> class ptr{ // minimalistic pointer
T* impl_;
@ -381,7 +429,7 @@ int main(){
An `array_ref` can reference to an arbitrary random access iterator sequence.
This way, any linear (random access) sequence (e.g. `raw memory`, `std::vector`, `std::queue`) can be efficiently arranged as a multidimensional array.
```cpp
std::vector<double> buffer(100);
multi::array_ref<double, 2, std::vector<double>::iterator> A({10, 10}, buffer.begin());
A[1][1] = 9;
@ -398,7 +446,7 @@ Associated fancy pointers and fancy reference (if any) are deduced from the allo
The behavior regarding memory managament of the [fancy pointers](https://en.cppreference.com/w/cpp/named_req/Allocator#Fancy_pointers) can be customized (if necessary) by specializations of some or all of these functions:
```cpp
destroy(a, first, last)
destroy_n(a, first, n) -> last
uninitialized_copy_n(a, first, n, dest) -> last;
@ -411,7 +459,7 @@ where `a` is the special allocator, `n` is a size (usually the number of element
Copying underlying memory can be customized by specializing
```cpp
copy_n(first, n, dest)
fill_n(first, n, value)
```
@ -420,7 +468,7 @@ Specific cases of fancy memory are file-mapped memory or interprocess shared mem
This example illustrates memory persistency by combining with Boost.Interprocess library.
The arrays support their allocators and fancy pointers (`boost::interprocess::offset_ptr`).
```cpp
#include <boost/interprocess/managed_mapped_file.hpp>
using namespace boost::interprocess;
using manager = managed_mapped_file;
@ -446,13 +494,18 @@ int main(){
}
```
# Interoperability with other software
## STL (Standard Template Library)
The fundamental goal of the library is that its arrays and iterators can be used with STL algorithms out-of-the-box with reasonable efficiency.
The most dramatic example of this is that `std::sort` works with these arrays, as shown in a previous example.

Along with the STL itself, the library tries to interact with other existing C++ libraries.
## Range v3
```cpp
#include <range/v3/all.hpp>
int main(){
@ -474,7 +527,7 @@ int main(){
Using Interprocess allows for shared memory and for persistent mapped memory.
```cpp
#include <boost/interprocess/managed_mapped_file.hpp>
#include "multi/array.hpp"
#include<cassert>
@ -507,7 +560,7 @@ int main(){
## Cuda thrust
```cpp
#include "multi/adaptors/thrust/allocator_traits.hpp"
#include "multi/adaptors/thrust/algorithms.hpp"
#include "multi/array.hpp"
@ -522,6 +575,36 @@ int main(){
}
```
## TotalView
The TotalView visual debugger (commercial) can display arrays in human-readable form (for simple types, like `double` or `std::complex`).
To use it, simply `#include "multi/adaptors/totalview.hpp"`, link to the TotalView libraries, then compile and run the code under the debugger.
## Memory Resources
The library is compatible with C++17's polymorphic memory resources (PMR), which allow the use of preallocated buffers.
This enables the use of stack memory and can reduce the number of dynamic allocations.
For example, this code ends up with `buffer` containing the string `"aaaabbbbbb__"`.
```cpp
#include "multi/array.hpp"

#include<iterator>         // std::data, std::size
#include<memory_resource>  // std::pmr::monotonic_buffer_resource, polymorphic_allocator
int main(){
char buffer[13] = "____________"; // a small buffer on the stack
std::pmr::monotonic_buffer_resource pool{std::data(buffer), std::size(buffer)}; // or multi::memory::monotonic<char*>
multi::array<char, 2, std::pmr::polymorphic_allocator<char>> A({2, 2}, 'a', &pool); // or multi::memory::monotonic_allocator<double>
multi::array<char, 2, std::pmr::polymorphic_allocator<char>> B({3, 2}, 'b', &pool);
}
```
The library comes with its own customized (non-polymorphic) memory resources if, for any reason, the standard PMRs are not sufficiently general.
The headers to include are:
```cpp
#include<multi/memory/monotonic.hpp> // multi::memory::monotonic<char*> : no memory reclaim
#include<multi/memory/stack.hpp> // multi::memory::stack<char*> : FIFO memory reclaim
```
# Technical points
### What's up with the multiple bracket notation?
@ -533,23 +616,23 @@ Moreover, this goes against [historical recommendations](https://isocpp.org/wiki
It turns out that [modern compilers with a fair level of optimization (`-O2`)](https://godbolt.org/z/3fYd5c) can elide these temporary objects, so that `A[i][j][k]` generates identical assembly code as `A.base() + i*stride1 + j*stride2 + k*stride3` (+offsets not shown).
In a subsequent optimization, constant indices can have their "partial stride" computation removed from loops.
As a result, these two loops lead to the [same machine code](https://godbolt.org/z/z1se74):
```cpp
for(int j = 0; j != nj; ++j)
++A[i][j][k];
```
```cpp
double* Ai_k = A.base() + i*A_stride1 + k*A_stride3;
for(int j = 0; j != nj; ++j)
++(*(Ai_k + j*A_stride2));
```
Incidentally, the library also supports parenthesis notation with multiple indices `A(i, j, k)` for element or partial access, but it does so for accidental reasons as part of a more general syntax to generate sub-blocks.
In any case `A(i, j, k)` is expanded to `A[i][j][k]` internally in the library when `i, j, k` are normal integer indices.
Additionally, array coordinates can be directly stored in tuple-like data structures, allowing this functional syntax:
```cpp
std::array p = {2,3,4};
std::apply(A, p) = 234; // A[2][3][4] = 234;
```
@ -567,7 +650,7 @@ The base pointer, the strides and the size of the array can be accessed by `base
The template arguments of the iterator can be used to customize operations that are recursive (and possibly inefficient in certain contexts) in the library:
```cpp
namespace boost{namespace multi{
template<class It, class T> // custom copy 1D (aka strided copy)
void copy(It first, It last, multi::array_iterator<T, 1, fancy::ptr<T> > dest){

View File

@ -1,7 +1,8 @@
#ifdef COMPILATION// -*-indent-tabs-mode:t;c-basic-offset:4;tab-width:4;-*-
$CXXX $CXXFLAGS $0 -o $0x `pkg-config --libs blas` -lboost_unit_test_framework&&$0x&&rm $0x;exit
#endif
// © Alfredo A. Correa 2018-2020
#ifndef MULTI_ADAPTORS_BLAS_HPP
#define MULTI_ADAPTORS_BLAS_HPP
@ -20,7 +21,7 @@
#include "../adaptors/blas/scal.hpp"
#include "../adaptors/blas/swap.hpp"
#if not __INCLUDE_LEVEL__
#define BOOST_TEST_MODULE "C++ Unit Tests for Multi BLAS"
#define BOOST_TEST_DYN_LINK
@ -35,11 +36,9 @@
#include<algorithm> // transform
namespace multi = boost::multi;
BOOST_AUTO_TEST_CASE(multi_blas_herk_complex){
using complex = std::complex<double>; complex const I{0, 1};
using multi::blas::herk;
{
multi::array<complex, 2> const A = {
@ -54,10 +53,9 @@ BOOST_AUTO_TEST_CASE(multi_blas_herk_complex){
}
}
BOOST_AUTO_TEST_CASE(multi_blas_asum_complex){
using complex = std::complex<double>;
multi::array<complex, 1> arr(1000, 0.);
// std::iota(begin(arr), end(arr), -700.);
// std::transform(cbegin(arr), cend(arr), begin(arr), [](auto&& a){return sqrt(a);});
{
@ -68,7 +66,7 @@ BOOST_AUTO_TEST_CASE(multi_blas_asum_complex){
}
BOOST_AUTO_TEST_CASE(multi_blas_nrm2_complex){
multi::array<complex, 1> arr(1000, 0.);
// std::iota(begin(arr), end(arr), -700.);
// std::transform(cbegin(arr), cend(arr), begin(arr), [](auto&& a){return sqrt(a);});
{
@ -77,333 +75,6 @@ BOOST_AUTO_TEST_CASE(multi_blas_nrm2_complex){
}
}
#if 0
multi::array<double, 2> const CA = {
{1., 2., 3., 4.},
{5., 6., 7., 8.},
{9., 10., 11., 12.}
};
{
double const a0 = 2./3.;
double const b0 = 4./5.;
double a = a0, b = b0;
double c, s;
using multi::blas::rotg;
rotg(a, b, c, s);
using std::abs; using std::sqrt;
assert( abs(c - 5./sqrt(61.)) < 1e-15 );
assert( abs(s - 6./sqrt(61.)) < 1e-15 );
assert( abs(a - (b0>0?1:-1)*sqrt(a0*a0 + b0*b0)) < 1e-15 );
assert( abs( c*c + s*s - 1 ) < 1e-15 );
assert( abs( c*a0 + s*b0 - a ) < 1e-15 );
assert( abs( -s*a0 + c*b0 ) < 1e-15 );
}
{
using multi::blas::rotmg;
double const x0 = 2./3.;
double const y0 = 4./5.;
double const D1 = 1.;
double const D2 = 1.;
{
double d1 = D1;
double d2 = D2;
double x1 = x0;
double const y1 = y0;
auto m = rotmg(d1, d2, x1, y1);
assert( std::abs(x1 -( m.h()[0][0]*x0*std::sqrt(D1) + m.h()[0][1]*y0*std::sqrt(D2) )) < 1e-15 );
assert( std::abs( m.h()[1][0]*x0*std::sqrt(D1) + m.h()[1][1]*y0*std::sqrt(D2) ) < 1e-15 );
}
{
double x1 = x0;
double const y1 = y0;
double d1 = D1;
double d2 = D2;
multi::array<double, 1> X0 = {x0*std::sqrt(D1)};
multi::array<double, 1> Y0 = {y0*std::sqrt(D2)};
multi::array<double, 1> X1 = X0;
multi::array<double, 1> Y1 = Y0;
rotm(X1, Y1, rotmg(d1, d2, x1, y1));
assert( std::abs( X1[0] - x1 ) <1e-15 );
assert( Y1[0] == 0. );
}
}
{
multi::array<double, 1> X = CA[0];
multi::array<double, 1> Y = CA[2];
using multi::blas::rot;
using std::cos; using std::sin;
rot(X, Y, cos( 1.2), sin( 1.2));
assert(X[1] == CA[0][1]*cos(1.2) + CA[2][1]*sin(1.2));
assert(Y[1] == CA[2][1]*cos(1.2) - CA[0][1]*sin(1.2));
}
{
multi::array<double, 1> const a0 = {2./3.};
multi::array<double, 1> const b0 = {4./5.};
using multi::blas::rotg;
{
double a = a0[0], b = b0[0];
auto cs = rotg(a, b);
multi::array<double, 1> a1 = a0;
multi::array<double, 1> b1 = b0;
rot(a1, b1, cs);
assert( std::abs(a1[0] - a) < 1e-15 );
assert( std::abs(b1[0] ) < 1e-15 );
}
{
double a = a0[0], b = b0[0];
multi::array<double, 1> a1 = a0;
multi::array<double, 1> b1 = b0;
rot(a1, b1, rotg(a, b));
assert( std::abs(a1[0] - a) < 1e-15 );
assert( std::abs(b1[0] ) < 1e-15 );
}
}
{
using multi::blas::dot;
auto d = dot(CA[1], CA[2]);
assert(d == std::inner_product(begin(CA[1]), begin(CA[2]), end(CA[1]), 0.));
}
using dcomplex = std::complex<double>;
{
multi::array<dcomplex, 2> A = CA;
A[1][1] += dcomplex{1.1, 2.1};
A[2][1] -= dcomplex{1.1, 2.1};
using multi::blas::dotu;
using multi::blas::dotc;
using multi::blas::nrm2;
using multi::blas::asum;
assert(dotu(A[1], A[2]) == std::inner_product(begin(A[1]), begin(A[2]), end(A[1]), dcomplex{}, std::plus<>{}, [](auto&& a, auto&& b){return a*b;}));
assert(dotc(A[1], A[2]) == std::inner_product(begin(A[1]), begin(A[2]), end(A[1]), dcomplex{}, std::plus<>{}, [](auto&& a, auto&& b){return conj(a)*b;}));
assert(nrm2(A[1]) == std::sqrt(dotc(A[1], A[1])));
assert(dotu(A[1], A[2]) == std::inner_product(begin(A[1]), begin(A[2]), end(A[1]), dcomplex{}, std::plus<>{}, [](auto&& a, auto&& b){return a*b;}));
assert(asum(A[1]) == std::accumulate(begin(A[1]), end(A[1]), 0., [](auto&& a, auto&& b){return a + std::abs(real(b)) + std::abs(imag(b));}));
}
{
auto const& A = CA.rotated(1)[1]; (void)A;
using multi::blas::iamax;
assert(iamax(A) == std::distance(begin(A), std::max_element(begin(A), end(A), [](auto&& a, auto&& b){
return std::abs(a) < std::abs(b);
})));
}
///////////////////////////////////////////////////////////////////////////////
{
multi::array<double, 2> const M = {
{ 9., 24., 30., 9.},
{ 4., 10., 12., 7.},
{14., 16., 36., 1.}
};
assert( M[2][0] == 14. );
multi::array<double, 1> const X = {1.1,2.1,3.1, 4.1};
multi::array<double, 1> Y = {4.,5.,6.};
multi::array<double, 1> Y2 = Y;
multi::array<double, 1> Y3 = {214.02, 106.43, 188.37};
double a = 1.1, b = 1.2;
multi::blas::gemv(a, M, X, b, Y);
multi::blas::gemv<double, double>(a, M, X, b, Y2);
assert( Y == Y2 );
assert( std::abs(Y[1] - Y3[1]) < 1e-14 );
}
{
multi::array<double, 2> const M = {
{ 9., 24., 30., 9.},
{ 4., 10., 12., 7.},
{14., 16., 36., 1.}
};
assert( M[2][0] == 14. );
multi::array<double, 1> const X = {1.1,2.1,3.1};
multi::array<double, 1> Y = {4.,5.,6., 7.};
multi::array<double, 1> Y2 = Y;
multi::array<double, 1> Y3 = {72.67, 112.7, 193.98, 38.87};
double a = 1.1, b = 1.2;
multi::blas::gemv(a, M.rotated(1), X, b, Y);
multi::blas::gemv<double, double>(a, M.rotated(1), X, b, Y2);
assert( std::abs(Y[1] - Y2[1]) < 1e-13 );
assert( std::abs(Y[1] - Y3[1]) < 1e-13 );
}
auto const I = dcomplex{0.,1.};
{
multi::array<dcomplex, 2> const M = {
{ 9. + 1.*I, 24. + 2.*I, 30. + 3.*I, 9. + 1.*I},
{ 4. + 1.*I, 10. + 1.*I, 12. - 2.*I, 7. + 2.*I},
{14. + 3.*I, 16. - 4.*I, 36. + 1.*I, 1. - 2.*I}
};
multi::array<dcomplex, 1> const X = {1.1+I*2., 2.1+I*1.1, 3.1+I*8. , 4.1+I*1.2};
multi::array<dcomplex, 1> Y = {4.+I*3.1,5.-I*9.,6.+I*1.};
multi::array<dcomplex, 1> Y2 = Y;
multi::array<dcomplex, 1> const Y3 = {-486.81+698.69*I, -125.08+359.44*I, -504.21+707.01*I};
dcomplex a = 1.1+I*2.1, b = 1.2+I*3.;
cout<<">>"<<__LINE__ <<std::endl;
multi::blas::gemv(a, M, X, b, Y);
cout<<">>"<<__LINE__ <<std::endl;
multi::blas::gemv<dcomplex, dcomplex>(a, M, X, b, Y2);
using std::abs;
assert( abs(Y[0] - Y3[0]) < 1e-12 && abs(Y[1] - Y3[1]) < 1e-12 && abs(Y[2] - Y3[2]) < 1e-12 );
}
{
multi::array<dcomplex, 2> const M = {
{9. + 1.*I, 4. + 1.*I, 14. + 3.*I},
{24. + 2.*I, 10. + 1.*I, 16. - 4.*I},
{30. + 3.*I, 12. - 2.*I, 36. + 1.*I},
{9. + 1.*I, 7. + 2.*I, 1. - 2.*I}
};
multi::array<dcomplex, 1> const X = {1.1+I*2., 2.1+I*1.1, 3.1+I*8. , 4.1+I*1.2};
multi::array<dcomplex, 1> Y = {4.+I*3.1,5.-I*9.,6.+I*1.};
multi::array<dcomplex, 1> Y2 = Y;
multi::array<dcomplex, 1> const Y3 = {-486.81+698.69*I, -125.08+359.44*I, -504.21+707.01*I};
std::complex<double> a = 1.1+I*2.1, b = 1.2+I*3.;
cout<<">>"<<__LINE__ <<std::endl;
multi::blas::gemv(a, M.rotated(), X, b, Y);
cout<<">>"<<__LINE__ <<std::endl;
multi::blas::gemv<dcomplex, dcomplex>(a, M.rotated(), X, b, Y2);
assert( abs(Y[0] - Y3[0]) < 1e-12 && abs(Y[1] - Y3[1]) < 1e-12 && abs(Y[2] - Y3[2]) < 1e-12 );
assert( abs(Y[0] - Y2[0]) < 1e-12 && abs(Y[1] - Y2[1]) < 1e-12 && abs(Y[2] - Y2[2]) < 1e-12 );
}
#if 0
{
multi::array<dcomplex, 2> const M = {
{9. + 1.*I, 4. + 1.*I, 14. + 3.*I},
{24. + 2.*I, 10. + 1.*I, 16. - 4.*I},
{30. + 3.*I, 12. - 2.*I, 36. + 1.*I},
{9. + 1.*I, 7. + 2.*I, 1. - 2.*I}
};
multi::array<dcomplex, 1> const X = {1.1+I*2., 2.1+I*1.1, 3.1+I*8. , 4.1+I*1.2};
multi::array<dcomplex, 1> Y = {4.+I*3.1,5.-I*9.,6.+I*1.};
// multi::array<dcomplex, 1> Y2 = Y;
// multi::array<dcomplex, 1> const Y3 = {-486.81+698.69*I, -125.08+359.44*I, -504.21+707.01*I};
std::complex<double> a = 1.1+I*2.1, b = 1.2+I*3.;
cout<<">>"<<__LINE__ <<std::endl;
multi::blas::gemv(a, M.rotated(), X, b, Y, multi::blas::conj{});
// cout<<">>"<<__LINE__ <<std::endl;
// multi::blas::gemv<dcomplex, dcomplex>(a, M.rotated(), X, b, Y2);
cout<< Y[0] <<' '<< Y[1] <<' '<< Y[2] <<std::endl;
// assert( abs(Y[0] - Y3[0]) < 1e-12 && abs(Y[1] - Y3[1]) < 1e-12 && abs(Y[2] - Y3[2]) < 1e-12 );
// assert( abs(Y[0] - Y2[0]) < 1e-12 && abs(Y[1] - Y2[1]) < 1e-12 && abs(Y[2] - Y2[2]) < 1e-12 );
}
#endif
return 0;
// return 0;
{
// multi::array<dcomplex, 2> const M = {
// { 9.+I*1., 24.+I*2., 30.+I*3.},
// { 4.+I*1., 10.+I*1., 12.-I*2.},
// {14.+I*3., 16.-I*4., 36.+I*1.},
// { 9.+I*1., 7.+I*2., 1.-I*2.}
// };
multi::array<dcomplex, 2> const M = {
{ 9. + 1.*I, 4. + 1.*I, 14. + 3.*I, 9. + 1.*I},
{24. + 2.*I, 10. + 1.*I, 16. - 4.*I, 7. + 2.*I},
{30. + 3.*I, 12. - 2.*I, 36. + 1.*I, 1. - 2.*I}
};
multi::array<dcomplex, 1> const X = {1.1+I*2., 2.1+I*1.1, 3.1+I*8., 4.1+I*1.2};
multi::array<dcomplex, 1> Y = {4.+I*3.1,5.-I*9.,6.+I*1.};
multi::array<dcomplex, 1> Y2 = Y;
multi::array<dcomplex, 1> const Y3 = {-134.97+423.67*I, -265.81+431.55*I, -567.81+809.37*I};
dcomplex const a = 1.1+I*2.1, b = 1.2+I*3.;
cout<< "708" <<std::endl;
// multi::blas::gemv(a, M.rotated(), X, b, Y, multi::blas::conj<>{});
zgemv_('N', std::get<0>(M.shape()), std::get<1>(M.shape()), a, M.base(), M.stride(), X.base(), X.stride(), b, Y.base(), Y.stride());
// zgemv_('T', std::get<1>(M.shape()), std::get<0>(M.shape()), a, M.base(), 2*std::get<0>(M.strides()), X.base(), stride(X), b, Y.base(), stride(Y));
multi::blas::gemv<std::complex<double>, std::complex<double>>(a, M, X, b, Y2);
// multi::blas::gemv<std::complex<double>, std::complex<double>>(a, M.rotated(1), X, b, Y2);
// multi::blas::gemv<dcomplex, dcomplex>(a, M.rotated(), X, b, Y2, multi::blas::conj<>{});
cout << Y[0] <<' '<< Y[1] <<' '<< Y[2] <<std::endl;
cout << Y2[0] <<' '<< Y2[1] <<' '<< Y2[2] <<std::endl;
cout << "finished" << std::endl;
// assert( std::abs(Y[1] - Y3[1]) < 1e-12 );
// assert( Y[1] == Y2[1] );
}
// assert(0);
#endif
#if 0
namespace boost{
namespace multi{
namespace blas{
template<class T> struct cs{
T c; T s;
operator multi::array<T, 2>() const{return {{c, s}, {-s, c}};}
};
template<class T> struct ab{T a; T b; using value_type = T;};
template<class T> struct modified_rotation{
T data_[5];
int flag() const{return data_[0];}
multi::array<T, 2> h() const{
switch(flag()){
case -1: return {{data_[1], data_[2]}, {data_[3], data_[4]}};
case 0: return {{T{+1} , data_[2]}, {data_[3], T{+1} }};
case 1: return {{data_[1], T{+1} }, {T{-1} , data_[4]}};
case -2: return {{T{+1} , T{ 0} }, {T{ 0} , T{+1} }};
default: assert(0); return {};
}
}
};
template<class T>
auto rotg(T& a, T& b){
cs<T> ret;
// using blas::rotg;
rotg(a, b, ret.c, ret.s );
return ret;
}
template<class T>
modified_rotation<T> rotmg(T& d1, T& d2, T& x1, T const& y1){
modified_rotation<T> ret;
rotmg(d1, d2, x1, y1, ret.data_);
return ret;
}
//template<class T>
//auto rotmg(T& d1, T& d2, T& b1, T const& b2){
// modified_rotation<T> ret;
// rotmg(d1, d2, b1, b2, ret);
// return ret;
//}
template<class X1D, class Y1D, class T>
auto rot(X1D&& x, Y1D&& y, T const& c, T const& s){
assert( size(x) == size(y) );
assert( offset(x) == 0 and offset(y) == 0 );
// using blas::rot;
rot(size(x), origin(x), stride(x), origin(y), stride(y), c, s);
return std::tie(x, y);
}
template<class X1D, class Y1D, class CS>
auto rot(X1D&& x, Y1D&& y, CS const& cs){
return rot(std::forward<X1D>(x), std::forward<Y1D>(y), cs.c, cs.s);
}
template<class X1D, class Y1D, class M>
auto rotm(X1D&& x, Y1D&& y, M const& param){
using boost::multi::size;
assert( size(x) == size(y) );
assert( offset(x) == 0 and offset(y) == 0);
rotm(size(x), origin(x), stride(x), origin(y), stride(y), param.data_);
}
template<class It>
auto iamax(It first, It last){
assert( stride(first) == stride(last) );
// using blas::iamax;
return iamax(std::distance(first, last), base(first), stride(first))
#ifndef CBLAS_H
- 1
#endif
;
}
template<class X1D>
auto iamax(X1D const& x){
assert( not offset(x) );
return iamax(begin(x), end(x));
}
}}
#endif
#endif
#endif

View File

@ -0,0 +1,32 @@
cmake_minimum_required(VERSION 3.11)
set(CMAKE_VERBOSE_MAKEFILE ON)
project(boost-multi-adaptors-blas VERSION 0.1 LANGUAGES CXX)
set(BLA_VENDOR Intel10_64lp)
find_package(BLAS)
if(BLAS_FOUND) # in some systems with MKL, regular BLAS headers need to be found for it to work
message("Multi/BLAS: MKL environment detected")
add_definitions(-DRETURN_BY_STACK)
else()
message("Multi/BLAS: MKL environment not detected, looking for other BLAS")
unset(BLA_VENDOR)
find_package(BLAS REQUIRED)
endif()
#find_path(BLAS_INCLUDE_DIRS cblas.h
# /usr/include
# /usr/local/include
# $ENV{BLAS_HOME}/include)
link_libraries(${BLAS_LIBRARIES})
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)
include_directories(${CMAKE_BINARY_DIR})
add_subdirectory(test)

View File

@ -0,0 +1,71 @@
<!--
(pandoc `#--from gfm` --to html --standalone --metadata title=" " $0 > $0.html) && firefox --new-window $0.html; sleep 5; rm $0.html; exit
-->
# [Boost.]Multi BLAS Adaptor
(not an official Boost library)
_© Alfredo A. Correa, 2018-2021_
The BLAS Adaptor provides an interface for BLAS-like libraries.
## Contents
[[_TOC_]]
## Numeric Arrays: Conjugation, Real and Imaginary Parts
These functions produce views (not copies) related to conjugation and to the real and imaginary parts.
```cpp
using complex = std::complex<double>;
complex const I{0, 1};
multi::array<complex, 2> B = {
{1. - 3.*I, 6. + 2.*I},
{8. + 2.*I, 2. + 4.*I},
{2. - 1.*I, 1. + 1.*I}
};
namespace blas = multi::blas;
multi::array<complex, 2> conjB = blas::conj(B);
assert( blas::conj(B)[2][1] == std::conj(B[2][1]) );
assert( blas::transposed(B)[1][2] == B[2][1] );
assert( blas::transposed(B) == ~B );
assert( blas::hermitized(B)[2][1] == blas::conj(B)[1][2] );
assert( blas::hermitized(B) == blas::conj(blas::transposed(B)) );
assert( blas::real(B)[2][1] == std::real(B[2][1]) );
assert( blas::imag(B)[2][1] == std::imag(B[2][1]) );
multi::array<double, 2> B_real_doubled = {
{ 1., -3., 6., 2.},
{ 8., 2., 2., 4.},
{ 2., -1., 1., 1.}
};
assert( blas::real_doubled(B) == B_real_doubled );
```
Usage:
```cpp
multi::array<double, 2> const a_real = {
{ 1., 3., 1.},
{ 9., 7., 1.},
};
multi::array<complex, 2> const b = {
{ 11.+1.*I, 12.+1.*I, 4.+1.*I, 8.-2.*I},
{ 7.+8.*I, 19.-2.*I, 2.+1.*I, 7.+1.*I},
{ 5.+1.*I, 3.-1.*I, 3.+8.*I, 1.+1.*I}
};
multi::array<complex, 2> c({2, 4});
blas::real_doubled(c) = blas::gemm(1., a_real, blas::real_doubled(b)); // c = a_real*b
```
## Installation and Tests
...

View File

@ -1,7 +1,8 @@
#ifdef COMPILATION// -*-indent-tabs-mode:t;c-basic-offset:4;tab-width:4;-*-
$CXXX $CXXFLAGS $0 -o $0x `pkg-config --cflags --libs blas` -lboost_unit_test_framework&&$0x&&rm $0x;exit
#endif
// © Alfredo A. Correa 2019-2020
// TODO make it work with thrust complex
#ifndef MULTI_ADAPTORS_BLAS_ASUM_HPP
#define MULTI_ADAPTORS_BLAS_ASUM_HPP
@ -38,7 +39,7 @@ auto asum(X1D const& x)
#define BOOST_TEST_MODULE "C++ Unit Tests for Multi.BLAS asum"
#define BOOST_TEST_DYN_LINK
#include<boost/test/unit_test.hpp>
//#include<boost/test/tools/floating_point_comparison.hpp>
#include "../../array.hpp"
//#include "../../utility.hpp"
@ -57,17 +58,16 @@ BOOST_AUTO_TEST_CASE(multi_blas_asum_double){
BOOST_REQUIRE(asum(A[1]) == std::accumulate(begin(A[1]), end(A[1]), 0., [](auto&& a, auto&& b){return a+std::abs(b);}));
}
BOOST_AUTO_TEST_CASE(multi_blas_asum_complex){
using complex = std::complex<double>; complex const I{0, 1};
multi::array<complex, 2> const A = {
{ 1. + 1.*I, 2., 3., 4.},
{-5. + 3.*I, 6., -7., 8.},
{ 9. - 2.*I, 10., 11., 12.}
};
BOOST_REQUIRE(asum(rotated(A)[0]) == 1.+1. + 5.+3. + 9.+2.);
}
BOOST_AUTO_TEST_CASE(multi_blas_asum_double_carray){

View File

@ -1,86 +1,91 @@
#ifdef COMPILATION// -*-indent-tabs-mode:t;c-basic-offset:4;tab-width:4;-*-
$CXX $0 -o $0x `pkg-config --libs blas` -lboost_unit_test_framework&&$0x&&rm $0x;exit
#endif
// -*-indent-tabs-mode:t;c-basic-offset:4;tab-width:4;autowrap:nil;-*-
// © Alfredo Correa 2019-2020
#ifndef MULTI_ADAPTORS_BLAS_AXPY_HPP
#define MULTI_ADAPTORS_BLAS_AXPY_HPP
#include "../blas/core.hpp"
#include "../../adaptors/blas/core.hpp"
#include "../../config/NODISCARD.hpp"
#include "../../array_ref.hpp"
namespace boost{
namespace multi{namespace blas{
template<class T, class It1, class Size, class OutIt>
OutIt axpy_n(T alpha, It1 first, Size n, OutIt d_first){
axpy(n, alpha, base(first), stride(first), base(d_first), stride(d_first));
return d_first + n;
using core::axpy;
template<class It1, class Size, class OutIt>
auto axpy_n(typename It1::value_type alpha, It1 first, Size n, OutIt d_first)
->decltype(axpy(n, &alpha, first.base(), first.stride(), d_first.base(), d_first.stride()), d_first + n){
return axpy(n, &alpha, base(first) , stride(first) , base(d_first) , stride(d_first) ) , d_first + n;}
template<class Context, class It1, class Size, class OutIt, class=std::enable_if_t<is_context<Context>{}>>
auto axpy_n(Context&& ctxt, typename It1::value_type alpha, It1 first, Size n, OutIt d_first)
->decltype(std::forward<Context>(ctxt).axpy(n, &alpha, first.base(), first.stride(), d_first.base(), d_first.stride()), d_first + n){
return std::forward<Context>(ctxt).axpy(n, &alpha, base(first) , stride(first) , base(d_first) , stride(d_first)) , d_first + n;}
template<class X1D, class Y1D, typename = decltype( std::declval<Y1D&&>()[0] = 0. )>
auto axpy(typename X1D::element alpha, X1D const& x, Y1D&& y)
->decltype(axpy_n(alpha, x.begin(), x.size(), y.begin()), std::forward<Y1D>(y)){assert(size(x)==size(y)); // intel doesn't like ADL in deduced/sfinaed return types
return axpy_n(alpha, begin(x), size(x), begin(y)), std::forward<Y1D>(y);
}
template<class T, class It1, class OutIt>
OutIt axpy(T alpha, It1 first, It1 last, OutIt d_first){
assert( stride(first) == stride(last) );
return axpy_n(alpha, first, std::distance(first, last), d_first);
template<class Context, class X1D, class Y1D, typename = decltype( std::declval<Y1D&&>()[0] = 0. )>
auto axpy(Context&& ctxt, typename X1D::element alpha, X1D const& x, Y1D&& y)
->decltype(axpy_n(std::forward<Context>(ctxt), alpha, x.begin( ), x.size( ), y.begin( )), std::forward<Y1D>(y)){assert(size(x)==size(y)); // intel doesn't like ADL in deduced/sfinaed return types
return axpy_n(std::forward<Context>(ctxt), alpha, begin(x), size(x), begin(y)), std::forward<Y1D>(y);
}
template<class T, class X1D, class Y1D>
Y1D&& axpy(T alpha, X1D const& x, Y1D&& y){
assert( size(x) == size(y) );
assert( not offset(x) and not offset(y) );
auto e = axpy(alpha, begin(x), end(x), begin(y));
assert( e == end(y));
return std::forward<Y1D>(y);
template<class X1D, class Y1D>
Y1D&& axpy(X1D const& x, Y1D&& y){return axpy(+1., x, std::forward<Y1D>(y));}
template<class Context, class X1D, class Y1D, std::enable_if_t<is_context<Context>{}> >
Y1D&& axpy(Context&& ctxt, X1D const& x, Y1D&& y){return axpy(std::forward<Context>(ctxt), +1., x, std::forward<Y1D>(y));}
template<class Context, class Scale, class ItX>
class axpy_range{
Context ctxt_;
Scale alpha_;
ItX x_begin_;
size_type count_;
public:
axpy_range(axpy_range const&) = delete;
axpy_range(Context ctxt, Scale alpha, ItX x_first, ItX x_last)
: ctxt_{ctxt}, alpha_{alpha}, x_begin_{x_first}, count_{x_last - x_first}{}
template<class Other>
friend Other&& operator+=(Other&& other, axpy_range const& self){
assert(other.size() == self.count_);
blas::axpy_n(std::forward<Context>(self.ctxt_), +self.alpha_, self.x_begin_, self.count_, other.begin());
return std::forward<Other>(other);
}
template<class Other>
friend Other&& operator-=(Other&& other, axpy_range const& self){
assert(other.size() == self.count_);
blas::axpy_n(std::forward<Context>(self.ctxt_), -self.alpha_, self.x_begin_, self.count_, other.begin());
return std::forward<Other>(other);
}
axpy_range& operator*=(Scale s)&{alpha_ *= s;}
};
template<class Context, class Scale, class X, class=std::enable_if_t<is_context<Context>{}>>
axpy_range<Context, Scale, typename X::const_iterator> axpy(Context&& ctxt, Scale a, X const& x){
return {std::forward<Context>(ctxt), a, begin(x), end(x)};}
template<class Scale, class X>
axpy_range<blas::context const&, Scale, typename X::const_iterator> axpy(Scale a, X const& x){return {blas::context{}, a, begin(x), end(x)};}
namespace operators{
template<class X1D, class Y1D> auto operator+=(X1D&& x, Y1D const& other) DECLRETURN(axpy(+1., other, std::forward<X1D>(x)))
template<class X1D, class Y1D> auto operator-=(X1D&& x, Y1D const& other) DECLRETURN(axpy(-1., other, std::forward<X1D>(x)))
template<class X1D, class Y1D> auto operator+(X1D const& x, Y1D const& y)->std::decay_t<decltype(x.decay())>{auto X=x.decay(); X+=y; return X;}
template<class X1D, class Y1D> auto operator-(X1D const& x, Y1D const& y)->std::decay_t<decltype(x.decay())>{auto X=x.decay(); X-=y; return X;}
}
template<class T, class X1D, class Y1D>
Y1D&& axpy(X1D const& x, Y1D&& y){return axpy(+1., x, y);}
}}
}
#if not __INCLUDE_LEVEL__ // _TEST_MULTI_ADAPTORS_BLAS_AXPY
#define BOOST_TEST_MODULE "C++ Unit Tests for Multi.BLAS axpy"
#define BOOST_TEST_DYN_LINK
#include<boost/test/unit_test.hpp>
//#include<boost/test/floating_point_comparison.hpp>
#include "../../array.hpp"
#include "../../utility.hpp"
#include<complex>
#include<cassert>
#include<iostream>
#include<numeric>
#include<algorithm>
#include "../blas/numeric.hpp"
using std::cout;
namespace multi = boost::multi;
namespace blas = multi::blas;
BOOST_AUTO_TEST_CASE(multi_blas_axpy_double){
multi::array<double, 2> const cA = {
{1., 2., 3., 4.},
{5., 6., 7., 8.},
{9., 10., 11., 12.}
};
multi::array<double, 2> A = cA;
multi::array<double, 1> const b = cA[2];
blas::axpy(2., b, A[1]); // y = a*x + y, y+= a*x
assert( A[1][2] == 2.*b[2] + cA[1][2] );
using complex = std::complex<double>;
complex const I = {0., 1.};
multi::array<complex, 1> AC = {1. + 2.*I, 3. + 4.*I, 4. - 8.*I};
multi::array<complex, 1> BC(size(AC), complex{0.});
blas::axpy(+1., begin(blas::real(AC)), end(blas::real(AC)), begin(blas::real(BC)));
blas::axpy(-1., begin(blas::imag(AC)), end(blas::imag(AC)), begin(blas::imag(BC)));
assert( BC[2] == std::conj(AC[2]) );
}
#endif
#endif

View File

@ -1,7 +1,5 @@
#ifdef COMPILATION// -*-indent-tabs-mode:t;c-basic-offset:4;tab-width:4;-*-
$CXX $0 -o $0x `pkg-config --libs blas` -lboost_unit_test_framework&&$0x&&rm $0x;exit
#endif
// © Alfredo A. Correa 2019-2020
// -*-indent-tabs-mode:t;c-basic-offset:4;tab-width:4;autowrap:nil;-*-
// © Alfredo A. Correa 2020
#ifndef MULTI_ADAPTORS_BLAS_COPY_HPP
#define MULTI_ADAPTORS_BLAS_COPY_HPP
@ -11,84 +9,118 @@ $CXX $0 -o $0x `pkg-config --libs blas` -lboost_unit_test_framework&&$0x&&rm $0x
#include "../../config/NODISCARD.hpp"
#include<type_traits>
namespace boost{
namespace multi{
namespace blas{
namespace multi::blas{
using core::copy;
template<class It, typename Size, class OutIt>
auto copy_n(It first, Size n, OutIt d_first)
->decltype(copy(n, base(first), stride(first), base(d_first), stride(d_first)), d_first + n){
return copy(n, base(first), stride(first), base(d_first), stride(d_first)), d_first + n;}
->decltype(copy(n, first.base(), first.stride(), d_first.base(), d_first.stride()), d_first + n){
return copy(n, first.base(), first.stride(), d_first.base(), d_first.stride()), d_first + n;}
template<class Context, class It, typename Size, class OutIt, class=std::enable_if_t<blas::is_context<Context>{}> >
auto copy_n(Context&& ctxt, It first, Size n, OutIt d_first)
->decltype(copy(std::forward<Context>(ctxt), n, first.base(), first.stride(), d_first.base(), d_first.stride()), d_first + n){
return copy(std::forward<Context>(ctxt), n, first.base(), first.stride(), d_first.base(), d_first.stride()), d_first + n;}
template<class It, class OutIt>
auto copy(It first, It last, OutIt d_first)
->decltype(copy_n(first, last - first, d_first)){
return copy_n(first, last - first, d_first);}
template<class Context, class It, class OutIt, class=std::enable_if_t<blas::is_context<Context>{}>>
auto copy(Context&& ctxt, It first, It last, OutIt d_first)
->decltype(copy_n(std::forward<Context>(ctxt), first, last - first, d_first)){
return copy_n(std::forward<Context>(ctxt), first, last - first, d_first);}
template<class X1D, class Y1D>
Y1D&& copy(X1D const& x, Y1D&& y){assert(size(x)==size(y)); assert(offset(x)==0 and offset(y)==0);
copy(size(x), base(x), stride(x), base(y), stride(y));
return std::forward<Y1D>(y);
auto copy(X1D const& x, Y1D&& y)
->decltype(blas::copy_n(x.begin(), x.size(), y.begin()), std::forward<Y1D>(y)){assert(x.size()==y.size());
return blas::copy_n(x.begin(), x.size(), y.begin()), std::forward<Y1D>(y);}
template<class Context, class X1D, class Y1D>
auto copy(Context&& ctxt, X1D const& x, Y1D&& y)
->decltype(blas::copy_n(std::forward<Context>(ctxt), x.begin(), x.size(), y.begin()), std::forward<Y1D>(y)){assert(x.size()==y.size());
return blas::copy_n(std::forward<Context>(ctxt), x.begin(), x.size(), y.begin()), std::forward<Y1D>(y);}
template<class ContextPtr, class It1D>
class copy_iterator{
ContextPtr ctxt = {};
It1D it_;
public:
using difference_type = typename std::iterator_traits<It1D>::difference_type;
using value_type = typename std::iterator_traits<It1D>::value_type;
using pointer = void;
using reference = void;
using iterator_category = std::output_iterator_tag;
using iterator_type = It1D;
using context_type = ContextPtr;
constexpr explicit copy_iterator(It1D it) : it_{it}{}
constexpr copy_iterator(ContextPtr cp, It1D it) : ctxt{cp}, it_{it}{}
constexpr iterator_type base() const{return it_;}
template<class It1DOut>
friend constexpr It1DOut copy_n(copy_iterator first, difference_type count, It1DOut result){
return blas::copy_n(first.ctxt, first.base(), count, result);
}
template<class It1DOut>
friend constexpr It1DOut copy(copy_iterator first, copy_iterator last, It1DOut d_first){
return copy_n(first, distance(first, last), d_first);
}
template<class It1DOut>
friend constexpr It1DOut uninitialized_copy(copy_iterator first, copy_iterator last, It1DOut d_first){
return copy_n(first, distance(first, last), d_first);
}
friend constexpr difference_type distance(copy_iterator const& a, copy_iterator const& b){assert(stride(b.it_) == stride(a.it_));
return b.it_-a.it_;
}
constexpr value_type operator*() const{return *it_;}
};
template<class ContextPtr, class It1D, class DecayType = void, class DiffType = typename std::iterator_traits<It1D>::difference_type>
class copy_range{
ContextPtr ctxp_ = {};
It1D begin_, end_;
public:
using difference_type = DiffType;
using iterator = copy_iterator<ContextPtr, It1D>;
using decay_type = DecayType;
copy_range(copy_range&&) = default;
constexpr copy_range(It1D first, It1D last) : begin_{first}, end_{last}{}
constexpr copy_range(ContextPtr ctxp, It1D first, It1D last) : ctxp_{ctxp}, begin_{first}, end_{last}{}
constexpr difference_type size() const{return end_ - begin_;}
constexpr auto begin() const{return iterator{ctxp_, begin_};}
constexpr auto end() const{return iterator{ctxp_, end_ };}
constexpr typename decay_type::extensions_type extensions() const{return {size()};}
template<class Other, class=decltype(Other(std::declval<iterator>(), std::declval<iterator>()))>
operator Other() const{return Other(begin(), end());}
friend auto operator+(copy_range const& s){return s.operator decay_type();}
};
template<class DecayType, class It> NODISCARD()
auto copy(It const& first, It const& last)
->decltype(copy_range<void*, It, DecayType>{first, last}){
return copy_range<void*, It, DecayType>{first, last};}
template<class DecayType, class Context, class It> NODISCARD()
auto copy(Context&& ctxt, It const& first, It const& last)
->decltype(copy_range<Context, It, DecayType>{ctxt, first, last}){
return copy_range<Context, It, DecayType>{ctxt, first, last};}
template<class A> NODISCARD()
auto copy(A const& a) // need to specify templates (instead of deduced for intel)
->decltype(copy<typename A::decay_type, typename A::const_iterator>(a.begin(), a.end())){
return copy<typename A::decay_type, typename A::const_iterator>(a.begin(), a.end());}
template<class Context, class A, class=std::enable_if_t<blas::is_context<Context>{}>> NODISCARD()
auto copy(Context&& ctxt, A const& a)
->decltype(copy<typename A::decay_type, Context, typename A::const_iterator>(std::forward<Context>(ctxt), a.begin(), a.end())){
return copy<typename A::decay_type, Context, typename A::const_iterator>(std::forward<Context>(ctxt), a.begin(), a.end());}
}
template<class X1D, class Ret = typename X1D::decay_type> // TODO multi::array_traits<X1D>::decay_type
NODISCARD("a copied matrix should be assigned")
Ret copy(X1D const& x){
assert( not offset(x) );
return copy(x, Ret(size(x), get_allocator(x)));
}
}}}
#if not __INCLUDE_LEVEL__ // _TEST_MULTI_ADAPTORS_BLAS_COPY
#define BOOST_TEST_MODULE "C++ Unit Tests for Multi blas copy"
#define BOOST_TEST_DYN_LINK
#include<boost/test/unit_test.hpp>
#include "../../array.hpp"
#include "../../utility.hpp"
#include<complex>
namespace multi = boost::multi;
namespace blas = multi::blas;
using complex = std::complex<double>; constexpr complex I{0, 1};
BOOST_AUTO_TEST_CASE(multi_blas_copy){
{
multi::array<double, 1> const A = {1., 2., 3., 4.};
multi::array<double, 1> B = {5., 6., 7., 8.};
blas::copy(A, B);
BOOST_REQUIRE( B == A );
}
{
using complex = std::complex<double>;
multi::array<complex, 1> const A = {1., 2., 3., 4.};
multi::array<complex, 1> B = {5., 6., 7., 8.};
blas::copy(A, B);
BOOST_REQUIRE( B == A );
}
{
multi::array<double, 2> const A = {
{1., 2., 3.},
{4., 5., 6.},
{7., 8., 9.}
};
multi::array<double, 1> B(3);
blas::copy(rotated(A)[0], B);
BOOST_REQUIRE( B == rotated(A)[0] );
}
{
multi::array<complex, 2> const A = {
{1., 2., 3. + I},
{4., 5., 6.},
{7., 8., 9.}
};
multi::array<complex, 1> B = blas::copy(blas::T(A)[2]);
BOOST_REQUIRE( size(B) == 3 );
BOOST_REQUIRE( B[0] == 3. + I);
}
}
#endif
#endif


@@ -1,5 +1,5 @@
#ifdef COMPILATION// -*-indent-tabs-mode:t;c-basic-offset:4;tab-width:4;-*-
$CXX $0 -o $0x `pkg-config --libs blas`&&$0x&&rm $0x;exit
#ifdef COMPILATION// -*-indent-tabs-mode:t;c-basic-offset:4;tab-width:4;autowrap:nil;-*-
$CXXX $CXXFLAGS $0 -o $0.$X `pkg-config --libs blas`&&$0.$X&&rm $0.$X;exit
#endif
//(for a in `find tests/ -name '*.cpp'`; do sh $a || break; done); exit
@@ -14,8 +14,28 @@ $CXX $0 -o $0x `pkg-config --libs blas`&&$0x&&rm $0x;exit
#include<iostream> // debug
#include<cassert>
#include<complex>
#include<cstdint> // int64_t
#include<stdint.h> // int64_t
#include<limits> // numeric_limits
#include<type_traits> // is_convertible
#include<cstring> // std::memcpy
#include "../blas/traits.hpp"
#include "../../config/MARK.hpp"
#if 0
#define MULTI_ASSERT1(ExpR) assert (ExpR)
#define MULTI_ASSERT2(ExpR, DescriptioN) MULTI_ASSERT1((ExpR) && (DescriptioN))
#else
#if not defined(NDEBUG)
#include<stdexcept>
#include<string>
#define MULTI_ASSERT1(ExpR) (void)((ExpR)?0:throw std::logic_error("\n" __FILE__ ":"+std::to_string(__LINE__)+"::\n"+std::string(__PRETTY_FUNCTION__)+"\nLogic assertion `" #ExpR "' failed."))
#define MULTI_ASSERT2(ExpR, DescriptioN) (void)((ExpR)?0:throw std::DescriptioN("\n" __FILE__ ":"+std::to_string(__LINE__)+"::\n"+std::string(__PRETTY_FUNCTION__)+"\nLogic assertion `" #ExpR "' failed."))
#else
#define MULTI_ASSERT1(ExpR) assert(ExpR)
#define MULTI_ASSERT2(ExpR, DescriptioN) assert(ExpR)
#endif
#endif
#ifdef CBLAS_H
#define BLAS(NamE) cblas_##NamE
@@ -24,23 +44,40 @@ $CXX $0 -o $0x `pkg-config --libs blas`&&$0x&&rm $0x;exit
extern "C"{
#ifndef _BLAS_INT
#define _BLAS_INT __INTPTR_WIDTH__
#if defined(__INTPTR_WIDTH__)
#define _BLAS_INT __INTPTR_WIDTH__
#endif
#endif
#define s float
#define d double
#define c std::complex<s>
#define z std::complex<d>
#define v void
#define C _Complex s
#define Z _Complex d
#if(_BLAS_INT==32)
#define INT std::int32_t
#elif(_BLAS_INT==64)
#define INT std::int64_t
#define v void
typedef struct { float real, imag; } Complex_float ;
typedef struct { double real, imag; } Complex_double;
#define C Complex_float // _Complex s
#define Z Complex_double // _Complex d
#if defined(_BLAS_INT)
#if _BLAS_INT==32
#define INT int32_t
#elif _BLAS_INT==64
#define INT int64_t
#else
#define INT int32_t // 32-bit safe? pessimistic?
#endif
#else
#define INT std::int32_t // 32-bit safe? pessimistic?
#define INT int32_t // 32-bit safe? pessimistic?
#endif
namespace core{
using size_t = INT;
using ssize_t = std::make_signed_t<size_t>;
}
#define INTEGER INT const&
#define N INTEGER n
#define INCX INTEGER incx
@@ -48,21 +85,28 @@ extern "C"{
static_assert(sizeof(INT)==32/8 or sizeof(INT)==64/8, "please set _BLAS_INT to int32_t or int64_t");
#define xROTG(T1, T2) v BLAS( T1##rotg)(T1 const*, T1 const*, T2*, T1*)
#define xROTMG(T) v BLAS( T##rotmg)(T*, T*, T*, T const&, T(&param)[5])
// TODO indent declarations like here https://www.netlib.org/lapack/lug/node145.html
#define xROTG(T1, T2) v BLAS( T1##rotg)( T1 const*, T1 const*, T2*, T1*)
#define xROTMG(T) v BLAS( T##rotmg)( T*, T*, T*, T const&, T(&param)[5])
#define xROT(TT, T, S) v BLAS( TT##rot )(N, T *x, INCX, T *y, INCY, S const&, S const&)
#define xROTM(T) v BLAS( T##rotm )(N, T* x, INCX, T* y, INCY, T const(&p)[5])
#define xSWAP(T) v BLAS( T##swap )(N, T *x, INCX, T *y, INCY)
#define xSCAL(TT, TA, TX) v BLAS( TT##scal )(N, TA const& a, TX *x, INCX )
#define xCOPY(T) v BLAS( T##copy )(N, T const *x, INCX, T *y, INCY)
#define xAXPY(T) v BLAS( T##axpy )(N, T const& a, T const *x, INCX, T *y, INCY)
#define xSWAP(T) v T ##swap##_ (N, T *x, INCX, T *y, INCY)
#define xSCAL(TT, TA, TX) v TT##scal##_ (N, TA const& a, TX *x, INCX )
#define xCOPY(T) v T ##copy##_ (N, T const *x, INCX, T *y, INCY)
#define xAXPY(T) v T ##axpy##_ (N, T const* a, T const *x, INCX, T *y, INCY)
#define xDOT(R, TT, T) R BLAS( TT##dot )(N, T const *x, INCX, T const *y, INCY)
#define xDOTU(R, T) R BLAS( T##dotu )(N, T const *x, INCX, T const *y, INCY)
#define xDOTC(R, T) R BLAS( T##dotc )(N, T const *x, INCX, T const *y, INCY)
#define xxDOT(TT, T) T BLAS( TT##dot )(N, T const& a, T const *x, INCX, T const *y, INCY)
#define xNRM2(R, TT, T) R BLAS( TT##nrm2 )(N, T const *x, INCX )
#define xASUM(R, TT, T) R BLAS( TT##asum )(N, T const *x, INCX )
#define IxAMAX(T) INT BLAS(i##T##amax )(N, T const* x, INCX )
#if defined(RETURN_BY_STACK) || (defined(FORTRAN_COMPLEX_FUNCTIONS_RETURN_VOID) && FORTRAN_COMPLEX_FUNCTIONS_RETURN_VOID)
#define xDOTU(R, T) v BLAS( T##dotu )(R*, N, T const *x, INCX, T const *y, INCY)
#define xDOTC(R, T) v T##dotc ##_ (R*, N, T const *x, INCX, T const *y, INCY)
#else
#define xDOTU(R, T) R T ##dotu##_ ( N, T const *x, INCX, T const *y, INCY)
#define xDOTC(R, T) R T ##dotc##_ ( N, T const *x, INCX, T const *y, INCY)
#endif
#define xxDOT(TT, T) T TT##dot ##_ ( N, T const& a, T const *x, INCX, T const *y, INCY)
#define xNRM2(R, TT, T) R TT##nrm2##_ ( N, T const *x, INCX )
#define xASUM(R, TT, T) R TT##asum##_ ( N, T const *x, INCX )
#define IxAMAX(T) INT i##T ##amax##_ ( N, T const* x, INCX )
xROTG(s, s) ; xROTG(d,d) ;// MKL extension xROTG(c, s); xROTG(z, d);
xROTMG(s) ; xROTMG(d) ;
@@ -73,8 +117,11 @@ xSCAL(s, s, s); xSCAL(d, d, d); xSCAL(c, c, c); xSCAL(z, z, z); xSCAL(zd, d, z);
xCOPY(s) ; xCOPY(d) ; xCOPY(c) ; xCOPY(z) ;
xAXPY(s) ; xAXPY(d) ; xAXPY(c) ; xAXPY(z) ;
xDOT(s, s, s); xDOT(d, d, d); xDOT(d, ds, s);
xDOTU(C, c); xDOTU(Z, z);
xDOTC(C, c); xDOTC(Z, z);
xDOTU(C, c); xDOTU(Z, z);
//xDOTU(c, c); xDOTU(z, z);
xDOTC(C, c); xDOTC(Z, z);
xxDOT(sds, s);
xNRM2(s, s, s); xNRM2(d, d, d); xNRM2(s, sc, c); xNRM2(d, dz, z);
xASUM(s, s, s); xASUM(d, d, d); xASUM(s, sc, c); xASUM(d, dz, z);
@@ -87,11 +134,11 @@ IxAMAX(s); IxAMAX(d); IxAMAX(c); IxAMAX(z);
#define UPLO const char& uplo
#define DIAG const char& diag
#define xGEMV(T) void BLAS(T##gemv)(TRANS, NR, NC, T const& a, T const* A, LDA, T const* X, INCX, T const& beta, T* Y, INCY)
#define xGER(T) void BLAS(T##ger )( NR, NC, T const& a, T const* X, INCX, T const* Y, INCY, T* A, LDA)
#define xGERU(T) void BLAS(T##geru)( NR, NC, T const& a, T const* X, INCX, T const* Y, INCY, T* A, LDA)
#define xGERC(T) void BLAS(T##gerc)( NR, NC, T const& a, T const* X, INCX, T const* Y, INCY, T* A, LDA)
#define xTRSV(T) void BLAS(T##trsv)(UPLO, TRANS, DIAG, N, T const* A, LDA, T* X, INCX)
#define xGEMV(T) void T## gemv ##_ ( TRANS, NR, NC, T const& a, T const* A, LDA, T const* X, INCX, T const& beta, T* Y, INCY )
#define xGER(T) void T## ger ##_ ( NR, NC, T const& a, T const* X, INCX, T const* Y, INCY, T* A, LDA)
#define xGERU(T) void T## geru ##_ ( NR, NC, T const& a, T const* X, INCX, T const* Y, INCY, T* A, LDA)
#define xGERC(T) void T## gerc ##_ ( NR, NC, T const& a, T const* X, INCX, T const* Y, INCY, T* A, LDA)
#define xTRSV(T) void T## trsv ##_ (UPLO, TRANS, DIAG, N, T const* A, LDA, T* X , INCX )
xGEMV(s); xGEMV(d); xGEMV(c); xGEMV(z);
xGER(s); xGER(d);
@@ -107,10 +154,11 @@ xTRSV(s); xTRSV(d); xTRSV(c); xTRSV(z);
#define SIDE const char& side
#define xGEMM(T) void BLAS(T##gemm)(TRANSA, TRANSB, NR, NC, NK, T const& a, T const* A, LDA, T const* B, LDB, T const& b, T const* CC, LDC)
#define xSYRK(T) void BLAS(T##syrk)(UPLO, TRANSA, NR, NK, T const& a, T const* A, LDA, T const& b, T* CC, LDC)
#define xHERK(TT, T) void BLAS(T##herk)(UPLO, TRANSA, NR, NK, TT const& a, T const* A, LDA, TT const& b, T* CC, LDC)
#define xTRSM(T) void BLAS(T##trsm)(SIDE, UPLO, TRANSA, DIAG, NR, NK, T const& a, T const* A, LDA, T const* B, LDB)
#define xGEMM(T) void T ##gemm ##_ ( TRANSA, TRANSB, NR, NC, NK, T const& a, T const* A, LDA, T const* B, LDB, T const& b , T const* CC, LDC)
#define xSYRK(T) void T ##syrk ##_ ( UPLO, TRANSA, NR, NK, T const& a, T const* A, LDA, T const& b , T* CC, LDC)
#define xHERK(TT, T) void T ##herk ##_ ( UPLO, TRANSA, NR, NK, TT const& a, T const* A, LDA, TT const& b , T* CC, LDC)
#define xTRSM(T) void T ##trsm ##_ (SIDE, UPLO, TRANSA, DIAG, NR, NK, T const& a, T const* A, LDA, T const* B, LDB )
xGEMM(s); xGEMM(d); xGEMM(c) ; xGEMM(z) ;
xSYRK(s); xSYRK(d); xSYRK(c) ; xSYRK(z) ;
xHERK(s, c); xHERK(d, z);
@@ -166,23 +214,53 @@ namespace boost{
namespace multi{
namespace blas{
using s = float;
using d = double;
using c = std::complex<s>;
using z = std::complex<d>;
using v = void;
template<class T> struct complex_ptr{
std::complex<T>* impl_;
template<class TT, class=std::enable_if_t<sizeof(*TT{})==sizeof(std::complex<T>) and sizeof(*TT{})==sizeof(TT{}->real())+sizeof(TT{}->imag())>>
explicit complex_ptr(TT tt) : impl_{reinterpret_cast<std::complex<T>*>(tt)}{}
complex_ptr(complex_ptr const&) = delete;
operator std::complex<T>*() const{return impl_;}
std::complex<T>& operator*() const{return *impl_;}
};
template<class T> struct complex_const_ptr{
std::complex<T> const* impl_;
template<class TT, class=std::enable_if_t<sizeof(*TT{})==sizeof(std::complex<T>) and sizeof(*TT{})==sizeof(TT{}->real())+sizeof(TT{}->imag())>>
explicit complex_const_ptr(TT tt) : impl_{reinterpret_cast<std::complex<T> const*>(tt)}{}
complex_const_ptr(complex_const_ptr const&) = delete;
operator std::complex<T> const*() const{return impl_;}
std::complex<T> const& operator*() const{return *impl_;}
};
template<class T> struct add_ptr{using type = T*;};
template<class T> struct add_const_ptr{using type = T const*;};
template<class T> struct add_ptr<std::complex<T>>{using type = complex_ptr<T>;};
template<class T> struct add_const_ptr<std::complex<T>>{using type = complex_const_ptr<T>;};
template<class T> using add_ptr_t = typename add_ptr<T>::type;
template<class T> using add_const_ptr_t = typename add_const_ptr<T>::type;
namespace{
using s = float;
using d = double;
using c = std::complex<s>; using C = Complex_float ;
using z = std::complex<d>; using Z = Complex_double;
using v = void;
}
#define BC(x) [](auto xx){assert(xx>=std::numeric_limits<INT>::min() and xx<std::numeric_limits<INT>::max()); return xx;}(x)
#define xrotg(T1, T2) v rotg (T1 const& a, T1 const& b, T2& cc, T1& ss ){BLAS(T1##rotg )(const_cast<T1*>(&a), const_cast<T1*>(&b), &cc, &ss);}
#define xrotmg(T) v rotmg(T& d1, T& d2, T& A, T const& B, T(&p)[5] ){BLAS(T##rotmg )(&d1, &d2, &A, B, p);}
#define xrot(T, TT, CS) template<class S> v rot (S n, T *x, S incx, T *y, S incy, CS const& c, CS const& s){BLAS(TT##rot )(BC(n), x, BC(incx), y, BC(incy), c, s);}
#define xrotm(T) template<class S> v rotm (S n, T *x, S incx, T *y, S incy, T const(&p)[5] ){BLAS( T##rotm)(BC(n), x, BC(incx), y, BC(incy), p); }
#define xswap(T) template<class S> v swap (S n, T *x, S incx, T *y, S incy ){BLAS( T##swap)(BC(n), x, BC(incx), y, BC(incy)); }
#define xscal(XX, TA, TX) template<class S> TX* scal (S n, TA* a, TX *x, S incx ){BLAS(XX##scal)(BC(n), *a, x, BC(incx) ); return x+n*incx;}
#define xcopy(T) template<class S> v copy (S n, T const *x, S incx, T *y, S incy ){BLAS( T##copy)(BC(n), x, BC(incx), y, BC(incy)); }
#define xaxpy(T) template<class S> T* axpy (S n, T a, T const *x, S incx, T *y, S incy ){BLAS( T##axpy)(BC(n), a, x, BC(incx), y, BC(incy)); return y+n*incy;}
#define xdot(R, TT, T) template<class S> v dot (S n, T const *x, S incx, T const *y, S incy, R* r ){*r = BLAS(TT##dot )(BC(n), x, BC(incx), y, BC(incy)); }
#define xrotg(T1, T2) v rotg (T1 const& a, T1 const& b, T2& cc, T1& ss ){ BLAS(T1##rotg )(const_cast<T1*>(&a), const_cast<T1*>(&b), &cc, &ss); }
#define xrotmg(T) v rotmg(T& d1, T& d2, T& A, T const& B, T(&p)[5] ){ BLAS( T##rotmg)(&d1, &d2, &A, B, p); }
#define xrot(T, TT, CS) template<class S> v rot (S n, T *x, S incx, T *y, S incy, CS const& cos, CS const& sin){ BLAS(TT##rot )(BC(n), x, BC(incx), y, BC(incy), cos, sin); }
#define xrotm(T) template<class S> v rotm (S n, T *x, S incx, T *y, S incy, T const(&p)[5] ){ BLAS( T##rotm )(BC(n), x, BC(incx), y, BC(incy), p); }
#define xswap(T) template<class S> v swap (S n, T *x, S incx, T *y, S incy ){ BLAS( T##swap )(BC(n), x, BC(incx), y, BC(incy)); }
#define xscal(XX, TA, TX) TX* scal (INT n, TA const* a, TX *x, INT incx ){ BLAS(XX##scal )(BC(n), *a, x, BC(incx) ); return x+n*incx;}
//#define xcopy(T) v copy (INT n, T const *x, INT incx, T *y, INT incy ){ BLAS( T##copy )(BC(n), x, BC(incx), y, BC(incy)); }
//#define xaxpy(T) template<class S> T* axpy (S n, T a, T const *x, S incx, T *y, S incy ){ BLAS( T##axpy )(BC(n), a, x, BC(incx), y, BC(incy)); return y+n*incy; }
#define xdot(R, TT, T) template<class S> v dot (S n, T const* x, S incx, T const* y, S incy, R* r ){\
MULTI_MARK_SCOPE("cpu_dot"); *r = BLAS(TT##dot )(BC(n), x, BC(incx), y, BC(incy)); }
xrotg(s, s) xrotg(d, d) //MKL extension xrotg(c, s); xrotg(z, d);
xrotmg(s) xrotmg(d)
@@ -191,12 +269,37 @@ xrotm(s) xrotm(d)
xswap(s) xswap(d) xswap(c) xswap(z)
namespace core{
xscal(s, s, s) xscal(d, d, d) xscal(c, c, c) xscal(z, z, z) xscal(zd, d, z) xscal(cs, s, c)
xcopy(s) xcopy(d) xcopy(c) xcopy(z)
using std::enable_if_t;
using std::is_assignable;
template<class SX, class SY, enable_if_t<is_s<SX>{} and is_s<SY>{} and is_assignable<SY&, SX&>{},int> =0> void copy(size_t n, SX* x, size_t incx, SY* y, size_t incy){BLAS(scopy)(n, ( float const*)(x), incx, ( float *)(y), incy);}
template<class DX, class DY, enable_if_t<is_d<DX>{} and is_d<DY>{} and is_assignable<DY&, DX&>{},int> =0> void copy(size_t n, DX* x, size_t incx, DY* y, size_t incy){BLAS(dcopy)(n, ( double const*)(x), incx, ( double *)(y), incy);}
template<class CX, class CY, enable_if_t<is_c<CX>{} and is_c<CY>{} and is_assignable<CY&, CX&>{},int> =0> void copy(size_t n, CX* x, size_t incx, CY* y, size_t incy){BLAS(ccopy)(n, (std::complex<float > const*)(x), incx, (std::complex<float >*)(y), incy);}
template<class ZX, class ZY, enable_if_t<is_z<ZX>{} and is_z<ZY>{} and is_assignable<ZY&, ZX&>{},int> =0> void copy(size_t n, ZX* x, size_t incx, ZY* y, size_t incy){BLAS(zcopy)(n, (std::complex<double> const*)(x), incx, (std::complex<double>*)(y), incy);}
xdot(s, s, s) xdot(d, d, d) xdot(d, ds, s)
}
using std::pointer_traits;
using std::enable_if_t;
using std::is_convertible_v;
#define xaxpy(T) \
template<class ALPHA, class SXP, class SX = typename pointer_traits<SXP>::element_type, class SYP, class SY = typename pointer_traits<SYP>::element_type, enable_if_t< \
is_##T<ALPHA>{} and is_##T<SX>{} and is_##T<SY>{} and is_assignable<SY&, decltype(ALPHA{}*SX{})>{} \
and is_convertible_v<SXP, SX*> and is_convertible_v<SYP, SY*> \
, int> =0> \
void axpy(size_t n, ALPHA const* a, SXP x, size_t incx, SYP y, size_t incy){BLAS(T##axpy)(n, (T const *)a, (T const*)static_cast<SX*>(x), incx, (T*)static_cast<SY*>(y), incy);}
xaxpy(s) xaxpy(d) xaxpy(c) xaxpy(z)
#undef xaxpy
//template<class A, class SX, class SY, enable_if_t<is_s<SX>{} and is_s<SY>{} and is_assignable<SY&, decltype(A{}*SX{})>{}, int> =0> void axpy(size_t n, A a, SX* x, size_t incx, SY* y, size_t incy){BLAS(saxpy)(n, a, (s const*)(x), incx, (s*)(y), incy);}
//template<class A, class DX, class DY, enable_if_t<is_d<DX>{} and is_d<DY>{} and is_assignable<DY&, decltype(A{}*DX{})>{}, int> =0> void axpy(size_t n, A a, DX* x, size_t incx, DY* y, size_t incy){BLAS(daxpy)(n, a, (d const*)(x), incx, (d*)(y), incy);}
//template<class A, class CX, class CY, enable_if_t<is_c<CX>{} and is_c<CY>{} and is_assignable<CY&, decltype(A{}*CX{})>{}, int> =0> void axpy(size_t n, A a, CX* x, size_t incx, CY* y, size_t incy){BLAS(caxpy)(n, a, (c const*)(x), incx, (c*)(y), incy);}
//template<class A, class ZX, class ZY, enable_if_t<is_z<ZX>{} and is_z<ZY>{} and is_assignable<ZY&, decltype(A{}*ZX{})>{}, int> =0> void axpy(size_t n, A a, ZX* x, size_t incx, ZY* y, size_t incy){BLAS(zaxpy)(n, a, (z const*)(x), incx, (z*)(y), incy);}
}
template<class R, class S, class T> R dot(S n, T const* x, S incx, T const* y, S incy){
R ret;
@@ -217,42 +320,65 @@ template<class S, class T> T dot(S n, T const* x, S incx, T const* y, S incy){
#undef xdot
#ifndef CBLAS_H
//#define xdotu(T) template<class S> v dotu(S n, T const* x, S incx, T const* y, S incy, T* r){*r = BLAS(T##dotu)(BC(n), x, BC(incx), y, BC(incy));}
#define xdotu(T) template<class S> v dotu(S n, T const* x, S incx, T const* y, S incy, T* r){*r = (T)(BLAS(T##dotu)(BC(n), x, BC(incx), y, BC(incy)));}
#define xdotc(T) template<class S> v dotc(S n, T const* x, S incx, T const* y, S incy, T* r){*r = (T)(BLAS(T##dotc)(BC(n), x, BC(incx), y, BC(incy)));}
namespace core{
xdotu(c) xdotu(z)
xdotc(c) xdotc(z)
}
// template<class S> z dot(S n, c const *x, S incx, c const *y, S incy){return dotc(n, x, incx, y, incy);}
// template<class S> z dot(S n, z const *x, S incx, z const *y, S incy){return dotc(n, x, incx, y, incy);}
#undef xdotu
#undef xdotc
#else
#define xdotu(T) template<class S> v dotu(S n, T const* x, S incx, T const* y, S incy, T* r){BLAS(T##dotu_sub)(BC(n), x, BC(incx), y, BC(incy), r);}
#define xdotc(T) template<class S> v dotc(S n, T const* x, S incx, T const* y, S incy, T* r){BLAS(T##dotc_sub)(BC(n), x, BC(incx), y, BC(incy), r);}
namespace core{
xdotu(c) xdotu(z)
xdotc(c) xdotc(z)
using std::enable_if_t;
using std::is_assignable;
#if defined(RETURN_BY_STACK) || (defined(FORTRAN_COMPLEX_FUNCTIONS_RETURN_VOID) && FORTRAN_COMPLEX_FUNCTIONS_RETURN_VOID)
template<class X, class Y, class R, enable_if_t<is_c<X>{} and is_c<Y>{} and is_assignable<R&, decltype(0.+X{}*Y{}+X{}*Y{})>{}, int> =0> void dotu(size_t n, X* x, size_t incx, Y* y, size_t incy, R* r){BLAS(cdotu)((Complex_float *)r, n, (c const*)x, incx, (c const*)y, incy);}
template<class X, class Y, class R, enable_if_t<is_z<X>{} and is_z<Y>{} and is_assignable<R&, decltype(0.+X{}*Y{}+X{}*Y{})>{}, int> =0> void dotu(size_t n, X* x, size_t incx, Y* y, size_t incy, R* r){BLAS(zdotu)((Complex_double*)r, n, (z const*)x, incx, (z const*)y, incy);}
template<class X, class Y, class R, enable_if_t<is_c<X>{} and is_c<Y>{} and is_assignable<R&, decltype(0.+X{}*Y{}+X{}*Y{})>{}, int> =0> void dotc(size_t n, X* x, size_t incx, Y* y, size_t incy, R* r){BLAS(cdotc)((Complex_float *)r, n, (c const*)x, incx, (c const*)y, incy);}
template<class X, class Y, class R, enable_if_t<is_z<X>{} and is_z<Y>{} and is_assignable<R&, decltype(0.+X{}*Y{}+X{}*Y{})>{}, int> =0> void dotc(size_t n, X* x, size_t incx, Y* y, size_t incy, R* r){BLAS(zdotc)((Complex_double*)r, n, (z const*)x, incx, (z const*)y, incy);}
#else
template<class XP, class X = typename std::pointer_traits<XP>::element_type, class YP, class Y = typename std::pointer_traits<YP>::element_type, class RP, class R = typename std::pointer_traits<RP>::element_type, enable_if_t<is_c<X>{} and is_c<Y>{} and is_assignable<R&, decltype(0.+X{}*Y{}+X{}*Y{})>{}, int> =0> void dotu(size_t n, XP x, size_t incx, YP y, size_t incy, RP r){auto rr = BLAS(cdotu)(n, (c const*)static_cast<X*>(x), incx, (c const*)static_cast<Y*>(y), incy); std::memcpy(reinterpret_cast<float (*)[2]>(static_cast<R*>(r)), &rr, sizeof(rr)); static_assert(sizeof(rr)==sizeof(*r));}
template<class XP, class X = typename std::pointer_traits<XP>::element_type, class YP, class Y = typename std::pointer_traits<YP>::element_type, class RP, class R = typename std::pointer_traits<RP>::element_type, enable_if_t<is_z<X>{} and is_z<Y>{} and is_assignable<R&, decltype(0.+X{}*Y{}+X{}*Y{})>{}, int> =0> void dotu(size_t n, XP x, size_t incx, YP y, size_t incy, RP r){auto rr = BLAS(zdotu)(n, (z const*)static_cast<X*>(x), incx, (z const*)static_cast<Y*>(y), incy); std::memcpy(reinterpret_cast<double(*)[2]>(static_cast<R*>(r)), &rr, sizeof(rr)); static_assert(sizeof(rr)==sizeof(*r));}
template<class XP, class X = typename std::pointer_traits<XP>::element_type, class YP, class Y = typename std::pointer_traits<YP>::element_type, class RP, class R = typename std::pointer_traits<RP>::element_type, enable_if_t<is_c<X>{} and is_c<Y>{} and is_assignable<R&, decltype(0.+X{}*Y{}+X{}*Y{})>{}, int> =0> void dotc(size_t n, XP x, size_t incx, YP y, size_t incy, RP r){auto rr = BLAS(cdotc)(n, (c const*)static_cast<X*>(x), incx, (c const*)static_cast<Y*>(y), incy); std::memcpy(reinterpret_cast<float (*)[2]>(static_cast<R*>(r)), &rr, sizeof(rr)); static_assert(sizeof(rr)==sizeof(*r));}
template<class XP, class X = typename std::pointer_traits<XP>::element_type, class YP, class Y = typename std::pointer_traits<YP>::element_type, class RP, class R = typename std::pointer_traits<RP>::element_type, enable_if_t<is_z<X>{} and is_z<Y>{} and is_assignable<R&, decltype(0.+X{}*Y{}+X{}*Y{})>{}, int> =0> void dotc(size_t n, XP x, size_t incx, YP y, size_t incy, RP r){auto rr = BLAS(zdotc)(n, (z const*)static_cast<X*>(x), incx, (z const*)static_cast<Y*>(y), incy); std::memcpy(reinterpret_cast<double(*)[2]>(static_cast<R*>(r)), &rr, sizeof(rr)); static_assert(sizeof(rr)==sizeof(*r));}
#endif
}
#else
// TODO: make cblas version
#define xdotu(T) template<class S> v dotu(S n, add_const_ptr_t<T> x, S incx, add_const_ptr_t<T> y, S incy, add_ptr_t<T> r){BLAS(T##dotu_sub)(BC(n), x, BC(incx), y, BC(incy), r);}
#define xdotc(T) template<class S> v dotc(S n, add_const_ptr_t<T> x, S incx, add_const_ptr_t<T> y, S incy, add_ptr_t<T> r){BLAS(T##dotc_sub)(BC(n), x, BC(incx), y, BC(incy), r);}
namespace core{
xdotu(c) xdotu(z)
xdotc(c) xdotc(z)
}
#undef xdotu
#undef xdotc
#endif
namespace core{
template<class S> s dot(S n, s const& b, s const* x, S incx, s const* y, S incy){return BLAS(sdsdot)(BC(n), b, x, BC(incx), y, BC(incy));}
template<class S> s dot(S n, s const& b, s const* x, S incx, s const* y, S incy){return BLAS(sdsdot)(BC(n), b, x, BC(incx), y, BC(incy));}
//template<class S> void dot(S n, s const& b, s const* x, S incx, s const* y, S incy, s* result){*result = BLAS(sdsdot)(BC(n), b, x, BC(incx), y, BC(incy));}
}
#define xnrm2(R, T, TT) template<class S> v nrm2 (S n, T const* x, S incx, R* r){*r = BLAS(TT##nrm2 )(BC(n), x, BC(incx));}
//#define xnrm2(R, T, TT) template<class S> v nrm2 (S n, add_const_ptr_t<T> x, S incx, R* r){*r = BLAS(TT##nrm2 )(BC(n), x, BC(incx));}
#define xasum(T, TT) template<class S> auto asum (S n, T const* x, S incx){return BLAS(TT##asum )(BC(n), x, BC(incx));}
#define ixamax(T) template<class S> auto iamax(S n, T const* x, S incx){return BLAS(i##T##amax)(BC(n), x, BC(incx)) - 1;}
xasum(s, s) xasum(d, d) xasum (c, sc) xasum(z, dz)
namespace core{
xnrm2(s, s, s) xnrm2(d, d, d) xnrm2(s, c, sc) xnrm2(d, z, dz)
// xnrm2(s, s, s) xnrm2(d, d, d) xnrm2(s, c, sc) xnrm2(d, z, dz)
template<class XP, class X = typename std::pointer_traits<XP>::element_type, class RP, class R = typename std::pointer_traits<RP>::element_type, enable_if_t<is_s<X>{} and is_s<R>{} and std::is_assignable<R&, decltype(X{})>{} , int> =0> void nrm2(size_t n, XP x, size_t incx, RP r){auto rr = BLAS(snrm2) (n, (s const*)static_cast<X*>(x), incx); std::memcpy((s*)static_cast<R*>(r), &rr, sizeof(s));}
template<class XP, class X = typename std::pointer_traits<XP>::element_type, class RP, class R = typename std::pointer_traits<RP>::element_type, enable_if_t<is_d<X>{} and is_d<R>{} and std::is_assignable<R&, decltype(X{})>{}         , int> =0> void nrm2(size_t n, XP x, size_t incx, RP r){auto rr = BLAS(dnrm2) (n, (d const*)static_cast<X*>(x), incx); std::memcpy((d*)static_cast<R*>(r), &rr, sizeof(d));}
template<class XP, class X = typename std::pointer_traits<XP>::element_type, class RP, class R = typename std::pointer_traits<RP>::element_type, enable_if_t<is_c<X>{} and is_s<R>{} and std::is_assignable<R&, decltype(std::norm(X{}))>{}, int> =0> void nrm2(size_t n, XP x, size_t incx, RP r){auto rr = BLAS(scnrm2)(n, (c const*)static_cast<X*>(x), incx); std::memcpy((s*)static_cast<R*>(r), &rr, sizeof(s));}
template<class XP, class X = typename std::pointer_traits<XP>::element_type, class RP, class R = typename std::pointer_traits<RP>::element_type, enable_if_t<is_z<X>{} and is_d<R>{} and std::is_assignable<R&, decltype(std::norm(X{}))>{}, int> =0> void nrm2(size_t n, XP x, size_t incx, RP r){auto rr = BLAS(dznrm2)(n, (z const*)static_cast<X*>(x), incx); std::memcpy((d*)static_cast<R*>(r), &rr, sizeof(d));}
// template<class S> v nrm2 (S n, typename add_const_ptr<std::complex<double>>::type x, S incx, d* r){*r = BLAS(dznrm2 )(BC(n), x, BC(incx));}
ixamax(s) ixamax(d) ixamax(c) ixamax(z)
}
#undef xnrm2
@@ -263,29 +389,49 @@ namespace core{
///////////////////////////////////////////////////////////////////////////////
// LEVEL2
#define xgemv(T) template<class C, class S> v gemv(C trans, S m, S n, T const& a, T const* A, S lda, T const* X, S incx, T beta, T* Y, S incy ){BLAS(T##gemv)(trans, BC(m), BC(n), a, A, BC(lda), X, BC(incx), beta, Y, BC(incy) );}
#define xger(T) template< class S> v ger( S m, S n, T const& a, T const* X, S incx, T const* Y, S incy, T* A, S lda){BLAS(T##ger )( BC(m), BC(n), a, X, BC(incx), Y, BC(incy), A, BC(lda));}
template< class S> v ger( S m, S n, c const& a, c const* X, S incx, c const* Y, S incy, c* A, S lda){BLAS(cgeru )( BC(m), BC(n), a, X, BC(incx), Y, BC(incy), A, BC(lda));}
template< class S> v ger( S m, S n, z const& a, z const* X, S incx, z const* Y, S incy, z* A, S lda){BLAS(zgeru )( BC(m), BC(n), a, X, BC(incx), Y, BC(incy), A, BC(lda));}
#define xger(T) template< class S> v ger ( S m, S n, T const& a, T const* X, S incx, T const* Y, S incy, T* A, S lda){BLAS(T##ger )( BC(m), BC(n), a, X, BC(incx), Y, BC(incy), A, BC(lda));}
template< class S> v ger ( S m, S n, c const& a, c const* X, S incx, c const* Y, S incy, c* A, S lda){BLAS(cgeru )( BC(m), BC(n), a, X, BC(incx), Y, BC(incy), A, BC(lda));}
template< class S> v ger ( S m, S n, z const& a, z const* X, S incx, z const* Y, S incy, z* A, S lda){BLAS(zgeru )( BC(m), BC(n), a, X, BC(incx), Y, BC(incy), A, BC(lda));}
#define xgeru(T) template< class S> v geru( S m, S n, T const& a, T const* X, S incx, T const* Y, S incy, T* A, S lda){BLAS(T##geru)( BC(m), BC(n), a, X, BC(incx), Y, BC(incy), A, BC(lda));}
#define xgerc(T) template< class S> v gerc( S m, S n, T const& a, T const* X, S incx, T const* Y, S incy, T* A, S lda){BLAS(T##gerc)( BC(m), BC(n), a, X, BC(incx), Y, BC(incy), A, BC(lda));}
xgemv(s) xgemv(d) xgemv(c) xgemv(z)
namespace core{
//xgemv(s) xgemv(d) xgemv(c) xgemv(z)
xger(s) xger(d)
xgeru(c) xgeru(z)
xgerc(c) xgerc(z)
using std::enable_if_t;
using std::is_assignable;
template<class A, class M, class X, class B, class Y, enable_if_t<is_s<M>{} and is_s<X>{} and is_s<Y>{} and is_assignable<Y&, decltype(A{}*M{}*X{}+B{}*Y{})>{}, int> =0> void gemv(char trans, size_t m, size_t n, A const& a, M* ma, size_t lda, X* x, size_t incx, B b, Y* y, size_t incy){BLAS(sgemv)(trans, m, n, a, (s const*)ma, lda, (s const*)x, incx, b, (s*)y, incy);}
template<class A, class M, class X, class B, class Y, enable_if_t<is_d<M>{} and is_d<X>{} and is_d<Y>{} and is_assignable<Y&, decltype(A{}*M{}*X{}+B{}*Y{})>{}, int> =0> void gemv(char trans, size_t m, size_t n, A const& a, M* ma, size_t lda, X* x, size_t incx, B b, Y* y, size_t incy){BLAS(dgemv)(trans, m, n, a, (d const*)ma, lda, (d const*)x, incx, b, (d*)y, incy);}
template<class A, class M, class X, class B, class Y, enable_if_t<is_c<M>{} and is_c<X>{} and is_c<Y>{} and is_assignable<Y&, decltype(A{}*M{}*X{}+B{}*Y{})>{}, int> =0> void gemv(char trans, size_t m, size_t n, A const& a, M* ma, size_t lda, X* x, size_t incx, B b, Y* y, size_t incy){BLAS(cgemv)(trans, m, n, a, (c const*)ma, lda, (c const*)x, incx, b, (c*)y, incy);}
template<class A, class M, class X, class B, class Y, enable_if_t<is_z<M>{} and is_z<X>{} and is_z<Y>{} and is_assignable<Y&, decltype(A{}*M{}*X{}+B{}*Y{})>{}, int> =0> void gemv(char trans, size_t m, size_t n, A const& a, M* ma, size_t lda, X* x, size_t incx, B b, Y* y, size_t incy){BLAS(zgemv)(trans, m, n, a, (z const*)ma, lda, (z const*)x, incx, b, (z*)y, incy);}
//template<class SX, class SY, enable_if_t<is_s<SX>{} and is_s<SY>{} and is_assignable<SY&, SX&>{},int> =0> void copy(size_t n, SX* x, size_t incx, SY* y, size_t incy){BLAS(scopy)(n, ( float const*)(x), incx, ( float *)(y), incy);}
//template<class DX, class DY, enable_if_t<is_d<DX>{} and is_d<DY>{} and is_assignable<DY&, DX&>{},int> =0> void copy(size_t n, DX* x, size_t incx, DY* y, size_t incy){BLAS(dcopy)(n, ( double const*)(x), incx, ( double *)(y), incy);}
//template<class CX, class CY, enable_if_t<is_c<CX>{} and is_c<CY>{} and is_assignable<CY&, CX&>{},int> =0> void copy(size_t n, CX* x, size_t incx, CY* y, size_t incy){BLAS(ccopy)(n, (std::complex<float > const*)(x), incx, (std::complex<float >*)(y), incy);}
//template<class ZX, class ZY, enable_if_t<is_z<ZX>{} and is_z<ZY>{} and is_assignable<ZY&, ZX&>{},int> =0> void copy(size_t n, ZX* x, size_t incx, ZY* y, size_t incy){BLAS(zcopy)(n, (std::complex<double> const*)(x), incx, (std::complex<double>*)(y), incy);}
}
template<class T>
struct blas2{
// template<class S>
// static v trsv(char ulA, char transA, char di, S m, T const* A, S lda, T* X, S incx) = delete;
};
template<> struct blas2<s>{template<class... As> static v trsv(As... as){BLAS(strsv)(as...);}};
template<> struct blas2<d>{template<class... As> static v trsv(As... as){BLAS(dtrsv)(as...);}};
template<> struct blas2<c>{template<class... As> static v trsv(As... as){BLAS(ctrsv)(as...);}};
template<> struct blas2<s>{template<class... As> static v trsv(As... as) {BLAS(strsv)(as...);}};
template<> struct blas2<d>{template<class... As> static v trsv(As... as) {BLAS(dtrsv)(as...);}};
template<> struct blas2<c>{template<class... As> static v trsv(As... as) {BLAS(ctrsv)(as...);}};
template<> struct blas2<z>{template<class... As> static auto trsv(As... as)->decltype(BLAS(ztrsv)(as...)){BLAS(ztrsv)(as...);}};
namespace core{
template<typename TconstP, typename TP, typename S=std::size_t, typename C=char> v trsv(C ulA, C transA, C diA, S n, TconstP A, S lda, TP X, S incx){blas2<std::decay_t<typename std::pointer_traits<TP>::element_type>>::trsv(ulA, transA, diA, n, A, lda, X, incx);}
}
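// A minimal, self-contained sketch (illustrative names only) of the dispatch
// technique used by core::trsv above: the scalar type is recovered from the
// pointer argument via std::pointer_traits and selects a per-type wrapper.

```cpp
#include <cassert>
#include <memory>       // std::pointer_traits
#include <type_traits>  // std::decay_t

template<class T> struct blas2_like;  // primary template left undefined, like blas2<T>
template<> struct blas2_like<float>  { static char tag(){ return 's'; } };
template<> struct blas2_like<double> { static char tag(){ return 'd'; } };

// deduce the element type from any pointer-like argument, as core::trsv does
template<class TP>
char dispatch(TP /*X*/){
	using elem = std::decay_t<typename std::pointer_traits<TP>::element_type>;
	return blas2_like<elem>::tag();
}
```
// e.g. dispatching on a float* selects the 's' wrapper, on a double* the 'd' one.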
#undef xgemv
@@ -295,55 +441,156 @@ template<typename TconstP, typename TP, typename S=std::size_t, typename C=char>
///////////////////////////////////////////////////////////////////////////////
// LEVEL 3
#define xsyrk(T) template<class UL, class C, class S> v syrk( UL ul, C transA, S n, S k, T alpha, T const* A, S lda, T beta, T* CC, S ldc){\
MULTI_MARK_SCOPE("cpu_syrk"); BLAS(T##syrk)( ul, transA, BC(n), BC(k), alpha, A, BC(lda), beta, CC, BC(ldc));}
namespace core{
using std::is_convertible_v;
using std::pointer_traits;
using std::enable_if_t;
using std::max;
#define xherk(T) \
template<class UL, class C, class S, class ALPHA, class AAP, class AA = typename pointer_traits<AAP>::element_type, class BETA, class CCP, class CC = typename pointer_traits<CCP>::element_type, class Real = typename T::value_type,\
enable_if_t< \
is_##T<AA>{} and is_##T<CC>{} and is_assignable<CC&, decltype(ALPHA{}*AA{}*AA{})>{} and \
is_convertible_v<AAP, AA*> and is_convertible_v<CCP, CC*> \
, int> =0> \
v herk( UL ul, C transA, S n, S k, ALPHA const* alpha, AAP aa, S lda, BETA const* beta, CCP cc, S ldc) \
/*=delete;*/ \
{ \
if(transA == 'N' or transA == 'n') MULTI_ASSERT1( lda >= max(1l, n) ); else MULTI_ASSERT1( lda >= max(1l, k) ); \
MULTI_ASSERT1( ldc >= max(1l, n) ); \
MULTI_MARK_SCOPE("cpu_herk"); BLAS(T##herk)( ul, transA, BC(n), BC(k), *(Real const*)alpha, aa, BC(lda), *(Real const*)beta, cc, BC(ldc)); \
}
#define xgemm(T) \
template<class ALPHA, class AAP, class AA = typename pointer_traits<AAP>::element_type, class BBP, class BB = typename pointer_traits<BBP>::element_type, class BETA, class CCP, class CC = typename pointer_traits<CCP>::element_type, \
enable_if_t< \
is_##T<AA>{} and is_##T<BB>{} and is_##T<CC>{} and is_assignable<CC&, decltype(ALPHA{}*AA{}*BB{})>{} and \
is_convertible_v<AAP, AA*> and is_convertible_v<BBP, BB*> and is_convertible_v<CCP, CC*> \
, int> =0 > \
v gemm(char transA, char transB, ssize_t m, ssize_t n, ssize_t k, ALPHA const* alpha, AAP aa, ssize_t lda, BBP bb, ssize_t ldb, BETA const* beta, CCP cc, ssize_t ldc) \
{ \
MULTI_MARK_SCOPE("cpu_gemm"); \
using std::max; \
if(transA =='N') MULTI_ASSERT1(lda >= max(1l, m)); else MULTI_ASSERT1(lda >= max(1l, k)); \
if(transB =='N') MULTI_ASSERT1(ldb >= max(1l, k)); else MULTI_ASSERT1(ldb >= max(1l, n)); \
MULTI_ASSERT1( aa != cc ); \
MULTI_ASSERT1( bb != cc ); \
MULTI_ASSERT1(ldc >= max(ssize_t{1}, m)); \
if(*beta != 0.) MULTI_ASSERT1((is_assignable<CC&, decltype(ALPHA{}*AA{}*BB{} + BETA{}*CC{})>{})); \
BLAS(T##gemm)(transA, transB, BC(m), BC(n), BC(k), *(T const*)alpha, (T const*)static_cast<AA*>(aa), BC(lda), (T const*)static_cast<BB*>(bb), BC(ldb), *(T const*)beta, (T*)static_cast<CC*>(cc), BC(ldc)); \
}
xgemm(s) xgemm(d) xgemm(c) xgemm(z)
#undef xgemm
#define xtrsm(T) \
template<class ALPHA, class AAP, class AA = typename pointer_traits<AAP>::element_type, class BBP, class BB = typename pointer_traits<BBP>::element_type, \
enable_if_t< \
is_##T<AA>{} and is_##T<BB>{} and is_assignable<BB&, decltype(AA{}*BB{}/ALPHA{})>{} and is_assignable<BB&, decltype(ALPHA{}*BB{}/AA{})>{} and \
is_convertible_v<AAP, AA*> and is_convertible_v<BBP, BB*> \
,int> =0> \
v trsm(char side, char ul, char transA, char diag, ssize_t m, ssize_t n, ALPHA alpha, AAP aa, ssize_t lda, BBP bb, ssize_t ldb){ \
MULTI_MARK_SCOPE("cpu_trsm"); \
assert( side == 'L' or side == 'R' ); \
assert( ul == 'U' or ul == 'L' ); \
assert( transA == 'N' or transA == 'T' or transA == 'C' ); \
assert( diag == 'U' or diag == 'N' ); \
MULTI_ASSERT1( m >= 0 and n >= 0 ); \
using std::max; \
if(side == 'L') MULTI_ASSERT1(lda >= max(ssize_t{1}, m)); else if(side == 'R') assert( lda >= max(ssize_t{1}, n) ); \
MULTI_ASSERT1( ldb >= max(ssize_t{1}, m) ); \
BLAS(T##trsm)(side, ul, transA, diag, BC(m), BC(n), alpha, (T const*)static_cast<AA*>(aa), BC(lda), (T*)static_cast<BB*>(bb), BC(ldb)); \
}
xtrsm(s) xtrsm(d) xtrsm(c) xtrsm(z)
#undef xtrsm
xsyrk(s) xsyrk(d) xsyrk(c) xsyrk(z)
xherk(c) xherk(z)
}
#undef xsyrk
#undef xherk
#undef xtrsm
#undef BC
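// The per-type wrappers above are generated with an X-macro: one #define per
// BLAS routine, instantiated once per precision letter (xgemm(s) xgemm(d) ...)
// and then #undef'd. Toy version of the same pattern (scale_* names are
// illustrative only):

```cpp
#include <cassert>

// one definition stamped out per scalar type, mirroring xgemm(s) xgemm(d) ...
#define DEFINE_SCALE(T) inline T scale_##T(T x){ return T(2) * x; }
DEFINE_SCALE(float)
DEFINE_SCALE(double)
#undef DEFINE_SCALE  // macro scoped to this section, as with xgemm/xsyrk/xherk
```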
}}}
struct context{ // stateless (and thread safe)
template<class... As>
static auto axpy(As... as)
->decltype(core::axpy(as...)){
return core::axpy(as...);}
template<class... As>
static auto gemv(As... as)
->decltype(core::gemv(as...)){
return core::gemv(as...);}
template<class... As>
static auto gemm(As&&... as)
->decltype(core::gemm(std::forward<As>(as)...)){
return core::gemm(std::forward<As>(as)...);}
template<class... As>
static auto dot(As&&... as)
->decltype(core::dot(std::forward<As>(as)...)){
return core::dot(std::forward<As>(as)...);}
template<class... As>
static auto dotc(As&&... as)
->decltype(core::dotc(std::forward<As>(as)...)){
return core::dotc(std::forward<As>(as)...);}
template<class... As>
static auto dotu(As&&... as)
->decltype(core::dotu(std::forward<As>(as)...)){
return core::dotu(std::forward<As>(as)...);}
template<class... As>
static auto trsm(As&&... as)
->decltype(core::trsm(std::forward<As>(as)...)){
return core::trsm(std::forward<As>(as)...);}
template<class... As>
static auto herk(As&&... as)
->decltype(core::herk(std::forward<As>(as)...)){
return core::herk(std::forward<As>(as)...);}
};
template<class Context> struct is_context : std::false_type{};
template<> struct is_context<context> : std::true_type{};
template<> struct is_context<context&&> : std::true_type{};
template<> struct is_context<context&> : std::true_type{};
template<> struct is_context<context const&> : std::true_type{};
template<> struct is_context<void*&> : std::true_type{};
namespace core{
template<class Context, class... As>
auto copy(Context&&, As... as)
->decltype(core::copy(as...)){
return core::copy(as...);}
}
template<class TPtr, std::enable_if_t<std::is_convertible<TPtr, typename std::pointer_traits<TPtr>::element_type*>{}, int> =0>
blas::context* default_context_of(TPtr const&){return {};}
}
}}
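// The blas::context above is stateless: each static member perfect-forwards to
// the matching core:: free function, and the trailing-return decltype removes
// the member from overload resolution when no such core function exists.
// A minimal sketch under hypothetical names (core_sketch, context_sketch):

```cpp
#include <cassert>
#include <utility>  // std::forward

namespace core_sketch{ inline int add(int a, int b){ return a + b; } }

// stateless dispatcher: the trailing decltype makes the member SFINAE-away
// when core_sketch::add is not callable with the given arguments
struct context_sketch{
	template<class... As>
	static auto add(As&&... as)
	->decltype(core_sketch::add(std::forward<As>(as)...)){
		return core_sketch::add(std::forward<As>(as)...);}
};
```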
///////////////////////////////////////////////////////////////////////////////
#if not __INCLUDE_LEVEL__ // _TEST_MULTI_ADAPTORS_BLAS_CORE
#include "../../array.hpp"
#include "../../utility.hpp"
#include<complex>
#include<cassert>
#include<iostream>
#include<numeric>
#include<algorithm>
using std::cout;
namespace multi = boost::multi;
int main(){}
#endif


@@ -1,17 +1,25 @@
#ifdef COMPILATION// -*-indent-tabs-mode:t;c-basic-offset:4;tab-width:4;autowrap:nil;-*-
$CXXX $CXXFLAGS -include"boost/log/trivial.hpp" -D'MULTI_MARK_SCOPE(MsG)=BOOST_LOG_TRIVIAL(trace)<<MsG' -DBOOST_LOG_DYN_LINK $0 -o $0x `pkg-config --cflags --libs cudart-11.0 cublas-11.0 blas` -lboost_unit_test_framework -lboost_log -lboost_thread -lboost_system -lboost_log_setup -lpthread&&$0x&&rm $0x;exit
#endif
// © Alfredo A. Correa 2019-2020
#ifndef MULTI_ADAPTORS_BLAS_CUDA_HPP
#define MULTI_ADAPTORS_BLAS_CUDA_HPP
#include "../blas/../../config/MARK.hpp" // MULTI_MARK_SCOPE
#include "../../adaptors/blas/core.hpp" // is_context
#include "../../memory/adaptors/cuda/ptr.hpp"
#include "../../memory/adaptors/cuda/managed/ptr.hpp"
#include "../../memory/adaptors/cuda/managed/allocator.hpp"
#include<cublas_v2.h>
#include "../cuda/cublas/error.hpp"
#include<thrust/complex.h>
#define DECLRETURN(ExpR) ->decltype(ExpR){return ExpR;}
#define JUSTRETURN(ExpR) {return ExpR;}
@@ -21,68 +29,16 @@ exit;$CXX $0 -o $0x `pkg-config --libs blas` -lcudart -lcublas&&$0x&&rm $0x;exit
#include<system_error>
namespace boost{
namespace multi{
enum class cublas_error : typename std::underlying_type<cublasStatus_t>::type{
success = CUBLAS_STATUS_SUCCESS,
not_initialized = CUBLAS_STATUS_NOT_INITIALIZED,
allocation_failed = CUBLAS_STATUS_ALLOC_FAILED,
invalid_value = CUBLAS_STATUS_INVALID_VALUE,
architecture_mismatch = CUBLAS_STATUS_ARCH_MISMATCH,
mapping_error = CUBLAS_STATUS_MAPPING_ERROR,
execution_failed = CUBLAS_STATUS_EXECUTION_FAILED,
internal_error = CUBLAS_STATUS_INTERNAL_ERROR,
not_supported = CUBLAS_STATUS_NOT_SUPPORTED,
license_error = CUBLAS_STATUS_LICENSE_ERROR
};
std::string inline cublas_string(enum cublas_error err){ //https://stackoverflow.com/questions/13041399/equivalent-of-cudageterrorstring-for-cublas
switch(err){
case cublas_error::success : return "CUBLAS_STATUS_SUCCESS";
case cublas_error::not_initialized : return "CUBLAS_STATUS_NOT_INITIALIZED";
case cublas_error::allocation_failed : return "CUBLAS_STATUS_ALLOC_FAILED";
case cublas_error::invalid_value : return "CUBLAS_STATUS_INVALID_VALUE";
case cublas_error::architecture_mismatch: return "CUBLAS_STATUS_ARCH_MISMATCH";
case cublas_error::mapping_error : return "CUBLAS_STATUS_MAPPING_ERROR";
case cublas_error::execution_failed : return "CUBLAS_STATUS_EXECUTION_FAILED";
case cublas_error::internal_error : return "CUBLAS_STATUS_INTERNAL_ERROR";
case cublas_error::not_supported : return "CUBLAS_STATUS_NOT_SUPPORTED";
case cublas_error::license_error : return "CUBLAS_STATUS_LICENSE_ERROR";
}
return "cublas status <unknown>";
}
struct cublas_error_category : std::error_category{
char const* name() const noexcept override{return "cublas wrapper";}
std::string message(int err) const override{return cublas_string(static_cast<enum cublas_error>(err));}
static error_category& instance(){static cublas_error_category instance; return instance;}
};
inline std::error_code make_error_code(cublas_error err) noexcept{
return std::error_code(int(err), cublas_error_category::instance());
}
/*
template<class CublasFunction>
auto cublas_call(CublasFunction f){
return [=](auto... args){
auto s = static_cast<enum cublas_error>(f(args...));
if( s != cublas_error::success ) throw std::system_error{make_error_code(s), "cannot call cublas function "};
#ifdef _MULTI_CUBLAS_ALWAYS_SYNC
cudaDeviceSynchronize();
#endif
};
}*/
//#define CUBLAS_(FunctionPostfix) boost::multi::cublas_call(cublas##FunctionPostfix)
#define CUBLAS_CALL(CodE) \
	MULTI_MARK_SCOPE("multi::cublas::"#CodE); \
	auto s = static_cast<enum boost::multi::cuda::cublas::error>(CodE); \
	cudaDeviceSynchronize(); /*TODO make this more specific to managed ptr and specific handle*/ \
	if(s != boost::multi::cuda::cublas::error::success) throw std::system_error{boost::multi::cuda::cublas::make_error_code(s), "cannot call cublas function "#CodE};
}}
namespace std{template<> struct is_error_code_enum<::boost::multi::cublas_error> : true_type{};}
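// The cublas_error machinery above follows the standard std::error_code
// recipe: a scoped enum, a singleton std::error_category, a make_error_code
// overload, and an is_error_code_enum specialization. Self-contained sketch
// with a toy enum (toy_error stands in for cublas_error):

```cpp
#include <cassert>
#include <string>
#include <system_error>

enum class toy_error{ success = 0, failed = 1 };  // stands in for cublas_error

struct toy_error_category : std::error_category{
	char const* name() const noexcept override{ return "toy"; }
	std::string message(int ev) const override{ return ev == 0 ? "success" : "failed"; }
	static std::error_category const& instance(){ static toy_error_category c; return c; }
};

inline std::error_code make_error_code(toy_error e) noexcept{
	return std::error_code(static_cast<int>(e), toy_error_category::instance());
}

// opts the enum into implicit conversion to std::error_code
namespace std{ template<> struct is_error_code_enum<toy_error> : true_type{}; }
```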
cublasStatus_t cublasZdot (cublasHandle_t handle, int n,
const double2 *x, int incx,
const double2 *y, int incy,
double2 *result) = delete;
namespace boost{
namespace multi{
@@ -136,8 +92,27 @@ template<class T = void> struct cublas3{};
DEFINE_CUBLAS1(S, s);
DEFINE_CUBLAS1(D, d);
DEFINE_CUBLAS1(C, c);
DEFINE_CUBLAS1(Z, z);
#define DEFINE_CUBLAS1_COMPLEX(UppeR, LowR, ReaLUppeR, ReaLLowR) \
template<> struct cublas1<UppeR>{ \
template<class...As> static auto iamax(As...as){return cublasI##LowR##amax(as...);} \
/*amin */ \
template<class...As> static auto asum (As...as){return cublas##ReaLUppeR##LowR##asum (as...);} \
/*axpy */ \
template<class...As> static auto copy (As...as){return cublas##UppeR##copy (as...);} \
template<class...As> static auto dot (As...as){return cublas##UppeR##dotu (as...);} \
template<class...As> static auto dotu (As...as){return cublas##UppeR##dotu (as...);} \
template<class...As> static auto dotc (As...as){return cublas##UppeR##dotc (as...);} \
template<class...As> static auto nrm2 (As...as){return cublas##UppeR##nrm2 (as...);} \
/*rot */ \
/*rotg */ \
/*rotmg*/ \
template<class...As> static auto scal (As...as){return cublas##UppeR##scal (as...);} \
/*swap */ \
}
DEFINE_CUBLAS1_COMPLEX(C, c, S, s);
DEFINE_CUBLAS1_COMPLEX(Z, z, D, d);
template<class T> struct nrm2_result;//{using type = T;};
template<> struct nrm2_result<S>{using type = S;};
@@ -149,13 +124,19 @@ template<> struct cublas1<void>{
// 2.5.1. cublasI<t>amax() https://docs.nvidia.com/cuda/cublas/index.html#cublasi-lt-t-gt-amax
template<class T> static cublasStatus_t iamax(cublasHandle_t handle, int n, const T* x, int incx, int *result ){return cublas1<T>::iamax(handle, n, x, incx, result);}
// 2.5.3. cublas<t>asum() https://docs.nvidia.com/cuda/cublas/index.html#cublas-lt-t-gt-asum
template<class T1, class T2> static cublasStatus_t asum (cublasHandle_t handle, int n, T1 const* x, int incx, T2* result ){return cublas1<T1>::asum(handle, n, x, incx, result);}
// 2.5.5. cublas<t>copy() https://docs.nvidia.com/cuda/cublas/index.html#cublas-lt-t-gt-copy
template<class T> static cublasStatus_t copy (cublasHandle_t handle, int n, const T* x, int incx, T* y, int incy){return cublas1<T>::copy(handle, n, x, incx, y, incy);}
// 2.5.6. cublas<t>dot() https://docs.nvidia.com/cuda/cublas/index.html#cublas-lt-t-gt-dot
template<class T> static auto dot(cublasHandle_t handle, int n, const T* x, int incx, const T* y, int incy, T* result)
->decltype(cublas1<T>::dot(handle, n, x, incx, y, incy, result)){MULTI_MARK_SCOPE("function dot");
return cublas1<T>::dot(handle, n, x, incx, y, incy, result);}
template<class T> static auto dotu(cublasHandle_t handle, int n, const T* x, int incx, const T* y, int incy, T* result)
->decltype(cublas1<T>::dotu(handle, n, x, incx, y, incy, result)){MULTI_MARK_SCOPE("function dotu");
return cublas1<T>::dotu(handle, n, x, incx, y, incy, result);}
template<class T> static auto dotc(cublasHandle_t handle, int n, const T* x, int incx, const T* y, int incy, T* result)
->decltype(cublas1<T>::dotc(handle, n, x, incx, y, incy, result)){MULTI_MARK_SCOPE("function dotc");
return cublas1<T>::dotc(handle, n, x, incx, y, incy, result);}
// 2.5.7. cublas<t>nrm2() https://docs.nvidia.com/cuda/cublas/index.html#cublas-lt-t-gt-nrm2
template<class T> static auto nrm2(cublasHandle_t handle, int n,
const T *x, int incx, typename nrm2_result<T>::type *result){return cublas1<T>::nrm2(handle, n, x, incx, result);}
@@ -203,10 +184,16 @@ template<> struct cublas3<Z>{
template<class...As> static auto trsm (As...as){ CUBLAS_CALL(cublasZtrsm(as...));}
};
template<class T> struct herk_scalar;
template<> struct herk_scalar<C>{using type = S;};
template<> struct herk_scalar<Z>{using type = D;};
template<class T> struct asum_scalar;
template<> struct asum_scalar<C>{using type = S;};
template<> struct asum_scalar<Z>{using type = D;};
template<class T> using herk_scalar_t = typename herk_scalar<T>::type;
template<> struct cublas3<void>{
// 2.7.1. cublas<t>gemm() https://docs.nvidia.com/cuda/cublas/index.html#cublas-lt-t-gt-gemm
template<class T> static auto gemm(cublasHandle_t handle,
@@ -216,7 +203,7 @@ template<> struct cublas3<void>{
const T *A, int lda,
const T *B, int ldb,
const T *beta,
T *C, int ldc){MULTI_MARK_SCOPE("cublas3 gemm"); return cublas3<T>::gemm(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);}
// 2.7.6. cublas<t>syrk() https://docs.nvidia.com/cuda/cublas/index.html#cublas-lt-t-gt-syrk
template<class T> static auto syrk(cublasHandle_t handle,
cublasFillMode_t uplo, cublasOperation_t trans,
@@ -226,13 +213,13 @@ template<> struct cublas3<void>{
const T *beta,
T *C, int ldc){return cublas3<T>::syrk(handle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc);}
// 2.7.13. cublas<t>herk() https://docs.nvidia.com/cuda/cublas/index.html#cublas-lt-t-gt-herk
template<class T2, class T3> static auto herk(cublasHandle_t handle,
cublasFillMode_t uplo, cublasOperation_t trans,
int n, int k,
const herk_scalar_t<T2> *alpha,
const T2 *A, int lda,
const herk_scalar_t<T2> *beta,
T3 *C, int ldc){return cublas3<T2>::herk(handle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc);}
// 2.7.10. cublas<t>trsm() https://docs.nvidia.com/cuda/cublas/index.html#cublas-lt-t-gt-trsm
template<class T> static auto trsm(cublasHandle_t handle,
cublasSideMode_t side, cublasFillMode_t uplo,
@@ -246,15 +233,21 @@ template<> struct cublas3<void>{
namespace cublas{
template<class T, std::enable_if_t<not std::is_integral<T>{}, int> =0> decltype(auto) translate(T t){return t;}
template<class T, std::enable_if_t<not std::is_copy_constructible<std::decay_t<T>>{}, int> =0> T& translate(T& t){return t;}
auto translate(std::complex<float> const * t){return reinterpret_cast<cublas::complex<float> const*>(t);}
auto translate(std::complex<float> * t){return reinterpret_cast<cublas::complex<float> *>(t);}
auto translate(std::complex<double> const* t){return reinterpret_cast<cublas::complex<double> const*>(t);}
auto translate(std::complex<double> * t){return reinterpret_cast<cublas::complex<double> *>(t);}
auto translate(thrust::complex<double> const* t){return reinterpret_cast<cublas::complex<double> const*>(t);}
auto translate(thrust::complex<double> * t){return reinterpret_cast<cublas::complex<double> *>(t);}
template<class T> auto translate(memory::cuda::ptr<T> p) DECLRETURN(translate(raw_pointer_cast(p)))
template<class T> auto translate(memory::cuda::managed::ptr<T> p) DECLRETURN(translate(raw_pointer_cast(p)))
//auto translate(context& c){return c;}
template<class T, std::enable_if_t<std::is_integral<T>{},int> = 0>
auto translate(T n){
assert(n <= +static_cast<T>(std::numeric_limits<int>::max()));
@@ -267,52 +260,82 @@ auto translate(char O)->cublasOperation_t{
return CUBLAS_OP_N;
}
struct context : std::unique_ptr<std::decay_t<decltype(*cublasHandle_t{})>, decltype(&cublasDestroy)>{
context() : std::unique_ptr<std::decay_t<decltype(*cublasHandle_t{})>, decltype(&cublasDestroy)>(
[]{cublasHandle_t h; cublasCreate(&h); return h;}(), &cublasDestroy
){}
int version() const{
int ret; cublasGetVersion(get(), &ret); return ret;
}
~context() noexcept = default;
// 2.4.7. cublasGetPointerMode()
auto get_pointer_mode() const{
cublasPointerMode_t ret; cublasGetPointerMode(get(), &ret);
return static_cast<enum pointer_mode>(ret);
}
// 2.4.8. cublasSetPointerMode() https://docs.nvidia.com/cuda/cublas/index.html#cublassetpointermode
context& set_pointer_mode(enum pointer_mode m){
cublasSetPointerMode(get(), static_cast<cublasPointerMode_t>(m)); return *this;
}
//set_stream https://docs.nvidia.com/cuda/cublas/index.html#cublassetstream
//get_stream https://docs.nvidia.com/cuda/cublas/index.html#cublasgetstream
//get_pointer_mode https://docs.nvidia.com/cuda/cublas/index.html#cublasgetpointermode
//set_pointer_mode https://docs.nvidia.com/cuda/cublas/index.html#cublasgetpointermode
template<class...As> auto iamax(As...as) const DECLRETURN(cublas1<>::iamax(get(), translate(as)...))
template<class...As> auto asum (As...as) const DECLRETURN(cublas1<>::asum (get(), translate(as)...))
template<class...As> auto scal (As...as) const DECLRETURN(cublas1<>::scal (get(), translate(as)...))
template<class...As> auto dot (As...as) const DECLRETURN(cublas1<>::dot (get(), translate(as)...))
template<class...As> auto dotu (As...as) const DECLRETURN(cublas1<>::dotu (get(), translate(as)...))
template<class...As> auto dotc (As...as) const DECLRETURN(cublas1<>::dotc (get(), translate(as)...))
template<class S, class Ptr, class T>
auto nrm2(S n, Ptr p, S incx, memory::cuda::ptr<T> result) // no const because the method is not thread safe
->decltype(cublas1<>::nrm2 (get(), translate(n), translate(p), translate(incx), translate(result))){set_pointer_mode(pointer_mode::device);
auto r=cublas1<>::nrm2 (get(), translate(n), translate(p), translate(incx), translate(result)); set_pointer_mode(pointer_mode::host);
return r;
}
template<class S, class Ptr, class T>
auto nrm2(S n, Ptr p, S incx, T* result) const{
return cublas1<>::nrm2 (get(), translate(n), translate(p), translate(incx), translate(result));
}
template<class...As> auto copy (As...as) const DECLRETURN(cublas1<>::copy (get(), translate(as)...))
template<class...As> auto trsv (As...as) const{return cublas2<>::trsv(get(), translate(as)...);}
//struct context : std::unique_ptr<std::decay_t<decltype(*cublasHandle_t{})>, decltype(&cublasDestroy)>{
// context() : std::unique_ptr<std::decay_t<decltype(*cublasHandle_t{})>, decltype(&cublasDestroy)>(
// []{MULTI_MARK_SCOPE("multi::cublas::create context"); cublasHandle_t h; cublasCreate(&h); return h;}(), &cublasDestroy
// ){}
// int version() const{
// int ret; cublasGetVersion(get(), &ret); return ret;
// }
// context(context&& other) noexcept = default;
// ~context() noexcept = default;
//// 2.4.7. cublasGetPointerMode()
// auto get_pointer_mode() const{
// cublasPointerMode_t ret; cublasGetPointerMode(get(), &ret);
// return static_cast<enum pointer_mode>(ret);
// }
//// 2.4.8. cublasSetPointerMode() https://docs.nvidia.com/cuda/cublas/index.html#cublassetpointermode
// context& set_pointer_mode(enum pointer_mode m){
// cublasSetPointerMode(get(), static_cast<cublasPointerMode_t>(m)); return *this;
// }
// //set_stream https://docs.nvidia.com/cuda/cublas/index.html#cublassetstream
// //get_stream https://docs.nvidia.com/cuda/cublas/index.html#cublasgetstream
// //get_pointer_mode https://docs.nvidia.com/cuda/cublas/index.html#cublasgetpointermode
// //set_pointer_mode https://docs.nvidia.com/cuda/cublas/index.html#cublasgetpointermode
// template<class...As> auto iamax(As...as) const DECLRETURN(cublas1<>::iamax(get(), translate(as)...))
// template<class...As> auto asum (As...as) const DECLRETURN(cublas1<>::asum (get(), translate(as)...))
// template<class...As> auto scal (As...as) const DECLRETURN(cublas1<>::scal (get(), translate(as)...))
// template<class...As> auto dot (As...as) const DECLRETURN(cublas1<>::dot (get(), translate(as)...))
// template<class...As> auto dotu (As...as) const DECLRETURN(cublas1<>::dotu (get(), translate(as)...))
// template<class...As> auto dotc (As...as) const DECLRETURN(cublas1<>::dotc (get(), translate(as)...))
// template<class S, class Ptr, class T>
// auto nrm2(S n, Ptr p, S incx, memory::cuda::ptr<T> result) // no const because the method is not thread safe
// ->decltype(cublas1<>::nrm2 (get(), translate(n), translate(p), translate(incx), translate(result))){set_pointer_mode(pointer_mode::device);
// auto r=cublas1<>::nrm2 (get(), translate(n), translate(p), translate(incx), translate(result)); set_pointer_mode(pointer_mode::host);
// return r;
// }
// template<class S, class Ptr, class T>
// auto nrm2(S n, Ptr p, S incx, T* result) const{
// return cublas1<>::nrm2 (get(), translate(n), translate(p), translate(incx), translate(result));
// }
// template<class...As> auto copy (As...as) const DECLRETURN(cublas1<>::copy (get(), translate(as)...))
// template<class...As> auto trsv (As...as) const{return cublas2<>::trsv(get(), translate(as)...);}
template<typename... As> auto gemm(As... as) DECLRETURN(cublas3<>::gemm(get(), translate(as)...))
// template<typename... As> auto gemm(As... as) DECLRETURN(cublas3<>::gemm(get(), translate(as)...))
template<class...As> auto syrk (As...as) const{return cublas3<>::syrk(get(), translate(as)...);}
template<class...As> auto herk (As...as) const{return cublas3<>::herk(get(), translate(as)...);}
template<class...As> auto trsm (As...as) const{return cublas3<>::trsm(get(), translate(as)...);}
};
// template<class...As> auto syrk (As...as) const{return cublas3<>::syrk(get(), translate(as)...);}
// template<class...As> auto herk (As...as) const{return cublas3<>::herk(get(), translate(as)...);}
// template<class...As> auto trsm (As...as) const{return cublas3<>::trsm(get(), translate(as)...);}
//};
//context* get_default_context(){
// thread_local context instance;
// return &instance;
//}
}
}}
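// The cublas::context above owns its cublasHandle_t through std::unique_ptr
// with cublasDestroy as a function-pointer deleter, so the handle is released
// even when an exception unwinds the scope. Same RAII idiom with a toy
// C-style create/destroy pair (toy_* names are illustrative only):

```cpp
#include <cassert>
#include <memory>

// toy C-style handle API standing in for cublasCreate/cublasDestroy
struct toy_handle{ int live; };
inline toy_handle* toy_create(){ return new toy_handle{1}; }
inline void toy_destroy(toy_handle* h){ delete h; }

// unique_ptr with a function-pointer deleter releases the handle on scope exit
using handle_ptr = std::unique_ptr<toy_handle, decltype(&toy_destroy)>;
inline handle_ptr make_handle(){ return handle_ptr(toy_create(), &toy_destroy); }
```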
namespace boost{
namespace multi{
namespace blas{
template<> struct is_context<boost::multi::cublas::context> : std::true_type{};
template<> struct is_context<boost::multi::cublas::context&&> : std::true_type{};
template<> struct is_context<boost::multi::cublas::context&> : std::true_type{};
template<class T> boost::multi::cublas::context* default_context_of(memory::cuda:: ptr<T> const&){return boost::multi::cublas::get_default_context();}
template<class T> boost::multi::cublas::context* default_context_of(memory::cuda::managed::ptr<T> const&){return boost::multi::cublas::get_default_context();}
//template<class T> boost::multi::cublas::context default_context_of(memory::cuda::managed::ptr<T>){return {};}
//}
//namespace memory{namespace cuda{
// using boost::multi::blas::default_context_of; // to please nvcc 'default_context_of' should be declared prior to the call site or in namespace 'boost::multi::memory::cuda'
//}}
}
}}
@@ -337,7 +360,7 @@ auto asum(S n, cuda::ptr<ComplexTconst> x, S incx){
template<class...As> auto copy(As... as) DECLRETURN(cublas::context{}.copy(as...))
template<class...As> auto scal(As... as) DECLRETURN(cublas::context{}.scal(as...))
//template<class...As> auto dot (As... as) DECLRETURN(cublas::context{}.dot (as...))
template<class...As> auto dotu(As... as) DECLRETURN(cublas::context{}.dotu(as...))
template<class...As> auto dotc(As... as) DECLRETURN(cublas::context{}.dotc(as...))
template<class...As> auto nrm2(As... as) DECLRETURN(cublas::context{}.nrm2(as...))
@@ -364,8 +387,7 @@ auto trsv(char ul, char transA, char a_diag, S n, memory::cuda::ptr<Tconst> A, S
template<class... As>
auto gemm(As... as)
->decltype(cublas::context{}.gemm(as...)){
return cublas::context{}.gemm(as...);}
template<class Tconst, class T, class UL, class C, class S, class Real>
void syrk(UL ul, C transA, S n, S k, Real alpha, multi::memory::cuda::ptr<Tconst> A, S lda, Real beta, multi::memory::cuda::ptr<T> CC, S ldc){
@@ -429,7 +451,7 @@ using cuda::iamax;
using cuda::asum;
using cuda::copy;
using cuda::scal;
//using cuda::dot;
using cuda::dotu;
using cuda::dotc;
using cuda::nrm2;
@@ -455,17 +477,69 @@ auto trsm(Side /*cublasSideMode_t*/ side, /*cublasFillMode_t*/ Fill uplo, /*cubl
#if not __INCLUDE_LEVEL__ // _TEST_MULTI_ADAPTORS_BLAS_CUDA
#define BOOST_TEST_MODULE "C++ Unit Tests for Multi cuBLAS"
#define BOOST_TEST_DYN_LINK
#include<boost/test/unit_test.hpp>
#include "../../array.hpp"
#include "../../utility.hpp"
#include "../../adaptors/cuda.hpp"
#include "../../adaptors/blas.hpp"
#include "../../adaptors/blas/cuda.hpp"
#include<cassert>
namespace multi = boost::multi;
#if 0
BOOST_AUTO_TEST_CASE(multi_adaptors_blas_cuda_version){
multi::cublas::context c;
BOOST_REQUIRE( c.version() >= 10100 );
}
BOOST_AUTO_TEST_CASE(multi_adaptors_blas_cuda_iamax){
using complex = std::complex<double>;
complex const I{0,1};
{
multi::array<complex, 1> const A = {1. + 2.*I, 2., 3. + 3.*I, 4.};
using multi::blas::iamax;
BOOST_REQUIRE( iamax(A) == 2 );
}
{
multi::cuda::array<complex, 1> const A = {1. + 2.*I, 2., 3. + 3.*I, 4.};
using multi::blas::iamax;
BOOST_REQUIRE( iamax(A) == 2 );
}
{
multi::cuda::managed::array<complex, 1> const A = {1. + 2.*I, 2., 3. + 3.*I, 4.};
using multi::blas::iamax;
BOOST_REQUIRE( iamax(A) == 2 );
}
}
#endif
template<class T> void what(T&&) = delete;
BOOST_AUTO_TEST_CASE(multi_adaptors_blas_cuda_dot){
using complex = std::complex<double>;
complex const I{0,1};
multi::array<complex, 1> const A = {1. + 2.*I, 2., 3. + 3.*I, 4.};
multi::array<complex, 1> const B = {2. + 3.*I, 4., 5. + 6.*I, 7.};
namespace blas = multi::blas;
{
multi::cuda::array<complex, 1> const A_gpu = A, B_gpu = B;
using blas::dot;
BOOST_REQUIRE( dot(blas::C(A_gpu), B_gpu) == dot(blas::C(A), B) );
}
{
multi::cuda::managed::array<complex, 1> const A_mng = A, B_mng = B;
using blas::dot;
BOOST_REQUIRE( dot(blas::C(A_mng), A_mng) == dot(blas::C(A), A) );
}
}
#endif
#endif


@@ -0,0 +1,167 @@
#ifdef COMPILATION_INSTRUCTIONS
/usr/local/cuda-11.1/bin/nvcc -x cu -std=c++17 -use_fast_math -lpthread -D_REENTRANT -DBOOST_PP_VARIADICS -Xcudafe "--diag_suppress=implicit_return_from_non_void_function" --extended-lambda --expt-relaxed-constexpr $0 -o $0x `pkg-config --cflags --libs cudart-11.0 cublas-11.0 blas` -lboost_unit_test_framework -DBOOST_LOG_DYN_LINK -lboost_log -lboost_thread -lboost_system -lboost_log_setup -lpthread -lboost_timer&&$0x&&rm $0x; exit
#endif
// © Alfredo A. Correa 2020-2021
#define BOOST_TEST_MODULE "C++ Unit Tests for Multi cuBLAS gemm"
#define BOOST_TEST_DYN_LINK
#include<boost/test/unit_test.hpp>
#include <boost/timer/timer.hpp>
//#include"boost/log/trivial.hpp"
//#define MULTI_MARK_SCOPE(MsG) BOOST_LOG_TRIVIAL(trace)<<MsG
//#include "../../../../adaptors/cublas/context.hpp"
#include "../../../cuda/cublas.hpp"
#include "../../../../array.hpp"
#include "../../../../adaptors/cuda.hpp"
#include "../../../../adaptors/blas.hpp"
#include<random>
namespace multi = boost::multi;
BOOST_AUTO_TEST_CASE(multi_adaptors_blas_cuda_gemm_complex_3x2_3x2){
using complex = std::complex<double>; complex const I{0, 1};
namespace blas = multi::blas;
multi::array<complex, 2> const a = {
{1. + 2.*I, 5. + 2.*I},
{9. - 1.*I, 9. + 1.*I},
{1. + 1.*I, 2. + 2.*I}
};
multi::array<complex, 2> const b = {
{ 11. - 2.*I, 5. + 2.*I},
{ 7. - 3.*I, 2. + 1.*I},
{ 8. - 1.*I, 1. + 1.*I}
};
{
{
multi::array<complex, 2> c({2, 2});
c = blas::gemm(1., blas::H(a), b); // c=ab, c⸆=b⸆a⸆
BOOST_REQUIRE( c[1][0] == 125.-84.*I );
}
}
{
multi::cuda::array<complex, 2> const a_gpu = a;
multi::cuda::array<complex, 2> const b_gpu = b;
{
multi::cuda::array<complex, 2> c_gpu({2, 2});
c_gpu = blas::gemm(1., blas::H(a_gpu), b_gpu); // c=ab, c⸆=b⸆a⸆
BOOST_REQUIRE( c_gpu[1][0] == 125.-84.*I );
}
{
auto c_gpu = +blas::gemm(1.0, blas::H(a_gpu), b_gpu);
BOOST_REQUIRE( c_gpu[1][0] == 125.-84.*I );
}
}
{
multi::cuda::managed::array<complex, 2> const a_gpu = a;
multi::cuda::managed::array<complex, 2> const b_gpu = b;
{
multi::cuda::managed::array<complex, 2> c_gpu({2, 2});
blas::gemm(1., blas::H(a_gpu), b_gpu, 0., c_gpu); // c=ab, c⸆=b⸆a⸆
BOOST_REQUIRE( c_gpu[1][0] == 125.-84.*I );
}
{
auto c_gpu = +blas::gemm(1.0, blas::H(a_gpu), b_gpu);
BOOST_REQUIRE( c_gpu[1][0] == 125.-84.*I );
}
}
}
//BOOST_AUTO_TEST_CASE(multi_adaptors_blas_cuda_gemm_complex_3x2_3x2_with_context){
// using complex = std::complex<double>; complex const I{0, 1};
// namespace blas = multi::blas;
// multi::array<complex, 2> const a = {
// {1. + 2.*I, 5. + 2.*I},
// {9. - 1.*I, 9. + 1.*I},
// {1. + 1.*I, 2. + 2.*I}
// };
// multi::array<complex, 2> const b = {
// { 11. - 2.*I, 5. + 2.*I},
// { 7. - 3.*I, 2. + 1.*I},
// { 8. - 1.*I, 1. + 1.*I}
// };
// {
// {
// multi::blas::context ctx;
// multi::array<complex, 2> c({2, 2});
// blas::gemm(ctx, 1., blas::H(a), b, 0., c); // c=ab, c⸆=b⸆a⸆
// BOOST_REQUIRE( c[1][0] == 125.-84.*I );
// }
// }
// {
// multi::cublas::context ctx;
// multi::cuda::array<complex, 2> const a_gpu = a;
// multi::cuda::array<complex, 2> const b_gpu = b;
// {
// multi::cuda::array<complex, 2> c_gpu({2, 2});
// blas::gemm(ctx, 1., blas::H(a_gpu), b_gpu, 0., c_gpu); // c=ab, c⸆=b⸆a⸆
// BOOST_REQUIRE( c_gpu[1][0] == 125.-84.*I );
// }
// {
// auto c_gpu =+ blas::gemm(&ctx, blas::H(a_gpu), b_gpu);
// BOOST_REQUIRE( c_gpu[1][0] == 125.-84.*I );
// }
// }
// {
// multi::cublas::context ctx;
// multi::cuda::managed::array<complex, 2> const a_gpu = a;
// multi::cuda::managed::array<complex, 2> const b_gpu = b;
// {
// multi::cuda::managed::array<complex, 2> c_gpu({2, 2});
// blas::gemm(ctx, 1., blas::H(a_gpu), b_gpu, 0., c_gpu); // c=ab, c⸆=b⸆a⸆
// BOOST_REQUIRE( c_gpu[1][0] == 125.-84.*I );
// }
// {
// auto c_gpu =+ blas::gemm(&ctx, blas::H(a_gpu), b_gpu);
// BOOST_REQUIRE( c_gpu[1][0] == 125.-84.*I );
// }
// }
//}
#if 0
BOOST_AUTO_TEST_CASE(multi_adaptors_blas_cuda_gemm_context_timing){
using complex = std::complex<double>;//complex const I{0, 1};
multi::array<complex, 2> A({1000, 1000});
multi::array<complex, 2> B({1000, 1000});
multi::array<complex, 2> C({size(A), size(~B)});
A[99][99] = B[11][22] = C[33][44] = 1.0;
std::cerr<< "memory " << (A.num_elements()+ B.num_elements() + C.num_elements())*sizeof(complex)/1e6 <<" MB"<<std::endl;
{
auto rand = [d=std::uniform_real_distribution<>{0., 10.}, g=std::mt19937{}]() mutable{return complex{d(g), d(g)};};
std::generate(A.elements().begin(), A.elements().end(), rand);
std::generate(B.elements().begin(), B.elements().end(), rand);
}
namespace blas = multi::blas;
{
boost::timer::auto_cpu_timer t; // 2.398206s
for(auto i = 0; i != 10; ++i){
blas::context ctx;
blas::gemm(ctx, 1, A, B, 0, C);
}
}
using device_array = multi::cuda::array<complex, 2>;
{
device_array A_gpu = A, B_gpu = B, C_gpu({size(A), size(~B)});
boost::timer::auto_cpu_timer t; // 0.707426s
for(auto i = 0; i != 10; ++i){
multi::cublas::context ctx;
blas::gemm(ctx, 1, A_gpu, B_gpu, 0, C_gpu);
}
}
{
device_array A_gpu = A, B_gpu = B, C_gpu({size(A), size(~B)});
boost::timer::auto_cpu_timer t; // 0.613534s
multi::cublas::context ctx;
for(auto i = 0; i != 10; ++i) blas::gemm(ctx, 1, A_gpu, B_gpu, 0, C_gpu);
}
}
#endif

@ -1,7 +1,7 @@
#ifdef COMPILATION_INSTRUCTIONS
nvcc -x cu`#$CXX` $0 -o $0x `pkg-config --libs blas` -Wno-deprecated-declarations -lcudart -lcublas -lboost_unit_test_framework&&$0x&&rm $0x; exit
$CXXX $CXXFLAGS $0 -o $0x `pkg-config --libs blas` -Wno-deprecated-declarations `pkg-config --cflags --libs cudart-11.0 cublas-11.0 blas` -lboost_unit_test_framework&&$0x&&rm $0x; exit
#endif
// © Alfredo A. Correa 2019
// © Alfredo A. Correa 2019-2020
#define BOOST_TEST_MODULE "C++ Unit Tests for Multi cuBLAS iamax"
#define BOOST_TEST_DYN_LINK
@ -14,8 +14,7 @@ nvcc -x cu`#$CXX` $0 -o $0x `pkg-config --libs blas` -Wno-deprecated-declaratio
namespace multi = boost::multi;
BOOST_AUTO_TEST_CASE(multi_adaptors_blas_cuda_iamax){
using complex = std::complex<double>;
complex const I{0,1};
using complex = std::complex<double>; complex const I{0, 1};
{
multi::array<complex, 1> const A = {1. + 2.*I, 2., 3. + 3.*I, 4.};
using multi::blas::iamax;

@ -1,204 +1,100 @@
#ifdef COMPILATION// -*-indent-tabs-mode:t;c-basic-offset:4;tab-width:4;-*-
$CXX $0 -o $0x `pkg-config --libs blas` -lboost_unit_test_framework&&$0x&&rm $0x;exit
#endif
// -*-indent-tabs-mode:t;c-basic-offset:4;tab-width:4;autowrap:nil;-*-
// © Alfredo A. Correa 2019-2020
#ifndef MULTI_ADAPTORS_BLAS_DOT_HPP
#define MULTI_ADAPTORS_BLAS_DOT_HPP
#include "../blas/core.hpp"
#include "../blas/operations.hpp"
#include "../../array.hpp"
#include "../../config/NODISCARD.hpp"
#include "../blas/numeric.hpp" // is_complex
#include "../blas/operations.hpp" // blas::C
namespace boost{
namespace multi{
namespace blas{
namespace multi::blas{
template<class A, std::enable_if_t<not is_conjugated<A>{}, int> =0>
auto dot_base_aux(A&& a){return base(a);}
using core::dot ;
using core::dotu;
using core::dotc;
template<class A, std::enable_if_t< is_conjugated<A>{}, int> =0>
auto dot_base_aux(A&& a){return underlying(base(a));}
template<class X1D, class Y1D, class R, std::enable_if_t<not is_complex<X1D>{}, int> =0>
auto dot(X1D const& x, Y1D const& y, R&& r){
return core::dot(size(x), base(x), stride(x), base(y), stride(y), &r), std::forward<R>(r);}
template<class X1D, class Y1D, class R, std::enable_if_t< is_complex<X1D>{}, int> =0>
auto dot(X1D const& x, Y1D const& y, R&& r){
auto base_x = dot_base_aux(x);
auto base_y = dot_base_aux(y);
using core::dotu;
using core::dotc;
if(not is_conjugated<X1D>{} and not is_conjugated<Y1D>{}) dotu(size(x), base_x, stride(x), base_y, stride(y), &r);
else if(not is_conjugated<X1D>{} and is_conjugated<Y1D>{}) dotc(size(x), base_y, stride(y), base_x, stride(x), &r);
else if( is_conjugated<X1D>{} and not is_conjugated<Y1D>{}) dotc(size(x), base_x, stride(x), base_y, stride(y), &r);
else assert(0);
return std::forward<R>(r);
template<class Context, class XIt, class Size, class YIt, class RPtr>
auto dot_n(Context&& ctxt, XIt x_first, Size count, YIt y_first, RPtr rp){
if constexpr(is_complex<typename XIt::value_type>{}){
;;;; if constexpr (!is_conjugated<XIt>{} and !is_conjugated<YIt>{}) std::forward<Context>(ctxt).dotu(count, base(x_first) , stride(x_first), base(y_first), stride(y_first), rp);
else if constexpr (!is_conjugated<XIt>{} and is_conjugated<YIt>{}) std::forward<Context>(ctxt).dotc(count, underlying(base(y_first)), stride(y_first), base(x_first), stride(x_first), rp);
else if constexpr ( is_conjugated<XIt>{} and !is_conjugated<YIt>{}) std::forward<Context>(ctxt).dotc(count, underlying(base(x_first)), stride(x_first), base(y_first), stride(y_first), rp);
else if constexpr ( is_conjugated<XIt>{} and is_conjugated<YIt>{}) static_assert(!sizeof(XIt*), "not implemented in blas");
}else{
std::forward<Context>(ctxt).dot (count, base(x_first) , stride(x_first), base(y_first), stride(y_first), rp);
}
struct{XIt x_last; YIt y_last;} ret{x_first + count, y_first + count};
return ret;
}
template<class X1D, class Y1D, class Alloc>
NODISCARD("when last argument is an allocator")
auto alloc_dot(X1D const& x, Y1D const& y, Alloc const& alloc){
return dot(x, y, multi::array<typename X1D::value_type, 0, Alloc>(0, alloc) );
template<class XIt, class Size, class YIt, class RPtr>
auto dot_n(XIt x_first, Size count, YIt y_first, RPtr rp)
->decltype(dot_n(blas::context{}, x_first, count, y_first, rp)){
return dot_n(blas::context{}, x_first, count, y_first, rp);}
template<class Context, class X1D, class Y1D, class R>
R&& dot(Context&& ctxt, X1D const& x, Y1D const& y, R&& r){
assert( size(x) == size(y) );
return blas::dot_n(std::forward<Context>(ctxt), begin(x), size(x), begin(y), &r), std::forward<R>(r);
}
template<class X1D, class Y1D>
NODISCARD("when last argument is read-only")
auto dot(X1D const& x, Y1D const& y){
return alloc_dot(x, y, common(get_allocator(x), get_allocator(y)));
template<class X1D, class Y1D, class R>
R&& dot(X1D const& x, Y1D const& y, R&& r){
assert( size(x) == size(y) );
return blas::dot_n(blas::context{}, begin(x), size(x), begin(y), &r), std::forward<R>(r);
}
}}}
template<class Context, class ItX, class Size, class ItY>
class dot_ptr{
Context ctxt_ = {};
ItX x_first_;
Size count_;
ItY y_first_;
protected:
dot_ptr(Context&& ctxt, ItX x_first, Size count, ItY y_first) : ctxt_{std::forward<Context>(ctxt)}, x_first_{x_first}, count_{count}, y_first_{y_first}{}
public:
dot_ptr(dot_ptr const&) = default;
template<class ItOut, class Size2>
friend constexpr auto copy_n(dot_ptr first, [[maybe_unused]] Size2 count, ItOut d_first)
->decltype(blas::dot_n(std::declval<Context>(), std::declval<ItX>(), Size{} , std::declval<ItY>(), d_first), d_first + count){assert(count == 1);
return blas::dot_n(first.ctxt_ , first.x_first_ , first.count_, first.y_first_ , d_first), d_first + count;}
#if not __INCLUDE_LEVEL__ // _TEST_MULTI_ADAPTORS_BLAS_DOT
template<class ItOut, class Size2>
friend constexpr auto uninitialized_copy_n(dot_ptr first, Size2 count, ItOut d_first)
->decltype(blas::dot_n(std::declval<Context>(), std::declval<ItX>(), Size{} , std::declval<ItY>(), d_first), d_first + count){assert(count == 1);
return blas::dot_n(first.ctxt_ , first.x_first_ , first.count_, first.y_first_ , d_first), d_first + count;}
// ->decltype(copy_n(first, count, d_first)){ // nvcc is not detecting friend copy_n
// return copy_n(first, count, d_first);}
};
#define BOOST_TEST_MODULE "C++ Unit Tests for Multi cuBLAS dot"
#define BOOST_TEST_DYN_LINK
#include<boost/test/unit_test.hpp>
template<class Context, class X, class Y, class Ptr = dot_ptr<Context, typename X::const_iterator, typename X::size_type, typename Y::const_iterator>>
struct dot_ref : private Ptr{
dot_ref(dot_ref const&) = delete;
using decay_type = decltype(typename X::value_type{}*typename Y::value_type{});
dot_ref(Context&& ctxt, X const& x, Y const& y) : Ptr{std::forward<Context>(ctxt), begin(x), size(x), begin(y)}{assert(size(x)==size(y));}
constexpr Ptr const& operator&() const&{return *this;}
decay_type decay() const{decay_type r; copy_n(operator&(), 1, &r); return r;}
operator decay_type() const{return decay();}
decay_type operator+() const{return decay();}
};
#include "../../array.hpp"
#include "../../utility.hpp"
template<class Context, class X, class Y> [[nodiscard]]
dot_ref<Context, X, Y> dot(Context&& ctxt, X const& x, Y const& y){return {std::forward<Context>(ctxt), x, y};}
#include "../blas/nrm2.hpp"
template<class X, class Y> [[nodiscard]]
dot_ref<blas::context, X, Y> dot(X const& x, Y const& y){return {blas::context{}, x, y};}
#include<cassert>
#include<numeric> // inner_product
namespace multi = boost::multi;
namespace blas = multi::blas;
template<class M, typename = decltype(std::declval<M const&>()[0]), typename = decltype(std::declval<M const&>()[0][0])>
decltype(auto) print_2D(M const& C){
using std::cout;
using multi::size;
for(int i = 0; i != size(C); ++i){
for(int j = 0; j != size(C[i]); ++j)
cout<< C[i][j] <<' ';
cout<<std::endl;
}
return cout<<std::endl;
namespace operators{
template<class X1D, class Y1D> [[nodiscard]]
auto operator,(X1D const& x, Y1D const& y)
->decltype(dot(x, y)){
return dot(x, y);}
}
template<class M, typename = decltype(std::declval<M const&>()[0])>//, typename = decltype(std::declval<M const&>()[0])>
decltype(auto) print_1D(M const& C){
using std::cout; using multi::size;
for(int i = 0; i != size(C); ++i) cout<< C[i] <<' ';
return cout<<std::endl;
}
}
BOOST_AUTO_TEST_CASE(multi_blas_dot_impl_real){
multi::array<double, 2> const cA = {
{1., 2., 3., 4.},
{5., 6., 7., 8.},
{9., 10., 11., 12.}
};
using blas::dot;
{
double d = dot(cA[1], cA[2]);
BOOST_REQUIRE( d==std::inner_product(begin(cA[1]), begin(cA[2]), end(cA[1]), 0.) );
}
{
double d = NAN;
dot(cA[1], cA[2], d);
BOOST_REQUIRE( d==std::inner_product(begin(cA[1]), begin(cA[2]), end(cA[1]), 0.) );
}
{
double d = NAN;
auto d2 = dot(cA[1], cA[2], d);
BOOST_REQUIRE( d==d2 );
}
{
multi::array<double, 0> d;
auto d2 = dot(cA[1], cA[2], d);
BOOST_REQUIRE( d == std::inner_product(begin(cA[1]), begin(cA[2]), end(cA[1]), 0.) );
}
{
double d = dot(cA[1], cA[2]);
BOOST_REQUIRE( d == std::inner_product(begin(cA[1]), begin(cA[2]), end(cA[1]), 0.) );
BOOST_REQUIRE( dot(cA[1], cA[2]) == dot(cA[2], cA[1]) );
}
{
using blas::nrm2;
using std::sqrt;
{
double s;
dot(cA[1], cA[1], s);
assert( sqrt(s)==nrm2(cA[1]) );
}
}
}
#if 1
BOOST_AUTO_TEST_CASE(multi_blas_dot_impl_complex){
namespace blas = multi::blas;
using complex = std::complex<double>; complex const I{0, 1};
multi::array<complex, 2> const A = {
{1. + I, 2. + 3.*I, 3.+2.*I, 4.-9.*I},
{5. + 2.*I, 6. + 6.*I, 7.+2.*I, 8.-3.*I},
{9. + 1.*I, 10. + 9.*I, 11.+1.*I, 12.+2.*I}
};
// print_2D(A);
// print_1D(A[1]);
{
complex c; blas::dot(A[1], A[1], c);
BOOST_TEST( c == std::inner_product(begin(A[1]), end(A[1]), begin(A[1]), complex{0}) );
}
{
complex c = blas::dot(A[1], A[1]);
BOOST_TEST( c == std::inner_product(begin(A[1]), end(A[1]), begin(A[1]), complex{0}) );
}
{
// conjugated(A[1]);
// complex c; dot(A[1], conjugated(A[1]), c);
// BOOST_TEST( c == std::inner_product(begin(A[1]), end(A[1]), begin(A[1]), complex{0}, std::plus<>{}, [](auto a, auto b){return a*conj(b);}) );
}
#if 0
{
{
{
multi::array<complex, 1> cc = {1., 2., 3.};
dot(A[1], conjugated(A[1]), cc[0]);
BOOST_TEST( cc[0] == std::inner_product(begin(A[1]), end(A[1]), begin(A[1]), complex{0}, std::plus<>{}, [](auto a, auto b){return a*conj(b);}) );
}
{
auto const c = dot(A[1], conjugated(A[1]));
std::cout<< c() <<std::endl;
BOOST_REQUIRE( c() == std::inner_product(begin(A[1]), end(A[1]), begin(A[1]), complex{0}, std::plus<>{}, [](auto a, auto b){return a*conj(b);}) );
BOOST_REQUIRE( dot(A[1], conjugated(A[1])) == dot(conjugated(A[1]), A[1]) );
}
{
auto const c = dot(conjugated(A[1]), A[1]);
std::cout<< c() <<std::endl;
BOOST_REQUIRE( c() == std::inner_product(begin(A[1]), end(A[1]), begin(A[1]), complex{0}, std::plus<>{}, [](auto a, auto b){return a*conj(b);}) );
}
{
multi::array<complex, 1> a = {1. + I, 2. + 3.*I, 3.+2.*I, 4.-9.*I};
multi::array<complex, 1> b = {5. + 2.*I, 6. + 6.*I, 7.+2.*I, 8.-3.*I};
BOOST_REQUIRE( dot(a , b )()== 19. - 27.*I );
BOOST_REQUIRE( dot(a , conjugated(b))()==121. - 43.*I );
BOOST_REQUIRE( dot(conjugated(a), b )()==121. + 43.*I );
// BOOST_REQUIRE( dot(conjugated(a), conjugated(b))() == 19. + 27.*I );
}
}
#endif
}
#endif
#endif
#endif

@ -1,5 +1,5 @@
#ifdef COMPILATION// -*-indent-tabs-mode:t;c-basic-offset:4;tab-width:4;-*-
$CXX $0 -o $0x `pkg-config --libs blas` -lboost_unit_test_framework&&$0x&&rm $0x;exit
$CXXX $CXXFLAGS $0 -o $0x `pkg-config --libs blas` -lboost_unit_test_framework&&$0x&&rm $0x;exit
#endif
// © Alfredo A. Correa 2019-2020
@ -14,19 +14,13 @@ namespace boost{
namespace multi{
namespace blas{
//enum class uplo : char{L='L', U='U'};
//enum uplo : char{
// L = 'U',
// U = 'L'
//};
enum class filling : char{
lower = 'U', //static_cast<char>(uplo::U),
upper = 'L' //static_cast<char>(uplo::L)
lower = 'U',
upper = 'L'
};
static constexpr filling U = filling::upper;
static constexpr filling L = filling::lower;
MAYBE_UNUSED static constexpr filling U = filling::upper;
MAYBE_UNUSED static constexpr filling L = filling::lower;
filling flip(filling side){
switch(side){
@ -64,8 +58,8 @@ filling detect_triangular_aux(A2D const& A){
template<class A2D>
filling detect_triangular(A2D const& A){
#if __cpp_if_constexpr>=201606
if constexpr(not is_hermitized<A2D>()){
#if defined(__cpp_if_constexpr)
if constexpr(not is_conjugated<A2D>{}){
using blas::asum;
for(auto i = size(A); i != 0; --i){
auto const asum_up = asum(A[i-1]({i, A[i-1].size()}));
@ -119,7 +113,6 @@ decltype(auto) print(M const& C){
namespace multi = boost::multi;
using complex = std::complex<double>;
auto const I = complex(0., 1.);
BOOST_AUTO_TEST_CASE(multi_adaptors_blas_side){
return;

File diff suppressed because it is too large

@ -1,290 +1,152 @@
#ifdef COMPILATION// -*-indent-tabs-mode:t;c-basic-offset:4;tab-width:4;-*-
$CXX -DADD_ $0 -o $0x `pkg-config --libs blas` -lboost_unit_test_framework \
`#-Wl,-rpath,/usr/local/Wolfram/Mathematica/12.0/SystemFiles/Libraries/Linux-x86-64 -L/usr/local/Wolfram/Mathematica/12.0/SystemFiles/Libraries/Linux-x86-64 -lmkl_intel_ilp64 -lmkl_sequential -lmkl_core` \
&&$0x&&rm $0x;exit
#endif
// -*-indent-tabs-mode:t;c-basic-offset:4;tab-width:4;autowrap:nil;-*-
// © Alfredo A. Correa 2019-2020
#ifndef MULTI_ADAPTORS_BLAS_GEMV_HPP
#define MULTI_ADAPTORS_BLAS_GEMV_HPP
#include "../blas/core.hpp"
#include "../blas/dot.hpp"
#include "./../../detail/../utility.hpp"
#include "././../../detail/.././array_ref.hpp"
//#include "../../../../detail/utility.hpp"
namespace boost{
namespace multi{
namespace blas{
namespace multi::blas{
//struct trans{enum : char{N='N', T='T', C='C'};};
using core::gemv;
//struct conj{template<class T> auto operator()(T const& t) const{using std::conj; return conj(t);}};
template<class Trans, class T, class ItA, class Size, class Itx, class Ity>
Ity gemv_n(Trans IN, T a, ItA A_first, Size n, Itx x_first, T beta, Ity y_first){
assert( IN == 'N' or IN == 'T' or IN == 'C' );
// assert( A_first->stride() == 1 );
gemv(IN, size(*A_first), n, a, base(A_first), stride(A_first), base(x_first), stride(x_first), beta, base(y_first), stride(y_first));
switch(IN){
case 'N' : return y_first + size(*A_first);
case 'T' : return y_first + n;
case 'C' : return y_first + n;
template<class Context, class A, class MIt, class Size, class XIt, class B, class YIt>
auto gemv_n(Context&& ctxt, A a, MIt m_first, Size count, XIt x_first, B b, YIt y_first){
assert(m_first->stride()==1 or m_first.stride()==1); // blas doesn't implement this case
assert( x_first.base() != y_first.base() );
if constexpr(not is_conjugated<MIt>{}){
assert( y_first.base() != m_first.base() );
;;;; if(m_first .stride()==1) std::forward<Context>(ctxt).gemv('N', count, m_first->size(), a, m_first.base() , m_first->stride(), x_first.base(), x_first.stride(), b, y_first.base(), y_first.stride());
else if(m_first->stride()==1) std::forward<Context>(ctxt).gemv('T', m_first->size(), count, a, m_first.base() , m_first. stride(), x_first.base(), x_first.stride(), b, y_first.base(), y_first.stride());
else assert(0);
}else{
assert( y_first.base() != underlying(m_first.base()) );
;;;; if(m_first->stride()==1) std::forward<Context>(ctxt).gemv('C', m_first->size(), count, a, underlying(m_first.base()), m_first. stride(), x_first.base(), x_first.stride(), b, y_first.base(), y_first.stride());
else if(m_first. stride()==1) assert(0); // not implemented in blas (use cblas?)
else assert(0); // not implemented in blas
}
return y_first;
struct{
MIt m_last;
YIt y_last;
} ret{m_first + count, y_first + count};
return ret;
}
template<class Trans, class T, class ItA, class Itx, class Ity>
Ity gemv(Trans IN, T a, ItA A_first, ItA A_last, Itx x_first, T beta, Ity y_first){
// assert( stride(A_first) == stride(A_last) and A_first->size()==A_last->size() );
return gemv_n(IN, a, A_first, std::distance(A_first, A_last), x_first, beta, y_first);
template<class A, class MIt, class Size, class XIt, class B, class YIt>
auto gemv_n(A a, MIt m_first, Size count, XIt x_first, B b, YIt y_first){
return gemv_n(blas::context{}, a, m_first, count, x_first, b, y_first);
}
template<class Trans, class T, class A2D, class X1D, class Y1D>
Y1D&& gemv(Trans IN, T a, A2D const& A, X1D const& x, T beta, Y1D&& y){
assert( not(IN == 'T') or size(A) == size(y) );
assert( not(IN == 'N') or size(A) == size(x) );
assert( not(IN == 'C') or size(A) == size(x) );
using std::begin; using std::end;
auto e = gemv(IN, a, begin(A), end(A), begin(x), beta, begin(y)); (void)e;
assert( e == end(y) );
return std::forward<Y1D>(y);
} //y := alpha*A*x + beta*y,
template<class T, class A2D, class X1D, class Y1D>
Y1D&& gemv(T a, A2D const& A, X1D const& x, T beta, Y1D&& y){
return stride(A)==1?gemv('N', a, rotated(A), x, beta, y):gemv('T', a, A, x, beta, y);
template<class A, class M, class V, class B, class W>
W&& gemv(A const& a, M const& m, V const& v, B const& b, W&& w){
assert(size( m) == size(w) );
assert(size(~m) == size(v) );
gemv_n(a, begin(m), size(m), begin(v), b, begin(w));
return std::forward<W>(w);
}
template<class A2D, class X1D, class Y1D>
Y1D&& gemv(A2D const& A, X1D const& x, Y1D&& y){
return gemv(1., A, x, 0., std::forward<Y1D>(y));
template<class Scalar, class It2D, class It1D, class Context>
class gemv_iterator{
Scalar alpha_ = 1.;
It2D m_it_;
It1D v_first_;
Context ctxt_;
public:
using difference_type = typename std::iterator_traits<It2D>::difference_type;
using value_type = typename std::iterator_traits<It1D>::value_type;
using pointer = void;
using reference = void;
using iterator_category = std::random_access_iterator_tag;
// using iterator_category = std::output_iterator_tag;
// friend difference_type distance(gemv_iterator const& a, gemv_iterator const& b){assert(a.v_first_ == b.v_first_);
// return b.m_it_ - a.m_it_;
// }
friend difference_type operator-(gemv_iterator const& a, gemv_iterator const& b){assert(a.v_first_ == b.v_first_);
return a.m_it_ - b.m_it_;
}
template<class It1DOut>
friend auto copy_n(gemv_iterator first, difference_type count, It1DOut result){
if constexpr
(std::is_same<Context, void>{}) blas::gemv_n( first.alpha_, first.m_it_, count, first.v_first_, 0., result);
else blas::gemv_n(first.ctxt_, first.alpha_, first.m_it_, count, first.v_first_, 0., result);
return result + count;
}
template<class It1DOut>
friend auto copy(gemv_iterator first, gemv_iterator last, It1DOut result){return copy_n(first, last - first, result);}
template<class It1DOut>
friend auto uninitialized_copy(gemv_iterator first, gemv_iterator last, It1DOut result){
static_assert(std::is_trivially_default_constructible<typename It1DOut::value_type>{});
return copy(first, last, result);
}
gemv_iterator(Scalar alpha, It2D m_it, It1D v_first, Context ctxt)
: alpha_{alpha}, m_it_{m_it}, v_first_{v_first}, ctxt_{ctxt}{}
value_type operator*() const{return 0.;}
};
template<class Scalar, class It2D, class It1D, class DecayType, class Context>
class gemv_range{
Scalar alpha_ = 1.;
It2D m_begin_;
It2D m_end_;
It1D v_first_;
Context ctxt_ = {};
public:
gemv_range(gemv_range const&) = delete;
gemv_range(Scalar alpha, It2D m_first, It2D m_last, It1D v_first)
: alpha_{alpha}, m_begin_{m_first}, m_end_{m_last}, v_first_{v_first}{
assert(m_begin_.stride() == m_end_.stride());
}
gemv_range(Context&& ctxt, Scalar alpha, It2D m_first, It2D m_last, It1D v_first)
: alpha_{alpha}, m_begin_{m_first}, m_end_{m_last}, v_first_{v_first}, ctxt_{std::forward<Context>(ctxt)}{
assert(m_begin_.stride() == m_end_.stride());
}
using iterator = gemv_iterator<Scalar, It2D, It1D, Context>;
using decay_type = DecayType;
iterator begin() const{return {alpha_, m_begin_, v_first_, ctxt_};}
iterator end() const{return {alpha_, m_end_ , v_first_, ctxt_};}
size_type size() const{return end() - begin();}
typename decay_type::extensions_type extensions() const{return typename decay_type::extensions_type{{0, size()}};}
decay_type decay() const{
decay_type ret;
ret = *this;
return ret;
}
friend auto operator+(gemv_range const& self){return self.decay();}
template<class V>
friend V&& operator+=(V&& v, gemv_range const& s){
if constexpr
(std::is_same<Context, void*>{}) blas::gemv_n( s.alpha_, s.m_begin_, s.m_end_ - s.m_begin_, s.v_first_, 1., v.begin());
else blas::gemv_n(s.ctxt_, s.alpha_, s.m_begin_, s.m_end_ - s.m_begin_, s.v_first_, 1., v.begin());
return std::forward<V>(v);
}
};
template<class Scalar, class M, class V>
auto gemv(Scalar s, M const& m, V const& v)
{//->decltype(gemv_range{s, m, v}){
assert(size(~m) == size(v));
return gemv_range<Scalar, typename M::const_iterator, typename V::const_iterator, typename V::decay_type, blas::context>(s, m.begin(), m.end(), v.begin());}
template<class Context, class Scalar, class M, class V>
auto gemv(Context&& ctxt, Scalar s, M const& m, V const& v)
//->decltype(gemv_range{s, m, v})
{ assert(size(~m) == size(v));
return gemv_range<Scalar, typename M::const_iterator, typename V::const_iterator, typename V::decay_type, Context&&>(std::forward<Context>(ctxt), s, m.begin(), m.end(), v.begin());}
namespace operators{
template<class M, class V>
auto operator%(M const& m, V const& v)
->decltype(+blas::gemv(1., m, v)){
return +blas::gemv(1., m, v);}
}
//template<class A, class B, class RowIt, class ConstIt, class It>
//It gemv(A const& a, RowIt M_first, RowIt M_last, ConstIt X_first, B const& b, It Y_first){
// using std::transform; using std::inner_product; using std::begin; using std::end;
// return transform(M_first, M_last, Y_first, Y_first, [&](auto const& r, auto const& e){
// return a*inner_product(begin(r), end(r), X_first, typename std::iterator_traits<It>::value_type{0}) + b*e;
// });
//}
//template<class A, class B, class RowIt, class ConstIt, class It, class Conj>
//It gemv(A const& a, RowIt M_first, RowIt M_last, ConstIt X_first, B const& b, It Y_first, Conj&& /*conj*/){
// std::cout<< __LINE__ <<std::endl;
// using std::transform; using std::inner_product; using std::begin; using std::end;
// return transform(M_first, M_last, Y_first, Y_first, [&](auto&& r, auto&& e){
// return a*inner_product(begin(r), end(r), X_first, typename std::iterator_traits<It>::value_type{0}/*, std::plus<>{}, [&](auto const& a, auto const& b){return conj(a)*b;}*/) + b*e;
// });
//}
#if 0
template<class AB, class RowIt, class ConstIt, class It>
It gemv(AB const& a, RowIt M_first, RowIt M_last, ConstIt X_first, AB const& b, It Y_first){
assert( stride(M_first) == stride(M_last) );
std::cout<< __LINE__ <<std::endl;
using std::distance;
#ifndef NO_BLAS
if(stride(*M_first) == 1){std::cout<< __LINE__ <<std::endl; gemv(blas::trans::T, M_first->size(), std::distance(M_first, M_last), a, base(M_first), stride( M_first), base(X_first), stride(X_first), b, base(Y_first), stride(Y_first));}
else if(stride( M_first) == 1){std::cout<< __LINE__ <<std::endl; gemv(blas::trans::N, std::distance(M_first, M_last),M_first->size(), a, base(M_first), stride(*M_first), base(X_first), stride(X_first), b, base(Y_first), stride(Y_first));}
else
#endif
#ifdef NO_GENERICBLAS
assert(0);
#else
gemv<AB, AB>(a, M_first, M_last, X_first, b, Y_first);
#endif
return Y_first + std::distance(M_first, M_last);
}
template<class RowIt, class ConstIt, class It>
It gemv(std::complex<double> const& a, RowIt M_first, RowIt M_last, ConstIt X_first, std::complex<double> const& b, It Y_first, blas::conj&&){
using AB = std::complex<double>;
std::cout<< __LINE__ <<std::endl;
assert( stride(M_first) == stride(M_last) );
using std::distance;
if(stride( M_first) == 1){
std::cout<< __LINE__ << " " << stride(*M_first) << " " << std::distance(M_first, M_last) << " " << M_first->size() << std::endl;
gemv(trans::C, std::distance(M_first, M_last), M_first->size(), a, base(M_first), stride(*M_first), base(X_first), stride(X_first), b, base(Y_first), stride(Y_first));
std::cout<< __LINE__ << " " << stride(*M_first) << " " << std::distance(M_first, M_last) << " " << M_first->size() << std::endl;
// assert(0);
}else{
gemv<AB, AB>(a, M_first, M_last, X_first, b, Y_first, blas::conj{});
}
#if 0
#ifndef NO_BLAS
if(stride( M_first) == 1){
std::cout<< __LINE__ << " " << stride(*M_first) << " " << std::distance(M_first, M_last) << " " << M_first->size() << std::endl;
gemv('C', std::distance(M_first, M_last), M_first->size(), a, base(M_first), stride(*M_first), base(X_first), stride(X_first), b, base(Y_first), stride(Y_first));
std::cout<< __LINE__ << " " << stride(*M_first) << " " << std::distance(M_first, M_last) << " " << M_first->size() << std::endl;
// assert(0);
}
else
#endif
#ifdef NO_GENERICBLAS
assert(0);
#else
gemv<AB, AB>(a, M_first, M_last, X_first, b, Y_first, blas::conj{});
#endif
#endif
std::cout<< __LINE__ << " " << stride(*M_first) << " " << std::distance(M_first, M_last) << " " << M_first->size() << std::endl;
return Y_first;// + std::distance(M_first, M_last);
}
template<class T, class A2D, class X1D, class Y1D>
Y1D gemv(T const& a, A2D const& A, X1D const& x, T const& b, Y1D&& y){
std::cout<< __LINE__ <<std::endl;
assert( size(x)==std::get<1>(shape(A)) and size(y)==std::get<0>(shape(A)) );
auto last = gemv(a, begin(A), end(A), begin(x), b, begin(y));
assert( last == end(y) );
return std::forward<Y1D>(y);
// else if(IN == 'N')
// assert( std::get<1>(strides(A)) == 1 ); // gemv is not implemented for arrays with non-leading stride != 1
auto m = std::get<1>(shape(A));
auto n = std::get<0>(shape(A));
if(std::get<1>(strides(A)) == 1){
// if(IN=='T' or IN=='H')
assert( size(x)==std::get<1>(A.shape()) and size(y)==std::get<0>(A.shape()));
gemv(trans::T, m, n, a, origin(A), std::get<0>(strides(A)), origin(x), stride(x), b, origin(y), stride(y));
}else if(std::get<0>(strides(A)) == 1){
assert( size(x) == std::get<0>(A.shape()) and size(y) == std::get<1>(A.shape()));
gemv(trans::N, m, n, a, origin(A), std::get<1>(strides(A)), origin(x), stride(x), b, origin(y), stride(y));
}else{assert(0);}
return std::forward<Y1D>(y);
} //y := alpha*A*x + beta*y,
template<class T, class A2D, class X1D, class Y1D, class Conj>
Y1D&& gemv(T const& a, A2D const& A, X1D const& x, T const& b, Y1D&& y, Conj&& c){
std::cout<<__LINE__ <<std::endl;
assert( size(x)==std::get<1>(shape(A)) and size(y)==std::get<0>(shape(A)) );
// auto last =
gemv(a, begin(A), end(A), begin(x), b, begin(y), std::forward<Conj>(c));
std::cout<< __LINE__ <<std::endl;
// assert( last == end(y) );
return std::forward<Y1D>(y);
} //y := alpha*A*x + beta*y,
#endif
}}}
#if not __INCLUDE_LEVEL__ // _TEST_MULTI_ADAPTORS_BLAS_GEMV
#define BOOST_TEST_MODULE "C++ Unit Tests for Multi blas gemv"
#define BOOST_TEST_DYN_LINK
#include<boost/test/unit_test.hpp>
#include "../../array.hpp"
#include "../../utility.hpp"
#include<complex>
#include<cassert>
#include<iostream>
#include<numeric>
#include<algorithm>
using std::cout;
namespace multi = boost::multi;
BOOST_AUTO_TEST_CASE(multi_blas_gemv){
namespace blas = multi::blas;
using std::abs;
{
multi::array<double, 2> const M = {
{ 9., 24., 30., 9.},
{ 4., 10., 12., 7.},
{14., 16., 36., 1.}
};
assert( M[2][0] == 14. );
multi::array<double, 1> const X = {1.1,2.1,3.1, 4.1};
multi::array<double, 1> Y = {4.,5.,6.};
double a = 1.1, b = 1.2;
blas::gemv('T', a, M, X, b, Y); // y = a*M*x + b*y
multi::array<double, 1> const Y3 = {214.02, 106.43, 188.37}; // = 1.1 {{9., 24., 30., 9.}, {4., 10., 12., 7.}, {14., 16., 36., 1.}}.{1.1, 2.1, 3.1, 4.1} + 1.2 {4., 5., 6.}
// cout << abs(Y[1] - Y3[1]) << std::endl;
assert( abs(Y[1] - Y3[1]) < 2e-14 );
}
#if 0
{
double const M[3][4] = {
{ 9., 24., 30., 9.},
{ 4., 10., 12., 7.},
{14., 16., 36., 1.}
};
assert( M[2][0] == 14. );
double const X[4] = {1.1,2.1,3.1, 4.1};
double Y[3] = {4.,5.,6.};
double const a = 1.1;
double const b = 1.2;
gemv('T', a, M, X, b, Y); // y = a*M*x + b*y
double Y3[3] = {214.02, 106.43, 188.37};
assert( abs(Y[1] - Y3[1]) < 2e-14 );
}
{
multi::array<double, 2> const M = {
{ 9., 24., 30., 9.},
{ 4., 10., 12., 7.},
{14., 16., 36., 1.}
};
assert( M[2][0] == 14. );
multi::array<double, 1> const X = {1.1,2.1,3.1};
multi::array<double, 1> Y = {4.,5.,6., 7.};
double a = 1.8, b = 1.6;
gemv('N', a, M, X, b, Y); // y = a*(M^T)*x + b*y, y^T = a*(x^T)*M + b*y^T
multi::array<double, 1> const Y3 = {117.46, 182.6, 315.24, 61.06}; // =1.8 Transpose[{{9., 24., 30., 9.}, {4., 10., 12., 7.}, {14., 16., 36., 1.}}].{1.1, 2.1, 3.1} + 1.6 {4., 5., 6., 7.}
assert( abs(Y[2] - Y3[2]) < 1e-13 );
}
{
multi::array<double, 2> const M = {
{ 9., 24., 30., 9.},
{ 4., 10., 12., 7.},
{14., 16., 36., 1.}
};
assert( M[2][0] == 14. );
multi::array<double, 1> const X = {1.1,2.1,3.1, 4.1};
multi::array<double, 1> Y = {4.,5.,6.};
double a = 1.1, b = 1.2;
gemv(a, M, X, b, Y); // y = a*M*x + b*y
multi::array<double, 1> const Y3 = {214.02, 106.43, 188.37}; // = 1.1 {{9., 24., 30., 9.}, {4., 10., 12., 7.}, {14., 16., 36., 1.}}.{1.1, 2.1, 3.1, 4.1} + 1.2 {4., 5., 6.}
assert( std::abs(Y[1] - Y3[1]) < 2e-14 );
}
{
double const M[3][4] = {
{ 9., 24., 30., 9.},
{ 4., 10., 12., 7.},
{14., 16., 36., 1.}
};
assert( M[2][0] == 14. );
double const X[4] = {1.1,2.1,3.1, 4.1};
double Y[3] = {4.,5.,6.};
double a = 1.1, b = 1.2;
gemv(a, M, X, b, Y); // y = a*M*x + b*y
double const Y3[3] = {214.02, 106.43, 188.37};
assert( std::abs(Y[1] - Y3[1]) < 2e-14 );
}
{
multi::array<double, 2> const M = {
{ 9., 4., 14.},
{24., 10., 16.},
{30., 12., 36.},
{9., 7., 1.}
}; assert( M[0][2] == 14. );
multi::array<double, 1> const X = {1.1,2.1,3.1, 4.1};
multi::array<double, 1> Y = {4.,5.,6.};
double a = 1.1, b = 1.2;
gemv(a, rotated(M), X, b, Y); // y = a*M*x + b*y
multi::array<double, 1> const Y3 = {214.02, 106.43, 188.37}; // = 1.1 {{9., 24., 30., 9.}, {4., 10., 12., 7.}, {14., 16., 36., 1.}}.{1.1, 2.1, 3.1, 4.1} + 1.2 {4., 5., 6.}
assert( abs(Y[1] - Y3[1]) < 2e-14 );
}
#endif
}
#endif
#endif


@@ -1,5 +1,5 @@
#ifdef COMPILATION// -*-indent-tabs-mode:t;c-basic-offset:4;tab-width:4;-*-
$CXX -DADD_ $0 -o $0x -lblas -lboost_unit_test_framework&&$0x&&rm $0x;exit
$CXXX $CXXFLAGS -DADD_ $0 -o $0x `pkg-config --libs blas` -lboost_unit_test_framework&&$0x&&rm $0x;exit
#endif
// © Alfredo A. Correa 2019-2020
@@ -12,6 +12,8 @@ namespace boost{
namespace multi{
namespace blas{
using core::ger;
template<class T, class It1, class Size1, class It2, class Size2, class Out>
Out ger_n(T alpha, It1 x_first, Size1 x_n, It2 y_first, Size2 y_n, Out A_first){
assert( A_first->size() == x_n );
@@ -135,7 +137,6 @@ BOOST_AUTO_TEST_CASE(multi_blas_ger){
// a = {{2., 3.}, {1., 4.}, {1., 0.}}; GER[1, {1., 2., 5.}, {-2., 1.}, a]; Print[a] : {{0., 4.}, {-3., 6.}, {-9., 5.}}
// assert( A[1][1] == 6. );
}
return;
{
multi::array<double, 2> A = {
{0., 0.},
@@ -155,14 +156,15 @@ BOOST_AUTO_TEST_CASE(multi_blas_ger){
// assert( A[1][2] == 1. );
}
{
multi::array<double, 2> A = {
{2., 3., 6., 8.},
{4., 1., 6., 8.},
{0., 1., 6., 8.}
};
assert( A[1][2] == 6. );
multi::array<double, 1> const x = { 0., 1., 0.};
multi::array<double, 1> const y = { 0., 0., 1., 0.};
// multi::array<double, 2> A = {
// {2., 3., 6., 8.},
// {4., 1., 6., 8.},
// {0., 1., 6., 8.}
// };
// assert( A[1][2] == 6. );
// multi::array<double, 1> const x = { 0., 1., 0.};
// multi::array<double, 1> const y = { 0., 0., 1., 0.};
// multi::blas::ger(0., x, y, rotated(A)); //
// a = {{2., 3.}, {1., 4.}, {1., 0.}}; GER[1, {1., 2., 5.}, {-2., 1.}, a]; Print[a] : {{0., 4.}, {-3., 6.}, {-9., 5.}}


@@ -1,5 +1,5 @@
#ifdef COMPILATION// -*-indent-tabs-mode:t;c-basic-offset:4;tab-width:4;-*-
$CXX $0 -o $0x -lboost_unit_test_framework `pkg-config --libs blas` \
$CXXX $CXXFLAGS $0 -o $0x -lboost_unit_test_framework `pkg-config --libs blas` \
`#-Wl,-rpath,/usr/local/Wolfram/Mathematica/12.0/SystemFiles/Libraries/Linux-x86-64 -L/usr/local/Wolfram/Mathematica/12.0/SystemFiles/Libraries/Linux-x86-64 -lmkl_intel_ilp64 -lmkl_intel_thread -lmkl_core -liomp5` \
-lboost_timer &&$0x&&rm $0x; exit
#endif
@@ -27,49 +27,55 @@ namespace boost{
namespace multi{namespace blas{
template<class A, std::enable_if_t<not is_conjugated<A>{}, int> =0>
auto base_aux(A&& a){return base(a);}
auto base_aux(A&& a)
->decltype(base(a)){
return base(a);}
template<class A, std::enable_if_t< is_conjugated<A>{}, int> =0>
auto base_aux(A&& a){return underlying(base(a));}
auto base_aux(A&& a)
->decltype(underlying(base(a))){
return underlying(base(a));}
using core::herk;
template<class AA, class BB, class A2D, class C2D, class = typename A2D::element_ptr, std::enable_if_t<is_complex<C2D>{}, int> =0>
auto herk(filling c_side, AA alpha, A2D const& a, BB beta, C2D&& c)
->decltype(herk('\0', '\0', size(c), size( a), alpha, base_aux(a), stride(rotated(a)), beta, base_aux(c), stride(c)), std::forward<C2D>(c))
template<class AA, class BB, class A2D, class C2D, class = typename A2D::element_ptr, std::enable_if_t<is_complex_array<C2D>{}, int> =0>
C2D&& herk(filling c_side, AA alpha, A2D const& a, BB beta, C2D&& c)
//->decltype(herk('\0', '\0', c.size(), a.size(), &alpha, base_aux(a), stride(a.rotated()), &beta, base_aux(c), stride(c)), std::forward<C2D>(c))
{
assert( size(a) == size(c) );
assert( size(c) == size(rotated(c)) );
if( is_conjugated<C2D>{} ){ herk(flip(c_side), alpha, a, beta, hermitized(c)); return std::forward<C2D>(c);}
if(size(c)==0) return std::forward<C2D>(c);
assert( a.size() == c.size() );
assert( c.size() == rotated(c).size() );
if(c.size()==0) return std::forward<C2D>(c);
if constexpr(is_conjugated<C2D>{}){herk(flip(c_side), alpha, a, beta, hermitized(c)); return std::forward<C2D>(c);}
{
auto base_a = base_aux(a);
auto base_c = base_aux(c); // static_assert( not is_conjugated<C2D>{}, "!" );
if(is_conjugated<A2D>{}){
if constexpr(is_conjugated<A2D>{}){
// auto& ctxt = *blas::default_context_of(underlying(a.base()));
// if you get an error here, it may be because the header that declares the backend appropriate for your iterator type has not been included
if(stride(a)==1 and stride(c)!=1) core::herk(c_side==filling::upper?'L':'U', 'N', size(c), size(rotated(a)), alpha, base_a, stride(rotated(a)), beta, base_c, stride(c));
if(stride(a)==1 and stride(c)!=1) herk(c_side==filling::upper?'L':'U', 'N', size(c), size(rotated(a)), &alpha, base_a, stride(rotated(a)), &beta, base_c, stride(c));
else if(stride(a)==1 and stride(c)==1){
if(size(a)==1) herk(c_side==filling::upper?'L':'U', 'N', size(c), size(rotated(a)), alpha, base_a, stride(rotated(a)), beta, base_c, stride(c));
if(size(a)==1) herk(c_side==filling::upper?'L':'U', 'N', size(c), size(rotated(a)), &alpha, base_a, stride(rotated(a)), &beta, base_c, stride(c));
else assert(0);
}
else if(stride(a)!=1 and stride(c)==1) herk(c_side==filling::upper?'U':'L', 'C', size(c), size(rotated(a)), alpha, base_a, stride( a ), beta, base_c, stride(rotated(c)));
else if(stride(a)!=1 and stride(c)!=1) herk(c_side==filling::upper?'L':'U', 'C', size(c), size(rotated(a)), alpha, base_a, stride( a ), beta, base_c, stride( c ));
else if(stride(a)!=1 and stride(c)==1) herk(c_side==filling::upper?'U':'L', 'C', size(c), size(rotated(a)), &alpha, base_a, stride( a ), &beta, base_c, stride(rotated(c)));
else if(stride(a)!=1 and stride(c)!=1) herk(c_side==filling::upper?'L':'U', 'C', size(c), size(rotated(a)), &alpha, base_a, stride( a ), &beta, base_c, stride( c ));
else assert(0);
}else{
if(stride(a)!=1 and stride(c)!=1) herk(c_side==filling::upper?'L':'U', 'C', size(c), size(rotated(a)), alpha, base_a, stride( a ), beta, base_c, stride(c));
// auto& ctxt = *blas::default_context_of( a.base() );
;;;; if(stride(a)!=1 and stride(c)!=1) herk(c_side==filling::upper?'L':'U', 'C', size(c), size(rotated(a)), &alpha, base_a, stride( a ), &beta, base_c, stride(c));
else if(stride(a)!=1 and stride(c)==1){
if(size(a)==1) herk(c_side==filling::upper?'L':'U', 'N', size(c), size(rotated(a)), alpha, base_a, stride(rotated(a)), beta, base_c, stride(rotated(c)));
if(size(a)==1) herk(c_side==filling::upper?'L':'U', 'N', size(c), size(rotated(a)), &alpha, base_a, stride(rotated(a)), &beta, base_c, stride(rotated(c)));
else assert(0);
}
else if(stride(a)==1 and stride(c)!=1) assert(0);//case not implemented, herk(c_side==filling::upper?'L':'U', 'N', size(c), size(rotated(a)), alpha, base_a, stride(rotated(a)), beta, base(c), stride(c));
else if(stride(a)==1 and stride(c)==1) herk(c_side==filling::upper?'U':'L', 'N', size(c), size(rotated(a)), alpha, base_a, stride(rotated(a)), beta, base_c, stride(rotated(c)));
else if(stride(a)==1 and stride(c)==1) herk(c_side==filling::upper?'U':'L', 'N', size(c), size(rotated(a)), &alpha, base_a, stride(rotated(a)), &beta, base_c, stride(rotated(c)));
else assert(0);
}
}
return std::forward<C2D>(c);
}
template<class AA, class BB, class A2D, class C2D, class = typename A2D::element_ptr, std::enable_if_t<not is_complex<C2D>{}, int> =0>
template<class AA, class BB, class A2D, class C2D, class = typename A2D::element_ptr, std::enable_if_t<not is_complex_array<C2D>{}, int> =0>
auto herk(filling c_side, AA alpha, A2D const& a, BB beta, C2D&& c)
->decltype(syrk(c_side, alpha, a, beta, std::forward<C2D>(c))){
return syrk(c_side, alpha, a, beta, std::forward<C2D>(c));}
@@ -94,17 +100,19 @@ auto herk(A2D const& a, C2D&& c)
->decltype(herk(1., a, std::forward<C2D>(c))){
return herk(1., a, std::forward<C2D>(c));}
/*
template<class A2D, class C2D>
NODISCARD("when last argument is const")
auto herk(A2D const& a, C2D const& c)
->decltype(herk(1., a, decay(c))){
return herk(1., a, decay(c));}
*/
template<class AA, class A2D, class Ret = typename A2D::decay_type>
NODISCARD("when second argument is const")
auto herk(AA alpha, A2D const& a)
{//->std::decay_t<decltype(herk(alpha, a, Ret({size(a), size(a)}, get_allocator(a))))>{
return herk(alpha, a, Ret({size(a), size(a)}, get_allocator(a)));
NODISCARD("when argument is read-only")
auto herk(AA alpha, A2D const& a)//->std::decay_t<decltype(herk(alpha, a, Ret({size(a), size(a)}, get_allocator(a))))>{
{
return herk(alpha, a, Ret({size(a), size(a)}));//Ret({size(a), size(a)}));//, get_allocator(a)));
}
template<class T> struct numeric_limits : std::numeric_limits<T>{};
@@ -165,9 +173,6 @@ template<class M> decltype(auto) print(M const& C){
return cout << std::endl;
}
using complex = std::complex<double>;
constexpr complex I(0, 1);
BOOST_AUTO_TEST_CASE(inq_case){
using namespace multi::blas;
multi::array<double, 2> const a = {
@@ -194,6 +199,23 @@ BOOST_AUTO_TEST_CASE(inq_case){
}
}
BOOST_AUTO_TEST_CASE(multi_blas_herk_real){
namespace blas = multi::blas;
multi::array<double, 2> const a = {
{ 1., 3., 4.},
{ 9., 7., 1.}
};
{
multi::array<double, 2> c({2, 2}, 9999);
blas::herk(1., a, c);
BOOST_REQUIRE( c[1][0] == 34 );
BOOST_REQUIRE( c[0][1] == 34 );
multi::array<double, 2> const c_copy = blas::herk(1., a);
BOOST_REQUIRE( c == c_copy );
}
}
BOOST_AUTO_TEST_CASE(multi_blas_herk1x1_case){
namespace blas = multi::blas;
multi::array<double, 2> const A = {{1., 2., 3.}};


@@ -1,8 +1,13 @@
#ifdef COMPILATION// -*-indent-tabs-mode:t;c-basic-offset:4;tab-width:4;-*-
$CXX $0 -o $0x `pkg-config --libs blas` -lboost_unit_test_framework&&$0x&&rm $0x;exit
$CXXX $CXXFLAGS $0 -o $0x `pkg-config --libs blas` -lboost_unit_test_framework&&$0x&&rm $0x;exit
#endif
// © Alfredo A. Correa 2019-2020
#ifdef __CUDA_ARCH__
//#define BOOST_NO_RTTI 1
//#define BOOST_TYPE_INDEX_CTTI_USER_DEFINED_PARSING (39, 1, true, "T = ")
#endif
#ifndef MULTI_ADAPTORS_BLAS_NRM2_HPP
#define MULTI_ADAPTORS_BLAS_NRM2_HPP
@@ -18,19 +23,50 @@ namespace blas{
using core::nrm2;
template<class X1D, class R0D>
auto nrm2(X1D const& x, R0D&& r)
->decltype(nrm2(size(x), base(x), stride(x), base(r)), std::forward<R0D>(r)){
return nrm2(size(x), base(x), stride(x), base(r)), std::forward<R0D>(r);}
using multi::base;
using std::norm; // nvcc11 needs using std::FUNCTION and the FUNCTION (and it works in clang, gcc, culang, icc)
using std::norm; // for some reason nvcc needs using std::norm/norm (and works in clang, gcc, culang, icc)
template<class A1D, class A0D>
auto nrm2(A1D const& x, A0D&& r)
->decltype(nrm2(x.size(), x.base(), x.stride(), base(r)), std::forward<A0D>(r)){
return nrm2(x.size(), x.base(), x.stride(), base(r)), std::forward<A0D>(r);}
template<class X1D,
typename T = decltype(norm(std::declval<typename X1D::value_type>())),
typename Alloc = typename std::allocator_traits<decltype(get_allocator(std::declval<X1D>()))>::template rebind_alloc<T>
#if 0
template<class A1D>
auto nrm2(A1D const& x, double& r)
->decltype(nrm2(x.size(), x.base(), x.stride(), &r), r){
return nrm2(x.size(), x.base(), x.stride(), &r), r;}
template<class A1D>
auto nrm2(A1D const& x, float& r)
->decltype(nrm2(x.size(), x.base(), x.stride(), &r), r){
return nrm2(x.size(), x.base(), x.stride(), &r), r;}
#endif
template<
class A1D, typename T = double, //decltype(norm(std::declval<typename A1D::value_type>())),
class Alloc = typename std::allocator_traits<typename A1D::default_allocator_type>::template rebind_alloc<T>
>
auto nrm2(X1D const& x){
return nrm2(x, multi::static_array<T, 0, Alloc>{}); // TODO: this supports only default constructible (deduced) allocator
NODISCARD("")
auto nrm2(A1D const& x)
//->std::decay_t<decltype(nrm2(x, multi::static_array<T, 0, Alloc>({}, x.get_allocator()) ))>{
->std::decay_t<decltype(nrm2(x, multi::static_array<T, 0, Alloc>({})))>{ // x.get_allocator() in decltype doesn't work for icc
return nrm2(x, multi::static_array<T, 0, Alloc>({}, x.get_allocator()));}
template<class Alloc, class A1D, typename T = decltype(norm(std::declval<typename A1D::value_type>())),
class AllocR = typename std::allocator_traits<typename A1D::default_allocator_type>::template rebind_alloc<T>
>
NODISCARD("")
auto nrm2(A1D const& x, AllocR const& alloc)
->std::decay_t<decltype(blas::nrm2(x, multi::static_array<T, 0, AllocR>({}, alloc)))>{
return blas::nrm2(x, multi::static_array<T, 0, AllocR>({}, alloc)) ;}
namespace operators{
using std::norm;
template<class A1D>//decltype(norm(std::declval<typename A1D::value_type>()))>
NODISCARD("") auto operator^(A1D const& a, int n)
->decltype(std::pow(blas::nrm2(a), n)){
return std::pow(blas::nrm2(a), n);}
}
}}}
@@ -41,7 +77,13 @@ auto nrm2(X1D const& x){
#define BOOST_TEST_DYN_LINK
#include<boost/test/unit_test.hpp>
#include "../../array.hpp"
#include "../../complex.hpp"
//#include<thrust/complex.h>
#include<boost/mpl/list.hpp>
namespace multi = boost::multi;
@@ -55,17 +97,33 @@ BOOST_AUTO_TEST_CASE(multi_adaptor_multi_nrm2_real){
double n;
BOOST_REQUIRE( blas::nrm2(rotated(cA)[1], n) == std::sqrt( 2.*2. + 6.*6 + 10.*10.) );
BOOST_REQUIRE( n == std::sqrt( 2.*2. + 6.*6 + 10.*10.) );
BOOST_REQUIRE( blas::nrm2(rotated(cA)[1]) == std::sqrt( 2.*2. + 6.*6 + 10.*10.) );
double n2 = blas::nrm2(rotated(cA)[1]);
BOOST_REQUIRE( n == n2 );
multi::array<double, 1> R(4);
blas::nrm2( rotated(cA)[1], R[2]);
BOOST_REQUIRE( R[2] == std::sqrt( 2.*2. + 6.*6 + 10.*10.) );
multi::array<double, 0> R0;
blas::nrm2( rotated(cA)[1], R0);
BOOST_REQUIRE( R0 == std::sqrt( 2.*2. + 6.*6 + 10.*10.) );
BOOST_REQUIRE( blas::nrm2(rotated(cA)[1]) == std::sqrt( 2.*2. + 6.*6 + 10.*10.) );
}
using complex = std::complex<double>; complex const I{0,1};
BOOST_AUTO_TEST_CASE(multi_adaptor_blas_nrm2_operators){
multi::array<double, 1> X = {1.1,2.1,3.1, 4.1};
double n; multi::blas::nrm2(X, n);
BOOST_REQUIRE( n == multi::blas::nrm2(X) );
}
BOOST_AUTO_TEST_CASE(multi_adaptor_multi_nrm2_complex_real_case){
using complex = std::complex<double>;
multi::array<complex, 2> const cA = {
{1., 2., 3., 4.},
{5., 6., 7., 8.},
@@ -75,11 +133,46 @@ BOOST_AUTO_TEST_CASE(multi_adaptor_multi_nrm2_complex_real_case){
using multi::blas::nrm2;
double n;
BOOST_REQUIRE( nrm2(rotated(cA)[1], n) == std::sqrt( 2.*2. + 6.*6 + 10.*10.) );
BOOST_REQUIRE( nrm2(rotated(cA)[1]) == std::sqrt( 2.*2. + 6.*6 + 10.*10.) );
BOOST_REQUIRE( nrm2(rotated(cA)[1]) == n );
}
#if 0
BOOST_AUTO_TEST_CASE(multi_adaptor_multi_nrm2_complex_real_case_thrust){
using complex = thrust::complex<double>;
multi::array<complex, 2> const cA = {
{1., 2., 3., 4.},
{5., 6., 7., 8.},
{9., 10., 11., 12.}
};
using multi::blas::nrm2;
double n;
BOOST_REQUIRE( nrm2(rotated(cA)[1], n) == std::sqrt( 2.*2. + 6.*6 + 10.*10.) );
BOOST_REQUIRE( nrm2(rotated(cA)[1]) == n );
}
BOOST_AUTO_TEST_CASE(multi_adaptor_multi_nrm2_complex_real_case_types){
boost::mpl::for_each<boost::mpl::list<
std ::complex<double>,
thrust::complex<double>//,
// boost::multi::complex<double> // TODO make this work
>>([](auto cplx){
multi::array<decltype(cplx), 2> const cA = {
{1., 2., 3., 4.},
{5., 6., 7., 8.},
{9., 10., 11., 12.}
};
using multi::blas::nrm2;
double n;
BOOST_REQUIRE( nrm2(rotated(cA)[1], n) == std::sqrt( 2.*2. + 6.*6 + 10.*10.) );
BOOST_REQUIRE( nrm2(rotated(cA)[1]) == n );
});
}
#endif
BOOST_AUTO_TEST_CASE(multi_adaptor_multi_nrm2_complex){
using complex = std::complex<double>; complex const I{0,1};
multi::array<complex, 2> const cA = {
{1., 2. + 1.*I, 3., 4.},
{5., 6. + 4.*I, 7., 8.},
@@ -88,8 +181,12 @@ BOOST_AUTO_TEST_CASE(multi_adaptor_multi_nrm2_complex){
using multi::blas::nrm2;
double n;
BOOST_REQUIRE( nrm2(rotated(cA)[1], n) == std::sqrt( norm(cA[0][1]) + norm(cA[1][1]) + norm(cA[2][1]) ) );
BOOST_REQUIRE( nrm2(rotated(cA)[1]) == std::sqrt( norm(cA[0][1]) + norm(cA[1][1]) + norm(cA[2][1]) ) );
BOOST_REQUIRE( nrm2(rotated(cA)[1], n) == std::sqrt( norm(cA[0][1]) + norm(cA[1][1]) + norm(cA[2][1]) ) );
BOOST_REQUIRE( nrm2(rotated(cA)[1]) == std::sqrt( norm(cA[0][1]) + norm(cA[1][1]) + norm(cA[2][1]) ) );
using namespace multi::blas::operators;
BOOST_TEST_REQUIRE( (rotated(cA)[1]^-1) == 1/std::sqrt(norm(cA[0][1]) + norm(cA[1][1]) + norm(cA[2][1])) , boost::test_tools::tolerance(1e-15) );
BOOST_TEST_REQUIRE( (rotated(cA)[1]^2) == norm(cA[0][1]) + norm(cA[1][1]) + norm(cA[2][1]) , boost::test_tools::tolerance(1e-15) );
}
#endif


@@ -1,55 +1,42 @@
#ifdef COMPILATION// -*-indent-tabs-mode:t;c-basic-offset:4;tab-width:4;-*-
$CXX $0 -o $0x -lboost_unit_test_framework&&$0x&&rm $0x;exit
#endif
// © Alfredo A. Correa 2019-2020
// -*-indent-tabs-mode:t;c-basic-offset:4;tab-width:4;autowrap:nil;-*-
// © Alfredo A. Correa 2019-2021
#ifndef MULTI_ADAPTORS_BLAS_NUMERIC_HPP
#define MULTI_ADAPTORS_BLAS_NUMERIC_HPP
#include "../../memory/pointer_traits.hpp"
#include "../../array_ref.hpp"
#include<complex>
#include "../../complex.hpp"
#if defined(__CUDACC__)
#define HD __host__ __device__
#else
#define HD
#endif
#include "numeric/is_complex.hpp"
namespace boost{
namespace multi{
namespace blas{
namespace multi::blas{
template<class T> struct Complex_{T real; T imag;};
template<class A, typename T=typename std::decay_t<A>::element_type::value_type, typename C_=Complex_<T>>
auto real_aux(A&& a, std::complex<T> const&)
->decltype(member_array_cast<T>(reinterpret_array_cast<C_>(std::forward<A>(a)), &C_::real)){
return member_array_cast<T>(reinterpret_array_cast<C_>(std::forward<A>(a)), &C_::real);}
template<class A, class T>
auto real_aux(A&& a, T const&)
->decltype(member_array_cast<T>(std::forward<A>(a), &T::real)){
return member_array_cast<T>(std::forward<A>(a), &T::real);}
template<class A>
template<
class A, typename Complex = typename std::decay_t<A>::element, typename T=typename Complex::value_type,
class=std::enable_if_t<blas::numeric::is_complex_of<Complex, T>::value>
>
auto real(A&& a)
->decltype(real_aux(std::forward<A>(a), typename std::decay_t<A>::element_type{})){
return real_aux(std::forward<A>(a), typename std::decay_t<A>::element_type{});}
->decltype(std::forward<A>(a).template reinterpret_array_cast<Complex_<T>>().template member_cast<T>(&Complex_<T>::real)){
return std::forward<A>(a).template reinterpret_array_cast<Complex_<T>>().template member_cast<T>(&Complex_<T>::real);}
template<class A, typename T=typename std::decay_t<A>::element_type::value_type, typename C_=Complex_<T>>
auto imag_aux(A&& a, std::complex<T> const&)
->decltype(member_array_cast<T>(reinterpret_array_cast<C_>(std::forward<A>(a)), &C_::imag)){
return member_array_cast<T>(reinterpret_array_cast<C_>(std::forward<A>(a)), &C_::imag);}
template<class A, class T>
auto imag_aux(A&& a, T const&)
->decltype(member_array_cast<T>(std::forward<A>(a), &T::imag)){
return member_array_cast<T>(std::forward<A>(a), &T::imag);}
template<class A>
template<
class A, class Complex = typename std::decay_t<A>::element_type, typename T=typename Complex::value_type,
class=std::enable_if_t<blas::numeric::is_complex_of<Complex, T>::value>
>
auto imag(A&& a)
->decltype(imag_aux(std::forward<A>(a), typename std::decay_t<A>::element_type{})){
return imag_aux(std::forward<A>(a), typename std::decay_t<A>::element_type{});}
->decltype(std::forward<A>(a).template reinterpret_array_cast<Complex_<T>>().template member_cast<T>(&Complex_<T>::imag)){
return std::forward<A>(a).template reinterpret_array_cast<Complex_<T>>().template member_cast<T>(&Complex_<T>::imag);}
template<class ComplexArr, class ComplexElem = typename std::decay_t<ComplexArr>::element, typename RealElem = typename ComplexElem::value_type,
class=std::enable_if_t<blas::numeric::is_complex_of<ComplexElem, RealElem>::value>
>
auto real_doubled(ComplexArr&& a){ // produces a real view of complex array with the last dimension duplicated and with interleaved real imaginary parts
return std::forward<ComplexArr>(a).template reinterpret_array_cast<RealElem>(2).rotated().flatted().unrotated();
}
template<class Ref, class Involution> class involuted;
@@ -62,20 +49,25 @@ protected:
Involution f_;
public:
using decay_type =std::decay_t<decltype(std::declval<Involution>()(std::declval<Ref>()))>;
explicit involuted(Ref r, Involution f = {}) HD : r_{std::forward<Ref>(r)}, f_{f}{}
constexpr explicit involuted(Ref r, Involution f = {}) : r_{std::forward<Ref>(r)}, f_{f}{}
involuted& operator=(involuted const& other)=delete;//{r_ = other.r_; return *this;}
public:
involuted(involuted const&) = delete;
involuted(involuted&&) = default; // for C++14
decay_type decay() const&{return f_(r_);}
operator decay_type() const&{return f_(r_);}
decltype(auto) operator&()&&{return involuter<decltype(&std::declval<Ref>()), Involution>{&r_, f_};}
constexpr decay_type decay() const&{return f_(r_);}
constexpr operator decay_type() const&{return f_(r_);}
constexpr operator decay_type() &&{return f_(r_);}
constexpr decltype(auto) operator&()&&{return involuter<decltype(&std::declval<Ref>()), Involution>{&r_, f_};}
// template<class DecayType>
// auto operator=(DecayType&& other)&&
// ->decltype(r_=f_(std::forward<DecayType>(other)), *this){
// return r_=f_(std::forward<DecayType>(other)), *this;}
template<class DecayType>
auto operator=(DecayType&& other)&
constexpr auto operator=(DecayType&& other)&
->decltype(r_=f_(std::forward<DecayType>(other)), *this){
return r_=f_(std::forward<DecayType>(other)), *this;}
template<class DecayType>
constexpr auto operator=(DecayType&& other)&&
->decltype(r_=f_(std::forward<DecayType>(other)), *this){
return r_=f_(std::forward<DecayType>(other)), *this;}
// template<class OtherRef>
@@ -83,27 +75,36 @@ public:
// ->decltype(r_=f_==o.f_?std::forward<decltype(o.r_)>(o.r_):f_(o), *this){
// return r_=f_==o.f_?std::forward<decltype(o.r_)>(o.r_):f_(o), *this;}
template<class DecayType>
auto operator==(DecayType&& other) const
constexpr auto operator==(DecayType&& other) const
->decltype(this->operator decay_type()==other){
return this->operator decay_type()==other;}
template<class DecayType>
auto operator!=(DecayType&& other) const
constexpr auto operator!=(DecayType&& other) const
->decltype(this->operator decay_type()!=other){
return this->operator decay_type()!=other;}
friend constexpr auto operator==(decay_type const& other, involuted const& self){
return other == self.operator decay_type();}
template<class DecayType, std::enable_if_t<not std::is_base_of<involuted, DecayType>{}, int> =0>
friend auto operator==(DecayType&& other, involuted const& self){
friend constexpr auto operator==(DecayType&& other, involuted const& self){
return other == self.operator decay_type();}
template<class DecayType, std::enable_if_t<not std::is_base_of<involuted, DecayType>{}, int> =0>
friend auto operator!=(DecayType&& other, involuted const& self){
friend constexpr auto operator!=(DecayType&& other, involuted const& self){
return other != self.operator decay_type();}
// auto imag() const{return static_cast<decay_type>(*this).imag();}
template<class Any> friend Any& operator<<(Any&& a, involuted const& self)
template<class Any> friend constexpr Any& operator<<(Any&& a, involuted const& self)
// ->decltype(a << self.operator decay_type())
{
return a << self.operator decay_type();}
constexpr auto conj() const&{return adl_conj(operator decay_type());}
template<class T = void*>
friend constexpr auto imag(involuted const& self, T = nullptr)
->decltype(adl_imag(std::declval<decay_type>())){
return adl_imag(self.operator decay_type());}
};
#if __cpp_deduction_guides
#if defined(__cpp_deduction_guides)
template<class T, class F> involuted(T&&, F)->involuted<T const, F>;
//template<class T, class F> involuted(T&, F)->involuted<T&, F>;
//template<class T, class F> involuted(T const&, F)->involuted<T const&, F>;
@@ -116,8 +117,8 @@ template<class It, class F>
auto get_allocator(involuter<It, F> const& s);
template<class It, class F>
auto default_allocator_of(involuter<It, F> const& s){
return default_allocator_of(s.it_);
auto default_allocator_of(involuter<It, F> const& iv){
return default_allocator_of(iv.it_);
}
template<class It, class F, class Reference>
@@ -132,22 +133,23 @@ public:
using reference = Reference;
using iterator_category = typename std::iterator_traits<It>::iterator_category;
using element_type = typename std::pointer_traits<It>::element_type;
template<class U> using rebind = involuter<typename std::pointer_traits<It>::template rebind<U>, F>;
template<class U> using rebind = involuter<typename std::pointer_traits<It>::template rebind<U>, F>;
involuter() = default;
explicit involuter(It it, F f = {}) : it_{std::move(it)}, f_{std::move(f)}{}
constexpr explicit involuter(It it, F f = {}) : it_{std::move(it)}, f_{std::move(f)}{}
involuter(involuter const& other) = default;
// template<class Other, > constexpr involuter(Other const& other) : it_{other.it_}, f_{other.f_}{}
template<class Other, typename = decltype(_implicit_cast<It>(typename Other::underlying_type{}))>
constexpr involuter(Other const& o) : it_{o.it_}, f_{o.f_}{}
// cppcheck-suppress noExplicitConstructor
constexpr involuter(Other const& o) : it_{o.it_}, f_{o.f_}{}
template<class Other, typename = decltype(_explicit_cast<It>(typename Other::underlying_type{}))>
explicit constexpr involuter(Other const& o, int = 0) : it_{o.it_}, f_{o.f_}{}
constexpr explicit involuter(Other const& o, int = 0) : it_{o.it_}, f_{o.f_}{}
constexpr auto operator*() const {return reference{*it_, f_};}
bool operator==(involuter const& o) const{return it_==o.it_;}
bool operator!=(involuter const& o) const{return it_!=o.it_;}
involuter& operator+=(typename involuter::difference_type n) HD{it_+=n; return *this;}
constexpr involuter& operator+=(typename involuter::difference_type n){it_+=n; return *this;}
constexpr auto operator+(typename involuter::difference_type n) const{return involuter{it_+n, f_};}
// decltype(auto) operator->() const{
// return &const_cast<reference&>(reinterpret_cast<reference const&>(*this));
@@ -157,24 +159,24 @@ public:
auto operator-(involuter const& other) const{return it_-other.it_;}
explicit operator bool() const{return it_;}
using underlying_type = It;
friend underlying_type underlying(involuter const& self) HD{return self.it_;}
friend constexpr underlying_type underlying(involuter const& self){return self.it_;}
constexpr explicit operator It() const {return underlying(*this);}
template<class Itt, class FF> friend auto get_allocator(involuter<Itt, FF> const&);
friend auto default_allocator_of(involuter const& s){
friend auto default_allocator_of(involuter const& inv){
using multi::default_allocator_of;
return default_allocator_of(s.it_);
return default_allocator_of(inv.it_);
}
using default_allocator_type = typename multi::pointer_traits<It>::default_allocator_type;
friend auto get_allocator(involuter const& s){
friend auto get_allocator(involuter const& inv){
using boost::multi::get_allocator;
return get_allocator(s.it_);
return get_allocator(inv.it_);
}
};
template<class It, class F>
auto get_allocator(involuter<It, F> const& s){
auto get_allocator(involuter<It, F> const& inv){
using multi::get_allocator;
return get_allocator(s.it_);
return get_allocator(inv.it_);
}
template<class Ref> using negated = involuted<Ref, std::negate<>>;
@@ -183,11 +185,11 @@ template<class It> using negater = involuter<It, std::negate<>>;
#if 1
struct conjugate{
template<class T>
auto operator()(T const& a) const{
decltype(auto) operator()(T&& a) const{
// using std::conj; /*for doubles?*/
using std::conj;
std::complex<double> A{a};
return conj(A);
// using std::conj;
// std::complex<double> A = static_cast<std::complex<double>>(a);
return multi::adl_conj(std::forward<T>(a)); // this is needed by icc
}
};
#endif
@@ -218,16 +220,28 @@ template<class It> using conjugater = involuter<It, conjugate>;//, conjugated<ty
template<class It> auto make_conjugater(It it){return conjugater<It>{it};}
template<class It> It make_conjugater(conjugater<It> it){return underlying(it);}
template<class T> auto imag(involuted<T, conjugate> const& s){return s.decay().imag();}
template<class T> auto real(involuted<T, conjugate> const& s){return s.decay().real();}
template<class T> auto imag(involuted<T, conjugate> const& inv){return inv.decay().imag();}
template<class T> auto real(involuted<T, conjugate> const& inv){return inv.decay().real();}
template<class A = void> struct is_complex{
template<class T> static auto _(T const& t) -> decltype(imag(*t), std::true_type());
static auto _(... ) -> std::false_type ;
constexpr operator bool() const{return decltype(_(base(std::declval<A>()))){};}
template<class AA> constexpr auto operator()(AA&&){return _(base(std::declval<A>()));}
template<class T> auto has_imag_fun_aux(T const& t)->decltype(imag(t), std::true_type {});
auto has_imag_fun_aux(... )->decltype( std::false_type{});
template<class T> struct has_imag_fun : decltype(has_imag_fun_aux(std::declval<T>())){};
template<class T> auto has_imag_mem_aux(T const& t)->decltype(t.imag(), std::true_type {});
auto has_imag_mem_aux(... )->decltype( std::false_type{});
template<class T> struct has_imag_mem : decltype(has_imag_mem_aux(std::declval<T>())){};
template<class T> struct has_imag : std::integral_constant<bool, (has_imag_fun<T>{} or has_imag_mem<T>{})>{};
template<class A = void> struct is_complex_array{
template<class T> static auto _(T const& t) -> has_imag<T>;
constexpr operator bool() const{return decltype(_(*base(std::declval<A>()))){};}
template<class AA> constexpr auto operator()(AA&&){return _(*base(std::declval<A>()));}
};
template<class V> struct is_complex : has_imag<V>{};
template<class A = void> struct is_conjugated{
template<class It> static std::true_type _(conjugater<It> a);
static std::false_type _(... );
@@ -236,189 +250,40 @@ template<class A = void> struct is_conjugated{
};
template<class A, class D = std::decay_t<A>, typename Elem=typename D::element_type, typename Ptr=typename D::element_ptr,
std::enable_if_t<not is_conjugated<A>{}, int> =0>
std::enable_if_t<not is_complex_array<A>{}, int> =0>
A&& conj(A&& a){
// return multi::static_array_cast<Elem, conjugater<Ptr>>(a);
return std::forward<A>(a);
}
template<class A, class D = std::decay_t<A>, typename Elem=typename D::element_type, typename Ptr=typename D::element_ptr,
std::enable_if_t<not is_conjugated<A>{} and is_complex_array<A>{}, int> =0>
decltype(auto) conj(A&& a){
return multi::static_array_cast<Elem, conjugater<Ptr>>(a);
// return multi::static_array_cast<Elem, conjugater<Ptr>>(a);
return std::forward<A>(a).template static_array_cast<Elem, conjugater<Ptr>>();
}
template<class A, class D = std::decay_t<A>, typename Elem=typename D::element_type, typename Ptr=typename D::element_ptr::underlying_type,
std::enable_if_t< is_conjugated<A>{}, int> =0>
decltype(auto) conj(A&& a){
return multi::static_array_cast<Elem, Ptr>(a);
}
}
}
}
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
#if not __INCLUDE_LEVEL__ // _TEST_MULTI_ADAPTORS_BLAS_NUMERIC
#define BOOST_TEST_MODULE "C++ Unit Tests for Multi BLAS numeric"
#define BOOST_TEST_DYN_LINK
#include<boost/test/unit_test.hpp>
//#include "../blas/gemm.hpp"
#include "../../array.hpp"
#include "../../utility.hpp"
//#include "../../adaptors/cuda.hpp"
#include<cassert>
#include<iostream>
namespace multi = boost::multi;
template<class M> decltype(auto) print(M const& C){
using std::cout;
using boost::multi::size;
for(int i = 0; i != size(C); ++i){
for(int j = 0; j != size(C[i]); ++j) cout<< C[i][j] <<' ';
cout<<std::endl;
}
return cout<<std::endl;
}
using complex = std::complex<double>; constexpr complex I{0, 1};
BOOST_AUTO_TEST_CASE(multi_blas_numeric_real_imag_part){
multi::array<double, 2> A = {
{1., 3., 4.},
{9., 7., 1.}
};
multi::array<complex, 2> Acplx = A;
multi::array<complex, 2> B = {
{1. - 3.*I, 6. + 2.*I},
{8. + 2.*I, 2. + 4.*I},
{2. - 1.*I, 1. + 1.*I}
};
multi::array<double, 2> Breal = {
{1., 6.},
{8., 2.},
{2., 1.}
};
multi::array<double, 2> Bimag = {
{-3., +2.},
{+2., +4.},
{-1., +1.}
};
using multi::blas::real;
using multi::blas::imag;
BOOST_REQUIRE( Breal == real(B) );
BOOST_REQUIRE( real(B) == Breal );
BOOST_REQUIRE( imag(B) == Bimag );
BOOST_REQUIRE( B[1][0] == 8. + 2.*I );
BOOST_REQUIRE( imag(B[1][0]) == 2. );
// using multi::blas::hermitized;
// BOOST_REQUIRE( hermitized(B)[0][1] == 8. - 2.*I );
// BOOST_REQUIRE( imag(hermitized(B)[0][1]) == -2. );
}
template<class T> void what(T&&) = delete;
BOOST_AUTO_TEST_CASE(multi_blas_numeric_real_conjugated){
multi::array<complex, 2> B = {
{1. - 3.*I, 6. + 2.*I},
{8. + 2.*I, 2. + 4.*I},
{2. - 1.*I, 1. + 1.*I}
};
BOOST_REQUIRE( B[0][0] == 1. - 3.*I );
multi::array<complex, 2> const Bconst = {
{1. - 3.*I, 6. + 2.*I},
{8. + 2.*I, 2. + 4.*I},
{2. - 1.*I, 1. + 1.*I}
};
BOOST_REQUIRE( Bconst[0][0] == 1. - 3.*I );
auto BdataC = multi::blas::make_conjugater(B.data_elements());
auto BconstdataC = multi::blas::make_conjugater(Bconst.data_elements());
decltype(BconstdataC) ppp = BdataC;
ppp = BdataC;
BOOST_REQUIRE( *BdataC == 1. + 3.*I );
static_assert( multi::blas::is_complex<decltype(B)>{}, "!");
static_assert(not multi::blas::is_conjugated<decltype(B)>{}, "!");
auto&& Bconj = multi::blas::conj(B);
static_assert(multi::blas::is_conjugated<decltype(Bconj)>{}, "!");
BOOST_REQUIRE( Bconj[0][0] == 1. + 3.*I );
BOOST_TEST_REQUIRE( imag(*base(Bconj)) == +3 );
// BOOST_TEST_REQUIRE( base(Bconj)->imag() == +3 );
BOOST_REQUIRE( rotated(Bconj)[1][0] == Bconj[0][1] );
// BOOST_REQUIRE( base(Bconj) == -3.*I );
static_assert(multi::blas::is_complex<decltype(Bconj)>{}, "!");
BOOST_REQUIRE( conj(Bconj) == B );
BOOST_REQUIRE( base(conj(Bconj)) == base(B) );
BOOST_REQUIRE( base(conj(Bconj))->imag() == -3. );
// BOOST_REQUIRE( base(conjugated(Bconj))->imag() == -3. );
auto conj(A&& a)
->decltype(std::forward<A>(a).template static_array_cast<Elem, Ptr>()){
return std::forward<A>(a).template static_array_cast<Elem, Ptr>();}
// return multi::static_array_cast<Elem, Ptr>(a);}
// return multi::static_array_cast<Elem, Ptr>(a);}
}
#if 0
namespace cuda = multi::cuda;
{
cuda::array<complex, 2> Bgpu = B;
using multi::blas::imag;
BOOST_REQUIRE( imag(Bgpu)[1][1] == imag(B)[1][1] );
BOOST_REQUIRE( real(Bgpu)[1][1] == real(B)[1][1] );
}
{
cuda::managed::array<complex, 2> Bgpu = B;
using multi::blas::imag;
BOOST_REQUIRE( imag(Bgpu)[1][1] == imag(B)[1][1] );
BOOST_REQUIRE( real(Bgpu)[1][1] == real(B)[1][1] );
}
multi::array_ref<double, 2> rB(reinterpret_cast<double*>(data_elements(B)), {size(B), 2*size(*begin(B))});
auto&& Bconj = multi::static_array_cast<complex, multi::blas::detail::conjugater<complex*>>(B);
assert( size(Bconj) == size(B) );
assert( conj(B[1][2]) == Bconj[1][2] );
// auto&& BH = multi::blas::hermitized(B);
// assert( BH[1][2] == conj(B[2][1]) );
// std::cout << BH[1][2] << " " << B[2][1] << std::endl;
// auto&& BH1 = multi::static_array_cast<complex, multi::blas::detail::conjugater<complex*>>(rotated(B));
// auto&& BH2 = rotated(multi::static_array_cast<complex, multi::blas::detail::conjugater<complex*>>(B));
// what( BH1, BH2 );
// using multi::blas::imag;
// assert( real(A)[1][2] == 1. );
// assert( imag(A)[1][2] == -3. );
// print(A) <<"--\n";
// print(real(A)) <<"--\n";
// print(imag(A)) <<"--\n";
multi::array<complex, 2> C({2, 2});
multi::array_ref<double, 2> rC(reinterpret_cast<double*>(data_elements(C)), {size(C), 2*size(*begin(C))});
// gemm('T', 'T', 1., A, B, 0., C);
// gemm('T', 'T', 1., A, B, 0., C);
// gemm('T', 'T', 1., real(A), B, 0., C);
template<class It, class F, class Reference>
auto default_allocator_of(multi::blas::involuter<It, F, Reference> it){
return multi::default_allocator_of(underlying(it));
}
#endif
#endif
}
namespace std{
// template<> struct is_convertible<boost::multi::blas::Complex_<double>*, std::complex<double>*> : std::true_type{};
// template<class T> struct is_convertible<boost::multi::blas::Complex_<double>*, T*> : boost::multi::blas::numeric::is_complex_of<T, double>{};
}
#endif



@ -0,0 +1,93 @@
#ifdef COMPILATION// -*-indent-tabs-mode:t;c-basic-offset:4;tab-width:4;-*-
$CXXX $CXXFLAGS $0 -o $0x -lboost_unit_test_framework&&$0x&&rm $0x;exit
#endif
// © Alfredo A. Correa 2020
#ifndef MULTI_ADAPTORS_BLAS_NUMERIC_IS_COMPLEX_HPP
#define MULTI_ADAPTORS_BLAS_NUMERIC_IS_COMPLEX_HPP
#include<complex>
#include<type_traits>
namespace boost{
namespace multi{
namespace blas{
namespace numeric{
using std::true_type;
using std::false_type;
template<class T> auto has_real_fun_aux(T const& t)->decltype(real(t), true_type{});
auto has_real_fun_aux(... )->decltype( false_type{});
template<class T> struct has_real_fun : decltype(has_real_fun_aux(std::declval<T>())){};
template<class T> constexpr bool has_real_fun_v = has_real_fun<T>::value;
template<class T> auto has_real_aux(T const& t)->decltype(t.real(), true_type{});
auto has_real_aux(... )->decltype( false_type{});
template<class T> struct has_real : decltype(has_real_aux(std::declval<T>())){};
template<class T> constexpr bool has_real_v = has_real<T>::value;
template<class T> auto has_imag_fun_aux(T const& t)->decltype(imag(t), true_type{});
auto has_imag_fun_aux(... )->decltype( false_type{});
template<class T> struct has_imag_fun : decltype(has_imag_fun_aux(std::declval<T>())){};
template<class T> constexpr bool has_imag_fun_v = has_imag_fun<T>::value;
template<class T> auto has_imag_aux(T const& t)->decltype(t.imag(), true_type{});
auto has_imag_aux(... )->decltype( false_type{});
template<class T> struct has_imag : decltype(has_imag_aux(std::declval<T>())){};
template<class T> constexpr bool has_imag_v = has_imag<T>::value;
template<class T> struct is_complex : std::integral_constant<bool,
(has_real_v<T> or has_real_fun_v<T>) and (has_imag_v<T> or has_imag_fun_v<T>)
>{};
template<class V, class T> auto real_is_aux(T const& t)->typename std::is_same<decltype(t.real()), V>;
template<class> auto real_is_aux(... )->false_type;
template<class T, class V> struct real_is : decltype(real_is_aux<V>(std::declval<T>())){};
template<class V, class T> auto imag_is_aux(T const& t)->typename std::is_same<decltype(t.imag()), V>;
template<class> auto imag_is_aux(... )->false_type;
template<class T, class V> struct imag_is : decltype(imag_is_aux<V>(std::declval<T>())){};
template<class T, class V> struct is_complex_of : std::integral_constant<bool, real_is<T, V>::value and imag_is<T, V>::value>{};
}}}}
#if not __INCLUDE_LEVEL__
#define BOOST_TEST_MODULE "C++ Unit Tests for Multi BLAS numeric is_complex"
#define BOOST_TEST_DYN_LINK
#include<boost/test/unit_test.hpp>
#include<thrust/complex.h>
#include "../../../complex.hpp"
#include "boost/mpl/list.hpp"
namespace multi = boost::multi;
BOOST_AUTO_TEST_CASE(multi_blas_is_complex){
namespace numeric = multi::blas::numeric;
boost::mpl::for_each<boost::mpl::list<double, float, long double>>([](auto f){
using F = decltype(f);
static_assert( not numeric::is_complex<F>{}, "!");
static_assert( numeric::is_complex<std::complex<F>>{}, "!");
static_assert( numeric::is_complex<thrust::complex<F>>{}, "!");
static_assert( numeric::is_complex<multi::complex<F>>{}, "!");
static_assert( numeric::is_complex_of<std::complex<F>, F>{}, "!");
static_assert( not numeric::is_complex_of<F, F>{}, "!");
});
static_assert( not numeric::is_complex_of<std::complex<double>, float>{}, "!");
static_assert( not numeric::is_complex_of<double, float>{}, "!");
static_assert( numeric::is_complex<std::complex<double> const&>{}, "!");
}
#endif
#endif


@ -1,12 +1,12 @@
#ifdef COMPILATION// -*-indent-tabs-mode:t;c-basic-offset:4;tab-width:4;-*-
$CXX $0 -o $0x `pkg-config --libs blas` -lboost_unit_test_framework&&$0x&&rm $0x;exit
$CXXX $CXXFLAGS $0 -o $0x `pkg-config --libs blas` -lboost_unit_test_framework&&$0x&&rm $0x;exit
#endif
// © Alfredo A. Correa 2019-2020
#ifndef MULTI_ADAPTORS_BLAS_OPERATIONS_HPP
#define MULTI_ADAPTORS_BLAS_OPERATIONS_HPP
#include "../blas/numeric.hpp"
#include "../blas/numeric.hpp"
namespace boost{
namespace multi{
@ -37,9 +37,45 @@ decltype(auto) hermitized(A&& a){return conjugated_transposed(std::forward<A>(a)
template<class A>
decltype(auto) transposed(A&& a){return rotated(std::forward<A>(a));}
template<class A> [[deprecated("use blas::H instead of blas::C for hermitized to avoid confusions")]]
//template<class A, std::enable_if_t<std::decay_t<A>::dimensionality == 2, int> =0>
//decltype(auto) H(A&& a){return hermitized(std::forward<A>(a));}
namespace operators{
MAYBE_UNUSED constexpr static struct {
template<class A, std::enable_if_t<std::decay_t<A>::dimensionality == 2, int> =0>
decltype(auto) operator()(A&& a) const{return hermitized(std::forward<A>(a));}
template<class A, std::enable_if_t<std::decay_t<A>::dimensionality == 1, int> =0>
[[deprecated("use blas::C instead of blas::H for conjugated vectors to avoid confusions")]]
decltype(auto) operator()(A&& a) const{return blas::conj(std::forward<A>(a));}
} H;
template<class A, class Op>
auto operator^(A&& a, Op op)
->decltype(op(std::forward<A>(a))){
return op(std::forward<A>(a));}
}
using operators::H;
template<class A, std::enable_if_t<std::decay_t<A>::dimensionality == 1, int> =0>
decltype(auto) C(A&& a){return blas::conj(std::forward<A>(a));}
template<class A, std::enable_if_t<std::decay_t<A>::dimensionality == 2, int> =0>
decltype(auto) C(A&& a){return hermitized(std::forward<A>(a));}
template<class A> decltype(auto) H(A&& a){return hermitized(std::forward<A>(a));}
namespace operators{
template<class A>
auto operator*(A&& a)
->decltype(blas::conj(std::forward<A>(a))){
return blas::conj(std::forward<A>(a));}
}
//template<class A, std::enable_if_t<std::decay_t<A>::dimensionality == 1, int> =0>
//[[deprecated("use blas::C instead of blas::H for conjugated vectors to avoid confusions")]]
//decltype(auto) H(A&& a){return blas::conj(std::forward<A>(a));}
template<class A> decltype(auto) T(A&& a){return transposed(std::forward<A>(a));}
template<class A> decltype(auto) N(A&& a){return identity (std::forward<A>(a));}
@ -47,7 +83,7 @@ template<class A> decltype(auto) N(A&& a){return identity (std::forward<A>(a));
}
#if not __INCLUDE_LEVEL__ // _TEST_MULTI_ADAPTORS_BLAS_OPERATIONS
#if not __INCLUDE_LEVEL__
#define BOOST_TEST_MODULE "C++ Unit Tests for Multi blas operations"
#define BOOST_TEST_DYN_LINK
@ -66,9 +102,10 @@ template<class M> decltype(auto) print(M const& C){
}
namespace multi = boost::multi;
using complex = std::complex<double>; constexpr complex I{0, 1};
BOOST_AUTO_TEST_CASE(m){
using complex = std::complex<double>; constexpr complex I{0., 1.};
namespace blas = multi::blas;
multi::array<complex, 2> const A = {
{1. - 3.*I, 6. + 2.*I},
@ -86,123 +123,19 @@ BOOST_AUTO_TEST_CASE(m){
static_assert( not blas::is_conjugated<decltype(blas::T(A))>{}, "!" );
BOOST_REQUIRE( blas::T(A)[0][1] == A[1][0] );
// static_assert( multi::blas::is_conjugated<decltype(T(A))>{}, "!" );
/* using multi::blas::gemm;
using namespace blas::operators;
BOOST_REQUIRE( (*~A)[0][1] == conj(A[1][0]) );
BOOST_REQUIRE( (~*A)[0][1] == conj(A[1][0]) );
BOOST_REQUIRE( ( ~A)[0][1] == A[1][0] );
BOOST_REQUIRE( ( *A)[0][1] == conj(A[0][1]) );
BOOST_REQUIRE( gemm(A, hermitized(A))[2][1] == 20. - 14.*I );
BOOST_REQUIRE( gemm(A, transposed(A))[2][1] == 16. + 2.*I );
static_assert( multi::blas::is_conjugated_t<decltype(hermitized(A))>{} , "!" );
static_assert( not multi::blas::is_conjugated_t<std::decay_t<decltype( conjugated(hermitized(A)) )>>{}, "!");
static_assert( not multi::blas::is_hermitized<std::decay_t<decltype( conjugated(hermitized(A)) )>>{}, "!");
*/
}
BOOST_AUTO_TEST_CASE(is_complex_array_test){
static_assert(multi::blas::is_complex<multi::array<std::complex<double>, 2>>{}, "!");
static_assert(multi::blas::is_complex_array<multi::array<std::complex<double>, 2>>{}, "!");
}
#if 0
BOOST_AUTO_TEST_CASE(multi_adaptors_blas_operations_enums){
BOOST_REQUIRE( multi::blas::operation::identity == multi::blas::real_operation::identity );
BOOST_REQUIRE( multi::blas::operation::transposition == multi::blas::real_operation::transposition );
BOOST_REQUIRE( multi::blas::operation::hermitian == multi::blas::complex_operation::hermitian );
BOOST_REQUIRE( multi::blas::operation::identity == multi::blas::complex_operation::identity );
BOOST_REQUIRE( multi::blas::operation{multi::blas::real_operation::identity} == multi::blas::real_operation::identity );
BOOST_REQUIRE( multi::blas::operation{multi::blas::real_operation::transposition} == multi::blas::real_operation::transposition );
}
BOOST_AUTO_TEST_CASE(multi_adaptors_blas_operations){
multi::array<complex, 2> const A = {
{1. - 3.*I, 6. + 2.*I},
{8. + 2.*I, 2. + 4.*I},
{2. - 1.*I, 1. + 1.*I}
};
print(A);
print(multi::blas::conjugated(A));
auto&& Aconjd = multi::blas::conjugated(A);
assert( Aconjd[1][2] == conj(A[1][2]) );
multi::array<complex, 2> Aconj = multi::blas::conjugated(A);
assert( Aconj[1][2] == conj(A[1][2]) );
assert( Aconjd == Aconj );
auto&& Aconjdconjd = multi::blas::conjugated(Aconjd);
assert( Aconjdconjd[1][2] == A[1][2] );
assert( &Aconjdconjd[1][2] == &A[1][2] );
auto&& Atranspd = multi::blas::transposed(A);
assert( Atranspd[1][2] == A[2][1] );
multi::array<complex, 2> Atransp = multi::blas::transposed(A);
assert( Atransp[1][2] == A[2][1] );
assert( Atransp == Atranspd );
auto&& Aconjdtranspd = multi::blas::conjugated_transposed(A); (void)Aconjdtranspd;
assert( Aconjdtranspd[1][2] == conj(A[2][1]) );
auto Aconjtransp = multi::blas::conjugated_transposed(A).decay();
assert( Aconjtransp[1][2] == conj(A[2][1]) );
assert( Aconjdtranspd == Aconjtransp );
{
multi::array<complex, 2> const A = {
{1. - 3.*I, 6. + 2.*I},
{8. + 2.*I, 2. + 4.*I},
{2. - 1.*I, 1. + 1.*I}
};
using multi::blas::hermitized;
assert( hermitized(A)[0][1] == conj(A[1][0]) );
// []{}(hermitized(A));
static_assert( multi::blas::is_conjugated<decltype(hermitized(A))>{} , "!");
using multi::blas::conjugated;
// []{}(conjugated(conjugated(A)));
using multi::blas::hermitized;
[]{}(hermitized(hermitized(A)));
// static_assert( not multi::blas::is_conjugated<decltype(hermitized(hermitized(A)))>{} , "!");
// []{}(hermitized(hermitized(A)));
// []{}(conjugated(conjugated(A)));
static_assert( multi::blas::is_complex_array<std::decay_t<decltype(A)>>{} , "!");
// auto&& AH = multi::blas::hermitized(A);
// auto c = AH[0][0].imag();
// static_assert( multi::blas::is_complex_array<std::decay_t<decltype(AH)>>{} , "!");
// auto&& Aconjd = multi::blas::conjugated(A);
// assert( Aconjd[1][2] == conj(A[1][2]) );
// multi::array<complex, 2> Aconj = multi::blas::conjugated(A);
// assert( Aconj[1][2] == conj(A[1][2]) );
// assert( Aconjd == Aconj );
auto&& Atranspd = multi::blas::T(A);
assert( Atranspd[1][2] == A[2][1] );
multi::array<complex, 2> Atransp = multi::blas::transposed(A);
assert( Atransp[1][2] == A[2][1] );
assert( Atransp == Atranspd );
auto&& Aconjdtranspd = multi::blas::C(A); (void)Aconjdtranspd;
assert( Aconjdtranspd[1][2] == conj(A[2][1]) );
multi::array<complex, 2> Aconjtransp = multi::blas::conjugated_transposed(A);
assert( Aconjtransp[1][2] == conj(A[2][1]) );
assert( Aconjdtranspd == Aconjtransp );
}
}
#endif
#endif
#endif


@ -1,71 +1,47 @@
#ifdef COMPILATION// -*-indent-tabs-mode:t;c-basic-offset:4;tab-width:4;-*-
$CXX $0 -o $0x `pkg-config --libs blas` -lboost_unit_test_framework&&$0x&&rm $0x;exit
#endif
// -*-indent-tabs-mode:t;c-basic-offset:4;tab-width:4;autowrap:nil;-*-
// © Alfredo A. Correa 2019-2020
#ifndef MULTI_ADAPTORS_BLAS_SCAL_HPP
#define MULTI_ADAPTORS_BLAS_SCAL_HPP
#include "../blas/core.hpp"
#include "../../config/NODISCARD.hpp"
namespace boost{namespace multi{
namespace blas{
namespace boost{
namespace multi::blas{
using core::scal;
template<class X1D, typename Elem = typename std::decay_t<X1D>::element_type>
auto scal(Elem a, X1D&& m)
->decltype(scal(size(m), &a, base(m), stride(m)), std::forward<X1D>(m)){
return scal(size(m), &a, base(m), stride(m)), std::forward<X1D>(m);}
template<class A, class It, class Size>
auto scal_n(A const& a, It first, Size count)
->decltype(scal(count, &a, first.base(), first.stride()), void()){
scal(count, &a, first.base(), first.stride()); }
template<class A, class It1D>
auto scal(A const& a, It1D first, It1D last)
->decltype(blas::scal_n(a, first, last - first)){
return blas::scal_n(a, first, last - first);}
template<class A, class X1D> // don't do this: ", typename Elem = typename X1D::element_type>"
auto scal(A const& a, X1D&& x)
->decltype(blas::scal(a, x.begin(), x.end()), std::forward<X1D>(x)){
return blas::scal(a, x.begin(), x.end()), std::forward<X1D>(x);}
template<class A>
class scal_range{
A alpha_;
public:
using scalar_type = A;
explicit scal_range(A const& alpha) : alpha_{alpha}{}
template<class X1D>
friend auto operator*=(X1D&& x, scal_range const& self)
->decltype(std::forward<X1D>(scal(std::declval<scalar_type const&>(), x))){
return std::forward<X1D>(scal(self.alpha_, x));}
};
template<class A> auto scal(A const& a){return scal_range<A>{a};}
template<class X1D, typename Elem = typename X1D::element_type>
NODISCARD("because last argument is const")
auto scal(Elem a, X1D const& m){
return scal(a, m.decay());
}
}}}
#if not __INCLUDE_LEVEL__ // _TEST_MULTI_ADAPTORS_BLAS_SCAL
#define BOOST_TEST_MODULE "C++ Unit Tests for Multi BLAS scal"
#define BOOST_TEST_DYN_LINK
#include<boost/test/unit_test.hpp>
#include "../../array.hpp"
namespace multi = boost::multi;
namespace blas = multi::blas;
BOOST_AUTO_TEST_CASE(multi_blas_scal_real){
{
multi::array<double, 2> A = {
{1., 2., 3., 4.},
{5., 6., 7., 8.},
{9., 10., 11., 12.}
};
using blas::scal;
auto S = scal(2., rotated(A)[1]);
BOOST_REQUIRE( A[2][1] == 20 );
BOOST_REQUIRE( S[0] == 4 );
}
{
multi::array<double, 2> const A = {
{1., 2., 3., 4.},
{5., 6., 7., 8.},
{9., 10., 11., 12.}
};
using multi::blas::scal;
auto rA1_scaled = scal(2., A[1]);
BOOST_REQUIRE( size(rA1_scaled) == 4 );
BOOST_REQUIRE( rA1_scaled[1] == 12 );
}
}
#endif
#endif


@ -1,7 +1,7 @@
#ifdef COMPILATION// -*-indent-tabs-mode:t;c-basic-offset:4;tab-width:4;-*-
$CXX $0 -o $0x `pkg-config --libs blas` -lboost_unit_test_framework&&$0x&&rm $0x;exit
$CXXX $CXXFLAGS $0 -o $0.$X `pkg-config --libs blas` -lboost_unit_test_framework&&$0.$X&&rm $0.$X;exit
#endif
// © Alfredo A. Correa 2019
// © Alfredo A. Correa 2019-2020
#ifndef MULTI_ADAPTORS_BLAS_SIDE_HPP
#define MULTI_ADAPTORS_BLAS_SIDE_HPP
@ -14,11 +14,13 @@ namespace boost{
namespace multi{
namespace blas{
enum class SIDE : char{L='L', R='R'};
//enum class SIDE : char{L='L', R='R'};
enum side : char{
left = static_cast<char>(SIDE::R), right = static_cast<char>(SIDE::L),
pre_multiply = static_cast<char>(SIDE::R), post_multiply = static_cast<char>(SIDE::L)
left = 'L',
right = 'R'//,
// pre_multiply = 'R',
// post_multiply = 'L'
};
side swap(side s){
@ -34,37 +36,12 @@ side swap(side s){
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
#if not __INCLUDE_LEVEL__ // _TEST_MULTI_ADAPTORS_BLAS_SIDE
#if defined(__INCLUDE_LEVEL__) and not __INCLUDE_LEVEL__
#define BOOST_TEST_MODULE "C++ Unit Tests for Multi BLAS adaptors side"
#define BOOST_TEST_DYN_LINK
#include<boost/test/unit_test.hpp>
#include "../../array.hpp"
#include "../../utility.hpp"
#include "../blas/nrm2.hpp"
#include<complex>
#include<cassert>
#include<iostream>
#include<numeric>
#include<algorithm>
using std::cout;
template<class M>
decltype(auto) print(M const& C){
using boost::multi::size;
for(int i = 0; i != size(C); ++i){
for(int j = 0; j != size(C[i]); ++j) cout<< C[i][j] <<' ';
cout<<std::endl;
}
return cout<<"---"<<std::endl;
}
namespace multi = boost::multi;
using complex = std::complex<double>; constexpr complex I{0, 1};
BOOST_AUTO_TEST_CASE(multi_adaptors_blas_side){
return;
}


@ -0,0 +1,95 @@
# -*-indent-tabs-mode:nil;c-basic-offset:2;tab-width:4;autowrap:nil;-*-
#[=[Multi Test suite can be run like this:
mkdir -p build
cd build
cmake .. [-DENABLE_CUDA=1]
make -j
ctest -j --output-on-error [-T memcheck]
exit
#]=]
cmake_minimum_required(VERSION 3.11)
set(CMAKE_VERBOSE_MAKEFILE ON)
project(boost-multi-adaptors-blas-test VERSION 0.1 LANGUAGES CXX)
find_package(Boost REQUIRED COMPONENTS unit_test_framework)
find_package(BLAS REQUIRED)
find_path(BLAS_INCLUDE_DIRS cblas.h
/usr/include
/usr/local/include
$ENV{BLAS_HOME}/include)
link_libraries(${BLAS_LIBRARIES})
include_directories(${BLAS_INCLUDE_DIRS}) # include_directories takes no target/PRIVATE arguments; that is target_include_directories
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)
if(ENABLE_CUDA OR DEFINED CXXCUDA)
enable_language(CUDA)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr -Xcudafe \"--diag_suppress=implicit_return_from_non_void_function\"")
endif()
find_package(CUDA QUIET)
if (CUDA_FOUND)
message("CUDA found")
include_directories(${CUDA_INCLUDE_DIRS})
else()
message("CUDA not found")
endif()
enable_testing()
list(APPEND CMAKE_CTEST_ARGUMENTS "--output-on-failure") # needs cmake 3.17
include(CTest)
configure_file("config.hpp.in" ${CMAKE_BINARY_DIR}/config.hpp)
include_directories(${CMAKE_BINARY_DIR})
#file(GLOB TEST_SRCS RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} *.cpp)
set(TEST_SRCS
axpy.cpp
copy.cpp
dot.cpp
herk.cpp
gemv.cpp
gemm.cpp
numeric.cpp
scal.cpp
traits.cpp
trsm.cpp
)
foreach(TEST_FILE ${TEST_SRCS})
SET(TEST_EXE "${TEST_FILE}.x")
add_executable (${TEST_EXE} ${TEST_FILE})
if(ENABLE_CUDA OR DEFINED CXXCUDA)
set_source_files_properties(${TEST_FILE} PROPERTIES LANGUAGE CUDA)
target_compile_options (${TEST_EXE} PRIVATE -std=c++17)
endif()
# target_compile_features (${TEST_EXE} PUBLIC cxx_std_17)
target_compile_definitions(${TEST_EXE} PRIVATE "BOOST_PP_VARIADICS")
target_compile_definitions(${TEST_EXE} PRIVATE ${Boost_DEFINITIONS})
target_include_directories(${TEST_EXE} PRIVATE ${Boost_INCLUDE_DIRS})
target_link_libraries (${TEST_EXE} PRIVATE ${Boost_LIBRARIES})
target_link_directories (${TEST_EXE} PRIVATE ${Boost_LIBRARY_DIRS})
if(NOT ENABLE_CUDA)
target_compile_options (${TEST_EXE} PRIVATE
-Werror -Wall -Wextra -fno-common
$<$<CXX_COMPILER_ID:GNU>:
-Wpedantic -Wformat-truncation -fstack-usage>#-Wconversion
$<$<OR:$<CXX_COMPILER_ID:Clang>,$<CXX_COMPILER_ID:AppleClang>>:
-Wpedantic -Wmove>
$<$<CXX_COMPILER_ID:Intel>:
-wd161 -diag-disable=remark -Warray-bounds -Wchar-subscripts -Wcomment -Wenum-compare -Wformat -Wuninitialized -Wmaybe-uninitialized -Wmain -Wnarrowing -Wnonnull -Wparentheses -Wpointer-sign -Wreorder -Wno-return-type -Wsign-compare -Wsequence-point -Wtrigraphs -Wunused-function -Wunused-but-set-variable -Wunused-variable -Wwrite-strings -Werror -diag-error:3846
>
$<$<CXX_COMPILER_ID:MSVC>:
/W4>)
endif()
add_test(NAME ${TEST_EXE} COMMAND ./${TEST_EXE})
endforeach()


@ -1,9 +1,9 @@
#ifdef COMPILATION_INSTRUCTIONS
$CXX -Wall -Wextra -Wpedantic $0 -o $0x `pkg-config --libs blas` -lcudart -lcublas -lboost_unit_test_framework&&$0x&&rm $0x;exit
#ifdef COMPILATION// -*-indent-tabs-mode:t;c-basic-offset:4;tab-width:4;-*-
$CXX $0 -o $0x `pkg-config --libs blas` -lcudart -lcublas -lboost_unit_test_framework&&$0x&&rm $0x;exit
#endif
// © Alfredo A. Correa 2019
// © Alfredo A. Correa 2019-2020
#define BOOST_TEST_MODULE "C++ Unit Tests for Multi BLAS axpy"
#define BOOST_TEST_MODULE "C++ Unit Tests for Multi BLAS asum"
#define BOOST_TEST_DYN_LINK
#include<boost/test/unit_test.hpp>
@ -49,16 +49,18 @@ BOOST_AUTO_TEST_CASE(multi_blas_asum_double_cuda){
BOOST_REQUIRE(asum(A[1]) == 26 );
}
using complex = std::complex<double>; constexpr complex I{0, 1};
BOOST_AUTO_TEST_CASE(multi_blas_asum_complex_cuda){
using Z = std::complex<double>; Z const I{0, 1};
multi::cuda::array<Z, 2> const A = {
namespace blas = multi::blas;
multi::cuda::array<complex, 2> const A = {
{1. + 2.*I, 2., 3., 4.},
{5., 6. + 3.*I, 7., 8.},
{9., 10., 11.+ 4.*I, 12.}
};
using multi::blas::asum;
BOOST_REQUIRE( asum(A[1]) == Z{29.} );
BOOST_REQUIRE( asum(A[1]({0, 4})) == Z{29.} );
BOOST_REQUIRE( blas::asum(A[1]) == 29. );
BOOST_REQUIRE( blas::asum(A[1]({0, 4})) == 29. );
}
BOOST_AUTO_TEST_CASE(multi_blas_asum_complex_cuda_mutable){


@ -0,0 +1,150 @@
// -*-indent-tabs-mode:t;c-basic-offset:4;tab-width:4;-*-
// © Alfredo A. Correa 2019-2020
#define BOOST_TEST_MODULE "C++ Unit Tests for Multi BLAS axpy"
#define BOOST_TEST_DYN_LINK
#include<boost/test/unit_test.hpp>
#include "config.hpp"
#include "../../../array.hpp"
#include "../../blas.hpp"
#include<complex>
namespace multi = boost::multi;
namespace blas = multi::blas;
BOOST_AUTO_TEST_CASE(multi_blas_axpy_real){
multi::array<double, 2> A = {
{1., 2., 3., 4.},
{5., 6., 7., 8.},
{9., 10., 11., 12.}
};
auto const AC = A;
multi::array<double, 1> const B = A[2];
blas::axpy(2., B, A[1]); // daxpy
BOOST_REQUIRE( A[1][2] == 2.*B[2] + AC[1][2] );
}
BOOST_AUTO_TEST_CASE(multi_blas_axpy_double){
multi::array<double, 2> const cA = {
{1., 2., 3., 4.},
{5., 6., 7., 8.},
{9., 10., 11., 12.}
};
multi::array<double, 2> A = cA;
multi::array<double, 1> const b = cA[2];
blas::axpy(2., b, A[1]); // A[1] = 2.*b + A[1], i.e. A[1] += a*b
BOOST_REQUIRE( A[1][2] == 2.*b[2] + cA[1][2] );
using complex = std::complex<double>; complex const I = {0, 1};
multi::array<complex, 1> AC = {1. + 2.*I, 3. + 4.*I, 4. - 8.*I};
multi::array<complex, 1> BC(size(AC), complex{0.});
blas::axpy(+1., blas::real(AC), blas::real(BC));
blas::axpy(-1., blas::imag(AC), blas::imag(BC));
BOOST_REQUIRE( BC[2] == std::conj(AC[2]) );
}
BOOST_AUTO_TEST_CASE(multi_blas_axpy_complex){
{
using complex = std::complex<double>;
multi::array<complex, 2> A = {
{1., 2., 3., 4.},
{5., 6., 7., 8.},
{9., 10., 11., 12.}
};
auto const AC = A;
multi::array<complex, 1> const B = A[2];
blas::axpy(2., B, A[1]); // zaxpy (2. is promoted to 2+I*0 internally and automatically)
BOOST_REQUIRE( A[1][2] == 2.*B[2] + AC[1][2] );
}
}
BOOST_AUTO_TEST_CASE(multi_blas_axpy_complex_as_operator_plus_equal){
using complex = std::complex<double>;
multi::array<complex, 2> A = {
{1., 2., 3., 4.},
{5., 6., 7., 8.},
{9., 10., 11., 12.}
};
auto const AC = A;
multi::array<complex, 1> const B = A[2];
A[1] += blas::axpy(2., B); // zaxpy (2. is promoted to 2+I*0 internally and automatically)
BOOST_REQUIRE( A[1][2] == 2.*B[2] + AC[1][2] );
}
BOOST_AUTO_TEST_CASE(multi_blas_axpy_complex_as_operator_minus_equal){
using complex = std::complex<double>;
multi::array<complex, 2> A = {
{1., 2., 3., 4.},
{5., 6., 7., 8.},
{9., 10., 11., 12.}
};
auto const AC = A;
multi::array<complex, 1> const B = A[2];
A[1] -= blas::axpy(2., B); // zaxpy (2. is promoted to 2+I*0 internally and automatically)
BOOST_REQUIRE( A[1][2] == -2.*B[2] + AC[1][2] );
}
BOOST_AUTO_TEST_CASE(multi_blas_axpy_complex_context){
using complex = std::complex<double>;
multi::array<complex, 2> A = {
{1., 2., 3., 4.},
{5., 6., 7., 8.},
{9., 10., 11., 12.}
};
auto const AC = A;
multi::array<complex, 1> const B = A[2];
blas::axpy(blas::context{}, 2., B, A[1]); // zaxpy (2. is promoted to 2+I*0 internally and automatically)
BOOST_REQUIRE( A[1][2] == 2.*B[2] + AC[1][2] );
}
BOOST_AUTO_TEST_CASE(multi_blas_axpy_operator_minus){
using complex = std::complex<double>;
multi::array<complex, 1> x = {10., 11., 12., 13.};
multi::array<complex, 1> y = x;
using blas::operators::operator-;
using blas::operators::operator+;
using blas::operators::operator-=;
BOOST_REQUIRE( (x - y)[0] == 0. );
BOOST_REQUIRE( (y - x)[0] == 0. );
BOOST_REQUIRE( (x - (y+y))[0] == -x[0] );
BOOST_REQUIRE( ((x+x) - y)[0] == +x[0] );
multi::array<complex, 2> A = {{1., 2.}, {3., 4.}};
multi::array<complex, 1> B = {1., 2.};
BOOST_REQUIRE( (A[0] - B)[0] == 0. );
BOOST_REQUIRE( (A[0] - B)[1] == 0. );
multi::array<complex, 1> X = {10., 11., 12., 13.};
multi::array<complex, 1> Y = {10., 11., 12., 13.};
X -= Y;
BOOST_REQUIRE( X[0] == 0. );
}
#if CUDA_FOUND
#include<thrust/complex.h>
BOOST_AUTO_TEST_CASE(multi_blas_axpy_complex_thrust){
{
using complex = thrust::complex<double>;
multi::array<complex, 2> A = {
{1., 2., 3., 4.},
{5., 6., 7., 8.},
{9., 10., 11., 12.}
};
auto const AC = A;
multi::array<complex, 1> const B = A[2];
blas::axpy(2., B, A[1]); // zaxpy (2. is promoted to 2+I*0 internally and automatically)
BOOST_REQUIRE( A[1][2] == 2.*B[2] + AC[1][2] );
}
}
#endif


@ -0,0 +1,7 @@
#ifndef MULTI_ADAPTORS_BLAS_TEST_CONFIG_HPP
#define MULTI_ADAPTORS_BLAS_TEST_CONFIG_HPP
#cmakedefine01 CUDA_FOUND
#endif


@ -0,0 +1,159 @@
// -*-indent-tabs-mode:t;c-basic-offset:4;tab-width:4;-*-
#include "../../blas.hpp"
#include "../../../array.hpp"
#include<complex>
#include "config.hpp"
#define BOOST_TEST_MODULE "C++ Unit Tests for Multi BLAS copy"
#define BOOST_TEST_DYN_LINK
#include<boost/test/unit_test.hpp>
namespace multi = boost::multi;
namespace blas = multi::blas;
BOOST_AUTO_TEST_CASE(multi_blas_copy_n){
multi::array<double, 1> const A = {1., 2., 3., 4.};
multi::array<double, 1> B = {5., 6., 7., 8.};
blas::copy_n(A.begin(), A.size(), B.begin());
BOOST_REQUIRE( B == A );
}
BOOST_AUTO_TEST_CASE(multi_blas_copy_it){
multi::array<double, 1> const A = {1., 2., 3., 4.};
multi::array<double, 1> B = {5., 6., 7., 8.};
blas::copy(A.begin(), A.end(), B.begin());
BOOST_REQUIRE( B == A );
}
BOOST_AUTO_TEST_CASE(multi_blas_copy){
multi::array<double, 1> const A = {1., 2., 3., 4.};
{
multi::array<double, 1> B = {5., 6., 7., 8.};
blas::copy(A, B); // segmentation fault in clang-11
BOOST_REQUIRE( B == A );
}
{
multi::array<double, 1> B = {5., 6., 7., 8.};
BOOST_REQUIRE( size(B) == size(A) );
B = blas::copy(A);
BOOST_REQUIRE( B == A );
}
}
BOOST_AUTO_TEST_CASE(multi_adaptors_blas_test_copy_real){
namespace blas = multi::blas;
multi::array<double, 2> A = {
{1., 2., 3., 4.},
{5., 6., 7., 8.},
{9., 10., 11., 12.}
};
BOOST_REQUIRE( A[0][2] == 3. );
BOOST_REQUIRE( A[2][2] == 11. );
blas::copy(A[0], A[2]);
BOOST_REQUIRE( A[0][2] == 3. );
BOOST_REQUIRE( A[2][2] == 3. );
// multi::blas::copy(begin(A[1]), end(A[1]), begin(A[2])); // dcopy
blas::copy( A[1]({0, size(A[1])}), A[2]({0, size(A[1])}) );
BOOST_REQUIRE( A[1][3] == 8. );
BOOST_REQUIRE( A[2][3] == 8. );
multi::array<double, 1> AR3 = blas::copy(rotated(A)[3]); // dcopy
BOOST_REQUIRE( AR3[1] == A[1][3] );
}
BOOST_AUTO_TEST_CASE(multi_blas_copy_row){
multi::array<double, 2> const A = {
{1., 2., 3.},
{4., 5., 6.},
{7., 8., 9.}
};
multi::array<double, 1> B(3);
blas::copy(rotated(A)[0], B);
BOOST_REQUIRE( B == rotated(A)[0] );
}
BOOST_AUTO_TEST_CASE(multi_adaptors_blas_test_copy_complex){
using complex = std::complex<double>; constexpr complex I{0, 1};
multi::array<complex, 2> A = {
{1. + 3.*I, 2. + 4.*I, 3. + 5.*I, 4. + 6.*I},
{5., 6., 7., 8.},
{9., 10., 11., 12.}
};
blas::copy(A[0], A[2]);
BOOST_REQUIRE( A[0][2] == 3. + 5.*I );
}
BOOST_AUTO_TEST_CASE(multi_blas_copy_context){
multi::array<double, 1> const A = {1., 2., 3., 4.};
blas::context ctx;
{
multi::array<double, 1> B = {5., 6., 7., 8.};
blas::copy(ctx, A, B);
BOOST_REQUIRE( A == B );
}
{
multi::array<double, 1> B = {5., 6., 7., 8.};
BOOST_REQUIRE( size(B) == size(A) );
B = blas::copy(ctx, A);
BOOST_REQUIRE( A == B );
}
}
#if CUDA_FOUND
#include<thrust/complex.h>
BOOST_AUTO_TEST_CASE(multi_adaptors_blas_copy_thrust){
multi::array<thrust::complex<double>, 1> const a(10, thrust::complex<double>{});
multi::array<thrust::complex<double>, 1> b(10);
blas::copy(a, b);
BOOST_REQUIRE( a == b );
}
BOOST_AUTO_TEST_CASE(multi_adaptors_blas_text_copy_interop){
static_assert( std::is_convertible<std::complex<double>, thrust::complex<double>>{} );
static_assert( std::is_convertible<thrust::complex<double>, std::complex<double>>{} );
multi::array<std::complex<double>, 1> a(10, std::complex<double>{});
multi::array<thrust::complex<double>, 1> b(10);
blas::copy(a, b);
BOOST_REQUIRE( a == b );
}
#endif
//BOOST_AUTO_TEST_CASE(multi_adaptors_blas_test_copy_cuda_complex){
// namespace cuda = multi::cuda;
// cuda::array<complex, 2> A = {
// {1. + 3.*I, 2. + 4.*I, 3. + 5.*I, 4. + 6.*I},
// {5., 6., 7., 8.},
// {9., 10., 11., 12.}
// };
// blas::copy(A[0], A[2]);
// BOOST_REQUIRE( A[0][2] == 3. + 5.*I );
// BOOST_REQUIRE( A[2][2] == 3. + 5.*I );
//}
//BOOST_AUTO_TEST_CASE(multi_adaptors_blas_test_copy_cuda_managed_complex){
// namespace cuda = multi::cuda;
// namespace blas = multi::blas;
// cuda::managed::array<complex, 2> A = {
// {1. + 3.*I, 2. + 4.*I, 3. + 5.*I, 4. + 6.*I},
// {5., 6., 7., 8.},
// {9., 10., 11., 12.}
// };
// blas::copy(A[0], A[2]);
// BOOST_REQUIRE( A[0][2] == 3. + 5.*I );
// BOOST_REQUIRE( A[2][2] == 3. + 5.*I );
//}

View File

@ -0,0 +1,403 @@
// -*-indent-tabs-mode:t;c-basic-offset:4;tab-width:4;-*-
// © Alfredo A. Correa 2019-2020
#define BOOST_TEST_MODULE "C++ Unit Tests for Multi cuBLAS dot"
#define BOOST_TEST_DYN_LINK
#include<boost/test/unit_test.hpp>
#include "config.hpp"
#include "../../blas/dot.hpp"
#include "../../../array.hpp"
//#include "../../blas/cuda.hpp"
//#include "../../../adaptors/cuda.hpp"
#include<cassert>
#include<cmath> // NAN
#include<complex>
#include<limits> // std::numeric_limits
#include<numeric>
#include<type_traits>
namespace multi = boost::multi;
namespace blas = multi::blas;
BOOST_AUTO_TEST_CASE(blas_dot_context){
multi::array<float, 1> const A = {1.,2.,3.};
multi::array<float, 1> const B = {1.,2.,3.};
blas::context ctxt;
auto C = +blas::dot(ctxt, A, B);
BOOST_REQUIRE( C == std::inner_product(begin(A), end(A), begin(B), 0.F) );
}
BOOST_AUTO_TEST_CASE(blas_dot_no_context){
multi::array<float, 1> const A = {1.,2.,3.};
multi::array<float, 1> const B = {1.,2.,3.};
auto C = +blas::dot(A, B);
BOOST_REQUIRE( C == std::inner_product(begin(A), end(A), begin(B), 0.F) );
}
BOOST_AUTO_TEST_CASE(blas_dot_no_context_out_param){
multi::array<float, 1> const A = {1.,2.,3.};
multi::array<float, 1> const B = {1.,2.,3.};
float C = NAN;
blas::dot(A, B, C);
BOOST_REQUIRE( C == std::inner_product(begin(A), end(A), begin(B), 0.F) );
}
BOOST_AUTO_TEST_CASE(blas_dot_no_context_out_param_complex){
using complex = std::complex<double>;
multi::array<complex, 1> const A = {1.,2.,3.};
multi::array<complex, 1> const B = {1.,2.,3.};
complex C;
blas::dot(A, B, C);
BOOST_REQUIRE( C == std::inner_product(begin(A), end(A), begin(B), complex{0.}, std::plus<>{}, [](auto const& a, auto const& b){return a*std::conj(b);}) );
}
BOOST_AUTO_TEST_CASE(blas_dot_no_context_out_param_complex_C){
using complex = std::complex<double>; complex const I{0., 1.};
multi::array<complex, 1> const A = {1.,2., 3.};
multi::array<complex, 1> const B = {1.,2. + 2.*I, 3.};
complex C;
blas::dot(blas::C(A), B, C);
BOOST_REQUIRE( C == std::inner_product(begin(A), end(A), begin(B), complex{0.}, std::plus<>{}, [](auto const& a, auto const& b){return conj(a)*b;}) );
}
#if defined(CUDA_FOUND) and CUDA_FOUND
#include<thrust/complex.h>
BOOST_AUTO_TEST_CASE(blas_dot_no_context_out_param_complex_C_thrust){
using complex = thrust::complex<double>; complex const I{0., 1.};
multi::array<complex, 1> const A = {1.,2., 3.};
multi::array<complex, 1> const B = {1.,2. + 2.*I, 3.};
complex C;
blas::dot(blas::C(A), B, C);
BOOST_REQUIRE( C == std::inner_product(begin(A), end(A), begin(B), complex{0.}, std::plus<>{}, [](auto& a, auto& b){return conj(a)*b;}) );
}
#endif
BOOST_AUTO_TEST_CASE(blas_dot){
// multi::array<float, 1> const A = {1.,2.,3.};
// multi::array<float, 1> const B = {1.,2.,3.};
// {
// float f = blas::dot(A, B); // uses cast operator decay
// BOOST_REQUIRE( f == std::inner_product(begin(A), end(A), begin(B), 0.f) );
// }
// {
// float f2;
// *multi::array_ptr<float, 0>(&f2, {}) = blas::dot(A, B); // uses custom copy
// BOOST_REQUIRE( f2 == std::inner_product(begin(A), end(A), begin(B), 0.f) );
// }
// {
// multi::array<float, 0> F = blas::dot(A, B);
// BOOST_REQUIRE( F() == std::inner_product(begin(A), end(A), begin(B), 0.f) );
// }
// using complex = std::complex<double>; complex const I{0, 1};
// {
// multi::array<complex, 1> const A = {I, 2.*I, 3.*I};
// BOOST_TEST( blas::dot(A, A).decay() == std::inner_product(begin(A), end(A), begin(A), complex{0.}) );
// }
// {
// multi::array<complex, 1> const A = {I, 1. + 2.*I, 3.*I};
// multi::array<complex, 1> const B = {I, 1. + 2.*I, 3.*I};
// BOOST_TEST( blas::dot(A, B).decay() == std::inner_product(begin(A), end(A), begin(B), complex{0.}, std::plus<>{}, [](auto&& a, auto&& b){return a*b;}) );
//// BOOST_REQUIRE(
//// std::inner_product(begin(A), end(A), begin( B ), std::complex<double>{0.}, std::plus<>{}, [](auto&& a, auto&& b){return a*std::conj(b);})
//// ==
//// std::inner_product(begin(A), end(A), begin(blas::C(B)), std::complex<double>{0.}, std::plus<>{}, [](auto&& a, auto&& b){return a*b;})
//// );
// BOOST_REQUIRE( blas::dot(A, blas::C(B)).decay() == std::inner_product(begin(A), end(A), begin(B), complex{0.}, std::plus<>{}, [](auto&& a, auto&& b){return a*std::conj(b);}) );
// }
// {
// multi::array<complex, 1> const a = {1. + I, 2. + 3.*I, 3. + 2.*I, 4. - 9.*I};
// multi::array<complex, 1> const b = {5. + 2.*I, 6. + 6.*I, 7. + 2.*I, 8. - 3.*I};
// {
// multi::array<complex, 0> c({}, complex{});
// blas::dot(a, b, c);
// BOOST_TEST( c() == 19. - 27.*I );
// }
// }
// {
// cuda::array<complex, 1> const acu = {1. + I, 2. + 3.*I, 3. + 2.*I, 4. - 9.*I};
// cuda::array<complex, 1> const bcu = {5. + 2.*I, 6. + 6.*I, 7. + 2.*I, 8. - 3.*I};
// {
// cuda::array<complex, 0> ccu;
// blas::dot(acu, bcu, ccu);
// BOOST_REQUIRE( ccu() == 19. - 27.*I );
// }
// BOOST_REQUIRE( blas::C(bcu)[1] == 6. - 6.*I );
// {
// cuda::array<complex, 0> ccu;
// static_assert( multi::blas::is_complex_array<multi::array<complex, 1>>{}, "!" );
// static_assert( multi::blas::is_complex_array<cuda::array<complex, 1>>{}, "!" );
// blas::dot(acu, blas::C(bcu), ccu);
// BOOST_REQUIRE( ccu() == 121. - 43.*I );
// }
// {
// auto const ccu = blas::dot(acu, blas::C(bcu));
// BOOST_REQUIRE( ccu() == 121. - 43.*I );
// }
// {
// cuda::array<complex, 1> ccu = {1, 2, 3};
// blas::dot(acu, blas::C(bcu), ccu[0]);
// BOOST_REQUIRE( ccu[0] == 121. - 43.*I );
// }
// {
// cuda::array<complex, 2> ccu({1, 1});
// blas::dot(acu, blas::C(bcu), ccu[0][0]);
// BOOST_REQUIRE( ccu[0][0] == 121. - 43.*I );
// }
// }
// {
// namespace cuda = multi::cuda;
// cuda::managed::array<complex, 1> const amcu = {1. + I, 2. + 3.*I, 3. + 2.*I, 4. - 9.*I};
// cuda::managed::array<complex, 1> const bmcu = {5. + 2.*I, 6. + 6.*I, 7. + 2.*I, 8. - 3.*I};
// {
// cuda::managed::array<complex, 0> cmcu;
// blas::dot(amcu, bmcu, cmcu);
// BOOST_REQUIRE( cmcu() == 19.- I*27. );
// }
// {
// cuda::array<complex, 1> cmcu = {1, 2, 3};
// blas::dot(amcu, blas::C(bmcu), cmcu[0]);
// BOOST_REQUIRE( cmcu[0] == complex(121., -43.) );
// }
// }
// {
// using complex = std::complex<double>; complex const I{0, 1};
// cuda::array<complex, 1> const acu = {1. + I, 2. + 3.*I, 3. + 2.*I, 4. - 9.*I};
// cuda::array<complex, 1> const bcu = {5. + 2.*I, 6. + 6.*I, 7. + 2.*I, 8. - 3.*I};
// {
// cuda::array<complex, 0> ccu;
// blas::dot(acu, bcu, ccu);
// BOOST_REQUIRE( ccu() == 19. - 27.*I );
// }
// }
// {
// using complex = thrust::complex<double>; complex const I{0, 1};
// cuda::managed::array<complex, 1> const acu = {1. + I, 2. + 3.*I, 3. + 2.*I, 4. - 9.*I};
// cuda::managed::array<complex, 1> const bcu = {5. + 2.*I, 6. + 6.*I, 7. + 2.*I, 8. - 3.*I};
// {
// cuda::managed::array<complex, 0> ccu;
// blas::dot(acu, bcu, ccu);
// BOOST_REQUIRE( ccu() == 19. - 27.*I );
// }
// }
}
BOOST_AUTO_TEST_CASE(multi_blas_dot_strided){
multi::array<double, 2> const CA = {
{1., 2., 3., 4.},
{5., 6., 7., 8.},
{9., 10., 11., 12.}
};
double d = std::numeric_limits<double>::quiet_NaN();
blas::dot_n(begin(CA[1]), size(CA[1]), begin(CA[2]), &d);
BOOST_REQUIRE( d == std::inner_product(begin(CA[1]), end(CA[1]), begin(CA[2]), 0.) );
double d2 = blas::dot(CA[1], CA[2]);
BOOST_REQUIRE( d == d2 );
}
BOOST_AUTO_TEST_CASE(multi_blas_dot_strided_context){
multi::array<double, 2> const CA = {
{1., 2., 3., 4.},
{5., 6., 7., 8.},
{9., 10., 11., 12.}
};
double d = std::numeric_limits<double>::quiet_NaN();
blas::dot_n(blas::context{}, begin(CA[1]), size(CA[1]), begin(CA[2]), &d);
BOOST_REQUIRE( d == std::inner_product(begin(CA[1]), end(CA[1]), begin(CA[2]), 0.) );
double d2 = blas::dot(CA[1], CA[2]);
BOOST_REQUIRE( d == d2 );
}
BOOST_AUTO_TEST_CASE(multi_blas_dot_1d_real){
multi::array<float, 1> V = {1., 2., 3.};
multi::array<float, 1> W = {1., 2., 3.};
using blas::dot;
BOOST_REQUIRE( 14. == dot(V, W) );
BOOST_REQUIRE( dot(V, W) == 14. );
}
BOOST_AUTO_TEST_CASE(multi_blas_dot_impl_real){
multi::array<double, 2> const cA = {
{1., 2., 3., 4.},
{5., 6., 7., 8.},
{9., 10., 11., 12.}
};
{
double d = blas::dot(cA[1], cA[2]);
BOOST_REQUIRE( d == std::inner_product(begin(cA[1]), end(cA[1]), begin(cA[2]), 0.) );
}
{
double d = NAN;
blas::dot(cA[1], cA[2], d);
BOOST_REQUIRE( d == std::inner_product(begin(cA[1]), end(cA[1]), begin(cA[2]), 0.) );
}
{
double d = NAN;
auto d2 = blas::dot(cA[1], cA[2], d);
BOOST_REQUIRE( d==d2 );
}
// {
// multi::array<double, 0> d;
// auto d2 = blas::dot(cA[1], cA[2], d);
// BOOST_REQUIRE( d == std::inner_product(begin(cA[1]), begin(cA[2]), end(cA[1]), 0.) );
// }
{
double d = blas::dot(cA[1], cA[2]);
BOOST_REQUIRE( d == std::inner_product(begin(cA[1]), end(cA[1]), begin(cA[2]), 0.) );
BOOST_REQUIRE( blas::dot(cA[1], cA[2]) == blas::dot(cA[2], cA[1]) );
}
// {
// double s;
// blas::dot(cA[1], cA[1], s);
// BOOST_REQUIRE( std::sqrt(s)==blas::nrm2(cA[1]) );
// }
{
// auto d1 = blas::dot(cA[1], cA[1]);
// auto d2 = blas::dot(blas::conj(cA[1]), cA[1]);
}
}
BOOST_AUTO_TEST_CASE(inq_case){
multi::array<double, 1> v1(10, +1.0);
multi::array<double, 1> v2(10, -1.0);
using blas::dot;
using blas::hermitized;
using blas::conj;
auto a = dot(v1, v2);
auto b = dot(hermitized(v1), v2);
BOOST_REQUIRE(a == b);
auto c = dot(blas::conj(v1), v2); // conjugation doesn't do anything for real array
BOOST_REQUIRE(c == a);
auto d_arr = dot(blas::C(v1), v2);
BOOST_REQUIRE(d_arr == a);
static_assert( not std::is_same<decltype(d_arr), double>{}, "!" );
using blas::C;
double d_doub = dot(C(v1), v2);
BOOST_REQUIRE( d_doub == d_arr );
}
//BOOST_AUTO_TEST_CASE(multi_blas_dot_impl_complex){
// namespace blas = multi::blas;
// using complex = std::complex<double>; complex const I{0, 1};
// multi::array<complex, 2> const A = {
// {1. + I, 2. + 3.*I, 3.+2.*I, 4.-9.*I},
// {5. + 2.*I, 6. + 6.*I, 7.+2.*I, 8.-3.*I},
// {9. + 1.*I, 10. + 9.*I, 11.+1.*I, 12.+2.*I}
// };
// {
// complex c; blas::dot(A[1], A[2], c);
// BOOST_TEST_REQUIRE( c == std::inner_product(begin(A[1]), end(A[1]), begin(A[2]), complex{0}) );
// }
// {
// complex c = blas::dot(A[1], A[2]);
// BOOST_TEST_REQUIRE( c == std::inner_product(begin(A[1]), end(A[1]), begin(A[2]), complex{0}) );
// }
// {
// complex c = blas::dot(A[1], blas::C(A[2]));
// BOOST_TEST_REQUIRE( c == std::inner_product(begin(A[1]), end(A[1]), begin(A[2]), complex{0}, std::plus<>{}, [](auto a, auto b){return a*conj(b);}) );
// }
// {
// complex c = blas::dot(blas::C(A[1]), A[2]);
// BOOST_TEST_REQUIRE( c == inner_product(begin(A[1]), end(A[1]), begin(A[2]), complex{}, std::plus<>{}, [](auto a, auto b){return conj(a)*b;}) );
// }
// {
// complex c = blas::dot(blas::conj(A[1]), A[2]);
// BOOST_TEST_REQUIRE( c == inner_product(begin(A[1]), end(A[1]), begin(A[2]), complex{}, std::plus<>{}, [](auto a, auto b){return conj(a)*b;}) );
// }
//// {
//// complex c = blas::dot(blas::C(A[1]), blas::C(A[2]));
//// BOOST_TEST_REQUIRE( c == inner_product(begin(A[1]), end(A[1]), begin(A[2]), complex{}, std::plus<>{}, [](auto a, auto b){return conj(a)*conj(b);}) );
//// }
// {
// complex c = blas::dot(blas::C(A[1]), A[2]);
// BOOST_TEST_REQUIRE( c == std::inner_product(begin(A[1]), end(A[1]), begin(A[2]), complex{0}, std::plus<>{}, [](auto a, auto b){return conj(a)*b;}) );
// }
//// {
//// complex c = blas::dot(blas::C(A[1]), blas::C(A[2]));
//// BOOST_TEST_REQUIRE( c == std::inner_product(begin(A[1]), end(A[1]), begin(A[2]), complex{0}, std::plus<>{}, [](auto a, auto b){return conj(a)*conj(b);}) );
//// }
//}
#include "config.hpp" // cuda found
#if defined(CUDA_FOUND) and CUDA_FOUND
#include<thrust/complex.h>
BOOST_AUTO_TEST_CASE(multi_blas_dot_impl_complex_thrust){
namespace blas = multi::blas;
using complex = std::complex<double>; complex const I{0, 1};
multi::array<complex, 2> const A = {
{1. + I, 2. + 3.*I, 3.+2.*I, 4.-9.*I},
{5. + 2.*I, 6. + 6.*I, 7.+2.*I, 8.-3.*I},
{9. + 1.*I, 10. + 9.*I, 11.+1.*I, 12.+2.*I}
};
{
complex c;
blas::core::dotu(size(A[1]), A[1].base(), A[1].stride(), A[2].base(), A[2].stride(), &c);
auto inner = std::inner_product(begin(A[1]), end(A[1]), begin(A[2]), complex{0.});
BOOST_REQUIRE( c.real() == inner.real() );
BOOST_REQUIRE( c.imag() == inner.imag() );
}
{
complex c;
blas::context::dotu(size(A[1]), A[1].base(), A[1].stride(), A[2].base(), A[2].stride(), &c);
auto inner = std::inner_product(begin(A[1]), end(A[1]), begin(A[2]), complex{0.});
BOOST_REQUIRE( c.real() == inner.real() );
BOOST_REQUIRE( c.imag() == inner.imag() );
}
{
complex c;
blas::dot_n(begin(A[1]), size(A[1]), begin(A[2]), &c);
auto inner = std::inner_product(begin(A[1]), end(A[1]), begin(A[2]), complex{0.});
BOOST_REQUIRE( c == inner );
}
{
complex c;
blas::dot(A[1], A[2], c);
auto inner = std::inner_product(begin(A[1]), end(A[1]), begin(A[2]), complex{0.});
BOOST_REQUIRE( c == inner );
}
{
complex c = blas::dot(A[1], A[2]);
auto inner = std::inner_product(begin(A[1]), end(A[1]), begin(A[2]), complex{0.});
BOOST_REQUIRE( c == inner );
}
{
auto inner = std::inner_product(begin(A[1]), end(A[1]), begin(A[2]), complex{0.});
BOOST_REQUIRE( +blas::dot(A[1], A[2]) == inner );
}
{
complex c; blas::dot(A[1], A[2], c);
BOOST_REQUIRE( c == std::inner_product(begin(A[1]), end(A[1]), begin(A[2]), complex{0}) );
}
{
complex c = blas::dot(A[1], A[2]);
BOOST_REQUIRE( c == std::inner_product(begin(A[1]), end(A[1]), begin(A[2]), complex{0}) );
}
{
complex c = blas::dot(A[1], blas::C(A[2]));
BOOST_REQUIRE( c == std::inner_product(begin(A[1]), end(A[1]), begin(A[2]), complex{0}, std::plus<>{}, [](auto a, auto b){return a*conj(b);}) );
}
}
#endif

File diff suppressed because it is too large

View File

@ -0,0 +1,277 @@
// -*-indent-tabs-mode:t;c-basic-offset:4;tab-width:4;-*-
// © Alfredo A. Correa 2020
#define BOOST_TEST_MODULE "C++ Unit Tests for Multi BLAS gemv"
#define BOOST_TEST_DYN_LINK
#include<boost/test/unit_test.hpp>
#include "config.hpp"
#include "../../../adaptors/blas/gemv.hpp"
#include "../../../array.hpp"
#include "../../../utility.hpp"
#include "../../blas/axpy.hpp"
#include "../../blas/dot.hpp"
#include "../../blas/gemm.hpp"
#include "../../blas/nrm2.hpp"
#include<algorithm> // std::transform, std::generate
#include<complex>
#include<numeric> // std::inner_product
#include<random>
namespace multi = boost::multi;
namespace blas = multi::blas;
template<class T> void what(T&&) = delete;
template<class M, class VI, class VO>
void MV(M const& m, VI const& x, VO&& y){
std::transform(
begin(m), end(m), begin(y),
[&x](auto&& row){return std::inner_product(begin(row), end(row), begin(x), 0.);}
);
}
BOOST_AUTO_TEST_CASE(multi_blas_gemv){//, *utf::tolerance(0.0001)){
multi::array<double, 2> const M = {
{ 9., 24., 30., 9.},
{ 4., 10., 12., 7.},
{14., 16., 36., 1.}
};
multi::array<double, 1> const v = {1.1, 2.1, 3.1, 4.1};
{
multi::array<double, 1> w(size(M));
blas::gemv_n(1., begin(M), size(M), begin(v), 0., begin(w));
BOOST_REQUIRE_CLOSE( w[1] , 91.3 , 0.0001 );
BOOST_REQUIRE_CLOSE( w[2] , +blas::dot(M[2], v) , 0.0001 );
}
{
multi::array<double, 1> w(size(M));
multi::array<double, 2> const MT = ~M;
blas::gemv_n(1., begin(~MT), size(~MT), begin(v), 0., begin(w));
BOOST_REQUIRE_CLOSE( w[1] , 91.3 , 0.0001 );
BOOST_REQUIRE_CLOSE( w[2] , +blas::dot(M[2], v), 0.0001 );
}
{
multi::array<double, 1> w(size(M));
auto mv = blas::gemv(1., M, v);
copy_n(mv.begin(), mv.size(), w.begin());
BOOST_REQUIRE_CLOSE( w[1] , 91.3 , 0.00001 );
multi::array<double, 1> w2(size(M));
MV(M, v, w2);
BOOST_REQUIRE_CLOSE( w2[0] , w[0], 0.00001 );
}
{
multi::array<double, 1> w(size(M));
w = blas::gemv(1., M, v);
BOOST_REQUIRE_CLOSE( w[1] , 91.3 , 0.00001 );
}
{
multi::array<double, 1> w = blas::gemv(1., M, v);
BOOST_REQUIRE_CLOSE( w[1] , 91.3 , 0.00001 );
}
{
multi::array<double, 1> w(size(M), 0.);
w += blas::gemv(1., M, v);
BOOST_REQUIRE_CLOSE( w[1] , 91.3 , 0.00001 );
}
{
multi::array<double, 1> w = {4., 5., 6.};
blas::gemv(1.1, M, v, 1., w); // y = a*M*x + b*y
BOOST_REQUIRE_CLOSE( w[1] , 105.43 , 0.00001 );
}
}
BOOST_AUTO_TEST_CASE(multi_blas_gemv_real){//, *utf::tolerance(0.0001)){
namespace blas = multi::blas;
using std::abs;
multi::array<double, 2> const M = {
{ 9., 24., 30., 9.},
{ 4., 10., 12., 7.},
{14., 16., 36., 1.}
};
multi::array<double, 1> const X = {1.1, 2.1, 3.1, 4.1};
{
multi::array<double, 1> Y = {4.,5.,6.};
double const a = 1.1;
double const b = 1.2;
blas::gemv(a, M, X, b, Y); // y = a*M*x + b*y
multi::array<double, 1> const Y3 = {214.02, 106.43, 188.37};
BOOST_REQUIRE( abs(Y[1] - Y3[1]) < 2e-14 );
}
{
auto Y = +blas::gemv(1., M, X);
BOOST_REQUIRE_CLOSE( Y[0] , +blas::dot(M[0], X) , 0.00001 );
BOOST_REQUIRE_CLOSE( Y[1] , +blas::dot(M[1], X) , 0.00001 );
BOOST_REQUIRE_CLOSE( Y[2] , +blas::dot(M[2], X) , 0.00001 );
}
{
multi::array<double, 1> const a = {1., 2., 3.};
multi::array<double, 1> const b = {4., 5., 6.};
multi::array<double, 1> const dot = blas::gemv(1., multi::array<double, 2>({a}), b);
BOOST_REQUIRE( dot[0] == blas::dot(a, b) );
}
{
using blas::operators::operator%;
using blas::operators::operator-;
using blas::operators::operator^;
BOOST_REQUIRE_SMALL( ((~+~M)%X - M%X)^2 , 1e-13 );
}
}
BOOST_AUTO_TEST_CASE(multi_blas_gemv_real_complex){
namespace blas = multi::blas;
using complex = std::complex<double>; //#define I *std::complex<double>(0, 1)
using std::abs;
multi::array<complex, 2> const M = {
{ 9., 24., 30., 9.},
{ 4., 10., 12., 7.},
{14., 16., 36., 1.}
};
multi::array<complex, 1> const X = {1.1, 2.1, 3.1, 4.1};
{
multi::array<complex, 1> Y = {4., 5., 6.};
double const a = 1.1;
double const b = 1.2;
blas::gemv(a, M, X, b, Y); // y = a*M*x + b*y
multi::array<complex, 1> const Y3 = {214.02, 106.43, 188.37};
using blas::operators::operator-;
double const n2 = blas::nrm2(Y - Y3);
BOOST_REQUIRE_SMALL( n2 , 1e-13);
}
}
#if CUDA_FOUND
#include<thrust/complex.h>
BOOST_AUTO_TEST_CASE(multi_blas_gemv_real_complex_thrust){
namespace blas = multi::blas;
using complex = thrust::complex<double>; //#define I *std::complex<double>(0, 1)
using std::abs;
multi::array<complex, 2> const M = {
{ 9., 24., 30., 9.},
{ 4., 10., 12., 7.},
{14., 16., 36., 1.}
};
multi::array<complex, 1> const X = {1.1, 2.1, 3.1, 4.1};
{
multi::array<complex, 1> Y = {4., 5., 6.};
double const a = 1.1;
double const b = 1.2;
blas::gemv(a, M, X, b, Y); // y = a*M*x + b*y
multi::array<complex, 1> const Y3 = {214.02, 106.43, 188.37};
}
{
multi::array<complex, 1> Y = {4., 5., 6.};
blas::gemv(1.1, M, X, 1., Y); // y = a*M*x + b*y
BOOST_REQUIRE( Y[1] == 105.43 );
}
}
#endif
BOOST_AUTO_TEST_CASE(multi_blas_gemv_complex){
namespace blas = multi::blas;
using complex = std::complex<double>; std::complex<double> const I{0, 1};
using std::abs;
multi::array<complex, 2> const M = {
{2. + 3.*I, 2. + 1.*I, 1. + 2.*I},
{4. + 2.*I, 2. + 4.*I, 3. + 1.*I},
{7. + 1.*I, 1. + 5.*I, 0. + 3.*I}
};
multi::array<complex, 1> const X = {1. + 2.*I, 2. + 1.*I, 9. + 2.*I};
BOOST_REQUIRE(( +blas::gemv(1., M, X) == multi::array<complex, 1>{4. + 31.*I, 25. + 35.*I, -4. + 53.*I} ));
auto MT = +~M;
BOOST_REQUIRE(( +blas::gemv(1., ~MT, X) == multi::array<complex, 1>{4. + 31.*I, 25. + 35.*I, -4. + 53.*I} ));
// auto MH = +*~M;
BOOST_REQUIRE( +blas::gemv(1., ~M, X) == (multi::array<complex, 1>{63. + 38.*I, -1. + 62.*I, -4. + 36.*I}) );
BOOST_REQUIRE( +blas::gemv(1., ~M, X) == +blas::gemv(1., MT, X) );// == multi::array<complex, 1>{4. + 31.*I, 25. + 35.*I, -4. + 53.*I} ));
// BOOST_REQUIRE( +blas::gemv(1., *M, X) == (multi::array<complex, 1>{26. - 15.*I, 45. - 3.*I, 22. - 23.*I}) );
// BOOST_REQUIRE( +blas::gemv(1., ~*M, X) == (multi::array<complex, 1>{83. + 6.*I, 31. - 46.*I, 18. - 26.*I}) ); // not supported by blas
}
BOOST_AUTO_TEST_CASE(multi_blas_gemv_temporary){
using complex = std::complex<double>;
multi::array<complex, 2> const A = {
{1., 0., 0.},
{0., 1., 0.},
{0., 0., 1.}
};
auto const B = [](auto _){
auto rand = [d=std::normal_distribution<>{}, g=std::mt19937{1}]()mutable{return complex{d(g), d(g)};}; // NOLINT(cert-msc32-c,cert-msc51-cpp): test purposes
std::generate(_.elements().begin(), _.elements().end(), rand);
return _;
}(multi::array<complex, 2>({3, 3}));
using blas::operators::operator*;
using blas::operators::operator-;
using blas::operators::operator^;
BOOST_REQUIRE( (((A*B)[0] - B[0])^2) == 0. );
BOOST_REQUIRE( (((A*B)[1] - B[1])^2) == 0. );
BOOST_REQUIRE( (((A*B)[2] - B[2])^2) == 0. );
}
BOOST_AUTO_TEST_CASE(multi_blas_gemv_context){//, *utf::tolerance(0.0001)){
multi::array<double, 2> const M = {
{ 9., 24., 30., 9.},
{ 4., 10., 12., 7.},
{14., 16., 36., 1.}
};
multi::array<double, 1> const v = {1.1, 2.1, 3.1, 4.1};
blas::context ctxt;
{
multi::array<double, 1> w(size(M));
blas::gemv_n(ctxt, 1., begin(M), size(M), begin(v), 0., begin(w));
BOOST_REQUIRE_CLOSE( w[1] , 91.3 , 0.0001 );
BOOST_REQUIRE_CLOSE( w[2] , +blas::dot(M[2], v) , 0.0001 );
}
{
multi::array<double, 1> w(size(M));
multi::array<double, 2> const MT = ~M;
blas::gemv_n(ctxt, 1., begin(~MT), size(~MT), begin(v), 0., begin(w));
BOOST_REQUIRE_CLOSE( w[1] , 91.3 , 0.00001 );
BOOST_REQUIRE_CLOSE( w[2] , +blas::dot(M[2], v) , 0.00001 );
}
{
multi::array<double, 1> w(size(M));
auto&& mv = blas::gemv(ctxt, 1., M, v);
copy_n(mv.begin(), mv.size(), w.begin());
BOOST_REQUIRE_CLOSE( w[1] , 91.3 , 0.00001 );
}
{
multi::array<double, 1> w(size(M));
w = blas::gemv(ctxt, 1., M, v);
BOOST_REQUIRE_CLOSE( w[1] , 91.3 , 0.00001 );
}
{
multi::array<double, 1> w = blas::gemv(ctxt, 1., M, v);
BOOST_REQUIRE_CLOSE( w[1] , 91.3 , 0.00001 );
}
{
multi::array<double, 1> w(size(M), 0.);
w += blas::gemv(ctxt, 1., M, v);
BOOST_REQUIRE_CLOSE( w[1] , 91.3, 0.00001 );
}
{
multi::array<double, 1> w = {4., 5., 6.};
w += blas::gemv(ctxt, 1.1, M, v);
BOOST_REQUIRE_CLOSE( w[1] , 105.43, 0.00001 );
}
}

View File

@ -0,0 +1,272 @@
#ifdef COMPILATION// -*-indent-tabs-mode:t;c-basic-offset:4;tab-width:4;-*-
$CXX $0 -o $0x -lcudart -lcublas -lboost_unit_test_framework `pkg-config --libs blas`&&$0x&&rm $0x;exit
#endif
// © Alfredo A. Correa 2019-2020
#define BOOST_TEST_MODULE "C++ Unit Tests for Multi BLAS herk"
#define BOOST_TEST_DYN_LINK
#include<boost/test/unit_test.hpp>
//#include "../../../adaptors/cuda.hpp" // multi::cuda ns
//#include "../../../adaptors/blas/cuda.hpp"
#include "../../../adaptors/blas/gemm.hpp"
#include "../../../adaptors/blas/herk.hpp"
#include "../../../array.hpp"
namespace multi = boost::multi;
//namespace cuda = multi::cuda;
BOOST_AUTO_TEST_CASE(multi_blas_herk){
namespace blas = multi::blas;
using complex = std::complex<double>; constexpr complex I{0, 1};
multi::array<complex, 2> const a = {
{ 1. + 3.*I, 3.- 2.*I, 4.+ 1.*I},
{ 9. + 1.*I, 7.- 8.*I, 1.- 3.*I}
};
{
multi::array<complex, 2> c({2, 2}, 9999.);
blas::herk(a, c);
BOOST_REQUIRE( c[1][0] == complex(50., -49.) );
BOOST_REQUIRE( c[0][1] == complex(50., +49.) );
multi::array<complex, 2> const c_copy = blas::herk(1., a);
BOOST_REQUIRE( c == c_copy );
BOOST_REQUIRE( +blas::gemm(1., a, blas::H(a)) == blas::herk(a) );
}
}
//BOOST_AUTO_TEST_CASE(multi_blas_cuda_herk_complex){
// namespace blas = multi::blas;
// multi::array<complex, 2> const a = {
// { 1. + 3.*I, 3.- 2.*I, 4.+ 1.*I},
// { 9. + 1.*I, 7.- 8.*I, 1.- 3.*I}
// };
// {
// cuda::array<complex, 2> const acu = a;
// BOOST_REQUIRE(a == acu);
// cuda::array<complex, 2> ccu({2, 2}, 9999.);
// blas::herk(acu, ccu);
// BOOST_REQUIRE( ccu[1][0] == complex(50., -49.) );
// BOOST_REQUIRE( ccu[0][1] == complex(50., +49.) );
// cuda::array<complex, 2> const ccu_copy = blas::herk(1., acu);
// BOOST_REQUIRE( blas::herk(1., acu) == ccu );
// }
// {
// cuda::managed::array<complex, 2> const amcu = a; BOOST_REQUIRE(a == amcu);
// cuda::managed::array<complex, 2> cmcu({2, 2}, 9999.);
// blas::herk(1., amcu, cmcu);
// BOOST_REQUIRE( cmcu[1][0] == complex(50., -49.) );
// BOOST_REQUIRE( cmcu[0][1] == complex(50., +49.) );
// cuda::managed::array<complex, 2> const cmcu_copy = blas::herk(1., amcu);
// BOOST_REQUIRE( cmcu_copy == cmcu );
// }
// {
// multi::array<complex, 2> c({3, 3}, 9999.);
// blas::herk(1., blas::H(a), c);
// BOOST_REQUIRE( c[2][1] == complex(41, +2) );
// BOOST_REQUIRE( c[1][2] == complex(41, -2) );
// multi::array<complex, 2> const c_copy = blas::herk(1., blas::H(a));
// BOOST_REQUIRE( c_copy == c );
// }
// {
// cuda::array<complex, 2> const acu = a;
// BOOST_REQUIRE(a == acu);
// cuda::array<complex, 2> ccu({3, 3}, 9999.);
// blas::herk(1., blas::H(acu), ccu);
// BOOST_REQUIRE( ccu[2][1] == complex(41, +2) );
// BOOST_REQUIRE( ccu[1][2] == complex(41, -2) );
// cuda::array<complex, 2> const ccu_copy = blas::herk(1., blas::H(acu));
// BOOST_REQUIRE( ccu_copy == ccu );
// }
// {
// cuda::managed::array<complex, 2> const acu = a; BOOST_REQUIRE(a == acu);
// cuda::managed::array<complex, 2> ccu({3, 3}, 9999.);
// blas::herk(1., blas::H(acu), ccu);
// BOOST_REQUIRE( ccu[2][1] == complex(41, +2) );
// BOOST_REQUIRE( ccu[1][2] == complex(41, -2) );
// cuda::managed::array<complex, 2> const ccu_copy = blas::herk(1., blas::H(acu));
// BOOST_REQUIRE( ccu_copy == ccu );
// }
//}
//BOOST_AUTO_TEST_CASE(multi_blas_cuda_herk_n_complex){
// namespace blas = multi::blas;
// multi::array<complex, 2> const a = {
// { 1. + 3.*I, 3.- 2.*I, 4.+ 1.*I},
// { 9. + 1.*I, 7.- 8.*I, 1.- 3.*I}
// };
// blas::context ctxt;
// {
// multi::array<complex, 2> c({2, 2}, 9999.);
// blas::herk_n(ctxt, blas::filling::upper, 1., a.begin(), a.size(), 0., c.begin());
// BOOST_TEST_REQUIRE( c[0][1] == complex(50., +49.) );
// BOOST_TEST_REQUIRE( c[1][0] == 9999. );
// }
// {
// multi::array<complex, 2> c({2, 2}, 9999.);
// blas::herk_n(ctxt, blas::filling::lower, 1., a.begin(), a.size(), 0., c.begin());
// BOOST_TEST_REQUIRE( c[0][1] == 9999. );
// BOOST_TEST_REQUIRE( c[1][0] == complex(50., -49.) );
// }
// {
// multi::array<complex, 2> c({2, 2}, 9999.);
// blas::herk_n(ctxt, blas::filling::lower, 1., a.begin(), a.size(), 0., c.begin());
// blas::herk_n(ctxt, blas::filling::upper, 1., a.begin(), a.size(), 0., c.begin());
// BOOST_TEST_REQUIRE( c[0][1] == complex(50., +49.) );
// BOOST_TEST_REQUIRE( c[1][0] == complex(50., -49.) );
// }
// {
// multi::array<complex, 2> c({3, 3}, 9999.);
// blas::herk_n(ctxt, blas::filling::lower, 1., blas::H(a).begin(), blas::H(a).size(), 0., c.begin());
// BOOST_TEST_REQUIRE( c[1][2] == 9999. );
// BOOST_TEST_REQUIRE( c[2][1] == complex(41., +2.) );
// }
// {
// multi::array<complex, 2> c({3, 3}, 9999.);
// blas::herk_n(ctxt, blas::filling::upper, 1., blas::H(a).begin(), blas::H(a).size(), 0., c.begin());
// BOOST_TEST_REQUIRE( c[1][2] == complex(41., -2.) );
// BOOST_TEST_REQUIRE( c[2][1] == 9999. );
// }
// {
// multi::array<complex, 2> c({3, 3}, 9999.);
// blas::herk_n(ctxt, blas::filling::lower, 1., blas::H(a).begin(), blas::H(a).size(), 0., c.begin());
// blas::herk_n(ctxt, blas::filling::upper, 1., blas::H(a).begin(), blas::H(a).size(), 0., c.begin());
// BOOST_TEST_REQUIRE( c[1][2] == complex(41., -2.) );
// BOOST_TEST_REQUIRE( c[2][1] == complex(41., +2.) );
// }
// {
// multi::array<complex, 2> c({3, 3}, 9999.);
// blas::herk_n(ctxt, 1., blas::H(a).begin(), blas::H(a).size(), c.begin());
// BOOST_TEST_REQUIRE( c[1][2] == complex(41., -2.) );
// BOOST_TEST_REQUIRE( c[2][1] == complex(41., +2.) );
// }
//}
//BOOST_AUTO_TEST_CASE(multi_blas_cuda_herk_row){
// namespace blas = multi::blas;
// auto const a = []{
// multi::array<complex, 2> ret({1, 100});
// std::generate(begin(ret[0]), end(ret[0]), [c=complex{1, 2}]()mutable{return c+=2.;});
// return ret;
// }();
// BOOST_REQUIRE( size(a) == 1 );
// {
// BOOST_REQUIRE( +blas::gemm(1., a, blas::H(a)) == blas::herk(a) );
// cuda::array<complex, 2> const agpu = a;
// BOOST_REQUIRE( blas::gemm(agpu, blas::H(agpu)) == blas::herk(agpu) );
// cuda::managed::array<complex, 2> const amng = a;
// BOOST_REQUIRE( blas::gemm(amng, blas::H(amng)) == blas::herk(amng) );
// }
//}
//#if 1
//BOOST_AUTO_TEST_CASE(multi_blas_cuda_herk_real){
// namespace blas = multi::blas;
// multi::array<double, 2> const a = {
// { 1., 3., 4.},
// { 9., 7., 1.}
// };
// {
// multi::array<double, 2> c({2, 2}, 9999);
// blas::herk(1., a, c);
// BOOST_REQUIRE( c[1][0] == 34 );
// BOOST_REQUIRE( c[0][1] == 34 );
// // multi::array<double, 2> const c_copy = blas::herk(1., a);
// // BOOST_REQUIRE( c == c_copy );
// }
// {
// cuda::array<double, 2> acu = a;
// BOOST_REQUIRE(a == acu);
// cuda::array<double, 2> ccu({2, 2}, 9999.);
// // blas::herk(acu, ccu);
// // BOOST_REQUIRE( ccu[1][0] == 34 );
// // BOOST_REQUIRE( ccu[0][1] == 34 );
// // cuda::array<double, 2> const ccu_copy = blas::herk(1., acu);
// // BOOST_REQUIRE( herk(1., acu) == ccu );
// }
//}
//#endif
#if 0
{
cuda::array<double, 2> const acu = a; BOOST_REQUIRE(a == acu);
// cuda::array<double, 2> ccu({2, 2}, 9999.);
using multi::blas::herk;
cuda::array<double, 2> ccu = herk(acu);
BOOST_REQUIRE( ccu[1][0] == 34 );
BOOST_REQUIRE( ccu[0][1] == 34 );
cuda::array<double, 2> const ccu_copy = herk(1., acu);
BOOST_REQUIRE( herk(1., acu) == ccu );
}
{
cuda::managed::array<double, 2> const amcu = a; BOOST_REQUIRE(a == amcu);
cuda::managed::array<double, 2> cmcu({2, 2}, 9999.);
using multi::blas::herk;
herk(1., amcu, cmcu);
BOOST_REQUIRE( cmcu[1][0] == 34 );
BOOST_REQUIRE( cmcu[0][1] == 34 );
cuda::managed::array<double, 2> const cmcu_copy = herk(1., amcu);
BOOST_REQUIRE( cmcu_copy == cmcu );
}
if(0){
multi::array<double, 2> c({3, 3}, 9999.);
using multi::blas::herk;
using multi::blas::hermitized;
herk(1., hermitized(a), c);
BOOST_REQUIRE( c[2][1] == 19 );
BOOST_REQUIRE( c[1][2] == 19 );
multi::array<double, 2> const c_copy = herk(1., hermitized(a));
BOOST_REQUIRE( c_copy == c );
}
if(0){
cuda::array<double, 2> const acu = a; BOOST_REQUIRE(acu == a);
cuda::array<double, 2> ccu({3, 3}, 9999.);
using multi::blas::herk;
using multi::blas::hermitized;
herk(1., hermitized(acu), ccu);
BOOST_REQUIRE( ccu[2][1] == 19 );
BOOST_REQUIRE( ccu[1][2] == 19 );
cuda::array<double, 2> const c_copy = herk(1., hermitized(a));
BOOST_REQUIRE( c_copy == ccu );
}
if(0){
cuda::managed::array<double, 2> const amcu = a; BOOST_REQUIRE(amcu == a);
cuda::managed::array<double, 2> cmcu({3, 3}, 9999.);
using multi::blas::herk;
using multi::blas::hermitized;
herk(1., hermitized(amcu), cmcu);
BOOST_REQUIRE( cmcu[2][1] == 19 );
BOOST_REQUIRE( cmcu[1][2] == 19 );
cuda::managed::array<double, 2> const c_copy = herk(1., hermitized(a));
BOOST_REQUIRE( c_copy == cmcu );
}
}
#endif

View File

@ -1,5 +1,5 @@
#ifdef COMPILATION_INSTRUCTIONS
$CXX -Wall -Wextra -Wpedantic $0 -o $0x `pkg-config --libs blas` -lcudart -lcublas -lboost_unit_test_framework&&$0x&&rm $0x; exit
#ifdef COMPILATION// -*-indent-tabs-mode:t;c-basic-offset:4;tab-width:4;-*-
$CXX $0 -o $0x `pkg-config --libs blas` -lcudart -lcublas -lboost_unit_test_framework&&$0x&&rm $0x;exit
#endif
// © Alfredo A. Correa 2019-2020
@ -19,8 +19,7 @@ using std::cout;
namespace multi = boost::multi;
namespace blas = multi::blas;
using complex = std::complex<double>;
complex const I{0, 1};
using complex = std::complex<double>; constexpr complex I{0, 1};
BOOST_AUTO_TEST_CASE(multi_adaptors_blas_iamax){
multi::array<complex, 2> const A = {
@@ -39,9 +38,9 @@ BOOST_AUTO_TEST_CASE(multi_adaptors_blas_iamax){
BOOST_AUTO_TEST_CASE(multi_adaptors_blas_iamax_cuda){
multi::cuda::array<complex, 2> const A = {
{1. + 2.*I, 2., 3., 4.},
{5., 6. + 3.*I, 7., 8.},
{9., 10., 11.+ 4.*I, 12.}
{1. + 2.*I, 2. , 3. , 4.},
{5. , 6. + 3.*I, 7. , 8.},
{9. , 10. , 11.+ 4.*I, 12.}
};
using blas::iamax;
BOOST_REQUIRE(iamax(A[1])==1);

View File

@@ -0,0 +1,47 @@
#ifdef COMPILATION// -*-indent-tabs-mode:t;c-basic-offset:4;tab-width:4-*-
$CXX $0 -o $0x `pkg-config --libs blas` -lcudart -lcublas -lboost_unit_test_framework&&$0x&&rm $0x;exit
#endif
// © Alfredo A. Correa 2019-2020
#define BOOST_TEST_MODULE "C++ Unit Tests for Multi cuBLAS nrm2"
#define BOOST_TEST_DYN_LINK
#include<boost/test/unit_test.hpp>
#include "../../blas.hpp"
#include "../../../array.hpp"
#include "../../../adaptors/cuda.hpp"
#include "../../../adaptors/blas/cuda.hpp"
#include<complex>
namespace multi = boost::multi;
using complex = std::complex<double>; constexpr complex I{0,1};
BOOST_AUTO_TEST_CASE(multi_blas_nrm2){
namespace blas = multi::blas;
multi::array<double, 2> const A = {
{1., 2., 3., 4.},
{5., 6., 7., 8.},
{9., 10., 11., 12.}
};
BOOST_REQUIRE( blas::nrm2(A[1]) == std::sqrt(blas::dot(A[1], A[1])) );
{
multi::array<complex, 1> A = {1.+I, 3.+2.*I, 3.+4.*I};
BOOST_REQUIRE( blas::dot(A, A)() == (1.+I)*(1.+I) + (3.+2.*I)*(3.+2.*I) + (3.+4.*I)*(3.+4.*I) );
}
{
multi::cuda::array<double, 2> const Agpu = A;
multi::cuda::static_array<double, 0> n = 1.2;
blas::nrm2(Agpu[1], n);
}
{
multi::cuda::array<double, 2> Agpu = A;
double n = 99.;
blas::nrm2(Agpu[1], n); // cuBLAS supports writing the scalar result to CPU memory
double n2{blas::nrm2(Agpu[1])};
BOOST_REQUIRE( n == n2 );
}
}

View File

@@ -0,0 +1,253 @@
// -*-indent-tabs-mode:t;c-basic-offset:4;tab-width:4;-*-
// © Alfredo A. Correa 2019-2021
#define BOOST_TEST_MODULE "C++ Unit Tests for Multi BLAS numeric"
#define BOOST_TEST_DYN_LINK
#include<boost/test/unit_test.hpp>
#include "config.hpp"
#include "../../../array.hpp"
#include "../../blas/numeric.hpp"
#include "../../blas/operations.hpp"
#include<complex>
namespace multi = boost::multi;
BOOST_AUTO_TEST_CASE(multi_adaptors_blas_test_numeric_imag){
using complex = std::complex<double>; constexpr complex I{0, 1};
namespace blas = multi::blas;
multi::array<complex, 1> a = { 1. + 2.*I, 3. + 5.*I, 9. + 2.*I };
BOOST_REQUIRE( blas::imag(a)[2] == 2. );
BOOST_REQUIRE( blas::real(a)[2] == 9. );
}
BOOST_AUTO_TEST_CASE(multi_blas_numeric_real_conjugated){
using complex = std::complex<double>; complex const I{0, 1};
multi::array<complex, 2> B = {
{1. - 3.*I, 6. + 2.*I},
{8. + 2.*I, 2. + 4.*I},
{2. - 1.*I, 1. + 1.*I}
};
BOOST_REQUIRE( B[0][0] == 1. - 3.*I );
multi::array<complex, 2> const Bconst = {
{1. - 3.*I, 6. + 2.*I},
{8. + 2.*I, 2. + 4.*I},
{2. - 1.*I, 1. + 1.*I}
};
BOOST_REQUIRE( Bconst[0][0] == 1. - 3.*I );
namespace blas = multi::blas;
auto BdataC = blas::make_conjugater(B.data_elements());
decltype(blas::make_conjugater(Bconst.data_elements())) ppp;// = BdataC;
ppp = BdataC;
BOOST_REQUIRE( *ppp == 1. + 3.*I );
// static_assert( multi::blas::is_complex_array<multi::array<thrust::complex<double>, 2>>{}, "!");
static_assert( blas::is_complex_array<decltype(B)>{} );
static_assert(not blas::is_conjugated<decltype(B)>{} );
auto&& Bconj = blas::conj(B);
static_assert( blas::is_conjugated<decltype(Bconj)>{} );
BOOST_REQUIRE( Bconj[0][0] == 1. + 3.*I );
BOOST_REQUIRE( imag(*base(Bconj)) == +3 );
// BOOST_TEST_REQUIRE( base(Bconj)->imag() == +3 );
BOOST_REQUIRE( rotated(Bconj)[1][0] == Bconj[0][1] );
// BOOST_REQUIRE( base(Bconj) == -3.*I );
static_assert( blas::is_complex_array<decltype(Bconj)>{} );
BOOST_REQUIRE( blas::conj(Bconj) == B );
BOOST_REQUIRE( blas::conj(B)[1][0] == std::conj(B[1][0]) );
}
BOOST_AUTO_TEST_CASE(multi_blas_numeric_decay){
using complex = std::complex<double>; complex const I{0, 1};
multi::array<complex, 2> B = {
{ 1. - 3.*I, 6. + 2.*I},
{ 8. + 2.*I, 2. + 4.*I},
{ 2. - 1.*I, 1. + 1.*I}
};
namespace blas = multi::blas;
multi::array<complex, 2> conjB = blas::conj(B);
BOOST_REQUIRE( conjB[2][1] == std::conj(B[2][1]) );
BOOST_REQUIRE( blas::conj(B)[2][1] == std::conj(B[2][1]) );
BOOST_REQUIRE( blas::transposed(B)[1][2] == B[2][1] );
BOOST_REQUIRE( blas::transposed(B) == ~B );
BOOST_REQUIRE( blas::hermitized(B)[2][1] == blas::conj(B)[1][2] );
BOOST_REQUIRE( blas::hermitized(B) == blas::conj(blas::transposed(B)) );
BOOST_REQUIRE( blas::real(B)[2][1] == std::real(B[2][1]) );
BOOST_REQUIRE( blas::imag(B)[2][1] == std::imag(B[2][1]) );
multi::array<double, 2> B_real_doubled = {
{ 1., -3., 6., 2.},
{ 8., 2., 2., 4.},
{ 2., -1., 1., 1.}
};
BOOST_REQUIRE( blas::real_doubled(B) == B_real_doubled );
}
#if defined(CUDA_FOUND) and CUDA_FOUND
#include<thrust/complex.h>
BOOST_AUTO_TEST_CASE(multi_blas_numeric_decay_thrust){
using complex = thrust::complex<double>; complex const I{0, 1};
multi::array<complex, 2> B = {
{1. - 3.*I, 6. + 2.*I},
{8. + 2.*I, 2. + 4.*I},
{2. - 1.*I, 1. + 1.*I}
};
namespace blas = multi::blas;
multi::array<complex, 2> conjB = blas::conj(B);
BOOST_REQUIRE( conjB[1][2] == conj(B[1][2]) );
}
#endif
//#if defined(CUDA_FOUND) and CUDA_FOUND
//#include "../../blas/cuda.hpp"
//#include "../../../adaptors/cuda.hpp"
//namespace cuda = multi::cuda;
//BOOST_AUTO_TEST_CASE(multi_adaptors_blas_test_numeric_imag_cuda){
// cuda::array<complex, 1> a = { 1. + 2.*I, 3. + 5.*I, 9. + 2.*I };
// namespace blas = multi::blas;
// BOOST_REQUIRE( blas::imag(a)[2] == 2. );
//}
//BOOST_AUTO_TEST_CASE(multi_adaptors_blas_test_numeric_imag_cuda_managed){
// cuda::managed::array<complex, 1> a = { 1. + 2.*I, 3. + 5.*I, 9. + 2.*I };
// using multi::blas::imag;
// BOOST_REQUIRE( imag(a)[2] == 2. );
//}
//BOOST_AUTO_TEST_CASE(multi_adaptors_blas_test_numeric_hermitized_cuda){
// cuda::array<complex, 2> const a = {
// { 1. + 2.*I, 3. + 5.*I, 9. + 2.*I },
// { 1. + 2.*I, 3. + 5.*I, 9. + 2.*I },
// { 1. + 2.*I, 3. + 5.*I, 9. + 2.*I },
// };
// using multi::blas::hermitized;
// hermitized(a);
//}
//#endif
BOOST_AUTO_TEST_CASE(multi_blas_numeric_real_imag_part){
using complex = std::complex<double>; complex const I{0., 1.};
multi::array<double, 2> A = {
{1., 3., 4.},
{9., 7., 1.}
};
multi::array<complex, 2> Acplx = A;
BOOST_REQUIRE( Acplx[1][1] == A[1][1] );
multi::array<complex, 2> B = {
{1. - 3.*I, 6. + 2.*I},
{8. + 2.*I, 2. + 4.*I},
{2. - 1.*I, 1. + 1.*I}
};
multi::array<double, 2> Breal = {
{1., 6.},
{8., 2.},
{2., 1.}
};
multi::array<double, 2> Bimag = {
{-3., +2.},
{+2., +4.},
{-1., +1.}
};
using multi::blas::real;
using multi::blas::imag;
BOOST_REQUIRE( Breal == real(B) );
BOOST_REQUIRE( real(B) == Breal );
BOOST_REQUIRE( imag(B) == Bimag );
BOOST_REQUIRE( B[1][0] == 8. + 2.*I );
BOOST_REQUIRE( B[1][0].imag() == 2. );
namespace blas = multi::blas;
BOOST_REQUIRE( blas::hermitized(B)[1][2] == std::conj( B[2][1] ) );
blas::hermitized(B)[1][2] = 20. + 30.*I;
BOOST_REQUIRE( B[2][1] == 20. - 30.*I );
// using multi::blas::hermitized;
// BOOST_REQUIRE( hermitized(B)[0][1] == 8. - 2.*I );
// BOOST_REQUIRE( imag(hermitized(B)[0][1]) == -2. );
}
#if 0
namespace cuda = multi::cuda;
{
cuda::array<complex, 2> Bgpu = B;
using multi::blas::imag;
BOOST_REQUIRE( imag(Bgpu)[1][1] == imag(B)[1][1] );
BOOST_REQUIRE( real(Bgpu)[1][1] == real(B)[1][1] );
}
{
cuda::managed::array<complex, 2> Bgpu = B;
using multi::blas::imag;
BOOST_REQUIRE( imag(Bgpu)[1][1] == imag(B)[1][1] );
BOOST_REQUIRE( real(Bgpu)[1][1] == real(B)[1][1] );
}
multi::array_ref<double, 2> rB(reinterpret_cast<double*>(data_elements(B)), {size(B), 2*size(*begin(B))});
auto&& Bconj = multi::static_array_cast<complex, multi::blas::detail::conjugater<complex*>>(B);
assert( size(Bconj) == size(B) );
assert( conj(B[1][2]) == Bconj[1][2] );
// auto&& BH = multi::blas::hermitized(B);
// assert( BH[1][2] == conj(B[2][1]) );
// std::cout << BH[1][2] << " " << B[2][1] << std::endl;
// auto&& BH1 = multi::static_array_cast<complex, multi::blas::detail::conjugater<complex*>>(rotated(B));
// auto&& BH2 = rotated(multi::static_array_cast<complex, multi::blas::detail::conjugater<complex*>>(B));
// what( BH1, BH2 );
// using multi::blas::imag;
// assert( real(A)[1][2] == 1. );
// assert( imag(A)[1][2] == -3. );
// print(A) <<"--\n";
// print(real(A)) <<"--\n";
// print(imag(A)) <<"--\n";
multi::array<complex, 2> C({2, 2});
multi::array_ref<double, 2> rC(reinterpret_cast<double*>(data_elements(C)), {size(C), 2*size(*begin(C))});
// gemm('T', 'T', 1., A, B, 0., C);
// gemm('T', 'T', 1., A, B, 0., C);
// gemm('T', 'T', 1., real(A), B, 0., C);
}
#endif

View File

@@ -0,0 +1,51 @@
#ifdef COMPILATION// -*-indent-tabs-mode:t;c-basic-offset:4;tab-width:4;-*-
$CXX $0 -o $0x `pkg-config --libs blas` -lcudart -lcublas -lboost_unit_test_framework&&$0x&&rm $0x;exit
#endif
// © Alfredo A. Correa 2019-2020
#define BOOST_TEST_MODULE "C++ Unit Tests for Multi BLAS operations and cuda"
#define BOOST_TEST_DYN_LINK
#include<boost/test/unit_test.hpp>
#include "../../blas/dot.hpp"
#include "../../../array.hpp"
#include "../../blas/cuda.hpp"
#include "../../../adaptors/cuda.hpp"
#include "../../../complex.hpp"
#include<complex>
#include<cassert>
#include<numeric>
using std::cout;
namespace multi = boost::multi;
namespace blas = multi::blas;
using complex = std::complex<double>; constexpr complex I{0, 1};
BOOST_AUTO_TEST_CASE(blas_conjugated_cpu){
multi::array<complex, 1> const a = {5. + 2.*I, 6. + 6.*I, 7. + 2.*I, 8. - 3.*I};
BOOST_REQUIRE( blas::C(a)[1] == conj(a[1]) );
namespace cuda = multi::cuda;
cuda::array<complex, 1> const agpu = {5. + 2.*I, 6. + 6.*I, 7. + 2.*I, 8. - 3.*I};
BOOST_REQUIRE( blas::C(agpu)[1] == conj(agpu[1]) );
}
BOOST_AUTO_TEST_CASE(blas_conjugated_gpu){
#if 0
cuda::array<complex, 1> const acu = {1. + I, 2. + 3.*I, 3. + 2.*I, 4. - 9.*I};
cuda::array<complex, 1> const bcu = {5. + 2.*I, 6. + 6.*I, 7. + 2.*I, 8. - 3.*I};
{
cuda::array<complex, 0> ccu;
blas::dot(acu, bcu, ccu);
BOOST_REQUIRE( ccu() == 19. - 27.*I );
}
BOOST_REQUIRE( blas::C(bcu)[1] == 2. - 3.*I );
#endif
}

View File

@@ -0,0 +1,153 @@
// -*-indent-tabs-mode:t;c-basic-offset:4;tab-width:4;-*-
// © Alfredo A. Correa 2019-2020
#define BOOST_TEST_MODULE "C++ Unit Tests for Multi BLAS scal"
#define BOOST_TEST_DYN_LINK
#include<boost/test/unit_test.hpp>
#include "../../blas/scal.hpp"
#include "../../../array.hpp"
#include<complex>
namespace multi = boost::multi;
namespace blas = multi::blas;
BOOST_AUTO_TEST_CASE(multi_adaptors_blas_test_scal_n){
multi::array<double, 2> A = {
{1., 2., 3., 4.},
{5., 6., 7., 8.},
{9., 10., 11., 12.}
};
BOOST_REQUIRE( (A[0][2] == 3.) and (A[2][2] == 11.) );
blas::scal_n(2., A[2].begin(), A[2].size());
BOOST_REQUIRE( A[0][2] == 3. and A[2][2] == 11.*2. );
}
BOOST_AUTO_TEST_CASE(multi_adaptors_blas_test_scal_it){
multi::array<double, 2> A = {
{1., 2., 3., 4.},
{5., 6., 7., 8.},
{9., 10., 11., 12.}
};
BOOST_REQUIRE( A[0][2] == 3. );
BOOST_REQUIRE( A[2][2] == 11.);
blas::scal(2., A[2].begin(), A[2].end());
BOOST_REQUIRE( A[0][2] == 3. );
BOOST_REQUIRE( A[2][2] == 11.*2. );
}
template<class T> void what(T&&) = delete;
BOOST_AUTO_TEST_CASE(multi_adaptors_blas_test_scal_real){
multi::array<double, 2> A = {
{1., 2., 3., 4.},
{5., 6., 7., 8.},
{9., 10., 11., 12.}
};
BOOST_REQUIRE( A[0][2] == 3. );
BOOST_REQUIRE( A[2][2] == 11. );
BOOST_REQUIRE( blas::scal(1., A[2]) == A[2] );
BOOST_REQUIRE( &blas::scal(1., A[2]) == &A[2] );
BOOST_REQUIRE( +blas::scal(1., A[2]) == A[2] );
blas::scal(2., A[2]);
BOOST_REQUIRE( A[0][2] == 3. and A[2][2] == 11.*2. );
BOOST_REQUIRE( &blas::scal(1., A[2]) == &A[2] );
}
//BOOST_AUTO_TEST_CASE(multi_adaptors_blas_test_scal_complex_real_case){
// using complex = std::complex<double>;
// multi::array<complex, 2> A = {
// {1., 2., 3., 4.},
// {5., 6., 7., 8.},
// {9., 10., 11., 12.}
// };
// BOOST_TEST( A[0][2] == 3. );
// BOOST_TEST( A[2][2] == 11. );
// blas::scal(2., A[2]); // zscal (2. is promoted to complex later)
// BOOST_TEST( A[0][2] == 3. );
// BOOST_REQUIRE( A[2][2] == 11.*2. );
// blas::scal(1./2, A[2]); // zdscal
// BOOST_TEST( A[0][2] == 3. );
// BOOST_TEST( A[2][1] == 10. );
// BOOST_TEST( A[2][2] == 11. );
//}
//BOOST_AUTO_TEST_CASE(multi_adaptors_blas_test_scal_complex){
// multi::array<complex, 2> A = {
// {1. + 2.*I, 2. + 3.*I, 3. + 4.*I, 4. + 5.*I},
// {5. + 2.*I, 6. + 3.*I, 7. + 4.*I, 8. + 5.*I},
// {1. + 1.*I, 2. + 2.*I, 3. + 3.*I, 4. + 4.*I}
// };
// blas::scal(2., A[1]); // zscal (2. is promoted to complex later)
// BOOST_TEST( A[1][2] == 14. + 8.*I );
// blas::scal(3.*I, A[0]);
// BOOST_TEST( A[0][1] == (2. + 3.*I)*3.*I );
// blas::scal(2., blas::imag(A[2]));
// assert( A[2][1] == 2. + 4.*I );
//}
////BOOST_AUTO_TEST_CASE(multi_adaptors_blas_test_scal_cuda_noconst){
//// namespace cuda = multi::cuda;
//// cuda::array<complex, 2> A = {
//// {1. + 2.*I, 2. + 3.*I, 3. + 4.*I, 4. + 5.*I},
//// {5. + 2.*I, 6. + 3.*I, 7. + 4.*I, 8. + 5.*I},
//// {1. + 1.*I, 2. + 2.*I, 3. + 3.*I, 4. + 4.*I}
//// };
//// blas::scal(2., A[1]); // zscal (2. is promoted to complex later)
//// BOOST_REQUIRE( A[1][2] == 14. + 8.*I );
//// cuda::array<complex, 1> a = {1. + 10.*I, 2. + 20.*I, 3. + 30.*I};
//// blas::scal(2., a);
//// BOOST_REQUIRE(( a[1] == complex{4, 40} ));
////// blas::scal(3., blas::imag(a)); // gives internal compilation error in gcc
////// BOOST_REQUIRE(( a[1] == complex{4, 120} ));
////}
////BOOST_AUTO_TEST_CASE(multi_adaptors_blas_test_scal_cuda_const){
//// namespace cuda = multi::cuda;
//// cuda::array<complex, 2> const A = {
//// {1. + 2.*I, 2. + 3.*I, 3. + 4.*I, 4. + 5.*I},
//// {5. + 2.*I, 6. + 3.*I, 7. + 4.*I, 8. + 5.*I},
//// {1. + 1.*I, 2. + 2.*I, 3. + 3.*I, 4. + 4.*I}
//// };
//// auto A1cpy = blas::scal(2., A[1]); // zscal (2. is promoted to complex later)
//// BOOST_REQUIRE( A1cpy[2] == 14. + 8.*I );
////// cuda::array<complex, 1> a = {1. + 10.*I, 2. + 20.*I, 3. + 30.*I};
////// blas::scal(2., a);
////// BOOST_REQUIRE(( a[1] == complex{4, 40} ));
////// blas::scal(3., blas::imag(a));
////// BOOST_REQUIRE(( a[1] == complex{4, 120} ));
////}
//#if 0
//BOOST_AUTO_TEST_CASE(multi_adaptors_blas_test_scal_cuda_managed){
// cuda::managed::array<complex, 2> A = {
// {1. + 2.*I, 2. + 3.*I, 3. + 4.*I, 4. + 5.*I},
// {5. + 2.*I, 6. + 3.*I, 7. + 4.*I, 8. + 5.*I},
// {1. + 1.*I, 2. + 2.*I, 3. + 3.*I, 4. + 4.*I}
// };
// using blas::scal;
// scal(2., A[1]);
// BOOST_REQUIRE( A[1][2] == 14. + 8.*I );
// scal(2., blas::imag(A[1]));
// BOOST_REQUIRE( A[1][2] == 14. + 16.*I );
//}
//#endif

View File

@@ -1,7 +1,12 @@
#ifdef COMPILATION_INSTRUCTIONS
$CXX -Wall -Wextra $0 -o$0x `pkg-config --libs blas` -lboost_unit_test_framework&&$0x&&rm $0x; exit
#ifdef COMPILATION// -*-indent-tabs-mode:t;c-basic-offset:4;tab-width:4;-*-
$CXX $0 -o $0x `pkg-config --libs blas` -lboost_unit_test_framework&&$0x&&rm $0x; exit
#endif
// © Alfredo A. Correa 2019-2020
#define BOOST_TEST_MODULE "C++ Unit Tests for Multi BLAS swap"
#define BOOST_TEST_DYN_LINK
#include<boost/test/unit_test.hpp>
#include "../../blas.hpp"
#include "../../../array.hpp"
@@ -12,10 +17,6 @@ $CXX -Wall -Wextra $0 -o$0x `pkg-config --libs blas` -lboost_unit_test_framework
using std::cout;
namespace multi = boost::multi;
#define BOOST_TEST_MODULE "C++ Unit Tests for Multi lapack adaptor potrf"
#define BOOST_TEST_DYN_LINK
#include<boost/test/unit_test.hpp>
BOOST_AUTO_TEST_CASE(lapack_potrf, *boost::unit_test::tolerance(0.00001) ){
{
multi::array<double, 2> A = {

View File

@@ -0,0 +1,33 @@
#ifdef COMPILATION// -*-indent-tabs-mode:t;c-basic-offset:4;tab-width:4;-*-
$CXXX $CXXFLAGS $0 -o $0.$X `pkg-config --cflags --libs blas cuda-11.0` -lboost_unit_test_framework&&$0.$X&&rm $0.$X;exit
#endif
#include "../../blas/traits.hpp"
#define BOOST_TEST_MODULE "C++ Unit Tests for Multi BLAS traits"
#define BOOST_TEST_DYN_LINK
#include<boost/test/unit_test.hpp>
#include "./config.hpp"
#include<complex>
namespace multi = boost::multi;
namespace blas = multi::blas;
BOOST_AUTO_TEST_CASE(multi_adaptors_blas_traits){
static_assert( blas::is_d<double>{} );
static_assert( blas::is_s<float >{} );
static_assert( blas::is_c<std::complex<float>>{} );
static_assert( blas::is_z<std::complex<double>>{} );
}
#if CUDA_FOUND
#include<thrust/complex.h>
BOOST_AUTO_TEST_CASE(multi_adaptors_blas_traits_thrust){
static_assert( blas::is_c<thrust::complex<float>>{} );
static_assert( blas::is_z<thrust::complex<double>>{} );
}
#endif

View File

@@ -0,0 +1,604 @@
#ifdef COMPILATION// -*-indent-tabs-mode:t;c-basic-offset:4;tab-width:4;autowrap:nil;-*-
$CXX $0 -o $0x -lcudart -lcublas `pkg-config --libs blas` -lboost_unit_test_framework&&$0x&&rm $0x;exit
#endif
// © Alfredo A. Correa 2019-2021
#define BOOST_TEST_MODULE "C++ Unit Tests for Multi BLAS trsm"
#define BOOST_TEST_DYN_LINK
#include<boost/test/unit_test.hpp>
//#include "../../../memory/adaptors/cuda/managed/ptr.hpp"
#include "../../../adaptors/blas/gemm.hpp"
#include "../../../adaptors/blas/trsm.hpp"
//#include "../../../adaptors/blas/cuda.hpp"
//#include "../../../adaptors/cuda.hpp"
#include "../../../array.hpp"
#include <config.hpp>
namespace multi = boost::multi;
template<class Matrix>
auto triangular(multi::blas::filling f, Matrix const& m){
auto ret =+ m;
switch(f){
case multi::blas::filling::upper:
for(multi::size_type i = 0; i != size( ret); ++i){
for(multi::size_type j = 0; j != std::min(i, size(~ret)); ++j){
ret[i][j] = 0.;
}
}
break;
case multi::blas::filling::lower:
for(multi::size_type j = 0; j != size(~ret); ++j){
for(multi::size_type i = 0; i != std::min(j, size( ret)); ++i){
ret[i][j] = 0.;
}
}
break;
}
return ret;
}
BOOST_AUTO_TEST_CASE(multi_blas_trsm_double_0x0){//, *utf::tolerance(0.00001)){
namespace blas = multi::blas;
multi::array<double, 2> const A;
{
multi::array<double, 2> B;
// B=Solve(A.X=alpha*B, X) B=A⁻¹B, B=B.(A)⁻¹, A upper triangular (implicit zeros below)
blas::trsm(blas::side::left, blas::filling::upper, blas::diagonal::general, 1., A, B);
}
}
BOOST_AUTO_TEST_CASE(multi_blas_trsm_double_1x1){//, *utf::tolerance(0.00001)){
namespace blas = multi::blas;
multi::array<double, 2> const A = {
{10.,},
};
{
multi::array<double, 2> B = {
{3.,},
};
auto const B_cpy = B;
blas::trsm(blas::side::left, blas::filling::upper, blas::diagonal::general, 1., A, B);
// B=Solve(A.X=alpha*B, X) B=A⁻¹B, B=B.(A)⁻¹, A upper triangular (implicit zeros below)
BOOST_REQUIRE_CLOSE( B[0][0] , 3./10. , 0.00001 );
BOOST_REQUIRE_CLOSE( (+blas::gemm(1., A, B))[0][0] , B_cpy[0][0] , 0.00001 );
}
{
multi::array<double, 2> B = {
{3.,},
};
auto const B_cpy = B;
// B=Solve(A.X=alpha*B, X) B=A⁻¹B, B=B.(A)⁻¹, A upper triangular (implicit zeros below)
blas::trsm(blas::side::left, blas::filling::upper, blas::diagonal::general, 2., A, B);
BOOST_REQUIRE_CLOSE( B[0][0] , 2.*3./10. , 0.00001 );
BOOST_REQUIRE_CLOSE( (+blas::gemm(1., A, B))[0][0] , 2.*B_cpy[0][0] , 0.00001 );
}
{
multi::array<double, 2> B = {
{3., 4., 5.},
};
auto const B_cpy = B;
// B=Solve(A.X=alpha*B, X) B=A⁻¹B, B=B.(A)⁻¹, A upper triangular (implicit zeros below)
blas::trsm(blas::side::left, blas::filling::upper, blas::diagonal::general, 1., A, B);
BOOST_REQUIRE_CLOSE( B[0][0] , 3./10. , 0.00001 );
BOOST_REQUIRE_CLOSE( B[0][1] , 4./10. , 0.00001 );
BOOST_REQUIRE_CLOSE( B[0][2] , 5./10. , 0.00001 );
BOOST_REQUIRE_CLOSE( (+blas::gemm(1., A, B))[0][1] , B_cpy[0][1] , 0.00001 );
}
}
BOOST_AUTO_TEST_CASE(multi_blas_trsm_real_square){//, *utf::tolerance(0.00001)){
namespace blas = multi::blas;
multi::array<double, 2> const A = {
{ 1., 3., 4.},
{ NAN, 7., 1.},
{ NAN, NAN, 8.}
};
auto const A_cpy = triangular(blas::filling::upper, A);
{
multi::array<double, 2> B = {
{1., 3., 4.},
{2., 7., 1.},
{3., 4., 2.}
};
auto const B_cpy = B;
// B=Solve(A.X=alpha*B, X) B=A⁻¹B, B=B.(A)⁻¹, A upper triangular (implicit zeros below)
blas::trsm(blas::side::left, blas::filling::upper, 1., A, B);
BOOST_REQUIRE_CLOSE( B[1][2] , 0.107143 , 0.001 );
BOOST_REQUIRE( (+blas::gemm(1., A_cpy, B))[1][2] == B_cpy[1][2] );
}
{
auto const AT =+ ~A;
auto const AT_cpy = triangular(blas::filling::lower, AT);
multi::array<double, 2> B = {
{1., 3., 4.},
{2., 7., 1.},
{3., 4., 2.}
};
auto const B_cpy = B;
blas::trsm(blas::side::left, blas::filling::upper, 1., blas::T(AT), B);
BOOST_REQUIRE_CLOSE( B[1][2] , 0.107143 , 0.001 );
BOOST_REQUIRE( (+blas::gemm(1., blas::T(AT_cpy), B))[1][2] == B_cpy[1][2] );
}
{
auto const AT =+ ~A;
auto const AT_cpy = triangular(blas::filling::lower, AT);
multi::array<double, 2> const B = {
{1., 3., 4.},
{2., 7., 1.},
{3., 4., 2.}
};
auto BT =+ ~B;
blas::trsm(blas::side::left, blas::filling::upper, 1., blas::T(AT), blas::T(BT));
BOOST_REQUIRE_CLOSE( blas::T(BT)[1][2] , 0.107143 , 0.001 );
BOOST_REQUIRE( (+blas::gemm(1., blas::T(AT_cpy), blas::T(BT)))[1][2] == B[1][2] );
}
{
// auto const AT =+ ~A;
multi::array<double, 2> const B = {
{1., 3., 4.},
{2., 7., 1.},
{3., 4., 2.}
};
auto BT =+ ~B;
blas::trsm(blas::side::left, blas::filling::upper, 1., A, blas::T(BT));
BOOST_REQUIRE_CLOSE( (~BT)[1][2] , 0.107143 , 0.001 );
}
}
BOOST_AUTO_TEST_CASE(multi_blas_trsm_complex){//, *utf::tolerance(0.00001)){
namespace blas = multi::blas;
using complex = std::complex<double>; complex const I{0, 1};
multi::array<complex, 2> const A = {
{ 1. + 2.*I, 3. - 1.*I, 4. + 9.*I},
{NAN , 7. + 4.*I, 1. + 8.*I},
{NAN , NAN , 8. + 2.*I}
};
multi::array<complex, 2> B = {
{1. - 9.*I, 3. + 2.*I, 4. + 3.*I},
{2. - 2.*I, 7. - 2.*I, 1. - 1.*I},
{3. + 1.*I, 4. + 8.*I, 2. + 7.*I}
};
blas::trsm(blas::side::left, blas::filling::lower, 2.+1.*I, blas::H(A), B); // B=alpha Inv[A†].B, B†=B†.Inv[A], Solve(A†.X=B, X), Solve(X†.A=B†, X), A is upper triangular (with implicit zeros below)
BOOST_REQUIRE_CLOSE( real(B[1][2]) , 2.33846 , 0.0001 );
BOOST_REQUIRE_CLOSE( imag(B[1][2]) , -0.0923077 , 0.0001 );
}
BOOST_AUTO_TEST_CASE(multi_blas_trsm_complex_rectangular){//, *utf::tolerance(0.00001)){
namespace blas = multi::blas;
using complex = std::complex<double>; complex const I{0, 1};
multi::array<complex, 2> const A = {
{ 1. + 2.*I, 3. - 1.*I, 4. + 9.*I},
{NAN , 7. + 4.*I, 1. + 8.*I},
{NAN , NAN , 8. + 2.*I}
};
multi::array<complex, 2> B = {
{1. - 9.*I, 3. + 2.*I},
{2. - 2.*I, 7. - 2.*I},
{3. + 1.*I, 4. + 8.*I}
};
blas::trsm(blas::side::left, blas::filling::lower, 2.+1.*I, blas::H(A), B); // B=alpha Inv[A†].B, B†=B†.Inv[A], Solve(A†.X=B, X), Solve(X†.A=B†, X), A is upper triangular (with implicit zeros below)
BOOST_REQUIRE_CLOSE( real(B[2][0]) , -4.16471 , 0.0001 );
BOOST_REQUIRE_CLOSE( imag(B[2][0]) , 8.25882 , 0.0001 );
}
BOOST_AUTO_TEST_CASE(multi_blas_trsm_complex_column){//, *utf::tolerance(0.00001)){
namespace blas = multi::blas;
using complex = std::complex<double>; complex const I{0, 1};
multi::array<complex, 2> const A = {
{ 1. + 2.*I, 3. - 1.*I, 4. + 9.*I},
{NAN , 7. + 4.*I, 1. + 8.*I},
{NAN , NAN , 8. + 2.*I}
};
multi::array<complex, 2> B = {
{1. - 9.*I},
{2. - 2.*I},
{3. + 1.*I}
};
blas::trsm(blas::side::left, blas::filling::lower, 2.+1.*I, blas::H(A), B); // B=alpha Inv[A†].B, B†=B†.Inv[A], Solve(A†.X=B, X), Solve(X†.A=B†, X), A is upper triangular (with implicit zeros below)
BOOST_REQUIRE_CLOSE( real(B[2][0]) , -4.16471 , 0.0001);
BOOST_REQUIRE_CLOSE( imag(B[2][0]) , 8.25882 , 0.0001);
}
BOOST_AUTO_TEST_CASE(multi_blas_trsm_complex_column_cpu){//, *utf::tolerance(0.00001)){
namespace blas = multi::blas;
using complex = std::complex<double>; complex const I{0, 1};
multi::array<complex, 2> const A = {
{ 1. + 2.*I, 3. - 1.*I, 4. + 9.*I},
{NAN , 7. + 4.*I, 1. + 8.*I},
{NAN , NAN , 8. + 2.*I}
};
multi::array<complex, 2> B = {
{1. - 9.*I},
{2. - 2.*I},
{3. + 1.*I}
};
blas::trsm(blas::side::left, blas::filling::lower, 2.+1.*I, blas::H(A), B); // B=alpha Inv[A†].B, B†=B†.Inv[A], Solve(A†.X=B, X), Solve(X†.A=B†, X), A is upper triangular (with implicit zeros below)
BOOST_REQUIRE_CLOSE( real(B[2][0]) , -4.16471 , 0.0001 );
BOOST_REQUIRE_CLOSE( imag(B[2][0]) , 8.25882 , 0.0001 );
}
BOOST_AUTO_TEST_CASE(multi_blas_trsm_hydrogen_inq_case_real){//, *utf::tolerance(0.00001)){
namespace blas = multi::blas;
multi::array<double, 2> const A = {{2.,},};
{
multi::array<double, 2> B = {{1., 2., 3.},};
auto const B_cpy = B;
blas::trsm(blas::side::left, blas::filling::lower, 1., A, B);
BOOST_REQUIRE( B[0][1] == B_cpy[0][1]/A[0][0] );
}
{
multi::array<double, 2> B = {
{1.},
{2.},
{3.},
};
auto const B_cpy = B;
blas::trsm(blas::side::left, blas::filling::lower, 1., A, blas::T(B));
BOOST_REQUIRE( blas::T(B)[0][1] == blas::T(B_cpy)[0][1]/A[0][0] );
}
}
BOOST_AUTO_TEST_CASE(multi_blas_trsm_hydrogen_inq_case_complex){//, *utf::tolerance(0.00001)){
namespace blas = multi::blas;
using complex = std::complex<double>;
multi::array<complex, 2> const A = {{2.,},};
{
multi::array<complex, 2> B = {{1., 2., 3.},};
auto const B_cpy = B;
blas::trsm(blas::side::left, blas::filling::lower, 1., A, B);
BOOST_REQUIRE( B[0][1] == B_cpy[0][1]/A[0][0] );
}
multi::array<complex, 2> B1 = {
{1.},
{2.},
{3.},
};
multi::array<complex, 2> B2 = {
{1.},
{2.},
{3.},
};
{
// auto const B_cpy = B1;
blas::trsm(blas::side::left, blas::filling::lower, 1., A, blas::H(B1));
// BOOST_REQUIRE( (+blas::gemm(1., A, blas::H(B1)))[0][1] == blas::H(B_cpy)[0][1] );
}
{
auto const B_cpy = B2;
blas::trsm(blas::side::right, blas::filling::upper, 1., blas::H(A), B2);
// BOOST_REQUIRE( (+blas::gemm(1., A, blas::H(B)))[0][1] == blas::H(B_cpy)[0][1] );
BOOST_REQUIRE( (+blas::gemm(1., B2, blas::H(A)))[1][0] == B_cpy[1][0] );
}
BOOST_REQUIRE( B1 == B2 );
}
BOOST_AUTO_TEST_CASE(multi_blas_trsm_real_nonsquare){//, *utf::tolerance(0.00001)){
namespace blas = multi::blas;
multi::array<double, 2> const A = {
{ 1., 3., 4.},
{ NAN, 7., 1.},
{ NAN, NAN, 8.}
};
auto const A_cpy = triangular(blas::filling::upper, A);
{
multi::array<double, 2> B = {
{1., 3., 4., 8.},
{2., 7., 1., 9.},
{3., 4., 2., 1.},
};
auto const B_cpy =+ B;
multi::array<double, 2> BT =+ ~B;
BOOST_REQUIRE( BT == ~B );
blas::trsm(blas::side::left, blas::filling::upper, 1., A, B); // B=Solve(A.X=alpha*B, X) B=A⁻¹B, B=B.(A)⁻¹, A upper triangular (implicit zeros below)
BOOST_REQUIRE_CLOSE( B[1][2] , 0.107143 , 0.001);
BOOST_REQUIRE_CLOSE( (+blas::gemm(1., A_cpy, B))[1][2] , B_cpy[1][2] , 0.001);
auto const BT_cpy = BT;
blas::trsm(blas::side::left, blas::filling::upper, 1., A, blas::T(BT));
BOOST_REQUIRE_CLOSE( blas::T(BT)[1][2], 0.107143, 0.001 );
BOOST_REQUIRE_CLOSE( (+blas::gemm(1., A_cpy, blas::T(BT)))[1][2] , blas::T(BT_cpy)[1][2] , 0.00001 );
}
{
multi::array<double, 2> B = {
{1., 3., 4., 8.},
{2., 7., 1., 9.},
{3., 4., 2., 1.},
};
multi::array<double, 2> AT = ~A;
multi::array<double, 2> BT = ~B;
blas::trsm(blas::side::left, blas::filling::upper, 1., blas::T(AT), B); // B=Solve(A.X=alpha*B, X) B=A⁻¹B, B=B.(A)⁻¹, A upper triangular (implicit zeros below)
BOOST_REQUIRE_CLOSE( B[1][2] , 0.107143 , 0.001 );
blas::trsm(blas::side::left, blas::filling::upper, 1., blas::T(AT), blas::T(BT));
BOOST_REQUIRE_CLOSE( (~BT)[1][2] , 0.107143, 0.001 );
}
{
multi::array<double, 2> B = {
{1.},
{2.},
{3.},
};
auto const B_cpy =+ B;
blas::trsm(blas::side::left, blas::filling::upper, 1., A, B); // B=Solve(A.X=alpha*B, X) B=A⁻¹B, B=B.(A)⁻¹, A upper triangular (implicit zeros below)
BOOST_REQUIRE_CLOSE( B[2][0] , 0.375 , 0.00001 );
BOOST_REQUIRE_CLOSE( (+blas::gemm(1., A_cpy, B))[1][0] , B_cpy[1][0] , 0.00001 );
}
{
multi::array<double, 2> B = {
{1.},
{2.},
{3.},
};
auto const B_cpy =+ B;
blas::trsm(blas::side::left, blas::filling::upper, 1.2, A, B);
BOOST_REQUIRE_CLOSE( (+blas::gemm(1., A_cpy, B))[1][0] , 1.2*B_cpy[1][0] , 0.00001 );
BOOST_REQUIRE_CLOSE( (+blas::gemm(1./1.2, A_cpy, B))[1][0] , B_cpy[1][0] , 0.00001 );
}
{
multi::array<double, 2> B = {
{1.},
{2.},
{3.},
};
multi::array<double, 2> BT = rotated(B);
blas::trsm(blas::side::left, blas::filling::upper, 1., A, blas::T(BT));
BOOST_REQUIRE_CLOSE( (~BT)[2][0] , 0.375 , 0.00001);
}
}
BOOST_AUTO_TEST_CASE(multi_blas_trsm_complex_nonsquare_default_diagonal_hermitized_gemm_check_no_const){//, *utf::tolerance(0.00001)){
namespace blas = multi::blas;
using complex = std::complex<double>; complex const I{0, 1};
multi::array<complex, 2> const A = {
{ 1. + 4.*I, 3., 4.- 10.*I},
{ 0., 7.- 3.*I, 1.},
{ 0., 0., 8.- 2.*I}
};
multi::array<complex, 2> B = {
{1. + 1.*I, 2. + 1.*I, 3. + 1.*I},
{5. + 3.*I, 9. + 3.*I, 1. - 1.*I}
};
using multi::blas::trsm;
using multi::blas::filling;
using multi::blas::hermitized;
blas::trsm(blas::side::left, blas::filling::upper, 1., A, blas::H(B)); // B†←A⁻¹.B†, B←B.A⁻¹†, B←(A⁻¹.B†)†
BOOST_REQUIRE_CLOSE( imag(B[1][2]) , -0.147059 , 0.001);
}
BOOST_AUTO_TEST_CASE(multi_blas_trsm_complex_nonsquare_default_diagonal_hermitized_gemm_check){//, *utf::tolerance(0.00001)){
using complex = std::complex<double>; complex const I{0, 1};
multi::array<complex, 2> const A = {
{ 1. + 4.*I, 3., 4.- 10.*I},
{ 0., 7.- 3.*I, 1.},
{ 0., 0., 8.- 2.*I}
};
namespace blas = multi::blas;
{
{
multi::array<complex, 2> B = {
{1. + 1.*I, 5. + 3.*I},
{2. + 1.*I, 9. + 3.*I},
{3. + 1.*I, 1. - 1.*I},
};
auto S = blas::trsm(blas::side::left, blas::filling::lower, 1., blas::H(A), B); // S = A⁻¹†.B, S† = B†.A⁻¹
BOOST_REQUIRE_CLOSE( real(S[2][1]) , 1.71608 , 0.001 );
}
{
multi::array<complex, 2> B = {
{1. + 1.*I, 2. + 1.*I, 3. + 1.*I},
{5. + 3.*I, 9. + 3.*I, 1. - 1.*I}
};
auto S =+ blas::trsm(blas::side::left, blas::filling::upper, 1., A, blas::H(B)); // S = A⁻¹B†, S†=B.A⁻¹†, S=(B.A⁻¹)†, B <- S†, B <- B.A⁻¹†
BOOST_REQUIRE_CLOSE( imag(S[2][1]) , +0.147059 , 0.001);
BOOST_REQUIRE_CLOSE( imag(B[1][2]) , -0.147059 , 0.001);
}
{
multi::array<complex, 2> B = {
{1. + 1.*I, 2. + 1.*I, 3. + 1.*I},
{5. + 3.*I, 9. + 3.*I, 1. - 1.*I}
};
auto S =+ blas::trsm(blas::side::left, blas::filling::upper, 2., A, blas::H(B)); // S = A⁻¹B†, S†=B.A⁻¹†, S=(B.A⁻¹)†, B <- S†, B <- B.A⁻¹†
BOOST_REQUIRE_CLOSE( imag(S[2][1]) , +0.147059*2. , 0.001 );
BOOST_REQUIRE_CLOSE( imag(B[1][2]) , -0.147059*2. , 0.001 );
}
}
}
BOOST_AUTO_TEST_CASE(multi_blas_trsm_real_1x1_check){//, *utf::tolerance(0.00001)){
namespace blas = multi::blas;
multi::array<double, 2> const A = {
{ 4.},
};
{
{
multi::array<double, 2> B = {
{5.},
};
auto S =+ blas::trsm(blas::side::left, blas::filling::upper, blas::diagonal::general, 3., A, B);
BOOST_REQUIRE( S[0][0] == 3.*5./4. );
}
{
multi::array<double, 2> B = {
{5.},
};
auto S =+ blas::trsm(blas::side::left, blas::filling::upper, 1., A, B);
BOOST_REQUIRE( S[0][0] == 1.*5./4. );
}
{
multi::array<double, 2> B = {
{5.},
};
auto S =+ blas::trsm(blas::side::left, blas::filling::upper, 1., A, B);
BOOST_REQUIRE( S[0][0] == 1.*5./4. );
}
}
}
BOOST_AUTO_TEST_CASE(multi_blas_trsm_complex_1x1_check){//, *utf::tolerance(0.00001)){
using complex = std::complex<double>; complex const I = complex{0, 1};
multi::array<complex, 2> const A = {
{ 4. + 2.*I},
};
namespace blas = multi::blas;
{
multi::array<complex, 2> B = {
{5. + 1.*I},
};
auto const B_cpy =+ B;
blas::trsm(blas::side::left, blas::filling::upper, 3.+5.*I, A, B);
BOOST_REQUIRE_CLOSE( real((+blas::gemm(1., A, B))[0][0]) , real((3.+5.*I)*B_cpy[0][0]) , 0.00001 );
BOOST_REQUIRE_CLOSE( imag((+blas::gemm(1., A, B))[0][0]) , imag((3.+5.*I)*B_cpy[0][0]) , 0.00001 );
BOOST_REQUIRE_CLOSE( real((+blas::gemm(1./(3.+5.*I), A, B))[0][0]) , real(B_cpy[0][0]) , 0.00001 );
BOOST_REQUIRE_CLOSE( imag((+blas::gemm(1./(3.+5.*I), A, B))[0][0]) , imag(B_cpy[0][0]) , 0.00001 );
}
}
#if defined(CUDA_FOUND) and CUDA_FOUND
#include<thrust/complex.h>
BOOST_AUTO_TEST_CASE(multi_blas_trsm_complex_thrust_nonsquare_default_diagonal_hermitized_gemm_check){//, *utf::tolerance(0.00001)){
namespace blas = multi::blas;
using complex = thrust::complex<double>; complex const I{0, 1};
multi::array<complex, 2> const A = {
{ 1. + 4.*I, 3. , 4.- 10.*I},
{ 0. , 7.- 3.*I, 1. },
{ 0. , 0. , 8.- 2.*I}
};
{
{
multi::array<complex, 2> B = {
{1. + 1.*I, 5. + 3.*I},
{2. + 1.*I, 9. + 3.*I},
{3. + 1.*I, 1. - 1.*I},
};
auto S = blas::trsm(blas::side::left, blas::filling::lower, 1., blas::H(A), B); // S = A⁻¹†.B, S† = B†.A⁻¹
BOOST_REQUIRE_CLOSE( S[2][1].real() , 1.71608 , 0.001 );
BOOST_REQUIRE( S == B );
}
{
multi::array<complex, 2> B = {
{1. + 1.*I, 2. + 1.*I, 3. + 1.*I},
{5. + 3.*I, 9. + 3.*I, 1. - 1.*I}
};
auto S =+ blas::trsm(blas::side::left, blas::filling::upper, 1., A, blas::H(B)); // S = A⁻¹B†, S†=B.A⁻¹†, S=(B.A⁻¹)†, B <- S†, B <- B.A⁻¹†
BOOST_REQUIRE_CLOSE( B[1][2].imag() , -0.147059 , 0.001 );
BOOST_REQUIRE( S == blas::H(B) );
}
{
multi::array<complex, 2> B = {
{1. + 1.*I, 2. + 1.*I, 3. + 1.*I},
{5. + 3.*I, 9. + 3.*I, 1. - 1.*I}
};
auto S =+ blas::trsm(blas::side::left, blas::filling::upper, 2., A, blas::H(B)); // S = A⁻¹B†, S†=B.A⁻¹†, S=(B.A⁻¹)†, B <- S†, B <- B.A⁻¹†
BOOST_REQUIRE_CLOSE( B[1][2].imag() , -0.147059*2. , 0.001 );
BOOST_REQUIRE( S == blas::H(B) );
}
}
}
//BOOST_AUTO_TEST_CASE(multi_blas_trsm_complex_column_cuda, *utf::tolerance(0.00001)){
// namespace cuda = multi::cuda;
// cuda::array<complex, 2> A = {
// { 1., 3., 4.},
// {NAN, 7., 1.},
// {NAN, NAN, 8.}
// };
//// multi::cuda::array<complex, 2> const B = {
//// {1.},
//// {2.},
//// {3.}
//// };
// namespace blas = multi::blas;
//// auto Bcpy = blas::trsm(blas::filling::upper, 1., A, B); // B ⬅ α Inv[A].B, B† ⬅ B†.Inv[A], Solve(A†.X=B, X), Solve(X†.A=B†, X), A is upper triangular (with implicit zeros below)
//// multi::array<complex, 2> Bcpu = Bcpy;
//// BOOST_TEST_REQUIRE( std::real(Bcpu[2][0]) == 0.375 );
//// BOOST_TEST_REQUIRE( std::imag(Bcpu[2][0]) == 0. );
//}
#endif
#if 0
//template<class T> void what(T&&) = delete;
BOOST_AUTO_TEST_CASE(multi_blas_trsm_double_column_cuda, *utf::tolerance(0.00001)){
multi::cuda::array<double, 2> const A = {
{ 1., 3., 4.},
{NAN, 7., 1.},
{NAN, NAN, 8.}
};
multi::cuda::array<double, 2> B = {
{1.},
{2.},
{3.}
};
namespace blas = multi::blas;
using blas::filling;
using blas::hermitized;
trsm(filling::upper, 1., A, B); // B=alpha Inv[A†].B, B†=B†.Inv[A], Solve(A†.X=B, X), Solve(X†.A=B†, X), A is upper triangular (with implicit zeros below)
BOOST_REQUIRE( B[2][0] == 0.375 );
}
BOOST_AUTO_TEST_CASE(multi_blas_trsm_complex_column_cuda2, *utf::tolerance(0.00001)){
multi::cuda::array<complex, 2> const A = {
{ 1. + 2.*I, 3. - 1.*I, 4. + 9.*I},
{NAN , 7. + 4.*I, 1. + 8.*I},
{NAN , NAN , 8. + 2.*I}
};
multi::cuda::array<complex, 2> B = {
{1. - 9.*I},
{2. - 2.*I},
{3. + 1.*I}
};
namespace blas = multi::blas;
using blas::filling;
using blas::hermitized;
trsm(filling::lower, 2.+1.*I, hermitized(A), B); // B=alpha Inv[A†].B, B†=B†.Inv[A], Solve(A†.X=B, X), Solve(X†.A=B†, X), A is upper triangular (with implicit zeros below)
multi::array<complex, 2> Bcpu = B;
BOOST_REQUIRE( real(Bcpu[2][0]) == -4.16471 );
BOOST_REQUIRE( imag(Bcpu[2][0]) == 8.25882 );
}
BOOST_AUTO_TEST_CASE(multi_blas_cuda_trsm_complex, *utf::tolerance(0.00001)){
multi::cuda::array<complex, 2> const A = {
{ 1. + 2.*I, 3. - 1.*I, 4. + 9.*I},
{NAN , 7. + 4.*I, 1. + 8.*I},
{NAN , NAN , 8. + 2.*I}
};
multi::cuda::array<complex, 2> const B = {
{1. - 9.*I, 3. + 2.*I, 4. + 3.*I},
{2. - 2.*I, 7. - 2.*I, 1. - 1.*I},
{3. + 1.*I, 4. + 8.*I, 2. + 7.*I}
};
namespace blas = multi::blas;
using blas::filling;
using blas::hermitized;
// auto C = trsm(filling::lower, 2.+1.*I, hermitized(A), B); // B=alpha Inv[A†].B, B†=B†.Inv[A], Solve(A†.X=B, X), Solve(X†.A=B†, X), A is upper triangular (with implicit zeros below)
	auto C = trsm(filling::lower, 1., hermitized(A), B); // B=alpha Inv[A†].B, B†=B†.Inv[A], Solve(A†.X=B, X), Solve(X†.A=B†, X), A is upper triangular (with implicit zeros below)
}
BOOST_AUTO_TEST_CASE(multi_blas_cuda_managed_trsm_complex, *utf::tolerance(0.00001)){
multi::cuda::managed::array<complex, 2> const A = {
{ 1. + 2.*I, 3. - 1.*I, 4. + 9.*I},
{NAN , 7. + 4.*I, 1. + 8.*I},
{NAN , NAN , 8. + 2.*I}
};
multi::cuda::managed::array<complex, 2> const B = {
{1. - 9.*I, 3. + 2.*I, 4. + 3.*I},
{2. - 2.*I, 7. - 2.*I, 1. - 1.*I},
{3. + 1.*I, 4. + 8.*I, 2. + 7.*I}
};
namespace blas = multi::blas;
using blas::filling;
using blas::hermitized;
auto C = trsm(filling::lower, 2.+1.*I, hermitized(A), B); // B=alpha Inv[A†].B, B†=B†.Inv[A], Solve(A†.X=B, X), Solve(X†.A=B†, X), A is upper triangular (with implicit zeros below)
}
#endif


@ -0,0 +1,111 @@
#ifdef COMPILATION// -*-indent-tabs-mode:t;c-basic-offset:4;tab-width:4;-*-
$CXX $0 -o $0x -lcudart -lcublas -lboost_unit_test_framework `pkg-config --libs blas`&&$0x&&rm $0x;exit
#endif
// © Alfredo A. Correa 2019-2020
#define BOOST_TEST_MODULE "C++ Unit Tests for Multi cuBLAS trsv"
#define BOOST_TEST_DYN_LINK
#include<boost/test/unit_test.hpp>
#include "../../../memory/adaptors/cuda/managed/ptr.hpp"
#include "../../../adaptors/blas/trsv.hpp"
#include "../../../adaptors/blas/cuda.hpp"
#include "../../../adaptors/cuda.hpp"
#include "../../../array.hpp"
namespace multi = boost::multi;
template<class M> decltype(auto) print(M const& C){
using multi::size; using std::cout;
for(int i = 0; i != size(C); ++i){
for(int j = 0; j != size(C[i]); ++j) cout<< C[i][j] <<' ';
cout<<std::endl;
}
return cout<<std::endl;
}
namespace utf = boost::unit_test;
using complex = std::complex<double>;
complex const I{0, 1};
namespace blas = multi::blas;
BOOST_AUTO_TEST_CASE(multi_blas_trsm_complex_cpu, *utf::tolerance(0.00001)){
multi::array<complex, 2> const A = {
{ 1. + 1.*I, 3. - 2.*I, 4. + 1.*I},
{NAN , 7. - 10.*I, 1. + 2.*I},
{NAN , NAN , 8. + 1.*I}
};
multi::array<complex, 1> b = {1. + 2.*I, 3. + 1.*I, 4. + 5.*I};
blas::trsv(blas::filling::upper, blas::diagonal::general, A, b);
BOOST_TEST_REQUIRE( real(b[0]) == -1.37259 );
BOOST_TEST_REQUIRE( real(b[1]) == 0.2127 );
BOOST_TEST_REQUIRE( real(b[2]) == 0.569231 );
}
BOOST_AUTO_TEST_CASE(multi_blas_trsm_complex_cuda, *utf::tolerance(0.0001)){
namespace cuda = multi::cuda;
cuda::managed::array<complex, 2> const A = {
{ 1. + 1.*I, 3. - 2.*I, 4. + 1.*I},
{NAN , 7. - 10.*I, 1. + 2.*I},
{NAN , NAN , 8. + 1.*I}
};
cuda::managed::array<complex, 1> b = {1. + 2.*I, 3. + 1.*I, 4. + 5.*I};
blas::trsv(blas::filling::upper, blas::diagonal::general, A, b);
BOOST_TEST_REQUIRE( real(b[0]) == -1.37259 );
BOOST_TEST_REQUIRE( real(b[1]) == 0.2127 );
BOOST_TEST_REQUIRE( real(b[2]) == 0.569231 );
}
BOOST_AUTO_TEST_CASE(multi_blas_trsm_complex_cuda_managed, *utf::tolerance(0.00001)){
namespace cuda = multi::cuda;
cuda::managed::array<complex, 2> const A = {
{ 1. + 1.*I, 3. - 2.*I, 4. + 1.*I},
{NAN , 7. - 10.*I, 1. + 2.*I},
{NAN , NAN , 8. + 1.*I}
};
cuda::managed::array<complex, 1> b = {1. + 2.*I, 3. + 1.*I, 4. + 5.*I};
blas::trsv(blas::filling::upper, A, b); // this operation happens in GPU when #include "adaptors/blas/cuda.hpp"
multi::array<complex, 1> const b_cpu = b;
BOOST_TEST_REQUIRE( real(b_cpu[0]) == -1.37259 );
BOOST_TEST_REQUIRE( real(b_cpu[1]) == 0.2127 );
BOOST_TEST_REQUIRE( real(b_cpu[2]) == 0.569231 );
}
BOOST_AUTO_TEST_CASE(multi_blas_trsm_double_cuda_managed, *utf::tolerance(0.00001)){
namespace cuda = multi::cuda;
cuda::managed::array<double, 2> const A = {
{ 1., 3., 4.},
{NAN, 7., 1.},
{NAN, NAN, 8.}
};
cuda::managed::array<double, 1> b = {1., 3., 4.};
blas::trsv(blas::filling::upper, A, b); // this operation happens in GPU when #include "adaptors/blas/cuda.hpp"
multi::array<double, 1> const b_cpu = b;
BOOST_TEST_REQUIRE( b_cpu[0] == -2.07143 );
BOOST_TEST_REQUIRE( b_cpu[1] == 0.357143 );
BOOST_TEST_REQUIRE( b_cpu[2] == 0.5 );
}
BOOST_AUTO_TEST_CASE(multi_blas_trsm_complex_cuda2, *utf::tolerance(0.00001)){
namespace blas = multi::blas;
multi::cuda::array<complex, 2> const A = {
{ 1. + 1.*I, 3. - 2.*I, 4. + 1.*I},
{NAN , 7. - 10.*I, 1. + 2.*I},
{NAN , NAN , 8. + 1.*I}
};
multi::cuda::array<complex, 1> b = {1. + 2.*I, 3. + 1.*I, 4. + 5.*I};
blas::trsv(blas::filling::upper, blas::diagonal::general, A, b);
BOOST_TEST_REQUIRE( real(b[0]) == -1.37259 );
BOOST_TEST_REQUIRE( real(b[1]) == 0.2127 );
BOOST_TEST_REQUIRE( real(b[2]) == 0.569231 );
}


@ -1,44 +0,0 @@
#ifdef COMPILATION_INSTRUCTIONS
$CXX -Wall -Wextra -Wpedantic $0 -o $0x `pkg-config --libs blas` -lboost_unit_test_framework&&$0x&&rm $0x;exit
#endif
// © Alfredo A. Correa 2019
#define BOOST_TEST_MODULE "C++ Unit Tests for Multi BLAS axpy"
#define BOOST_TEST_DYN_LINK
#include<boost/test/unit_test.hpp>
#include "../../blas.hpp"
#include "../../../array.hpp"
#include<complex>
namespace multi = boost::multi;
BOOST_AUTO_TEST_CASE(multi_blas_axpy){
{
multi::array<double, 2> A = {
{1., 2., 3., 4.},
{5., 6., 7., 8.},
{9., 10., 11., 12.}
};
auto const AC = A;
multi::array<double, 1> const B = A[2];
using multi::blas::axpy;
axpy(2., B, A[1]); // daxpy
BOOST_REQUIRE( A[1][2] == 2.*B[2] + AC[1][2] );
}
{
using Z = std::complex<double>;
multi::array<Z, 2> A = {
{1., 2., 3., 4.},
{5., 6., 7., 8.},
{9., 10., 11., 12.}
};
auto const AC = A;
multi::array<Z, 1> const B = A[2];
using multi::blas::axpy;
axpy(2., B, A[1]); // zaxpy (2. is promoted to 2+I*0 internally and automatically)
BOOST_REQUIRE( A[1][2] == 2.*B[2] + AC[1][2] );
}
}
