Merge commit '36dbee95198f21eec8e5befe850c3183303788df' as 'external_codes/boost_multi/multi'

This commit is contained in:
Alfredo Correa 2021-04-21 19:13:15 -07:00
commit 966aa031ae
225 changed files with 40977 additions and 0 deletions

View File

@ -0,0 +1,3 @@
# Ignore anything whose name starts with "build" or ".build".
build*
.build*

View File

@ -0,0 +1,657 @@
# -*-indent-tabs-mode:nil;c-basic-offset:2;tab-width:4;-*-
# © Alfredo A. Correa 2020-2021
# Default Docker image; jobs that declare their own `image:` override it.
image: debian:testing
# Default-configuration build and test with g++, followed by `make install`.
g++-latest:
  stage: test
  script:
    - export CXX="g++"
    - apt-get -qq update
    - apt-get -qq install --no-install-recommends -y --quiet $CXX cmake make libboost-test-dev libboost-timer-dev libblas-dev libfftw3-dev
    # create a symlink named `multi` in the parent directory
    - ln --symbolic --force . ../multi
    - $CXX --version
    - mkdir build
    - cd build
    - cmake ..
    - cmake --build .
    - ctest --output-on-failure
    - make install
# Build and test with g++ while requesting CMAKE_CXX_STANDARD=20.
g++-latest-std20:
  stage: test
  script:
    - export CXX="g++"
    - apt-get update
    - apt-get install --no-install-recommends -y --quiet $CXX cmake make libboost-test-dev libboost-timer-dev libblas-dev libfftw3-dev
    - ln --symbolic --force . ../multi
    - $CXX --version
    - mkdir build
    - cd build
    - cmake .. -DCMAKE_CXX_STANDARD=20
    - cmake --build . --verbose
    - ctest --output-on-failure
# Build and test with g++ in the Debug configuration.
g++-current-debug:
  stage: test
  script:
    - export CXX="g++"
    - apt-get update
    - apt-get install --no-install-recommends -y --quiet $CXX cmake make libboost-test-dev libboost-timer-dev libblas-dev libfftw3-dev
    - ln --symbolic --force . ../multi
    - $CXX --version
    - mkdir build
    - cd build
    - cmake .. -DCMAKE_BUILD_TYPE=Debug
    - cmake --build . --verbose
    - ctest --output-on-failure
# Debug build with g++ and -fsanitize=address, then the test suite.
g++-current-asan:
  stage: test
  script:
    - export CXX="g++"
    - apt-get update
    - apt-get install --no-install-recommends -y --quiet $CXX cmake make libboost-test-dev libboost-timer-dev libblas-dev libfftw3-dev
    - ln --symbolic --force . ../multi
    - $CXX --version
    - mkdir build
    - cd build
    # CXXFLAGS is set only for the configure step; CMake records it in the cache
    - CXXFLAGS="-fsanitize=address" cmake .. -DCMAKE_BUILD_TYPE=Debug
    - cmake --build . --verbose
    - ctest --output-on-failure
# Coverage build: compiles the tests with gcov instrumentation, runs them,
# then produces a Cobertura XML report (gcovr) and a textual summary (lcov).
g++-current-codecov:
  stage: test
  script:
    - export CXX="g++"
    - apt-get update && apt-get install --no-install-recommends -y --quiet $CXX cmake make libboost-test-dev libboost-timer-dev libblas-dev libfftw3-dev curl gcovr lcov
    - ln --symbolic --force . ../multi
    - $CXX --version
    - mkdir build && cd build
    # inlining is disabled so that coverage is attributed to the original source lines
    # (the build type used to be passed twice on this command line; once is enough)
    - CXXFLAGS="-ftest-coverage -fprofile-arcs --coverage -fno-inline -fno-inline-small-functions -fno-default-inline" cmake .. -DCMAKE_BUILD_TYPE=Debug -DCMAKE_EXE_LINKER_FLAGS="-lgcov --coverage"
    - cmake --build . --verbose
    - ctest --output-on-failure -T Test -T Coverage
    # written to build/coverage.xml, which is what `artifacts:reports` below picks up
    - gcovr --xml-pretty --exclude-unreachable-branches --print-summary -o coverage.xml --root ${CI_PROJECT_DIR}
    - lcov --directory . --capture --output-file coverage.info
    # drop system headers from the lcov report
    - lcov --remove coverage.info '/usr/*' --output-file coverage.info
    - lcov --list coverage.info
  # extracts the percentage from gcovr's "lines: NN.N%" summary line;
  # the decimal point is escaped so it only matches a literal '.'
  coverage: /^\s*lines:\s*\d+\.\d+\%/
  artifacts:
    name: ${CI_JOB_NAME}-${CI_COMMIT_REF_NAME}-${CI_COMMIT_SHA}
    expire_in: 2 days
    reports:
      cobertura: build/coverage.xml
# Build and test with g++ in the Release configuration.
g++-current-release:
  stage: test
  script:
    - export CXX="g++"
    - apt-get update
    - apt-get install --no-install-recommends -y --quiet $CXX cmake make libboost-test-dev libboost-timer-dev libblas-dev libfftw3-dev
    - ln --symbolic --force . ../multi
    - $CXX --version
    - mkdir build
    - cd build
    - cmake .. -DCMAKE_BUILD_TYPE=Release
    - cmake --build . --verbose
    - ctest --output-on-failure
# g++-7 with CMAKE_CXX_STANDARD=17, on the debian:stable-backports image.
g++-7-std17:
  stage: test
  image: debian:stable-backports
  script:
    - export CXX="g++-7"
    - apt-get update
    - apt-get install --no-install-recommends -y --quiet $CXX cmake make libboost-test-dev libboost-timer-dev libblas-dev libfftw3-dev
    - ln --symbolic --force . ../multi
    - $CXX --version
    - mkdir build
    - cd build
    - cmake .. -DCMAKE_CXX_STANDARD=17
    - cmake --build .
    - ctest --output-on-failure
# g++-8 with the default configuration, on the debian:stable-backports image.
g++-8:
  stage: test
  image: debian:stable-backports
  script:
    - export CXX="g++-8"
    - apt-get update
    - apt-get install --no-install-recommends -y --quiet $CXX cmake make libboost-test-dev libboost-timer-dev libblas-dev libfftw3-dev
    - ln --symbolic --force . ../multi
    - $CXX --version
    - mkdir build
    - cd build
    - cmake ..
    - cmake --build .
    - ctest --output-on-failure
# g++-9 with the default configuration.
g++-9:
  stage: test
  script:
    - export CXX="g++-9"
    - apt-get update
    - apt-get install --no-install-recommends -y --quiet $CXX cmake make libboost-test-dev libboost-timer-dev libblas-dev libfftw3-dev
    - ln --symbolic --force . ../multi
    - $CXX --version
    - mkdir build
    - cd build
    - cmake ..
    - cmake --build . --verbose
    - ctest --output-on-failure
# g++-9 with CMAKE_CXX_STANDARD=17.
g++-9-std17:
  stage: test
  script:
    - export CXX="g++-9"
    - apt-get update
    - apt-get install --no-install-recommends -y --quiet $CXX cmake make libboost-test-dev libboost-timer-dev libblas-dev libfftw3-dev
    - ln --symbolic --force . ../multi
    - $CXX --version
    - mkdir build
    - cd build
    - cmake .. -DCMAKE_CXX_STANDARD=17
    - cmake --build . --verbose
    - ctest --output-on-failure
# g++-9 build that installs libopenblas-dev instead of libblas-dev.
g++-9-openblas:
  stage: test
  script:
    - export CXX="g++-9"
    - apt-get update
    - apt-get install --no-install-recommends -y --quiet $CXX cmake make libboost-test-dev libboost-timer-dev libopenblas-dev libfftw3-dev
    - ln --symbolic --force . ../multi
    - $CXX --version
    - mkdir build
    - cd build
    - cmake ..
    - cmake --build . --verbose
    - ctest --output-on-failure
# g++-9 build whose tests are run through CTest's memcheck step (valgrind is installed for it).
memcheck:
  stage: test
  script:
    - export CXX="g++-9"
    - apt-get update
    - apt-get install --no-install-recommends -y --quiet $CXX valgrind cmake make libboost-test-dev libboost-timer-dev libblas-dev libfftw3-dev
    - ln --symbolic --force . ../multi
    - $CXX --version
    - mkdir build
    - cd build
    - cmake ..
    - cmake --build . --verbose
    - ctest -T memcheck --output-on-failure
# clang++-9 with the default configuration.
clang++-9:
  stage: test
  script:
    - export CXX="clang++-9"
    - apt-get update
    - apt-get install --no-install-recommends -y --quiet clang-9 cmake make libboost-test-dev libboost-timer-dev libblas-dev libfftw3-dev
    - ln --symbolic --force . ../multi
    - $CXX --version
    - mkdir build
    - cd build
    - cmake ..
    - cmake --build . --verbose
    - ctest --output-on-failure
# clang++-9 with CMAKE_CXX_STANDARD=17.
clang++-9-std17:
  stage: test
  script:
    - export CXX="clang++-9"
    - apt-get update
    - apt-get install --no-install-recommends -y --quiet clang-9 cmake make libboost-test-dev libboost-timer-dev libblas-dev libfftw3-dev
    - ln --symbolic --force . ../multi
    - $CXX --version
    - mkdir build
    - cd build
    - cmake .. -DCMAKE_CXX_STANDARD=17
    - cmake --build . --verbose
    - ctest --output-on-failure
# clang++-9 build with the address and undefined-behaviour sanitizers enabled.
clang++-9-asan:
  stage: test
  script:
    - export CXX="clang++-9"
    # symbolizer path handed to the sanitizer runtime
    - export ASAN_SYMBOLIZER_PATH=/usr/lib/llvm-9/bin/llvm-symbolizer
    - export CXXFLAGS="-fsanitize=undefined -fsanitize=address -fno-omit-frame-pointer"
    - apt-get update
    - apt-get install --no-install-recommends -y --quiet clang-9 llvm-9 cmake make libboost-test-dev libboost-timer-dev libblas-dev libfftw3-dev llvm
    - ln --symbolic --force . ../multi
    - $CXX --version
    - mkdir build
    - cd build
    - cmake ..
    - cmake --build . --verbose
    # exported after the build, so it is in effect for the ctest run below
    - export ASAN_OPTIONS="halt_on_error=1 detect_leaks=1"
    - ctest --output-on-failure
# clang 10 and clang 11 have a bug when compiling in c++17 mode
# Distribution-default clang++ with the default configuration.
clang++:
  stage: test
  script:
    - export CXX="clang++"
    - apt-get update
    - apt-get install --no-install-recommends -y --quiet clang cmake make libboost-test-dev libblas-dev
    - ln --symbolic --force . ../multi
    - $CXX --version
    - mkdir build
    - cd build
    - cmake ..
    # everything after the bare `--` is forwarded to the native build tool
    - cmake --build . --verbose -- --quiet --no-print-directory
    - ctest --output-on-failure
# Builds with clang++ while running clang-tidy on every translation unit
# (via CMAKE_CXX_CLANG_TIDY); any clang-tidy warning is treated as an error.
clang++-tidy:
  stage: test
  script:
    - export CXX="clang++"
    - apt-get -qq update && apt-get -qq install --no-install-recommends -y --quiet clang clang-tidy cmake make libboost-test-dev libblas-dev liblapack-dev libfftw3-dev
    - ln --symbolic --force . ../multi
    - $CXX --version
    - clang-tidy --version
    - mkdir build && cd build
    # Single definition of the check filter, so that the list printed by
    # --list-checks is exactly the set enforced during the build.
    - export TIDY_CHECKS="*,-fuchsia-default-arguments-calls,-fuchsia-statically-constructed-objects,-fuchsia-overloaded-operator,-cppcoreguidelines-pro-type-vararg,-hicpp-vararg,-cppcoreguidelines-avoid-magic-numbers,-readability-magic-numbers,-cppcoreguidelines-macro-usage,-cppcoreguidelines-avoid-non-const-global-variables,-llvmlibc-implementation-in-namespace,-llvmlibc-callee-namespace,-llvmlibc-restrict-system-libc-headers,-cert-err58-cpp"
    # log the effective list of enabled checks (quoted so the shell never globs the '*')
    - clang-tidy -checks="$TIDY_CHECKS" --warnings-as-errors='*' --list-checks
    - cmake .. -DCMAKE_CXX_CLANG_TIDY="clang-tidy;-checks=$TIDY_CHECKS;--warnings-as-errors=*"
    - cmake --build . --verbose -- --quiet --no-print-directory
    - ctest --output-on-failure
# Distribution-default clang++ with CMAKE_CXX_STANDARD=17.
clang++-std17:
  stage: test
  script:
    - export CXX="clang++"
    - apt-get update
    - apt-get install --no-install-recommends -y --quiet clang cmake make libboost-test-dev libblas-dev
    - ln --symbolic --force . ../multi
    - $CXX --version
    - mkdir build
    - cd build
    - cmake .. -DCMAKE_CXX_STANDARD=17
    - cmake --build . --verbose -- --quiet --no-print-directory
    - ctest --output-on-failure
# clang++-11 with the default configuration.
clang++-11:
  stage: test
  script:
    - export CXX="clang++-11"
    - apt-get update
    - apt-get install --no-install-recommends -y --quiet clang-11 cmake make libboost-test-dev libblas-dev
    - ln --symbolic --force . ../multi
    - $CXX --version
    - mkdir build
    - cd build
    - cmake ..
    - cmake --build . --verbose -- --quiet --no-print-directory
    - ctest --output-on-failure
# clang++-11 on an Ubuntu 20.04 based image, with gcc-9 installed alongside it.
clang++-11-gcc9:
  stage: test
  image: vistart/cuda:10.2-ubuntu20.04
  script:
    - export CXX="clang++-11"
    - apt-get update
    # noninteractive frontend so that package configuration never prompts
    - DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends -y --quiet gcc-9 clang-11 cmake make libboost-test-dev libblas-dev
    - ln --symbolic --force . ../multi
    # -v (rather than --version) is what this job prints for the compiler
    - $CXX -v
    - mkdir build
    - cd build
    - cmake ..
    - cmake --build . --verbose -- --quiet --no-print-directory
    - ctest --output-on-failure
# clang++-11 with CMAKE_CXX_STANDARD=17.
clang++-11-std17:
  stage: test
  script:
    - export CXX="clang++-11"
    - apt-get update
    - apt-get install --no-install-recommends -y --quiet clang-11 cmake make libboost-test-dev libblas-dev
    - ln --symbolic --force . ../multi
    - $CXX --version
    - mkdir build
    - cd build
    - cmake .. -DCMAKE_CXX_STANDARD=17
    - cmake --build . --verbose -- --quiet --no-print-directory
    - ctest --output-on-failure
#icc:
# image: meteocima/dkr-intel
# stage: test
# script:
# - . /opt/intel/bin/compilervars.sh intel64
# - export CXX="icpc"
# - apt-get update && apt-get install --no-install-recommends -y --quiet cmake make libboost-test-dev libblas-dev liblapack-dev libfftw3-dev
# - ln --symbolic --force . ../multi
# - $CXX -v
# - mkdir build && cd build
# - cmake .. -DCMAKE_VERBOSE_MAKEFILE:BOOL=ON
# - cmake --build . -- --quiet --no-print-directory
# - export MKL_VERBOSE=1
# - ctest --output-on-failure
#icc-nomkl:
# image: meteocima/dkr-intel
# stage: test
# script:
# - export CXX="/opt/intel/bin/icpc"
# - apt-get update && apt-get install --no-install-recommends -y --quiet cmake make libboost-test-dev libblas-dev liblapack-dev libfftw3-dev
# - ln --symbolic --force . ../multi
# - $CXX -v
# - mkdir build && cd build
# - cmake .. -DCMAKE_VERBOSE_MAKEFILE:BOOL=ON
# - cmake --build . -- --quiet --no-print-directory
# - export MKL_VERBOSE=1
# - ctest --output-on-failure
#
#icc-memcheck:
# image: meteocima/dkr-intel
# stage: test
# script:
# - . /opt/intel/bin/compilervars.sh intel64
# - export CXX="icpc"
# - apt-get update && apt-get install --no-install-recommends -y --quiet cmake make valgrind libboost-test-dev libblas-dev liblapack-dev libfftw3-dev
# - ln --symbolic --force . ../multi
# - $CXX -v
# - mkdir build && cd build
# - cmake .. -DCMAKE_VERBOSE_MAKEFILE:BOOL=ON -DMEMORYCHECK_COMMAND_OPTIONS="--leak-check=full --show-reachable=yes --track-origins=yes --malloc-fill=0xEE --free-fill=0xFF --leak-check-heuristics=none -v --track-fds=yes --error-limit=no --show-below-main=yes --read-var-info=yes --gen-suppressions=all"
# - cmake --build . -- --quiet --no-print-directory
# - export MKL_VERBOSE=1
# - ctest -T memcheck --output-on-failure || (cat Testing/Temporary/MemoryChecker.*.log && exit 0)
#icc-std17:
# image: meteocima/dkr-intel
# stage: test
# script:
# - . /opt/intel/bin/compilervars.sh intel64
# - export CXX="icpc"
# - apt-get update && apt-get install --no-install-recommends -y --quiet cmake make libboost-test-dev libblas-dev liblapack-dev libfftw3-dev
# - ln --symbolic --force . ../multi
# - $CXX -v
# - mkdir build && cd build
# - cmake .. -DCMAKE_CXX_STANDARD=17 -DCMAKE_VERBOSE_MAKEFILE:BOOL=ON
# - cmake --build . -- --quiet --no-print-directory
# - export MKL_VERBOSE=1
# - ctest --output-on-failure
# CUDA build of the `test` subdirectory, with g++-8 as nvcc's host compiler.
# NOTE(review): the job is named cuda-10.0 but the image tag says 10.2 - confirm which is intended.
cuda-10.0:
  image: vistart/cuda:10.2-ubuntu20.04
  stage: test
  script:
    - export DEBIAN_FRONTEND=noninteractive
    - apt-get update
    - apt-get install --no-install-recommends -y --quiet gcc-8 g++-8 cmake make libboost-test-dev libboost-timer-dev libblas-dev libfftw3-dev
    - ln --symbolic --force . ../multi
    - nvcc --version
    - g++-8 --version
    # unlike the other jobs, this one configures from test/ instead of the project root
    - cd test
    - mkdir build
    - cd build
    - cmake -DENABLE_CUDA=1 -DCMAKE_CUDA_FLAGS="-ccbin=g++-8" ..
    - cmake --build . --verbose
    - ctest --output-on-failure
# Build with ENABLE_CUDA=1 on the nvidia/cuda:11.0-devel image.
cuda-11.0:
  image: nvidia/cuda:11.0-devel
  stage: test
  script:
    - export DEBIAN_FRONTEND=noninteractive
    - apt-get update
    - apt-get install --no-install-recommends -y --quiet cmake make libboost-test-dev libboost-timer-dev libblas-dev libfftw3-dev
    - ln --symbolic --force . ../multi
    - nvcc --version
    - mkdir build
    - cd build
    - cmake .. -DENABLE_CUDA=1
    - cmake --build . --verbose
    - ctest --output-on-failure
# Build with ENABLE_CUDA=1 on the nvidia/cuda:11.1-devel image.
cuda-11.1:
  image: nvidia/cuda:11.1-devel
  stage: test
  script:
    - export DEBIAN_FRONTEND=noninteractive
    - apt-get update
    - apt-get install --no-install-recommends -y --quiet cmake make libboost-test-dev libboost-timer-dev libblas-dev libfftw3-dev
    - ln --symbolic --force . ../multi
    - nvcc --version
    - mkdir build
    - cd build
    - cmake .. -DENABLE_CUDA=1
    - cmake --build . --verbose
    - ctest --output-on-failure
# Build with ENABLE_CUDA=1 and CMAKE_CXX_STANDARD=17 on nvidia/cuda:11.0-devel.
cuda-11.0-std17:
  image: nvidia/cuda:11.0-devel
  stage: test
  script:
    - export DEBIAN_FRONTEND=noninteractive
    - apt-get update
    - apt-get install --no-install-recommends -y --quiet cmake make libboost-test-dev libboost-timer-dev libblas-dev libfftw3-dev
    - ln --symbolic --force . ../multi
    - nvcc --version
    - mkdir build
    - cd build
    - cmake .. -DCMAKE_CXX_STANDARD=17 -DENABLE_CUDA=1
    - cmake --build . --verbose
    - ctest --output-on-failure
# Build with ENABLE_CUDA=1 and CMAKE_CXX_STANDARD=17 on nvidia/cuda:11.1-devel.
cuda-11.1-std17:
  image: nvidia/cuda:11.1-devel
  stage: test
  script:
    - export DEBIAN_FRONTEND=noninteractive
    - apt-get update
    - apt-get install --no-install-recommends -y --quiet cmake make libboost-test-dev libboost-timer-dev libblas-dev libfftw3-dev
    - ln --symbolic --force . ../multi
    - nvcc --version
    - mkdir build
    - cd build
    - cmake .. -DCMAKE_CXX_STANDARD=17 -DENABLE_CUDA=1
    - cmake --build . --verbose
    - ctest --output-on-failure
# Build with ENABLE_CUDA=1 and CMAKE_CXX_STANDARD=17 on nvidia/cuda:11.2.0-devel.
cuda-11.2-std17:
  image: nvidia/cuda:11.2.0-devel
  stage: test
  script:
    - export DEBIAN_FRONTEND=noninteractive
    - apt-get update
    - apt-get install --no-install-recommends -y --quiet cmake make libboost-test-dev libboost-timer-dev libblas-dev libfftw3-dev
    - ln --symbolic --force . ../multi
    - nvcc --version
    - mkdir build
    - cd build
    - cmake .. -DCMAKE_CXX_STANDARD=17 -DENABLE_CUDA=1
    - cmake --build . --verbose
    - ctest --output-on-failure
# g++ build with cppcheck attached to every compilation through CMAKE_CXX_CPPCHECK.
g++-cppcheck:
  stage: test
  script:
    - export CXX="g++"
    - apt-get -qq update
    - apt-get -qq install --no-install-recommends -y --quiet $CXX cmake make libboost-test-dev libboost-timer-dev libblas-dev liblapack-dev libfftw3-dev cppcheck
    - ln --symbolic --force . ../multi
    - $CXX --version
    - cppcheck --version
    - mkdir build
    - cd build
    # --error-exitcode makes cppcheck exit non-zero when it reports a finding
    - cmake -DCMAKE_CXX_CPPCHECK="cppcheck;--enable=all;--suppress=missingIncludeSystem;--suppress=unmatchedSuppression;--suppress=missingInclude;--inline-suppr;-D__align__;-DCUDARTAPI;--language=c++;--std=c++17;--error-exitcode=666" ..
    - cmake --build .
    - ctest --output-on-failure
# Downstream check: swaps QMCPACK's bundled copy of multi for the branch under
# test, then builds and runs QMCPACK's ppconvert and AFQMC tests with -Werror.
qmcpack-g++:
  stage: test
  script:
    - apt-get -qq update
    - apt-get -qq install --no-install-recommends -y libblas-dev liblapack-dev libfftw3-dev libboost-serialization-dev libopenmpi-dev gfortran g++ cmake make git ca-certificates numdiff python3 python3-numpy python3-h5py python3-mpi4py python3-scipy libxml2-dev libhdf5-dev
    - git clone https://github.com/QMCPACK/qmcpack.git
    - cd qmcpack
    # an identity is needed for the commits made just below
    - git config --global user.email "alfredo.correa@gmail.com"
    - git config --global user.name "Alfredo Correa"
    - git rm -r external_codes/boost_multi/multi
    - git commit -m "remove multi subtree"
    # $CI_REPOSITORY_URL is e.g. https://gitlab.com/correaa/boost-multi.git
    - git subtree add --squash -P external_codes/boost_multi/multi $CI_REPOSITORY_URL $CI_COMMIT_BRANCH
    - cd build
    - cmake -DCMAKE_C_COMPILER=mpicc -DCMAKE_CXX_COMPILER=mpicxx -DBUILD_AFQMC=1 -DBUILD_PPCONVERT=1 -DQMC_MIXED_PRECISION=1 -DCMAKE_BUILD_TYPE=Debug -DCMAKE_CXX_FLAGS="-Werror" -DMPIEXEC_PREFLAGS="--allow-run-as-root;--bind-to;none" ..
    - make ppconvert afqmc test_afqmc_matrix test_afqmc_numerics test_afqmc_slaterdeterminantoperations test_afqmc_walkers test_afqmc_hamiltonians test_afqmc_hamiltonian_operations test_afqmc_phmsd test_afqmc_wfn_factory test_afqmc_prop_factory test_afqmc_estimators qmc-afqmc-performance
    - ctest -R ppconvert --output-on-failure
    - ctest -R afqmc --output-on-failure
# Downstream check on a CUDA 11.2 image: QMCPACK is built with ENABLE_CUDA=1
# against the branch under test; only the ppconvert tests are executed.
qmcpack-cuda-11.2-compileonly:
  image: nvidia/cuda:11.2.0-devel
  stage: test
  script:
    - export DEBIAN_FRONTEND=noninteractive
    - apt-get -qq update
    - apt-get -qq install --no-install-recommends -y libblas-dev liblapack-dev libfftw3-dev libboost-serialization-dev libopenmpi-dev gfortran g++ cmake make git ca-certificates numdiff python3 python3-numpy python3-h5py python3-mpi4py python3-scipy libxml2-dev libhdf5-dev
    - git clone https://github.com/QMCPACK/qmcpack.git
    - cd qmcpack
    # an identity is needed for the commits made just below
    - git config --global user.email "alfredo.correa@gmail.com"
    - git config --global user.name "Alfredo Correa"
    - git rm -r external_codes/boost_multi/multi
    - git commit -m "remove multi subtree"
    # $CI_REPOSITORY_URL is e.g. https://gitlab.com/correaa/boost-multi.git
    - git subtree add --squash -P external_codes/boost_multi/multi $CI_REPOSITORY_URL $CI_COMMIT_BRANCH
    - cd build
    - cmake -DCMAKE_C_COMPILER=mpicc -DCMAKE_CXX_COMPILER=mpicxx -DBUILD_AFQMC=1 -DBUILD_PPCONVERT=1 -DQMC_CXX_STANDARD=17 -DENABLE_CUDA=1 ..
    - make ppconvert afqmc test_afqmc_matrix test_afqmc_numerics test_afqmc_slaterdeterminantoperations test_afqmc_walkers test_afqmc_hamiltonians test_afqmc_hamiltonian_operations test_afqmc_phmsd test_afqmc_wfn_factory test_afqmc_prop_factory test_afqmc_estimators qmc-afqmc-performance
    - ctest -R ppconvert --output-on-failure
# Downstream check: builds, installs and tests INQ with its multi submodule
# switched to the branch under test.
inq-g++-latest:
  stage: test
  script:
    - apt-get update
    - apt-get install --no-install-recommends -y --quiet libblas-dev liblapack-dev libfftw3-dev libboost-serialization-dev libopenmpi-dev gfortran g++ pkg-config cmake make git ca-certificates
    - git clone --recurse-submodules --remote-submodules https://gitlab.com/npneq/inq.git
    # move the submodule checkout to the branch being tested, then return to inq/
    - cd inq/external_libs/multi
    - git checkout $CI_COMMIT_BRANCH
    - cd ../..
    - mkdir build
    - cd build
    - CXX=mpic++ ../configure --prefix=$HOME
    - make
    - make install
    - ctest --output-on-failure
# Downstream compile-only check: configures INQ for CUDA with the multi
# submodule switched to the branch under test and builds the `silicon` target.
# Nothing is executed, so no GPU is required.
inq-cuda-11.2-compileonly:
  image: nvidia/cuda:11.2.0-devel
  stage: test
  script:
    - export DEBIAN_FRONTEND=noninteractive
    - apt-get update && apt-get install --no-install-recommends -y --quiet libblas-dev liblapack-dev libfftw3-dev libboost-serialization-dev libopenmpi-dev gfortran g++ pkg-config cmake make git ca-certificates
    # PREFIX was previously never set in this job, so configure received an
    # empty `--prefix=`; use a scratch directory as the sibling inq-nvcc-ompi job does.
    - export PREFIX=`mktemp -d`
    - git clone --recurse-submodules https://gitlab.com/npneq/inq.git
    - cd inq
    - cd external_libs/multi
    - git checkout $CI_COMMIT_BRANCH
    - cd ../..
    - mkdir build && cd build
    - export CUDACXX=/usr/local/cuda/bin/nvcc
    # MPI include/library flags are taken from the mpic++ wrapper and passed to nvcc explicitly
    - export CUDAFLAGS="$(for x in `mpic++ --showme:incdirs`; do echo -n -I$x" " ; done) -std=c++17 -DFMT_USE_UDL_TEMPLATE=0 -D_DISABLE_CUDA_SLOW -O0 --gpu-architecture sm_70 --expt-relaxed-constexpr --expt-extended-lambda --Werror=cross-execution-space-call --compiler-options -std=c++17,-O0,-Wall,-Wfatal-errors"
    - export LDFLAGS=$(for x in `mpic++ --showme:libdirs`; do echo -n -L$x" " ; done)
    - export LIBS=$(for x in `mpic++ --showme:libs`; do echo -n -l$x" " ; done)
    - $CUDACXX -V
    - ../configure --prefix=$PREFIX --enable-cuda --with-cuda-prefix=/usr/local/cuda
    - make silicon
# Coverage build on the self-hosted runner tagged `intel_compiler`; the lcov
# results are uploaded to codecov.io.
g++-codecov-runner:
  stage: test
  tags:
    - intel_compiler
  script:
    - export CXX="g++"
    - $CXX --version
    - mkdir build && cd build
    - cmake --version
    - CXXFLAGS="-ftest-coverage -fprofile-arcs --coverage" cmake .. -DCMAKE_BUILD_TYPE=Debug -DCMAKE_VERBOSE_MAKEFILE:BOOL=ON -DCMAKE_EXE_LINKER_FLAGS="-lgcov --coverage"
    - cmake --build . -j 12
    - ctest --output-on-failure -T Test -T Coverage
    # (disabled) - gcovr --xml-pretty --exclude-unreachable-branches --print-summary -o coverage.xml --root ${CI_PROJECT_DIR}
    - lcov --directory . --capture --output-file coverage.info
    # drop system headers from the report
    - lcov --remove coverage.info '/usr/*' --output-file coverage.info
    - lcov --list coverage.info
    # The upload token is a credential and must not live in the repository:
    # define CODECOV_TOKEN as a (masked) CI/CD variable in the project settings.
    # A failed upload is reported but deliberately does not fail the job.
    - bash <(curl -s https://codecov.io/bash) -t "${CODECOV_TOKEN}" || echo "Codecov did not collect coverage reports"
# Intel icpc build on the `intel_compiler` runner, invoking the compiler by
# absolute path without sourcing compilervars.sh.
icpc-nomkl-runner:
  stage: test
  tags:
    - intel_compiler
  script:
    - export CXX="/opt/intel/system_studio_2020/bin/icpc"
    - $CXX --version
    - mkdir build
    - cd build
    - cmake .. -DCMAKE_VERBOSE_MAKEFILE:BOOL=ON
    - cmake --build . -j 12
    - export MKL_VERBOSE=1
    - ctest --output-on-failure
# Intel icpc build on the `intel_compiler` runner, with the Intel environment
# loaded from compilervars.sh first.
icpc-runner:
  stage: test
  tags:
    - intel_compiler
  script:
    - . /opt/intel/system_studio_2020/bin/compilervars.sh intel64
    - export CXX="icpc"
    - $CXX --version
    - mkdir build
    - cd build
    - cmake .. -DCMAKE_VERBOSE_MAKEFILE:BOOL=ON
    - cmake --build . -j 12
    - export MKL_VERBOSE=1
    - ctest --output-on-failure
#icpc-memcheck-runner:
# stage: test
# tags:
# - intel_compiler
# script:
# - . /opt/intel/system_studio_2020/bin/compilervars.sh intel64
# - export CXX="icpc"
# - $CXX --version
# - mkdir build && cd build
# - cmake .. -DCMAKE_VERBOSE_MAKEFILE:BOOL=ON
# - cmake --build . -j 12
# - ctest -T memcheck --output-on-failure || (cat Testing/Temporary/MemoryChecker.*.log && exit 0)
#icpc-std17-runner:
# stage: test
# tags:
# - intel_compiler
# script:
# - . /opt/intel/system_studio_2020/bin/compilervars.sh intel64
# - export CXX="icpc"
# - $CXX --version
# - mkdir build && cd build
# - cmake .. -DCMAKE_CXX_STANDARD=17 -DCMAKE_VERBOSE_MAKEFILE:BOOL=ON
# - cmake --build . -j 12
# - export MKL_VERBOSE=1
# - ctest --output-on-failure
# Downstream check on the `intel_compiler` runner: builds INQ with nvcc and
# OpenMPI against the branch under test, installs it into a scratch prefix and
# runs its tests, including a 4-process MPI run.
inq-nvcc-ompi:
  stage: test
  tags:
    - intel_compiler
  script:
    # scratch install prefix, deleted by the last step of the script
    - export PREFIX=`mktemp -d`
    - git clone --recurse-submodules https://gitlab.com/npneq/inq.git
    # move the submodule checkout to the branch being tested, then return to inq/
    - cd inq/external_libs/multi
    - git checkout $CI_COMMIT_BRANCH
    - cd ../..
    - mkdir build
    - cd build
    - export CUDACXX=/usr/local/cuda/bin/nvcc
    # MPI include/library flags are taken from the mpic++ wrapper and passed to nvcc explicitly
    - export CUDAFLAGS="$(for x in `mpic++ --showme:incdirs`; do echo -n -I$x" " ; done) -std=c++17 -DFMT_USE_UDL_TEMPLATE=0 -D_DISABLE_CUDA_SLOW -O3 --gpu-architecture sm_70 --expt-relaxed-constexpr --expt-extended-lambda --Werror=cross-execution-space-call --compiler-options -Ofast,-std=c++17,-Wall,-Wfatal-errors"
    - export LDFLAGS=$(for x in `mpic++ --showme:libdirs`; do echo -n -L$x" " ; done)
    - export LIBS=$(for x in `mpic++ --showme:libs`; do echo -n -l$x" " ; done)
    - $CUDACXX -V
    - ../configure --prefix=$PREFIX --enable-cuda --with-cuda-prefix=/usr/local/cuda
    - make -j8
    - make -j8 install
    - ctest --output-on-failure --timeout 600
    # same tests again from src/, launched through mpirun with 4 processes
    - cd src; INQ_EXEC_ENV="mpirun --oversubscribe -np 4" ctest --output-on-failure --timeout 600; cd ..
    - rm -rf $PREFIX
#blas&fft:
# stage: test
# script:
# - perl -pi -e 's/main/main\ contrib\ non-free/g' /etc/apt/sources.list
# - apt update --quiet
# - DEBIAN_FRONTEND=noninteractive apt install --no-install-recommends -f-assume-yes --quiet libboost-test-dev libboost-timer-dev libtbb-dev libboost-serialization-dev libboost-iostreams-dev librange-v3-dev valgrind
# - DEBIAN_FRONTEND=noninteractive apt install --no-install-recommends --assume-yes --quiet clang pkg-config libblas-dev libblas64-dev libfftw3-dev nvidia-cuda-toolkit
# - ln --symbolic --force . ../multi
# - export CXXX="clang++ -x c++"
# - export CXXFLAGS="-Wall -Wextra -Wpedantic -O3 -lcudart -lfftw3 -lcublas -lcufft -lboost_timer -lboost_unit_test_framework `pkg-config --libs blas`"
# - export CXX="${CXXX} ${CXXFLAGS}"
# - $CXX --version
# - cd adaptors/blas
# - for a in ./*.hpp; do echo $a; $CXX $a || exit; done;
# - cd tests
# - for a in ./*.cpp; do echo $a; $CXX $a || exit; done;
# - cd ..
# - cd ../..
# - cd adaptors
# - sh ./fftw.hpp
# - $CXX fft.hpp
#blas&fftGPU-11:
# stage: build
# tags:
# - cuda_gpu
# stage: test
# script:
# - export PATH=/usr/local/cuda-11.0/bin:$PATH #export PATH=/usr/local/cuda/bin:$PATH
# - export LD_LIBRARY_PATH=/usr/local/cuda-11.0/lib64:$LD_LIBRARY_PATH
# - export CXXX="clang++ -x c++"
# - export CXXFLAGS="`#-Wall -Wextra -Wpedantic` -Ofast -Wl,-rpath=/usr/local/cuda/lib64 -L/usr/local/cuda-11.0/lib64 -I/usr/local/cuda-11.0/include -lcudart -lfftw3 -lcublas -lcufft -lboost_timer -lboost_unit_test_framework `pkg-config --libs blas` "
# - export CXX="${CXXX} ${CXXFLAGS}"
# - $CXX --version
# - cd adaptors/blas
# - for a in ./*.hpp; do echo $a; sh $a || exit; echo "\n"; done;
# - cd tests
# - for a in ./*.cpp; do echo $a; sh $a || exit; echo "\n"; done;
# - cd ..
# - cd ../..
# - cd adaptors
# - sh ./fftw.hpp
# - sh ./fft.hpp
#blas&fftGPU:
# stage: build
# tags:
# - cuda_gpu
# stage: test
# script:
# - export PATH=/usr/local/cuda/bin:$PATH #export PATH=/usr/local/cuda/bin:$PATH
# - export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
# - export CXXX="clang++ -x c++"
# - export CXXFLAGS="`#-Wall -Wextra -Wpedantic` -Ofast -Wl,-rpath=/usr/local/cuda/lib64 -L/usr/local/cuda/lib64 -I/usr/local/cuda/include -lcudart -lfftw3 -lcublas -lcufft -lboost_timer -lboost_unit_test_framework `pkg-config --libs blas` "
# - export CXX="${CXXX} ${CXXFLAGS}"
# - $CXX --version
# - cd adaptors/blas
# - for a in ./*.hpp; do echo $a; sh $a || exit; echo "\n"; done;
# - cd tests
# - for a in ./*.cpp; do echo $a; sh $a || exit; echo "\n"; done;
# - cd ..
# - cd ../..
# - cd adaptors
# - sh ./fftw.hpp
# - sh ./fft.hpp

View File

@ -0,0 +1,72 @@
# Build, test and install description of the header-only `multi` library.
#
# 3.12 is the first CMake release whose project() accepts HOMEPAGE_URL,
# which is used just below.
cmake_minimum_required(VERSION 3.12)

project(multi VERSION 0.76.0
        DESCRIPTION "A header only C++ library that provides multidimensional array access to contiguous or regularly contiguous memory (or ranges)."
        HOMEPAGE_URL "https://gitlab.com/correaa/boost-multi"
        LANGUAGES CXX)

#set(CMAKE_CXX_STANDARD 14)
#set(CMAKE_CXX_STANDARD_REQUIRED True)
#set(CMAKE_CXX_EXTENSIONS OFF)

# Provides CMAKE_INSTALL_INCLUDEDIR, CMAKE_INSTALL_LIBDIR, CMAKE_INSTALL_DATAROOTDIR, ...
include(GNUInstallDirs)

# Header-only library: an INTERFACE target that only carries usage requirements.
add_library(${PROJECT_NAME} INTERFACE)

# NOTE(review): the headers installed below are taken from the source root,
# while BUILD_INTERFACE points at <source>/include - confirm that directory exists.
target_include_directories(
    ${PROJECT_NAME}
    INTERFACE $<BUILD_INTERFACE:${${PROJECT_NAME}_SOURCE_DIR}/include>
              $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>)

# Consumers must compile with at least C++14.
target_compile_features(${PROJECT_NAME} INTERFACE cxx_std_14)

enable_testing()

# Memory-check settings must be defined before include(CTest) to take effect
# for `ctest -T memcheck`.
find_program(MEMORYCHECK_COMMAND valgrind)
set(MEMORYCHECK_COMMAND_OPTIONS "--leak-check=full --error-exitcode=1")
include(CTest)

add_subdirectory(test)

install(TARGETS ${PROJECT_NAME}
        EXPORT ${PROJECT_NAME}_Targets
        ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
        LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
        RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})

# Package files so that installed copies can be located with find_package(multi).
include(CMakePackageConfigHelpers)
write_basic_package_version_file("${PROJECT_NAME}ConfigVersion.cmake"
                                 VERSION ${PROJECT_VERSION}
                                 COMPATIBILITY SameMajorVersion)

configure_package_config_file(
    "${PROJECT_SOURCE_DIR}/cmake/multiConfig.cmake.in"
    "${PROJECT_BINARY_DIR}/${PROJECT_NAME}Config.cmake"
    INSTALL_DESTINATION
    ${CMAKE_INSTALL_DATAROOTDIR}/${PROJECT_NAME}/cmake)

# The exported target is named boost::multi::multi (namespace + target name).
install(EXPORT ${PROJECT_NAME}_Targets
        FILE ${PROJECT_NAME}Targets.cmake
        NAMESPACE boost::multi::
        DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/${PROJECT_NAME}/cmake)

install(FILES "${PROJECT_BINARY_DIR}/${PROJECT_NAME}Config.cmake"
              "${PROJECT_BINARY_DIR}/${PROJECT_NAME}ConfigVersion.cmake"
        DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/${PROJECT_NAME}/cmake)

#install(DIRECTORY ${PROJECT_SOURCE_DIR}/include/SI DESTINATION include)

# Headers go under ${CMAKE_INSTALL_INCLUDEDIR}/multi so that they land in the
# directory advertised by INSTALL_INTERFACE above, even when the user
# overrides CMAKE_INSTALL_INCLUDEDIR (its default is "include").
install(FILES ${PROJECT_SOURCE_DIR}/array_ref.hpp ${PROJECT_SOURCE_DIR}/array.hpp
        DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/multi)
install(DIRECTORY ${PROJECT_SOURCE_DIR}/detail DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/multi)
install(DIRECTORY ${PROJECT_SOURCE_DIR}/adaptors DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/multi)

add_subdirectory(adaptors/blas)

# To install this project:
#cmake .. -DCMAKE_INSTALL_PREFIX:PATH=$HOME
#cmake --build . --config Release --target test --target install -- -j $(nproc)

# To use the installed project (package name and target as exported above):
#project("Your project")
#find_package(multi CONFIG REQUIRED)
#add_executable(${PROJECT_NAME} src/your_main.cpp)
#target_link_libraries(${PROJECT_NAME} boost::multi::multi)

View File

@ -0,0 +1,671 @@
<!--
(pandoc `#--from gfm` --to html --standalone --metadata title=" " $0 > $0.html) && firefox --new-window $0.html; sleep 5; rm $0.html; exit
-->
# [Boost.]Multi
(not an official Boost library)
_© Alfredo A. Correa, 2018-2021_
`Multi` provides multidimensional array access to contiguous or regularly contiguous memory (or ranges).
It shares the goals of [Boost.MultiArray](https://www.boost.org/doc/libs/1_69_0/libs/multi_array/doc/index.html),
although the code is completely independent and the syntax has slight differences or has been extended.
`Multi` and `Boost.MultiArray` types can be used interchangeably for the most part; they differ in the semantics of reference and value types.
Multi aims to simplify the semantics of Boost.MultiArray and make it more compatible with the Standard (STL) Algorithms and special memory.
It requires C++14.
Some features:
* Arbitrary pointer types (minimal requirements)
* Simplified implementation (~1200 lines)
* Fast access of subarrays (view) types
* Value semantics of multi-dimensional array container
* Better semantics of subarray (view) types
* Interoperability with other libraries, STL, ranges
(Do not confuse this library with Boost.MultiArray or Boost.MultiIndex.)
## Contents
[[_TOC_]]
## Installation and Tests
`Multi` doesn't require installation; including the single file `#include<multi/array.hpp>` is enough to use the full core library.
`Multi`'s _only_ dependency is the standard C++ library.
It is important to compile programs that use the library with a decent level of optimization (e.g. `-O2`) to avoid slowdown if individual element-access is intensively used.
For example, when testing speed, please make sure that you are compiling in release mode (`-DNDEBUG`) and with optimizations (`-O3`),
if your test involves mathematical operations add arithmetic optimizations (`-Ofast`) to compare with Fortran code.
A CMake build system is provided to automatically run basic tests.
The tests do depend on Boost.Test.
```bash
git clone https://gitlab.com/correaa/boost-multi.git multi
cd multi
```
```bash
#export CXX="nvcc -DBOOST_PP_VARIADICS=1 -x cu -O3" #optional spec. compiler
mkdir -p test/build
cd test/build
cmake ..
make -j
make test -j
```
The code is developed on `clang` (10.0), `gcc` (9.3) and `nvcc` 11 compilers, and [tested regularly ](https://gitlab.com/correaa/boost-multi/pipelines) with clang 9.0, NVCC 10.1, Intel (19.1), and PGI(nvc++) 20.7 compilers.
For detailed compilation instructions of test see the Continuous Integration (CI) definition file https://gitlab.com/correaa/boost-multi/-/blob/master/.gitlab-ci.yml
## Types
* `multi::array<T, D, A>`: Array of dimension `D`, it has value semantics if `T` has value semantics. Memory is requested by allocator of type `A`, should support stateful allocators.
* `multi::array_ref<T, D, P = T*>`: Array interpretation of a random access range, usually a memory block. It has reference semantics. Thanks to (non-virtual) inheritance an `array<T, D, A>` is-a `array_ref<T, D, A::pointer>`.
* other derived "unspecified types" fulfil (a still loosely defined) `MultiArrayView` concept, for example by taking partial indices or rotations (transpositions). These reference types cannot be stored except through life-time extensions `auto&&`. Due to language limitations `auto` will not deduce a corresponding value-semantics type; for this reason it is necessary to use a "decay" idiom to obtain a value object.
* `MultiArrayView<T,D,P>::(const_)iterator`: Iterator to subarrays of dimension `D - 1`. For `D == 1` this is an iterator to an element. These types are generated by `begin` and `end` functions.
* `MultiArrayView<T, D, P>::(const_)reference`: Reference to subarrays of dimension `D - 1`. For `D > 1` these are not true C++-references but types that emulate them (with reference semantics), therefore `auto` is not well behaved. For `D==1` this is a true C++ reference to an element. These types are generated by dereferencing iterators, e.g. `*begin(MA)`.
## Basic Usage
Declare an array specifying the element type and the dimension.
Elements can be input with nested braced notation.
```cpp
multi::array<double, 2> A = {
{1, 2, 3},
{4, 5, 6}
};
```
The size is automatically deduced; the first dimension are the (two) "rows" above.
```cpp
assert( A.size()==2 );
assert( std::get<1>(A.sizes()) == 3 );
```
The value of an array can be copied, moved, and compared.
Copies are equal but independent.
```cpp
multi::array<double, 2> B = A;
assert( extensions(B) == extensions(A) );
assert( B[0][1] == A[0][1] );
assert( &B[0][1] != &A[0][1] );
assert( B == A );
```
Arrays can be initialized by the size alone, in which case the element values are default constructed:
```cpp
multi::array<double, 3> C({3, 4, 5}); // 3*4*5 = 60 elements
```
Arrays can be passed by value or by reference, most of the time they should be passed through generic parameters.
Most useful functions work on the concept of array rather than on a concrete type.
```cpp
template<class ArrayDouble2D> // instead of the overly specific argument multi::array<double, 2>
double const& element_1_1(ArrayDouble2D const& m){return m[1][1];}
...
assert( element_1_1(A) == A[1][1] );
```
Generic function arguments that are not intended to be modified are passed by `const&`; otherwise they are passed by forwarding reference `&&`.
In this way the functions can be called on subblocks of larger matrices.
```cpp
assert( &element_1_1(C3D[0]) == &C3D[0][1][1] );
```
## Advanced Usage
We create a static C-array of `double`s, and refer to it via a bidimensional array `multi::array_ref<double, 2>`.
```cpp
#include "../array_ref.hpp"
#include "../array.hpp"
#include<algorithm> // for sort
#include<iostream> // for print
namespace multi = boost::multi;
using std::cout; using std::cerr;
int main(){
double d2D[4][5] = {
{150, 16, 17, 18, 19},
{ 30, 1, 2, 3, 4},
{100, 11, 12, 13, 14},
{ 50, 6, 7, 8, 9}
};
multi::array_ref<double, 2> d2D_ref{&d2D[0][0], {4, 5}};
...
```
Note that the syntax of creating a reference array involves passing the pointer to a memory block (20 elements here) and the logical dimensions of that memory block (4 by 5 here).
Next we print the elements in a way that corresponds to the logical arrangement:
```cpp
...
for(auto i : d2D_ref.extension(0)){
for(auto j : d2D_ref.extension(1))
cout << d2D_ref[i][j] <<' ';
cout <<'\n';
}
...
```
This will output:
> ```cpp
> 150 16 17 18 19
> 30 1 2 3 4
> 100 11 12 13 14
> 50 6 7 8 9
> ```
It is sometimes said (by Sean Parent) that the whole of STL algorithms can be seen as intermediate pieces to implement `std::stable_sort`.
Presumably if one can sort over a range, one can perform any other standard algorithm.
```cpp
...
std::stable_sort( begin(d2D_ref), end(d2D_ref) );
...
```
If we print this we will get
> ```cpp
> 30 1 2 3 4
> 50 6 7 8 9
> 100 11 12 13 14
> 150 16 17 18 19
> ```
The array has been changed to be in row-based lexicographical order.
Since the sorted array is a reference to the original data, the original array has changed.
```cpp
...
assert( d2D[1][1] == 6 );
...
```
(Note that `std::*sort` cannot be applied directly to a multidimensional C-array or to Boost.MultiArray types.)
If we want to order the matrix on a per-column basis we need to "view" the matrix as a range of columns. This is done in the bidimensional case, by accessing the matrix as a range of columns:
```cpp
...
std::stable_sort( d2D_ref.begin(1), d2D_ref.end(1) );
}
```
Which will transform the matrix into.
> ```cpp
> 1 2 3 4 30
> 6 7 8 9 50
> 11 12 13 14 100
> 16 17 18 19 150
> ```
In other words, a matrix of dimension `D` can be viewed simultaneously as `D` different ranges of different "transpositions" by passing an integer value to `begin` and `end` indicating the preferred dimension.
`begin(0)` is equivalent to `begin()`.
## Initialization
`array_ref` is initialized from a preexisting contiguous range; the index extensions should be compatible with the total number of elements.
```cpp
double* dp = new double[12];
multi::array_ref<double, 2> A({3,4}, dp);
multi::array_ref<double, 2> B({2,6}, dp);
...
delete[] dp;
```
`array` is initialized by specifying the index extensions (and optionally a default value) or alternatively from a rectangular list.
```cpp
/*In C++17 the element-type and the dimensionality can be omitted*/
multi::array/*<double, 1>*/ A1 = {1.,2.,3.};
assert(A1.dimensionality==1 and A1.num_elements()==3);
multi::array/*<double, 2>*/ A2 {
{1.,2.,3.},
{4.,5.,6.}
}; assert(A2.dimensionality==2 and A2.num_elements()==2*3);
multi::array/*<double, 3>*/ const A3 = {
{{ 1.2, 0.}, { 2.4, 1.}},
{{11.2, 3.}, {34.4, 4.}},
{{15.2, 99.}, {32.4, 2.}}
}; assert(A3.dimensionality==3 and A3.num_elements()==3*2*2);
```
## Iteration
Accessing arrays by iterators (`begin`/`end`) enables the use of many iterator based algorithms (see the sort example above).
`begin/end(A)` (or equivalently `A.begin/end()`) gives iterators that are linear and random access in the leading dimension.
`A.begin/end(n)` gives access in non-leading nested dimension number `n`.
`cbegin/cend(A)` (or equivalently `A.cbegin/cend()`) gives read-only iterators.
For example in three dimensional array,
(cbegin(A)+1)->operator[](1).begin()[0] = 342.4; //error, read-only
(begin(A)+1)->operator[](1).begin()[0] = 342.4; // assigns to A[1][1][0]
assert( (begin(A)+1)->operator[](1).begin()[0] == 342.4 );
As an example, this function allows printing arrays of arbitrary dimension into a linear comma-separated form.
```cpp
void print(double const& d){cout<<d;};
template<class MultiArray>
void print(MultiArray const& ma){
cout<<"{";
if(not ma.empty()){
print(*cbegin(ma));
std::for_each(cbegin(ma)+1, cend(ma), [](auto&& e){cout<<","; print(e);});
}
cout<<"}";
}
...
print(A);
```
> {{{1.2,1.1},{2.4,1}},{{11.2,3},{34.4,4}},{{15.2,99},{32.4,2}}}
Except for those corresponding to the one-dimensional case, dereferencing iterators generally produces proxy-reference objects.
Therefore this is not allowed:
auto row = *begin(A); // compile error
This is because `row` doesn't have the expected value semantics, and didn't produce any data copy.
However this expresses the intention better
decltype(A)::value_type row = *begin(A); // there is a real copy.
In my experience, however, this produces a more consistent idiom to hold references without copying elements.
auto const& crow = *cbegin(A); // same as decltype(A)::const_reference crow = *cbegin(A);
auto&& row = * begin(A); // same as decltype(A):: reference row = * begin(A);
## Indexing
Arrays provide random access to elements or subviews.
Many algorithms on arrays are oriented to linear algebra, which are ubiquitously implemented in terms of multidimensional index access.
### Element access and partial access
Index access mimics that of C-fixed sizes arrays, for example a 3-dimensional array will access to an element by `m[1][2][3]`,
which can be used for write and read operations.
Partial index arguments `m[1][2]` generate a view 1-dimensional object.
Transpositions are also multi-dimensional arrays views in which the index are *logically* rearranged, for example `m.rotated(1)[2][3][1] == rotated(m)[2][3][1] == m[1][2][3]`.
(rotate refers to the fact that the logical indices are rotated.)
As an illustration of an algorithm based on index access (as opposed to iterators),
this example code implements Gauss Jordan Elimination without pivoting:
```cpp
template<class Matrix, class Vector>
auto gj_solve(Matrix&& A, Vector&& y)->decltype(y[0]/=A[0][0], y){
std::ptrdiff_t Asize = size(A);
for(std::ptrdiff_t r = 0; r != Asize; ++r){
auto&& Ar = A[r];
auto&& Arr = Ar[r];
for(std::ptrdiff_t c = r + 1; c != Asize; ++c) Ar[c] /= Arr;
auto const yr = (y[r] /= Arr);
for(std::ptrdiff_t r2 = r + 1; r2 != Asize; ++r2){
auto&& Ar2 = A[r2];
auto const& Ar2r = Ar2[r]; // auto&& Ar = A[r];
for(std::ptrdiff_t c = r + 1; c != Asize; ++c) Ar2[c] -= Ar2r*Ar[c];
y[r2] -= Ar2r*yr;
}
}
for(std::ptrdiff_t r = Asize - 1; r > 0; --r){
auto const& yr = y[r];
for(std::ptrdiff_t r2 = r-1; r2 >=0; --r2) y[r2] -= yr*A[r2][r];
}
return y;
}
```
This function can be applied to a `multi::array` container:
```cpp
multi::array<double, 2> A = {{-3., 2., -4.},{0., 1., 2.},{2., 4., 5.}};
multi::array<double, 1> y = {12.,5.,2.}; //(M); assert(y.size() == M); iota(y.begin(), y.end(), 3.1);
gj_solve(A, y);
```
and also to a combination of `MultiArrayView`-type objects:
```cpp
multi::array<double, 2> A({6000, 7000}); std::iota(A.data(), A.data() + A.num_elements(), 0.1);
std::vector<double> y(3000); std::iota(y.begin(), y.end(), 0.2);
gj_solve(A({1000, 4000}, {0, 3000}), y);
```
### Slices and strides
Given an array, a slice in the first dimension can be taken with the `sliced` function. `sliced` takes two arguments, the first index of the slice and the last index (not included) of the slice. For example,
```cpp
multi::array<double, 2> d2D({4, 5});
assert( d2D.size(0) == 4 and d2D.size(1) == 5 );
auto&& d2D_sliced = d2D.sliced(1, 3); // {{d2D[1], d2D[2]}}
assert( d2D_sliced.size(0) == 2 and d2D_sliced.size(1) == 5 );
```
The number of rows in the sliced matrix is 2 because we took only two rows, row 1 and row 2 (row 3 is excluded).
In the same way a strided view of the original array can be taken with the `strided` function.
```cpp
auto&& d2D_strided = d2D.strided(2); // {{ d2D[0], d2D[2] }};
assert( d2D_strided.size(0) == 2 and d2D_strided.size(1) == 5 );
```
In this case the number of rows is 2 because, out of the 4 original rows we took one every two.
Operations can be combined in a single line:
```cpp
auto&& d2D_slicedstrided = d2D.sliced(1, 3).strided(2); // {{ d2D[1] }};
assert( d2D_slicedstrided.size(0) == 1 and d2D_slicedstrided.size(1) == 5 );
```
For convenience, `A.sliced(a, b, c)` is the same as `A.sliced(a, b).strided(c)`.
By combining `rotated`, `sliced` and `strided` one can take sub arrays at any dimension.
For example in a two dimensional array one can take a subset of columns by defining.
```cpp
auto&& subA = A.rotated(1).sliced(1, 3).strided(2).rotated(-1);
```
Other notations are available, but when in doubt the `rotated/strided/sliced/rotated` and combinations of them idioms provides the most control over the subview operations.
(At the moment the `strided` argument has to divide the total size of the slice (or matrix), otherwise the behavior is undefined.)
Blocks (slices) in multidimensions can be obtained by pure index notation using `.operator()`:
```cpp
multi::array<double, 2> A({6, 7}); // 6x7 array
A({1, 4}, {2, 4}) // 3x2 array, containing indices 1 to 4 in the first dimension and 2 to 4 in the second dimension.
```
## Concept Requirements
The design tries to impose the minimum possible requirements over the used referred types.
Pointer-like random access types can be used as substitutes of built-in pointers.
```cpp
namespace minimal{
template<class T> class ptr{ // minimalistic pointer
T* impl_;
T& operator*() const{return *impl_;}
auto operator+(std::ptrdiff_t n) const{return ptr{impl_ + n};}
// operator[], operator+=, etc are optional but not necessary
};
}
int main(){
double* buffer = new double[100];
multi::array_ref<double, 2, minimal::ptr<double> > CC(minimal::ptr<double>{buffer}, {10, 10});
CC[2]; // requires operator+
CC[1][1]; // requires operator*
CC[1][1] = 9;
assert(CC[1][1] == 9);
delete[] buffer;
}
```
### Linear Sequences: Pointers
An `array_ref` can reference to an arbitrary random access iterator sequence.
This way, any linear (random access) sequence (e.g. `raw memory`, `std::vector`, `std::queue`) can be efficiently arranged as a multidimensional array.
```cpp
std::vector<double> buffer(100);
multi::array_ref<double, 2, std::vector<double>::iterator> A({10, 10}, buffer.begin());
A[1][1] = 9;
assert(A[1][1] == 9);
assert(buffer[11]==9);
```
Since `array_ref` does not manage the memory associated with it, the reference can simply dangle if the `buffer` memory is reallocated (e.g. by `resize`).
### Special Memory: Allocators and Fancy Pointers
`array` manages its memory through allocators.
It can handle special memory, as long as the underlying types behave coherently, these include fancy pointers and fancy references.
Associated fancy pointers and fancy reference (if any) are deduced from the allocator types.
The behavior regarding memory management of the [fancy pointers](https://en.cppreference.com/w/cpp/named_req/Allocator#Fancy_pointers) can be customized (if necessary) by specializations of some or all of these functions:
```cpp
destroy(a, first, last)
destroy_n(a, first, n) -> last
uninitialized_copy_n(a, first, n, dest) -> last;
uninitialized_fill_n(a, first, n, value) -> last
uninitialized_default_construct_n(a, first, n) -> last
uninitialized_value_construct_n(a, first, n) -> last
```
where `a` is the special allocator, `n` is a size (usually the number of elements), `first`, `last` and `dest` are fancy pointers.
Copying underlying memory can be customized by specializing
```cpp
copy_n(first, n, dest)
fill_n(first, n, value)
```
Specific cases of fancy memory are file-mapped memory or interprocess shared memory.
This example illustrates memory persistency by combining with Boost.Interprocess library.
The arrays support their allocators and fancy pointers (`boost::interprocess::offset_ptr`).
```cpp
#include <boost/interprocess/managed_mapped_file.hpp>
using namespace boost::interprocess;
using manager = managed_mapped_file;
template<class T> using mallocator = allocator<T, manager::segment_manager>;
decltype(auto) get_allocator(manager& m){return m.get_segment_manager();}
template<class T, auto D> using marray = multi::array<T, D, mallocator<T>>;
int main(){
{
manager m{create_only, "mapped_file.bin", 1 << 25};
auto&& arr2d = *m.construct<marray<double, 2>>("arr2d")(std::tuple{1000, 1000}, 0.0, get_allocator(m));
arr2d[4][5] = 45.001;
}
// imagine execution restarts here
{
manager m{open_only, "mapped_file.bin"};
auto&& arr2d = *m.find<marray<double, 2>>("arr2d").first;
assert( arr2d[7][8] == 0. );
assert( arr2d[4][5] == 45.001 );
m.destroy<marray<double, 2>>("arr2d");
}
}
```
# Interoperability with other software
## STL (Standard Template Library)
The fundamental goal of the library is that the arrays and iterators can be used with STL algorithms out-of-the-box with a reasonable efficiency.
The most dramatic example of this is that `std::sort` works with array as it is shown in a previous example.
Along with STL itself, the library tries to interact with other existing C++ libraries.
## Range v3
```cpp
#include <range/v3/all.hpp>
int main(){
multi::array const d2D = {
{ 0, 1, 2, 3},
{ 5, 6, 7, 8},
{10, 11, 12, 13},
{15, 16, 17, 18}
};
assert( ranges::inner_product(d2D[0], d2D[1], 0.) == 6+2*7+3*8 );
assert( ranges::inner_product(d2D[0], rotated(d2D)[0], 0.) == 1*5+2*10+15*3 );
static_assert(ranges::RandomAccessIterator<multi::array<double, 1>::iterator>{});
static_assert(ranges::RandomAccessIterator<multi::array<double, 2>::iterator>{});
}
```
## Boost.Interprocess
Using Interprocess allows for shared memory and for persistent mapped memory.
```cpp
#include <boost/interprocess/managed_mapped_file.hpp>
#include "multi/array.hpp"
#include<cassert>
namespace bip = boost::interprocess;
using manager = bip::managed_mapped_file;
template<class T> using mallocator = bip::allocator<T, manager::segment_manager>;
auto get_allocator(manager& m){return m.get_segment_manager();}
namespace multi = boost::multi;
template<class T, int D> using marray = multi::array<T, D, mallocator<T>>;
int main(){
{
manager m{bip::create_only, "bip_mapped_file.bin", 1 << 25};
auto&& arr2d = *m.construct<marray<double, 2>>("arr2d")(std::tuple{1000, 1000}, 0., get_allocator(m));
arr2d[4][5] = 45.001;
m.flush();
}
{
manager m{bip::open_only, "bip_mapped_file.bin"};
auto&& arr2d = *m.find<marray<double, 2>>("arr2d").first;
assert( arr2d[4][5] == 45.001 );
m.destroy<marray<double, 2>>("arr2d");// eliminate<marray<double, 2>>(m, "arr2d");}
}
}
```
(Similarly works with [LLNL's Meta Allocator](https://github.com/llnl/metall))
## Cuda thrust
```cpp
#include "multi/adaptors/thrust/allocator_traits.hpp"
#include "multi/adaptors/thrust/algorithms.hpp"
#include "multi/array.hpp"
namespace multi = boost::multi;
int main(){
multi::array<double, 2, thrust::device_allocator<double>> A2({10,10});
multi::array<double, 2, thrust::device_allocator<double>> B2({10,10});
A2[5][0] = 50.;
thrust::copy(begin(rotated(A2)[0]), end(rotated(A2)[0]), begin(rotated(B2)[0]));
assert( B2[5][0] == 50. );
}
```
## TotalView
TotalView visual debugger (commercial) can display arrays in human-readable form (for simple types, like `double` or `std::complex`).
To use it, simply `#include "multi/adaptors/totalview.hpp"` and link to the TotalView libraries, compile and run the code with the debugger.
## Memory Resources
The library is compatible with C++17's polymorphic memory resources which allows using preallocated buffers.
This enables the use of stack memory, or reduces the number of allocations.
For example, this code ends up with `buffer` containing the string `"aaaabbbbbb__"`.
```cpp
#include<memory_resource> // std::pmr::monotonic_buffer_resource, std::pmr::polymorphic_allocator
int main(){
char buffer[13] = "____________"; // a small buffer on the stack
std::pmr::monotonic_buffer_resource pool{std::data(buffer), std::size(buffer)}; // or multi::memory::monotonic<char*>
multi::array<char, 2, std::pmr::polymorphic_allocator<char>> A({2, 2}, 'a', &pool); // or multi::memory::monotonic_allocator<double>
multi::array<char, 2, std::pmr::polymorphic_allocator<char>> B({3, 2}, 'b', &pool);
}
```
The library comes with its own customized (non-polymorphic) memory resources if, for any reason, the standard PMRs are not sufficiently general.
The headers to include are:
```cpp
#include<multi/memory/monotonic.hpp> // multi::memory::monotonic<char*> : no memory reclaim
#include<multi/memory/stack.hpp> // multi::memory::stack<char*> : FIFO memory reclaim
```
# Technical points
### What's up with the multiple bracket notation?
The chained bracket notation (`A[i][j][k]`) allows referring to elements and lower-dimensional subarrays in a consistent and _generic_ manner, and it is the recommended way to access the array objects.
It is a frequently raised question whether the chained bracket notation is good for performance, since it appears that each utilization of the bracket leads to the creation of a temporary which in turn generates a partial copy of the layout.
Moreover, this goes against [historical recommendations](https://isocpp.org/wiki/faq/operator-overloading#matrix-subscript-op).
It turns out that [modern compilers with a fair level of optimization (`-O2`)](https://godbolt.org/z/3fYd5c) can elide these temporary objects, so that `A[i][j][k]` generates identical assembly code as `A.base() + i*stride1 + j*stride2 + k*stride3` (+offsets not shown).
In a subsequent optimization, constant indices can have their "partial stride" computation removed from loops.
As a result, these two loops lead to the [same machine code](https://godbolt.org/z/z1se74):
```cpp
for(int j = 0; j != nj; ++j)
++A[i][j][k];
```
```cpp
double* Ai_k = A.base() + i*A_stride1 + k*A_stride3;
for(int j = 0; j != nj; ++j)
++(*(Ai_k + j*A_stride2));
```
Incidentally, the library also supports parenthesis notation with multiple indices `A(i, j, k)` for element or partial access, but it does so for accidental reasons as part of a more general syntax to generate sub-blocks.
In any case `A(i, j, k)` is expanded to `A[i][j][k]` internally in the library when `i, j, k` are normal integer indices.
Additionally, array coordinates can be directly stored in tuple-like data structures, allowing this functional syntax:
```cpp
std::array p = {2,3,4};
std::apply(A, p) = 234; // A[2][3][4] = 234;
```
### Customizing recursive operations: SCARY iterators
A custom level of customization can be achieved by intercepting internal recursive algorithms.
Multi iterators are [SCARY](http://www.open-std.org/jtc1/sc22/WG21/docs/papers/2009/n2980.pdf).
SCARY means that they are independent of any container and can be accessed generically through their dimension and underlying pointer types:
For example, `boost::multi::array_iterator<double, 2, double*> it` is a row (or column) iterator of an array of dimension 2 or higher, whose underlying pointer type is `double*`.
This row (or column) and subsequent ones can be accessed by the normal iterator(pointer) notation `*it` and `it[n]` respectively.
Indirection `it->...` is supported (even for iterators of high dimension).
The base pointer, the strides and the size of the arrow can be accessed by `base(it)`, `stride(it)`, `it->size()`.
The template arguments of the iterator can be used to customize operations that are recursive (and possibly inefficient in certain context) in the library:
```cpp
namespace boost{namespace multi{
template<class It, class T> // custom copy 1D (aka strided copy)
void copy(It first, It last, multi::array_iterator<T, 1, fancy::ptr<T> > dest){
assert( stride(first) == stride(last) );
std::cerr<<"1D copy(it1D, it1D, it1D) with strides "<< stride(first) <<" "<< stride(dest) <<std::endl;
}
template<class It, class T> // custom copy 2D (aka double strided copy)
void copy(It first, It last, multi::array_iterator<T, 2, fancy::ptr<T> > dest){
assert( stride(first) == stride(last) );
std::cerr<<"2D copy(It, It, it2D) with strides "<< stride(first) <<" "<< stride(dest) <<std::endl;
}
}}
```
For example, if your custom pointers refer to a memory type in which 2D memory copying (strided copy) is faster than sequential copying, that kind of instruction can be executed when the library internally calls `copy`.
This customization must be performed (unfortunately) in the `boost::multi` namespace (this is where the Multi iterators are defined) and the customization happens through matching the dimension and the pointer type.

View File

@ -0,0 +1,80 @@
#ifdef COMPILATION// -*-indent-tabs-mode:t;c-basic-offset:4;tab-width:4;-*-
$CXXX $CXXFLAGS $0 -o $0x `pkg-config --libs blas` -lboost_unit_test_framework&&$0x&&rm $0x;exit
#endif
// © Alfredo A. Correa 2018-2020
#ifndef MULTI_ADAPTORS_BLAS_HPP
#define MULTI_ADAPTORS_BLAS_HPP
#include "../adaptors/blas/iamax.hpp"
#include "../adaptors/blas/asum.hpp"
#include "../adaptors/blas/axpy.hpp"
#include "../adaptors/blas/copy.hpp"
#include "../adaptors/blas/dot.hpp"
#include "../adaptors/blas/gemm.hpp"
#include "../adaptors/blas/syrk.hpp"
#include "../adaptors/blas/herk.hpp"
#include "../adaptors/blas/gemv.hpp"
#include "../adaptors/blas/ger.hpp"
#include "../adaptors/blas/nrm2.hpp"
#include "../adaptors/blas/trsm.hpp"
#include "../adaptors/blas/scal.hpp"
#include "../adaptors/blas/swap.hpp"
#if not __INCLUDE_LEVEL__
#define BOOST_TEST_MODULE "C++ Unit Tests for Multi BLAS"
#define BOOST_TEST_DYN_LINK
#include<boost/test/unit_test.hpp>
#include "../array.hpp"
#include "../utility.hpp"
#include<iostream>
#include<complex>
#include<numeric> // iota
#include<algorithm> // transform
namespace multi = boost::multi;
// herk on a 3x2 complex matrix: the 3x3 result must equal A*A† and be Hermitian.
BOOST_AUTO_TEST_CASE(multi_blas_herk_complex){
	using multi::blas::herk;
	using complex = std::complex<double>;
	complex const I{0, 1};

	multi::array<complex, 2> const A = {
		{1. + 3.*I, 9. + 1.*I},
		{3. - 2.*I, 7. - 8.*I},
		{4. + 1.*I, 1. - 3.*I}
	};
	// Output starts filled with a sentinel value; the checks below cannot pass on untouched entries.
	multi::array<complex, 2> C({3, 3}, 9999.);

	herk(1., A, C); // C†=C=AA†=(A†A)†

	// Row 1 of A times the conjugate of row 2:
	// (3-2i)(4-1i) + (7-8i)(1+3i) = (10-11i) + (31+13i) = 41+2i
	BOOST_REQUIRE( C[1][2] == complex(41., 2.) );
	// Mirror entry must be the complex conjugate.
	BOOST_REQUIRE( C[2][1] == conj(C[1][2]) );
}
// asum of an all-zero complex vector must be exactly zero.
BOOST_AUTO_TEST_CASE(multi_blas_asum_complex){
	using multi::blas::asum;
	using complex = std::complex<double>;

	multi::array<complex, 1> zeros(1000, 0.);

	BOOST_REQUIRE( asum(zeros) == 0 );
}
// nrm2 of an all-zero complex vector must be exactly zero.
BOOST_AUTO_TEST_CASE(multi_blas_nrm2_complex){
	// The sibling test cases declare this alias locally; it was missing here,
	// leaving `complex` undeclared as a type in this scope.
	using complex = std::complex<double>;
	using multi::blas::nrm2;

	multi::array<complex, 1> arr(1000, 0.);

	BOOST_REQUIRE( nrm2(arr) == 0. );
}
#endif
#endif

View File

@ -0,0 +1,32 @@
# Build configuration for the Multi BLAS adaptor.
# Locates a BLAS implementation (Intel MKL is tried first, then any other BLAS),
# sets directory-wide link/include/language settings, and descends into `test`.
cmake_minimum_required(VERSION 3.11)
# Echo full compiler/linker command lines during the build.
set(CMAKE_VERBOSE_MAKEFILE ON)
project(boost-multi-adaptors-blas VERSION 0.1 LANGUAGES CXX)
# First attempt: restrict FindBLAS to the Intel MKL vendor string.
# This find_package is intentionally not REQUIRED so that the fallback below can run.
set(BLA_VENDOR Intel10_64lp)
find_package(BLAS)
if(BLAS_FOUND) # in some systems with MKL, regular BLAS headers need to be found for it to work
message("Multi/BLAS: MKL environment detected")
# Defines RETURN_BY_STACK for everything compiled from this directory down.
# NOTE(review): presumably selects the calling convention used for complex-valued
# BLAS return values with MKL -- confirm against the BLAS core header.
add_definitions(-DRETURN_BY_STACK)
else()
message("Multi/BLAS: MKL environment not detected, looking for other BLAS")
# Second attempt: drop the vendor restriction and fail configuration if no BLAS exists.
unset(BLA_VENDOR)
find_package(BLAS REQUIRED)
endif()
#find_path(BLAS_INCLUDE_DIRS cblas.h
# /usr/include
# /usr/local/include
# $ENV{BLAS_HOME}/include)
# Directory-scoped: every target created after this point links against the BLAS found above.
link_libraries(${BLAS_LIBRARIES})
# Require C++17 without compiler-specific extensions.
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)
# Make the top-level build directory available on the include path.
include_directories(${CMAKE_BINARY_DIR})
add_subdirectory(test)

View File

@ -0,0 +1,71 @@
<!--
(pandoc `#--from gfm` --to html --standalone --metadata title=" " $0 > $0.html) && firefox --new-window $0.html; sleep 5; rm $0.html; exit
-->
# [Boost.]Multi BLAS Adaptor
(not an official Boost library)
_© Alfredo A. Correa, 2018-2021_
The BLAS Adaptor provides an interface for BLAS-like libraries.
## Contents
[[_TOC_]]
## Numeric Arrays, Conjugation Real and Imaginary parts
These functions produce views (not copies) related to conjugation, real and imaginary parts.
```cpp
using complex = std::complex<double>;
complex const I{0, 1};
multi::array<complex, 2> B = {
{1. - 3.*I, 6. + 2.*I},
{8. + 2.*I, 2. + 4.*I},
{2. - 1.*I, 1. + 1.*I}
};
namespace blas = multi::blas;
multi::array<complex, 2> conjB = blas::conj(B);
assert( blas::conj(B)[2][1] == std::conj(B[2][1]) );
assert( blas::transposed(B)[1][2] == B[2][1] );
assert( blas::transposed(B) == ~B );
assert( blas::hermitized(B)[2][1] == blas::conj(B)[1][2] );
assert( blas::hermitized(B) == blas::conj(blas::transposed(B)) );
assert( blas::real(B)[2][1] == std::real(B[2][1]) );
assert( blas::imag(B)[2][1] == std::imag(B[2][1]) );
multi::array<double, 2> B_real_doubled = {
{ 1., -3., 6., 2.},
{ 8., 2., 2., 4.},
{ 2., -1., 1., 1.}
};
assert( blas::real_doubled(B) == B_real_doubled );
```
Usage:
```cpp
multi::array<double, 2> const a_real = {
{ 1., 3., 1.},
{ 9., 7., 1.},
};
multi::array<complex, 2> const b = {
{ 11.+1.*I, 12.+1.*I, 4.+1.*I, 8.-2.*I},
{ 7.+8.*I, 19.-2.*I, 2.+1.*I, 7.+1.*I},
{ 5.+1.*I, 3.-1.*I, 3.+8.*I, 1.+1.*I}
};
multi::array<complex, 2> c({2, 4});
blas::real_doubled(c) = blas::gemm(1., a_real, blas::real_doubled(b)); // c = a_real*b
```
## Installation and Tests
...

View File

@ -0,0 +1,85 @@
#ifdef COMPILATION// -*-indent-tabs-mode:t;c-basic-offset:4;tab-width:4;-*-
$CXXX $CXXFLAGS $0 -o $0x `pkg-config --cflags --libs blas` -lboost_unit_test_framework&&$0x&&rm $0x;exit
#endif
// © Alfredo A. Correa 2019-2020
// TODO make it work with thrust complex
#ifndef MULTI_ADAPTORS_BLAS_ASUM_HPP
#define MULTI_ADAPTORS_BLAS_ASUM_HPP
#include "../blas/core.hpp"
namespace boost{
namespace multi{namespace blas{
// asum_n: absolute sum over `n` elements starting at iterator `first`.
// Forwards to a pointer-level `asum(n, base, stride)` that is not declared in this
// header (presumably provided by ../blas/core.hpp -- TODO confirm).
// The trailing decltype removes this overload when that call is not well-formed.
template<class It, typename Size>
auto asum_n(It first, Size n)
->decltype(asum(n, base(first), stride(first))){
return asum(n, base(first), stride(first));}
// Make std::distance a candidate for the unqualified `distance` calls below.
using std::distance;
// asum over the iterator range [f, last).
// Both iterators must have the same stride (checked by assert only).
template<class It>
auto asum(It f, It last)
->decltype(asum_n(f, distance(f, last))){assert(stride(f) == stride(last));
return asum_n(f, distance(f, last));}
// Make std::begin/std::end candidates for the unqualified calls below.
using std::begin; using std::end;
// asum over a whole one-dimensional range `x`.
// Asserts that `offset(x)` is zero, then delegates to the iterator-pair overload.
template<class X1D>
auto asum(X1D const& x)
->decltype(asum(begin(x), end(x))){assert( not offset(x) );
return asum(begin(x), end(x));}
}}
}
#if not __INCLUDE_LEVEL__ // _TEST_MULTI_ADAPTORS_BLAS_SCAL
#define BOOST_TEST_MODULE "C++ Unit Tests for Multi.BLAS asum"
#define BOOST_TEST_DYN_LINK
#include<boost/test/unit_test.hpp>
//#include<boost/test/tools/floating_point_comparison.hpp>
#include "../../array.hpp"
//#include "../../utility.hpp"
#include<numeric> // accumulate
namespace multi = boost::multi;
using multi::blas::asum;
// asum of a real row must match a hand-written fold of |x| over the same row.
BOOST_AUTO_TEST_CASE(multi_blas_asum_double){
	multi::array<double, 2> const A = {
		{ 1.,  2.,  3.,  4.},
		{-5.,  6., -7.,  8.},
		{ 9., 10., 11., 12.}
	};

	// Reference value: accumulate absolute values of the second row.
	auto const add_abs = [](auto&& a, auto&& b){return a+std::abs(b);};
	double const expected = std::accumulate(begin(A[1]), end(A[1]), 0., add_abs);

	BOOST_REQUIRE( asum(A[1]) == expected );
}
// asum of a complex column: the expected value adds |real| and |imag| of each element.
BOOST_AUTO_TEST_CASE(multi_blas_asum_complex){
	using complex = std::complex<double>;
	complex const I{0, 1};

	multi::array<complex, 2> const A = {
		{ 1. + 1.*I,  2.,  3.,  4.},
		{-5. + 3.*I,  6., -7.,  8.},
		{ 9. - 2.*I, 10., 11., 12.}
	};

	// First column is {1+1i, -5+3i, 9-2i}.
	double const expected = 1.+1. + 5.+3. + 9.+2.;

	BOOST_REQUIRE( asum(rotated(A)[0]) == expected );
}
// Placeholder for applying asum to a row of a built-in C array.
// NOTE(review): the whole body is commented out, so this case currently checks nothing
// and always passes; re-enable or remove once C-array support is decided.
BOOST_AUTO_TEST_CASE(multi_blas_asum_double_carray){
// double A[3][4] = {
// {1., 2., 3., 4.},
// {-5., 6., -7., 8.},
// {9., 10., 11., 12.}
// }; (void)A;
// using std::begin; using std::end;
// BOOST_REQUIRE(asum(A[1]) == std::accumulate(begin(A[1]), end(A[1]), 0., [](auto&& a, auto&& b){return a+abs(b);}));
}
#endif
#endif

View File

@ -0,0 +1,91 @@
// -*-indent-tabs-mode:t;c-basic-offset:4;tab-width:4;autowrap:nil;-*-
// © Alfredo Correa 2019-2020
#ifndef MULTI_ADAPTORS_BLAS_AXPY_HPP
#define MULTI_ADAPTORS_BLAS_AXPY_HPP
#include "../../adaptors/blas/core.hpp"
#include "../../config/NODISCARD.hpp"
#include "../../array_ref.hpp"
namespace boost{
namespace multi{namespace blas{
using core::axpy;
// axpy_n: applies axpy to `n` elements, reading from `first` and updating `d_first`.
// Passes the element count, the address of `alpha`, and the base pointer and stride of
// each iterator to the lower-level `axpy` (brought in by `using core::axpy`).
// Returns the destination iterator advanced by `n`.
// The trailing decltype uses member calls (`first.base()`) while the body uses the
// free-function forms (`base(first)`); the overload is removed if the call is ill-formed.
template<class It1, class Size, class OutIt>
auto axpy_n(typename It1::value_type alpha, It1 first, Size n, OutIt d_first)
->decltype(axpy(n, &alpha, first.base(), first.stride(), d_first.base(), d_first.stride()), d_first + n){
return axpy(n, &alpha, base(first) , stride(first) , base(d_first) , stride(d_first) ) , d_first + n;}
// axpy_n (context form): same as the context-free overload, but the operation is
// dispatched to `ctxt.axpy(...)` instead of the free `axpy`.
// Participates in overload resolution only when `is_context<Context>` is true.
// Returns the destination iterator advanced by `n`.
template<class Context, class It1, class Size, class OutIt, class=std::enable_if_t<is_context<Context>{}>>
auto axpy_n(Context&& ctxt, typename It1::value_type alpha, It1 first, Size n, OutIt d_first)
->decltype(std::forward<Context>(ctxt).axpy(n, &alpha, first.base(), first.stride(), d_first.base(), d_first.stride()), d_first + n){
return std::forward<Context>(ctxt).axpy(n, &alpha, base(first) , stride(first) , base(d_first) , stride(d_first)) , d_first + n;}
// axpy (range form): runs axpy_n over all of `x` into `y`, scaled by `alpha`.
// The defaulted template parameter enables this overload only when `y[0] = 0.` is
// well-formed, i.e. when elements of `y` are assignable.
// `x` and `y` must have the same size (checked by assert only).
// Returns `y`, forwarded with its original value category.
template<class X1D, class Y1D, typename = decltype( std::declval<Y1D&&>()[0] = 0. )>
auto axpy(typename X1D::element alpha, X1D const& x, Y1D&& y)
->decltype(axpy_n(alpha, x.begin(), x.size(), y.begin()), std::forward<Y1D>(y)){assert(size(x)==size(y)); // intel doesn't like ADL in deduced/sfinaed return types
return axpy_n(alpha, begin(x), size(x), begin(y)), std::forward<Y1D>(y);
}
template<class Context, class X1D, class Y1D, typename = decltype( std::declval<Y1D&&>()[0] = 0. )>
auto axpy(Context&& ctxt, typename X1D::element alpha, X1D const& x, Y1D&& y)
->decltype(axpy_n(std::forward<Context>(ctxt), alpha, x.begin( ), x.size( ), y.begin( )), std::forward<Y1D>(y)){assert(size(x)==size(y)); // intel doesn't like ADL in deduced/sfinaed return types
return axpy_n(std::forward<Context>(ctxt), alpha, begin(x), size(x), begin(y)), std::forward<Y1D>(y);
}
// Unit-scale convenience overload: axpy(x, y) forwards to axpy(+1., x, y) and returns y (forwarded).
// The return type is spelled as the forwarded call (it still evaluates to Y1D&&) so that this overload
// is removed from overload resolution, instead of failing inside its body, when the arguments are not
// an (x, y) pair, e.g. axpy(scale, x), which is meant to select the axpy_range factory.
template<class X1D, class Y1D>
auto axpy(X1D const& x, Y1D&& y)
->decltype(axpy(+1., x, std::forward<Y1D>(y))){
	return axpy(+1., x, std::forward<Y1D>(y));
}

// Context-taking unit-scale overload: axpy(ctxt, x, y) forwards to axpy(ctxt, +1., x, y).
// The is_context constraint has to be a defaulted *type* parameter: written as a bare
// `std::enable_if_t<...>` it declares an unnamed non-type parameter (of type void) that has no default
// and cannot be deduced, which made this overload impossible to select.
// The SFINAE return type keeps it from capturing axpy(ctxt, scale, x) calls now that it is reachable.
template<class Context, class X1D, class Y1D, class=std::enable_if_t<is_context<Context>{}> >
auto axpy(Context&& ctxt, X1D const& x, Y1D&& y)
->decltype(axpy(std::forward<Context>(ctxt), +1., x, std::forward<Y1D>(y))){
	return axpy(std::forward<Context>(ctxt), +1., x, std::forward<Y1D>(y));
}
// Deferred axpy operand: stores a context, a scale factor and the start/length of an input range.
// Nothing is computed on construction; the work happens when the object appears on the right-hand side
// of `y += range` or `y -= range`, which call blas::axpy_n with +alpha or -alpha respectively.
// Copying is disabled.
template<class Context, class Scale, class ItX>
class axpy_range{
	Context ctxt_;
	Scale alpha_;
	ItX x_begin_;
	size_type count_; // number of elements, computed as x_last - x_first
public:
	axpy_range(axpy_range const&) = delete;
	axpy_range(Context ctxt, Scale alpha, ItX x_first, ItX x_last)
	: ctxt_{ctxt}, alpha_{alpha}, x_begin_{x_first}, count_{x_last - x_first}{}
	// other += alpha*x; sizes are only checked by assert. Returns `other` forwarded.
	template<class Other>
	friend Other&& operator+=(Other&& other, axpy_range const& self){
		assert(other.size() == self.count_);
		blas::axpy_n(std::forward<Context>(self.ctxt_), +self.alpha_, self.x_begin_, self.count_, other.begin());
		return std::forward<Other>(other);
	}
	// other -= alpha*x, implemented as axpy with the negated scale.
	template<class Other>
	friend Other&& operator-=(Other&& other, axpy_range const& self){
		assert(other.size() == self.count_);
		blas::axpy_n(std::forward<Context>(self.ctxt_), -self.alpha_, self.x_begin_, self.count_, other.begin());
		return std::forward<Other>(other);
	}
	// Rescales the pending operation in place (lvalues only).
	// The original body had no return statement although the function returns a reference,
	// which is undefined behaviour as soon as it is called; it now returns *this.
	axpy_range& operator*=(Scale s)&{alpha_ *= s; return *this;}
};
// Factory for a deferred axpy operand with an explicit context: axpy(ctxt, a, x) builds an axpy_range
// over [begin(x), end(x)) scaled by a. Only enabled when is_context<Context> holds.
template<class Context, class Scale, class X, class=std::enable_if_t<is_context<Context>{}>>
axpy_range<Context, Scale, typename X::const_iterator> axpy(Context&& ctxt, Scale a, X const& x){
	return {std::forward<Context>(ctxt), a, begin(x), end(x)};
}

// Factory without a context: the returned range holds a `blas::context const&`.
// That reference used to be bound to a temporary `blas::context{}` that died at the end of the return
// statement, leaving the range with a dangling reference by the time `y += axpy(a, x)` used it.
// It is now bound to a function-local static, which outlives every returned range; the return type is unchanged.
template<class Scale, class X>
axpy_range<blas::context const&, Scale, typename X::const_iterator> axpy(Scale a, X const& x){
	static blas::context const default_context{};
	return {default_context, a, begin(x), end(x)};
}
// Opt-in operator syntax for 1D axpy; the operators live in their own namespace.
// NOTE(review): DECLRETURN is defined outside this file; presumably it expands to a trailing return type plus a body returning the expression — confirm.
namespace operators{
// x += other  is expressed as  axpy(+1., other, x)
template<class X1D, class Y1D> auto operator+=(X1D&& x, Y1D const& other) DECLRETURN(axpy(+1., other, std::forward<X1D>(x)))
// x -= other  is expressed as  axpy(-1., other, x)
template<class X1D, class Y1D> auto operator-=(X1D&& x, Y1D const& other) DECLRETURN(axpy(-1., other, std::forward<X1D>(x)))
// x + y: takes x.decay() as a local value, applies += y to it and returns it
template<class X1D, class Y1D> auto operator+(X1D const& x, Y1D const& y)->std::decay_t<decltype(x.decay())>{auto X=x.decay(); X+=y; return X;}
// x - y: takes x.decay() as a local value, applies -= y to it and returns it
template<class X1D, class Y1D> auto operator-(X1D const& x, Y1D const& y)->std::decay_t<decltype(x.decay())>{auto X=x.decay(); X-=y; return X;}
}
}}
}
#endif

View File

@ -0,0 +1,126 @@
// -*-indent-tabs-mode:t;c-basic-offset:4;tab-width:4;autowrap:nil;-*-
// © Alfredo A. Correa 2020
#ifndef MULTI_ADAPTORS_BLAS_COPY_HPP
#define MULTI_ADAPTORS_BLAS_COPY_HPP
#include "../blas/core.hpp"
#include "../blas/operations.hpp"
#include "../../config/NODISCARD.hpp"
#include<type_traits>
namespace boost{
namespace multi::blas{
using core::copy;
// Iterator-level copy: passes n and the base pointer/stride of both iterators to the
// pointer-level `copy` (core::copy is brought in by the using-declaration) and returns d_first + n.
template<class It, typename Size, class OutIt>
auto copy_n(It first, Size n, OutIt d_first)
->decltype(copy(n, first.base(), first.stride(), d_first.base(), d_first.stride()), d_first + n){
	copy(n, first.base(), first.stride(), d_first.base(), d_first.stride());
	return d_first + n;
}

// Context-taking variant: the context is forwarded as the leading argument of `copy`.
// Only enabled when blas::is_context<Context> holds.
template<class Context, class It, typename Size, class OutIt, class=std::enable_if_t<blas::is_context<Context>{}> >
auto copy_n(Context&& ctxt, It first, Size n, OutIt d_first)
->decltype(copy(std::forward<Context>(ctxt), n, first.base(), first.stride(), d_first.base(), d_first.stride()), d_first + n){
	copy(std::forward<Context>(ctxt), n, first.base(), first.stride(), d_first.base(), d_first.stride());
	return d_first + n;
}
// Iterator-pair copy: computes the element count as last - first and delegates to copy_n.
template<class It, class OutIt>
auto copy(It first, It last, OutIt d_first)
->decltype(copy_n(first, last - first, d_first)){
	auto count = last - first;
	return copy_n(first, count, d_first);
}

// Iterator-pair copy with an explicit context (enabled only when blas::is_context<Context> holds).
template<class Context, class It, class OutIt, class=std::enable_if_t<blas::is_context<Context>{}>>
auto copy(Context&& ctxt, It first, It last, OutIt d_first)
->decltype(copy_n(std::forward<Context>(ctxt), first, last - first, d_first)){
	auto count = last - first;
	return copy_n(std::forward<Context>(ctxt), first, count, d_first);
}
// Range-level copy of x into y: runs blas::copy_n over x.size() elements and returns y (forwarded).
// Equal sizes are only checked by assert.
template<class X1D, class Y1D>
auto copy(X1D const& x, Y1D&& y)
->decltype(blas::copy_n(x.begin(), x.size(), y.begin()), std::forward<Y1D>(y)){
	assert(x.size()==y.size());
	blas::copy_n(x.begin(), x.size(), y.begin());
	return std::forward<Y1D>(y);
}

// Range-level copy with an explicit context, which is forwarded to blas::copy_n.
template<class Context, class X1D, class Y1D>
auto copy(Context&& ctxt, X1D const& x, Y1D&& y)
->decltype(blas::copy_n(std::forward<Context>(ctxt), x.begin(), x.size(), y.begin()), std::forward<Y1D>(y)){
	assert(x.size()==y.size());
	blas::copy_n(std::forward<Context>(ctxt), x.begin(), x.size(), y.begin());
	return std::forward<Y1D>(y);
}
// Iterator wrapper that pairs a 1D iterator with a context handle.
// The copy_n / copy / uninitialized_copy friends defined in the class body are reachable only through
// argument-dependent lookup on copy_iterator arguments, and all of them end up in blas::copy_n.
template<class ContextPtr, class It1D>
class copy_iterator{
ContextPtr ctxt = {}; // value-initialized when the single-argument constructor is used
It1D it_;
public:
using difference_type = typename std::iterator_traits<It1D>::difference_type;
using value_type = typename std::iterator_traits<It1D>::value_type;
using pointer = void;
using reference = void;
using iterator_category = std::output_iterator_tag;
using iterator_type = It1D;
using context_type = ContextPtr;
constexpr explicit copy_iterator(It1D it) : it_{it}{}
constexpr copy_iterator(ContextPtr cp, It1D it) : ctxt{cp}, it_{it}{}
// Returns the wrapped iterator.
constexpr iterator_type base() const{return it_;}
// Copies `count` elements starting at `first` into `result` through blas::copy_n, passing the stored context.
template<class It1DOut>
friend constexpr It1DOut copy_n(copy_iterator first, difference_type count, It1DOut result){
return blas::copy_n(first.ctxt, first.base(), count, result);
}
// Iterator-pair form: the count comes from distance(first, last).
template<class It1DOut>
friend constexpr It1DOut copy(copy_iterator first, copy_iterator last, It1DOut d_first){
return copy_n(first, distance(first, last), d_first);
}
// Same implementation as copy: the destination is written through copy_n in both cases.
template<class It1DOut>
friend constexpr It1DOut uninitialized_copy(copy_iterator first, copy_iterator last, It1DOut d_first){
return copy_n(first, distance(first, last), d_first);
}
// Difference of the wrapped iterators; asserts that both have the same stride.
friend constexpr difference_type distance(copy_iterator const& a, copy_iterator const& b){assert(stride(b.it_) == stride(a.it_));
return b.it_-a.it_;
}
// Dereferences the wrapped iterator and returns the element by value.
constexpr value_type operator*() const{return *it_;}
};
// Deferred copy operand: holds a context handle and an iterator pair and exposes them as a range of copy_iterator.
// It converts to any type constructible from two copy_iterators; unary + converts to DecayType.
// NOTE(review): extensions() names decay_type::extensions_type in its declaration, so the default
// DecayType = void does not look instantiable — confirm that every user passes an explicit DecayType.
template<class ContextPtr, class It1D, class DecayType = void, class DiffType = typename std::iterator_traits<It1D>::difference_type>
class copy_range{
ContextPtr ctxp_ = {}; // value-initialized when the two-argument constructor is used
It1D begin_, end_;
public:
using difference_type = DiffType;
using iterator = copy_iterator<ContextPtr, It1D>;
using decay_type = DecayType;
copy_range(copy_range&&) = default;
constexpr copy_range(It1D first, It1D last) : begin_{first}, end_{last}{}
constexpr copy_range(ContextPtr ctxp, It1D first, It1D last) : ctxp_{ctxp}, begin_{first}, end_{last}{}
// Number of elements, computed as end_ - begin_.
constexpr difference_type size() const{return end_ - begin_;}
constexpr auto begin() const{return iterator{ctxp_, begin_};}
constexpr auto end() const{return iterator{ctxp_, end_ };}
// One-dimensional extensions built from size().
constexpr typename decay_type::extensions_type extensions() const{return {size()};}
// Implicit conversion to any Other that can be constructed as Other(begin(), end()).
template<class Other, class=decltype(Other(std::declval<iterator>(), std::declval<iterator>()))>
operator Other() const{return Other(begin(), end());}
// Unary plus materializes the range as decay_type through the conversion operator above.
friend auto operator+(copy_range const& s){return s.operator decay_type();}
};
// Factories for deferred copies (copy_range). DecayType must be given explicitly in the iterator-pair forms.

// Iterator pair, no context: the range's context parameter is void*.
template<class DecayType, class It> NODISCARD()
auto copy(It const& first, It const& last)
->decltype(copy_range<void*, It, DecayType>{first, last}){
	using range_type = copy_range<void*, It, DecayType>;
	return range_type{first, last};
}

// Iterator pair with a context: the range's context parameter is the deduced Context.
template<class DecayType, class Context, class It> NODISCARD()
auto copy(Context&& ctxt, It const& first, It const& last)
->decltype(copy_range<Context, It, DecayType>{ctxt, first, last}){
	using range_type = copy_range<Context, It, DecayType>;
	return range_type{ctxt, first, last};
}

// Whole array, no context: DecayType and the iterator type are taken from A.
template<class A> NODISCARD()
auto copy(A const& a) // need to specify templates (instead of deduced for intel)
->decltype(copy<typename A::decay_type, typename A::const_iterator>(a.begin(), a.end())){
	using decay_type = typename A::decay_type;
	using const_iterator = typename A::const_iterator;
	return copy<decay_type, const_iterator>(a.begin(), a.end());
}

// Whole array with a context (enabled only when blas::is_context<Context> holds).
template<class Context, class A, class=std::enable_if_t<blas::is_context<Context>{}>> NODISCARD()
auto copy(Context&& ctxt, A const& a)
->decltype(copy<typename A::decay_type, Context, typename A::const_iterator>(std::forward<Context>(ctxt), a.begin(), a.end())){
	using decay_type = typename A::decay_type;
	using const_iterator = typename A::const_iterator;
	return copy<decay_type, Context, const_iterator>(std::forward<Context>(ctxt), a.begin(), a.end());
}
}
}
#endif

View File

@ -0,0 +1,598 @@
#ifdef COMPILATION// -*-indent-tabs-mode:t;c-basic-offset:4;tab-width:4;autowrap:nil;-*-
$CXXX $CXXFLAGS $0 -o $0.$X `pkg-config --libs blas`&&$0.$X&&rm $0.$X;exit
#endif
//(for a in `find tests/ -name '*.cpp'`; do sh $a || break; done); exit
// https://software.intel.com/en-us/articles/intel-mkl-link-line-advisor
// © Alfredo A. Correa 2019-2020
#ifndef MULTI_ADAPTORS_BLAS_CORE_HPP
#define MULTI_ADAPTORS_BLAS_CORE_HPP
//#include <cblas/cblas.h> // consider being replaceable by cblas.h
#include<iostream> // debug
#include<cassert>
#include<complex>
#include<stdint.h> // int64_t
#include<limits> // numeric_limits
#include<type_traits> // is_convertible
#include<cstring> // std::memcpy
#include "../blas/traits.hpp"
#include "../../config/MARK.hpp"
// Assertion helpers.
//   MULTI_ASSERT1(expr)           - checks expr
//   MULTI_ASSERT2(expr, ExcName)  - checks expr; ExcName is the name of a std:: exception type
// Without NDEBUG a failed check throws (std::logic_error for MULTI_ASSERT1, std::ExcName for MULTI_ASSERT2)
// with a message holding the file, line, enclosing function and the failed expression.
// With NDEBUG both forward to assert, which is itself a no-op under NDEBUG.
#if 0
// Disabled plain-assert variant. The description is stringized: the previous `&& ##DescriptioN`
// tried to paste `&&` with an identifier, which is not a valid token.
#define MULTI_ASSERT1(ExpR) assert (ExpR)
#define MULTI_ASSERT2(ExpR, DescriptioN) MULTI_ASSERT1((ExpR) && #DescriptioN)
#else
#if not defined(NDEBUG)
#include<stdexcept>
#include<string>
#define MULTI_ASSERT1(ExpR) (void)((ExpR)?0:throw std::logic_error("\n" __FILE__ ":"+std::to_string(__LINE__)+"::\n"+std::string(__PRETTY_FUNCTION__)+"\nLogic assertion `" #ExpR "' failed."))
#define MULTI_ASSERT2(ExpR, DescriptioN) (void)((ExpR)?0:throw std::DescriptioN("\n" __FILE__ ":"+std::to_string(__LINE__)+"::\n"+std::string(__PRETTY_FUNCTION__)+"\nLogic assertion `" #ExpR "' failed."))
#else
#define MULTI_ASSERT1(ExpR) assert(ExpR)
// was `assert(EXpR)`: the misspelled parameter only went unnoticed because assert discards its argument under NDEBUG
#define MULTI_ASSERT2(ExpR, DescriptioN) assert(ExpR)
#endif
#endif
#ifdef CBLAS_H
#define BLAS(NamE) cblas_##NamE
#else
#define BLAS(NamE) NamE##_
extern "C"{
#ifndef _BLAS_INT
#if defined(__INTPTR_WIDTH__)
#define _BLAS_INT __INTPTR_WIDTH__
#endif
#endif
#define s float
#define d double
#define c std::complex<s>
#define z std::complex<d>
#define v void
typedef struct { float real, imag; } Complex_float ;
typedef struct { double real, imag; } Complex_double;
#define C Complex_float // _Complex s
#define Z Complex_double // _Complex d
// Selects the integer type used in the Fortran BLAS prototypes.
// _BLAS_INT is compared numerically: 32 selects int32_t, 64 selects int64_t,
// and any other value, or no definition at all, falls back to int32_t.
#if defined(_BLAS_INT)
#if _BLAS_INT==32
#define INT int32_t
#elif _BLAS_INT==64
#define INT int64_t
#else
#define INT int32_t // 32bit safe? pessimistic?
#endif
#else
#define INT int32_t // 32bit safe? pessimistic?
#endif
namespace core{
using size_t = INT;
using ssize_t = std::make_signed_t<size_t>;
}
// Parameter-list shorthands for the prototypes below: integers are passed by const reference.
#define INTEGER INT const&
#define N INTEGER n
#define INCX INTEGER incx
#define INCY INTEGER incy
// The message names the values the preprocessor test above actually recognizes (32/64), not type names.
static_assert(sizeof(INT)==32/8 or sizeof(INT)==64/8, "please set _BLAS_INT to 32 or 64");
// TODO indent declarations like here https://www.netlib.org/lapack/lug/node145.html
#define xROTG(T1, T2) v BLAS( T1##rotg)( T1 const*, T1 const*, T2*, T1*)
#define xROTMG(T) v BLAS( T##rotmg)( T*, T*, T*, T const&, T(&param)[5])
#define xROT(TT, T, S) v BLAS( TT##rot )(N, T *x, INCX, T *y, INCY, S const&, S const&)
#define xROTM(T) v BLAS( T##rotm )(N, T* x, INCX, T* y, INCY, T const(&p)[5])
#define xSWAP(T) v T ##swap##_ (N, T *x, INCX, T *y, INCY)
#define xSCAL(TT, TA, TX) v TT##scal##_ (N, TA const& a, TX *x, INCX )
#define xCOPY(T) v T ##copy##_ (N, T const *x, INCX, T *y, INCY)
#define xAXPY(T) v T ##axpy##_ (N, T const* a, T const *x, INCX, T *y, INCY)
#define xDOT(R, TT, T) R BLAS( TT##dot )(N, T const *x, INCX, T const *y, INCY)
#if defined(RETURN_BY_STACK) || (defined(FORTRAN_COMPLEX_FUNCTIONS_RETURN_VOID) && FORTRAN_COMPLEX_FUNCTIONS_RETURN_VOID)
#define xDOTU(R, T) v BLAS( T##dotu )(R*, N, T const *x, INCX, T const *y, INCY)
#define xDOTC(R, T) v T##dotc ##_ (R*, N, T const *x, INCX, T const *y, INCY)
#else
#define xDOTU(R, T) R T ##dotu##_ ( N, T const *x, INCX, T const *y, INCY)
#define xDOTC(R, T) R T ##dotc##_ ( N, T const *x, INCX, T const *y, INCY)
#endif
#define xxDOT(TT, T) T TT##dot ##_ ( N, T const& a, T const *x, INCX, T const *y, INCY)
#define xNRM2(R, TT, T) R TT##nrm2##_ ( N, T const *x, INCX )
#define xASUM(R, TT, T) R TT##asum##_ ( N, T const *x, INCX )
#define IxAMAX(T) INT i##T ##amax##_ ( N, T const* x, INCX )
xROTG(s, s) ; xROTG(d,d) ;// MKL extension xROTG(c, s); xROTG(z, d);
xROTMG(s) ; xROTMG(d) ;
xROT(s, s, s) ; xROT(d, d, d) ; xROT(cs, c, s); xROT(zd, z, d);
xROTM(s) ; xROTM(d) ;
xSWAP(s) ; xSWAP(d) ; xSWAP(c) ; xSWAP(z);
xSCAL(s, s, s); xSCAL(d, d, d); xSCAL(c, c, c); xSCAL(z, z, z); xSCAL(zd, d, z); xSCAL(cs, s, c);
xCOPY(s) ; xCOPY(d) ; xCOPY(c) ; xCOPY(z) ;
xAXPY(s) ; xAXPY(d) ; xAXPY(c) ; xAXPY(z) ;
xDOT(s, s, s); xDOT(d, d, d); xDOT(d, ds, s);
xDOTU(C, c); xDOTU(Z, z);
//xDOTU(c, c); xDOTU(z, z);
xDOTC(C, c); xDOTC(Z, z);
xxDOT(sds, s);
xNRM2(s, s, s); xNRM2(d, d, d); xNRM2(s, sc, c); xNRM2(d, dz, z);
xASUM(s, s, s); xASUM(d, d, d); xASUM(s, sc, c); xASUM(d, dz, z);
IxAMAX(s); IxAMAX(d); IxAMAX(c); IxAMAX(z);
#define TRANS const char& trans
#define NR INTEGER nr
#define NC INTEGER nc
#define LDA INTEGER lda
#define UPLO const char& uplo
#define DIAG const char& diag
#define xGEMV(T) void T## gemv ##_ ( TRANS, NR, NC, T const& a, T const* A, LDA, T const* X, INCX, T const& beta, T* Y, INCY )
#define xGER(T) void T## ger ##_ ( NR, NC, T const& a, T const* X, INCX, T const* Y, INCY, T* A, LDA)
#define xGERU(T) void T## geru ##_ ( NR, NC, T const& a, T const* X, INCX, T const* Y, INCY, T* A, LDA)
#define xGERC(T) void T## gerc ##_ ( NR, NC, T const& a, T const* X, INCX, T const* Y, INCY, T* A, LDA)
#define xTRSV(T) void T## trsv ##_ (UPLO, TRANS, DIAG, N, T const* A, LDA, T* X , INCX )
xGEMV(s); xGEMV(d); xGEMV(c); xGEMV(z);
xGER(s); xGER(d);
xGERU(c); xGERU(z);
xGERC(c); xGERC(z);
xTRSV(s); xTRSV(d); xTRSV(c); xTRSV(z);
// Level-3 prototypes (gemm, syrk, herk, trsm). The helper macros name the by-reference character and
// integer parameters; each x...(T) macro declares the routine `<T>name_` for the given element type.
// NOTE(review): xGEMM declares its last array (CC) and xTRSM declares B as pointer-to-const, whereas
// xSYRK/xHERK declare CC as a mutable pointer; in reference BLAS those arrays are written by the
// routine — confirm whether the const is intentional before relying on it.
#define TRANSA const char& transa
#define TRANSB const char& transb
#define NK INTEGER nk
#define LDB INTEGER ldb
#define LDC INTEGER ldc
#define SIDE const char& side
#define xGEMM(T) void T ##gemm ##_ ( TRANSA, TRANSB, NR, NC, NK, T const& a, T const* A, LDA, T const* B, LDB, T const& b , T const* CC, LDC)
#define xSYRK(T) void T ##syrk ##_ ( UPLO, TRANSA, NR, NK, T const& a, T const* A, LDA, T const& b , T* CC, LDC)
#define xHERK(TT, T) void T ##herk ##_ ( UPLO, TRANSA, NR, NK, TT const& a, T const* A, LDA, TT const& b , T* CC, LDC)
#define xTRSM(T) void T ##trsm ##_ (SIDE, UPLO, TRANSA, DIAG, NR, NK, T const& a, T const* A, LDA, T const* B, LDB )
xGEMM(s); xGEMM(d); xGEMM(c) ; xGEMM(z) ;
xSYRK(s); xSYRK(d); xSYRK(c) ; xSYRK(z) ;
// herk takes real-valued scale factors (first macro argument) with complex matrices (second argument)
xHERK(s, c); xHERK(d, z);
xTRSM(s); xTRSM(d); xTRSM(c) ; xTRSM(z) ;
// Clean-up of the helper macros used to write the prototypes above, so the short names
// (s, d, c, z, v, N, ...) do not leak into code that includes this header.
// INT stays defined on purpose: the BC range-check macro further down still uses it.
#undef TRANS
#undef UPLO
#undef SIDE
#undef DIAG
#undef xROTG
#undef xROTMG
#undef xROT
#undef xROTM
#undef xSCAL
#undef xSWAP
#undef xCOPY
#undef xAXPY
#undef xDOT
#undef xDOTU
#undef xDOTC
#undef xxDOT
#undef xNRM2
#undef xASUM
#undef IxAMAX
#undef xGEMV
#undef xGER
#undef xGERU
#undef xGERC
#undef xTRSV
#undef xGEMM
#undef xSYRK
#undef xHERK
#undef xTRSM
#undef s
#undef d
#undef c
#undef z
#undef C
#undef Z
#undef v
#undef INTEGER
#undef N
#undef INCX
#undef INCY
// NR, NC, NK, xTRSV and xSYRK were previously left defined. They expand to INTEGER and the other
// macros undefined in this list, so they were unusable after this point and only polluted the macro namespace.
#undef NR
#undef NC
#undef NK
#undef TRANSA
#undef TRANSB
#undef LDA
#undef LDB
#undef LDC
}
#endif
namespace boost{
namespace multi{
namespace blas{
// Explicit adaptor from any "complex-like" pointer to std::complex<T>*.
// A pointer type is accepted when its pointee has the size of std::complex<T> and that size equals the
// summed sizes of what real() and imag() return. The pointer is reinterpreted, not converted.
// Copying is disabled; the object converts implicitly to the raw pointer and dereferences to the element.
template<class T> struct complex_ptr{
	std::complex<T>* impl_;
	template<
		class Ptr,
		class = std::enable_if_t<
			sizeof(*Ptr{}) == sizeof(std::complex<T>) &&
			sizeof(*Ptr{}) == sizeof(Ptr{}->real()) + sizeof(Ptr{}->imag())
		>
	>
	explicit complex_ptr(Ptr p) : impl_{reinterpret_cast<std::complex<T>*>(p)}{}
	complex_ptr(complex_ptr const&) = delete;
	operator std::complex<T>*() const{return impl_;}
	std::complex<T>& operator*() const{return *impl_;}
};
// Read-only counterpart of complex_ptr: explicit adaptor from any "complex-like" pointer to
// std::complex<T> const*. The acceptance test is the same (the pointee has the size of std::complex<T>,
// which equals the summed sizes of real() and imag()); the pointer is reinterpreted, not converted.
template<class T> struct complex_const_ptr{
	std::complex<T> const* impl_;
	template<
		class Ptr,
		class = std::enable_if_t<
			sizeof(*Ptr{}) == sizeof(std::complex<T>) &&
			sizeof(*Ptr{}) == sizeof(Ptr{}->real()) + sizeof(Ptr{}->imag())
		>
	>
	explicit complex_const_ptr(Ptr p) : impl_{reinterpret_cast<std::complex<T> const*>(p)}{}
	complex_const_ptr(complex_const_ptr const&) = delete;
	operator std::complex<T> const*() const{return impl_;}
	std::complex<T> const& operator*() const{return *impl_;}
};
// Pointer-type selection traits used in wrapper signatures.
// For a general T they yield the raw pointers T* / T const*;
// for std::complex<T> they yield the complex_ptr<T> / complex_const_ptr<T> adaptors instead.
template<class T> struct add_ptr{using type = T*;};
template<class T> struct add_const_ptr{using type = T const*;};
template<class T> struct add_ptr<std::complex<T>>{using type = complex_ptr<T>;};
template<class T> struct add_const_ptr<std::complex<T>>{using type = complex_const_ptr<T>;};
// Convenience aliases for the nested ::type.
template<class T> using add_ptr_t = typename add_ptr<T>::type;
template<class T> using add_const_ptr_t = typename add_const_ptr<T>::type;
// One-letter element-type aliases in an unnamed namespace. The wrapper-generating macros below
// both token-paste these letters into routine names (e.g. T##rotm) and use them as types.
// NOTE(review): Complex_float / Complex_double are declared only in the branch taken when CBLAS_H is
// not defined — confirm that this still compiles when CBLAS_H is defined.
namespace{
using s = float;
using d = double;
using c = std::complex<s>; using C = Complex_float ;
using z = std::complex<d>; using Z = Complex_double;
using v = void;
}
// BC(x): debug-only range check that x fits in the BLAS integer type INT; yields x unchanged (same type).
// The upper bound is inclusive: std::numeric_limits<INT>::max() is itself representable, so `<` wrongly rejected it.
#define BC(x) [](auto xx){assert(xx>=std::numeric_limits<INT>::min() and xx<=std::numeric_limits<INT>::max()); return xx;}(x)
// Generators for the level-1 wrappers; each forwards to the routine named by pasting the type letter(s).
// xrotg, xrotmg and xscal produce ordinary (non-template) functions. These are defined in a header, so they
// are marked inline to avoid multiple-definition errors when the header is included from several translation units.
#define xrotg(T1, T2) inline v rotg (T1 const& a, T1 const& b, T2& cc, T1& ss ){ BLAS(T1##rotg )(const_cast<T1*>(&a), const_cast<T1*>(&b), &cc, &ss); }
#define xrotmg(T) inline v rotmg(T& d1, T& d2, T& A, T const& B, T(&p)[5] ){ BLAS( T##rotmg)(&d1, &d2, &A, B, p); }
#define xrot(T, TT, CS) template<class S> v rot (S n, T *x, S incx, T *y, S incy, CS const& cos, CS const& sin){ BLAS(TT##rot )(BC(n), x, BC(incx), y, BC(incy), cos, sin); }
#define xrotm(T) template<class S> v rotm (S n, T *x, S incx, T *y, S incy, T const(&p)[5] ){ BLAS( T##rotm )(BC(n), x, BC(incx), y, BC(incy), p); }
#define xswap(T) template<class S> v swap (S n, T *x, S incx, T *y, S incy ){ BLAS( T##swap )(BC(n), x, BC(incx), y, BC(incy)); }
// scal takes the scale by pointer and returns x + n*incx
#define xscal(XX, TA, TX) inline TX* scal (INT n, TA const* a, TX *x, INT incx ){ BLAS(XX##scal )(BC(n), *a, x, BC(incx) ); return x+n*incx;}
//#define xcopy(T) v copy (INT n, T const *x, INT incx, T *y, INT incy ){ BLAS( T##copy )(BC(n), x, BC(incx), y, BC(incy)); }
//#define xaxpy(T) template<class S> T* axpy (S n, T a, T const *x, S incx, T *y, S incy ){ BLAS( T##axpy )(BC(n), a, x, BC(incx), y, BC(incy)); return y+n*incy; }
// dot writes its result through the trailing pointer argument r
#define xdot(R, TT, T) template<class S> v dot (S n, T const* x, S incx, T const* y, S incy, R* r ){\
MULTI_MARK_SCOPE("cpu_dot"); *r = BLAS(TT##dot )(BC(n), x, BC(incx), y, BC(incy)); }
// Instantiates the rotation and swap wrappers for the supported element types.
// These expansions sit in the enclosing blas namespace, outside the `core` namespace opened further down.
xrotg(s, s) xrotg(d, d) //MKL extension xrotg(c, s); xrotg(z, d);
xrotmg(s) xrotmg(d)
xrot(s, s, s) xrot(d, d, d) xrot(c, cs, s) xrot(z, zd, d)
xrotm(s) xrotm(d)
xswap(s) xswap(d) xswap(c) xswap(z)
// Level-1 entry points in blas::core: scal, copy, dot and axpy.
// copy and axpy are constrained templates: the is_s/is_d/is_c/is_z traits (declared outside this file)
// select the routine, and the pointers are then cast to the plain BLAS element type.
namespace core{
xscal(s, s, s) xscal(d, d, d) xscal(c, c, c) xscal(z, z, z) xscal(zd, d, z) xscal(cs, s, c)
using std::enable_if_t;
using std::is_assignable;
// copy: one overload per precision; additionally requires that the destination element is assignable from the source element
template<class SX, class SY, enable_if_t<is_s<SX>{} and is_s<SY>{} and is_assignable<SY&, SX&>{},int> =0> void copy(size_t n, SX* x, size_t incx, SY* y, size_t incy){BLAS(scopy)(n, ( float const*)(x), incx, ( float *)(y), incy);}
template<class DX, class DY, enable_if_t<is_d<DX>{} and is_d<DY>{} and is_assignable<DY&, DX&>{},int> =0> void copy(size_t n, DX* x, size_t incx, DY* y, size_t incy){BLAS(dcopy)(n, ( double const*)(x), incx, ( double *)(y), incy);}
template<class CX, class CY, enable_if_t<is_c<CX>{} and is_c<CY>{} and is_assignable<CY&, CX&>{},int> =0> void copy(size_t n, CX* x, size_t incx, CY* y, size_t incy){BLAS(ccopy)(n, (std::complex<float > const*)(x), incx, (std::complex<float >*)(y), incy);}
template<class ZX, class ZY, enable_if_t<is_z<ZX>{} and is_z<ZY>{} and is_assignable<ZY&, ZX&>{},int> =0> void copy(size_t n, ZX* x, size_t incx, ZY* y, size_t incy){BLAS(zcopy)(n, (std::complex<double> const*)(x), incx, (std::complex<double>*)(y), incy);}
xdot(s, s, s) xdot(d, d, d) xdot(d, ds, s)
using std::pointer_traits;
using std::enable_if_t;
using std::is_convertible_v;
// axpy: accepts pointer-like SXP/SYP that convert to raw pointers; the scale is passed by pointer
#define xaxpy(T) \
template<class ALPHA, class SXP, class SX = typename pointer_traits<SXP>::element_type, class SYP, class SY = typename pointer_traits<SYP>::element_type, enable_if_t< \
is_##T<ALPHA>{} and is_##T<SX>{} and is_##T<SY>{} and is_assignable<SY&, decltype(ALPHA{}*SX{})>{} \
and is_convertible_v<SXP, SX*> and is_convertible_v<SYP, SY*> \
, int> =0> \
void axpy(size_t n, ALPHA const* a, SXP x, size_t incx, SYP y, size_t incy){BLAS(T##axpy)(n, (T const *)a, (T const*)static_cast<SX*>(x), incx, (T*)static_cast<SY*>(y), incy);}
xaxpy(s) xaxpy(d) xaxpy(c) xaxpy(z)
#undef xaxpy
// earlier raw-pointer axpy overloads (scale passed by value), kept for reference:
//template<class A, class SX, class SY, enable_if_t<is_s<SX>{} and is_s<SY>{} and is_assignable<SY&, decltype(A{}*SX{})>{}, int> =0> void axpy(size_t n, A a, SX* x, size_t incx, SY* y, size_t incy){BLAS(saxpy)(n, a, (s const*)(x), incx, (s*)(y), incy);}
//template<class A, class DX, class DY, enable_if_t<is_d<DX>{} and is_d<DY>{} and is_assignable<DY&, decltype(A{}*DX{})>{}, int> =0> void axpy(size_t n, A a, DX* x, size_t incx, DY* y, size_t incy){BLAS(daxpy)(n, a, (d const*)(x), incx, (d*)(y), incy);}
//template<class A, class CX, class CY, enable_if_t<is_c<CX>{} and is_c<CY>{} and is_assignable<CY&, decltype(A{}*CX{})>{}, int> =0> void axpy(size_t n, A a, CX* x, size_t incx, CY* y, size_t incy){BLAS(caxpy)(n, a, (c const*)(x), incx, (c*)(y), incy);}
//template<class A, class ZX, class ZY, enable_if_t<is_z<ZX>{} and is_z<ZY>{} and is_assignable<ZY&, decltype(A{}*ZX{})>{}, int> =0> void axpy(size_t n, A a, ZX* x, size_t incx, ZY* y, size_t incy){BLAS(zaxpy)(n, a, (z const*)(x), incx, (z*)(y), incy);}
}
// Value-returning front ends for the out-parameter form dot(n, x, incx, y, incy, &result).

// Result type chosen by the caller (R is given explicitly as the first template argument).
template<class R, class S, class T> R dot(S n, T const* x, S incx, T const* y, S incy){
	R result;
	dot(n, x, incx, y, incy, &result);
	return result;
}

// Result type equal to the element type T.
template<class S, class T> T dot(S n, T const* x, S incx, T const* y, S incy){
	T result;
	dot(n, x, incx, y, incy, &result);
	return result;
}
// Clean-up of the level-1 wrapper generators. xrotmg and xrotm were defined together with the others
// but never undefined; they are added here so the set is complete.
// The xcopy and xaxpy entries are harmless no-ops: xcopy is only defined in a comment and xaxpy is already undefined where it is used.
#undef xrotg
#undef xrotmg
#undef xrot
#undef xrotm
#undef xswap
#undef xscal
#undef xcopy
#undef xaxpy
#undef xdot
// Complex dot products: dotu and dotc, both writing their result through the pointer r.
// Three build configurations are handled:
//  * Fortran symbols, result returned through a leading pointer argument
//    (RETURN_BY_STACK defined, or FORTRAN_COMPLEX_FUNCTIONS_RETURN_VOID defined and non-zero);
//  * Fortran symbols, result returned by value and then memcpy'ed into *r;
//  * CBLAS_H defined: the *_sub entry points are used.
#ifndef CBLAS_H
namespace core{
using std::enable_if_t;
using std::is_assignable;
#if defined(RETURN_BY_STACK) || (defined(FORTRAN_COMPLEX_FUNCTIONS_RETURN_VOID) && FORTRAN_COMPLEX_FUNCTIONS_RETURN_VOID)
// raw-pointer overloads; r is cast to the C struct type used in the prototype and passed first
template<class X, class Y, class R, enable_if_t<is_c<X>{} and is_c<Y>{} and is_assignable<R&, decltype(0.+X{}*Y{}+X{}*Y{})>{}, int> =0> void dotu(size_t n, X* x, size_t incx, Y* y, size_t incy, R* r){BLAS(cdotu)((Complex_float *)r, n, (c const*)x, incx, (c const*)y, incy);}
template<class X, class Y, class R, enable_if_t<is_z<X>{} and is_z<Y>{} and is_assignable<R&, decltype(0.+X{}*Y{}+X{}*Y{})>{}, int> =0> void dotu(size_t n, X* x, size_t incx, Y* y, size_t incy, R* r){BLAS(zdotu)((Complex_double*)r, n, (z const*)x, incx, (z const*)y, incy);}
template<class X, class Y, class R, enable_if_t<is_c<X>{} and is_c<Y>{} and is_assignable<R&, decltype(0.+X{}*Y{}+X{}*Y{})>{}, int> =0> void dotc(size_t n, X* x, size_t incx, Y* y, size_t incy, R* r){BLAS(cdotc)((Complex_float *)r, n, (c const*)x, incx, (c const*)y, incy);}
template<class X, class Y, class R, enable_if_t<is_z<X>{} and is_z<Y>{} and is_assignable<R&, decltype(0.+X{}*Y{}+X{}*Y{})>{}, int> =0> void dotc(size_t n, X* x, size_t incx, Y* y, size_t incy, R* r){BLAS(zdotc)((Complex_double*)r, n, (z const*)x, incx, (z const*)y, incy);}
#else
// pointer-like overloads; the returned struct is copied bytewise into *r, and the static_assert checks that the sizes agree
template<class XP, class X = typename std::pointer_traits<XP>::element_type, class YP, class Y = typename std::pointer_traits<YP>::element_type, class RP, class R = typename std::pointer_traits<RP>::element_type, enable_if_t<is_c<X>{} and is_c<Y>{} and is_assignable<R&, decltype(0.+X{}*Y{}+X{}*Y{})>{}, int> =0> void dotu(size_t n, XP x, size_t incx, YP y, size_t incy, RP r){auto rr = BLAS(cdotu)(n, (c const*)static_cast<X*>(x), incx, (c const*)static_cast<Y*>(y), incy); std::memcpy(reinterpret_cast<float (*)[2]>(static_cast<R*>(r)), &rr, sizeof(rr)); static_assert(sizeof(rr)==sizeof(*r));}
template<class XP, class X = typename std::pointer_traits<XP>::element_type, class YP, class Y = typename std::pointer_traits<YP>::element_type, class RP, class R = typename std::pointer_traits<RP>::element_type, enable_if_t<is_z<X>{} and is_z<Y>{} and is_assignable<R&, decltype(0.+X{}*Y{}+X{}*Y{})>{}, int> =0> void dotu(size_t n, XP x, size_t incx, YP y, size_t incy, RP r){auto rr = BLAS(zdotu)(n, (z const*)static_cast<X*>(x), incx, (z const*)static_cast<Y*>(y), incy); std::memcpy(reinterpret_cast<double(*)[2]>(static_cast<R*>(r)), &rr, sizeof(rr)); static_assert(sizeof(rr)==sizeof(*r));}
template<class XP, class X = typename std::pointer_traits<XP>::element_type, class YP, class Y = typename std::pointer_traits<YP>::element_type, class RP, class R = typename std::pointer_traits<RP>::element_type, enable_if_t<is_c<X>{} and is_c<Y>{} and is_assignable<R&, decltype(0.+X{}*Y{}+X{}*Y{})>{}, int> =0> void dotc(size_t n, XP x, size_t incx, YP y, size_t incy, RP r){auto rr = BLAS(cdotc)(n, (c const*)static_cast<X*>(x), incx, (c const*)static_cast<Y*>(y), incy); std::memcpy(reinterpret_cast<float (*)[2]>(static_cast<R*>(r)), &rr, sizeof(rr)); static_assert(sizeof(rr)==sizeof(*r));}
template<class XP, class X = typename std::pointer_traits<XP>::element_type, class YP, class Y = typename std::pointer_traits<YP>::element_type, class RP, class R = typename std::pointer_traits<RP>::element_type, enable_if_t<is_z<X>{} and is_z<Y>{} and is_assignable<R&, decltype(0.+X{}*Y{}+X{}*Y{})>{}, int> =0> void dotc(size_t n, XP x, size_t incx, YP y, size_t incy, RP r){auto rr = BLAS(zdotc)(n, (z const*)static_cast<X*>(x), incx, (z const*)static_cast<Y*>(y), incy); std::memcpy(reinterpret_cast<double(*)[2]>(static_cast<R*>(r)), &rr, sizeof(rr)); static_assert(sizeof(rr)==sizeof(*r));}
#endif
}
#else
// TODO: make cblas version
#define xdotu(T) template<class S> v dotu(S n, add_const_ptr_t<T> x, S incx, add_const_ptr_t<T> y, S incy, add_ptr_t<T> r){BLAS(T##dotu_sub)(BC(n), x, BC(incx), y, BC(incy), r);}
#define xdotc(T) template<class S> v dotc(S n, add_const_ptr_t<T> x, S incx, add_const_ptr_t<T> y, S incy, add_ptr_t<T> r){BLAS(T##dotc_sub)(BC(n), x, BC(incx), y, BC(incy), r);}
namespace core{
xdotu(c) xdotu(z)
xdotc(c) xdotc(z)
}
#undef xdotu
#undef xdotc
#endif
namespace core{
// Single-precision dot with an extra scalar b passed ahead of the vectors; forwards to the sdsdot routine and returns its result.
template<class S> s dot(S n, s const& b, s const* x, S incx, s const* y, S incy){return BLAS(sdsdot)(BC(n), b, x, BC(incx), y, BC(incy));}
//template<class S> void dot(S n, s const& b, s const* x, S incx, s const* y, S incy, s* result){*result = BLAS(sdsdot)(BC(n), b, x, BC(incx), y, BC(incy));}
}
//#define xnrm2(R, T, TT) template<class S> v nrm2 (S n, add_const_ptr_t<T> x, S incx, R* r){*r = BLAS(TT##nrm2 )(BC(n), x, BC(incx));}
// Generators for asum and iamax; both return the routine's result by value.
// iamax subtracts 1 from the index returned by the routine (presumably converting Fortran's 1-based index to 0-based — confirm).
#define xasum(T, TT) template<class S> auto asum (S n, T const* x, S incx){return BLAS(TT##asum )(BC(n), x, BC(incx));}
#define ixamax(T) template<class S> auto iamax(S n, T const* x, S incx){return BLAS(i##T##amax)(BC(n), x, BC(incx)) - 1;}
// NOTE(review): the asum overloads are expanded here, outside namespace core, while iamax is expanded inside core below — confirm this is intended.
xasum(s, s) xasum(d, d) xasum (c, sc) xasum(z, dz)
// nrm2 and iamax in blas::core.
// nrm2 accepts pointer-like arguments, calls the routine matching the element type (snrm2, dnrm2,
// scnrm2, dznrm2) and copies the real-valued result bytewise into *r.
// For the complex variants the result pointer must refer to the corresponding real type.
// The double-precision variants used to cast the destination to (s*), i.e. float*, a copy-paste slip
// from the float variants; they now cast to (d*) to match the sizeof(d) bytes that are copied.
namespace core{
// xnrm2(s, s, s) xnrm2(d, d, d) xnrm2(s, c, sc) xnrm2(d, z, dz)
template<class XP, class X = typename std::pointer_traits<XP>::element_type, class RP, class R = typename std::pointer_traits<RP>::element_type, enable_if_t<is_s<X>{} and is_s<R>{} and std::is_assignable<R&, decltype(X{})>{} , int> =0> void nrm2(size_t n, XP x, size_t incx, RP r){auto rr = BLAS(snrm2) (n, (s const*)static_cast<X*>(x), incx); std::memcpy((s*)static_cast<R*>(r), &rr, sizeof(s));}
template<class XP, class X = typename std::pointer_traits<XP>::element_type, class RP, class R = typename std::pointer_traits<RP>::element_type, enable_if_t<is_d<X>{} and is_d<R>{} and std::is_assignable<R&, decltype(X{})>{} , int> =0> void nrm2(size_t n, XP x, size_t incx, RP r){auto rr = BLAS(dnrm2) (n, (d const*)static_cast<X*>(x), incx); std::memcpy((d*)static_cast<R*>(r), &rr, sizeof(d));}
template<class XP, class X = typename std::pointer_traits<XP>::element_type, class RP, class R = typename std::pointer_traits<RP>::element_type, enable_if_t<is_c<X>{} and is_s<R>{} and std::is_assignable<R&, decltype(std::norm(X{}))>{}, int> =0> void nrm2(size_t n, XP x, size_t incx, RP r){auto rr = BLAS(scnrm2)(n, (c const*)static_cast<X*>(x), incx); std::memcpy((s*)static_cast<R*>(r), &rr, sizeof(s));}
template<class XP, class X = typename std::pointer_traits<XP>::element_type, class RP, class R = typename std::pointer_traits<RP>::element_type, enable_if_t<is_z<X>{} and is_d<R>{} and std::is_assignable<R&, decltype(std::norm(X{}))>{}, int> =0> void nrm2(size_t n, XP x, size_t incx, RP r){auto rr = BLAS(dznrm2)(n, (z const*)static_cast<X*>(x), incx); std::memcpy((d*)static_cast<R*>(r), &rr, sizeof(d));}
// template<class S> v nrm2 (S n, typename add_const_ptr<std::complex<double>>::type x, S incx, d* r){*r = BLAS(dznrm2 )(BC(n), x, BC(incx));}
ixamax(s) ixamax(d) ixamax(c) ixamax(z)
}
#undef xnrm2
#undef xasum
#undef ixamax
///////////////////////////////////////////////////////////////////////////////
// LEVEL2
// Generators for the level-2 wrappers (gemv, ger, geru, gerc); integer arguments go through the BC range check.
// xgemv is defined here, but its only use in this section is commented out.
#define xgemv(T) template<class C, class S> v gemv(C trans, S m, S n, T const& a, T const* A, S lda, T const* X, S incx, T beta, T* Y, S incy ){BLAS(T##gemv)(trans, BC(m), BC(n), a, A, BC(lda), X, BC(incx), beta, Y, BC(incy) );}
#define xger(T) template< class S> v ger ( S m, S n, T const& a, T const* X, S incx, T const* Y, S incy, T* A, S lda){BLAS(T##ger )( BC(m), BC(n), a, X, BC(incx), Y, BC(incy), A, BC(lda));}
// For complex element types `ger` forwards to the unconjugated routines cgeru / zgeru.
// NOTE(review): these two overloads are defined outside namespace core, unlike the real-valued ger generated below — confirm.
template< class S> v ger ( S m, S n, c const& a, c const* X, S incx, c const* Y, S incy, c* A, S lda){BLAS(cgeru )( BC(m), BC(n), a, X, BC(incx), Y, BC(incy), A, BC(lda));}
template< class S> v ger ( S m, S n, z const& a, z const* X, S incx, z const* Y, S incy, z* A, S lda){BLAS(zgeru )( BC(m), BC(n), a, X, BC(incx), Y, BC(incy), A, BC(lda));}
#define xgeru(T) template< class S> v geru( S m, S n, T const& a, T const* X, S incx, T const* Y, S incy, T* A, S lda){BLAS(T##geru)( BC(m), BC(n), a, X, BC(incx), Y, BC(incy), A, BC(lda));}
#define xgerc(T) template< class S> v gerc( S m, S n, T const& a, T const* X, S incx, T const* Y, S incy, T* A, S lda){BLAS(T##gerc)( BC(m), BC(n), a, X, BC(incx), Y, BC(incy), A, BC(lda));}
// Level-2 entry points in blas::core: ger/geru/gerc from the generator macros, plus constrained gemv templates.
namespace core{
//xgemv(s) xgemv(d) xgemv(c) xgemv(z)
xger(s) xger(d)
xgeru(c) xgeru(z)
xgerc(c) xgerc(z)
using std::enable_if_t;
using std::is_assignable;
// gemv: one overload per precision, selected by the element types of the matrix and both vectors.
// The scale factors a and b keep their own deduced types A and B and are passed to the routine as given.
template<class A, class M, class X, class B, class Y, enable_if_t<is_s<M>{} and is_s<X>{} and is_s<Y>{} and is_assignable<Y&, decltype(A{}*M{}*X{}+B{}*Y{})>{}, int> =0> void gemv(char trans, size_t m, size_t n, A const& a, M* ma, size_t lda, X* x, size_t incx, B b, Y* y, size_t incy){BLAS(sgemv)(trans, m, n, a, (s const*)ma, lda, (s const*)x, incx, b, (s*)y, incy);}
template<class A, class M, class X, class B, class Y, enable_if_t<is_d<M>{} and is_d<X>{} and is_d<Y>{} and is_assignable<Y&, decltype(A{}*M{}*X{}+B{}*Y{})>{}, int> =0> void gemv(char trans, size_t m, size_t n, A const& a, M* ma, size_t lda, X* x, size_t incx, B b, Y* y, size_t incy){BLAS(dgemv)(trans, m, n, a, (d const*)ma, lda, (d const*)x, incx, b, (d*)y, incy);}
template<class A, class M, class X, class B, class Y, enable_if_t<is_c<M>{} and is_c<X>{} and is_c<Y>{} and is_assignable<Y&, decltype(A{}*M{}*X{}+B{}*Y{})>{}, int> =0> void gemv(char trans, size_t m, size_t n, A const& a, M* ma, size_t lda, X* x, size_t incx, B b, Y* y, size_t incy){BLAS(cgemv)(trans, m, n, a, (c const*)ma, lda, (c const*)x, incx, b, (c*)y, incy);}
template<class A, class M, class X, class B, class Y, enable_if_t<is_z<M>{} and is_z<X>{} and is_z<Y>{} and is_assignable<Y&, decltype(A{}*M{}*X{}+B{}*Y{})>{}, int> =0> void gemv(char trans, size_t m, size_t n, A const& a, M* ma, size_t lda, X* x, size_t incx, B b, Y* y, size_t incy){BLAS(zgemv)(trans, m, n, a, (z const*)ma, lda, (z const*)x, incx, b, (z*)y, incy);}
// commented-out copies of the core::copy overloads follow:
//template<class SX, class SY, enable_if_t<is_s<SX>{} and is_s<SY>{} and is_assignable<SY&, SX&>{},int> =0> void copy(size_t n, SX* x, size_t incx, SY* y, size_t incy){BLAS(scopy)(n, ( float const*)(x), incx, ( float *)(y), incy);}
//template<class DX, class DY, enable_if_t<is_d<DX>{} and is_d<DY>{} and is_assignable<DY&, DX&>{},int> =0> void copy(size_t n, DX* x, size_t incx, DY* y, size_t incy){BLAS(dcopy)(n, ( double const*)(x), incx, ( double *)(y), incy);}
//template<class CX, class CY, enable_if_t<is_c<CX>{} and is_c<CY>{} and is_assignable<CY&, CX&>{},int> =0> void copy(size_t n, CX* x, size_t incx, CY* y, size_t incy){BLAS(ccopy)(n, (std::complex<float > const*)(x), incx, (std::complex<float >*)(y), incy);}
//template<class ZX, class ZY, enable_if_t<is_z<ZX>{} and is_z<ZY>{} and is_assignable<ZY&, ZX&>{},int> =0> void copy(size_t n, ZX* x, size_t incx, ZY* y, size_t incy){BLAS(zcopy)(n, (std::complex<double> const*)(x), incx, (std::complex<double>*)(y), incy);}
}
// Level-2 dispatch table keyed by the element type.
// The primary template is intentionally empty: only the s, d, c and z specializations below
// provide trsv, each forwarding its arguments unchanged to the matching BLAS(?trsv) entry.
template<class T>
struct blas2{
//	template<class S>
//	static v trsv(char ulA, char transA, char di, S m, T const* A, S lda, T* X, S incx) = delete;
};

template<> struct blas2<s>{
	template<class... As> static v trsv(As... as){BLAS(strsv)(as...);}
};
template<> struct blas2<d>{
	template<class... As> static v trsv(As... as){BLAS(dtrsv)(as...);}
};
template<> struct blas2<c>{
	template<class... As> static v trsv(As... as){BLAS(ctrsv)(as...);}
};
template<> struct blas2<z>{
	// The only SFINAE-constrained overload: it is removed from overload resolution when the call is ill-formed.
	// The result is returned explicitly so that the body agrees with the declared return type
	// whatever BLAS(ztrsv) yields (returning a void expression is valid when the type is void).
	template<class... As> static auto trsv(As... as)
	->decltype(BLAS(ztrsv)(as...)){
		return BLAS(ztrsv)(as...);
	}
};
namespace core{

// Triangular solve entry point: the blas2 specialization is selected from the (decayed)
// element type that pointer type TP refers to, and all arguments are passed through unchanged.
template<typename TconstP, typename TP, typename S=std::size_t, typename C=char>
v trsv(C ulA, C transA, C diA, S n, TconstP A, S lda, TP X, S incx){
	using element = std::decay_t<typename std::pointer_traits<TP>::element_type>;
	blas2<element>::trsv(ulA, transA, diA, n, A, lda, X, incx);
}

}
#undef xgemv
#undef xger
#undef xgeru
#undef xgerc
///////////////////////////////////////////////////////////////////////////////
// LEVEL 3
// xsyrk(T): generates the `syrk` overload for BLAS element type T (instantiated below for s, d, c, z).
// T is used both as the scalar/pointer type of the signature and, through token pasting, to pick
// the BLAS entry (T##syrk). Sizes go through the BC(...) macro before reaching BLAS, and the call
// is preceded by MULTI_MARK_SCOPE("cpu_syrk").
#define xsyrk(T) template<class UL, class C, class S> v syrk( UL ul, C transA, S n, S k, T alpha, T const* A, S lda, T beta, T* CC, S ldc){\
MULTI_MARK_SCOPE("cpu_syrk"); BLAS(T##syrk)( ul, transA, BC(n), BC(k), alpha, A, BC(lda), beta, CC, BC(ldc));}
namespace core{
using std::is_convertible_v;
using std::pointer_traits;
using std::enable_if_t;
using std::max;
// xherk(T): generates the `herk` overload for complex BLAS element type T (instantiated for c and z).
// Enabled only when both pointed-to element types satisfy is_T, ALPHA*AA*AA is assignable to CC,
// and the pointer-like arguments convert to raw pointers. alpha and beta are received by pointer
// and read as `Real` (T::value_type) before being handed to BLAS(T##herk).
// The leading-dimension checks compare against max(S{1}, ...): the literal one is built in the
// caller's own size type S so that std::max can deduce a single type for any integral S
// (the former `1l` only compiled when S was exactly `long`).
#define xherk(T) \
template<class UL, class C, class S, class ALPHA, class AAP, class AA = typename pointer_traits<AAP>::element_type, class BETA, class CCP, class CC = typename pointer_traits<CCP>::element_type, class Real = typename T::value_type,\
enable_if_t< \
is_##T<AA>{} and is_##T<CC>{} and is_assignable<CC&, decltype(ALPHA{}*AA{}*AA{})>{} and \
is_convertible_v<AAP, AA*> and is_convertible_v<CCP, CC*> \
, int> =0> \
v herk( UL ul, C transA, S n, S k, ALPHA const* alpha, AAP aa, S lda, BETA const* beta, CCP cc, S ldc) \
/*=delete;*/ \
{ \
if(transA == 'N' or transA == 'n') MULTI_ASSERT1( lda >= max(S{1}, n) ); else MULTI_ASSERT1( lda >= max(S{1}, k) ); \
MULTI_ASSERT1( ldc >= max(S{1}, n) ); \
MULTI_MARK_SCOPE("cpu_herk"); BLAS(T##herk)( ul, transA, BC(n), BC(k), *(Real const*)alpha, aa, BC(lda), *(Real const*)beta, cc, BC(ldc)); \
}
// xgemm(T): generates the `gemm` overload for BLAS element type T (instantiated for s, d, c, z).
// Enabled only when the three pointed-to element types satisfy is_T, ALPHA*AA*BB is assignable to
// CC, and the pointer-like arguments convert to raw pointers. alpha and beta are received by
// pointer and dereferenced as T for the BLAS call.
// Preconditions checked with MULTI_ASSERT1: leading dimensions are at least max(1, rows) of the
// stored operand, the output does not alias either input, and when *beta is non-zero the
// accumulated expression must be assignable to CC.
// All bounds use max(ssize_t{1}, ...) so both std::max arguments share the type ssize_t on every
// platform (the former `1l` assumed that ssize_t is `long`).
#define xgemm(T) \
template<class ALPHA, class AAP, class AA = typename pointer_traits<AAP>::element_type, class BBP, class BB = typename pointer_traits<BBP>::element_type, class BETA, class CCP, class CC = typename pointer_traits<CCP>::element_type, \
enable_if_t< \
is_##T<AA>{} and is_##T<BB>{} and is_##T<CC>{} and is_assignable<CC&, decltype(ALPHA{}*AA{}*BB{})>{} and \
is_convertible_v<AAP, AA*> and is_convertible_v<BBP, BB*> and is_convertible_v<CCP, CC*> \
, int> =0 > \
v gemm(char transA, char transB, ssize_t m, ssize_t n, ssize_t k, ALPHA const* alpha, AAP aa, ssize_t lda, BBP bb, ssize_t ldb, BETA const* beta, CCP cc, ssize_t ldc) \
{ \
MULTI_MARK_SCOPE("cpu_gemm"); \
using std::max; \
if(transA =='N') MULTI_ASSERT1(lda >= max(ssize_t{1}, m)); else MULTI_ASSERT1(lda >= max(ssize_t{1}, k)); \
if(transB =='N') MULTI_ASSERT1(ldb >= max(ssize_t{1}, k)); else MULTI_ASSERT1(ldb >= max(ssize_t{1}, n)); \
MULTI_ASSERT1( aa != cc ); \
MULTI_ASSERT1( bb != cc ); \
MULTI_ASSERT1(ldc >= max(ssize_t{1}, m)); \
if(*beta != 0.) MULTI_ASSERT1((is_assignable<CC&, decltype(ALPHA{}*AA{}*BB{} + BETA{}*CC{})>{})); \
BLAS(T##gemm)(transA, transB, BC(m), BC(n), BC(k), *(T const*)alpha, (T const*)static_cast<AA*>(aa), BC(lda), (T const*)static_cast<BB*>(bb), BC(ldb), *(T const*)beta, (T*)static_cast<CC*>(cc), BC(ldc)); \
}
xgemm(s) xgemm(d) xgemm(c) xgemm(z)
#undef xgemm
// xtrsm(T): generates the `trsm` overload for BLAS element type T (instantiated for s, d, c, z).
// Enabled only when both pointed-to element types satisfy is_T, the quotient expressions
// AA*BB/ALPHA and ALPHA*BB/AA are assignable to BB, and the pointer-like arguments convert to raw
// pointers. Unlike gemm/herk in this file, alpha is taken by value and forwarded as is.
// Argument validation: the four mode characters must be upper-case ('L'/'R', 'U'/'L',
// 'N'/'T'/'C', 'U'/'N'), m and n must be non-negative, lda must cover m (left side) or n (right
// side), and ldb must cover m.
// NOTE(review): the checks mix plain assert and MULTI_ASSERT1; whether that is deliberate cannot
// be told from here.
#define xtrsm(T) \
template<class ALPHA, class AAP, class AA = typename pointer_traits<AAP>::element_type, class BBP, class BB = typename pointer_traits<BBP>::element_type, \
enable_if_t< \
is_##T<AA>{} and is_##T<BB>{} and is_assignable<BB&, decltype(AA{}*BB{}/ALPHA{})>{} and is_assignable<BB&, decltype(ALPHA{}*BB{}/AA{})>{} and \
is_convertible_v<AAP, AA*> and is_convertible_v<BBP, BB*> \
,int> =0> \
v trsm(char side, char ul, char transA, char diag, ssize_t m, ssize_t n, ALPHA alpha, AAP aa, ssize_t lda, BBP bb, ssize_t ldb){ \
MULTI_MARK_SCOPE("cpu_trsm"); \
assert( side == 'L' or side == 'R' ); \
assert( ul == 'U' or ul == 'L' ); \
assert( transA == 'N' or transA == 'T' or transA == 'C' ); \
assert( diag == 'U' or diag == 'N' ); \
MULTI_ASSERT1( m >= 0 and n >= 0 ); \
using std::max; \
if(side == 'L') MULTI_ASSERT1(lda >= max(ssize_t{1}, m)); else if(side == 'R') assert( lda >= max(ssize_t{1}, n) ); \
MULTI_ASSERT1( ldb >= max(ssize_t{1}, m) ); \
BLAS(T##trsm)(side, ul, transA, diag, BC(m), BC(n), alpha, (T const*)static_cast<AA*>(aa), BC(lda), (T*)static_cast<BB*>(bb), BC(ldb)); \
}
xtrsm(s) xtrsm(d) xtrsm(c) xtrsm(z)
#undef xtrsm
xsyrk(s) xsyrk(d) xsyrk(c) xsyrk(z)
xherk(c) xherk(z)
}
#undef xsyrk
#undef xherk
#undef xtrsm
#undef BC
// Default (CPU) BLAS context: a set of static forwarders to the free functions in namespace core.
// Every member is SFINAE-constrained through its trailing return type, so a call that core::
// cannot accept removes the member from overload resolution instead of failing inside the body.
// Note that axpy and gemv take their arguments by value, while the remaining members use
// forwarding references with std::forward.
struct context{ // stateless (and thread safe)
template<class... As>
static auto axpy(As... as)
->decltype(core::axpy(as...)){
return core::axpy(as...);}
template<class... As>
static auto gemv(As... as)
->decltype(core::gemv(as...)){
return core::gemv(as...);}
template<class... As>
static auto gemm(As&&... as)
->decltype(core::gemm(std::forward<As>(as)...)){
return core::gemm(std::forward<As>(as)...);}
template<class... As>
static auto dot(As&&... as)
->decltype(core::dot(std::forward<As>(as)...)){
return core::dot(std::forward<As>(as)...);}
template<class... As>
static auto dotc(As&&... as)
->decltype(core::dotc(std::forward<As>(as)...)){
return core::dotc(std::forward<As>(as)...);}
template<class... As>
static auto dotu(As&&... as)
->decltype(core::dotu(std::forward<As>(as)...)){
return core::dotu(std::forward<As>(as)...);}
template<class... As>
static auto trsm(As&&... as)
->decltype(core::trsm(std::forward<As>(as)...)){
return core::trsm(std::forward<As>(as)...);}
template<class... As>
static auto herk(As&&... as)
->decltype(core::herk(std::forward<As>(as)...)){
return core::herk(std::forward<As>(as)...);}
};
// Trait telling whether a type is accepted as a BLAS context argument.
// False by default; true for `context` in its value and reference forms listed here.
// Other back-ends are expected to add their own specializations.
// NOTE(review): `void*&` is also accepted as a context; the reason is not visible in this file.
template<class Context> struct is_context : std::false_type{};
template<> struct is_context<context> : std::true_type{};
template<> struct is_context<context&&> : std::true_type{};
template<> struct is_context<context&> : std::true_type{};
template<> struct is_context<context const&> : std::true_type{};
template<> struct is_context<void*&> : std::true_type{};
namespace core{
// Context-taking overload of copy: the context argument is ignored and the remaining arguments
// are forwarded (by value) to the context-free core::copy. Participates in overload resolution
// only when that inner call is well-formed.
template<class Context, class... As>
auto copy(Context&&, As... as)
->decltype(core::copy(as...)){
return core::copy(as...);}
}
// Default context for pointer types that convert to a raw pointer to their element type:
// such data needs no context object, so a null blas::context pointer is returned.
template<
	class TPtr,
	std::enable_if_t<
		std::is_convertible<TPtr, typename std::pointer_traits<TPtr>::element_type*>{}
	, int> =0
>
blas::context* default_context_of(TPtr const& /*unused*/){return nullptr;}
}
}}
///////////////////////////////////////////////////////////////////////////////
#if not __INCLUDE_LEVEL__
#include "../../array.hpp"
#include "../../utility.hpp"
int main(){}
#endif
#endif

View File

@ -0,0 +1,545 @@
#ifdef COMPILATION// -*-indent-tabs-mode:t;c-basic-offset:4;tab-width:4;autowrap:nil;-*-
$CXXX $CXXFLAGS -include"boost/log/trivial.hpp" -D'MULTI_MARK_SCOPE(MsG)=BOOST_LOG_TRIVIAL(trace)<<MsG' -DBOOST_LOG_DYN_LINK $0 -o $0x `pkg-config --cflags --libs cudart-11.0 cublas-11.0 blas` -lboost_unit_test_framework -lboost_log -lboost_thread -lboost_system -lboost_log_setup -lpthread&&$0x&&rm $0x;exit
#endif
// © Alfredo A. Correa 2019-2020
#ifndef MULTI_ADAPTORS_BLAS_CUDA_HPP
#define MULTI_ADAPTORS_BLAS_CUDA_HPP
#include "../blas/../../config/MARK.hpp" // MULTI_MARK_SCOPE
#include "../../adaptors/blas/core.hpp" // is_context
#include "../../memory/adaptors/cuda/ptr.hpp"
#include "../../memory/adaptors/cuda/managed/ptr.hpp"
#include "../../memory/adaptors/cuda/managed/allocator.hpp"
#include<cublas_v2.h>
#include "../cuda/cublas/error.hpp"
#include<thrust/complex.h>
#define DECLRETURN(ExpR) ->decltype(ExpR){return ExpR;}
#define JUSTRETURN(ExpR) {return ExpR;}
#include<complex>
///////////////////
#include<system_error>
// CUBLAS_CALL(CodE): evaluates the cuBLAS call CodE and turns a failure status into an exception.
// Steps, in order: mark the scope with the stringified call, convert the returned status to
// boost::multi::cuda::cublas::error, synchronize the device, and throw std::system_error (whose
// message contains the stringified call) if the status is not `success`.
// The macro expands to several statements and declares a local named `s`, so it has to be used
// as a full statement at block scope and at most once per scope.
#define CUBLAS_CALL(CodE) \
MULTI_MARK_SCOPE("multi::cublas::"#CodE); \
auto s = static_cast<enum boost::multi::cuda::cublas::error>(CodE); \
cudaDeviceSynchronize(); /*TODO make this more specific to managed ptr and specific handle*/ \
if(s != boost::multi::cuda::cublas::error::success) throw std::system_error{boost::multi::cuda::cublas::make_error_code(s), "cannot call cublas function "#CodE };
// Global-scope declaration of a `cublasZdot` taking double2 pointers, defined as deleted:
// any call that resolves to this signature is rejected at compile time.
// NOTE(review): presumably a guard so that a complex "dot" must be spelled dotu or dotc — confirm intent.
cublasStatus_t cublasZdot (cublasHandle_t handle, int n,
const double2 *x, int incx,
const double2 *y, int incy,
double2 *result) = delete;
namespace boost{
namespace multi{
namespace cublas{
using Complex = cuComplex;
using DoubleComplex = cuDoubleComplex;
namespace {
template<class T> struct complex_t;
template<> struct complex_t<float>{using type = Complex;};
template<> struct complex_t<double>{using type = DoubleComplex;};
}
template<class T> using complex = typename complex_t<T>::type;
// 2.2.7. cublasPointerMode_t https://docs.nvidia.com/cuda/cublas/index.html#cublaspointermode_t
enum class pointer_mode : std::underlying_type<cublasPointerMode_t>::type{
host = CUBLAS_POINTER_MODE_HOST,
device = CUBLAS_POINTER_MODE_DEVICE
};
template<class T> enum pointer_mode scalar_kind(memory::cuda::ptr<T>){return pointer_mode::device;}
template<class T> enum pointer_mode scalar_kind(T*){return pointer_mode::host;}
}
// One-letter aliases for the element types; the upper-case letters double as the infix of the
// cuBLAS function names when pasted by the DEFINE_CUBLAS1* macros.
using v = void;
using S = float;
using D = double;
using C = cublas::complex<float>;
using Z = cublas::complex<double>;
// Per-level dispatch tables. The primary templates are empty; the element-type specializations
// provide the raw cuBLAS calls and the <void> specializations provide the typed entry points.
template<class T = void> struct cublas1{};
template<class T = void> struct cublas2{};
template<class T = void> struct cublas3{};
// DEFINE_CUBLAS1(UppeR, LowR): specializes cublas1<UppeR> for a real element type.
// UppeR is the type alias (S or D) and also the upper-case infix pasted into the cuBLAS name
// (cublas##UppeR##copy, ...); LowR is the lower-case letter needed by cublasI<t>amax.
// Each member forwards its arguments unchanged and returns whatever the cuBLAS function returns.
// Because the bodies are templates, a pasted name is only looked up when that member is
// instantiated. The /*name*/ placeholders list level-1 routines that are not wrapped.
#define DEFINE_CUBLAS1(UppeR, LowR) \
template<> struct cublas1<UppeR>{ \
template<class...As> static auto iamax(As...as){return cublasI##LowR##amax(as...);} \
/*amin */ \
template<class...As> static auto asum (As...as){return cublas##UppeR##asum (as...);} \
/*axpy */ \
template<class...As> static auto copy (As...as){return cublas##UppeR##copy (as...);} \
template<class...As> static auto dot (As...as){return cublas##UppeR##dot (as...);} \
template<class...As> static auto dotu (As...as){return cublas##UppeR##dotu (as...);} \
template<class...As> static auto dotc (As...as){return cublas##UppeR##dotc (as...);} \
template<class...As> static auto nrm2 (As...as){return cublas##UppeR##nrm2 (as...);} \
/*rot */ \
/*rotg */ \
/*rotmg*/ \
template<class...As> static auto scal (As...as){return cublas##UppeR##scal (as...);} \
/*swap */ \
}
DEFINE_CUBLAS1(S, s);
DEFINE_CUBLAS1(D, d);
// DEFINE_CUBLAS1_COMPLEX(UppeR, LowR, ReaLUppeR, ReaLLowR): specializes cublas1<UppeR> for a
// complex element type. UppeR/LowR are the complex letter (C/c or Z/z); ReaLUppeR is the letter of
// the matching real type (S or D). ReaLLowR is accepted for symmetry and is not used.
// Differences from the real-type table:
//  - asum and nrm2 produce a real result, and cuBLAS names them with both letters
//    (cublasScasum/cublasDzasum, cublasScnrm2/cublasDznrm2), hence the ReaLUppeR##LowR infix;
//  - `dot` is mapped to the unconjugated product (dotu).
// Each member forwards its arguments unchanged and returns whatever the cuBLAS function returns.
#define DEFINE_CUBLAS1_COMPLEX(UppeR, LowR, ReaLUppeR, ReaLLowR) \
template<> struct cublas1<UppeR>{ \
template<class...As> static auto iamax(As...as){return cublasI##LowR##amax(as...);} \
/*amin */ \
template<class...As> static auto asum (As...as){return cublas##ReaLUppeR##LowR##asum (as...);} \
/*axpy */ \
template<class...As> static auto copy (As...as){return cublas##UppeR##copy (as...);} \
template<class...As> static auto dot (As...as){return cublas##UppeR##dotu (as...);} \
template<class...As> static auto dotu (As...as){return cublas##UppeR##dotu (as...);} \
template<class...As> static auto dotc (As...as){return cublas##UppeR##dotc (as...);} \
template<class...As> static auto nrm2 (As...as){return cublas##ReaLUppeR##LowR##nrm2 (as...);} \
/*rot */ \
/*rotg */ \
/*rotmg*/ \
template<class...As> static auto scal (As...as){return cublas##UppeR##scal (as...);} \
/*swap */ \
}
DEFINE_CUBLAS1_COMPLEX(C, c, S, s);
DEFINE_CUBLAS1_COMPLEX(Z, z, D, d);
// Result type of nrm2 for each element type: the type itself for real elements and the matching
// real type for complex elements. The primary template is left undefined, so unsupported element
// types fail to compile.
template<class T> struct nrm2_result;//{using type = T;};
template<> struct nrm2_result<S>{using type = S;};
template<> struct nrm2_result<D>{using type = D;};
template<> struct nrm2_result<C>{using type = S;};
template<> struct nrm2_result<Z>{using type = D;};
// Typed level-1 entry points: each member fixes the cuBLAS signature for a generic element type T
// and dispatches to cublas1<T>, so the element type of the pointer arguments selects the routine.
// dot/dotu/dotc are SFINAE-constrained on the existence of the corresponding cublas1<T> member
// and mark their scope before calling it.
template<> struct cublas1<void>{
// 2.5.1. cublasI<t>amax() https://docs.nvidia.com/cuda/cublas/index.html#cublasi-lt-t-gt-amax
template<class T> static cublasStatus_t iamax(cublasHandle_t handle, int n, const T* x, int incx, int *result ){return cublas1<T>::iamax(handle, n, x, incx, result);}
// 2.5.3. cublas<t>asum() https://docs.nvidia.com/cuda/cublas/index.html#cublas-lt-t-gt-asum
// (the result type T2 is deduced separately from the element type T1)
template<class T1, class T2> static cublasStatus_t asum (cublasHandle_t handle, int n, T1 const* x, int incx, T2* result ){return cublas1<T1>::asum(handle, n, x, incx, result);}
// 2.5.5. cublas<t>copy() https://docs.nvidia.com/cuda/cublas/index.html#cublas-lt-t-gt-copy
template<class T> static cublasStatus_t copy (cublasHandle_t handle, int n, const T* x, int incx, T* y, int incy){return cublas1<T>::copy(handle, n, x, incx, y, incy);}
// 2.5.6. cublas<t>dot() https://docs.nvidia.com/cuda/cublas/index.html#cublas-lt-t-gt-dot
template<class T> static auto dot(cublasHandle_t handle, int n, const T* x, int incx, const T* y, int incy, T* result)
->decltype(cublas1<T>::dot(handle, n, x, incx, y, incy, result)){MULTI_MARK_SCOPE("function dot");
return cublas1<T>::dot(handle, n, x, incx, y, incy, result);}
template<class T> static auto dotu(cublasHandle_t handle, int n, const T* x, int incx, const T* y, int incy, T* result)
->decltype(cublas1<T>::dotu(handle, n, x, incx, y, incy, result)){MULTI_MARK_SCOPE("function dotu");
return cublas1<T>::dotu(handle, n, x, incx, y, incy, result);}
template<class T> static auto dotc(cublasHandle_t handle, int n, const T* x, int incx, const T* y, int incy, T* result)
->decltype(cublas1<T>::dotc(handle, n, x, incx, y, incy, result)){MULTI_MARK_SCOPE("function dotc");
return cublas1<T>::dotc(handle, n, x, incx, y, incy, result);}
// 2.5.7. cublas<t>nrm2() https://docs.nvidia.com/cuda/cublas/index.html#cublas-lt-t-gt-nrm2
// (the result pointer uses nrm2_result<T>, i.e. a real type even for complex T)
template<class T> static auto nrm2(cublasHandle_t handle, int n,
const T *x, int incx, typename nrm2_result<T>::type *result){return cublas1<T>::nrm2(handle, n, x, incx, result);}
// 2.5.12. cublas<t>scal() https://docs.nvidia.com/cuda/cublas/index.html#cublas-lt-t-gt-scale
template<class T> static cublasStatus_t scal(cublasHandle_t handle, int n,
const T *alpha,
T *x, int incx){return cublas1<T>::scal(handle, n, alpha, x, incx);}
};
// Typed level-2 entry point: fixes the cuBLAS trsv signature for a generic element type T and
// dispatches to cublas2<T>, whose specializations (below) forward to cublas{S,D,C,Z}trsv.
template<> struct cublas2<void>{
// 2.6.16. cublas<t>trsv() https://docs.nvidia.com/cuda/cublas/index.html#cublas-lt-t-gt-trsv
template<class T> static cublasStatus_t trsv(cublasHandle_t handle, cublasFillMode_t uplo,
cublasOperation_t trans, cublasDiagType_t diag,
int n, const T *A, int lda,
T *x, int incx){return cublas2<T>::trsv(handle, uplo, trans, diag, n, A, lda, x, incx);}
};
// Raw per-type forwarders; they return the status produced by the cuBLAS function.
template<> struct cublas2<S>{template<class...A> static auto trsv(A...a){return cublasStrsv(a...);}};
template<> struct cublas2<D>{template<class...A> static auto trsv(A...a){return cublasDtrsv(a...);}};
template<> struct cublas2<C>{template<class...A> static auto trsv(A...a){return cublasCtrsv(a...);}};
template<> struct cublas2<Z>{template<class...A> static auto trsv(A...a){return cublasZtrsv(a...);}};
// Raw level-3 forwarders, one specialization per element type.
// Unlike the level-1/2 tables, every member wraps its call in CUBLAS_CALL: the body has no return
// statement (so the deduced return type is void) and a failing status is reported by the
// exception thrown inside the macro. herk exists only for the complex types C and Z.
template<> struct cublas3<S>{
template<class...As> static auto gemm (As...as){CUBLAS_CALL(cublasSgemm(as...));}
template<class...As> static auto syrk (As...as){CUBLAS_CALL(cublasSsyrk(as...));}
// template<class...As> static auto herk (As...as){return CUBLAS_CALL(cublasSherk)(as...);}
template<class...As> static auto trsm (As...as){CUBLAS_CALL(cublasStrsm(as...));}
};
template<> struct cublas3<D>{
template<class...As> static auto gemm (As...as){ CUBLAS_CALL(cublasDgemm(as...));}
template<class...As> static auto syrk (As...as){ CUBLAS_CALL(cublasDsyrk(as...));}
// template<class...As> static auto herk (As...as){return cublas_call(cublasDherk)(as...);}
template<class...As> static auto trsm (As...as){ CUBLAS_CALL(cublasDtrsm(as...));}
};
template<> struct cublas3<C>{
template<class...As> static auto gemm (As...as){ CUBLAS_CALL(cublasCgemm(as...));}
template<class...As> static auto syrk (As...as){ CUBLAS_CALL(cublasCsyrk(as...));}
template<class...As> static auto herk (As...as){ CUBLAS_CALL(cublasCherk(as...));}
template<class...As> static auto trsm (As...as){ CUBLAS_CALL(cublasCtrsm(as...));}
};
template<> struct cublas3<Z>{
template<class...As> static auto gemm (As...as){ CUBLAS_CALL(cublasZgemm(as...));}
template<class...As> static auto syrk (As...as){ CUBLAS_CALL(cublasZsyrk(as...));}
template<class...As> static auto herk (As...as){ CUBLAS_CALL(cublasZherk(as...));}
template<class...As> static auto trsm (As...as){ CUBLAS_CALL(cublasZtrsm(as...));}
};
// Real scalar type that accompanies a complex element type: herk_scalar gives the type of the
// alpha/beta arguments of herk (see cublas3<void>::herk). Only C and Z are supported; the primary
// templates are left undefined.
template<class T> struct herk_scalar;
template<> struct herk_scalar<C>{using type = S;};
template<> struct herk_scalar<Z>{using type = D;};
// Same mapping, named after asum (not referenced in the visible part of this header).
template<class T> struct asum_scalar;
template<> struct asum_scalar<C>{using type = S;};
template<> struct asum_scalar<Z>{using type = D;};
template<class T> using herk_scalar_t = typename herk_scalar<T>::type;
// Typed level-3 entry points: each member fixes the cuBLAS signature for a generic element type
// and dispatches to cublas3<T>. Since the per-type members are built on CUBLAS_CALL, errors
// surface as exceptions rather than as a returned status.
template<> struct cublas3<void>{
// 2.7.1. cublas<t>gemm() https://docs.nvidia.com/cuda/cublas/index.html#cublas-lt-t-gt-gemm
template<class T> static auto gemm(cublasHandle_t handle,
cublasOperation_t transa, cublasOperation_t transb,
int m, int n, int k,
const T *alpha,
const T *A, int lda,
const T *B, int ldb,
const T *beta,
T *C, int ldc){MULTI_MARK_SCOPE("cublas3 gemm"); return cublas3<T>::gemm(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);}
// 2.7.6. cublas<t>syrk() https://docs.nvidia.com/cuda/cublas/index.html#cublas-lt-t-gt-syrk
template<class T> static auto syrk(cublasHandle_t handle,
cublasFillMode_t uplo, cublasOperation_t trans,
int n, int k,
const T *alpha,
const T *A, int lda,
const T *beta,
T *C, int ldc){return cublas3<T>::syrk(handle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc);}
// 2.7.13. cublas<t>herk() https://docs.nvidia.com/cuda/cublas/index.html#cublas-lt-t-gt-herk
// alpha and beta are real (herk_scalar_t<T2>); the routine is chosen from the input type T2,
// while the output element type T3 is deduced independently.
template<class T2, class T3> static auto herk(cublasHandle_t handle,
cublasFillMode_t uplo, cublasOperation_t trans,
int n, int k,
const herk_scalar_t<T2> *alpha,
const T2 *A, int lda,
const herk_scalar_t<T2> *beta,
T3 *C, int ldc){return cublas3<T2>::herk(handle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc);}
// 2.7.10. cublas<t>trsm() https://docs.nvidia.com/cuda/cublas/index.html#cublas-lt-t-gt-trsm
// T is deduced from B only: alpha and A are written as std::add_const_t<T>*.
template<class T> static auto trsm(cublasHandle_t handle,
cublasSideMode_t side, cublasFillMode_t uplo,
cublasOperation_t trans, cublasDiagType_t diag,
int m, int n,
std::add_const_t<T> *alpha,
std::add_const_t<T> *A, int lda,
T *B, int ldb){return cublas3<T>::trsm(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb);}
};
namespace cublas{
// translate(x): converts one argument of the generic interface into what the cuBLAS layer takes.
// Fallback for non-integral arguments: the value is returned unchanged (by value).
template<class T, std::enable_if_t<not std::is_integral<T>{}, int> =0> decltype(auto) translate(T t){return t;}
// Variant for lvalues of non-copy-constructible types: the same object is returned by reference.
template<class T, std::enable_if_t<not std::is_copy_constructible<std::decay_t<T>>{}, int> =0> T& translate(T& t){return t;}
// Complex-pointer overloads: a pointer to std::complex or thrust::complex is reinterpreted as a
// pointer to the cuBLAS complex type of the same precision, preserving constness.
// (The casts rely on these types sharing the same two-real-numbers layout.)
// The functions are `inline` because they are ordinary (non-template) functions defined in a
// header; without it, including the header from two translation units defines them twice.
inline auto translate(std::complex<float> const * t){return reinterpret_cast<cublas::complex<float> const*>(t);}
inline auto translate(std::complex<float> * t){return reinterpret_cast<cublas::complex<float> *>(t);}
inline auto translate(std::complex<double> const* t){return reinterpret_cast<cublas::complex<double> const*>(t);}
inline auto translate(std::complex<double> * t){return reinterpret_cast<cublas::complex<double> *>(t);}
inline auto translate(thrust::complex<float> const * t){return reinterpret_cast<cublas::complex<float> const*>(t);}
inline auto translate(thrust::complex<float> * t){return reinterpret_cast<cublas::complex<float> *>(t);}
inline auto translate(thrust::complex<double> const* t){return reinterpret_cast<cublas::complex<double> const*>(t);}
inline auto translate(thrust::complex<double> * t){return reinterpret_cast<cublas::complex<double> *>(t);}
// Device and managed pointers: strip the wrapper with raw_pointer_cast and translate the raw
// pointer. DECLRETURN makes the overload SFINAE-constrained on that inner expression.
template<class T> auto translate(memory::cuda::ptr<T> p) DECLRETURN(translate(raw_pointer_cast(p)))
template<class T> auto translate(memory::cuda::managed::ptr<T> p) DECLRETURN(translate(raw_pointer_cast(p)))
//auto translate(context& c){return c;}
//auto translate(context& c){return c;}
// Integral arguments (sizes, strides, leading dimensions): converted to the `int` that the cuBLAS
// entry points take. In debug builds the value is first checked to be representable as int.
// The comparisons are carried out in (unsigned) long long so that they are meaningful for every
// integral T: negating or casting INT_MAX in T itself wraps around for unsigned types and
// truncates for types narrower than int, which made the previous checks fire on valid values.
template<class T, std::enable_if_t<std::is_integral<T>{},int> = 0>
auto translate(T n){
	// `n < T{1}` reads "zero or negative"; spelled this way so it is not a tautology for unsigned T
	assert( n < T{1} or static_cast<unsigned long long>(n) <= static_cast<unsigned long long>(std::numeric_limits<int>::max()) );
	assert( not (n < T{1}) or static_cast<long long>(n) >= static_cast<long long>(std::numeric_limits<int>::min()) );
	return static_cast<int>(n);
}
// Operation character -> cuBLAS operation: 'N', 'T' and 'C' map to CUBLAS_OP_N/T/C.
// Any other character trips the assertion in debug builds and yields CUBLAS_OP_N otherwise.
// `inline` because this is an ordinary function defined in a header; without it, including the
// header from two translation units defines it twice.
inline auto translate(char O)->cublasOperation_t{
	switch(O){case 'N': return CUBLAS_OP_N; case 'T': return CUBLAS_OP_T; case 'C': return CUBLAS_OP_C;} assert(0);
	return CUBLAS_OP_N;
}
//struct context : std::unique_ptr<std::decay_t<decltype(*cublasHandle_t{})>, decltype(&cublasDestroy)>{
// context() : std::unique_ptr<std::decay_t<decltype(*cublasHandle_t{})>, decltype(&cublasDestroy)>(
// []{MULTI_MARK_SCOPE("multi::cublas::create context"); cublasHandle_t h; cublasCreate(&h); return h;}(), &cublasDestroy
// ){}
// int version() const{
// int ret; cublasGetVersion(get(), &ret); return ret;
// }
// context(context&& other) noexcept = default;
// ~context() noexcept = default;
//// 2.4.7. cublasGetPointerMode()
// auto get_pointer_mode() const{
// cublasPointerMode_t ret; cublasGetPointerMode(get(), &ret);
// return static_cast<enum pointer_mode>(ret);
// }
//// 2.4.8. cublasSetPointerMode() https://docs.nvidia.com/cuda/cublas/index.html#cublassetpointermode
// context& set_pointer_mode(enum pointer_mode m){
// cublasSetPointerMode(get(), static_cast<cublasPointerMode_t>(m)); return *this;
// }
// //set_stream https://docs.nvidia.com/cuda/cublas/index.html#cublassetstream
// //get_stream https://docs.nvidia.com/cuda/cublas/index.html#cublasgetstream
// //get_pointer_mode https://docs.nvidia.com/cuda/cublas/index.html#cublasgetpointermode
// //set_pointer_mode https://docs.nvidia.com/cuda/cublas/index.html#cublasgetpointermode
// template<class...As> auto iamax(As...as) const DECLRETURN(cublas1<>::iamax(get(), translate(as)...))
// template<class...As> auto asum (As...as) const DECLRETURN(cublas1<>::asum (get(), translate(as)...))
// template<class...As> auto scal (As...as) const DECLRETURN(cublas1<>::scal (get(), translate(as)...))
// template<class...As> auto dot (As...as) const DECLRETURN(cublas1<>::dot (get(), translate(as)...))
// template<class...As> auto dotu (As...as) const DECLRETURN(cublas1<>::dotu (get(), translate(as)...))
// template<class...As> auto dotc (As...as) const DECLRETURN(cublas1<>::dotc (get(), translate(as)...))
// template<class S, class Ptr, class T>
// auto nrm2(S n, Ptr p, S incx, memory::cuda::ptr<T> result) // no const because the method is not thread safe
// ->decltype(cublas1<>::nrm2 (get(), translate(n), translate(p), translate(incx), translate(result))){set_pointer_mode(pointer_mode::device);
// auto r=cublas1<>::nrm2 (get(), translate(n), translate(p), translate(incx), translate(result)); set_pointer_mode(pointer_mode::host);
// return r;
// }
// template<class S, class Ptr, class T>
// auto nrm2(S n, Ptr p, S incx, T* result) const{
// return cublas1<>::nrm2 (get(), translate(n), translate(p), translate(incx), translate(result));
// }
// template<class...As> auto copy (As...as) const DECLRETURN(cublas1<>::copy (get(), translate(as)...))
// template<class...As> auto trsv (As...as) const{return cublas2<>::trsv(get(), translate(as)...);}
// template<typename... As> auto gemm(As... as) DECLRETURN(cublas3<>::gemm(get(), translate(as)...))
// template<class...As> auto syrk (As...as) const{return cublas3<>::syrk(get(), translate(as)...);}
// template<class...As> auto herk (As...as) const{return cublas3<>::herk(get(), translate(as)...);}
// template<class...As> auto trsm (As...as) const{return cublas3<>::trsm(get(), translate(as)...);}
//};
//context* get_default_context(){
// thread_local context instance;
// return &instance;
//}
}
}}
namespace boost{
namespace multi{
namespace blas{
// Registers the cuBLAS context with the generic BLAS layer (value, rvalue-ref and lvalue-ref
// forms), and makes device and managed pointers select the default cuBLAS context.
// NOTE(review): cublas::context and cublas::get_default_context are only present as commented-out
// code in this header, so they must be provided by another header included before this point.
template<> struct is_context<boost::multi::cublas::context> : std::true_type{};
template<> struct is_context<boost::multi::cublas::context&&> : std::true_type{};
template<> struct is_context<boost::multi::cublas::context&> : std::true_type{};
template<class T> boost::multi::cublas::context* default_context_of(memory::cuda:: ptr<T> const&){return boost::multi::cublas::get_default_context();}
template<class T> boost::multi::cublas::context* default_context_of(memory::cuda::managed::ptr<T> const&){return boost::multi::cublas::get_default_context();}
//template<class T> boost::multi::cublas::context default_context_of(memory::cuda::managed::ptr<T>){return {};}
//}
//namespace memory{namespace cuda{
// using boost::multi::blas::default_context_of; // to please nvcc 'default_context_of' should be declared prior to the call site or in namespace 'boost::multi::memory::cuda'
//}}
}
}}
namespace boost{
namespace multi{
namespace memory{
namespace cuda{
// iamax through a temporary cublas::context: the arguments are forwarded followed by the address
// of a local int that receives the result, and that result minus one is returned
// (presumably turning a 1-based position into a 0-based index — confirm against callers).
// Enabled only when the context accepts the arguments.
template<class... As>
auto iamax(As... as)
->decltype(cublas::context{}.iamax(as..., std::declval<int*>()), int()){
int r; cublas::context{}.iamax(as..., &r); return r-1;}
// asum through a temporary cublas::context. The result variable has the type that std::abs
// yields for the element type, and is filled through a host pointer passed to the context.
template<class ComplexTconst, typename S>//, typename T = typename std::decay_t<ComplexTconst>::value_type>
auto asum(S n, cuda::ptr<ComplexTconst> x, S incx){
decltype(std::abs(ComplexTconst{})) r;
cublas::context{}.asum(n, raw_pointer_cast(x), incx, &r);
return r;
}
// Thin level-1 wrappers: each one creates a temporary cublas::context and forwards its arguments
// to the member of the same name; DECLRETURN constrains the overload on that call being valid.
// (`dot` is intentionally not provided here; use dotu or dotc.)
template<class...As> auto copy(As... as) DECLRETURN(cublas::context{}.copy(as...))
template<class...As> auto scal(As... as) DECLRETURN(cublas::context{}.scal(as...))
//template<class...As> auto dot (As... as) DECLRETURN(cublas::context{}.dot (as...))
template<class...As> auto dotu(As... as) DECLRETURN(cublas::context{}.dotu(as...))
template<class...As> auto dotc(As... as) DECLRETURN(cublas::context{}.dotc(as...))
template<class...As> auto nrm2(As... as) DECLRETURN(cublas::context{}.nrm2(as...))
template<class S, class Tconst, class T>
auto trsv(char ul, char transA, char a_diag, S n, memory::cuda::ptr<Tconst> A, S lda, memory::cuda::ptr<T> X, S ldc){
cublasFillMode_t uplo = [ul](){
switch(ul){
case 'U': return CUBLAS_FILL_MODE_UPPER;
case 'L': return CUBLAS_FILL_MODE_LOWER;
} assert(0); return CUBLAS_FILL_MODE_UPPER;
}();
cublasOperation_t cutransA = [transA](){
switch(transA){
case 'N': return CUBLAS_OP_N;
case 'T': return CUBLAS_OP_T;
case 'C': return CUBLAS_OP_C;
} assert(0); return CUBLAS_OP_N;
}();
auto cudiag = a_diag=='N'?CUBLAS_DIAG_NON_UNIT:CUBLAS_DIAG_UNIT;
return cublas::context{}.trsv(uplo, cutransA, cudiag, n, A, lda, X, ldc);
}
// gemm through a temporary cublas::context; arguments are forwarded by value and the overload is
// enabled only when the context accepts them.
template<class... As>
auto gemm(As... as)
->decltype(cublas::context{}.gemm(as...)){
return cublas::context{}.gemm(as...);}
template<class Tconst, class T, class UL, class C, class S, class Real>
void syrk(UL ul, C transA, S n, S k, Real alpha, multi::memory::cuda::ptr<Tconst> A, S lda, Real beta, multi::memory::cuda::ptr<T> CC, S ldc){
cublasFillMode_t uplo = [ul](){
switch(ul){
case 'U': return CUBLAS_FILL_MODE_UPPER;
case 'L': return CUBLAS_FILL_MODE_LOWER;
} assert(0); return CUBLAS_FILL_MODE_UPPER;
}();
cublasOperation_t cutransA = [transA](){
switch(transA){
case 'N': return CUBLAS_OP_N;
case 'T': return CUBLAS_OP_T;
case 'C': return CUBLAS_OP_C;
} assert(0); return CUBLAS_OP_N;
}();
return cublas::context{}.syrk(uplo, cutransA, n, k, &alpha, static_cast<T const*>(A), lda, &beta, static_cast<T*>(CC), ldc);
}
template<class Tconst, class T, class UL, class C, class S, class Real>
auto herk(UL ul, C transA, S n, S k, Real alpha, memory::cuda::ptr<Tconst> A, S lda, Real beta, memory::cuda::ptr<T> CC, S ldc){
cublasFillMode_t uplo = [ul](){
switch(ul){
case 'U': return CUBLAS_FILL_MODE_UPPER;
case 'L': return CUBLAS_FILL_MODE_LOWER;
} assert(0); return CUBLAS_FILL_MODE_UPPER;
}();
cublasOperation_t cutransA = [transA](){
switch(transA){
case 'N': return CUBLAS_OP_N;
case 'T': return CUBLAS_OP_T;
case 'C': return CUBLAS_OP_C;
} assert(0); return CUBLAS_OP_N;
}();
return cublas::context{}.herk(uplo, cutransA, n, k, &alpha, raw_pointer_cast(A), lda, &beta, raw_pointer_cast(CC), ldc);
}
// trsm on device pointers through a temporary cublas::context.
// side/uplo/diag characters are mapped inline ('L' -> left / lower, 'N' -> non-unit, anything else
// -> right / upper / unit); trans accepts 'N', 'T' or 'C' and any other value reaches
// __builtin_unreachable(). alpha is taken by value (as the output element type T) and passed by
// address.
// The trailing return type repeats the call with CUBLAS_OP_N as a stand-in for the operation; it
// only serves to constrain the overload on the context accepting these argument types.
template<class Side, class Fill, class Trans, class Diag, typename Size, class Tconst, class T/*, class Alpha*/>
auto trsm(Side /*cublasSideMode_t*/ side, /*cublasFillMode_t*/ Fill uplo, /*cublasOperation_t*/ Trans trans, /*cublasDiagType_t*/ Diag diag,
Size m, Size n, T alpha, cuda::ptr<Tconst> A, Size lda, cuda::ptr<T> B, Size ldb)
->decltype(cublas::context{}.trsm(
side=='L'?CUBLAS_SIDE_LEFT:CUBLAS_SIDE_RIGHT, uplo=='L'?CUBLAS_FILL_MODE_LOWER:CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, diag=='N'?CUBLAS_DIAG_NON_UNIT:CUBLAS_DIAG_UNIT, m, n, &alpha, raw_pointer_cast(A), lda, raw_pointer_cast(B), ldb))
{
cublasOperation_t trans_cu = [&]{
switch(trans){
case 'N': return CUBLAS_OP_N;
case 'T': return CUBLAS_OP_T;
case 'C': return CUBLAS_OP_C;
} __builtin_unreachable();
}();
// T alpha_{alpha};
return cublas::context{}.trsm(
side=='L'?CUBLAS_SIDE_LEFT:CUBLAS_SIDE_RIGHT, uplo=='L'?CUBLAS_FILL_MODE_LOWER:CUBLAS_FILL_MODE_UPPER, trans_cu, diag=='N'?CUBLAS_DIAG_NON_UNIT:CUBLAS_DIAG_UNIT, m, n, &alpha, raw_pointer_cast(A), lda, raw_pointer_cast(B), ldb);
}
}}}}
namespace boost{namespace multi{namespace memory{namespace cuda{namespace managed{
using cuda::iamax;
using cuda::asum;
using cuda::copy;
using cuda::scal;
//using cuda::dot;
using cuda::dotu;
using cuda::dotc;
using cuda::nrm2;
// trsv for managed pointers: both pointers are converted to plain device pointers and the call is
// delegated to cuda::trsv. The result of that call is returned to the caller (it used to be
// discarded), matching what the managed trsm overload does.
template<class S, class Tconst, class T>
auto trsv(char ul, char transA, char a_diag, S n, multi::memory::cuda::managed::ptr<Tconst> A, S lda, cuda::managed::ptr<T> X, S ldc){
	return cuda::trsv(ul, transA, a_diag, n, cuda::ptr<Tconst>(A), lda, cuda::ptr<T>(X), ldc);
}
using cuda::gemm;
using cuda::syrk;
using cuda::herk;
// trsm for managed pointers: both pointers are converted to plain device pointers and the call is
// delegated, returning its result. The inner `trsm` is an unqualified call whose pointer
// arguments are cuda::ptr, so it is expected to resolve to the cuda:: overload, not to this one.
template<class Side, class Fill, class Trans, class Diag, typename Size, class Tconst, class T>
auto trsm(Side /*cublasSideMode_t*/ side, /*cublasFillMode_t*/ Fill uplo, /*cublasOperation_t*/ Trans trans, /*cublasDiagType_t*/ Diag diag,
Size m, Size n, T alpha, cuda::managed::ptr<Tconst> A, Size lda, cuda::managed::ptr<T> B, Size ldb){
return trsm(side, uplo, trans, diag, m, n, alpha, cuda::ptr<Tconst>(A), lda, cuda::ptr<T>(B), ldb);
}
}}}}}
///////////////////////////////////////////////////////////////////////////////
#if not __INCLUDE_LEVEL__ // _TEST_MULTI_ADAPTORS_BLAS_CUDA
#define BOOST_TEST_MODULE "C++ Unit Tests for Multi cuBLAS"
#define BOOST_TEST_DYN_LINK
#include<boost/test/unit_test.hpp>
#include "../../array.hpp"
#include "../../utility.hpp"
#include "../../adaptors/cuda.hpp"
#include "../../adaptors/blas.hpp"
#include "../../adaptors/blas/cuda.hpp"
#include<cassert>
namespace multi = boost::multi;
#if 0
// The two test cases below sit inside the surrounding `#if 0` and are therefore not compiled.
// Version check: requires the cuBLAS context to report at least 10100.
BOOST_AUTO_TEST_CASE(multi_adaptors_blas_cuda_version){
multi::cublas::context c;
BOOST_REQUIRE( c.version() >= 10100 );
}
// iamax on the same four-element complex vector stored in host, device and managed arrays:
// the expected index is 2 in all three cases.
BOOST_AUTO_TEST_CASE(multi_adaptors_blas_cuda_iamax){
using complex = std::complex<double>;
complex const I{0,1};
{
multi::array<complex, 1> const A = {1. + 2.*I, 2., 3. + 3.*I, 4.};
using multi::blas::iamax;
BOOST_REQUIRE( iamax(A) == 2 );
}
{
multi::cuda::array<complex, 1> const A = {1. + 2.*I, 2., 3. + 3.*I, 4.};
using multi::blas::iamax;
BOOST_REQUIRE( iamax(A) == 2 );
}
{
multi::cuda::managed::array<complex, 1> const A = {1. + 2.*I, 2., 3. + 3.*I, 4.};
using multi::blas::iamax;
BOOST_REQUIRE( iamax(A) == 2 );
}
}
#endif
// Debugging aid: calling what(expr) fails to compile, and the diagnostic names the deduced type T.
template<class T> void what(T&&) = delete;
// Conjugated dot product: the result obtained on device arrays and on managed arrays must equal
// the one obtained on the host arrays they were copied from.
BOOST_AUTO_TEST_CASE(multi_adaptors_blas_cuda_dot){
using complex = std::complex<double>;
complex const I{0,1};
multi::array<complex, 1> const A = {1. + 2.*I, 2., 3. + 3.*I, 4.};
multi::array<complex, 1> const B = {2. + 3.*I, 4., 5. + 6.*I, 7.};
namespace blas = multi::blas;
{
// device memory: dot(C(A), B)
multi::cuda::array<complex, 1> const A_gpu = A, B_gpu = B;
using blas::dot;
BOOST_REQUIRE( dot(blas::C(A_gpu), B_gpu) == dot(blas::C(A), B) );
}
{
// managed memory: dot(C(A), A); B_mng is created but not used by the check
multi::cuda::managed::array<complex, 1> const A_mng = A, B_mng = B;
using blas::dot;
BOOST_REQUIRE( dot(blas::C(A_mng), A_mng) == dot(blas::C(A), A) );
}
}
#endif
#endif

View File

@ -0,0 +1,167 @@
#ifdef COMPILATION_INSTRUCTIONS
/usr/local/cuda-11.1/bin/nvcc -x cu -std=c++17 -use_fast_math -lpthread -D_REENTRANT -DBOOST_PP_VARIADICS -Xcudafe "--diag_suppress=implicit_return_from_non_void_function" --extended-lambda --expt-relaxed-constexpr $0 -o $0x `pkg-config --cflags --libs cudart-11.0 cublas-11.0 blas` -lboost_unit_test_framework -DBOOST_LOG_DYN_LINK -lboost_log -lboost_thread -lboost_system -lboost_log_setup -lpthread -lboost_timer&&$0x&&rm $0x; exit
#endif
// © Alfredo A. Correa 2020-2021
#define BOOST_TEST_MODULE "C++ Unit Tests for Multi cuBLAS gemm"
#define BOOST_TEST_DYN_LINK
#include<boost/test/unit_test.hpp>
#include <boost/timer/timer.hpp>
//#include"boost/log/trivial.hpp"
//#define MULTI_MARK_SCOPE(MsG) BOOST_LOG_TRIVIAL(trace)<<MsG
//#include "../../../../adaptors/cublas/context.hpp"
#include "../../../cuda/cublas.hpp"
#include "../../../../array.hpp"
#include "../../../../adaptors/cuda.hpp"
#include "../../../../adaptors/blas.hpp"
#include<random>
namespace multi = boost::multi;
// Multiplies the hermitized 3x2 matrix `a` by the 3x2 matrix `b` (c = aᴴ·b, a 2x2 result)
// on host, device and managed memory, and checks the element c[1][0] == 125 - 84i each time.
BOOST_AUTO_TEST_CASE(multi_adaptors_blas_cuda_gemm_complex_3x2_3x2){
	namespace blas = multi::blas;
	using complex = std::complex<double>;
	complex const I{0, 1};

	multi::array<complex, 2> const a = {
		{1. + 2.*I, 5. + 2.*I},
		{9. - 1.*I, 9. + 1.*I},
		{1. + 1.*I, 2. + 2.*I}
	};
	multi::array<complex, 2> const b = {
		{ 11. - 2.*I, 5. + 2.*I},
		{  7. - 3.*I, 2. + 1.*I},
		{  8. - 1.*I, 1. + 1.*I}
	};

	{ // host memory: assign the product expression to a preallocated array
		multi::array<complex, 2> c({2, 2});
		c = blas::gemm(1., blas::H(a), b);
		BOOST_REQUIRE( c[1][0] == 125.-84.*I );
	}
	{ // device memory
		multi::cuda::array<complex, 2> const a_gpu = a;
		multi::cuda::array<complex, 2> const b_gpu = b;
		{ // assignment into a preallocated device array
			multi::cuda::array<complex, 2> c_gpu({2, 2});
			c_gpu = blas::gemm(1., blas::H(a_gpu), b_gpu);
			BOOST_REQUIRE( c_gpu[1][0] == 125.-84.*I );
		}
		{ // unary plus turns the product expression into an array
			auto c_gpu = +blas::gemm(1.0, blas::H(a_gpu), b_gpu);
			BOOST_REQUIRE( c_gpu[1][0] == 125.-84.*I );
		}
	}
	{ // managed memory
		multi::cuda::managed::array<complex, 2> const a_gpu = a;
		multi::cuda::managed::array<complex, 2> const b_gpu = b;
		{ // five-operand form writing into an existing array with beta = 0
			multi::cuda::managed::array<complex, 2> c_gpu({2, 2});
			blas::gemm(1., blas::H(a_gpu), b_gpu, 0., c_gpu);
			BOOST_REQUIRE( c_gpu[1][0] == 125.-84.*I );
		}
		{ // unary plus turns the product expression into an array
			auto c_gpu = +blas::gemm(1.0, blas::H(a_gpu), b_gpu);
			BOOST_REQUIRE( c_gpu[1][0] == 125.-84.*I );
		}
	}
}
//BOOST_AUTO_TEST_CASE(multi_adaptors_blas_cuda_gemm_complex_3x2_3x2_with_context){
// using complex = std::complex<double>; complex const I{0, 1};
// namespace blas = multi::blas;
// multi::array<complex, 2> const a = {
// {1. + 2.*I, 5. + 2.*I},
// {9. - 1.*I, 9. + 1.*I},
// {1. + 1.*I, 2. + 2.*I}
// };
// multi::array<complex, 2> const b = {
// { 11. - 2.*I, 5. + 2.*I},
// { 7. - 3.*I, 2. + 1.*I},
// { 8. - 1.*I, 1. + 1.*I}
// };
// {
// {
// multi::blas::context ctx;
// multi::array<complex, 2> c({2, 2});
// blas::gemm(ctx, 1., blas::H(a), b, 0., c); // c=ab, c⸆=b⸆a⸆
// BOOST_REQUIRE( c[1][0] == 125.-84.*I );
// }
// }
// {
// multi::cublas::context ctx;
// multi::cuda::array<complex, 2> const a_gpu = a;
// multi::cuda::array<complex, 2> const b_gpu = b;
// {
// multi::cuda::array<complex, 2> c_gpu({2, 2});
// blas::gemm(ctx, 1., blas::H(a_gpu), b_gpu, 0., c_gpu); // c=ab, c⸆=b⸆a⸆
// BOOST_REQUIRE( c_gpu[1][0] == 125.-84.*I );
// }
// {
// auto c_gpu =+ blas::gemm(&ctx, blas::H(a_gpu), b_gpu);
// BOOST_REQUIRE( c_gpu[1][0] == 125.-84.*I );
// }
// }
// {
// multi::cublas::context ctx;
// multi::cuda::managed::array<complex, 2> const a_gpu = a;
// multi::cuda::managed::array<complex, 2> const b_gpu = b;
// {
// multi::cuda::managed::array<complex, 2> c_gpu({2, 2});
// blas::gemm(ctx, 1., blas::H(a_gpu), b_gpu, 0., c_gpu); // c=ab, c⸆=b⸆a⸆
// BOOST_REQUIRE( c_gpu[1][0] == 125.-84.*I );
// }
// {
// auto c_gpu =+ blas::gemm(&ctx, blas::H(a_gpu), b_gpu);
// BOOST_REQUIRE( c_gpu[1][0] == 125.-84.*I );
// }
// }
//}
#if 0
// Timing comparison (currently compiled out by the surrounding `#if 0`) of ten
// 1000x1000 complex gemm calls in three setups:
//  1. host arrays, a fresh blas::context per call;
//  2. device arrays, a fresh multi::cublas::context per call;
//  3. device arrays, one multi::cublas::context shared by all calls.
// Each setup is timed by a boost::timer::auto_cpu_timer living in its own scope;
// the trailing comments on the timer lines are timings recorded by the author.
// The test makes no assertions.
BOOST_AUTO_TEST_CASE(multi_adaptors_blas_cuda_gemm_context_timing){
using complex = std::complex<double>;//complex const I{0, 1};
multi::array<complex, 2> A({1000, 1000});
multi::array<complex, 2> B( {1000, 1000});
// C takes its row count from A and its column count from B (size of the `~B` view).
multi::array<complex, 2> C({size(A), size(~B)});
A[99][99] = B[11][22] = C[33][44] = 1.0;
// Report the combined footprint of the three host arrays in MB.
std::cerr<< "memory " << (A.num_elements()+ B.num_elements() + C.num_elements())*sizeof(complex)/1e6 <<" MB"<<std::endl;
{
// Fill A and B with complex values whose parts are drawn uniformly from [0, 10);
// the generator is default-seeded, so the sequence is the same on every run.
auto rand = [d=std::uniform_real_distribution<>{0., 10.}, g=std::mt19937{}]() mutable{return complex{d(g), d(g)};};
std::generate(A.elements().begin(), A.elements().end(), rand);
std::generate(B.elements().begin(), B.elements().end(), rand);
}
namespace blas = multi::blas;
{
// Setup 1: host arrays, context constructed inside the loop.
boost::timer::auto_cpu_timer t; // 2.398206s
for(auto i = 0; i != 10; ++i){
blas::context ctx;
blas::gemm(ctx, 1, A, B, 0, C);
}
}
using device_array = multi::cuda::array<complex, 2>;
{
// Setup 2: the device arrays are created before the timer starts, so the copies are not timed;
// the cuBLAS context is constructed inside the loop.
device_array A_gpu = A, B_gpu = B, C_gpu({size(A), size(~B)});
boost::timer::auto_cpu_timer t; // 0.707426s
for(auto i = 0; i != 10; ++i){
multi::cublas::context ctx;
blas::gemm(ctx, 1, A_gpu, B_gpu, 0, C_gpu);
}
}
{
// Setup 3: as setup 2, but a single cuBLAS context is constructed once (inside the timed region).
device_array A_gpu = A, B_gpu = B, C_gpu({size(A), size(~B)});
boost::timer::auto_cpu_timer t; // 0.613534s
multi::cublas::context ctx;
for(auto i = 0; i != 10; ++i) blas::gemm(ctx, 1, A_gpu, B_gpu, 0, C_gpu);
}
}
#endif

View File

@ -0,0 +1,34 @@
#ifdef COMPILATION_INSTRUCTIONS
$CXXX $CXXFLAGS $0 -o $0x `pkg-config --libs blas` -Wno-deprecated-declarations `pkg-config --cflags --libs cudart-11.0 cublas-11.0 blas` -lboost_unit_test_framework&&$0x&&rm $0x; exit
#endif
// © Alfredo A. Correa 2019-2020
#define BOOST_TEST_MODULE "C++ Unit Tests for Multi cuBLAS iamax"
#define BOOST_TEST_DYN_LINK
#include<boost/test/unit_test.hpp>
#include "../../../../adaptors/blas.hpp"
#include "../../../../adaptors/cuda.hpp"
#include "../../../../adaptors/blas/cuda.hpp"
namespace multi = boost::multi;
// Expects iamax to return index 2 for the same four complex values stored in
// host, device and managed arrays.
BOOST_AUTO_TEST_CASE(multi_adaptors_blas_cuda_iamax){
	using complex = std::complex<double>;
	complex const I{0, 1};
	using multi::blas::iamax;

	{ // host memory
		multi::array<complex, 1> const host = {1. + 2.*I, 2., 3. + 3.*I, 4.};
		BOOST_REQUIRE( iamax(host) == 2 );
	}
	{ // device memory
		multi::cuda::array<complex, 1> const device = {1. + 2.*I, 2., 3. + 3.*I, 4.};
		BOOST_REQUIRE( iamax(device) == 2 );
	}
	{ // managed memory
		multi::cuda::managed::array<complex, 1> const managed = {1. + 2.*I, 2., 3. + 3.*I, 4.};
		BOOST_REQUIRE( iamax(managed) == 2 );
	}
}

View File

@ -0,0 +1,136 @@
// -*-indent-tabs-mode:t;c-basic-offset:4;tab-width:4;autowrap:nil;-*-
// © Alfredo A. Correa 2019-2021
#ifndef MULTI_ADAPTORS_BLAS_DOT_HPP
#define MULTI_ADAPTORS_BLAS_DOT_HPP
#include "../blas/core.hpp"
#include "../blas/numeric.hpp" // is_complex
#include "../blas/operations.hpp" // blas::C
namespace boost{
namespace multi::blas{
using core::dot ;
using core::dotu;
using core::dotc;
// Dot product of `count` elements starting at x_first and y_first, issued through `ctxt`
// (used with `->`, i.e. a pointer-like context); `rp` is handed to the context call as the result pointer.
//
// Dispatch:
//  - complex element type (is_complex of XIt::value_type):
//      neither iterator conjugated -> ctxt->dotu(x, y)
//      only y conjugated           -> ctxt->dotc(underlying y, x)   (operands swapped so the conjugated one comes first)
//      only x conjugated           -> ctxt->dotc(underlying x, y)
//      both conjugated             -> compile-time error ("not implemented in blas")
//  - otherwise                     -> ctxt->dot(x, y)
//
// Returns an unnamed struct {x_last, y_last} holding both input iterators advanced by `count`.
template<class Context, class XIt, class Size, class YIt, class RPtr>
auto dot_n(Context&& ctxt, XIt x_first, Size count, YIt y_first, RPtr rp){
if constexpr(is_complex<typename XIt::value_type>{}){
;;;; if constexpr (!is_conjugated<XIt>{} and !is_conjugated<YIt>{}) std::forward<Context>(ctxt)->dotu(count, base(x_first) , stride(x_first), base(y_first), stride(y_first), rp);
else if constexpr (!is_conjugated<XIt>{} and is_conjugated<YIt>{}) std::forward<Context>(ctxt)->dotc(count, underlying(base(y_first)), stride(y_first), base(x_first), stride(x_first), rp);
else if constexpr ( is_conjugated<XIt>{} and !is_conjugated<YIt>{}) std::forward<Context>(ctxt)->dotc(count, underlying(base(x_first)), stride(x_first), base(y_first), stride(y_first), rp);
// `!sizeof(XIt*)` is always false but depends on XIt, so the assertion only fires if this branch is instantiated.
else if constexpr ( is_conjugated<XIt>{} and is_conjugated<YIt>{}) static_assert(!sizeof(XIt*), "not implemented in blas");
}else{
std::forward<Context>(ctxt)->dot (count, base(x_first) , stride(x_first), base(y_first), stride(y_first), rp);
}
struct{XIt x_last; YIt y_last;} ret{x_first + count, y_first + count};
return ret;
}
// Context-free dot_n: obtains a default context from the data x_first points to
// (looking through the conjugation wrapper when XIt is conjugated) and forwards
// to the context-taking overload.
template<class XIt, class Size, class YIt, class RPtr>
auto dot_n(XIt x_first, Size count, YIt y_first, RPtr rp){
	if constexpr(not is_conjugated<XIt>{}){
		auto default_ctxt = blas::default_context_of(x_first.base());
		return dot_n(default_ctxt, x_first, count, y_first, rp);
	}else{
		auto default_ctxt = blas::default_context_of(underlying(x_first.base()));
		return dot_n(default_ctxt, x_first, count, y_first, rp);
	}
}
// Dot product of the 1D ranges x and y through the given context, written into r.
// x and y must have the same size (checked by assert). Returns r, perfectly forwarded.
template<class Context, class X1D, class Y1D, class R>
R&& dot(Context&& ctxt, X1D const& x, Y1D const& y, R&& r){
	assert( size(x) == size(y) );
	blas::dot_n(std::forward<Context>(ctxt), begin(x), size(x), begin(y), &r);
	return std::forward<R>(r);
}
// Dot product of the 1D ranges x and y, written into r, using the default context
// obtained from x's data (looking through the conjugation wrapper when X1D is conjugated).
// x and y must have the same size (checked by assert). Returns r, perfectly forwarded.
//
// r is passed on with std::forward: passing the named parameter as a plain lvalue made the
// inner call return `R&`, which cannot bind to this function's `R&&` return type when the
// caller supplies an rvalue result object.
template<class X1D, class Y1D, class R>
R&& dot(X1D const& x, Y1D const& y, R&& r){
	assert( size(x) == size(y) );
	if constexpr(is_conjugated<X1D>{}){
		auto ctxtp = blas::default_context_of(underlying(x.base()));
		return blas::dot(ctxtp, x, y, std::forward<R>(r));
	}else{
		auto ctxtp = blas::default_context_of(x.base());
		return blas::dot(ctxtp, x, y, std::forward<R>(r));
	}
}
// Pointer-like handle to a not-yet-computed dot product: it stores the context pointer,
// the begin iterator of x, the element count and the begin iterator of y.
// The product is evaluated only when the handle is "copied" into a destination through
// copy_n / uninitialized_copy_n, which call blas::dot_n with the destination as result pointer.
template<class ContextPtr, class ItX, class Size, class ItY>
class dot_ptr{
ContextPtr ctxt_;
ItX x_first_;
Size count_;
ItY y_first_;
protected:
// Protected: only derived classes (dot_ref in this file) can create a handle.
dot_ptr(ContextPtr ctxt, ItX x_first, Size count, ItY y_first) : ctxt_{ctxt}, x_first_{x_first}, count_{count}, y_first_{y_first}{}
public:
dot_ptr(dot_ptr const&) = default;
// Evaluates the dot product into *d_first. `count` must be 1 (asserted); returns d_first + count.
// The trailing return type removes this overload when blas::dot_n is not callable with these types.
template<class ItOut, class Size2>
friend constexpr auto copy_n(dot_ptr first, [[maybe_unused]] Size2 count, ItOut d_first)
->decltype(blas::dot_n(std::declval<ContextPtr>(), std::declval<ItX>(), Size{} , std::declval<ItY>(), d_first), d_first + count){assert(count == 1);
return blas::dot_n(first.ctxt_ , first.x_first_ , first.count_, first.y_first_ , d_first), d_first + count;}
// Same operation as copy_n; the body is duplicated rather than delegating (see the nvcc note below).
template<class ItOut, class Size2>
friend constexpr auto uninitialized_copy_n(dot_ptr first, Size2 count, ItOut d_first)
->decltype(blas::dot_n(std::declval<ContextPtr>(), std::declval<ItX>(), Size{} , std::declval<ItY>(), d_first), d_first + count){assert(count == 1);
return blas::dot_n(first.ctxt_ , first.x_first_ , first.count_, first.y_first_ , d_first), d_first + count;}
// ->decltype(copy_n(first, count, d_first)){ // nvcc is not detecting friend copy_n
// return copy_n(first, count, d_first);}
};
// Reference-like proxy for the dot product of x and y. Nothing is computed at construction;
// every call to decay() (and everything built on it: conversion, unary +, comparisons,
// scalar multiplication) evaluates the product anew via copy_n on the underlying dot_ptr.
// Non-copyable.
template<class ContextPtr, class X, class Y, class Ptr = dot_ptr<ContextPtr, typename X::const_iterator, typename X::size_type, typename Y::const_iterator>>
struct dot_ref : private Ptr{
dot_ref(dot_ref const&) = delete;
// Value type of the result: the type of (x element) * (y element).
using decay_type = decltype(typename X::value_type{}*typename Y::value_type{});
// x and y must have equal size (asserted).
dot_ref(ContextPtr ctxt, X const& x, Y const& y) : Ptr{ctxt, begin(x), size(x), begin(y)}{assert(size(x)==size(y));}
// Address-of yields the dot_ptr base, which is what copy_n operates on.
constexpr Ptr const& operator&() const&{return *this;}
// Evaluates the dot product and returns it by value.
decay_type decay() const{decay_type r; copy_n(operator&(), 1, &r); return r;}
operator decay_type() const&{return decay();}
// NOTE(review): this condition is false only when BOTH __CUDACC__ and __INTEL_COMPILER are defined;
// if the intent was to disable the operator for either compiler, `and` would be needed — confirm.
#if not defined(__CUDACC__) or not defined(__INTEL_COMPILER)
friend auto operator*(decay_type const& lhs, dot_ref const& self){return lhs*self.decay();}
#endif
decay_type operator+() const{return decay();}
// Comparisons between two proxies evaluate both products.
bool operator==(dot_ref const& other) const{return decay() == other.decay();}
bool operator!=(dot_ref const& other) const{return decay() != other.decay();}
// Comparisons against any other type are available when `decay() == other` / `decay() != other` is well-formed.
template<class Other>
auto operator==(Other const& other) const
->decltype(decay()==other){
return decay()==other;}
template<class Other>
auto operator!=(Other const& other) const
->decltype(decay()!=other){
return decay()!=other;}
};
template<class Context, class X, class Y> [[nodiscard]]
dot_ref<Context, X, Y> dot(Context const& ctxt, X const& x, Y const& y){return {ctxt, x, y};}
//template<class X, class Y> [[nodiscard]]
//dot_ref<blas::context, X, Y> dot(X const& x, Y const& y){return {blas::context{}, x, y};}
// Builds the dot-product proxy for x and y using the default context obtained from
// x's data (looking through the conjugation wrapper when X is conjugated).
template<class X, class Y> [[nodiscard]]
auto dot(X const& x, Y const& y){
	if constexpr(not is_conjugated<X>{}){
		auto default_ctxt = blas::default_context_of(x.base());
		return blas::dot(default_ctxt, x, y);
	}else{
		auto default_ctxt = blas::default_context_of(underlying(x.base()));
		return blas::dot(default_ctxt, x, y);
	}
}
// Opt-in operator syntax: with this namespace in scope, `(x, y)` denotes dot(x, y)
// whenever that call is well-formed.
namespace operators{
	template<class X1D, class Y1D> [[nodiscard]]
	auto operator,(X1D const& lhs, Y1D const& rhs)
	->decltype(dot(lhs, rhs)){
		return dot(lhs, rhs);
	}
}
}
}
#endif

View File

@ -0,0 +1,123 @@
#ifdef COMPILATION// -*-indent-tabs-mode:t;c-basic-offset:4;tab-width:4;-*-
$CXXX $CXXFLAGS $0 -o $0x `pkg-config --libs blas` -lboost_unit_test_framework&&$0x&&rm $0x;exit
#endif
// © Alfredo A. Correa 2019-2020
#ifndef MULTI_ADAPTORS_BLAS_FILLING_HPP
#define MULTI_ADAPTORS_BLAS_FILLING_HPP
#include "../blas/core.hpp"
#include "../blas/operations.hpp"
#include "../../array_ref.hpp"
namespace boost{
namespace multi{
namespace blas{
// Which triangle of a matrix is filled. The underlying char is presumably the BLAS "uplo" letter.
// Note that the letters are crossed with respect to the names: lower is 'U' and upper is 'L'.
// NOTE(review): this looks deliberate (row-major arrays seen by column-major BLAS swap the triangles) — confirm before changing.
enum class filling : char{
lower = 'U',
upper = 'L'
};
// Shorthand constants for the two enumerators.
MAYBE_UNUSED static constexpr filling U = filling::upper;
MAYBE_UNUSED static constexpr filling L = filling::lower;
// Returns the opposite triangle: lower <-> upper.
// Declared `inline` because this function is defined in a header; without it, including the
// header from more than one translation unit produces multiple definitions at link time.
inline filling flip(filling side){
	switch(side){
		case filling::lower: return filling::upper;
		case filling::upper: return filling::lower;
	}
	// Both enumerators are handled above; tell the compiler control cannot reach this point.
	__builtin_unreachable();
}
// Unary minus flips the triangle; unary plus returns it unchanged.
// Both are `inline` because they are defined in a header (avoids multiple definitions
// when the header is included from several translation units).
inline filling operator-(filling side){return flip(side);}
inline filling operator+(filling side){return side;}
// Helper that guesses which triangle of A holds data by summing absolute values (blas::asum)
// of partial rows, scanning i from size(A) down to 1:
//  - asum_up: row i-1 from column i to the end;
//  - asum_lo: the same slice taken on rotated(A) (presumably the transposed view — confirm).
// A sum that compares unequal to itself (NaN for floating-point values) is taken to mean that part
// is not the stored one; a non-zero sum means that part holds data. If nothing is decisive,
// filling::lower is returned.
// NOTE(review): this overload is enabled only when is_conjugated<A2D> is true and takes a std::false_type tag;
// no call with a tag argument is visible in this chunk.
template<class A2D, std::enable_if_t<is_conjugated<A2D>{}, int> =0>
filling detect_triangular_aux(A2D const& A, std::false_type){
{
for(auto i = size(A); i != 0; --i){
auto const asum_up = blas::asum(begin(A[i-1])+i, end(A[i-1]));
if(asum_up!=asum_up) return filling::lower;
else if(asum_up!=0.) return filling::upper;
auto const asum_lo = blas::asum(begin(rotated(A)[i-1])+i, end(rotated(A)[i-1]));
if(asum_lo!=asum_lo) return filling::upper;
else if(asum_lo!=0.) return filling::lower;
}
}
return filling::lower;
}
// Forward declaration: the one-argument helper below calls detect_triangular before it is defined.
template<class A2D>
filling detect_triangular(A2D const& A);
// Conjugated case (enabled only when is_conjugated<A2D> is true): detect the filling of
// hermitized(A) and flip the answer.
template<class A2D, std::enable_if_t<is_conjugated<A2D>{}, int> =0>
filling detect_triangular_aux(A2D const& A){
return flip(detect_triangular(hermitized(A)));
}
// Guesses which triangle of the 2D array A holds data.
//
// With `if constexpr` available:
//  - non-conjugated A: scan i from size(A) down to 1, summing absolute values (asum) of
//    row i-1 over columns [i, row size) ("up") and of the same slice on rotated(A) ("lo",
//    presumably the transposed view — confirm). A sum unequal to itself (NaN for floating-point
//    values) means that part is not the stored one; a non-zero sum means that part holds data.
//    If the loop is not decisive, filling::lower is returned.
//  - conjugated A: detect on hermitized(A) and flip the result.
// Without `if constexpr`, the work is delegated to detect_triangular_aux.
template<class A2D>
filling detect_triangular(A2D const& A){
#if defined(__cpp_if_constexpr)
if constexpr(not is_conjugated<A2D>{}){
using blas::asum;
for(auto i = size(A); i != 0; --i){
auto const asum_up = asum(A[i-1]({i, A[i-1].size()}));
if(asum_up!=asum_up) return filling::lower;
else if(asum_up!=0.) return filling::upper;
auto const asum_lo = asum(rotated(A)[i-1]({i, rotated(A)[i-1].size()}));
if(asum_lo!=asum_lo) return filling::upper;
else if(asum_lo!=0.) return filling::lower;
}
}else{
return flip(detect_triangular(hermitized(A)));
}
// Reached only from the non-conjugated branch when no row was decisive.
return filling::lower;
#else
return detect_triangular_aux(A);//, is_conjugated<A2D>{});//std::integral_constant<bool, not is_hermitized<A2D>()>{});
#endif
}
}}
}
#if not __INCLUDE_LEVEL__ // _TEST_MULTI_ADAPTORS_BLAS_FILLING
#define BOOST_TEST_MODULE "C++ Unit Tests for Multi adaptors side"
#define BOOST_TEST_DYN_LINK
#include<boost/test/unit_test.hpp>
#include "../../array.hpp"
#include "../../utility.hpp"
#include "../blas/nrm2.hpp"
#include<complex>
#include<cassert>
#include<iostream>
#include<numeric>
#include<algorithm>
using std::cout;
// Prints the 2D array C to std::cout: one row per line, a space after every element,
// then a "---" separator line. Returns the stream.
template<class M>
decltype(auto) print(M const& C){
	using boost::multi::size;
	int row = 0;
	while(row != size(C)){
		int col = 0;
		while(col != size(C[row])){
			cout<< C[row][col] <<' ';
			++col;
		}
		cout<<std::endl;
		++row;
	}
	cout<<"---";
	return cout<<std::endl;
}
namespace multi = boost::multi;
using complex = std::complex<double>;
// Placeholder test case: it performs no checks.
BOOST_AUTO_TEST_CASE(multi_adaptors_blas_side){
}
#endif
#endif

View File

@ -0,0 +1,259 @@
// -*-indent-tabs-mode:t;c-basic-offset:4;tab-width:4;autowrap:nil;-*-
// © Alfredo A. Correa 2019-2021
#ifndef MULTI_ADAPTORS_BLAS_GEMM_HPP
#define MULTI_ADAPTORS_BLAS_GEMM_HPP
#include "../blas/core.hpp"
#include "../blas/gemv.hpp"
#include "../blas/numeric.hpp"
#include "../blas/operations.hpp"
namespace boost{
namespace multi{
namespace blas{
using core::gemm;
// Tag-dispatched helpers for xbase.
// true_type: the iterator is conjugated, so return the pointer underlying its (conjugating) base.
template<class It>
auto xbase_aux(It const& iter, std::true_type const&)
->decltype(underlying(base(iter))){
	return underlying(base(iter));
}
// false_type: plain iterator, return its base pointer as is.
template<class It>
auto xbase_aux(It const& iter, std::false_type const&)
->decltype(base(iter)){
	return base(iter);
}
template<class It>
auto xbase(It const& it)
->decltype(xbase_aux(it, std::integral_constant<bool, is_conjugated<It>{}>{})){
return xbase_aux(it, std::integral_constant<bool, is_conjugated<It>{}>{});}
// Core matrix-matrix product on iterator ranges: processes `a_count` rows starting at a_first
// against the rows starting at b_first and writes into the rows starting at c_first, with scalars
// alpha and beta passed by address to the context's gemm.
// NOTE(review): presumably C <- alpha*A*B + beta*C in the usual BLAS sense; the calls pass b before a,
// which suggests the row-major/column-major operand swap — confirm against the context's gemm signature.
//
// Preconditions (asserted): b and c rows have the same size, and each of a, b, c has unit stride
// in at least one of its two dimensions.
// Nothing is issued when a_count == 0.
//
// The body is a dispatch table: the outer `if constexpr` selects on which of a/b are conjugated,
// and each inner if/else chain selects, from which stride of a, b and c equals 1, the transposition
// flags, operand order and leading dimensions passed to CTXT.gemm. Combinations with no matching
// branch end in assert(0).
//
// Returns c_first + a_count.
// The whole function is a function-try-block: a std::logic_error raised inside is rethrown as a new
// std::logic_error whose message lists the count, strides and row sizes of a, b and c.
template<class Context, class It2DA, class Size, class It2DB, class It2DC>
auto gemm_n(Context&& ctxt, typename It2DA::element alpha, It2DA a_first, Size a_count, It2DB b_first, typename It2DA::element beta, It2DC c_first)
//->decltype(std::forward<Context>(ctxt).gemm('N', 'N', b_first->size(), a_count, a_first->size(), &alpha, xbase(b_first), b_first->size() , xbase(a_first), a_first->size(), &beta, c_first.base(), c_first->size() ), It2DC{})
try{
assert( b_first->size() == c_first->size() );
assert( a_first.stride()==1 or a_first->stride()==1 );
assert( b_first.stride()==1 or b_first->stride()==1 );
assert( c_first.stride()==1 or c_first->stride()==1 );
if(a_count != 0){
// Local shorthand for the forwarded context; undefined again at the end of this block.
#define CTXT std::forward<Context>(ctxt)
// ---- neither a nor b conjugated ----
;;;;; if constexpr(!is_conjugated<It2DA>{} and !is_conjugated<It2DB>{}){
;;;;; if(a_first->stride()==1 and b_first->stride()==1 and c_first->stride()==1){
;;;; if( a_count==1 and b_first->size()==1 ){CTXT.gemm('N', 'N', b_first->size(), a_count, a_first->size(), &alpha, base(b_first), b_first->size() , base(a_first), a_first->size() , &beta, base(c_first), c_first->size() );}
else if( a_count==1 ){CTXT.gemm('N', 'N', b_first->size(), a_count, a_first->size(), &alpha, base(b_first), b_first. stride(), base(a_first), a_first->size() , &beta, base(c_first), c_first->size() );}
else {CTXT.gemm('N', 'N', b_first->size(), a_count, a_first->size(), &alpha, base(b_first), b_first. stride(), base(a_first), a_first. stride(), &beta, base(c_first), c_first. stride());}
}else if(a_first->stride()==1 and b_first->stride()==1 and c_first. stride()==1){
if (a_count==1) {CTXT.gemm('T', 'T', a_count, b_first->size(), a_first->size(), &alpha, base(a_first), a_first. stride(), base(b_first), b_first->size() , &beta, base(c_first), a_first->size() );}
else {CTXT.gemm('T', 'T', a_count, b_first->size(), a_first->size(), &alpha, base(a_first), a_first. stride(), base(b_first), b_first.stride(), &beta, base(c_first), c_first->stride());}
}else if(a_first. stride()==1 and b_first->stride()==1 and c_first->stride()==1){
if (a_count==1) {CTXT.gemm('N', 'T', c_first->size(), a_count, a_first->size(), &alpha, base(b_first), b_first. stride(), base(a_first), a_first->stride(), &beta, base(c_first), a_first->size() );}
else {CTXT.gemm('N', 'T', c_first->size(), a_count, a_first->size(), &alpha, base(b_first), b_first. stride(), base(a_first), a_first->stride(), &beta, base(c_first), c_first.stride());}
}else if(a_first. stride()==1 and b_first->stride()==1 and c_first. stride()==1){
if (a_count==1) {CTXT.gemm('N', 'T', a_count, b_first->size(), a_first->size(), &alpha, base(a_first), a_first->stride(), base(b_first), a_first->size() , &beta, base(c_first), b_first->size() );}
else {CTXT.gemm('N', 'T', a_count, b_first->size(), a_first->size(), &alpha, base(a_first), a_first->stride(), base(b_first), b_first. stride(), &beta, base(c_first), c_first->stride());}
}else if(a_first->stride()==1 and b_first.stride()==1 and c_first. stride()==1){
// NOTE(review): `b_first->size()` is used as a truth value here (no `==1`), unlike the analogous
// test in the first branch above — confirm whether `b_first->size()==1` was intended.
;;;; if(a_count==1 and b_first->size()){CTXT.gemm('N', 'N', c_first->size(), a_count, a_first->size(), &alpha, base(b_first), b_first->size() , base(a_first), a_first->size() , &beta, base(c_first), c_first->stride());}
else if(a_count==1) {CTXT.gemm('N', 'T', c_first->size(), a_count, a_first->size(), &alpha, base(b_first), b_first->stride(), base(a_first), a_first->size() , &beta, base(c_first), c_first->stride());}
else {CTXT.gemm('N', 'T', c_first->size(), a_count, a_first->size(), &alpha, base(b_first), b_first->stride(), base(a_first), a_first.stride() , &beta, base(c_first), c_first->stride());}
}else if(a_first->stride()==1 and b_first. stride()==1 and c_first->stride()==1){
if (a_count==1) {CTXT.gemm('T', 'N', a_count, c_first->size(), a_first->size(), &alpha, base(b_first), b_first->stride(), base(a_first), a_first->size() , &beta, base(c_first), c_first.stride());}
else {CTXT.gemm('T', 'N', c_first->size(), a_count, a_first->size(), &alpha, base(b_first), b_first->stride(), base(a_first), a_first.stride(), &beta, base(c_first), c_first.stride());}
}else if(a_first. stride()==1 and b_first.stride( )==1 and c_first. stride()==1){
{CTXT.gemm('N', 'N', c_first->size(), a_count, a_first->size(), &alpha, base(a_first), a_first->stride(), base(b_first), b_first->stride(), &beta, base(c_first), c_first->stride());}
}else if(a_first. stride()==1 and b_first.stride( )==1 and c_first->stride()==1){
{CTXT.gemm('T', 'T', a_count, c_first->size(), a_first->size(), &alpha, base(b_first), b_first->stride(), base(a_first), a_first->stride(), &beta, base(c_first), c_first. stride());}
}else assert(0);
// ---- only b conjugated: the pointer underlying b's conjugating base is passed with flag 'C' ----
}else if constexpr(!is_conjugated<It2DA>{} and is_conjugated<It2DB>{}){
;;;;; if(a_first->stride()==1 and b_first->stride()==1 and c_first->stride()==1){
// NOTE(review): both arms of this if/else issue the same call.
if(b_first->size()==1) {CTXT.gemm('C', 'N', c_first->size(), a_count, a_first->size(), &alpha, underlying(base(b_first)), b_first->stride(), base(a_first), a_first->size() , &beta, base(c_first), c_first.stride());}
else {CTXT.gemm('C', 'N', c_first->size(), a_count, a_first->size(), &alpha, underlying(base(b_first)), b_first->stride(), base(a_first), a_first->size() , &beta, base(c_first), c_first.stride());}
}else if(a_first->stride()==1 and b_first. stride()==1 and c_first->stride()==1){
if (a_count==1) {CTXT.gemm('C', 'N', a_count, c_first->size(), a_first->size(), &alpha, underlying(base(b_first)), b_first->stride(), base(a_first), a_first->size() , &beta, base(c_first), c_first.stride());}
else {CTXT.gemm('C', 'N', c_first->size(), a_count, a_first->size(), &alpha, underlying(base(b_first)), b_first->stride(), base(a_first), a_first.stride(), &beta, base(c_first), c_first.stride());}
}else if(a_first->stride()==1 and b_first. stride()==1 and c_first. stride()==1){
{CTXT.gemm('C', 'N', c_first->size(), a_count, a_first->size(), &alpha, underlying(base(b_first)), b_first->stride(), base(a_first), a_first. stride(), &beta, base(c_first), c_first->stride());}
}else if(a_first. stride()==1 and b_first. stride()==1 and c_first. stride()==1){
{CTXT.gemm('C', 'T', c_first->size(), a_count, a_first->size(), &alpha, underlying(base(b_first)), b_first->stride(), base(a_first), a_first->stride(), &beta, base(c_first), c_first->stride());}
}else if(a_first. stride()==1 and b_first. stride()==1 and c_first->stride()==1){
{CTXT.gemm('C', 'T', a_count, c_first->size(), a_first->size(), &alpha, underlying(base(b_first)), b_first->stride(), base(a_first), a_first->stride(), &beta, base(c_first), c_first. stride());}
}else assert(0);
// ---- only a conjugated: a single layout is handled ----
}else if constexpr( is_conjugated<It2DA>{} and !is_conjugated<It2DB>{}){
;;;;; if(a_first. stride()==1 and b_first->stride()==1 and c_first->stride()==1){
if (a_count==1) {CTXT.gemm('N', 'C', c_first->size(), a_count, a_first->size(), &alpha, base(b_first), b_first. stride(), underlying(base(a_first)), a_first->stride(), &beta, base(c_first), a_first->size() );}
else {CTXT.gemm('N', 'C', c_first->size(), a_count, a_first->size(), &alpha, base(b_first), b_first. stride(), underlying(base(a_first)), a_first->stride(), &beta, base(c_first), c_first.stride());}
}else assert(0);
// ---- both a and b conjugated: a single layout is handled ----
}else if constexpr( is_conjugated<It2DA>{} and is_conjugated<It2DB>{}){
;;;;; if(a_first. stride()==1 and b_first. stride()==1 and c_first->stride()==1){
{CTXT.gemm('C', 'C', a_count, c_first->size(), a_first->size(), &alpha, underlying(base(b_first)), b_first->stride(), underlying(base(a_first)), a_first->stride(), &beta, base(c_first), c_first. stride());}
}else assert(0);
}
#undef CTXT
}
return c_first + a_count;
}catch(std::logic_error& e){
// Re-throw with the layout of all three operands appended, to make the failing case identifiable.
using std::to_string;
throw std::logic_error{
"couldn't do "+std::string(__PRETTY_FUNCTION__)+" of layout a_count="+std::to_string(a_count)
+" a_strides="+to_string(a_first.stride())+","+to_string(a_first->stride())+" a->size="+to_string(a_first->size())
+" b_strides="+to_string(b_first.stride())+","+to_string(b_first->stride())+" b->size="+to_string(b_first->size())
+" c_strides="+to_string(c_first.stride())+","+to_string(c_first->stride())+" c->size="+to_string(c_first->size())
+" because " + e.what()
};
}
// Context-free gemm_n: forwards to the context-taking overload with a default-constructed
// Context (blas::context unless the caller names another type as the last template argument).
// The trailing return type removes this overload when that call is not well-formed.
template<class It2DA, class Size, class It2DB, class It2DC, class Context = blas::context> // TODO automatic deduction of context
auto gemm_n(typename It2DA::element alpha, It2DA a_first, Size a_count, It2DB b_first, typename It2DA::element beta, It2DC c_first)
->decltype(gemm_n(Context{}, alpha, a_first, a_count, b_first, beta, c_first)){
return gemm_n(Context{}, alpha, a_first, a_count, b_first, beta, c_first);}
// Array-level gemm through a given context, writing into c and returning it (perfectly forwarded).
// Asserted preconditions: a and c have the same number of rows and, unless a is empty,
// the size of the `~a` view equals the number of rows of b.
// When c is conjugated, the call is re-expressed on the conjugates of every operand and scalar;
// otherwise the work is done by gemm_n on the operands' iterators.
template<class Context, class A, class B, class C>
C&& gemm(Context&& ctx, typename A::element alpha, A const& a, B const& b, typename A::element beta, C&& c){
	assert( size( a) == size( c) );
	if(not a.is_empty()){
		assert( size(~a) == size( b) );
	}
	if constexpr(not is_conjugated<C>{}){
		blas::gemm_n(std::forward<Context>(ctx), alpha, begin(a), size(a), begin(b), beta, begin(c));
	}else{
		blas::gemm(std::forward<Context>(ctx), conj(alpha), conj(a), conj(b), conj(beta), conj(c));
	}
	return std::forward<C>(c);
}
// Context-free five-operand gemm: forwards to the context-taking overload with a
// default-constructed blas::context and returns c (perfectly forwarded).
// NOTE(review): unlike the lazy gemm(s, a, b) overload, the context here is not derived
// from the operands via default_context_of — confirm this is intended for device arrays.
template<class A, class B, class C>
C&& gemm(typename A::element alpha, A const& a, B const& b, typename A::element beta, C&& c){
return gemm(blas::context{}, alpha, a, b, beta, std::forward<C>(c));
}
template<class ContextPtr, class Scalar, class ItA, class ItB, class DecayType>
class gemm_range;
// Value returned when a gemm_iterator is dereferenced: it carries only an extensions
// object, exposed through a member and an equivalent free function.
template<class Ext>
struct gemm_reference{ // TODO implement this in terms of gemv_range
	Ext x;
	auto extensions() const -> Ext const&{return x;}
	friend auto extensions(gemm_reference const& self) -> Ext const&{return self.x;}
};
// Iterator over the rows of a not-yet-computed product s*A*B. It stores the context pointer,
// the scalar, an iterator into the rows of A (the only part that moves) and the begin iterator
// of B. The product is evaluated only by the copy_n / copy / uninitialized_copy(_n) friends,
// which call blas::gemm_n into the destination.
template<class ContextPtr, class Scalar, class ItA, class ItB>
class gemm_iterator{
ContextPtr ctxtp_;
Scalar s_;
ItA a_it_;
ItB b_begin_;
// Private: instances are created by gemm_range (declared a friend just below).
gemm_iterator(ContextPtr ctxtp, Scalar s, ItA a_it, ItB b_begin) : ctxtp_{ctxtp}, s_{s}, a_it_{a_it}, b_begin_{b_begin}{}
template<class ContextPtr2, class Scalar2, class ItA2, class ItB2, class DecayType2>
friend class gemm_range;
public:
gemm_iterator(gemm_iterator const&) = default;
using difference_type = typename std::iterator_traits<ItA>::difference_type;
using value_type = typename std::iterator_traits<ItA>::value_type;
using pointer = void*;
// Dereferencing yields a gemm_reference that carries only the extensions of a row of B.
using reference = gemm_reference<decltype(b_begin_->extensions())>;
using iterator_category = std::random_access_iterator_tag; // using iterator_category = std::input_iterator_tag;
static_assert( std::is_base_of<std::random_access_iterator_tag, typename std::iterator_traits<gemm_iterator>::iterator_category>{} );
// Arithmetic moves only the A-row iterator.
gemm_iterator& operator+=(difference_type n){a_it_ += n; return *this;}
gemm_iterator& operator-=(difference_type n){a_it_ -= n; return *this;}
gemm_iterator& operator++(){return operator+=(1);} // required by random access concept requires even if not used explicitly
gemm_iterator& operator--(){return operator-=(1);}
auto operator+(difference_type n) const{gemm_iterator ret{*this}; ret+=n; return ret;}
// Distance in rows of A; both iterators must refer to the same B (asserted).
friend difference_type operator-(gemm_iterator const& a, gemm_iterator const& b){assert(a.b_begin_ == b.b_begin_);
return a.a_it_ - b.a_it_;
}
// Equality compares the A-row position only.
friend bool operator==(gemm_iterator const& a, gemm_iterator const& b){return a.a_it_ == b.a_it_;}
friend bool operator!=(gemm_iterator const& a, gemm_iterator const& b){return a.a_it_ != b.a_it_;}
// Evaluates `count` rows of the product into d_first by calling gemm_n on the dereferenced context
// with beta = 0. Any std::exception is rethrown as std::logic_error with the operand sizes prepended.
// The trailing return type removes this overload when gemm_n is not callable with these types.
template<class ItOut>
friend auto copy_n(gemm_iterator const& first, difference_type count, ItOut d_first)
->decltype(blas::gemm_n(*std::declval<ContextPtr>(), std::declval<Scalar>(), std::declval<ItA>(), count, std::declval<ItB>(), 0., d_first)) try{
return blas::gemm_n(*first.ctxtp_ , first.s_ , first.a_it_ , count, first.b_begin_ , 0., d_first);
}catch(std::exception const& e){
throw std::logic_error(
"in " + std::string(__PRETTY_FUNCTION__) + "\nCouldn't decay product of arrays of size " + std::to_string(count) +"x"+ std::to_string(first.a_it_->size()) + " and " +
std::to_string(first.a_it_->size())+ "x" +std::to_string(first.b_begin_->size()) + " into " + std::to_string(count) +"x" + std::to_string(first.b_begin_->size()) +
"\nbecause\n"+e.what()
);
}
// Range form of copy_n; both ends must carry the same scalar (asserted).
template<class ItOut>
friend auto copy(gemm_iterator const& first, gemm_iterator const& last, ItOut d_first){assert(first.s_ == last.s_);
return copy_n(first, last - first, d_first);
}
// The uninitialized variants simply delegate to the copy variants.
template<class ItOut>
friend auto uninitialized_copy_n(gemm_iterator const& first, difference_type count, ItOut d_first){
return copy_n(first, count, d_first);
}
template<class ItOut>
friend auto uninitialized_copy(gemm_iterator const& first, gemm_iterator const& last, ItOut d_first){assert( first.s_ == last.s_ );
return uninitialized_copy_n(first, last - first, d_first);}
reference operator*() const{return {b_begin_->extensions()};}
};
// Lazy range representing the product s*A*B: stores the context pointer, the scalar, the
// [begin, end) iterators over the rows of A and the begin iterator of B. Nothing is computed
// until the range is converted to DecayType (unary +) or added into an existing array (+=).
// Non-copyable.
template<class ContextPtr, class Scalar, class ItA, class ItB, class DecayType>
class gemm_range{
ContextPtr ctxtp_;
Scalar s_;
ItA a_begin_;
ItA a_end_;
ItB b_begin_;
public:
gemm_range(gemm_range const&) = delete;
gemm_range(ContextPtr ctxtp, Scalar s, ItA a_first, ItA a_last, ItB b_first) : ctxtp_{ctxtp}, s_{s}, a_begin_{a_first}, a_end_{a_last}, b_begin_{b_first}{}
using iterator = gemm_iterator<ContextPtr, Scalar, ItA, ItB>;
using decay_type = DecayType;
using size_type = typename decay_type::size_type;
// begin/end differ only in the A-row iterator they carry.
iterator begin() const{return {ctxtp_, s_, a_begin_, b_begin_};}
iterator end() const{return {ctxtp_, s_, a_end_ , b_begin_};}
friend auto begin(gemm_range const& self){return self.begin();}
friend auto end (gemm_range const& self){return self.end ();}
// Number of rows of the result (= number of rows of A in the range).
size_type size() const{return a_end_ - a_begin_;}
// Extensions of the result: size() combined (via operator*) with the extensions of a row of B.
typename decay_type::extensions_type extensions() const{return size()*b_begin_->extensions();}
friend auto extensions(gemm_range const& self){return self.extensions();}
// operator decay_type() const{return decay_type(*this);} // do not use curly { }
// Unary plus converts the range into a decay_type value.
decay_type operator+() const{return *this;}
// a += range: calls gemm_n with beta = 1., i.e. the destination is passed as an accumulated operand.
template<class Arr>
friend Arr&& operator+=(Arr&& a, gemm_range const& gr){
blas::gemm_n(*gr.ctxtp_, gr.s_, gr.a_begin_, gr.a_end_ - gr.a_begin_, gr.b_begin_, 1., a.begin());
return std::forward<Arr>(a);
}
};
// Builds the lazy product range s*a*b bound to a context pointer.
// Enabled only when dereferencing ContextPtr yields a type for which is_context is true.
// The range is constructed directly in the return slot because gemm_range is non-copyable.
template<class ContextPtr, class Scalar, class A2D, class B2D, class=std::enable_if_t<is_context<decltype(*ContextPtr{})>{}> >
auto gemm(ContextPtr ctxtp, Scalar s, A2D const& a, B2D const& b)
->gemm_range<ContextPtr, Scalar, typename A2D::const_iterator, typename B2D::const_iterator, typename A2D::decay_type/*B2D*/>{
	return {ctxtp, s, begin(a), end(a), begin(b)};
}
//#pragma warning (disable:1011)
//#pragma diag_suppress 0117 //"implicit_return_from_non_void_function"
//#pragma diag_suppress 940 //"implicit_return_from_non_void_function"
// -Xcudafe "--diag_suppress=implicit_return_from_non_void_function"
// Builds the lazy product range s*a*b using the default context obtained from a's data
// (looking through the conjugation wrapper when A2D is conjugated).
template<class Scalar, class A2D, class B2D>
auto gemm(Scalar s, A2D const& a, B2D const& b){
	if constexpr(not is_conjugated<A2D>{}){
		auto default_ctxt = blas::default_context_of(a.base());
		return blas::gemm(default_ctxt, s, a, b);
	}else{
		auto default_ctxt = blas::default_context_of(underlying(a.base()));
		return blas::gemm(default_ctxt, s, a, b);
	}
}
// Opt-in operator syntax: with this namespace in scope, `A*B` evaluates the product
// gemm(1., A, B) into an array (unary plus on the lazy range), whenever that is well-formed.
namespace operators{
	template<class A2D, class B2D>
	auto operator*(A2D const& lhs, B2D const& rhs)
	->decltype(+blas::gemm(1., lhs, rhs)){
		return +blas::gemm(1., lhs, rhs);
	}
}
}}}
#endif

View File

@ -0,0 +1,152 @@
// -*-indent-tabs-mode:t;c-basic-offset:4;tab-width:4;autowrap:nil;-*-
// © Alfredo A. Correa 2019-2020
#ifndef MULTI_ADAPTORS_BLAS_GEMV_HPP
#define MULTI_ADAPTORS_BLAS_GEMV_HPP
#include "../blas/core.hpp"
#include "../blas/dot.hpp"
#include "./../../detail/../utility.hpp"
namespace boost{
namespace multi::blas{
using core::gemv;
// Core matrix-vector product on iterators: `count` rows starting at m_first, the vector starting
// at x_first, scalars a and b, and the output starting at y_first, issued through ctxt.gemv.
// NOTE(review): presumably y <- a*M*x + b*y in the usual BLAS sense — confirm against the context's gemv.
//
// Preconditions (asserted): the matrix has unit stride in one of its two dimensions, and the
// output does not share its base pointer with x or with the matrix.
//
// Dispatch:
//  - non-conjugated matrix: flag 'N' when m_first.stride()==1, flag 'T' when m_first->stride()==1;
//  - conjugated matrix: flag 'C' on the underlying pointer when m_first->stride()==1;
//    the other layout is not implemented and hits assert(0).
//
// Returns an unnamed struct {m_last, y_last} with both iterators advanced by `count`.
template<class Context, class A, class MIt, class Size, class XIt, class B, class YIt>
auto gemv_n(Context&& ctxt, A a, MIt m_first, Size count, XIt x_first, B b, YIt y_first){
assert(m_first->stride()==1 or m_first.stride()==1); // blas doesn't implement this case
assert( x_first.base() != y_first.base() );
if constexpr(not is_conjugated<MIt>{}){
assert( y_first.base() != m_first.base() );
;;;; if(m_first .stride()==1) std::forward<Context>(ctxt).gemv('N', count, m_first->size(), a, m_first.base() , m_first->stride(), x_first.base(), x_first.stride(), b, y_first.base(), y_first.stride());
else if(m_first->stride()==1) std::forward<Context>(ctxt).gemv('T', m_first->size(), count, a, m_first.base() , m_first. stride(), x_first.base(), x_first.stride(), b, y_first.base(), y_first.stride());
else assert(0);
}else{
assert( y_first.base() != underlying(m_first.base()) );
;;;; if(m_first->stride()==1) std::forward<Context>(ctxt).gemv('C', m_first->size(), count, a, underlying(m_first.base()), m_first. stride(), x_first.base(), x_first.stride(), b, y_first.base(), y_first.stride());
else if(m_first. stride()==1) assert(0); // not implemented in blas (use cblas?)
else assert(0); // not implemented in blas
}
struct{
MIt m_last;
YIt y_last;
} ret{m_first + count, y_first + count};
return ret;
}
// Context-free gemv_n: forwards to the context-taking overload with a default-constructed blas::context.
template<class A, class MIt, class Size, class XIt, class B, class YIt>
auto gemv_n(A a, MIt m_first, Size count, XIt x_first, B b, YIt y_first){
return gemv_n(blas::context{}, a, m_first, count, x_first, b, y_first);
}
// In-place form on whole arrays: checks that `m` has as many rows as `w` and that
// `~m` has as many rows as `v`, runs `gemv_n` over all rows of `m` writing through
// `begin(w)`, and hands `w` back perfectly forwarded.
template<class A, class M, class V, class B, class W>
W&& gemv(A const& a, M const& m, V const& v, B const& b, W&& w)
{
	assert(size( m) == size(w) );
	assert(size(~m) == size(v) );

	gemv_n(
		a, begin(m), size(m), begin(v),
		b, begin(w)
	);

	return std::forward<W>(w);
}
// Position inside a lazy matrix-vector product.
// It stores the scale `alpha_`, an iterator into the matrix rows, the start of the
// input vector and a context. The product is only evaluated when a whole range is
// copied (`copy_n`, `copy`, `uninitialized_copy`); dereferencing yields a placeholder 0.
template<class Scalar, class It2D, class It1D, class Context>
class gemv_iterator{
	Scalar alpha_ = 1.;
	It2D m_it_;
	It1D v_first_;
	Context ctxt_;
public:
	using difference_type = typename std::iterator_traits<It2D>::difference_type;
	using value_type = typename std::iterator_traits<It1D>::value_type;
	using pointer = void;
	using reference = void;
	using iterator_category = std::random_access_iterator_tag;

	// Distance in matrix rows; both iterators must refer to the same input vector.
	friend difference_type operator-(gemv_iterator const& a, gemv_iterator const& b){
		assert(a.v_first_ == b.v_first_);
		return a.m_it_ - b.m_it_;
	}

	// Evaluates `count` rows of the product into `result` (passing 0. in the beta
	// position of `gemv_n`) and returns `result + count`.
	template<class It1DOut>
	friend auto copy_n(gemv_iterator first, difference_type count, It1DOut result){
		// `void*` is the "no context" marker also tested by `gemv_range::operator+=`.
		// The previous test against plain `void` could never be true, because this
		// class holds a `Context` data member and cannot be instantiated with `void`.
		if constexpr(std::is_same<Context, void*>{}){
			blas::gemv_n(first.alpha_, first.m_it_, count, first.v_first_, 0., result);
		}else{
			blas::gemv_n(first.ctxt_, first.alpha_, first.m_it_, count, first.v_first_, 0., result);
		}
		return result + count;
	}

	// Range form: delegates to `copy_n` with the row distance `last - first`.
	template<class It1DOut>
	friend auto copy(gemv_iterator first, gemv_iterator last, It1DOut result){
		return copy_n(first, last - first, result);
	}

	// Same as `copy`; restricted at compile time to trivially default
	// constructible output value types.
	template<class It1DOut>
	friend auto uninitialized_copy(gemv_iterator first, gemv_iterator last, It1DOut result){
		static_assert(std::is_trivially_default_constructible<typename It1DOut::value_type>{});
		return copy(first, last, result);
	}

	gemv_iterator(Scalar alpha, It2D m_it, It1D v_first, Context ctxt)
	: alpha_{alpha}, m_it_{m_it}, v_first_{v_first}, ctxt_{ctxt}{}

	// Placeholder: a single element of the product is not computed here.
	value_type operator*() const{return 0.;}
};
// Lazy result of a matrix-vector product: keeps the scale, the matrix row range,
// the start of the input vector and a context. Nothing is computed until the range
// is decayed into `DecayType` or accumulated into a vector with `+=`.
// The range is not copyable.
template<class Scalar, class It2D, class It1D, class DecayType, class Context>
class gemv_range{
	Scalar  alpha_ = 1.;
	It2D    m_begin_;
	It2D    m_end_;
	It1D    v_first_;
	Context ctxt_ = {};

public:
	using iterator   = gemv_iterator<Scalar, It2D, It1D, Context>;
	using decay_type = DecayType;

	gemv_range(gemv_range const&) = delete;

	// Builds the range with a default-initialized context.
	gemv_range(Scalar alpha, It2D m_first, It2D m_last, It1D v_first)
	: alpha_{alpha}, m_begin_{m_first}, m_end_{m_last}, v_first_{v_first}
	{
		assert(m_begin_.stride() == m_end_.stride());
	}

	// Builds the range with a caller-provided context.
	gemv_range(Context&& ctxt, Scalar alpha, It2D m_first, It2D m_last, It1D v_first)
	: alpha_{alpha}, m_begin_{m_first}, m_end_{m_last}, v_first_{v_first}, ctxt_{std::forward<Context>(ctxt)}
	{
		assert(m_begin_.stride() == m_end_.stride());
	}

	iterator begin() const{return iterator{alpha_, m_begin_, v_first_, ctxt_};}
	iterator end()   const{return iterator{alpha_, m_end_  , v_first_, ctxt_};}

	// Number of matrix rows spanned by the range.
	size_type size() const{return end() - begin();}

	typename decay_type::extensions_type extensions() const{
		return typename decay_type::extensions_type{{0, size()}};
	}

	// Materializes the product: default-constructs a `decay_type` and assigns this range to it.
	decay_type decay() const{
		decay_type materialized;
		materialized = *this;
		return materialized;
	}

	// Unary plus is shorthand for `decay()`.
	friend auto operator+(gemv_range const& self){return self.decay();}

	// Accumulates the product into `v` (1. is passed in the beta position of `gemv_n`).
	template<class V>
	friend V&& operator+=(V&& v, gemv_range const& rng){
		auto const rows = rng.m_end_ - rng.m_begin_;
		if constexpr(std::is_same<Context, void*>{}){
			blas::gemv_n(rng.alpha_, rng.m_begin_, rows, rng.v_first_, 1., v.begin());
		}else{
			blas::gemv_n(rng.ctxt_, rng.alpha_, rng.m_begin_, rows, rng.v_first_, 1., v.begin());
		}
		return std::forward<V>(v);
	}
};
template<class Scalar, class M, class V>
auto gemv(Scalar s, M const& m, V const& v)
{//->decltype(gemv_range{s, m, v}){
assert(size(~m) == size(v));
return gemv_range<Scalar, typename M::const_iterator, typename V::const_iterator, typename V::decay_type, blas::context>(s, m.begin(), m.end(), v.begin());}
template<class Context, class Scalar, class M, class V>
auto gemv(Context&& ctxt, Scalar s, M const& m, V const& v)
//->decltype(gemv_ranges, m, v})
{ assert(size(~m) == size(v));
return gemv_range<Scalar, typename M::const_iterator, typename V::const_iterator, typename V::decay_type, Context&&>(std::forward<Context>(ctxt), s, m.begin(), m.end(), v.begin());}
namespace operators{
	// Opt-in syntax: `m % v` evaluates the unit-scaled lazy `blas::gemv(1., m, v)` and
	// applies unary `+` to it before returning. The overload is SFINAE-disabled when
	// that expression is ill-formed.
	template<class M, class V>
	auto operator%(M const& m, V const& v)
	->decltype(+blas::gemv(1., m, v))
	{
		return +blas::gemv(1., m, v);
	}
}
}
}
#endif

View File

@ -0,0 +1,239 @@
#ifdef COMPILATION// -*-indent-tabs-mode:t;c-basic-offset:4;tab-width:4;-*-
$CXXX $CXXFLAGS -DADD_ $0 -o $0x `pkg-config --libs blas` -lboost_unit_test_framework&&$0x&&rm $0x;exit
#endif
// © Alfredo A. Correa 2019-2020
#ifndef MULTI_ADAPTORS_BLAS_GER_HPP
#define MULTI_ADAPTORS_BLAS_GER_HPP
#include "../blas/core.hpp"
namespace boost{
namespace multi{
namespace blas{
using core::ger;
// Counted rank-1 update: requires the rows reachable from `A_first` to have `x_n`
// elements and unit stride, then forwards sizes, scale, base pointers and strides
// to the 9-argument `ger`. Returns `A_first` advanced by `y_n`.
template<class T, class It1, class Size1, class It2, class Size2, class Out>
Out ger_n(T alpha, It1 x_first, Size1 x_n, It2 y_first, Size2 y_n, Out A_first)
{
	assert( A_first->size() == x_n );
	assert( A_first->stride() == 1 );

	ger(
		x_n, y_n, alpha,
		base(x_first), stride(x_first),
		base(y_first), stride(y_first),
		base(A_first), stride(A_first)
	);

	return A_first + y_n;
}
// Iterator-pair rank-1 update: each pair must have matching strides; the element
// counts are measured with `std::distance` and the work is delegated to `ger_n`.
template<class T, class It1, class It2, class Out>
Out ger(T alpha, It1 x_first, It1 x_last, It2 y_first, It2 y_last, Out A_first)
{
	assert( stride(x_first) == stride(x_last) );
	assert( stride(y_first) == stride(y_last) );

	auto const x_count = std::distance(x_first, x_last);
	auto const y_count = std::distance(y_first, y_last);
	return ger_n(alpha, x_first, x_count, y_first, y_count, A_first);
}
// Whole-array rank-1 update of `A` from the vectors `x` and `y`.
// When `A` has a non-unit leading stride it is updated directly (and must have as
// many rows as `y` has elements); when the leading stride is 1 the update is done
// on `rotated(A)` with the roles of `x` and `y` exchanged.
// `A` is returned perfectly forwarded.
template<class T, class X1D, class Y1D, class A2D>
A2D&& ger(T alpha, X1D const& x, Y1D const& y, A2D&& A)
{
	if(stride(A) != 1){
		assert( size(A) == size(y) );
		auto e = ger(alpha, begin(x), end(x), begin(y), end(y), begin(A));
		assert( end(A) == e );
	}else{
		auto e = ger(alpha, begin(y), end(y), begin(x), end(x), begin(rotated(A)));
		assert( end(rotated(A)) == e );
	}
	return std::forward<A2D>(A);
}
// Counted form of the `gerc` update: requires rows of `x_n` unit-stride elements at
// `A_first`, forwards sizes, scale, base pointers and strides to the 9-argument
// `gerc`, and returns `A_first` advanced by `y_n`.
template<class T, class It1, class Size1, class It2, class Size2, class Out>
Out gerc_n(T alpha, It1 x_first, Size1 x_n, It2 y_first, Size2 y_n, Out A_first)
{
	assert( A_first->size() == x_n );
	assert( A_first->stride() == 1 );

	gerc(
		x_n, y_n, alpha,
		base(x_first), stride(x_first),
		base(y_first), stride(y_first),
		base(A_first), stride(A_first)
	);

	return A_first + y_n;
}
// Iterator-pair form of the `gerc` update: checks that each pair has matching
// strides, measures both ranges with `std::distance` and delegates to `gerc_n`.
template<class T, class It1, class It2, class Out>
Out gerc(T alpha, It1 x_first, It1 x_last, It2 y_first, It2 y_last, Out A_first)
{
	assert( stride(x_first) == stride(x_last) );
	assert( stride(y_first) == stride(y_last) );

	auto const x_count = std::distance(x_first, x_last);
	auto const y_count = std::distance(y_first, y_last);
	return gerc_n(alpha, x_first, x_count, y_first, y_count, A_first);
}
// Whole-array form of the `gerc` update of `A` from the vectors `x` and `y`.
// When the leading stride of `A` is 1 the update is applied to `rotated(A)` with
// `x` and `y` exchanged; otherwise `A` is updated directly and must have as many
// rows as `y` has elements.
// Returns `A` perfectly forwarded, matching the sibling `ger` overload. Returning
// `A2D` by value copied the output whenever an rvalue (e.g. a temporary view) was
// passed in.
// NOTE(review): exchanging `x` and `y` on the rotated path also exchanges which
// vector the backend `gerc` treats as the conjugated one - confirm this is intended.
template<class T, class X1D, class Y1D, class A2D>
A2D&& gerc(T alpha, X1D const& x, Y1D const& y, A2D&& A){
	if(stride(A) == 1){
		auto e = gerc(alpha, begin(y), end(y), begin(x), end(x), begin(rotated(A)));
		assert( end(rotated(A)) == e );
		(void)e; // only inspected by the assert above; keeps NDEBUG builds warning-free
	}else{
		assert( size(A) == size(y) );
		auto e = gerc(alpha, begin(x), end(x), begin(y), end(y), begin(A));
		assert( end(A) == e );
		(void)e; // only inspected by the assert above; keeps NDEBUG builds warning-free
	}
	return std::forward<A2D>(A);
}
// Counted form of the `geru` update: requires rows of `x_n` unit-stride elements at
// `A_first`, forwards sizes, scale, base pointers and strides to the 9-argument
// `geru`, and returns `A_first` advanced by `y_n`.
template<class T, class It1, class Size1, class It2, class Size2, class Out>
Out geru_n(T alpha, It1 x_first, Size1 x_n, It2 y_first, Size2 y_n, Out A_first)
{
	assert( A_first->size() == x_n );
	assert( A_first->stride() == 1 );

	geru(
		x_n, y_n, alpha,
		base(x_first), stride(x_first),
		base(y_first), stride(y_first),
		base(A_first), stride(A_first)
	);

	return A_first + y_n;
}
// Iterator-pair form of the `geru` update: checks that each pair has matching
// strides, measures both ranges with `std::distance` and delegates to `geru_n`.
template<class T, class It1, class It2, class Out>
Out geru(T alpha, It1 x_first, It1 x_last, It2 y_first, It2 y_last, Out A_first)
{
	assert( stride(x_first) == stride(x_last) );
	assert( stride(y_first) == stride(y_last) );

	auto const x_count = std::distance(x_first, x_last);
	auto const y_count = std::distance(y_first, y_last);
	return geru_n(alpha, x_first, x_count, y_first, y_count, A_first);
}
// Whole-array form of the `geru` update of `A` from the vectors `x` and `y`.
// When the leading stride of `A` is 1 the update is applied to `rotated(A)` with
// `x` and `y` exchanged; otherwise `A` is updated directly and must have as many
// rows as `y` has elements.
// Returns `A` perfectly forwarded, matching the sibling `ger` overload. Returning
// `A2D` by value copied the output whenever an rvalue (e.g. a temporary view) was
// passed in.
template<class T, class X1D, class Y1D, class A2D>
A2D&& geru(T alpha, X1D const& x, Y1D const& y, A2D&& A){
	if(stride(A) == 1){
		auto e = geru(alpha, begin(y), end(y), begin(x), end(x), begin(rotated(A)));
		assert( end(rotated(A)) == e );
		(void)e; // only inspected by the assert above; keeps NDEBUG builds warning-free
	}else{
		assert( size(A) == size(y) );
		auto e = geru(alpha, begin(x), end(x), begin(y), end(y), begin(A));
		assert( end(A) == e );
		(void)e; // only inspected by the assert above; keeps NDEBUG builds warning-free
	}
	return std::forward<A2D>(A);
}
}}}
#if not __INCLUDE_LEVEL__ // _TEST_MULTI_ADAPTORS_BLAS_GER
#define BOOST_TEST_MODULE "C++ Unit Tests for Multi blas ger"
#define BOOST_TEST_DYN_LINK
#include<boost/test/unit_test.hpp>
#include "../../array.hpp"
#include "../../utility.hpp"
#include<complex>
#include<cassert>
#include<iostream>
#include<numeric>
#include<algorithm>
using std::cout;
namespace multi = boost::multi;
// Smoke test for blas::ger on real arrays, with the output passed both directly and
// through rotated(). Most results are only printed to std::cout; the single active
// check uses plain `assert`, so it is compiled out under NDEBUG.
// The complex gerc/geru cases at the end are disabled with `#if 0`.
BOOST_AUTO_TEST_CASE(multi_blas_ger){
namespace blas = multi::blas;
// case 1: 2x3 zero matrix updated in place, result printed
{
multi::array<double, 2> A = {
{0., 0. ,0.},
{0., 0., 0.}
};
multi::array<double, 1> const x = { 0., 0., 1.};
multi::array<double, 1> const y = { 0., 1.};
blas::ger(1., x, y, A); // A = a*A + (y^T)(x)
for(int i = 0; i != size(A); ++i){
for(int j = 0; j != size(A[i]); ++j)
std::cout << A[i][j] << ' ';
std::cout << std::endl;
}
std::cout << std::endl;
// a = {{2., 3.}, {1., 4.}, {1., 0.}}; GER[1, {1., 2., 5.}, {-2., 1.}, a]; Print[a] : {{0., 4.}, {-3., 6.}, {-9., 5.}}
// assert( A[1][1] == 6. );
}
// case 2: 3x2 zero matrix updated through its rotated view, result printed
{
multi::array<double, 2> A = {
{0., 0.},
{0., 0.},
{0., 0.}
};
multi::array<double, 1> const x = {0., 0., 1.};
multi::array<double, 1> const y = {0., 1.};
blas::ger(1., x, y, rotated(A)); // A^T = a*A^T + (y^T)(x) and A = a*A + (x^T)y
// a = {{2., 3.}, {1., 4.}, {1., 0.}}; GER[1, {1., 2., 5.}, {-2., 1.}, a]; Print[a] : {{0., 4.}, {-3., 6.}, {-9., 5.}}
for(int i = 0; i != size(A); ++i){
for(int j = 0; j != size(A[i]); ++j)
std::cout << A[i][j] << ' ';
std::cout << std::endl;
}
// std::cout << A[1][2] << std::endl;
// assert( A[1][2] == 1. );
}
// case 3: fully commented out
{
// multi::array<double, 2> A = {
// {2., 3., 6., 8.},
// {4., 1., 6., 8.},
// {0., 1., 6., 8.}
// };
// assert( A[1][2] == 6. );
// multi::array<double, 1> const x = { 0., 1., 0.};
// multi::array<double, 1> const y = { 0., 0., 1., 0.};
// multi::blas::ger(0., x, y, rotated(A)); //
// a = {{2., 3.}, {1., 4.}, {1., 0.}}; GER[1, {1., 2., 5.}, {-2., 1.}, a]; Print[a] : {{0., 4.}, {-3., 6.}, {-9., 5.}}
// assert( A[1][1] == 4. );
}
// case 4: 3x4 matrix with a 3-element x and 4-element y; only exercises the call, nothing is checked
{
multi::array<double, 2> A = {
{2., 3., 6., 8.},
{4., 1., 6., 8.},
{0., 1., 6., 8.}
};
multi::array<double, 1> const x = { 1., 2., 5.};
multi::array<double, 1> const y = {-2., 1., 1., 1.};
blas::ger(1., x, y, A); //
// a = {{2., 3.}, {1., 4.}, {1., 0.}}; GER[1, {1., 2., 5.}, {-2., 1.}, a]; Print[a] : {{0., 4.}, {-3., 6.}, {-9., 5.}}
// assert( A[1][1] == 4. );
}
// case 5: 2x3 matrix updated through its rotated view; the only active value check
{
multi::array<double, 2> a = {
{2., 1., 1.},
{3., 4., 0.}
};
multi::array<double, 1> const x = { 1., 2., 5.};
multi::array<double, 1> const y = {-2., 1.};
blas::ger(1., x, y, rotated(a));
// a = {{2., 3.}, {1., 4.}, {1., 0.}}; GER[1, {1., 2., 5.}, {-2., 1.}, a]; Print[a] : {{0., 4.}, {-3., 6.}, {-9., 5.}}
assert( a[1][1] == 6. );
}
// disabled complex cases (gerc / geru); `I` is not declared in the visible part of this file
#if 0
{
multi::array<std::complex<double>, 2> a = {
{2., 3.},
{1., 4.},
{1.,0.}
};
multi::array<std::complex<double>, 1> const x = { 1., 2., 5.};
multi::array<std::complex<double>, 1> const y = {-2., 1.};
multi::blas::gerc(1., x, y, a);
// a = {{2., 3.}, {1., 4.}, {1., 0.}}; GER[1, {1., 2., 5.}, {-2., 1.}, a]; Print[a] : {{0., 4.}, {-3., 6.}, {-9., 5.}}
assert( a[1][1] == 6. );
}
{
multi::array<std::complex<double>, 2> a = {{2. + 1.*I, 3. + 4.*I}, {1.+3.*I, 4. + 2.*I}, {1. + 7.*I, 0.}};
multi::array<std::complex<double>, 1> const x = { 1. + 1.*I, 2. + I*9., 5. + 4.*I};
multi::array<std::complex<double>, 1> const y = {-2. + 8.*I, 1. + 1.*I};
multi::blas::geru(1. + 2.*I, x, y, a); // a = alpha*outer(x, y) + a
// a = {{2. + 1.*I, 3. + 4.*I}, {1. + 3.*I, 4. + 2.*I}, {1. + 7.*I, 0.}}; GER[1 + 2.*I, {1. + 1.*I, 2. + I*9., 5. + 4.*I}, {-2. + 8.*I, 1. + 1.*I}, a]; Print[a];
// {{-20.-13. I,-1.+6. I},{-71.-151. I,-25.-1. I},{-105.-45. I,-17.+11. I}}
std::cout << "a11 " << a[1][1] << std::endl;
assert( a[1][1] == -25. - 1.*I );
}
{
multi::array<std::complex<double>, 2> a = {
{2. + 1.*I, 1. + 3.*I, 1. + 7.*I},
{3. + 4.*I, 4. + 2.*I, 0. + 0.*I}
};
std::cout << "a = " << size(a) << std::endl;
multi::array<std::complex<double>, 1> const x = { 1. + 1.*I, 2. + I*9., 5. + 4.*I};
multi::array<std::complex<double>, 1> const y = {-2. + 8.*I, 1. + 1.*I};
multi::blas::geru(1. + 2.*I, x, y, rotated(a)); // a = alpha*outer(x, y) + a
// a = {{2. + 1.*I, 3. + 4.*I}, {1. + 3.*I, 4. + 2.*I}, {1. + 7.*I, 0.}}; GER[1 + 2.*I, {1. + 1.*I, 2. + I*9., 5. + 4.*I}, {-2. + 8.*I, 1. + 1.*I}, a]; Print[a];
// {{-20.-13. I,-1.+6. I},{-71.-151. I,-25.-1. I},{-105.-45. I,-17.+11. I}}
std::cout << "here a11 " << a[1][1] << std::endl;
assert( a[1][1] == -25. - 1.*I );
}
#endif
}
#endif
#endif

View File

@ -0,0 +1,909 @@
#ifdef COMPILATION// -*-indent-tabs-mode:t;c-basic-offset:4;tab-width:4;-*-
$CXXX $CXXFLAGS $0 -o $0x -lboost_unit_test_framework `pkg-config --libs blas` \
`#-Wl,-rpath,/usr/local/Wolfram/Mathematica/12.0/SystemFiles/Libraries/Linux-x86-64 -L/usr/local/Wolfram/Mathematica/12.0/SystemFiles/Libraries/Linux-x86-64 -lmkl_intel_ilp64 -lmkl_intel_thread -lmkl_core -liomp5` \
-lboost_timer &&$0x&&rm $0x; exit
#endif
// © Alfredo A. Correa 2019-2020
#ifndef MULTI_ADAPTORS_BLAS_HERK_HPP
#define MULTI_ADAPTORS_BLAS_HERK_HPP
#include "../blas/core.hpp"
#include "../blas/copy.hpp"
//#include "../blas/scal.hpp"
#include "../blas/syrk.hpp" // fallback to real case
#include "../blas/side.hpp"
#include "../blas/filling.hpp"
#include "../blas/operations.hpp"
#include "../../config/NODISCARD.hpp"
//#include<iostream> //debug
//#include<type_traits> // void_t
namespace boost{
namespace multi{namespace blas{
// Base pointer of an array for the backend call.
// Plain (non-conjugated) arrays: simply `base(a)`.
template<
	class A,
	std::enable_if_t<not is_conjugated<A>{}, int> =0
>
auto base_aux(A&& a)
->decltype(base(a))
{
	return base(a);
}

// Conjugated arrays: the pointer obtained by `underlying(base(a))`.
template<
	class A,
	std::enable_if_t<    is_conjugated<A>{}, int> =0
>
auto base_aux(A&& a)
->decltype(underlying(base(a)))
{
	return underlying(base(a));
}
using core::herk;
// Core `herk` overload, enabled when `C2D` is a complex array (is_complex_array).
//   c_side      - requested triangle of `c`; translated below into the 'U'/'L' flag
//                 passed to the backend, depending on the strides of `a` and `c`.
//   alpha, beta - scalars, passed to the backend `herk` by address.
//   a           - input matrix; must have as many rows as `c`.
//   c           - square output matrix; returned perfectly forwarded.
// Only the stride combinations spelled out below reach the backend; the remaining
// ones stop at `assert(0)`.
template<class AA, class BB, class A2D, class C2D, class = typename A2D::element_ptr, std::enable_if_t<is_complex_array<C2D>{}, int> =0>
C2D&& herk(filling c_side, AA alpha, A2D const& a, BB beta, C2D&& c)
//->decltype(herk('\0', '\0', c.size(), a.size(), &alpha, base_aux(a), stride(a.rotated()), &beta, base_aux(c), stride(c)), std::forward<C2D>(c))
{
assert( a.size() == c.size() );
assert( c.size() == rotated(c).size() );
// empty output: nothing to compute
if(c.size()==0) return std::forward<C2D>(c);
// conjugated output: update its hermitized counterpart with the flipped triangle instead
if constexpr(is_conjugated<C2D>{}){herk(flip(c_side), alpha, a, beta, hermitized(c)); return std::forward<C2D>(c);}
{
auto base_a = base_aux(a);
auto base_c = base_aux(c); // static_assert( not is_conjugated<C2D>{}, "!" );
// conjugated input `a`
if constexpr(is_conjugated<A2D>{}){
// auto& ctxt = *blas::default_context_of(underlying(a.base()));
// if you get an error here might be due to lack of inclusion of a header file with the backend appropriate for your type of iterator
if(stride(a)==1 and stride(c)!=1) herk(c_side==filling::upper?'L':'U', 'N', size(c), size(rotated(a)), &alpha, base_a, stride(rotated(a)), &beta, base_c, stride(c));
else if(stride(a)==1 and stride(c)==1){
if(size(a)==1) herk(c_side==filling::upper?'L':'U', 'N', size(c), size(rotated(a)), &alpha, base_a, stride(rotated(a)), &beta, base_c, stride(c));
else assert(0);
}
else if(stride(a)!=1 and stride(c)==1) herk(c_side==filling::upper?'U':'L', 'C', size(c), size(rotated(a)), &alpha, base_a, stride( a ), &beta, base_c, stride(rotated(c)));
else if(stride(a)!=1 and stride(c)!=1) herk(c_side==filling::upper?'L':'U', 'C', size(c), size(rotated(a)), &alpha, base_a, stride( a ), &beta, base_c, stride( c ));
else assert(0);
// plain (non-conjugated) input `a`
}else{
// auto& ctxt = *blas::default_context_of( a.base() );
;;;; if(stride(a)!=1 and stride(c)!=1) herk(c_side==filling::upper?'L':'U', 'C', size(c), size(rotated(a)), &alpha, base_a, stride( a ), &beta, base_c, stride(c));
else if(stride(a)!=1 and stride(c)==1){
if(size(a)==1) herk(c_side==filling::upper?'L':'U', 'N', size(c), size(rotated(a)), &alpha, base_a, stride(rotated(a)), &beta, base_c, stride(rotated(c)));
else assert(0);
}
else if(stride(a)==1 and stride(c)!=1) assert(0);//case not implemented, herk(c_side==filling::upper?'L':'U', 'N', size(c), size(rotated(a)), alpha, base_a, stride(rotated(a)), beta, base(c), stride(c));
else if(stride(a)==1 and stride(c)==1) herk(c_side==filling::upper?'U':'L', 'N', size(c), size(rotated(a)), &alpha, base_a, stride(rotated(a)), &beta, base_c, stride(rotated(c)));
else assert(0);
}
}
return std::forward<C2D>(c);
}
// Fallback for outputs that are not complex arrays: forwards all arguments
// unchanged to `syrk`.
template<
	class AA, class BB, class A2D, class C2D,
	class = typename A2D::element_ptr,
	std::enable_if_t<not is_complex_array<C2D>{}, int> =0
>
auto herk(filling c_side, AA alpha, A2D const& a, BB beta, C2D&& c)
->decltype(syrk(c_side, alpha, a, beta, std::forward<C2D>(c)))
{
	return syrk(c_side, alpha, a, beta, std::forward<C2D>(c));
}
//template<class AA, class BB, class A2D, class C2D, class = typename A2D::element_ptr>
//auto herk(filling c_side, AA alpha, A2D const& a, BB beta, C2D&& c)
//->decltype(herk_aux(c_side, alpha, a, beta, std::forward<C2D>(c), is_complex<C2D>{})){
// return herk_aux(c_side, alpha, a, beta, std::forward<C2D>(c), is_complex<C2D>{});}
// Overload without `beta`: calls the five-argument `herk` with 0. in the beta position.
template<
	class AA, class A2D, class C2D,
	class = typename A2D::element_ptr
>
auto herk(filling c_side, AA alpha, A2D const& a, C2D&& c)
->decltype(herk(c_side, alpha, a, 0., std::forward<C2D>(c)))
{
	return herk(c_side, alpha, a, 0., std::forward<C2D>(c));
}
// Overload without a triangle selector: runs `herk` for `filling::upper` first and
// feeds its result into a second `herk` call for `filling::lower`.
template<typename AA, class A2D, class C2D>
auto herk(AA alpha, A2D const& a, C2D&& c)
->decltype(herk(filling::lower, alpha, a, herk(filling::upper, alpha, a, std::forward<C2D>(c))))
{
	return herk(
		filling::lower, alpha, a,
		herk(filling::upper, alpha, a, std::forward<C2D>(c))
	);
}
// Overload without scale or triangle selector: same as `herk(1., a, c)`.
template<class A2D, class C2D>
auto herk(A2D const& a, C2D&& c)
->decltype(herk(1., a, std::forward<C2D>(c)))
{
	return herk(1., a, std::forward<C2D>(c));
}
/*
template<class A2D, class C2D>
NODISCARD("when last argument is const")
auto herk(A2D const& a, C2D const& c)
->decltype(herk(1., a, decay(c))){
return herk(1., a, decay(c));}
*/
// Value-returning form: creates a `Ret` with extensions {size(a), size(a)} and
// passes it, as an rvalue, to the output-parameter overload `herk(alpha, a, c)`.
// `Ret` defaults to the decay type of `A2D`.
template<class AA, class A2D, class Ret = typename A2D::decay_type>
NODISCARD("when argument is read-only")
auto herk(AA alpha, A2D const& a)
{
	return herk(
		alpha, a,
		Ret({size(a), size(a)})
	);
}
// Local extension of std::numeric_limits: identical to the standard trait for every
// type, except that `quiet_NaN()` for std::complex<T> returns a value whose real
// and imaginary parts are both the quiet NaN of `T`.
template<class T>
struct numeric_limits : std::numeric_limits<T>{};

template<class T>
struct numeric_limits<std::complex<T>> : std::numeric_limits<std::complex<T>>{
	static std::complex<T> quiet_NaN(){
		T const nan = numeric_limits<T>::quiet_NaN();
		return std::complex<T>{nan, nan};
	}
};
// Value-returning form with an explicit triangle selector: builds a `Ret` with
// extensions {size(a), size(a)} using the allocator of `a`, and passes it to the
// output-parameter overload `herk(cs, alpha, a, c)`.
// When NDEBUG is defined, the new matrix is additionally filled with quiet NaN.
// NOTE(review): poisoning with NaN only when NDEBUG *is* defined (i.e. in release
// builds) looks inverted relative to the usual debug-aid pattern - confirm whether
// `#ifndef NDEBUG` was intended.
template<class AA, class A2D, class Ret = typename A2D::decay_type>
NODISCARD("because argument is read-only")
auto herk(filling cs, AA alpha, A2D const& a)
->std::decay_t<
decltype(herk(cs, alpha, a, Ret({size(a), size(a)}, 0., get_allocator(a))))>{
return herk(cs, alpha, a, Ret({size(a), size(a)},
#ifdef NDEBUG
numeric_limits<typename Ret::element_type>::quiet_NaN(),
#endif
get_allocator(a)
));
}
// Value-returning form with a triangle selector and unit scale: `herk(s, 1., a)`.
template<class A2D>
auto herk(filling s, A2D const& a)
->decltype(herk(s, 1., a))
{
	return herk(s, 1., a);
}
// Value-returning form with unit scale and no triangle selector: `herk(1., a)`.
// The return type is deduced from the body (no trailing-return SFINAE here).
template<class A2D>
auto herk(A2D const& a)
{
	return herk(1., a);
}
}}
}
#if not __INCLUDE_LEVEL__ // _TEST_MULTI_ADAPTORS_BLAS_HERK
#define BOOST_TEST_MODULE "C++ Unit Tests for Multi cuBLAS herk"
#define BOOST_TEST_DYN_LINK
#include<boost/test/unit_test.hpp>
#include "../../array.hpp"
#include "../../adaptors/blas/gemm.hpp"
#include "../../adaptors/blas/nrm2.hpp"
#include<iostream>
#include<numeric>
namespace utf = boost::unit_test;
namespace multi = boost::multi;
// Deleted function template: any call `what(x)` is ill-formed.
// Presumably a debugging aid to make the compiler print the deduced type `T`.
template<class T> void what(T&&) = delete;
// Debug helper: writes the elements of a 2D array to std::cout, one row per line
// with a space after every element, followed by a blank line.
// Returns the stream (the result of the last insertion).
template<class M> decltype(auto) print(M const& C){
	using boost::multi::size;
	for(int row = 0; row != size(C); ++row){
		for(int col = 0; col != size(C[row]); ++col){
			std::cout << C[row][col] << ' ';
		}
		std::cout << std::endl;
	}
	return std::cout << std::endl;
}
// Real 4x3 input: every herk form used here (output parameter, value-returning,
// default scale, explicit scale) is compared against gemm(a, T(a)).
BOOST_AUTO_TEST_CASE(inq_case){
using namespace multi::blas;
multi::array<double, 2> const a = {
{0, 1, 2},
{3, 4, 5},
{6, 7, 8},
{9, 10, 11}
};
// reference value from gemm alone
BOOST_REQUIRE( gemm(a, T(a))[1][2] == 86. );
// output-parameter form
{
multi::array<double, 2> c({4, 4});
herk(1.0, a, c);
BOOST_REQUIRE( c == gemm(a, T(a)) );
}
// value-returning form with explicit scale
{
multi::array<double, 2> c = herk(1.0, a);
BOOST_REQUIRE( c == gemm(a, T(a)) );
}
// value-returning form with default scale
{
BOOST_REQUIRE( herk(a) == gemm(a, T(a)) );
}
// non-unit scale
{
BOOST_REQUIRE( herk(2.0, a) == gemm(2.0, a, T(a)) );
}
}
// Real 2x3 input: herk without a triangle selector must overwrite both off-diagonal
// elements of a pre-filled 2x2 output, and agree with the value-returning form.
BOOST_AUTO_TEST_CASE(multi_blas_herk_real){
namespace blas = multi::blas;
multi::array<double, 2> const a = {
{ 1., 3., 4.},
{ 9., 7., 1.}
};
{
multi::array<double, 2> c({2, 2}, 9999); // sentinel fill, expected to be overwritten
blas::herk(1., a, c);
BOOST_REQUIRE( c[1][0] == 34 );
BOOST_REQUIRE( c[0][1] == 34 );
multi::array<double, 2> const c_copy = blas::herk(1., a);
BOOST_REQUIRE( c == c_copy );
}
}
// Single-row real input: the result is 1x1 and holds the sum of squares of the row.
BOOST_AUTO_TEST_CASE(multi_blas_herk1x1_case){
namespace blas = multi::blas;
multi::array<double, 2> const A = {{1., 2., 3.}};
multi::array<double, 2> B = blas::herk(A);
BOOST_REQUIRE( size(B) == 1 );
BOOST_REQUIRE( B[0][0] == 1.*1. + 2.*2. + 3.*3. );
}
// Single-row real input with scale 0.1: the 1x1 result is the scaled sum of squares.
BOOST_AUTO_TEST_CASE(multi_blas_herk1x1_case_scale){
namespace blas = multi::blas;
multi::array<double, 2> const A = {{1., 2., 3.}};
multi::array<double, 2> B = blas::herk(0.1, A);
BOOST_REQUIRE( size(B) == 1 );
BOOST_TEST( B[0][0] == (1.*1. + 2.*2. + 3.*3.)*0.1 );
}
// Single-row input of complex element type holding real values: the 1x1 result is
// the sum of squares.
// NOTE(review): `complex` is not declared in the visible part of this file - confirm
// it is provided elsewhere.
BOOST_AUTO_TEST_CASE(multi_blas_herk1x1_complex_real_case){
namespace blas = multi::blas;
multi::array<complex, 2> const A = {{1., 2., 3.}};
multi::array<complex, 2> B = blas::herk(1.0, A);
BOOST_REQUIRE( size(B) == 1 );
BOOST_REQUIRE( B[0][0] == 1.*1. + 2.*2. + 3.*3. );
}
// Same as the complex/real 1x1 case but with scale 0.1; the comparison is done on
// the real part after dividing by the scale, within the test-case tolerance of 0.00001.
BOOST_AUTO_TEST_CASE(multi_blas_herk1x1_complex_real_case_scale, *utf::tolerance(0.00001)){
namespace blas = multi::blas;
multi::array<complex, 2> const A = {{1., 2., 3.}};
multi::array<complex, 2> B = blas::herk(0.1, A);
BOOST_REQUIRE( size(B) == 1 );
BOOST_TEST( real( B[0][0]/0.1 ) == 1.*1. + 2.*2. + 3.*3. );
}
// Single-row complex input: the 1x1 result is the sum of std::norm of the elements,
// and its square root is compared with blas::nrm2 of the row.
// NOTE(review): `complex` and `I` are not declared in the visible part of this file.
BOOST_AUTO_TEST_CASE(multi_blas_herk1x1_complex_case){
namespace blas = multi::blas;
multi::array<complex, 2> const A = {{1. + 2.*I, 2.+3.*I, 3. + 4.*I}};
multi::array<complex, 2> B = blas::herk(A);
BOOST_REQUIRE( size(B) == 1 );
BOOST_REQUIRE( B[0][0] == std::norm(1. + 2.*I) + std::norm(2.+3.*I) + std::norm(3. + 4.*I) );
BOOST_TEST( std::sqrt(real(blas::herk(A)[0][0])) == blas::nrm2(A[0])() );
}
// Single-column complex input passed as blas::H(A), using the full five-argument
// herk with an explicit 1x1 output parameter and the upper triangle selected.
BOOST_AUTO_TEST_CASE(multi_blas_herk1x1_complex_case_hermitized_out_param){
namespace blas = multi::blas;
multi::array<complex, 2> const A = {{1. + 2.*I}, {2.+3.*I}, {3. + 4.*I}};
multi::array<complex, 2> B({1, 1});
BOOST_REQUIRE( size(B) == 1 );
blas::herk(blas::filling::upper, 1.0, blas::H(A), 0.0, B);
BOOST_REQUIRE( B[0][0] == std::norm(1. + 2.*I) + std::norm(2.+3.*I) + std::norm(3. + 4.*I) );
BOOST_TEST( std::sqrt(real(B[0][0])) == blas::nrm2(blas::T(A)[0])() );
}
// Single-column complex input passed as blas::H(A) to the value-returning herk;
// the result is checked against the sum of norms and against blas::nrm2.
BOOST_AUTO_TEST_CASE(multi_blas_herk1x1_complex_case_hermitized){
multi::array<complex, 2> A = {{1. + 2.*I}, {2.+3.*I}, {3. + 4.*I}};
namespace blas = multi::blas;
multi::array<complex, 2> B = blas::herk(blas::H(A));
BOOST_REQUIRE( size(B) == 1 );
BOOST_REQUIRE( B[0][0] == std::norm(1. + 2.*I) + std::norm(2.+3.*I) + std::norm(3. + 4.*I) );
BOOST_TEST( std::sqrt(real(blas::herk(blas::H(A))[0][0])) == blas::nrm2(rotated(A)[0])() );
}
// Same input as the hermitized case, but the result is captured with `auto`:
// the static_assert pins the deduced type to multi::array<complex, 2>.
BOOST_AUTO_TEST_CASE(multi_blas_herk1x1_complex_case_hermitized_auto){
namespace blas = multi::blas;
multi::array<complex, 2> A = {{1. + 2.*I}, {2.+3.*I}, {3. + 4.*I}};
auto B = blas::herk(1., blas::hermitized(A));
static_assert( std::is_same<decltype(B), multi::array<complex, 2>>{}, "!" );
BOOST_REQUIRE( size(B) == 1 );
BOOST_REQUIRE( B[0][0] == std::norm(1. + 2.*I) + std::norm(2.+3.*I) + std::norm(3. + 4.*I) );
BOOST_TEST( std::sqrt(real(blas::herk(blas::H(A))[0][0])) == blas::nrm2(rotated(A)[0])() );
}
#if 1
#if 1
// Complex 2x3 input: exercises the five-argument herk with plain, hermitized (H) and
// transposed (T) views of the input and output. Outputs are pre-filled with the
// sentinel 9999. so each case can check that the element in the triangle that was
// not requested is left untouched. Several sub-cases are commented out.
BOOST_AUTO_TEST_CASE(multi_blas_herk_complex_identity){
namespace blas = multi::blas;
multi::array<complex, 2> const a = {
{ 1. + 3.*I, 3.- 2.*I, 4.+ 1.*I},
{ 9. + 1.*I, 7.- 8.*I, 1.- 3.*I}
};
// plain input, plain output, lower triangle
{
multi::array<complex, 2> c({2, 2}, 9999.);
blas::herk(blas::filling::lower, 1., a, 0., c); // c†=c=aa†=(aa†)†, `c` in lower triangular
BOOST_REQUIRE( c[1][0]==complex(50., -49.) );
BOOST_REQUIRE( c[0][1]==9999. );
}
// plain input, hermitized output view
{
multi::array<complex, 2> c({2, 2}, 9999.);
static_assert(blas::is_conjugated<decltype(blas::H(c))>{}, "!" );
blas::herk(blas::filling::lower, 1., a, 0., blas::H(c)); // c†=c=aa†=(aa†)†, `c` in upper triangular
BOOST_REQUIRE( blas::H(c)[1][0]==complex(50., -49.) );
BOOST_REQUIRE( blas::H(c)[0][1]==9999. );
}
{
// multi::array<complex, 2> c({2, 2}, 9999.);
// blas::herk(blas::filling::lower, 1., a, 0., blas::T(c)); // c†=c=aa†=(aa†)†, `c` in lower triangular
// BOOST_REQUIRE( transposed(c)[1][0]==complex(50., -49.) );
// BOOST_REQUIRE( transposed(c)[0][1]==9999. );
}
{
multi::array<complex, 2> c({3, 3}, 9999.);
// herk(filling::lower, 1., transposed(a), 0., c); // c†=c=aT(aT)† not supported
// print(c);
// BOOST_REQUIRE( c[1][0]==complex(52., -90.) );
// BOOST_REQUIRE( c[0][1]==9999. );
}
{
multi::array<complex, 2> c({3, 3}, 9999.);
// herk(filling::lower, 1., transposed(a), 0., hermitized(c)); // c†=c=aT(aT)† not supported
// BOOST_REQUIRE( hermitized(c)[1][0]==complex(52., -90.) );
// BOOST_REQUIRE( hermitized(c)[0][1]==9999. );
}
// transposed input, transposed output view
{
multi::array<complex, 2> c({3, 3}, 9999.);
herk(blas::filling::lower, 1., blas::T(a), 0., blas::T(c)); // c†=c=aT(aT)† not supported
BOOST_REQUIRE( transposed(c)[1][0]==complex(52., -90.) );
BOOST_REQUIRE( transposed(c)[0][1]==9999. );
}
// transposed input, hermitized-transposed output view
{
multi::array<complex, 2> c({3, 3}, 9999.);
blas::herk(blas::filling::lower, 1., blas::T(a), 0., blas::H(blas::T(c))); // c†=c=aT(aT)† not supported
BOOST_REQUIRE( blas::H(blas::T(c))[1][0]==complex(52., -90.) );
BOOST_REQUIRE( blas::H(blas::T(c))[0][1]==9999. );
}
{
// multi::array<complex, 2> c({3, 3}, 9999.);
// using namespace multi::blas;
// blas::herk(blas::filling::lower, 1., blas::T(a), 0., c); // c†=c=aa†=(aa†)†, `c` in lower triangular
// BOOST_REQUIRE( c[1][0]==complex(50., -49.) );
// BOOST_REQUIRE( c[0][1]==9999. );
}
#if 1
// plain input, upper triangle
{
multi::array<complex, 2> c({2, 2}, 9999.);
blas::herk(blas::U, 1., a, 0., c); // c†=c=aa†=(aa†)†, `c` in upper triangular
BOOST_REQUIRE( c[0][1]==complex(50., +49.) );
BOOST_REQUIRE( c[1][0]==9999. );
}
// no triangle selector: both off-diagonal elements are written
{
multi::array<complex, 2> c({2, 2}, 9999.);
blas::herk(1., a, c); // c†=c=aa†=(aa†)†
BOOST_REQUIRE( c[0][1]==complex(50., +49.) );
BOOST_REQUIRE( c[1][0]==complex(50., -49.) );
}
// hermitized input, lower triangle of a 3x3 output
{
multi::array<complex, 2> c({3, 3}, 9999.);
blas::herk(blas::L, 1., blas::H(a), 0., c); // c†=c=aa†=(aa†)†, `c` in lower triangular
BOOST_REQUIRE( c[1][0]==complex(52., 90.) );
BOOST_REQUIRE( c[0][1]==9999. );
}
{
// multi::array<complex, 2> c({3, 3}, 9999.);
// using namespace multi::blas;
// herk(filling::lower, 1., transposed(a), 0., c); // c†=c=aa†=(aa†)†, `c` in lower triangular
// BOOST_REQUIRE( c[0][1]==9999. );
// BOOST_REQUIRE( c[1][0]==complex(52., 90.) );
}
#endif
}
#if 0
BOOST_AUTO_TEST_CASE(multi_blas_herk_complex_real_case){
multi::array<complex, 2> const a = {
{ 1., 3., 4.},
{ 9., 7., 1.}
};
namespace blas = multi::blas;
using blas::filling;
using blas::transposed;
using blas::hermitized;
{
multi::array<complex, 2> c({3, 3}, 9999.);
herk(filling::lower, 1., hermitized(a), 0., c);//c†=c=a†a=(a†a)†, `c` in lower triangular
BOOST_REQUIRE( c[2][1]==complex(19.,0.) );
BOOST_REQUIRE( c[1][2]==9999. );
}
{
multi::array<complex, 2> c({3, 3}, 9999.);
herk(filling::upper, 1., hermitized(a), 0., c);//c†=c=a†a=(a†a)†, `c` in lower triangular
BOOST_REQUIRE( c[1][2]==complex(19.,0.) );
BOOST_REQUIRE( c[2][1]==9999. );
}
{
multi::array<complex, 2> c({3, 3}, 9999.);
// herk(filling::upper, 1., hermitized(a), 0., transposed(c));//c†=c=a†a=(a†a)†, `c` in lower triangular
// print(transposed(c));
// BOOST_REQUIRE( c[1][2]==complex(19.,0.) );
// BOOST_REQUIRE( c[2][1]==9999. );
}
{
multi::array<complex, 2> c({3, 3}, 9999.);
using blas::transposed;
// herk(filling::upper, 1., transposed(a), 0., c);//c_†=c_=a_†a_=(a_†a_)†, `c_` in lower triangular
// BOOST_REQUIRE( c[2][1] == 9999. );
// BOOST_REQUIRE( c[1][2] == 19. );
}
}
BOOST_AUTO_TEST_CASE(multi_blas_herk_complex_basic_transparent_interface){
multi::array<complex, 2> const a = {
{ 1. + 3.*I, 3.- 2.*I, 4.+ 1.*I},
{ 9. + 1.*I, 7.- 8.*I, 1.- 3.*I}
};
namespace blas = multi::blas;
using blas::filling;
using blas::hermitized;
{
multi::array<complex, 2> c({3, 3}, 9999.);
herk(filling::lower, 1., hermitized(a), 0., c); // c†=c=a†a=(a†a)†, information in `c` lower triangular
BOOST_REQUIRE( c[2][1]==complex(41.,2.) );
BOOST_REQUIRE( c[1][2]==9999. );
}
{
multi::array<complex, 2> c({3, 3}, 9999.);
using multi::blas::herk;
herk(filling::upper, 1., hermitized(a), 0., c); // c†=c=a†a=(a†a)†, `c` in upper triangular
BOOST_REQUIRE( c[1][2]==complex(41., -2.) );
BOOST_REQUIRE( c[2][1]==9999. );
}
{
multi::array<complex, 2> c({2, 2}, 9999.);
using multi::blas::herk;
herk(filling::lower, 1., a, 0., c); // c†=c=aa†, `a` and `c` are c-ordering, information in c lower triangular
BOOST_REQUIRE( c[1][0]==complex(50., -49.) );
BOOST_REQUIRE( c[0][1]==9999. );
}
{
multi::array<complex, 2> c({2, 2}, 9999.);
using multi::blas::herk;
herk(filling::upper, 1., a, 0., c); //c†=c=aa†, `c` in upper triangular
BOOST_REQUIRE( c[0][1]==complex(50., 49.) );
BOOST_REQUIRE( c[1][0]==9999. );
}
}
BOOST_AUTO_TEST_CASE(multi_blas_herk_complex_basic_enum_interface){
multi::array<complex, 2> const a = {
{ 1. + 3.*I, 3.- 2.*I, 4.+ 1.*I},
{ 9. + 1.*I, 7.- 8.*I, 1.- 3.*I}
};
namespace blas = multi::blas;
using blas::filling;
using blas::hermitized;
using blas::transposed;
{
// multi::array<complex, 2> c({2, 2}, 8888.);
// std::cerr << "here" << std::endl;
// herk(filling::lower, 1., hermitized(transposed(a)), 0., c); //c†=c=a†a=(a†a)†, `c` in lower triangular
// print(c) << std::endl;
// std::cerr << "there" << std::endl;
// BOOST_REQUIRE( c[0][1]==complex(41.,2.) );
// BOOST_REQUIRE( c[1][0]==8888. );
}
{
multi::array<complex, 2> c({3, 3}, 9999.);
herk(filling::lower, 1., hermitized(a), 0., c); //c†=c=a†a=(a†a)†, `c` in lower triangular
BOOST_REQUIRE( c[2][1]==complex(41.,2.) );
BOOST_REQUIRE( c[1][2]==9999. );
}
{
multi::array<complex, 2> c({3, 3}, 9999.);
using namespace multi::blas;
herk(filling::upper, 1., hermitized(a), 0., c); //c†=c=a†a=(a†a)†, `c` in upper triangular
BOOST_REQUIRE( c[1][2]==complex(41., -2.) );
BOOST_REQUIRE( c[2][1]==9999. );
}
{
multi::array<complex, 2> c({2, 2}, 9999.);
using namespace multi::blas;
herk(filling::lower, 1., a, 0., c); // c†=c=aa†, `c` in lower triangular
BOOST_REQUIRE( c[1][0]==complex(50., -49.) );
BOOST_REQUIRE( c[0][1]==9999. );
}
{
multi::array<complex, 2> c({2, 2}, 9999.);
using namespace multi::blas;
herk(filling::upper, 1., a, 0., c); // c†=c=aa†, `c` in upper triangular
BOOST_REQUIRE( c[0][1]==complex(50., 49.) );
BOOST_REQUIRE( c[1][0]==9999. );
}
}
// Exercises the explicit five-argument form herk(filling, alpha, a, beta, c) on a complex 2x3 matrix `a`
// and checks the value-returning herk(...) overloads against the equivalent gemm(...) products.
// `c` is pre-filled with the sentinel 9999. so that the triangle herk must not touch can be verified.
BOOST_AUTO_TEST_CASE(multi_blas_herk_complex_basic_explicit_enum_interface){
multi::array<complex, 2> const a = {
{ 1. + 3.*I, 3.- 2.*I, 4.+ 1.*I},
{ 9. + 1.*I, 7.- 8.*I, 1.- 3.*I}
};
using namespace multi::blas;
{
multi::array<complex, 2> c({3, 3}, 9999.);
herk(filling::lower, 1., hermitized(a), 0., c); // c†=c=a†a=(a†a)†, `c` in lower triangular
BOOST_REQUIRE( c[2][1]==complex(41.,2.) );
BOOST_REQUIRE( c[1][2]==9999. ); // upper triangle still holds the sentinel
}
BOOST_REQUIRE( herk(hermitized(a)) == gemm(hermitized(a), a) );
{
// disabled case: output into a transposed view of `c`
multi::array<complex, 2> c({3, 3}, 9999.);
// herk(filling::lower, 1., hermitized(a), 0., transposed(c)); // c†=c=a†a=(a†a)†, `c` in lower triangular
// print(transposed(c));
// BOOST_REQUIRE( c[2][1]==complex(41.,2.) );
// BOOST_REQUIRE( c[1][2]==9999. );
}
{
multi::array<complex, 2> c({2, 2}, 9999.);
herk(filling::lower, 1., hermitized(transposed(a)), 0., transposed(c)); // input is conj(a): transposed(c)=conj(a)·conj(a)†=(aa†)*, lower triangle of transposed(c)
BOOST_REQUIRE( transposed(c)[1][0]==complex(50.,+49.) ); // conjugate of (aa†)[1][0] = 50-49i
BOOST_REQUIRE( transposed(c)[0][1]==9999. );
}
// BOOST_REQUIRE( herk(hermitized(transposed(a))) == gemm(hermitized(transposed(a)), transposed(a)) );
{
multi::array<complex, 2> c({3, 3}, 9999.);
herk(filling::upper, 1., hermitized(a), 0., c); // c†=c=a†a=(a†a)†, `c` in upper triangular
BOOST_REQUIRE( c[1][2]==complex(41., -2.) );
BOOST_REQUIRE( c[2][1]==9999. );
BOOST_REQUIRE( herk(hermitized(a)) == gemm(hermitized(a), a) );
}
{
multi::array<complex, 2> c({2, 2}, 9999.);
herk(filling::lower, 1., a, 0., c); // c†=c=aa†=(aa†)†, `c` in lower triangular
BOOST_REQUIRE( c[1][0]==complex(50., -49.) );
BOOST_REQUIRE( c[0][1]==9999. );
BOOST_REQUIRE( herk(a) == gemm(a, hermitized(a)) );
}
{
multi::array<complex, 2> c({2, 2}, 9999.);
herk(filling::upper, 1., a, 0., c); // c†=c=aa†=(aa†)†, `c` in upper triangular
BOOST_REQUIRE( c[0][1]==complex(50., 49.) );
BOOST_REQUIRE( c[1][0]==9999. );
BOOST_REQUIRE( herk(a) == gemm(a, hermitized(a)) );
}
{
// alpha = 2 doubles every computed element
multi::array<complex, 2> c({2, 2}, 9999.);
herk(filling::upper, 2., a, 0., c); // c†=c=aa†=(aa†)†, `c` in upper triangular
BOOST_REQUIRE( c[0][1]==complex(100., 98.) );
BOOST_REQUIRE( c[1][0]==9999. );
BOOST_REQUIRE( herk(2., a) == gemm(2., a, hermitized(a)) );
}
{
multi::array<complex, 2> c({2, 2}, 9999.);
herk(filling::upper, 1., a, 0., c); // c†=c=aa†=(aa†)†, `c` in upper triangular
BOOST_REQUIRE( c[0][1]==complex(50., 49.) );
BOOST_REQUIRE( c[1][0]==9999. );
BOOST_REQUIRE( herk(1., a) == gemm(1., a, hermitized(a)) );
}
}
// Calls herk with only the names each scope brings in through using-declarations
// (the remaining names are left to argument-dependent lookup), for both a†a (3x3) and aa† (2x2).
BOOST_AUTO_TEST_CASE(multi_blas_herk_complex_automatic_operator_interface){
multi::array<complex, 2> const a = {
{ 1. + 3.*I, 3.- 2.*I, 4.+ 1.*I},
{ 9. + 1.*I, 7.- 8.*I, 1.- 3.*I}
};
{
multi::array<complex, 2> c({3, 3}, 9999.);
namespace blas = multi::blas;
using blas::filling;
using blas::hermitized;
herk(filling::lower, 1., hermitized(a), 0., c); // c=c†=a†a, `c` in lower triangular
BOOST_REQUIRE( c[2][1]==complex(41., 2.) );
BOOST_REQUIRE( c[1][2]==9999. ); // upper triangle keeps the sentinel
}
{
multi::array<complex, 2> c({2, 2}, 9999.);
using multi:: blas::filling;
herk(filling::lower, 1., a, 0., c); // c=c†=aa†, `c` in lower triangular
BOOST_REQUIRE( c[1][0]==complex(50., -49.) );
BOOST_REQUIRE( c[0][1]==9999. );
}
{
multi::array<complex, 2> c({2, 2}, 9999.);
using multi::blas::herk;
herk(1., a, c); // c=c†=aa†; no filling argument: both triangles are checked below
BOOST_REQUIRE( c[1][0]==complex(50., -49.) );
BOOST_REQUIRE( c[0][1]==complex(50., +49.) );
}
{
multi::array<complex, 2> c({3, 3}, 9999.);
namespace blas = multi::blas;
using blas::filling;
using blas::hermitized;
herk(filling::lower, 1., hermitized(a), 0., c); // c=c†=a†a, `c` in lower triangular
herk(filling::upper, 1., hermitized(a), 0., c); // second call fills the upper triangle of the same `c`
BOOST_REQUIRE( c[2][1]==complex(41., 2.) );
BOOST_REQUIRE( c[1][2]==complex(41., -2.) );
}
}
// Four-argument form herk(filling, alpha, a, c): the `beta` argument is omitted.
// The expected values are the same as in the explicit beta = 0. tests.
BOOST_AUTO_TEST_CASE(multi_blas_herk_complex_automatic_operator_interface_implicit_no_sum){
multi::array<complex, 2> const a = {
{ 1. + 3.*I, 3.- 2.*I, 4.+ 1.*I},
{ 9. + 1.*I, 7.- 8.*I, 1.- 3.*I}
};
{
multi::array<complex, 2> c({3, 3}, 9999.);
namespace blas = multi::blas;
using blas::filling;
using blas::hermitized;
herk(filling::lower, 1., hermitized(a), c); // c=c†=a†a, `c` in lower triangular
BOOST_REQUIRE( c[2][1]==complex(41., 2.) );
BOOST_REQUIRE( c[1][2]==9999. ); // untouched sentinel in the upper triangle
}
{
multi::array<complex, 2> c({2, 2}, 9999.);
using multi::blas::filling;
herk(filling::lower, 1., a, c); // c=c†=aa†, `c` in lower triangular
BOOST_REQUIRE( c[1][0]==complex(50., -49.) );
BOOST_REQUIRE( c[0][1]==9999. );
}
}
// Covers the shorter herk overloads on a complex 2x3 matrix `a`:
//  - with a filling argument only the named triangle is checked (the other holds the sentinel or is not asserted),
//  - without a filling argument both triangles are checked,
//  - value-returning forms herk(filling, alpha, a), herk(alpha, a), herk(filling, a).
BOOST_AUTO_TEST_CASE(multi_blas_herk_complex_automatic_ordering_and_symmetrization){
multi::array<complex, 2> const a = {
{ 1. + 3.*I, 3.- 2.*I, 4.+ 1.*I},
{ 9. + 1.*I, 7.- 8.*I, 1.- 3.*I}
};
namespace blas = multi::blas;
using blas::herk;
using blas::hermitized;
using blas::filling;
{
multi::array<complex, 2> c({3, 3}, 9999.);
herk(filling::upper, 1., hermitized(a), c); // c†=c=a†a
BOOST_REQUIRE( c[2][1]==9999. );
BOOST_REQUIRE( c[1][2]==complex(41., -2.) );
}
{
multi::array<complex, 2> c({3, 3}, 9999.);
herk(1., hermitized(a), c); // c†=c=a†a
BOOST_REQUIRE( c[2][1]==complex(41., +2.) );
BOOST_REQUIRE( c[1][2]==complex(41., -2.) );
}
{
multi::array<complex, 2> c({2, 2}, 9999.);
herk(filling::upper, 1., a, c); // c†=c=aa† // c implicit hermitic in upper
BOOST_REQUIRE( c[1][0] == 9999. );
BOOST_REQUIRE( c[0][1] == complex(50., +49.) );
}
{
multi::array<complex, 2> c({2, 2}, 9999.);
herk(1., a, c); // c†=c=aa†
BOOST_REQUIRE( c[1][0] == complex(50., -49.) );
BOOST_REQUIRE( c[0][1] == complex(50., +49.) );
}
{
multi::array<complex, 2> c = herk(filling::upper, 1., a); // c†=c=aa†
// lower triangle of the returned array is not asserted
// BOOST_REQUIRE( c[1][0] == complex(50., -49.) );
BOOST_REQUIRE( c[0][1] == complex(50., +49.) );
}
{
using multi::blas::herk;
using multi::blas::filling;
multi::array<complex, 2> c = herk(1., a); // c†=c=aa†
BOOST_REQUIRE( c[1][0] == complex(50., -49.) );
BOOST_REQUIRE( c[0][1] == complex(50., +49.) );
}
{
using multi::blas::herk;
using multi::blas::hermitized;
using multi::blas::filling;
multi::array<complex, 2> c = herk(filling::upper, 1., hermitized(a)); // c†=c=a†a
BOOST_REQUIRE( size(hermitized(a))==3 );
// BOOST_REQUIRE( c[2][1] == complex(41., +2.) );
BOOST_REQUIRE( c[1][2] == complex(41., -2.) );
}
{
using multi::blas::herk;
using multi::blas::filling;
multi::array<complex, 2> c = herk(filling::upper, a); // c†=c=aa† (2x2; the asserted value is (aa†)[0][1])
// what(multi::pointer_traits<decltype(base(a))>::default_allocator_of(base(a)));
// BOOST_REQUIRE( c[1][0] == complex(50., -49.) );
BOOST_REQUIRE( c[0][1] == complex(50., +49.) );
}
{
using multi::blas::herk;
using multi::blas::hermitized;
using multi::blas::filling;
multi::array<complex, 2> c = herk(filling::upper, hermitized(a)); // c†=c=a†a
// BOOST_REQUIRE( c[2][1] == complex(41., +2.) );
BOOST_REQUIRE( c[1][2] == complex(41., -2.) );
}
}
// Single-row complex matrix with purely real entries: aa† is the 1x1 matrix {1 + 9 + 16} = {26}.
BOOST_AUTO_TEST_CASE(multi_blas_herk_complex_size1_real_case){
multi::array<complex, 2> const a = {
{1., 3., 4.}
};
using namespace multi::blas;
{
multi::array<complex, 2> c({1, 1}, 9999.);
herk(filling::upper, 1., a, c); // c†=c=aa†
BOOST_TEST( c[0][0] == 26. );
}
// the value-returning herk must agree with the explicit product
BOOST_TEST( herk(a) == gemm(a, hermitized(a)) );
}
// Single-row complex matrix: aa† is the 1x1 matrix holding the sum of squared moduli, 17 + 13 + 17 = 47.
BOOST_AUTO_TEST_CASE(multi_blas_herk_complex_size1){
multi::array<complex, 2> const a = {
{1. + 4.*I, 3. + 2.*I, 4. - 1.*I}
};
using namespace multi::blas;
{
multi::array<complex, 2> c({1, 1}, 9999.);
herk(filling::upper, 1., a, c); // c†=c=aa†
BOOST_TEST( c[0][0] == 47. );
}
}
// Degenerate case: both the input and the output are default-constructed arrays.
// Nothing is asserted; the test only requires that the call completes.
BOOST_AUTO_TEST_CASE(multi_blas_herk_complex_size0){
multi::array<complex, 2> const a;
using namespace multi::blas;
{
multi::array<complex, 2> c;
herk(filling::upper, 1., a, c); // c†=c=aa†
// BOOST_TEST( c[0][0] == 47. );
}
}
// Same overload coverage as the complex ordering/symmetrization test, but the complex-typed
// matrix `a` has purely real entries: (aa†)[0][1] = 9+21+4 = 34 and (a†a)[1][2] = 12+7 = 19.
// Only the upper triangle is asserted in every sub-case.
BOOST_AUTO_TEST_CASE(multi_blas_herk_complex_automatic_ordering_and_symmetrization_real_case){
multi::array<complex, 2> const a = {
{ 1., 3., 4.},
{ 9., 7., 1.}
};
using namespace multi::blas;
{
multi::array<complex, 2> c({3, 3}, 9999.);
herk(filling::upper, 1., hermitized(a), c); // c†=c=a†a
// BOOST_REQUIRE( c[2][1]==19. );
BOOST_REQUIRE( c[1][2]==19. );
}
{
multi::array<complex, 2> c({2, 2}, 9999.);
herk(filling::upper, 1., a, c); // c†=c=aa†
// BOOST_REQUIRE( c[1][0] == 34. );
BOOST_REQUIRE( c[0][1] == 34. );
}
{
multi::array<complex, 2> c = herk(filling::upper, 1., a); // c†=c=aa†
// BOOST_REQUIRE( c[1][0] == 34. );
BOOST_REQUIRE( c[0][1] == 34. );
}
{
multi::array<complex, 2> c = herk(filling::upper, 1., hermitized(a)); // c†=c=a†a
BOOST_REQUIRE( size(hermitized(a))==3 );
// BOOST_REQUIRE( c[2][1]==19. );
BOOST_REQUIRE( c[1][2]==19. );
}
{
multi::array<complex, 2> c = herk(filling::upper, a); // c†=c=aa† (2x2; the asserted value is (aa†)[0][1])
// BOOST_REQUIRE( c[1][0] == 34. );
BOOST_REQUIRE( c[0][1] == 34. );
}
{
multi::array<complex, 2> c = herk(filling::upper, hermitized(a)); // c†=c=a†a
// BOOST_REQUIRE( c[2][1]==19. );
BOOST_REQUIRE( c[1][2]==19. );
}
}
// herk applied to a real (double) 2x3 matrix. The in-place forms write into real arrays;
// the last two sub-cases assign the value-returning herk(a) / herk(hermitized(a)) to complex arrays.
BOOST_AUTO_TEST_CASE(multi_blas_herk_real_automatic_ordering_and_symmetrization_real_case){
multi::array<double, 2> const a = {
{ 1., 3., 4.},
{ 9., 7., 1.}
};
{
// disabled case: hermitized(a) of a real matrix as herk input
multi::array<double, 2> c({3, 3}, 9999.);
using multi::blas::hermitized;
using multi::blas::herk;
using multi::blas::filling;
// herk(filling::upper, 1., hermitized(a), c); // c†=c=a†a
// BOOST_REQUIRE( c[2][1]==19. );
// BOOST_REQUIRE( c[1][2]==19. );
}
{
multi::array<double, 2> c({2, 2}, 9999.);
using multi::blas::herk;
using multi::blas::filling;
herk(filling::upper, 1., a, c); // c†=c=aa†
// BOOST_REQUIRE( c[1][0] == 34. );
BOOST_REQUIRE( c[0][1] == 34. );
}
{
// NOTE(review): this sub-case repeats the previous one verbatim
multi::array<double, 2> c({2, 2}, 9999.);
using multi::blas::herk;
using multi::blas::filling;
herk(filling::upper, 1., a, c); // c†=c=aa†
// BOOST_REQUIRE( c[1][0] == 34. );
BOOST_REQUIRE( c[0][1] == 34. );
}
{
using multi::blas::herk;
using multi::blas::filling;
multi::array<double, 2> c = herk(filling::upper, 1., a); // c†=c=aa†
// BOOST_REQUIRE( c[1][0] == 34. );
BOOST_REQUIRE( c[0][1] == 34. );
}
{
using multi::blas::herk;
multi::array<complex, 2> c = herk(a); // c†=c=aa† (2x2); both triangles are checked
BOOST_REQUIRE( c[1][0] == 34. );
BOOST_REQUIRE( c[0][1] == 34. );
}
{
using multi::blas::herk;
using multi::blas::hermitized;
multi::array<complex, 2> c = herk(hermitized(a)); // c†=c=a†a
BOOST_REQUIRE( c[2][1]==19. );
BOOST_REQUIRE( c[1][2]==19. );
}
}
// Smoke test for real (double) matrices: checks at compile time that a real array is not
// classified as a complex array, and that syrk / herk calls on real data compile and run.
// No result values are asserted.
BOOST_AUTO_TEST_CASE(multi_blas_herk_real_case){
multi::array<double, 2> const a = {
{ 1., 3., 4.},
{ 9., 7., 1.}
};
using multi::blas::filling;
{
static_assert( not boost::multi::blas::is_complex_array<multi::array<double, 2>>{} , "!");
multi::array<double, 2> c({2, 2}, 9999.);
syrk(filling::lower, 1., a, 0., c);//c†=c=aa†=(aa†)†, `c` in lower triangular
}
{
multi::array<double, 2> c({2, 2}, 9999.);
herk(filling::lower, 1., a, 0., c);//c†=c=aa†=(aa†)†, `c` in lower triangular
}
{
static_assert( not boost::multi::blas::is_complex_array<multi::array<double, 2>>{} , "!");
multi::array<double, 2> c = herk(filling::upper, a);//c†=c=aa†=(aa†)†, `c` in upper triangular
}
}
// herk on the hermitized view of a single-row matrix: hermitized(a) is 3x1, so a†a is the
// 3x3 outer product with (a†a)[2][1] = 4*3 = 12.
BOOST_AUTO_TEST_CASE(multi_blas_herk_complex_real_case_1d){
multi::array<complex, 2> const a = {
{ 1., 3., 4.},
};
namespace blas = multi::blas;
using blas::filling;
using blas::transposed;
using blas::hermitized;
{
multi::array<complex, 2> c({3, 3}, 9999.);
herk(filling::lower, 1., hermitized(a), 0., c);//c†=c=a†a=(a†a)†, `c` in lower triangular
print(c); // NOTE(review): debugging output left in the test; `print` is defined outside this view
BOOST_REQUIRE( c[2][1]==complex(12.,0.) );
BOOST_REQUIRE( c[1][2]==9999. );
}
{
multi::array<complex, 2> c({3, 3}, 9999.);
herk(2., hermitized(a), c);//c†=c=2·a†a; no filling argument, both triangles are checked
BOOST_REQUIRE( c[2][1]==complex(24.,0.) );
BOOST_REQUIRE( c[1][2]==complex(24.,0.) );
multi::array<complex, 2> c_gemm({3, 3}); // only referenced by the disabled gemm comparison below
// gemm(2., hermitized(a), a, c_gemm);
}
}
#endif
#if 0
BOOST_AUTO_TEST_CASE(multi_blas_herk_complex_timing){
multi::array<complex, 2> const a({4000, 4000}); std::iota(data_elements(a), data_elements(a) + num_elements(a), 0.2);
multi::array<complex, 2> c({4000, 4000}, 9999.);
boost::timer::auto_cpu_timer t;
using multi::blas::herk;
using multi::blas::hermitized;
using multi::blas::filling;
herk(filling::upper, 1., hermitized(a), c); // c†=c=a†a
}
#endif
#endif
#endif
#endif
#endif

View File

@ -0,0 +1,74 @@
#ifdef COMPILATION// -*-indent-tabs-mode:t;c-basic-offset:4;tab-width:4;-*-
$CXX $0 -o $0x `pkg-config --libs blas` -lboost_unit_test_framework&&$0x&&rm $0x;exit
#endif
// © Alfredo A. Correa 2019-2020
#ifndef MULTI_ADAPTORS_BLAS_IAMAX_HPP
#define MULTI_ADAPTORS_BLAS_IAMAX_HPP
#include "../blas/core.hpp"
namespace boost{
namespace multi{
namespace blas{

// Forwards to the backend `iamax(n, pointer, stride)` for the `n` elements starting at `first`.
// `base(first)` and `stride(first)` supply the data pointer and the element stride.
template<class It, class Size>
auto iamax_n(It first, Size n){
	using core::iamax;
	// if you get an error here make sure that you are including (and linking) the appropriate BLAS backend for your memory type
	return iamax(n, base(first), stride(first));
}

// Iterator-range form: the element count is the distance between the two iterators.
template<class It>
auto iamax(It first, It last)
->decltype(iamax_n(first, std::distance(first, last))){
	auto const count = std::distance(first, last);
	return iamax_n(first, count);
}

// Whole-array form; the array is required (by assertion) to have a zero offset.
template<class X1D>
auto iamax(X1D const& x)
->decltype(iamax(begin(x), end(x))){
	assert( !offset(x) );
	return iamax(begin(x), end(x));
}

// Iterator to the element selected by iamax(x).
template<class X1D>
auto amax(X1D const& x){
	auto const idx = iamax(x);
	return begin(x) + idx;
}

}}}
#if not __INCLUDE_LEVEL__ // _TEST_MULTI_ADAPTORS_BLAS_IAMAX
#define BOOST_TEST_MODULE "C++ Unit Tests for Multi BLAS iamax"
#define BOOST_TEST_DYN_LINK
#include<boost/test/unit_test.hpp>
#include "../../array.hpp"
#include "../../utility.hpp"
#include<complex>
#include<cassert>
using std::cout;
namespace multi = boost::multi;
namespace blas = multi::blas;
// For the increasing real sequence {1, 2, 3, 4} the selected element is the last one (index 3).
BOOST_AUTO_TEST_CASE(multi_adaptors_blas_iamax_real){
	multi::array<double, 1> const values = {1., 2., 3., 4.};

	auto pos = blas::iamax(values);
	BOOST_REQUIRE( pos == 3 );

	// index form and iterator form must designate the same element
	BOOST_REQUIRE( values[blas::iamax(values)] == 4. );
	BOOST_REQUIRE( *blas::amax(values) == 4. );
}
using complex = std::complex<double>;
// Same check as the real test, with complex elements whose imaginary parts are zero.
BOOST_AUTO_TEST_CASE(multi_adaptors_blas_iamax_complex){
	multi::array<complex, 1> const values = {1., 2., 3., 4.};

	auto pos = blas::iamax(values);
	BOOST_REQUIRE( pos == 3 );

	// index form and iterator form must designate the same element
	BOOST_REQUIRE( values[blas::iamax(values)] == 4. );
	BOOST_REQUIRE( *blas::amax(values) == 4. );
}
#endif
#endif

View File

@ -0,0 +1,194 @@
#ifdef COMPILATION// -*-indent-tabs-mode:t;c-basic-offset:4;tab-width:4;-*-
$CXXX $CXXFLAGS $0 -o $0x `pkg-config --libs blas` -lboost_unit_test_framework&&$0x&&rm $0x;exit
#endif
// © Alfredo A. Correa 2019-2020
#ifdef __CUDA_ARCH__
//#define BOOST_NO_RTTI 1
//#define BOOST_TYPE_INDEX_CTTI_USER_DEFINED_PARSING (39, 1, true, "T = ")
#endif
#ifndef MULTI_ADAPTORS_BLAS_NRM2_HPP
#define MULTI_ADAPTORS_BLAS_NRM2_HPP
#include "../blas/core.hpp"
#include "../../array.hpp"
#include<complex> // std::norm
namespace boost{
namespace multi{
namespace blas{

using core::nrm2;
using multi::base;
using std::norm; // nvcc11 needs using std::FUNCTION and the FUNCTION (and it works in clang, gcc, culang, icc)

// Output-argument form: calls the backend nrm2 with size/base/stride of `x`, writing through
// `base(r)`, and hands `r` back to the caller.
template<class A1D, class A0D>
auto nrm2(A1D const& x, A0D&& r)
->decltype(nrm2(x.size(), x.base(), x.stride(), base(r)), std::forward<A0D>(r)){
	nrm2(x.size(), x.base(), x.stride(), base(r));
	return std::forward<A0D>(r);
}

#if 0
template<class A1D>
auto nrm2(A1D const& x, double& r)
->decltype(nrm2(x.size(), x.base(), x.stride(), &r), r){
return nrm2(x.size(), x.base(), x.stride(), &r), r;}
template<class A1D>
auto nrm2(A1D const& x, float& r)
->decltype(nrm2(x.size(), x.base(), x.stride(), &r), r){
return nrm2(x.size(), x.base(), x.stride(), &r), r;}
#endif

// Value-returning form: the result is stored in a zero-dimensional static_array built with
// the allocator of `x`.
template<
	class A1D,
	typename T = double, //decltype(norm(std::declval<typename A1D::value_type>())),
	class Alloc = typename std::allocator_traits<typename A1D::default_allocator_type>::template rebind_alloc<T>
>
NODISCARD("")
auto nrm2(A1D const& x)
//->std::decay_t<decltype(nrm2(x, multi::static_array<T, 0, Alloc>({}, x.get_allocator()) ))>{
->std::decay_t<decltype(nrm2(x, multi::static_array<T, 0, Alloc>({})))>{ // x.get_allocator() in decltype doesn't work for icc
	using scalar_array = multi::static_array<T, 0, Alloc>;
	return nrm2(x, scalar_array({}, x.get_allocator()));
}

// Value-returning form with a caller-supplied allocator for the zero-dimensional result.
// `Alloc` cannot be deduced from the arguments, so it has to be named explicitly by the caller.
template<
	class Alloc,
	class A1D,
	typename T = decltype(norm(std::declval<typename A1D::value_type>())),
	class AllocR = typename std::allocator_traits<typename A1D::default_allocator_type>::template rebind_alloc<T>
>
NODISCARD("")
auto nrm2(A1D const& x, AllocR const& alloc)
->std::decay_t<decltype(blas::nrm2(x, multi::static_array<T, 0, AllocR>({}, alloc)))>{
	using scalar_array = multi::static_array<T, 0, AllocR>;
	return blas::nrm2(x, scalar_array({}, alloc));
}

namespace operators{

using std::norm;

// `a^n` is the n-th power of nrm2(a).
template<class A1D>//decltype(norm(std::declval<typename A1D::value_type>()))>
NODISCARD("") auto operator^(A1D const& a, int n)
->decltype(std::pow(blas::nrm2(a), n)){
	return std::pow(blas::nrm2(a), n);
}

}

}}}
#if not __INCLUDE_LEVEL__ // _TEST_MULTI_ADAPTORS_BLAS_NRM2
#define BOOST_TEST_MODULE "C++ Unit Tests for Multi BLAS nrm2"
#define BOOST_TEST_DYN_LINK
#include<boost/test/unit_test.hpp>
#include "../../array.hpp"
#include "../../complex.hpp"
//#include<thrust/complex.h>
#include<boost/mpl/list.hpp>
namespace multi = boost::multi;
// nrm2 of the second column {2, 6, 10} of a real 3x4 matrix, through every supported
// destination: a plain double, the returned value, an array element and a 0-dimensional array.
BOOST_AUTO_TEST_CASE(multi_adaptor_multi_nrm2_real){
	namespace blas = multi::blas;

	multi::array<double, 2> const mat = {
		{1.,  2.,  3.,  4.},
		{5.,  6.,  7.,  8.},
		{9., 10., 11., 12.}
	};
	// reference value, recomputed on every use exactly as written in each assertion
	auto const expected = []{return std::sqrt( 2.*2. + 6.*6 + 10.*10.);};

	double out;
	BOOST_REQUIRE( blas::nrm2(rotated(mat)[1], out) == expected() );
	BOOST_REQUIRE( out == expected() );
	BOOST_REQUIRE( blas::nrm2(rotated(mat)[1]) == expected() );

	double out2 = blas::nrm2(rotated(mat)[1]);
	BOOST_REQUIRE( out == out2 );

	multi::array<double, 1> slots(4);
	blas::nrm2( rotated(mat)[1], slots[2]);
	BOOST_REQUIRE( slots[2] == expected() );

	multi::array<double, 0> scalar;
	blas::nrm2( rotated(mat)[1], scalar);
	BOOST_REQUIRE( scalar == expected() );

	BOOST_REQUIRE( blas::nrm2(rotated(mat)[1]) == expected() );
}
// The output-argument and the value-returning forms of nrm2 must give the same number.
BOOST_AUTO_TEST_CASE(multi_adaptor_blas_nrm2_operators){
	multi::array<double, 1> vec = {1.1,2.1,3.1, 4.1};

	double out;
	multi::blas::nrm2(vec, out);

	BOOST_REQUIRE( out == multi::blas::nrm2(vec) );
}
// nrm2 of a column of a complex matrix whose entries are all real: the result is a real number
// equal to the norm of the corresponding real column.
BOOST_AUTO_TEST_CASE(multi_adaptor_multi_nrm2_complex_real_case){
	using complex = std::complex<double>;
	using multi::blas::nrm2;

	multi::array<complex, 2> const mat = {
		{1.,  2.,  3.,  4.},
		{5.,  6.,  7.,  8.},
		{9., 10., 11., 12.}
	};

	double out;
	BOOST_REQUIRE( nrm2(rotated(mat)[1], out) == std::sqrt( 2.*2. + 6.*6 + 10.*10.) );
	BOOST_REQUIRE( nrm2(rotated(mat)[1]) == out );
}
#if 0
BOOST_AUTO_TEST_CASE(multi_adaptor_multi_nrm2_complex_real_case_thrust){
using complex = thrust::complex<double>;
multi::array<complex, 2> const cA = {
{1., 2., 3., 4.},
{5., 6., 7., 8.},
{9., 10., 11., 12.}
};
using multi::blas::nrm2;
double n;
BOOST_REQUIRE( nrm2(rotated(cA)[1], n) == std::sqrt( 2.*2. + 6.*6 + 10.*10.) );
BOOST_REQUIRE( nrm2(rotated(cA)[1]) == n );
}
BOOST_AUTO_TEST_CASE(multi_adaptor_multi_nrm2_complex_real_case_types){
boost::mpl::for_each<boost::mpl::list<
std ::complex<double>,
thrust::complex<double>//,
// boost::multi::complex<double> // TODO make this work
>>([](auto cplx){
multi::array<decltype(cplx), 2> const cA = {
{1., 2., 3., 4.},
{5., 6., 7., 8.},
{9., 10., 11., 12.}
};
using multi::blas::nrm2;
double n;
BOOST_REQUIRE( nrm2(rotated(cA)[1], n) == std::sqrt( 2.*2. + 6.*6 + 10.*10.) );
BOOST_REQUIRE( nrm2(rotated(cA)[1]) == n );
});
}
#endif
// nrm2 of a genuinely complex column, compared against the square root of the summed
// std::norm of its elements; also checks the `^` operator from blas::operators.
BOOST_AUTO_TEST_CASE(multi_adaptor_multi_nrm2_complex){
	using complex = std::complex<double>;
	complex const I{0,1};

	multi::array<complex, 2> const mat = {
		{1.,  2. + 1.*I,  3.,  4.},
		{5.,  6. + 4.*I,  7.,  8.},
		{9., 10. - 3.*I, 11., 12.}
	};
	// sum of |z|^2 over the second column, evaluated in the same order as in each assertion
	auto const sum_of_norms = [&]{return norm(mat[0][1]) + norm(mat[1][1]) + norm(mat[2][1]);};

	using multi::blas::nrm2;
	double out;
	BOOST_REQUIRE( nrm2(rotated(mat)[1], out) == std::sqrt( sum_of_norms() ) );
	BOOST_REQUIRE( nrm2(rotated(mat)[1]) == std::sqrt( sum_of_norms() ) );

	using namespace multi::blas::operators;
	BOOST_TEST_REQUIRE( (rotated(mat)[1]^-1) == 1/std::sqrt(sum_of_norms()) , boost::test_tools::tolerance(1e-15) );
	BOOST_TEST_REQUIRE( (rotated(mat)[1]^2) == sum_of_norms() , boost::test_tools::tolerance(1e-15) );
}
#endif
#endif

View File

@ -0,0 +1,283 @@
// -*-indent-tabs-mode:t;c-basic-offset:4;tab-width:4;autowrap:nil;-*-
// © Alfredo A. Correa 2019-2021
#ifndef MULTI_ADAPTORS_BLAS_NUMERIC_HPP
#define MULTI_ADAPTORS_BLAS_NUMERIC_HPP
#include "../../memory/pointer_traits.hpp"
#include "../../array_ref.hpp"
#include "../../complex.hpp"
#include "numeric/is_complex.hpp"
namespace boost{
namespace multi::blas{
// Plain aggregate with two members of type T, `real` followed by `imag`.
// Used as the target element type of the reinterpret casts in real()/imag().
template<class T>
struct Complex_{
	T real;
	T imag;
};
template<
class A, typename Complex = typename std::decay_t<A>::element, typename T=typename Complex::value_type,
class=std::enable_if_t<blas::numeric::is_complex_of<Complex, T>::value>
>
auto real(A&& a)
->decltype(std::forward<A>(a).template reinterpret_array_cast<Complex_<T>>().template member_cast<T>(&Complex_<T>::real)){
return std::forward<A>(a).template reinterpret_array_cast<Complex_<T>>().template member_cast<T>(&Complex_<T>::real);}
template<
class A, class Complex = typename std::decay_t<A>::element_type, typename T=typename Complex::value_type,
class=std::enable_if_t<blas::numeric::is_complex_of<Complex, T>::value>
>
auto imag(A&& a)
->decltype(std::forward<A>(a).template reinterpret_array_cast<Complex_<T>>().template member_cast<T>(&Complex_<T>::imag)){
return std::forward<A>(a).template reinterpret_array_cast<Complex_<T>>().template member_cast<T>(&Complex_<T>::imag);}
// produces a real view of complex array with the last dimension duplicated and with interleaved real imaginary parts
template<
	class ComplexArr,
	class ComplexElem = typename std::decay_t<ComplexArr>::element,
	typename RealElem = typename ComplexElem::value_type,
	class = std::enable_if_t<blas::numeric::is_complex_of<ComplexElem, RealElem>::value>
>
auto real_doubled(ComplexArr&& a){
	return std::forward<ComplexArr>(a)
		.template reinterpret_array_cast<RealElem>(2) // each complex element becomes two RealElem
		.rotated()
		.flatted()
		.unrotated()
	;
}
template<class Ref, class Involution> class involuted;
template<class It, class F, class Reference = involuted<typename std::iterator_traits<It>::reference, F> > class involuter;
// Proxy reference that applies the function object `f_` on every access to the wrapped
// reference `r_`: reading yields `f_(r_)` and assigning stores `f_(value)` into `r_`.
// The proxy cannot be copied (copy construction and copy assignment are deleted), only moved.
template<class Ref, class Involution>
class involuted{
protected:
Ref r_; // [[no_unique_address]]
Involution f_;
public:
// value type produced by applying the involution to the wrapped reference
using decay_type =std::decay_t<decltype(std::declval<Involution>()(std::declval<Ref>()))>;
constexpr explicit involuted(Ref r, Involution f = {}) : r_{std::forward<Ref>(r)}, f_{f}{}
involuted& operator=(involuted const& other)=delete;//{r_ = other.r_; return *this;}
public:
involuted(involuted const&) = delete;
involuted(involuted&&) = default; // for C++14
// explicit and implicit reads: all of them return f_(r_) by value
constexpr decay_type decay() const&{return f_(r_);}
constexpr operator decay_type() &{return f_(r_);}
constexpr operator decay_type() const&{return f_(r_);}
constexpr operator decay_type() &&{return f_(r_);}
constexpr auto operator*(decay_type const& other) const{return f_(r_)*other;}
// address-of on a temporary proxy: an involuter wrapping the address of `r_` with the same function
constexpr decltype(auto) operator&()&&{return involuter<decltype(&std::declval<Ref>()), Involution>{&r_, f_};}
// assignment from a value: stores f_(other) into the wrapped reference (lvalue and rvalue proxies)
template<class DecayType>
constexpr auto operator=(DecayType&& other)&
->decltype(r_=f_(std::forward<DecayType>(other)), *this){
return r_=f_(std::forward<DecayType>(other)), *this;}
template<class DecayType>
constexpr auto operator=(DecayType&& other)&&
->decltype(r_=f_(std::forward<DecayType>(other)), *this){
return r_=f_(std::forward<DecayType>(other)), *this;}
// comparisons are done on the decayed value f_(r_)
template<class DecayType>
constexpr auto operator==(DecayType&& other) const
->decltype(this->operator decay_type()==other){
return this->operator decay_type()==other;}
template<class DecayType>
constexpr auto operator!=(DecayType&& other) const
->decltype(this->operator decay_type()!=other){
return this->operator decay_type()!=other;}
// reversed-operand comparisons (value on the left, proxy on the right)
friend constexpr auto operator==(decay_type const& other, involuted const& self){
return other == self.operator decay_type();}
// the templated forms exclude left operands that are themselves involuted (or derived from it)
template<class DecayType, std::enable_if_t<not std::is_base_of<involuted, DecayType>{}, int> =0>
friend constexpr auto operator==(DecayType&& other, involuted const& self){
return other == self.operator decay_type();}
template<class DecayType, std::enable_if_t<not std::is_base_of<involuted, DecayType>{}, int> =0>
friend constexpr auto operator!=(DecayType&& other, involuted const& self){
return other != self.operator decay_type();}
// auto imag() const{return static_cast<decay_type>(*this).imag();}
// stream insertion writes the decayed value
template<class Any> friend constexpr Any& operator<<(Any&& a, involuted const& self)
// ->decltype(a << self.operator decay_type())
{
return a << self.operator decay_type();}
constexpr auto conj() const&{return adl_conj(operator decay_type());}
// imaginary part of the decayed value via adl_imag
// NOTE(review): the defaulted, unused `T` parameter looks like an overload-ranking device — confirm
template<class T = void*>
friend constexpr auto imag(involuted const& self, T = nullptr)
->decltype(adl_imag(std::declval<decay_type>())){
return adl_imag(self.operator decay_type());}
};
#if defined(__cpp_deduction_guides)
template<class T, class F> involuted(T&&, F)->involuted<T const, F>;
//template<class T, class F> involuted(T&, F)->involuted<T&, F>;
//template<class T, class F> involuted(T const&, F)->involuted<T const&, F>;
#endif
//template<class It, class F>
//class involuter;
template<class It, class F>
auto get_allocator(involuter<It, F> const& s);
// Default allocator associated with the iterator wrapped by the involuter `iv`.
// The wrapped iterator is obtained through the `underlying` accessor (found by ADL on the
// involuter) because `it_` is a private member and this free template is not a friend of the class.
// The using-declaration provides `multi::default_allocator_of` as the fallback for iterator
// types that have no overload of their own reachable by ADL (e.g. raw pointers).
template<class It, class F>
auto default_allocator_of(involuter<It, F> const& iv){
	using multi::default_allocator_of;
	return default_allocator_of(underlying(iv));
}
// Iterator/pointer adaptor that wraps an iterator `it_` together with a function object `f_`.
// Dereferencing does not return the underlying element but a `Reference` proxy built from
// `*it_` and `f_` (by default an `involuted` proxy, per the forward declaration of this template).
template<class It, class F, class Reference>
class involuter{// : public std::iterator_traits<It>{
It it_; // [[no_unique_address]]
F f_;
// all specializations may read each other's private members (used by the converting constructors)
template<class, class, class> friend class involuter;
public:
using difference_type = typename std::iterator_traits<It>::difference_type;
using value_type = typename std::iterator_traits<It>::value_type;
using pointer = involuter<It, F>;//svoid; // typename std::iterator_traits<It>::pointer
using reference = Reference;
using iterator_category = typename std::iterator_traits<It>::iterator_category;
using element_type = typename std::pointer_traits<It>::element_type;
template<class U> using rebind = involuter<typename std::pointer_traits<It>::template rebind<U>, F>;
involuter() = default;
constexpr explicit involuter(It it, F f = {}) : it_{std::move(it)}, f_{std::move(f)}{}
involuter(involuter const& other) = default;
// template<class Other, > constexpr involuter(Other const& other) : it_{other.it_}, f_{other.f_}{}
// implicit conversion from another involuter-like type, enabled when `_implicit_cast<It>` accepts its underlying iterator type
template<class Other, typename = decltype(_implicit_cast<It>(typename Other::underlying_type{}))>
// cppcheck-suppress noExplicitConstructor
constexpr involuter(Other const& o) : it_{o.it_}, f_{o.f_}{}
// explicit conversion, enabled when `_explicit_cast<It>` accepts the underlying iterator type;
// the defaulted `int` parameter distinguishes this signature from the implicit constructor above
template<class Other, typename = decltype(_explicit_cast<It>(typename Other::underlying_type{}))>
constexpr explicit involuter(Other const& o, int = 0) : it_{o.it_}, f_{o.f_}{}
// dereference: proxy built from the underlying element and the function object
constexpr auto operator*() const {return reference{*it_, f_};}
// equality compares only the wrapped iterators, not the function objects
bool operator==(involuter const& o) const{return it_==o.it_;}
bool operator!=(involuter const& o) const{return it_!=o.it_;}
constexpr involuter& operator+=(typename involuter::difference_type n){it_+=n; return *this;}
constexpr auto operator+(typename involuter::difference_type n) const{return involuter{it_+n, f_};}
// decltype(auto) operator->() const{
// return &const_cast<reference&>(reinterpret_cast<reference const&>(*this));
// return reference{*it_, f_};
// return involuter<typename std::iterator_traits<It>::pointer, F>{&*it_, f_};
// }
// distance between the wrapped iterators
auto operator-(involuter const& other) const{return it_-other.it_;}
explicit operator bool() const{return it_;}
using underlying_type = It;
// accessor for the wrapped iterator (hidden friend, reachable through ADL)
friend constexpr underlying_type underlying(involuter const& self){return self.it_;}
constexpr explicit operator It() const {return underlying(*this);}
// grants the namespace-scope get_allocator template access to `it_`
template<class Itt, class FF> friend auto get_allocator(involuter<Itt, FF> const&);
// allocator queries are forwarded to the wrapped iterator
friend auto default_allocator_of(involuter const& inv){
using multi::default_allocator_of;
return default_allocator_of(inv.it_);
}
using default_allocator_type = typename multi::pointer_traits<It>::default_allocator_type;
friend auto get_allocator(involuter const& inv){
using boost::multi::get_allocator;
return get_allocator(inv.it_);
}
};
// Allocator of the iterator wrapped by `inv`; the query is forwarded to the wrapped iterator.
// (This template is declared a friend of involuter, which is why it may read `it_`.)
template<class It, class F>
auto get_allocator(involuter<It, F> const& inv){
	using multi::get_allocator;
	auto const& wrapped = inv.it_;
	return get_allocator(wrapped);
}
// Proxy reference / iterator adaptor pair whose function object is std::negate<>.
template<class Ref> using negated = involuted<Ref, std::negate<>>;
template<class It> using negater = involuter<It, std::negate<>>;
#if 1
// Function object that forwards its argument to multi::adl_conj and returns the result unchanged
// (value category included, because of decltype(auto)).
struct conjugate{
	template<class T>
	decltype(auto) operator()(T&& a) const{
		// the qualified call is needed by icc
		return multi::adl_conj(std::forward<T>(a));
	}
};
#endif
#if 0
namespace detail{
template<class Ref> struct conjugated : involuted<Ref, conjugate>{
using involuted<Ref, conjugate>::involuted;
template<class Other>
conjugated(conjugated<Other> const& other) : involuted<Ref, conjugate>{static_cast<involuted<Ref, conjugate> const&>(other)}{}
auto real() const{return static_cast<typename conjugated::decay_type>(*this).real();}
auto imag() const{return static_cast<typename conjugated::decay_type>(*this).imag();}
friend auto imag(conjugated const& self){return self.imag();}
friend auto real(conjugated const& self){return self.real();}
public:
decltype(auto) operator->() const{return this;}
// friend auto conj(conjugated const& self){
// return conjugate{}(static_cast<typename conjugated::decay_type>(self));
// }
};
}
#endif
// Proxy reference and iterator adaptor specialised for complex conjugation.
template<class Ref> using conjugated = involuted<Ref, conjugate>;
template<class It > using conjugater = involuter<It , conjugate>;//, conjugated<typename std::iterator_traits<It>::reference> >;

// Wraps a plain iterator into a conjugating one...
template<class It>
auto make_conjugater(It it){
	return conjugater<It>{it};
}

// ...and unwraps an iterator that is already conjugating, so that applying the
// operation twice gives back the original iterator.
template<class It>
It make_conjugater(conjugater<It> it){
	return underlying(it);
}
// Imaginary part of the value seen through a conjugating proxy (i.e. of the decayed value).
template<class T>
auto imag(involuted<T, conjugate> const& inv){
	auto const value = inv.decay();
	return value.imag();
}

// Real part of the value seen through a conjugating proxy (i.e. of the decayed value).
template<class T>
auto real(involuted<T, conjugate> const& inv){
	auto const value = inv.decay();
	return value.real();
}
// Detection of a free function `imag(t)` (found by ordinary or argument-dependent lookup).
template<class T>
auto has_imag_fun_aux(T const& t) -> decltype(imag(t), std::true_type{});
auto has_imag_fun_aux(...) -> std::false_type;

template<class T>
struct has_imag_fun : decltype(has_imag_fun_aux(std::declval<T>())){};

// Detection of a member function `t.imag()`.
template<class T>
auto has_imag_mem_aux(T const& t) -> decltype(t.imag(), std::true_type{});
auto has_imag_mem_aux(...) -> std::false_type;

template<class T>
struct has_imag_mem : decltype(has_imag_mem_aux(std::declval<T>())){};

// True when either form of `imag` is available for T.
template<class T>
struct has_imag : std::integral_constant<bool, (has_imag_fun<T>::value || has_imag_mem<T>::value)>{};
// Compile-time predicate: does dereferencing `base(a)` of an array of type A give a type for
// which `has_imag` holds?  Usable as a bool (`is_complex_array<A>{}`) or through operator().
template<class A = void> struct is_complex_array{
	// declaration only: maps the element type to its has_imag trait, used inside decltype
	template<class T> static auto _(T const& t) -> has_imag<T>;
	constexpr operator bool() const{return decltype(_(*base(std::declval<A>()))){};}
	// Returns the trait object by value. The result is built from the decltype so that neither
	// std::declval nor the undefined helper `_` is ever evaluated (evaluating std::declval is ill-formed).
	// NOTE(review): the argument type AA is ignored and the class parameter A is inspected — confirm this is intended.
	template<class AA> constexpr auto operator()(AA&&){return decltype(_(*base(std::declval<A>()))){};}
};
// A value type counts as complex here when `has_imag` holds for it (free `imag(v)` or member `v.imag()`).
template<class V> struct is_complex : has_imag<V>{};
// Compile-time predicate: is `base(a)` of an array of type A a `conjugater<It>` for some It?
// Usable as a bool (`is_conjugated<A>{}`) or through operator().
template<class A = void> struct is_conjugated{
	// declarations only, used inside decltype: the first overload is chosen for conjugating base pointers
	template<class It> static std::true_type  _(conjugater<It> a);
	                   static std::false_type _(...             );
	constexpr operator bool() const{return decltype(_(base(std::declval<A>()))){};}
	// Returns std::true_type / std::false_type by value. The result is built from the decltype so that
	// neither std::declval nor the undefined helper `_` is ever evaluated (evaluating std::declval is ill-formed).
	// NOTE(review): the argument type AA is ignored and the class parameter A is inspected — confirm this is intended.
	template<class AA> constexpr auto operator()(AA&&){return decltype(_(base(std::declval<A>()))){};}
};
// conj of an array whose elements are not complex: the array is returned unchanged.
// `Elem` and `Ptr` are not used in the body; they restrict the overload to types that
// declare `element_type` and `element_ptr`.
template<
	class A, class D = std::decay_t<A>,
	typename Elem = typename D::element_type,
	typename Ptr  = typename D::element_ptr,
	std::enable_if_t<not is_complex_array<A>{}, int> =0
>
A&& conj(A&& a){
	return std::forward<A>(a);
}
// conj of a complex array that is not already conjugated: the same array viewed through
// a conjugating pointer type (`conjugater<Ptr>`).
template<
	class A, class D = std::decay_t<A>,
	typename Elem = typename D::element_type,
	typename Ptr  = typename D::element_ptr,
	std::enable_if_t<not is_conjugated<A>{} and is_complex_array<A>{}, int> =0
>
decltype(auto) conj(A&& a){
	using conjugating_ptr = conjugater<Ptr>;
	return std::forward<A>(a).template static_array_cast<Elem, conjugating_ptr>();
}
// conj of an array that is already conjugated: the conjugating pointer is replaced by its
// underlying pointer type, undoing the earlier conjugation.
template<
	class A, class D = std::decay_t<A>,
	typename Elem = typename D::element_type,
	typename Ptr  = typename D::element_ptr::underlying_type,
	std::enable_if_t<is_conjugated<A>{}, int> =0
>
auto conj(A&& a)
->decltype(std::forward<A>(a).template static_array_cast<Elem, Ptr>()){
	return std::forward<A>(a).template static_array_cast<Elem, Ptr>();
}
}
// Default allocator of an involuter, for any `Reference` type: the query is forwarded to
// the iterator it wraps.
template<class It, class F, class Reference>
auto default_allocator_of(multi::blas::involuter<It, F, Reference> it){
	return multi::default_allocator_of(
		underlying(it)
	);
}
}
namespace std{
// template<> struct is_convertible<boost::multi::blas::Complex_<double>*, std::complex<double>*> : std::true_type{};
// template<class T> struct is_convertible<boost::multi::blas::Complex_<double>*, T*> : boost::multi::blas::numeric::is_complex_of<T, double>{};
}
#endif

View File

@ -0,0 +1,93 @@
#ifdef COMPILATION// -*-indent-tabs-mode:t;c-basic-offset:4;tab-width:4;-*-
$CXXX $CXXFLAGS $0 -o $0x -lboost_unit_test_framework&&$0x&&rm $0x;exit
#endif
// © Alfredo A. Correa 2020
#ifndef MULTI_ADAPTORS_BLAS_NUMERIC_IS_COMPLEX_HPP
#define MULTI_ADAPTORS_BLAS_NUMERIC_IS_COMPLEX_HPP
#include<complex>
#include<type_traits>
namespace boost{
namespace multi{
namespace blas{
namespace numeric{

using std::true_type;
using std::false_type;

// --- free function `real(t)` ---
template<class T>
auto has_real_fun_aux(T const& t) -> decltype(real(t), true_type{});
auto has_real_fun_aux(...) -> false_type;
template<class T> struct has_real_fun : decltype(has_real_fun_aux(std::declval<T>())){};
template<class T> constexpr bool has_real_fun_v = has_real_fun<T>::value;

// --- member function `t.real()` ---
template<class T>
auto has_real_aux(T const& t) -> decltype(t.real(), true_type{});
auto has_real_aux(...) -> false_type;
template<class T> struct has_real : decltype(has_real_aux(std::declval<T>())){};
template<class T> constexpr bool has_real_v = has_real<T>::value;

// --- free function `imag(t)` ---
template<class T>
auto has_imag_fun_aux(T const& t) -> decltype(imag(t), true_type{});
auto has_imag_fun_aux(...) -> false_type;
template<class T> struct has_imag_fun : decltype(has_imag_fun_aux(std::declval<T>())){};
template<class T> constexpr bool has_imag_fun_v = has_imag_fun<T>::value;

// --- member function `t.imag()` ---
template<class T>
auto has_imag_aux(T const& t) -> decltype(t.imag(), true_type{});
auto has_imag_aux(...) -> false_type;
template<class T> struct has_imag : decltype(has_imag_aux(std::declval<T>())){};
template<class T> constexpr bool has_imag_v = has_imag<T>::value;

// A type is complex when it offers a real part and an imaginary part,
// each either as a member function or as a free function.
template<class T>
struct is_complex : std::integral_constant<bool,
	(has_real<T>::value || has_real_fun<T>::value) && (has_imag<T>::value || has_imag_fun<T>::value)
>{};

// real_is<T, V>: does `t.real()` exist and return exactly V?
template<class V, class T>
auto real_is_aux(T const& t) -> typename std::is_same<decltype(t.real()), V>;
template<class>
auto real_is_aux(...) -> false_type;
template<class T, class V> struct real_is : decltype(real_is_aux<V>(std::declval<T>())){};

// imag_is<T, V>: does `t.imag()` exist and return exactly V?
template<class V, class T>
auto imag_is_aux(T const& t) -> typename std::is_same<decltype(t.imag()), V>;
template<class>
auto imag_is_aux(...) -> false_type;
template<class T, class V> struct imag_is : decltype(imag_is_aux<V>(std::declval<T>())){};

// is_complex_of<T, V>: both member accessors exist and both return exactly V.
template<class T, class V>
struct is_complex_of : std::integral_constant<bool, real_is<T, V>::value && imag_is<T, V>::value>{};

}}}}
// Self-test: compiled only when this header is itself the translation unit.
// The `defined(...)` check keeps the tests out of includers on compilers that do not
// provide __INCLUDE_LEVEL__ (an undefined macro would otherwise evaluate to 0 here).
#if defined(__INCLUDE_LEVEL__) and not __INCLUDE_LEVEL__
#define BOOST_TEST_MODULE "C++ Unit Tests for Multi BLAS numeric is_complex"
#define BOOST_TEST_DYN_LINK
#include<boost/test/unit_test.hpp>
#include<thrust/complex.h>
#include "../../../complex.hpp"
#include "boost/mpl/for_each.hpp" // boost::mpl::for_each is used below; do not rely on a transitive include
#include "boost/mpl/list.hpp"
namespace multi = boost::multi;
// Checks the traits against the three standard floating-point types and the
// complex wrappers built on them (std, thrust and multi).
BOOST_AUTO_TEST_CASE(multi_blas_is_complex){
	namespace numeric = multi::blas::numeric;
	// for_each hands one value of each listed type to the lambda; only its type is used
	boost::mpl::for_each<boost::mpl::list<double, float, long double>>([](auto f){
		using F = decltype(f);
		static_assert( not numeric::is_complex<F>{}, "!");
		static_assert( numeric::is_complex<std::complex<F>>{}, "!");
		static_assert( numeric::is_complex<thrust::complex<F>>{}, "!");
		static_assert( numeric::is_complex<multi::complex<F>>{}, "!");
		static_assert( numeric::is_complex_of<std::complex<F>, F>{}, "!");
		static_assert( not numeric::is_complex_of<F, F>{}, "!");
	});
	// mismatched scalar type, non-complex type, and reference-qualified complex type
	static_assert( not numeric::is_complex_of<std::complex<double>, float>{}, "!");
	static_assert( not numeric::is_complex_of<double, float>{}, "!");
	static_assert( numeric::is_complex<std::complex<double> const&>{}, "!");
}
#endif
#endif

View File

@ -0,0 +1,141 @@
#ifdef COMPILATION// -*-indent-tabs-mode:t;c-basic-offset:4;tab-width:4;-*-
$CXXX $CXXFLAGS $0 -o $0x `pkg-config --libs blas` -lboost_unit_test_framework&&$0x&&rm $0x;exit
#endif
// © Alfredo A. Correa 2019-2020
#ifndef MULTI_ADAPTORS_BLAS_OPERATIONS_HPP
#define MULTI_ADAPTORS_BLAS_OPERATIONS_HPP
#include "../blas/numeric.hpp"
namespace boost{
namespace multi{
namespace blas{
// Transposed view of `m`: delegates to `rotated` (const-lvalue overload).
template<class M> decltype(auto) transposed(M const& m){return rotated(m);}
// Conjugate transpose: applies blas::conj first, then transposed.
// D and E are not used in the body; E only requires that the decayed argument exposes `element_type`.
template<class A, typename D=std::decay_t<A>, typename E=typename D::element_type>
decltype(auto) conjugated_transposed(A&& a){
return transposed(blas::conj(std::forward<A>(a)));
}
// No-op operation: returns the argument forwarded unchanged.
template<class A> decltype(auto) identity(A&& a){return std::forward<A>(a);}
// Tag-dispatched hermitized: with std::true_type the result is the conjugate transpose...
template<class A>
decltype(auto) hermitized(A&& a, std::true_type){
return conjugated_transposed(std::forward<A>(a));
}
// ...and with std::false_type it is the plain transpose.
template<class A>
decltype(auto) hermitized(A&& a, std::false_type){
return transposed(std::forward<A>(a));
}
// Untagged hermitized: always the conjugate transpose.
template<class A>
decltype(auto) hermitized(A&& a){return conjugated_transposed(std::forward<A>(a));}
// Forwarding overload of transposed (declared after its first use in conjugated_transposed).
template<class A>
decltype(auto) transposed(A&& a){return rotated(std::forward<A>(a));}
//template<class A, std::enable_if_t<std::decay_t<A>::dimensionality == 2, int> =0>
//decltype(auto) H(A&& a){return hermitized(std::forward<A>(a));}
namespace operators{
// Function object H: hermitized for 2-dimensional arguments; for 1-dimensional
// arguments it is a deprecated spelling of blas::conj.
MAYBE_UNUSED constexpr static struct {
template<class A, std::enable_if_t<std::decay_t<A>::dimensionality == 2, int> =0>
decltype(auto) operator()(A&& a) const{return hermitized(std::forward<A>(a));}
template<class A, std::enable_if_t<std::decay_t<A>::dimensionality == 1, int> =0>
[[deprecated("use blas::C instead of blas::H for conjugated vectors to avoid confusions")]]
decltype(auto) operator()(A&& a) const{return blas::conj(std::forward<A>(a));}
} H;
// `a ^ op` applies the callable `op` to `a`; the overload only participates when `op(a)` is well-formed.
template<class A, class Op>
auto operator^(A&& a, Op op)
->decltype(op(std::forward<A>(a))){
return op(std::forward<A>(a));}
}
using operators::H;
// C: conjugation for 1-dimensional arguments...
template<class A, std::enable_if_t<std::decay_t<A>::dimensionality == 1, int> =0>
decltype(auto) C(A&& a){return blas::conj(std::forward<A>(a));}
// ...and hermitized (conjugate transpose) for 2-dimensional arguments.
template<class A, std::enable_if_t<std::decay_t<A>::dimensionality == 2, int> =0>
decltype(auto) C(A&& a){return hermitized(std::forward<A>(a));}
namespace operators{
// Unary `*a` is blas::conj(a); participates only when blas::conj accepts the argument.
template<class A>
auto operator*(A&& a)
->decltype(blas::conj(std::forward<A>(a))){
return blas::conj(std::forward<A>(a));}
}
//template<class A, std::enable_if_t<std::decay_t<A>::dimensionality == 1, int> =0>
//[[deprecated("use blas::C instead of blas::H for conjugated vectors to avoid confusions")]]
//decltype(auto) H(A&& a){return blas::conj(std::forward<A>(a));}
// Short names: T = transposed, N = identity (no operation).
template<class A> decltype(auto) T(A&& a){return transposed(std::forward<A>(a));}
template<class A> decltype(auto) N(A&& a){return identity (std::forward<A>(a));}
}}
}
// Self-test: compiled only when this header is itself the translation unit.
// The `defined(...)` check keeps the tests out of includers on compilers that do not
// provide __INCLUDE_LEVEL__ (an undefined macro would otherwise evaluate to 0 here).
#if defined(__INCLUDE_LEVEL__) and not __INCLUDE_LEVEL__
#define BOOST_TEST_MODULE "C++ Unit Tests for Multi blas operations"
#define BOOST_TEST_DYN_LINK
#include<boost/test/unit_test.hpp>
#include "../../array.hpp"
using std::cout;
// Debug helper: prints a 2D array row by row followed by a "---" separator and
// returns the stream so further output can be chained.
template<class M> decltype(auto) print(M const& C){
	using boost::multi::size;
	for(int i = 0; i != size(C); ++i){
		for(int j = 0; j != size(C[i]); ++j) cout<< C[i][j] <<' ';
		cout<<std::endl;
	}
	return cout<<"---"<<std::endl;
}
namespace multi = boost::multi;
// Element-wise checks of hermitized/H, transposed/T and the operator spellings
// (`*` for conjugation, `~` for transposition) on a 3x2 complex matrix.
BOOST_AUTO_TEST_CASE(m){
	using complex = std::complex<double>; constexpr complex I{0., 1.};
	namespace blas = multi::blas;
	multi::array<complex, 2> const A = {
		{1. - 3.*I, 6. + 2.*I},
		{8. + 2.*I, 2. + 4.*I},
		{2. - 1.*I, 1. + 1.*I}
	};
	using blas::hermitized;
	BOOST_REQUIRE( hermitized(A)[0][1] == conj(A[1][0]) );
	static_assert( blas::is_conjugated<decltype(blas::H(A))>{}, "!" );
	BOOST_REQUIRE( blas::H(A)[0][1] == conj(A[1][0]) );
	using blas::transposed;
	BOOST_REQUIRE( transposed(A)[0][1] == A[1][0] );
	static_assert( not blas::is_conjugated<decltype(blas::T(A))>{}, "!" );
	BOOST_REQUIRE( blas::T(A)[0][1] == A[1][0] );
	using namespace blas::operators;
	// conjugation and transposition commute
	BOOST_REQUIRE( (*~A)[0][1] == conj(A[1][0]) );
	BOOST_REQUIRE( (~*A)[0][1] == conj(A[1][0]) );
	BOOST_REQUIRE( ( ~A)[0][1] == A[1][0] );
	BOOST_REQUIRE( ( *A)[0][1] == conj(A[0][1]) );
}
BOOST_AUTO_TEST_CASE(is_complex_array_test){
	static_assert(multi::blas::is_complex_array<multi::array<std::complex<double>, 2>>{}, "!");
}
#endif
#endif

View File

@ -0,0 +1,47 @@
// -*-indent-tabs-mode:t;c-basic-offset:4;tab-width:4;autowrap:nil;-*-
// © Alfredo A. Correa 2019-2020
#ifndef MULTI_ADAPTORS_BLAS_SCAL_HPP
#define MULTI_ADAPTORS_BLAS_SCAL_HPP
#include "../blas/core.hpp"
namespace boost{
namespace multi::blas{
// make the scal overloads declared in blas::core visible in this namespace
using core::scal;
// Scales `count` elements starting at iterator `first` by `a`.
// The iterator is unpacked into its base() and stride() for the underlying call;
// the trailing return type removes this overload when that call is ill-formed.
template<class A, class It, class Size>
auto scal_n(A const& a, It first, Size count)
->decltype(scal(count, &a, first.base(), first.stride()), void()){
scal(count, &a, first.base(), first.stride()); }
// Iterator-pair form: scales [first, last) by forwarding to scal_n with `last - first` elements.
template<class A, class It1D>
auto scal(A const& a, It1D first, It1D last)
->decltype(blas::scal_n(a, first, last - first)){
return blas::scal_n(a, first, last - first);}
// Range form: scales `x` in place through x.begin()/x.end() and returns `x` forwarded.
template<class A, class X1D> // don't do this: ", typename Elem = typename X1D::element_type>"
auto scal(A const& a, X1D&& x)
->decltype(blas::scal(a, x.begin(), x.end()), std::forward<X1D>(x)){
return blas::scal(a, x.begin(), x.end()), std::forward<X1D>(x);}
// Holds a scaling factor so that `x *= scal(alpha)` can be written;
// operator*= applies scal(alpha, x) and returns the scaled range.
template<class A>
class scal_range{
A alpha_;
public:
using scalar_type = A;
explicit scal_range(A const& alpha) : alpha_{alpha}{}
template<class X1D>
friend auto operator*=(X1D&& x, scal_range const& self)
->decltype(std::forward<X1D>(scal(std::declval<scalar_type const&>(), x))){
return std::forward<X1D>(scal(self.alpha_, x));}
};
// Single-argument form: wraps the factor in a scal_range (see operator*= above).
template<class A> auto scal(A const& a){return scal_range<A>{a};}
}
}
#endif

View File

@ -0,0 +1,51 @@
#ifdef COMPILATION// -*-indent-tabs-mode:t;c-basic-offset:4;tab-width:4;-*-
$CXXX $CXXFLAGS $0 -o $0.$X `pkg-config --libs blas` -lboost_unit_test_framework&&$0.$X&&rm $0.$X;exit
#endif
// © Alfredo A. Correa 2019-2020
#ifndef MULTI_ADAPTORS_BLAS_SIDE_HPP
#define MULTI_ADAPTORS_BLAS_SIDE_HPP
#include "../blas/core.hpp"
#include "../blas/operations.hpp"
#include "../../array_ref.hpp"
namespace boost{
namespace multi{
namespace blas{

//enum class SIDE : char{L='L', R='R'};

// Selects the left or right side; the enumerator values are the single-character
// codes 'L' and 'R'.
enum side : char{
	left  = 'L',
	right = 'R'//,
//	pre_multiply = 'R',
//	post_multiply = 'L'
};

// Returns the opposite side.
// `inline` is required because this function is defined in a header: without it every
// translation unit that includes the header emits its own external definition, which
// violates the one-definition rule and fails at link time with duplicate symbols.
inline side swap(side s){
	switch(s){
		case side::left : return side::right;
		case side::right: return side::left ;
	}
	__builtin_unreachable(); // only reached if `s` holds a value that is not an enumerator
}

}}}
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
#if defined(__INCLUDE_LEVEL__) and not __INCLUDE_LEVEL__
#define BOOST_TEST_MODULE "C++ Unit Tests for Multi BLAS adaptors side"
#define BOOST_TEST_DYN_LINK
#include<boost/test/unit_test.hpp>
// swap(side) must map each side to the other one, and applying it twice must give
// back the starting side; the enumerators must keep their character codes.
BOOST_AUTO_TEST_CASE(multi_adaptors_blas_side){
	namespace blas = boost::multi::blas;
	BOOST_REQUIRE( blas::swap(blas::side::left ) == blas::side::right );
	BOOST_REQUIRE( blas::swap(blas::side::right) == blas::side::left  );
	BOOST_REQUIRE( blas::swap(blas::swap(blas::side::left)) == blas::side::left );
	BOOST_REQUIRE( static_cast<char>(blas::side::left ) == 'L' );
	BOOST_REQUIRE( static_cast<char>(blas::side::right) == 'R' );
}
#endif
#endif

View File

@ -0,0 +1,60 @@
#ifdef COMPILATION// -*-indent-tabs-mode:t;c-basic-offset:4;tab-width:4;-*-
$CXX $0 -o $0x `pkg-config --libs blas` -lboost_unit_test_framework&&$0x&&rm $0x;exit
#endif
// © Alfredo A. Correa 2019-2020
#ifndef MULTI_ADAPTORS_BLAS_SWAP_HPP
#define MULTI_ADAPTORS_BLAS_SWAP_HPP
#include "../blas/core.hpp"
namespace boost{
namespace multi{
namespace blas{

// Iterator form: exchanges the elements of [first, last) with the equally long range
// that starts at `first2`, and returns the end of that second range.
// `first` and `last` delimit the same range and therefore share the type It1; the second
// range may use a different iterator type It2.
template<class It1, class It2>
It2 swap(It1 first, It1 last, It2 first2){
	assert(stride(first) == stride(last));
	using std::distance;
	auto d = distance(first, last);
	// 5-argument call (count, pointer, stride, pointer, stride).
	// NOTE(review): presumably resolves to the BLAS primitive declared in ../blas/core.hpp —
	// confirm it is visible from this namespace (sibling adaptors use `using core::...`).
	swap(d, base(first), stride(first), base(first2), stride(first2));
	return first2 + d;
}

// Range form: exchanges the contents of `x` and `y` and returns `y` forwarded.
// Both ranges must have the same size and an offset of zero.
template<class X1D, class Y1D>
Y1D&& swap(X1D&& x, Y1D&& y){
	assert( size(x) == size(y) );
	assert( offset(x) == 0 and offset(y) == 0 );
	swap( begin(x), end(x), begin(y) );
	return std::forward<Y1D>(y);
}

}}}
// Self-test: compiled only when this header is itself the translation unit.
// The `defined(...)` check keeps the tests out of includers on compilers that do not
// provide __INCLUDE_LEVEL__ (an undefined macro would otherwise evaluate to 0 here).
#if defined(__INCLUDE_LEVEL__) and not __INCLUDE_LEVEL__ // _TEST_MULTI_ADAPTORS_BLAS_SWAP
#define BOOST_TEST_MODULE "C++ Unit Tests for Multi BLAS swap"
#define BOOST_TEST_DYN_LINK
#include<boost/test/unit_test.hpp>
#include "../../array.hpp"
#include "../../utility.hpp"
#include "../blas/dot.hpp"
namespace multi = boost::multi;
// Swaps two strided views (columns 1 and 3, taken through `rotated`) and checks
// the effect on the first row of the owning array.
BOOST_AUTO_TEST_CASE(multi_blas_swap, *boost::unit_test::tolerance(0.00001) ){
	multi::array<double, 2> A = {
		{1.,  2.,  3.,  4.},
		{5.,  6.,  7.,  8.},
		{9., 10., 11., 12.}
	};
//	using multi::blas::swap;
	multi::blas::swap(rotated(A)[1], rotated(A)[3]); // can ambiguate with (friend) multi::swap
	BOOST_REQUIRE( A[0][1] == 4. );
	BOOST_REQUIRE( A[0][3] == 2. );
}
#endif
#endif

View File

@ -0,0 +1,444 @@
#ifdef COMPILATION// -*-indent-tabs-mode:t;c-basic-offset:4;tab-width:4;-*-
$CXX $0 -o $0x -lboost_unit_test_framework -lboost_timer \
`pkg-config --libs blas` \
`#-Wl,-rpath,/usr/local/Wolfram/Mathematica/12.0/SystemFiles/Libraries/Linux-x86-64 -L/usr/local/Wolfram/Mathematica/12.0/SystemFiles/Libraries/Linux-x86-64 -lmkl_intel_ilp64 -lmkl_intel_thread -lmkl_core -liomp5` \
&&$0x&&rm $0x;exit
#endif
// © Alfredo A. Correa 2019-2020
#ifndef MULTI_ADAPTORS_BLAS_SYRK_HPP
#define MULTI_ADAPTORS_BLAS_SYRK_HPP
#include "../blas/core.hpp"
#include "../blas/numeric.hpp"
#include "../blas/filling.hpp"
namespace boost{
namespace multi{namespace blas{

// make the low-level syrk overloads declared in blas::core visible in this namespace
using core::syrk;

// Symmetric rank-k update, c = alpha*a*a⸆ + beta*c, written only to the `c_side` triangle of `c`.
// The memory layouts of `a` and `c` (detected through their leading strides) select the
// arguments of the underlying call:
//  - stride(a)==1 -> operation 'N' with leading dimension stride(rotated(a)),
//    otherwise   -> operation 'T' with leading dimension stride(a);
//  - stride(c)==1 -> `c` is addressed through stride(rotated(c)) and the triangle is flipped,
//    otherwise   -> `c` is addressed through stride(c).
// The inner dimension k is size(rotated(a)) in every branch: it is a property of `a` and does
// not depend on how `c` is laid out (passing size(a), i.e. n, makes the call read a n-by-n
// block of `a` whenever n != k).
// Returns `c` forwarded.
template<typename AA, typename BB, class A2D, class C2D>
auto syrk(filling c_side, AA alpha, A2D const& a, BB beta, C2D&& c)
->decltype(syrk('\0', '\0', size(c), size(a), alpha, base(a), stride(rotated(a)), beta, base(c), stride(c)), std::forward<C2D>(c)){
	assert( size(c) == size(rotated(c)) ); // `c` must be square
	if(stride(a)==1){
		if(stride(c)==1){syrk(flip(c_side)==filling::upper?'L':'U', 'N', size(c), size(rotated(a)), alpha, base(a), stride(rotated(a)), beta, base(c), stride(rotated(c)));}
		else            {syrk(     c_side ==filling::upper?'L':'U', 'N', size(c), size(rotated(a)), alpha, base(a), stride(rotated(a)), beta, base(c), stride(        c ));}
	}else{
		if(stride(c)==1){syrk(flip(c_side)==filling::upper?'L':'U', 'T', size(c), size(rotated(a)), alpha, base(a), stride(        a ), beta, base(c), stride(rotated(c)));}
		else            {syrk(     c_side ==filling::upper?'L':'U', 'T', size(c), size(rotated(a)), alpha, base(a), stride(        a ), beta, base(c), stride(        c ));}
	}
	return std::forward<C2D>(c);
}

// Same as above with beta = 0.: the selected triangle of `c` is overwritten.
template<typename AA, class A2D, class C2D>
auto syrk(filling c_side, AA alpha, A2D const& a, C2D&& c)
->decltype(syrk(c_side, alpha, a, 0., std::forward<C2D>(c))){
	return syrk(c_side, alpha, a, 0., std::forward<C2D>(c));}

// Without a filling argument both triangles are computed: first lower, then upper.
template<typename AA, class A2D, class C2D>
auto syrk(AA alpha, A2D const& a, C2D&& c)
->decltype(syrk(filling::upper, alpha, a, syrk(filling::lower, alpha, a, std::forward<C2D>(c)))){
	return syrk(filling::upper, alpha, a, syrk(filling::lower, alpha, a, std::forward<C2D>(c)));}

// Value-returning form: allocates a size(a) x size(a) result of type Ret (by default
// A2D::decay_type) with a's allocator and fills both triangles.
template<typename AA, class A2D, class Ret = typename A2D::decay_type>
NODISCARD("because input argument is const") // this decay in the return type is important
auto syrk(AA alpha, A2D const& a)
->std::decay_t<decltype(syrk(alpha, a, Ret({size(a), size(a)}, get_allocator(a))))>{
	return syrk(alpha, a, Ret({size(a), size(a)}, get_allocator(a)));}

// Value-returning form with alpha = 1.
template<class A2D>
NODISCARD("")
auto syrk(A2D const& A)
->decltype(syrk(1., A)){
	return syrk(1., A);}

}}}
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
// Self-test: compiled only when this header is itself the translation unit.
// The `defined(...)` check keeps the tests out of includers on compilers that do not
// provide __INCLUDE_LEVEL__ (an undefined macro would otherwise evaluate to 0 here).
#if defined(__INCLUDE_LEVEL__) and not __INCLUDE_LEVEL__ // _TEST_MULTI_ADAPTORS_BLAS_SYRK
// these tests exercise the host BLAS adaptor (nothing here is cuBLAS-specific)
#define BOOST_TEST_MODULE "C++ Unit Tests for Multi BLAS syrk"
#define BOOST_TEST_DYN_LINK
#include<boost/test/unit_test.hpp>
#include "../blas/gemm.hpp"
#include "../../array.hpp"
#include "../../utility.hpp"
#include <boost/timer/timer.hpp>
#include<complex>
#include<cassert>
#include<iostream>
#include<numeric>
#include<algorithm>
//#include<catch.hpp>
using std::cout;
using std::cerr;
namespace multi = boost::multi;
// Debug helper: prints a 2D array row by row, followed by an empty line, and
// returns the stream so further output can be chained.
template<class M> decltype(auto) print(M const& C){
	using boost::multi::size;
	for(int i = 0; i != size(C); ++i){
		for(int j = 0; j != size(C[i]); ++j)
			std::cout << C[i][j] << ' ';
		std::cout << std::endl;
	}
	return std::cout << std::endl;
}
// Real syrk with explicit alpha and beta. Each sub-case checks one entry of the requested
// triangle and verifies that the mirrored entry still holds the 9999. fill value,
// i.e. that the other triangle was not written.
BOOST_AUTO_TEST_CASE(multi_blas_syrk_real){
multi::array<double, 2> const a = {
{ 1., 3., 4.},
{ 9., 7., 1.}
};
{
multi::array<double, 2> c({3, 3}, 9999.);
namespace blas = multi::blas;
using blas::filling;
using blas::transposed;
syrk(filling::lower, 1., transposed(a), 0., c); // c⸆=c=a⸆a=(a⸆a)⸆, `c` in lower triangular
BOOST_REQUIRE( c[2][1] == 19. );
BOOST_REQUIRE( c[1][2] == 9999. );
}
{
multi::array<double, 2> c({3, 3}, 9999.);
namespace blas = multi::blas;
using blas::filling;
using blas::transposed;
syrk(filling::upper, 1., transposed(a), 0., c); // c⸆=c=a⸆a=(a⸆a)⸆, `c` in upper triangular
BOOST_REQUIRE( c[1][2] == 19. );
BOOST_REQUIRE( c[2][1] == 9999. );
}
{
multi::array<double, 2> c({2, 2}, 9999.);
namespace blas = multi::blas;
using blas::filling;
using blas::syrk;
syrk(filling::lower, 1., a, 0., c); // c⸆=c=aa⸆=(aa⸆)⸆, `c` in lower triangular
BOOST_REQUIRE( c[1][0] == 34. );
BOOST_REQUIRE( c[0][1] == 9999. );
}
{
multi::array<double, 2> c({2, 2}, 9999.);
namespace blas = multi::blas;
using blas::filling;
syrk(filling::upper, 1., a, 0., c); // c⸆=c=aa⸆=(aa⸆)⸆, `c` in upper triangular
BOOST_REQUIRE( c[0][1] == 34. );
BOOST_REQUIRE( c[1][0] == 9999. );
}
// NOTE(review): this sub-case repeats the previous one verbatim
{
multi::array<double, 2> c({2, 2}, 9999.);
namespace blas = multi::blas;
using blas::filling;
syrk(filling::upper, 1., a, 0., c); // c⸆=c=aa⸆=(aa⸆)⸆, `c` in upper triangular
BOOST_REQUIRE( c[0][1] == 34. );
BOOST_REQUIRE( c[1][0] == 9999. );
}
}
// Single-row input, 1x1 result. The call is only required to complete;
// the value checks below are disabled.
BOOST_AUTO_TEST_CASE(multi_blas_syrk_real_special_case){
multi::array<double, 2> const a = {
{ 1., 3., 4.},
};
{
multi::array<double, 2> c({1, 1}, 9999.);
namespace blas = multi::blas;
using blas::filling;
syrk(filling::lower, 1., a, 0., c); // c⸆=c=aa⸆=(aa⸆)⸆, `c` in lower triangular
//BOOST_REQUIRE( c[1][0] == 34. );
//BOOST_REQUIRE( c[0][1] == 9999. );
}
}
// Complex element type holding purely real values: the result must match the real case.
BOOST_AUTO_TEST_CASE(multi_blas_syrk_complex_real_case){
using complex = std::complex<double>;
multi::array<complex, 2> const a = {
{ 1., 3., 4.},
{ 9., 7., 1.}
};
{
multi::array<complex, 2> c({3, 3}, 9999.);
namespace blas = multi::blas;
using blas::filling;
using blas::transposed;
syrk(filling::lower, 1., transposed(a), 0., c); // c⸆=c=a⸆a=(a⸆a)⸆, `c` in lower triangular
BOOST_REQUIRE( c[2][1] == 19. );
BOOST_REQUIRE( c[1][2] == 9999. );
}
}
// Complex syrk: the product uses the plain transpose (no conjugation), so the expected
// entries are complex. The mirrored entry must keep the 9999. fill value.
BOOST_AUTO_TEST_CASE(multi_blas_syrk_complex){
using complex = std::complex<double>;
constexpr auto const I = complex{0., 1.};
multi::array<complex, 2> const a = {
{ 1. + 3.*I, 3.- 2.*I, 4.+ 1.*I},
{ 9. + 1.*I, 7.- 8.*I, 1.- 3.*I}
};
{
multi::array<complex, 2> c({3, 3}, 9999.);
namespace blas = multi::blas;
using blas::filling;
using blas::transposed;
syrk(filling::lower, 1., transposed(a), 0., c); // c⸆=c=a⸆a=(a⸆a)⸆, `c` in lower triangular
BOOST_REQUIRE( c[2][1] == complex(-3., -34.) );
BOOST_REQUIRE( c[1][2] == 9999. );
}
{
multi::array<complex, 2> c({2, 2}, 9999.);
namespace blas = multi::blas;
using blas::filling;
syrk(filling::lower, 1., a, 0., c); // c⸆=c=aa⸆=(aa⸆)⸆, `c` in lower triangular
BOOST_REQUIRE( c[1][0] == complex(18., -21.) );
BOOST_REQUIRE( c[0][1] == 9999. );
}
{
multi::array<complex, 2> c({2, 2}, 9999.);
namespace blas = multi::blas;
using blas::filling;
syrk(filling::upper, 1., a, 0., c); // c⸆=c=aa⸆=(aa⸆)⸆, `c` in upper triangular
BOOST_REQUIRE( c[0][1] == complex(18., -21.) );
BOOST_REQUIRE( c[1][0] == 9999. );
}
}
// Complex syrk where the input is passed directly, through blas::transposed, and through
// rotated: the last two spellings must give the same result.
BOOST_AUTO_TEST_CASE(multi_blas_syrk_automatic_operation_complex){
using complex = std::complex<double>;
constexpr auto const I = complex{0., 1.};
multi::array<complex, 2> const a = {
{ 1. + 3.*I, 3.- 2.*I, 4.+ 1.*I},
{ 9. + 1.*I, 7.- 8.*I, 1.- 3.*I}
};
{
multi::array<complex, 2> c({2, 2}, 9999.);
using multi::blas::filling;
syrk(filling::lower, 1., a, 0., c); // c⸆=c=aa⸆=(aa⸆)⸆, `c` in lower triangular
BOOST_REQUIRE( c[1][0]==complex(18., -21.) );
BOOST_REQUIRE( c[0][1]==9999. );
}
{
multi::array<complex, 2> c({3, 3}, 9999.);
namespace blas = multi::blas;
using blas::filling;
using blas::transposed;
syrk(filling::lower, 1., transposed(a), 0., c); // c⸆=c=a⸆a=(a⸆a)⸆, `c` in lower triangular
BOOST_REQUIRE( c[2][1]==complex(-3.,-34.) );
BOOST_REQUIRE( c[1][2]==9999. );
}
{
multi::array<complex, 2> c({3, 3}, 9999.);
namespace blas = multi::blas;
using blas::filling;
using blas::transposed;
syrk(filling::lower, 1., rotated(a), 0., c); // c⸆=c=a⸆a=(a⸆a)⸆, `c` in lower triangular
BOOST_REQUIRE( c[2][1]==complex(-3.,-34.) );
BOOST_REQUIRE( c[1][2]==9999. );
}
}
// Real syrk over the combinations of input layout (a, rotated(a), transposed(a)),
// requested triangle, and output layout (c, transposed(c)).
BOOST_AUTO_TEST_CASE(multi_blas_syrk_automatic_operation_real){
multi::array<double, 2> const a = {
{ 1., 3., 4.},
{ 9., 7., 1.}
};
{
multi::array<double, 2> c({2, 2}, 9999.);
using multi::blas::filling;
syrk(filling::lower, 1., a, 0., c); // c⸆=c=aa⸆=(aa⸆)⸆, `c` in lower triangular
BOOST_REQUIRE( c[1][0] == 34. );
BOOST_REQUIRE( c[0][1] == 9999. );
}
{
multi::array<double, 2> c({2, 2}, 9999.);
using multi::blas::filling;
syrk(filling::upper, 1., a, 0., c); // c⸆=c=aa⸆=(aa⸆)⸆, `c` in upper triangular
BOOST_REQUIRE( c[0][1] == 34. );
BOOST_REQUIRE( c[1][0] == 9999. );
}
{
multi::array<double, 2> c({3, 3}, 9999.);
using multi::blas::filling;
syrk(filling::lower, 1., rotated(a), 0., c); // c⸆=c=a⸆a=(a⸆a)⸆, `c` in lower triangular
BOOST_REQUIRE( c[2][1] == 19. );
BOOST_REQUIRE( c[1][2] == 9999. );
}
{
multi::array<double, 2> c({3, 3}, 9999.);
namespace blas = multi::blas;
using blas::transposed;
using blas::filling;
syrk(filling::lower, 1., transposed(a), 0., c); // c⸆=c=a⸆a=(a⸆a)⸆, `c` in lower triangular
BOOST_REQUIRE( c[2][1] == 19. );
BOOST_REQUIRE( c[1][2] == 9999. );
}
{
multi::array<double, 2> c({3, 3}, 9999.);
namespace blas = multi::blas;
using blas::transposed;
using blas::filling;
syrk(filling::upper, 1., transposed(a), 0., c); // c⸆=c=a⸆a=(a⸆a)⸆, `c` in upper triangular
BOOST_REQUIRE( c[1][2] == 19. );
BOOST_REQUIRE( c[2][1] == 9999. );
}
{
multi::array<double, 2> c({2, 2}, 9999.);
using multi::blas::filling;
using multi::blas::transposed;
// the upper triangle of transposed(c) is the lower triangle of `c`, hence c[1][0] is written
syrk(filling::upper, 1., a, 0., transposed(c)); // c⸆=c=aa⸆=(aa⸆)⸆, upper triangle of `c⸆`
BOOST_REQUIRE( c[0][1] == 9999. );
BOOST_REQUIRE( c[1][0] == 34. );
}
}
// Overload without beta: the selected triangle is overwritten (the 9999. fill does not
// leak into the result) and the other triangle is left untouched.
BOOST_AUTO_TEST_CASE(multi_blas_syrk_automatic_implicit_zero){
multi::array<double, 2> const a = {
{ 1., 3., 4.},
{ 9., 7., 1.}
};
{
multi::array<double, 2> c({2, 2}, 9999.);
using multi::blas::filling;
syrk(filling::lower, 1., a, c); // c⸆=c=aa⸆=(aa⸆)⸆, `c` in lower triangular
BOOST_REQUIRE( c[1][0] == 34. );
BOOST_REQUIRE( c[0][1] == 9999. );
}
}
// Overloads without a filling argument: both triangles must be filled, so mirrored
// entries are equal. Also covers the value-returning forms syrk(alpha, a) and syrk(a).
BOOST_AUTO_TEST_CASE(multi_blas_syrk_automatic_symmetrization){
multi::array<double, 2> const a = {
{ 1., 3., 4.},
{ 9., 7., 1.}
};
{
multi::array<double, 2> c({2, 2}, 9999.);
using multi::blas::syrk;
using multi::blas::gemm;
using multi::blas::T;
syrk(1., a, c); // c⸆=c=aa⸆=(aa⸆)⸆
BOOST_REQUIRE( c[1][0] == 34. );
BOOST_REQUIRE( c[0][1] == 34. );
// cross-check against the general matrix product a*a⸆
BOOST_REQUIRE( syrk(a) == gemm(a, T(a)) );
}
{
using multi::blas::syrk;
multi::array<double, 2> c = syrk(1., a); // c⸆=c=aa⸆=(aa⸆)⸆
BOOST_REQUIRE( c[1][0] == 34. );
BOOST_REQUIRE( c[0][1] == 34. );
}
{
using multi::blas::syrk;
multi::array<double, 2> c = syrk(a); // c⸆=c=aa⸆=(aa⸆)⸆
BOOST_REQUIRE( c[1][0] == 34. );
BOOST_REQUIRE( c[0][1] == 34. );
}
{
using multi::blas::transposed;
using multi::blas::syrk;
multi::array<double, 2> c = syrk(transposed(a)); // c⸆=c=a⸆a=(a⸆a)⸆
BOOST_REQUIRE( c[2][1] == 19. );
BOOST_REQUIRE( c[1][2] == 19. );
}
}
#if 0
}
}
#if 0
{
{
multi::array<complex, 2> C({2, 2}, 9999.);
syrk(1., rotated(A), rotated(C)); // C^T=C=A*A^T=(A*A^T)^T
assert( C[1][0] == complex(18., -21.) );
}
{
multi::array<complex, 2> C({2, 2}, 9999.);
syrk(rotated(A), rotated(C)); // C^T=C=A*A^T=(A*A^T)^T
assert( C[1][0] == complex(18., -21.) );
}
{
complex C[2][2];
using multi::rotated;
syrk(rotated(A), rotated(C)); // C^T=C=A*A^T=(A*A^T)^T
assert( C[1][0] == complex(18., -21.) );
}
{
auto C = syrk(1., A); // C = C^T = A^T*A, C is a value type matrix (with C-ordering, information is everywhere)
assert( C[1][2]==complex(-3.,-34.) );
}
{
// what(rotated(syrk(A)));
multi::array C = rotated(syrk(A)); // C = C^T = A^T*A, C is a value type matrix (with C-ordering, information is in upper triangular part)
print(C) <<"---\n";
}
}
#if 0
{
multi::array<complex, 2> const A = {
{ 1. + 3.*I, 3.- 2.*I, 4.+ 1.*I},
{ 9. + 1.*I, 7.- 8.*I, 1.- 3.*I}
};
auto C = rotated(syrk(A)).decay(); // C = C^T = A^T*A, C is a value type matrix (with C-ordering, information is in upper triangular part)
print(C) <<"---\n";
// print(C) <<"---\n";
}
return 0;
{
multi::array<complex, 2> const A = {
{ 1. + 3.*I, 3.- 2.*I, 4.+ 1.*I},
{ 9. + 1.*I, 7.- 8.*I, 1.- 3.*I}
};
auto C = syrk(rotated(A)); // C = C^T = A^T*A, C is a value type matrix (with C-ordering)
print(C) <<"---\n";
}
#endif
#endif
}
BOOST_AUTO_TEST_CASE(multi_blas_syrk_herk_fallback){
multi::array<double, 2> const a = {
{ 1., 3., 4.},
{ 9., 7., 1.}
};
{
multi::array<double, 2> c({2, 2}, 9999.);
namespace blas = multi::blas;
using blas::filling;
syrk(filling::lower, 1., a, 0., c); // c⸆=c=a⸆a=(a⸆a)⸆, `c` in lower triangular
BOOST_REQUIRE( c[1][0] == 34. );
BOOST_REQUIRE( c[0][1] == 9999. );
}
}
#endif
#endif
#endif

View File

@ -0,0 +1,95 @@
# -*-indent-tabs-mode:nil;c-basic-offset:2;tab-width:4;autowrap:nil;-*-
#[=[Multi Test suite can be run like this:
mkdir -p build
cd build
cmake .. [-DENABLE_CUDA=1]
make -j
ctest -j --output-on-failure [-T memcheck]
exit
#]=]
# target_link_directories(), used for every test below, was introduced in CMake 3.13
cmake_minimum_required(VERSION 3.13)

set(CMAKE_VERBOSE_MAKEFILE ON)

project(boost-multi-adaptors-blas-test VERSION 0.1 LANGUAGES CXX)

find_package(Boost REQUIRED COMPONENTS unit_test_framework)

# BLAS library and, when it can be located, the directory that holds cblas.h
find_package(BLAS REQUIRED)
find_path(BLAS_INCLUDE_DIRS cblas.h
  /usr/include
  /usr/local/include
  $ENV{BLAS_HOME}/include)

link_libraries(${BLAS_LIBRARIES})
# include_directories() is a directory-scope command: it takes directories only
# (no target name, no PRIVATE keyword). Skip it when cblas.h was not found so that
# a *-NOTFOUND value never ends up on the include path.
if(BLAS_INCLUDE_DIRS)
  include_directories(${BLAS_INCLUDE_DIRS})
endif()

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)

# The tests are compiled as CUDA sources when either switch is present
if(ENABLE_CUDA OR DEFINED CXXCUDA)
  enable_language(CUDA)
  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr -Xcudafe \"--diag_suppress=implicit_return_from_non_void_function\"")
endif()

# CUDA_FOUND is also written into config.hpp (see configure_file below)
find_package(CUDA QUIET)
if (CUDA_FOUND)
  message("CUDA found")
  include_directories(${CUDA_INCLUDE_DIRS})
else()
  message("CUDA not found")
endif()

enable_testing()
list(APPEND CMAKE_CTEST_ARGUMENTS "--output-on-failure") # needs cmake 3.17
include(CTest)

# generated header consumed by the tests through #include "config.hpp"
configure_file("config.hpp.in" ${CMAKE_BINARY_DIR}/config.hpp)
include_directories(${CMAKE_BINARY_DIR})

#file(GLOB TEST_SRCS RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} *.cpp)
set(TEST_SRCS
  axpy.cpp
  copy.cpp
  dot.cpp
  herk.cpp
  gemv.cpp
  gemm.cpp
  numeric.cpp
  scal.cpp
  traits.cpp
  trsm.cpp
)

# one executable and one CTest entry per source file
foreach(TEST_FILE ${TEST_SRCS})
  set(TEST_EXE "${TEST_FILE}.x")
  add_executable            (${TEST_EXE} ${TEST_FILE})
  if(ENABLE_CUDA OR DEFINED CXXCUDA)
    set_source_files_properties(${TEST_FILE} PROPERTIES LANGUAGE CUDA)
    target_compile_options  (${TEST_EXE} PRIVATE -std=c++17)
  endif()
# target_compile_features   (${TEST_EXE} PUBLIC cxx_std_17)
  target_compile_definitions(${TEST_EXE} PRIVATE "BOOST_PP_VARIADICS")
  target_compile_definitions(${TEST_EXE} PRIVATE ${Boost_DEFINITIONS})
  target_include_directories(${TEST_EXE} PRIVATE ${Boost_INCLUDE_DIRS})
  target_link_libraries     (${TEST_EXE} PRIVATE ${Boost_LIBRARIES})
  target_link_directories   (${TEST_EXE} PRIVATE ${Boost_LIBRARY_DIRS})
  # Host-compiler warning flags: applied only when the sources are NOT compiled as CUDA,
  # i.e. under the exact negation of the condition used above to switch the language.
  if(NOT (ENABLE_CUDA OR DEFINED CXXCUDA))
    target_compile_options  (${TEST_EXE} PRIVATE
      -Werror -Wall -Wextra -fno-common
      $<$<CXX_COMPILER_ID:GNU>:
        -Wpedantic -Wformat-truncation -fstack-usage>#-Wconversion
      $<$<OR:$<CXX_COMPILER_ID:Clang>,$<CXX_COMPILER_ID:AppleClang>>:
        -Wpedantic -Wmove>
      $<$<CXX_COMPILER_ID:Intel>:
        -wd161 -diag-disable=remark -Warray-bounds -Wchar-subscripts -Wcomment -Wenum-compare -Wformat -Wuninitialized -Wmaybe-uninitialized -Wmain -Wnarrowing -Wnonnull -Wparentheses -Wpointer-sign -Wreorder -Wno-return-type -Wsign-compare -Wsequence-point -Wtrigraphs -Wunused-function -Wunused-but-set-variable -Wunused-variable -Wwrite-strings -Werror -diag-error:3846
      >
      $<$<CXX_COMPILER_ID:MSVC>:
        /W4>)
  endif()
  # naming the target lets CMake substitute the built executable's real location
  # (a relative ./path breaks with multi-configuration generators)
  add_test(NAME ${TEST_EXE} COMMAND ${TEST_EXE})
endforeach()

View File

@ -0,0 +1,78 @@
#ifdef COMPILATION// -*-indent-tabs-mode:t;c-basic-offset:4;tab-width:4;-*-
$CXX $0 -o $0x `pkg-config --libs blas` -lcudart -lcublas -lboost_unit_test_framework&&$0x&&rm $0x;exit
#endif
// © Alfredo A. Correa 2019-2020
#define BOOST_TEST_MODULE "C++ Unit Tests for Multi BLAS asum"
#define BOOST_TEST_DYN_LINK
#include<boost/test/unit_test.hpp>
#include "../../blas/asum.hpp"
#include "../../blas/cuda.hpp"
#include "../../../array.hpp"
#include "../../../adaptors/cuda.hpp"
#include<complex>
#include<numeric>
using std::cout;
namespace multi = boost::multi;
// asum over one row of a real matrix must equal the sum of absolute values
// accumulated with the standard library.
BOOST_AUTO_TEST_CASE(multi_blas_asum_double){
multi::array<double, 2> const A = {
{1., 2., 3., 4.},
{5., 6., 7., 8.},
{9., 10., 11., 12.}
};
using multi::blas::asum;
BOOST_REQUIRE(asum(A[1]) == std::accumulate(begin(A[1]), end(A[1]), 0., [](auto&& a, auto&& b){return a + std::abs(b);}));
}
// For complex elements the reference value is the sum of |real| + |imag| per element
// (not the sum of complex moduli).
BOOST_AUTO_TEST_CASE(multi_blas_asum_complex){
using Z = std::complex<double>; Z const I{0, 1};
multi::array<Z, 2> const A = {
{1. + 2.*I, 2., 3., 4.},
{5., 6. + 3.*I, 7., 8.},
{9., 10., 11.+ 4.*I, 12.}
};
using multi::blas::asum;
BOOST_REQUIRE(asum(A[1]) == std::accumulate(begin(A[1]), end(A[1]), 0., [](auto&& a, auto&& b){return a + std::abs(real(b)) + std::abs(imag(b));}));
}
// Same check on a multi::cuda::array; expected value 5 + 6 + 7 + 8 = 26.
BOOST_AUTO_TEST_CASE(multi_blas_asum_double_cuda){
multi::cuda::array<double, 2> const A = {
{1., 2., 3., 4.},
{5., 6., 7., 8.},
{9., 10., 11., 12.}
};
using multi::blas::asum;
BOOST_REQUIRE(asum(A[1]) == 26 );
}
// file-scope complex alias and imaginary unit used by the test case below
using complex = std::complex<double>; constexpr complex I{0, 1};
// Complex asum on a multi::cuda::array, for a whole row and for an explicit {0, 4}
// range of it; expected value 5 + (6 + 3) + 7 + 8 = 29.
BOOST_AUTO_TEST_CASE(multi_blas_asum_complex_cuda){
namespace blas = multi::blas;
multi::cuda::array<complex, 2> const A = {
{1. + 2.*I, 2., 3., 4.},
{5., 6. + 3.*I, 7., 8.},
{9., 10., 11.+ 4.*I, 12.}
};
BOOST_REQUIRE( blas::asum(A[1]) == 29. );
BOOST_REQUIRE( blas::asum(A[1]({0, 4})) == 29. );
}
// Same as the previous case but on a non-const array, comparing against a complex value.
// The local `I` shadows the file-scope one.
BOOST_AUTO_TEST_CASE(multi_blas_asum_complex_cuda_mutable){
using Z = std::complex<double>; Z const I{0, 1};
multi::cuda::array<Z, 2> A = {
{1. + 2.*I, 2., 3., 4.},
{5., 6. + 3.*I, 7., 8.},
{9., 10., 11.+ 4.*I, 12.}
};
using multi::blas::asum;
BOOST_REQUIRE( asum(A[1]) == Z{29.} );
BOOST_REQUIRE( asum(A[1]({0, 4})) == Z{29.} );
}

View File

@ -0,0 +1,150 @@
// -*-indent-tabs-mode:t;c-basic-offset:4;tab-width:4;-*-
// © Alfredo A. Correa 2019-2020
#define BOOST_TEST_MODULE "C++ Unit Tests for Multi BLAS axpy"
#define BOOST_TEST_DYN_LINK
#include<boost/test/unit_test.hpp>
#include "config.hpp"
#include "../../../array.hpp"
#include "../../blas.hpp"
#include<complex>
namespace multi = boost::multi;
namespace blas = multi::blas;
// axpy on a row of a real matrix: A[1] += 2*B, checked against a copy taken before the call.
BOOST_AUTO_TEST_CASE(multi_blas_axpy_real){
multi::array<double, 2> A = {
{1., 2., 3., 4.},
{5., 6., 7., 8.},
{9., 10., 11., 12.}
};
auto const AC = A;
multi::array<double, 1> const B = A[2];
blas::axpy(2., B, A[1]); // daxpy
BOOST_REQUIRE( A[1][2] == 2.*B[2] + AC[1][2] );
}
// Two parts: (1) real axpy as in the previous case; (2) axpy applied separately to the
// real and imaginary parts of complex arrays (+1 on real, -1 on imag, starting from zero),
// which must produce the complex conjugate.
BOOST_AUTO_TEST_CASE(multi_blas_axpy_double){
multi::array<double, 2> const cA = {
{1., 2., 3., 4.},
{5., 6., 7., 8.},
{9., 10., 11., 12.}
};
multi::array<double, 2> A = cA;
multi::array<double, 1> const b = cA[2];
blas::axpy(2., b, A[1]); // A[1] = 2*b + A[1], A[1]+= a*A[1]
BOOST_REQUIRE( A[1][2] == 2.*b[2] + cA[1][2] );
using complex = std::complex<double>; complex const I = {0, 1};
multi::array<complex, 1> AC = {1. + 2.*I, 3. + 4.*I, 4. - 8.*I};
multi::array<complex, 1> BC(size(AC), complex{0.});
blas::axpy(+1., blas::real(AC), blas::real(BC));
blas::axpy(-1., blas::imag(AC), blas::imag(BC));
BOOST_REQUIRE( BC[2] == std::conj(AC[2]) );
}
// axpy with complex elements and a real (double) scale factor.
BOOST_AUTO_TEST_CASE(multi_blas_axpy_complex){
{
using complex = std::complex<double>;
multi::array<complex, 2> A = {
{1., 2., 3., 4.},
{5., 6., 7., 8.},
{9., 10., 11., 12.}
};
auto const AC = A;
multi::array<complex, 1> const B = A[2];
blas::axpy(2., B, A[1]); // zaxpy (2. is promoted to 2+I*0 internally and automatically)
BOOST_REQUIRE( A[1][2] == 2.*B[2] + AC[1][2] );
}
}
// Two-argument axpy(alpha, x) used as the right-hand side of `+=`: A[1] += 2*B.
BOOST_AUTO_TEST_CASE(multi_blas_axpy_complex_as_operator_plus_equal){
using complex = std::complex<double>;
multi::array<complex, 2> A = {
{1., 2., 3., 4.},
{5., 6., 7., 8.},
{9., 10., 11., 12.}
};
auto const AC = A;
multi::array<complex, 1> const B = A[2];
A[1] += blas::axpy(2., B); // zaxpy (2. is promoted to 2+I*0 internally and automatically)
BOOST_REQUIRE( A[1][2] == 2.*B[2] + AC[1][2] );
}
// Two-argument axpy(alpha, x) used as the right-hand side of `-=`: A[1] -= 2*B.
BOOST_AUTO_TEST_CASE(multi_blas_axpy_complex_as_operator_minus_equal){
using complex = std::complex<double>;
multi::array<complex, 2> A = {
{1., 2., 3., 4.},
{5., 6., 7., 8.},
{9., 10., 11., 12.}
};
auto const AC = A;
multi::array<complex, 1> const B = A[2];
A[1] -= blas::axpy(2., B); // zaxpy (2. is promoted to 2+I*0 internally and automatically)
BOOST_REQUIRE( A[1][2] == -2.*B[2] + AC[1][2] );
}
// Overload taking an explicit blas::context as first argument; same expected result
// as the context-free call.
BOOST_AUTO_TEST_CASE(multi_blas_axpy_complex_context){
using complex = std::complex<double>;
multi::array<complex, 2> A = {
{1., 2., 3., 4.},
{5., 6., 7., 8.},
{9., 10., 11., 12.}
};
auto const AC = A;
multi::array<complex, 1> const B = A[2];
blas::axpy(blas::context{}, 2., B, A[1]); // zaxpy (2. is promoted to 2+I*0 internally and automatically)
BOOST_REQUIRE( A[1][2] == 2.*B[2] + AC[1][2] );
}
// Operator spellings from blas::operators: binary `-` and `+` (including nested
// expressions and a matrix row as operand) and in-place `-=`.
BOOST_AUTO_TEST_CASE(multi_blas_axpy_operator_minus){
using complex = std::complex<double>;
multi::array<complex, 1> x = {10., 11., 12., 13.};
multi::array<complex, 1> y = x;
using blas::operators::operator-;
using blas::operators::operator+;
using blas::operators::operator-=;
BOOST_REQUIRE( (x - y)[0] == 0. );
BOOST_REQUIRE( (y - x)[0] == 0. );
BOOST_REQUIRE( (x - (y+y))[0] == -x[0] );
BOOST_REQUIRE( ((x+x) - y)[0] == +x[0] );
multi::array<complex, 2> A = {{1., 2.}, {3., 4.}};
multi::array<complex, 1> B = {1., 2.};
BOOST_REQUIRE( (A[0] - B)[0] == 0. );
BOOST_REQUIRE( (A[0] - B)[1] == 0. );
multi::array<complex, 1> X = {10., 11., 12., 13.};
multi::array<complex, 1> Y = {10., 11., 12., 13.};
X -= Y;
BOOST_REQUIRE( X[0] == 0. );
}
#if CUDA_FOUND
#include<thrust/complex.h>
// Repeats the three-argument axpy check with thrust::complex<double> as the element type.
// Only compiled when the surrounding `#if CUDA_FOUND` is true.
BOOST_AUTO_TEST_CASE(multi_blas_axpy_complex_thrust){
{
using complex = thrust::complex<double>;
multi::array<complex, 2> A = {
{1., 2., 3., 4.},
{5., 6., 7., 8.},
{9., 10., 11., 12.}
};
auto const AC = A; // snapshot of A taken before the update
multi::array<complex, 1> const B = A[2]; // separate 1D array initialised from row 2
blas::axpy(2., B, A[1]); // zaxpy (2. is promoted to 2+I*0 internally and automatically)
BOOST_REQUIRE( A[1][2] == 2.*B[2] + AC[1][2] ); // only element [1][2] is verified
}
}
#endif

View File

@ -0,0 +1,7 @@
// Template for the tests' config.hpp; the include guard prevents double definition.
#ifndef MULTI_ADAPTORS_BLAS_TEST_CONFIG_HPP
#define MULTI_ADAPTORS_BLAS_TEST_CONFIG_HPP
// CMake's configure_file() rewrites the next line as `#define CUDA_FOUND 0` or `#define CUDA_FOUND 1`,
// which the test sources consume through `#if CUDA_FOUND`.
#cmakedefine01 CUDA_FOUND
#endif

View File

@ -0,0 +1,159 @@
// -*-indent-tabs-mode:t;c-basic-offset:4;tab-width:4;-*-
#include "../../blas.hpp"
#include "../../../array.hpp"
#include<complex>
#include "config.hpp"
#define BOOST_TEST_MODULE "C++ Unit Tests for Multi BLAS copy"
#define BOOST_TEST_DYN_LINK
#include<boost/test/unit_test.hpp>
namespace multi = boost::multi;
namespace blas = multi::blas;
// Counted-range form: copies A.size() elements starting at A.begin() into B and
// checks that B then compares equal to A.
BOOST_AUTO_TEST_CASE(multi_blas_copy_n){
multi::array<double, 1> const A = {1., 2., 3., 4.};
multi::array<double, 1> B = {5., 6., 7., 8.}; // same size as A, different contents
blas::copy_n(A.begin(), A.size(), B.begin());
BOOST_REQUIRE( B == A );
}
// Iterator-pair form: copies [A.begin(), A.end()) into B and checks that B then compares equal to A.
BOOST_AUTO_TEST_CASE(multi_blas_copy_it){
multi::array<double, 1> const A = {1., 2., 3., 4.};
multi::array<double, 1> B = {5., 6., 7., 8.}; // same size as A, different contents
blas::copy(A.begin(), A.end(), B.begin());
BOOST_REQUIRE( B == A );
}
// Whole-array forms of blas::copy: the two-argument (source, destination) call and
// the one-argument call whose result is assigned to an existing array.
BOOST_AUTO_TEST_CASE(multi_blas_copy){
multi::array<double, 1> const A = {1., 2., 3., 4.};
{
// two-argument form writes into B
multi::array<double, 1> B = {5., 6., 7., 8.};
blas::copy(A, B); // segmentation fault in clang-11
BOOST_REQUIRE( B == A );
}
{
// one-argument form: the result of blas::copy(A) is assigned to B
multi::array<double, 1> B = {5., 6., 7., 8.};
BOOST_REQUIRE( size(B) == size(A) );
B = blas::copy(A);
BOOST_REQUIRE( B == A );
}
}
// Copies between rows of one real 2D array (whole rows and explicit index ranges) and
// builds a new 1D array from a row of the rotated view; spot-checks single elements.
BOOST_AUTO_TEST_CASE(multi_adaptors_blas_test_copy_real){
namespace blas = multi::blas;
multi::array<double, 2> A = {
{1., 2., 3., 4.},
{5., 6., 7., 8.},
{9., 10., 11., 12.}
};
BOOST_REQUIRE( A[0][2] == 3. );
BOOST_REQUIRE( A[2][2] == 11. );
blas::copy(A[0], A[2]); // row 0 -> row 2
BOOST_REQUIRE( A[0][2] == 3. ); // source unchanged
BOOST_REQUIRE( A[2][2] == 3. ); // destination overwritten
// multi::blas::copy(begin(A[1]), end(A[1]), begin(A[2])); // dcopy
blas::copy( A[1]({0, size(A[1])}), A[2]({0, size(A[1])}) ); // row 1 -> row 2 through index-range views spanning the full row
BOOST_REQUIRE( A[1][3] == 8. );
BOOST_REQUIRE( A[2][3] == 8. );
multi::array<double, 1> AR3 = blas::copy(rotated(A)[3]); // dcopy
BOOST_REQUIRE( AR3[1] == A[1][3] ); // element 1 of rotated(A)[3] is expected to be A[1][3]
}
// Copies element 0 of the rotated view of a 3x3 array into a freshly sized 1D array
// and checks the two compare equal.
BOOST_AUTO_TEST_CASE(multi_blas_copy_row){
multi::array<double, 2> const A = {
{1., 2., 3.},
{4., 5., 6.},
{7., 8., 9.}
};
multi::array<double, 1> B(3); // destination sized to match; contents are overwritten by the copy
blas::copy(rotated(A)[0], B);
BOOST_REQUIRE( B == rotated(A)[0] );
}
// Copies the first row of a complex 2D array onto its last row and verifies that
// the source element is untouched AND that the destination element received the value.
BOOST_AUTO_TEST_CASE(multi_adaptors_blas_test_copy_complex){
	using complex = std::complex<double>; constexpr complex I{0, 1};
	multi::array<complex, 2> A = {
		{1. + 3.*I,  2. + 4.*I,  3. + 5.*I,  4. + 6.*I},
		{5.,  6.,  7.,  8.},
		{9., 10., 11., 12.}
	};
	blas::copy(A[0], A[2]); // row 0 -> row 2
	BOOST_REQUIRE( A[0][2] == 3. + 5.*I ); // source row is left unchanged
	// Without this check the test could not detect a copy that does nothing.
	BOOST_REQUIRE( A[2][2] == 3. + 5.*I ); // destination row now holds the copied value
}
// Same two whole-array copy forms as the context-free test, but passing an explicit
// blas::context object as the first argument.
BOOST_AUTO_TEST_CASE(multi_blas_copy_context){
multi::array<double, 1> const A = {1., 2., 3., 4.};
blas::context ctx;
{
// (context, source, destination) form
multi::array<double, 1> B = {5., 6., 7., 8.};
blas::copy(ctx, A, B);
BOOST_REQUIRE( A == B );
}
{
// (context, source) form whose result is assigned to B
multi::array<double, 1> B = {5., 6., 7., 8.};
BOOST_REQUIRE( size(B) == size(A) );
B = blas::copy(ctx, A);
BOOST_REQUIRE( A == B );
}
}
#if CUDA_FOUND
#include<thrust/complex.h>
// Copies between two 10-element arrays of thrust::complex<double>; the source is filled with
// value-initialised elements. Only compiled when the surrounding `#if CUDA_FOUND` is true.
BOOST_AUTO_TEST_CASE(multi_adaptors_blas_copy_thrust){
multi::array<thrust::complex<double>, 1> const a(10, thrust::complex<double>{});
multi::array<thrust::complex<double>, 1> b(10);
blas::copy(a, b);
BOOST_REQUIRE( a == b );
}
// Copies from an array of std::complex<double> into an array of thrust::complex<double>.
// The static_asserts document that the two element types convert to each other in both directions.
// NOTE(review): "text" in the test name looks like a typo for "test" — confirm before renaming,
// since the name may be used in --run_test filters.
BOOST_AUTO_TEST_CASE(multi_adaptors_blas_text_copy_interop){
static_assert( std::is_convertible<std::complex<double>, thrust::complex<double>>{} );
static_assert( std::is_convertible<thrust::complex<double>, std::complex<double>>{} );
multi::array<std::complex<double>, 1> a(10, std::complex<double>{});
multi::array<thrust::complex<double>, 1> b(10);
blas::copy(a, b);
BOOST_REQUIRE( a == b ); // compares arrays with different element types
}
#endif
//BOOST_AUTO_TEST_CASE(multi_adaptors_blas_test_copy_cuda_complex){
// namespace cuda = multi::cuda;
// cuda::array<complex, 2> A = {
// {1. + 3.*I, 2. + 4.*I, 3. + 5.*I, 4. + 6.*I},
// {5., 6., 7., 8.},
// {9., 10., 11., 12.}
// };
// blas::copy(A[0], A[2]);
// BOOST_REQUIRE( A[0][2] == 3. + 5.*I );
// BOOST_REQUIRE( A[2][2] == 3. + 5.*I );
//}
//BOOST_AUTO_TEST_CASE(multi_adaptors_blas_test_copy_cuda_managed_complex){
// namespace cuda = multi::cuda;
// namespace blas = multi::blas;
// cuda::managed::array<complex, 2> A = {
// {1. + 3.*I, 2. + 4.*I, 3. + 5.*I, 4. + 6.*I},
// {5., 6., 7., 8.},
// {9., 10., 11., 12.}
// };
// blas::copy(A[0], A[2]);
// BOOST_REQUIRE( A[0][2] == 3. + 5.*I );
// BOOST_REQUIRE( A[2][2] == 3. + 5.*I );
//}

View File

@ -0,0 +1,407 @@
// -*-indent-tabs-mode:t;c-basic-offset:4;tab-width:4;-*-
// © Alfredo A. Correa 2019-2021
#define BOOST_TEST_MODULE "C++ Unit Tests for Multi BLAS dot"
#define BOOST_TEST_DYN_LINK
#include<boost/test/unit_test.hpp>
#include "config.hpp"
#include "../../blas/dot.hpp"
#include "../../../array.hpp"
//#include "../../blas/cuda.hpp"
//#include "../../../adaptors/cuda.hpp"
#include<cassert>
#include<complex>
#include<numeric>
#include<type_traits>
namespace multi = boost::multi;
namespace blas = multi::blas;
// Dot product of two float arrays with an explicit context, passed by address.
// The result is compared against std::inner_product with a float accumulator.
BOOST_AUTO_TEST_CASE(blas_dot_context){
multi::array<float, 1> const A = {1.,2.,3.};
multi::array<float, 1> const B = {1.,2.,3.};
blas::context ctxt;
auto C = +blas::dot(&ctxt, A, B); // unary + is applied to the returned object before it is stored
BOOST_REQUIRE( C == std::inner_product(begin(A), end(A), begin(B), 0.F) );
}
// Dot product of two float arrays without a context argument.
// The result is compared against std::inner_product with a float accumulator.
BOOST_AUTO_TEST_CASE(blas_dot_no_context){
multi::array<float, 1> const A = {1.,2.,3.};
multi::array<float, 1> const B = {1.,2.,3.};
auto C = +blas::dot(A, B); // unary + is applied to the returned object before it is stored
BOOST_REQUIRE( C == std::inner_product(begin(A), end(A), begin(B), 0.F) );
}
// Out-parameter form of blas::dot for float arrays: the result is written into C.
BOOST_AUTO_TEST_CASE(blas_dot_no_context_out_param){
multi::array<float, 1> const A = {1.,2.,3.};
multi::array<float, 1> const B = {1.,2.,3.};
float C = NAN; // sentinel: the comparison below fails if dot leaves C untouched
blas::dot(A, B, C);
BOOST_REQUIRE( C == std::inner_product(begin(A), end(A), begin(B), 0.F) );
}
// Out-parameter form of blas::dot for complex arrays: the result is written into C.
// Plain blas::dot(A, B, ...) applies no conjugation (the conjugated variants go through blas::C),
// so the reference value is the plain, unconjugated std::inner_product.
BOOST_AUTO_TEST_CASE(blas_dot_no_context_out_param_complex){
	using complex = std::complex<double>;
	multi::array<complex, 1> const A = {1.,2.,3.};
	multi::array<complex, 1> const B = {1.,2.,3.};
	complex C;
	blas::dot(A, B, C);
	// Reference matches the operation under test: sum of a*b with no conjugate.
	BOOST_REQUIRE( C == std::inner_product(begin(A), end(A), begin(B), complex{0.}) );
}
// Out-parameter dot with the first operand wrapped in blas::C; the reference value
// is the sum of conj(a)*b over the two arrays.
BOOST_AUTO_TEST_CASE(blas_dot_no_context_out_param_complex_C){
using complex = std::complex<double>; complex const I{0., 1.};
multi::array<complex, 1> const A = {1.,2., 3.};
multi::array<complex, 1> const B = {1.,2. + 2.*I, 3.};
complex C;
blas::dot(blas::C(A), B, C);
BOOST_REQUIRE( C == std::inner_product(begin(A), end(A), begin(B), complex{0.}, std::plus<>{}, [](auto const& a, auto const& b){return conj(a)*b;}) );
}
#if defined(CUDA_FOUND) and CUDA_FOUND
#include<thrust/complex.h>
// thrust::complex<double> variant of the blas::C(A)-dot-B check; the reference value
// is the sum of conj(a)*b. Only compiled when CUDA_FOUND is defined and non-zero.
BOOST_AUTO_TEST_CASE(blas_dot_no_context_out_param_complex_C_thrust){
using complex = thrust::complex<double>; complex const I{0., 1.};
multi::array<complex, 1> const A = {1.,2., 3.};
multi::array<complex, 1> const B = {1.,2. + 2.*I, 3.};
complex C;
blas::dot(blas::C(A), B, C);
BOOST_REQUIRE( C == std::inner_product(begin(A), end(A), begin(B), complex{0.}, std::plus<>{}, [](auto& a, auto& b){return conj(a)*b;}) );
}
#endif
// Dot product of two rows of a constant 2D array, through the low-level dot_n
// (iterator, count, iterator, result pointer) interface and through blas::dot.
BOOST_AUTO_TEST_CASE(multi_blas_dot_strided){
	multi::array<double, 2> const CA = {
		{1.,  2.,  3.,  4.},
		{5.,  6.,  7.,  8.},
		{9., 10., 11., 12.}
	};
	double d = std::numeric_limits<double>::quiet_NaN(); // sentinel: must be overwritten by dot_n
	blas::dot_n(begin(CA[1]), size(CA[1]), begin(CA[2]), &d);
	// std::inner_product takes (first1, last1, first2, init). The previous argument order
	// (first1, first2, last1) only worked because consecutive rows are adjacent in memory.
	BOOST_REQUIRE( d == std::inner_product(begin(CA[1]), end(CA[1]), begin(CA[2]), 0.) );
	double d2 = blas::dot(CA[1], CA[2]); // result converted to double on initialisation
	BOOST_REQUIRE( d == d2 );
}
// Same as the context-free strided test, but dot_n receives an explicit context by address.
BOOST_AUTO_TEST_CASE(multi_blas_dot_strided_context){
	multi::array<double, 2> const CA = {
		{1.,  2.,  3.,  4.},
		{5.,  6.,  7.,  8.},
		{9., 10., 11., 12.}
	};
	double d = std::numeric_limits<double>::quiet_NaN(); // sentinel: must be overwritten by dot_n
	blas::context ctxt;
	blas::dot_n(&ctxt, begin(CA[1]), size(CA[1]), begin(CA[2]), &d);
	// std::inner_product takes (first1, last1, first2, init). The previous argument order
	// (first1, first2, last1) only worked because consecutive rows are adjacent in memory.
	BOOST_REQUIRE( d == std::inner_product(begin(CA[1]), end(CA[1]), begin(CA[2]), 0.) );
	double d2 = blas::dot(CA[1], CA[2]); // context-free call used as a cross-check
	BOOST_REQUIRE( d == d2 );
}
// Dot product of two identical float vectors (1*1 + 2*2 + 3*3 = 14), compared against
// a double literal with the dot expression on either side of ==.
BOOST_AUTO_TEST_CASE(multi_blas_dot_1d_real){
multi::array<float, 1> V = {1., 2., 3.};
multi::array<float, 1> W = {1., 2., 3.};
using blas::dot;
BOOST_REQUIRE( 14. == dot(V, W) );
BOOST_REQUIRE( dot(V, W) == 14. );
}
// Exercises the real-valued blas::dot forms on two rows of a constant 2D array:
// value-returning, out-parameter, and the value returned by the out-parameter call.
// The reference value is std::inner_product over the same two rows.
BOOST_AUTO_TEST_CASE(multi_blas_dot_impl_real){
	multi::array<double, 2> const cA = {
		{1.,  2.,  3.,  4.},
		{5.,  6.,  7.,  8.},
		{9., 10., 11., 12.}
	};
	// Note on the reference expressions below: std::inner_product takes (first1, last1, first2, init).
	// The earlier (first1, first2, last1) order only worked because consecutive rows are adjacent in memory.
	{
		// value-returning form, converted to double on initialisation
		double d = blas::dot(cA[1], cA[2]);
		BOOST_REQUIRE( d==std::inner_product(begin(cA[1]), end(cA[1]), begin(cA[2]), 0.) );
	}
	{
		// out-parameter form; NAN is a sentinel that must be overwritten
		double d = NAN;
		blas::dot(cA[1], cA[2], d);
		BOOST_REQUIRE( d==std::inner_product(begin(cA[1]), end(cA[1]), begin(cA[2]), 0.) );
	}
	{
		// the out-parameter form also returns something comparable to the written value
		double d = NAN;
		auto d2 = blas::dot(cA[1], cA[2], d);
		BOOST_REQUIRE( d==d2 );
	}
//	{
//		multi::array<double, 0> d;
//		auto d2 = blas::dot(cA[1], cA[2], d);
//		BOOST_REQUIRE( d == std::inner_product(begin(cA[1]), end(cA[1]), begin(cA[2]), 0.) );
//	}
	{
		// the real dot product is symmetric in its operands
		double d = blas::dot(cA[1], cA[2]);
		BOOST_REQUIRE( d == std::inner_product(begin(cA[1]), end(cA[1]), begin(cA[2]), 0.) );
		BOOST_REQUIRE( blas::dot(cA[1], cA[2]) == blas::dot(cA[2], cA[1]) );
	}
//	{
//		double s;
//		blas::dot(cA[1], cA[1], s);
//		BOOST_REQUIRE( std::sqrt(s)==blas::nrm2(cA[1]) );
//	}
	{
//		auto d1 = blas::dot(cA[1], cA[1]);
//		auto d2 = blas::dot(blas::conj(cA[1]), cA[1]);
	}
}
// For real-valued vectors, wrapping the first operand in hermitized(), conj() or C()
// must give the same dot product as the plain call.
BOOST_AUTO_TEST_CASE(inq_case){
multi::array<double, 1> v1(10, +1.0);
multi::array<double, 1> v2(10, -1.0);
using blas::dot;
using blas::hermitized;
using blas::conj;
auto a = dot(v1, v2); // plain call, used as the reference for the comparisons below
auto b = dot(hermitized(v1), v2);
BOOST_REQUIRE(a == b);
auto c = dot(blas::conj(v1), v2); // conjugation doesn't do anything for real array
BOOST_REQUIRE(c == a);
auto d_arr = dot(blas::C(v1), v2);
BOOST_REQUIRE(d_arr == a);
static_assert( not std::is_same<decltype(d_arr), double>{}, "!" ); // with `auto`, the stored result is not a plain double
using blas::C;
double d_doub = dot(C(v1), v2); // the same expression converts to double when asked to
BOOST_REQUIRE( d_doub == d_arr );
}
#if 1
// Complex dot products between rows 1 and 2 of a constant matrix: plain (unconjugated),
// with the second operand wrapped in blas::C, and with the first operand wrapped in
// blas::C or blas::conj. Each result is compared against the matching std::inner_product.
BOOST_AUTO_TEST_CASE(multi_blas_dot_impl_complex){
	namespace blas = multi::blas;
	using complex = std::complex<double>; complex const I{0, 1};
	multi::array<complex, 2> const A = {
		{1. +    I,  2. + 3.*I,  3.+2.*I,  4.-9.*I},
		{5. + 2.*I,  6. + 6.*I,  7.+2.*I,  8.-3.*I},
		{9. + 1.*I, 10. + 9.*I, 11.+1.*I, 12.+2.*I}
	};
	{
		// out-parameter form, no conjugation
		complex c; blas::dot(A[1], A[2], c);
		BOOST_TEST_REQUIRE( c == std::inner_product(begin(A[1]), end(A[1]), begin(A[2]), complex{0}) );
	}
	{
		// value-returning form, no conjugation
		complex c = blas::dot(A[1], A[2]);
		BOOST_TEST_REQUIRE( c == std::inner_product(begin(A[1]), end(A[1]), begin(A[2]), complex{0}) );
	}
	{
		// second operand conjugated
		complex c = blas::dot(A[1], blas::C(A[2]));
		BOOST_TEST_REQUIRE( c == std::inner_product(begin(A[1]), end(A[1]), begin(A[2]), complex{0}, std::plus<>{}, [](auto a, auto b){return a*conj(b);}) );
	}
	{
		// first operand conjugated via blas::C; std:: is spelled out instead of relying on ADL
		complex c = blas::dot(blas::C(A[1]), A[2]);
		BOOST_TEST_REQUIRE( c == std::inner_product(begin(A[1]), end(A[1]), begin(A[2]), complex{}, std::plus<>{}, [](auto a, auto b){return conj(a)*b;}) );
	}
	{
		// first operand conjugated via blas::conj; std:: is spelled out instead of relying on ADL
		complex c = blas::dot(blas::conj(A[1]), A[2]);
		BOOST_TEST_REQUIRE( c == std::inner_product(begin(A[1]), end(A[1]), begin(A[2]), complex{}, std::plus<>{}, [](auto a, auto b){return conj(a)*b;}) );
	}
//	{
//		complex c = blas::dot(blas::C(A[1]), blas::C(A[2]));
//		BOOST_TEST_REQUIRE( c == inner_product(begin(A[1]), end(A[1]), begin(A[2]), complex{}, std::plus<>{}, [](auto a, auto b){return conj(a)*conj(b);}) );
//	}
	{
		// same check as the blas::C case above, with complex{0} as the accumulator
		complex c = blas::dot(blas::C(A[1]), A[2]);
		BOOST_TEST_REQUIRE( c == std::inner_product(begin(A[1]), end(A[1]), begin(A[2]), complex{0}, std::plus<>{}, [](auto a, auto b){return conj(a)*b;}) );
	}
//	{
//		complex c = blas::dot(blas::C(A[1]), blas::C(A[2]));
//		BOOST_TEST_REQUIRE( c == std::inner_product(begin(A[1]), end(A[1]), begin(A[2]), complex{0}, std::plus<>{}, [](auto a, auto b){return conj(a)*conj(b);}) );
//	}
}
#include "config.hpp" // cuda found
#if defined(CUDA_FOUND) and CUDA_FOUND
#include<thrust/complex.h>
// Walks the complex dot product through several entry points (blas::core::dotu, blas::context::dotu,
// dot_n, and the out-parameter / value-returning forms of blas::dot), comparing each against std::inner_product.
// Only compiled when CUDA_FOUND is defined and non-zero.
// NOTE(review): despite the "_thrust" suffix, `complex` below aliases std::complex<double>, not
// thrust::complex<double> — confirm whether thrust::complex was intended.
BOOST_AUTO_TEST_CASE(multi_blas_dot_impl_complex_thrust){
namespace blas = multi::blas;
using complex = std::complex<double>; complex const I{0, 1};
multi::array<complex, 2> const A = {
{1. + I, 2. + 3.*I, 3.+2.*I, 4.-9.*I},
{5. + 2.*I, 6. + 6.*I, 7.+2.*I, 8.-3.*I},
{9. + 1.*I, 10. + 9.*I, 11.+1.*I, 12.+2.*I}
};
{
// lowest level: size, base pointer and stride of each row, plus a result pointer
complex c;
blas::core::dotu(size(A[1]), A[1].base(), A[1].stride(), A[2].base(), A[2].stride(), &c);
auto inner = std::inner_product(begin(A[1]), end(A[1]), begin(A[2]), complex{0.});
BOOST_REQUIRE( c.real() == inner.real() );
BOOST_REQUIRE( c.imag() == inner.imag() );
}
{
// same arguments, called through blas::context without an object
complex c;
blas::context::dotu(size(A[1]), A[1].base(), A[1].stride(), A[2].base(), A[2].stride(), &c);
auto inner = std::inner_product(begin(A[1]), end(A[1]), begin(A[2]), complex{0.});
BOOST_REQUIRE( c.real() == inner.real() );
BOOST_REQUIRE( c.imag() == inner.imag() );
}
{
// iterator/count interface
complex c;
blas::dot_n(begin(A[1]), size(A[1]), begin(A[2]), &c);
auto inner = std::inner_product(begin(A[1]), end(A[1]), begin(A[2]), complex{0.});
BOOST_REQUIRE( c == inner );
}
{
// out-parameter form
complex c;
blas::dot(A[1], A[2], c);
auto inner = std::inner_product(begin(A[1]), end(A[1]), begin(A[2]), complex{0.});
BOOST_REQUIRE( c == inner );
}
{
// value-returning form converted to complex
complex c = blas::dot(A[1], A[2]);
auto inner = std::inner_product(begin(A[1]), end(A[1]), begin(A[2]), complex{0.});
BOOST_REQUIRE( c == inner );
}
{
// unary + applied to the returned object before comparing
auto inner = std::inner_product(begin(A[1]), end(A[1]), begin(A[2]), complex{0.});
BOOST_REQUIRE( +blas::dot(A[1], A[2]) == inner );
}
{
complex c; blas::dot(A[1], A[2], c);
BOOST_REQUIRE( c == std::inner_product(begin(A[1]), end(A[1]), begin(A[2]), complex{0}) );
}
{
complex c = blas::dot(A[1], A[2]);
BOOST_REQUIRE( c == std::inner_product(begin(A[1]), end(A[1]), begin(A[2]), complex{0}) );
}
{
// second operand conjugated; reference is the sum of a*conj(b)
complex c = blas::dot(A[1], blas::C(A[2]));
BOOST_REQUIRE( c == std::inner_product(begin(A[1]), end(A[1]), begin(A[2]), complex{0}, std::plus<>{}, [](auto a, auto b){return a*conj(b);}) );
}
}
#endif
#endif
// Placeholder test case: every statement in the body is currently commented out, so it
// registers with the test runner but performs no checks. The disabled code covers
// zero-dimensional results, .decay(), and cuda / cuda::managed arrays.
BOOST_AUTO_TEST_CASE(blas_dot){
// multi::array<float, 1> const A = {1.,2.,3.};
// multi::array<float, 1> const B = {1.,2.,3.};
// {
// float f = blas::dot(A, B); // uses cast operator decay
// BOOST_REQUIRE( f == std::inner_product(begin(A), end(A), begin(B), 0.f) );
// }
// {
// float f2;
// *multi::array_ptr<float, 0>(&f2, {}) = blas::dot(A, B); // uses custom copy
// BOOST_REQUIRE( f2 == std::inner_product(begin(A), end(A), begin(B), 0.f) );
// }
// {
// multi::array<float, 0> F = blas::dot(A, B);
// BOOST_REQUIRE( F() == std::inner_product(begin(A), end(A), begin(B), 0.f) );
// }
// using complex = std::complex<double>; complex const I{0, 1};
// {
// multi::array<complex, 1> const A = {I, 2.*I, 3.*I};
// BOOST_TEST( blas::dot(A, A).decay() == std::inner_product(begin(A), end(A), begin(A), complex{0.}) );
// }
// {
// multi::array<complex, 1> const A = {I, 1. + 2.*I, 3.*I};
// multi::array<complex, 1> const B = {I, 1. + 2.*I, 3.*I};
// BOOST_TEST( blas::dot(A, B).decay() == std::inner_product(begin(A), end(A), begin(B), complex{0.}, std::plus<>{}, [](auto&& a, auto&& b){return a*b;}) );
//// BOOST_REQUIRE(
//// std::inner_product(begin(A), end(A), begin( B ), std::complex<double>{0.}, std::plus<>{}, [](auto&& a, auto&& b){return a*std::conj(b);})
//// ==s
//// std::inner_product(begin(A), end(A), begin(blas::C(B)), std::complex<double>{0.}, std::plus<>{}, [](auto&& a, auto&& b){return a*b;})
//// );
// BOOST_REQUIRE( blas::dot(A, blas::C(B)).decay() == std::inner_product(begin(A), end(A), begin(B), complex{0.}, std::plus<>{}, [](auto&& a, auto&& b){return a*std::conj(b);}) );
// }
// {
// multi::array<complex, 1> const a = {1. + I, 2. + 3.*I, 3. + 2.*I, 4. - 9.*I};
// multi::array<complex, 1> const b = {5. + 2.*I, 6. + 6.*I, 7. + 2.*I, 8. - 3.*I};
// {
// multi::array<complex, 0> c({}, complex{});
// blas::dot(a, b, c);
// BOOST_TEST( c() == 19. - 27.*I );
// }
// }
// {
// cuda::array<complex, 1> const acu = {1. + I, 2. + 3.*I, 3. + 2.*I, 4. - 9.*I};
// cuda::array<complex, 1> const bcu = {5. + 2.*I, 6. + 6.*I, 7. + 2.*I, 8. - 3.*I};
// {
// cuda::array<complex, 0> ccu;
// blas::dot(acu, bcu, ccu);
// BOOST_REQUIRE( ccu() == 19. - 27.*I );
// }
// BOOST_REQUIRE( blas::C(bcu)[1] == 6. - 6.*I );
// {
// cuda::array<complex, 0> ccu;
// static_assert( multi::blas::is_complex_array<multi::array<complex, 1>>{}, "!" );
// static_assert( multi::blas::is_complex_array<cuda::array<complex, 1>>{}, "!" );
// blas::dot(acu, blas::C(bcu), ccu);
// BOOST_REQUIRE( ccu() == 121. - 43.*I );
// }
// {
// auto const ccu = blas::dot(acu, blas::C(bcu));
// BOOST_REQUIRE( ccu() == 121. - 43.*I );
// }
// {
// cuda::array<complex, 1> ccu = {1, 2, 3};
// blas::dot(acu, blas::C(bcu), ccu[0]);
// BOOST_REQUIRE( ccu[0] == 121. - 43.*I );
// }
// {
// cuda::array<complex, 2> ccu({1, 1});
// blas::dot(acu, blas::C(bcu), ccu[0][0]);
// BOOST_REQUIRE( ccu[0][0] == 121. - 43.*I );
// }
// }
// {
// namespace cuda = multi::cuda;
// cuda::managed::array<complex, 1> const amcu = {1. + I, 2. + 3.*I, 3. + 2.*I, 4. - 9.*I};
// cuda::managed::array<complex, 1> const bmcu = {5. + 2.*I, 6. + 6.*I, 7. + 2.*I, 8. - 3.*I};
// {
// cuda::managed::array<complex, 0> cmcu;
// blas::dot(amcu, bmcu, cmcu);
// BOOST_REQUIRE( cmcu() == 19.- I*27. );
// }
// {
// cuda::array<complex, 1> cmcu = {1, 2, 3};
// blas::dot(amcu, blas::C(bmcu), cmcu[0]);
// BOOST_REQUIRE( cmcu[0] == complex(121., -43.) );
// }
// }
// {
// using complex = std::complex<double>; complex const I{0, 1};
// cuda::array<complex, 1> const acu = {1. + I, 2. + 3.*I, 3. + 2.*I, 4. - 9.*I};
// cuda::array<complex, 1> const bcu = {5. + 2.*I, 6. + 6.*I, 7. + 2.*I, 8. - 3.*I};
// {
// cuda::array<complex, 0> ccu;
// blas::dot(acu, bcu, ccu);
// BOOST_REQUIRE( ccu() == 19. - 27.*I );
// }
// }
// {
// using complex = thrust::complex<double>; complex const I{0, 1};
// cuda::managed::array<complex, 1> const acu = {1. + I, 2. + 3.*I, 3. + 2.*I, 4. - 9.*I};
// cuda::managed::array<complex, 1> const bcu = {5. + 2.*I, 6. + 6.*I, 7. + 2.*I, 8. - 3.*I};
// {
// cuda::managed::array<complex, 0> ccu;
// blas::dot(acu, bcu, ccu);
// BOOST_REQUIRE( ccu() == 19. - 27.*I );
// }
// }
}

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,277 @@
// -*-indent-tabs-mode:t;c-basic-offset:4;tab-width:4;-*-
// © Alfredo A. Correa 2020
#define BOOST_TEST_MODULE "C++ Unit Tests for Multi BLAS gemv"
#define BOOST_TEST_DYN_LINK
#include<boost/test/unit_test.hpp>
#include "config.hpp"
#include "../../../adaptors/blas/gemv.hpp"
#include "../../../array.hpp"
#include "../../../array.hpp"
#include "../../../utility.hpp"
#include "../../blas/axpy.hpp"
#include "../../blas/dot.hpp"
#include "../../blas/gemm.hpp"
#include "../../blas/nrm2.hpp"
#include<random>
namespace multi = boost::multi;
namespace blas = multi::blas;
// Deleted function template: any call `what(expr)` is ill-formed, so the compiler diagnostic
// names the deduced T. Not called anywhere in the visible part of this file.
template<class T> void what(T&&) = delete;
// Reference matrix-vector product used to cross-check gemv.
// For each row of `m`, in order, writes the inner product of that row with `x`
// (accumulated starting from the double literal 0.) into the next element of `y`.
// `y` must already provide at least as many elements as `m` has rows.
template<class M, class VI, class VO>
void MV(M const& m, VI const& x, VO&& y){
	auto out = begin(y);
	for(auto&& row : m){
		*out = std::inner_product(begin(row), end(row), begin(x), 0.);
		++out;
	}
}
// Real matrix-vector product through the various gemv entry points: gemv_n with iterators,
// the lazy gemv(a, M, v) expression (copied, assigned, constructed from, and accumulated with +=),
// and the five-argument in-place form. The expected value 91.3 is row 1 of M dotted with v.
BOOST_AUTO_TEST_CASE(multi_blas_gemv){//, *utf::tolerance(0.0001)){
multi::array<double, 2> const M = {
{ 9., 24., 30., 9.},
{ 4., 10., 12., 7.},
{14., 16., 36., 1.}
};
multi::array<double, 1> const v = {1.1, 2.1, 3.1, 4.1};
{
// iterator/count interface: w = 1*M*v + 0*w
multi::array<double, 1> w(size(M));
blas::gemv_n(1., begin(M), size(M), begin(v), 0., begin(w));
BOOST_REQUIRE_CLOSE( w[1] , 91.3 , 0.0001 );
BOOST_REQUIRE_CLOSE( w[2] , +blas::dot(M[2], v) , 0.0001 );
}
{
// same product, reading M through the ~ view of an array built from ~M
multi::array<double, 1> w(size(M));
multi::array<double, 2> const MT = ~M;
blas::gemv_n(1., begin(~MT), size(~MT), begin(v), 0., begin(w));
BOOST_REQUIRE_CLOSE( w[1] , 91.3 , 0.0001 );
BOOST_REQUIRE_CLOSE( w[2] , +blas::dot(M[2], v), 0.0001 );
}
{
// the gemv expression is copied element-wise through its begin()/size()
multi::array<double, 1> w(size(M));
auto mv = blas::gemv(1., M, v);
copy_n(mv.begin(), mv.size(), w.begin());
BOOST_REQUIRE_CLOSE( w[1] , 91.3 , 0.00001 );
multi::array<double, 1> w2(size(M));
MV(M, v, w2); // hand-written reference product
BOOST_REQUIRE_CLOSE( w2[0] , w[0], 0.00001 );
}
{
// assignment from the gemv expression
multi::array<double, 1> w(size(M));
w = blas::gemv(1., M, v);
BOOST_REQUIRE_CLOSE( w[1] , 91.3 , 0.00001 );
}
{
// construction from the gemv expression
multi::array<double, 1> w = blas::gemv(1., M, v);
BOOST_REQUIRE_CLOSE( w[1] , 91.3 , 0.00001 );
}
{
// accumulation into a zero-initialised vector
multi::array<double, 1> w(size(M), 0.);
w += blas::gemv(1., M, v);
BOOST_REQUIRE_CLOSE( w[1] , 91.3 , 0.00001 );
}
{
// expected w[1] = 1.1*91.3 + 1.*5. = 105.43
multi::array<double, 1> w = {4., 5., 6.};
blas::gemv(1.1, M, v, 1., w); // y = a*M*x + b*y
BOOST_REQUIRE_CLOSE( w[1] , 105.43 , 0.00001 );
}
}
// Further real gemv checks: the in-place y = a*M*x + b*y form against precomputed values,
// row-by-row agreement with blas::dot, a 1-row matrix acting as a dot product, and
// operator-based expressions from blas::operators.
BOOST_AUTO_TEST_CASE(multi_blas_gemv_real){//, *utf::tolerance(0.0001)){
namespace blas = multi::blas;
using std::abs;
multi::array<double, 2> const M = {
{ 9., 24., 30., 9.},
{ 4., 10., 12., 7.},
{14., 16., 36., 1.}
};
multi::array<double, 1> const X = {1.1, 2.1, 3.1, 4.1};
{
multi::array<double, 1> Y = {4.,5.,6.};
double const a = 1.1;
double const b = 1.2;
blas::gemv(a, M, X, b, Y); // y = a*M*x + b*y
multi::array<double, 1> const Y3 = {214.02, 106.43, 188.37}; // precomputed expected result
BOOST_REQUIRE( abs(Y[1] - Y3[1]) < 2e-14 ); // only element 1 is compared
}
{
// each output element must match the dot product of the corresponding row with X
auto Y = +blas::gemv(1., M, X);
BOOST_REQUIRE_CLOSE( Y[0] , +blas::dot(M[0], X) , 0.00001 );
BOOST_REQUIRE_CLOSE( Y[1] , +blas::dot(M[1], X) , 0.00001 );
BOOST_REQUIRE_CLOSE( Y[2] , +blas::dot(M[2], X) , 0.00001 );
}
{
// a matrix with the single row `a` times `b` yields a one-element vector holding dot(a, b)
multi::array<double, 1> const a = {1., 2., 3.};
multi::array<double, 1> const b = {4., 5., 6.};
multi::array<double, 1> const dot = blas::gemv(1., multi::array<double, 2>({a}), b);
BOOST_REQUIRE( dot[0] == blas::dot(a, b) );
}
{
// M%X must agree with the same product taken on (~+~M), to within 1e-13 after `^2`
// (presumably % is matrix-vector product and ^2 a squared norm — confirm in blas::operators)
using blas::operators::operator%;
using blas::operators::operator-;
using blas::operators::operator^;
BOOST_REQUIRE_SMALL( ((~+~M)%X - M%X)^2 , 1e-13 );
}
}
// In-place gemv with complex-typed arrays holding real values and real (double) scalars a and b.
// The result is compared to precomputed values through the 2-norm of the difference.
BOOST_AUTO_TEST_CASE(multi_blas_gemv_real_complex){
namespace blas = multi::blas;
using complex = std::complex<double>; //#define I *std::complex<double>(0, 1)
using std::abs;
multi::array<complex, 2> const M = {
{ 9., 24., 30., 9.},
{ 4., 10., 12., 7.},
{14., 16., 36., 1.}
};
multi::array<complex, 1> const X = {1.1, 2.1, 3.1, 4.1};
{
multi::array<complex, 1> Y = {4., 5., 6.};
double const a = 1.1;
double const b = 1.2;
blas::gemv(a, M, X, b, Y); // y = a*M*x + b*y
multi::array<complex, 1> const Y3 = {214.02, 106.43, 188.37}; // precomputed expected result
using blas::operators::operator-;
double const n2 = blas::nrm2(Y - Y3); // norm of the element-wise difference
BOOST_REQUIRE_SMALL( n2 , 1e-13);
}
}
#if CUDA_FOUND
#include<thrust/complex.h>
// thrust::complex<double> variant of the in-place gemv test.
// Only compiled when the surrounding `#if CUDA_FOUND` is true.
BOOST_AUTO_TEST_CASE(multi_blas_gemv_real_complex_thrust){
namespace blas = multi::blas;
using complex = thrust::complex<double>; //#define I *std::complex<double>(0, 1)
using std::abs;
multi::array<complex, 2> const M = {
{ 9., 24., 30., 9.},
{ 4., 10., 12., 7.},
{14., 16., 36., 1.}
};
multi::array<complex, 1> const X = {1.1, 2.1, 3.1, 4.1};
{
multi::array<complex, 1> Y = {4., 5., 6.};
double const a = 1.1;
double const b = 1.2;
blas::gemv(a, M, X, b, Y); // y = a*M*x + b*y
// NOTE(review): Y3 is never compared with Y, so this scope only checks that the call compiles and runs.
multi::array<complex, 1> const Y3 = {214.02, 106.43, 188.37};
}
{
multi::array<complex, 1> Y = {4., 5., 6.};
blas::gemv(1.1, M, X, 1., Y); // y = a*M*x + b*y
// NOTE(review): exact equality against a decimal literal; the std::complex tests use tolerances instead.
BOOST_REQUIRE( Y[1] == 105.43 );
}
}
#endif
// gemv on a genuinely complex 3x3 matrix: M*X and (~M)*X are compared against precomputed
// vectors, and products taken through ~ views are compared with products on materialised copies.
BOOST_AUTO_TEST_CASE(multi_blas_gemv_complex){
namespace blas = multi::blas;
using complex = std::complex<double>; std::complex<double> const I{0, 1};
using std::abs;
multi::array<complex, 2> const M = {{2. + 3.*I, 2. + 1.*I, 1. + 2.*I}, {4. + 2.*I, 2. + 4.*I, 3. + 1.*I},
{7. + 1.*I, 1. + 5.*I, 0. + 3.*I}};
multi::array<complex, 1> const X = {1. + 2.*I, 2. + 1.*I, 9. + 2.*I};
BOOST_REQUIRE(( +blas::gemv(1., M, X) == multi::array<complex, 1>{4. + 31.*I, 25. + 35.*I, -4. + 53.*I} ));
auto MT = +~M; // materialised copy of the ~M view
BOOST_REQUIRE(( +blas::gemv(1., ~MT, X) == multi::array<complex, 1>{4. + 31.*I, 25. + 35.*I, -4. + 53.*I} )); // ~MT must act like M
// auto MH = +*~M;
BOOST_REQUIRE( +blas::gemv(1., ~M, X) == (multi::array<complex, 1>{63. + 38.*I, -1. + 62.*I, -4. + 36.*I}) );
BOOST_REQUIRE( +blas::gemv(1., ~M, X) == +blas::gemv(1., MT, X) );// == multi::array<complex, 1>{4. + 31.*I, 25. + 35.*I, -4. + 53.*I} ));
// BOOST_REQUIRE( +blas::gemv(1., *M, X) == (multi::array<complex, 1>{26. - 15.*I, 45. - 3.*I, 22. - 23.*I}) );
// BOOST_REQUIRE( +blas::gemv(1., ~*M, X) == (multi::array<complex, 1>{83. + 6.*I, 31. - 46.*I, 18. - 26.*I}) ); // not supported by blas
}
// Multiplies a 3x3 identity matrix by a randomly filled 3x3 matrix through blas::operators
// and checks, row by row, that (A*B)[i] - B[i] gives exactly 0 under `^2`.
BOOST_AUTO_TEST_CASE(multi_blas_gemv_temporary){
using complex = std::complex<double>;
multi::array<complex, 2> const A = {
{1., 0., 0.},
{0., 1., 0.},
{0., 0., 1.}
};
// B is produced by an immediately invoked lambda that fills a 3x3 array with
// normally distributed complex numbers from a generator seeded with 1 (so the data is reproducible).
auto const B = [](auto _){
auto rand = [d=std::normal_distribution<>{}, g=std::mt19937{1}]()mutable{return complex{d(g), d(g)};}; // NOLINT(cert-msc32-c,cert-msc51-cpp): test purposes
std::generate(_.elements().begin(), _.elements().end(), rand);
return _;
}(multi::array<complex, 2>({3, 3}));
using blas::operators::operator*;
using blas::operators::operator-;
using blas::operators::operator^;
BOOST_REQUIRE( (((A*B)[0] - B[0])^2) == 0. );
BOOST_REQUIRE( (((A*B)[1] - B[1])^2) == 0. );
BOOST_REQUIRE( (((A*B)[2] - B[2])^2) == 0. );
}
// Context-taking counterparts of the real gemv checks: every gemv_n / gemv call receives an
// explicit blas::context object as its first argument. The expected value 91.3 is row 1 of M dotted with v.
BOOST_AUTO_TEST_CASE(multi_blas_gemv_context){//, *utf::tolerance(0.0001)){
multi::array<double, 2> const M = {
{ 9., 24., 30., 9.},
{ 4., 10., 12., 7.},
{14., 16., 36., 1.}
};
multi::array<double, 1> const v = {1.1, 2.1, 3.1, 4.1};
blas::context ctxt;
{
// iterator/count interface: w = 1*M*v + 0*w
multi::array<double, 1> w(size(M));
blas::gemv_n(ctxt, 1., begin(M), size(M), begin(v), 0., begin(w));
BOOST_REQUIRE_CLOSE( w[1] , 91.3 , 0.0001 );
BOOST_REQUIRE_CLOSE( w[2] , +blas::dot(M[2], v) , 0.0001 );
}
{
// same product, reading M through the ~ view of an array built from ~M
multi::array<double, 1> w(size(M));
multi::array<double, 2> const MT = ~M;
blas::gemv_n(ctxt, 1., begin(~MT), size(~MT), begin(v), 0., begin(w));
BOOST_REQUIRE_CLOSE( w[1] , 91.3 , 0.00001 );
BOOST_REQUIRE_CLOSE( w[2] , +blas::dot(M[2], v) , 0.00001 );
}
{
// the gemv expression is bound by forwarding reference and copied element-wise
multi::array<double, 1> w(size(M));
auto&& mv = blas::gemv(ctxt, 1., M, v);
copy_n(mv.begin(), mv.size(), w.begin());
BOOST_REQUIRE_CLOSE( w[1] , 91.3 , 0.00001 );
}
{
// assignment from the gemv expression
multi::array<double, 1> w(size(M));
w = blas::gemv(ctxt, 1., M, v);
BOOST_REQUIRE_CLOSE( w[1] , 91.3 , 0.00001 );
}
{
// construction from the gemv expression
multi::array<double, 1> w = blas::gemv(ctxt, 1., M, v);
BOOST_REQUIRE_CLOSE( w[1] , 91.3 , 0.00001 );
}
{
// accumulation into a zero-initialised vector
multi::array<double, 1> w(size(M), 0.);
w += blas::gemv(ctxt, 1., M, v);
BOOST_REQUIRE_CLOSE( w[1] , 91.3, 0.00001 );
}
{
// accumulation into a non-zero vector: expected w[1] = 5. + 1.1*91.3 = 105.43
multi::array<double, 1> w = {4., 5., 6.};
w += blas::gemv(ctxt, 1.1, M, v);
BOOST_REQUIRE_CLOSE( w[1] , 105.43, 0.00001 );
}
}

View File

@ -0,0 +1,272 @@
#ifdef COMPILATION// -*-indent-tabs-mode:t;c-basic-offset:4;tab-width:4;-*-
$CXX $0 -o $0x -lcudart -lcublas -lboost_unit_test_framework `pkg-config --libs blas`&&$0x&&rm $0x;exit
#endif
// © Alfredo A. Correa 2019-2020
#define BOOST_TEST_MODULE "C++ Unit Tests for Multi BLAS herk"
#define BOOST_TEST_DYN_LINK
#include<boost/test/unit_test.hpp>
//#include "../../../adaptors/cuda.hpp" // multi::cuda ns
//#include "../../../adaptors/blas/cuda.hpp"
#include "../../../adaptors/blas/gemm.hpp"
#include "../../../adaptors/blas/herk.hpp"
#include "../../../array.hpp"
namespace multi = boost::multi;
//namespace cuda = multi::cuda;
// herk on a 2x3 complex matrix `a`: the in-place (a, c) form, the value-returning (1., a) form,
// and agreement with the explicit product gemm(1., a, H(a)).
BOOST_AUTO_TEST_CASE(multi_blas_herk){
namespace blas = multi::blas;
using complex = std::complex<double>; constexpr complex I{0, 1};
multi::array<complex, 2> const a = {
{ 1. + 3.*I, 3.- 2.*I, 4.+ 1.*I},
{ 9. + 1.*I, 7.- 8.*I, 1.- 3.*I}
};
{
multi::array<complex, 2> c({2, 2}, 9999.); // 2x2 result pre-filled with a sentinel value
blas::herk(a, c);
BOOST_REQUIRE( c[1][0] == complex(50., -49.) );
BOOST_REQUIRE( c[0][1] == complex(50., +49.) ); // off-diagonal elements are complex conjugates of each other
multi::array<complex, 2> const c_copy = blas::herk(1., a);
BOOST_REQUIRE( c == c_copy );
BOOST_REQUIRE( +blas::gemm(1., a, blas::H(a)) == blas::herk(a) );
}
}
//BOOST_AUTO_TEST_CASE(multi_blas_cuda_herk_complex){
// namespace blas = multi::blas;
// multi::array<complex, 2> const a = {
// { 1. + 3.*I, 3.- 2.*I, 4.+ 1.*I},
// { 9. + 1.*I, 7.- 8.*I, 1.- 3.*I}
// };
// {
// cuda::array<complex, 2> const acu = a;
// BOOST_REQUIRE(a == acu);
// cuda::array<complex, 2> ccu({2, 2}, 9999.);
// blas::herk(acu, ccu);
// BOOST_REQUIRE( ccu[1][0] == complex(50., -49.) );
// BOOST_REQUIRE( ccu[0][1] == complex(50., +49.) );
// cuda::array<complex, 2> const ccu_copy = blas::herk(1., acu);
// BOOST_REQUIRE( blas::herk(1., acu) == ccu );
// }
// {
// cuda::managed::array<complex, 2> const amcu = a; BOOST_REQUIRE(a == amcu);
// cuda::managed::array<complex, 2> cmcu({2, 2}, 9999.);
// blas::herk(1., amcu, cmcu);
// BOOST_REQUIRE( cmcu[1][0] == complex(50., -49.) );
// BOOST_REQUIRE( cmcu[0][1] == complex(50., +49.) );
// cuda::managed::array<complex, 2> const cmcu_copy = blas::herk(1., amcu);
// BOOST_REQUIRE( cmcu_copy == cmcu );
// }
// {
// multi::array<complex, 2> c({3, 3}, 9999.);
// blas::herk(1., blas::H(a), c);
// BOOST_REQUIRE( c[2][1] == complex(41, +2) );
// BOOST_REQUIRE( c[1][2] == complex(41, -2) );
// multi::array<complex, 2> const c_copy = blas::herk(1., blas::H(a));
// BOOST_REQUIRE( c_copy == c );
// }
// {
// cuda::array<complex, 2> const acu = a;
// BOOST_REQUIRE(a == acu);
// cuda::array<complex, 2> ccu({3, 3}, 9999.);
// blas::herk(1., blas::H(acu), ccu);
// BOOST_REQUIRE( ccu[2][1] == complex(41, +2) );
// BOOST_REQUIRE( ccu[1][2] == complex(41, -2) );
// cuda::array<complex, 2> const ccu_copy = blas::herk(1., blas::H(acu));
// BOOST_REQUIRE( ccu_copy == ccu );
// }
// {
// cuda::managed::array<complex, 2> const acu = a; BOOST_REQUIRE(a == acu);
// cuda::managed::array<complex, 2> ccu({3, 3}, 9999.);
// blas::herk(1., blas::H(acu), ccu);
// BOOST_REQUIRE( ccu[2][1] == complex(41, +2) );
// BOOST_REQUIRE( ccu[1][2] == complex(41, -2) );
// cuda::managed::array<complex, 2> const ccu_copy = blas::herk(1., blas::H(acu));
// BOOST_REQUIRE( ccu_copy == ccu );
// }
//}
//BOOST_AUTO_TEST_CASE(multi_blas_cuda_herk_n_complex){
// namespace blas = multi::blas;
// multi::array<complex, 2> const a = {
// { 1. + 3.*I, 3.- 2.*I, 4.+ 1.*I},
// { 9. + 1.*I, 7.- 8.*I, 1.- 3.*I}
// };
// blas::context ctxt;
// {
// multi::array<complex, 2> c({2, 2}, 9999.);
// blas::herk_n(ctxt, blas::filling::upper, 1., a.begin(), a.size(), 0., c.begin());
// BOOST_TEST_REQUIRE( c[0][1] == complex(50., +49.) );
// BOOST_TEST_REQUIRE( c[1][0] == 9999. );
// }
// {
// multi::array<complex, 2> c({2, 2}, 9999.);
// blas::herk_n(ctxt, blas::filling::lower, 1., a.begin(), a.size(), 0., c.begin());
// BOOST_TEST_REQUIRE( c[0][1] == 9999. );
// BOOST_TEST_REQUIRE( c[1][0] == complex(50., -49.) );
// }
// {
// multi::array<complex, 2> c({2, 2}, 9999.);
// blas::herk_n(ctxt, blas::filling::lower, 1., a.begin(), a.size(), 0., c.begin());
// blas::herk_n(ctxt, blas::filling::upper, 1., a.begin(), a.size(), 0., c.begin());
// BOOST_TEST_REQUIRE( c[0][1] == complex(50., +49.) );
// BOOST_TEST_REQUIRE( c[1][0] == complex(50., -49.) );
// }
// {
// multi::array<complex, 2> c({3, 3}, 9999.);
// blas::herk_n(ctxt, blas::filling::lower, 1., blas::H(a).begin(), blas::H(a).size(), 0., c.begin());
// BOOST_TEST_REQUIRE( c[1][2] == 9999. );
// BOOST_TEST_REQUIRE( c[2][1] == complex(41., +2.) );
// }
// {
// multi::array<complex, 2> c({3, 3}, 9999.);
// blas::herk_n(ctxt, blas::filling::upper, 1., blas::H(a).begin(), blas::H(a).size(), 0., c.begin());
// BOOST_TEST_REQUIRE( c[1][2] == complex(41., -2.) );
// BOOST_TEST_REQUIRE( c[2][1] == 9999. );
// }
// {
// multi::array<complex, 2> c({3, 3}, 9999.);
// blas::herk_n(ctxt, blas::filling::lower, 1., blas::H(a).begin(), blas::H(a).size(), 0., c.begin());
// blas::herk_n(ctxt, blas::filling::upper, 1., blas::H(a).begin(), blas::H(a).size(), 0., c.begin());
// BOOST_TEST_REQUIRE( c[1][2] == complex(41., -2.) );
// BOOST_TEST_REQUIRE( c[2][1] == complex(41., +2.) );
// }
// {
// multi::array<complex, 2> c({3, 3}, 9999.);
// blas::herk_n(ctxt, 1., blas::H(a).begin(), blas::H(a).size(), c.begin());
// BOOST_TEST_REQUIRE( c[1][2] == complex(41., -2.) );
// BOOST_TEST_REQUIRE( c[2][1] == complex(41., +2.) );
// }
//}
//BOOST_AUTO_TEST_CASE(multi_blas_cuda_herk_row){
// namespace blas = multi::blas;
// auto const a = []{
// multi::array<complex, 2> ret({1, 100});
// std::generate(begin(ret[0]), end(ret[0]), [c=complex{1, 2}]()mutable{return c+=2.;});
// return ret;
// }();
// BOOST_REQUIRE( size(a) == 1 );
// {
// BOOST_REQUIRE( +blas::gemm(1., a, blas::H(a)) == blas::herk(a) );
// cuda::array<complex, 2> const agpu = a;
// BOOST_REQUIRE( blas::gemm(agpu, blas::H(agpu)) == blas::herk(agpu) );
// cuda::managed::array<complex, 2> const amng = a;
// BOOST_REQUIRE( blas::gemm(amng, blas::H(amng)) == blas::herk(amng) );
// }
//}
//#if 1
//BOOST_AUTO_TEST_CASE(multi_blas_cuda_herk_real){
// namespace blas = multi::blas;
// multi::array<double, 2> const a = {
// { 1., 3., 4.},
// { 9., 7., 1.}
// };
// {
// multi::array<double, 2> c({2, 2}, 9999);
// blas::herk(1., a, c);
// BOOST_REQUIRE( c[1][0] == 34 );
// BOOST_REQUIRE( c[0][1] == 34 );
// // multi::array<double, 2> const c_copy = blas::herk(1., a);
// // BOOST_REQUIRE( c == c_copy );
// }
// {
// cuda::array<double, 2> acu = a;
// BOOST_REQUIRE(a == acu);
// cuda::array<double, 2> ccu({2, 2}, 9999.);
// // blas::herk(acu, ccu);
// // BOOST_REQUIRE( ccu[1][0] == 34 );
// // BOOST_REQUIRE( ccu[0][1] == 34 );
// // cuda::array<double, 2> const ccu_copy = blas::herk(1., acu);
// // BOOST_REQUIRE( herk(1., acu) == ccu );
// }
//}
//#endif
// Disabled fragment: nothing between `#if 0` and `#endif` is compiled.
// It uses `a`, which is not declared inside the fragment, and it ends with an unmatched `}`,
// so it reads as the tail of a test case whose opening has been commented out above.
#if 0
{
// herk on a device (cuda::array) copy of `a`, result returned by value
cuda::array<double, 2> const acu = a; BOOST_REQUIRE(a == acu);
// cuda::array<double, 2> ccu({2, 2}, 9999.);
using multi::blas::herk;
cuda::array<double, 2> ccu = herk(acu);
BOOST_REQUIRE( ccu[1][0] == 34 );
BOOST_REQUIRE( ccu[0][1] == 34 );
cuda::array<double, 2> const ccu_copy = herk(1., acu);
BOOST_REQUIRE( herk(1., acu) == ccu );
}
{
// herk on a managed-memory copy of `a`, result written into a preallocated 2x2 array
cuda::managed::array<double, 2> const amcu = a; BOOST_REQUIRE(a == amcu);
cuda::managed::array<double, 2> cmcu({2, 2}, 9999.);
using multi::blas::herk;
herk(1., amcu, cmcu);
BOOST_REQUIRE( cmcu[1][0] == 34 );
BOOST_REQUIRE( cmcu[0][1] == 34 );
cuda::managed::array<double, 2> const cmcu_copy = herk(1., amcu);
BOOST_REQUIRE( cmcu_copy == cmcu );
}
// The remaining three sections are additionally guarded by `if(0)`.
if(0){
// herk of the hermitized host matrix into a 3x3 result
multi::array<double, 2> c({3, 3}, 9999.);
using multi::blas::herk;
using multi::blas::hermitized;
herk(1., hermitized(a), c);
BOOST_REQUIRE( c[2][1] == 19 );
BOOST_REQUIRE( c[1][2] == 19 );
multi::array<double, 2> const c_copy = herk(1., hermitized(a));
BOOST_REQUIRE( c_copy == c );
}
if(0){
cuda::array<double, 2> const acu = a; BOOST_REQUIRE(acu == a);
cuda::array<double, 2> ccu({3, 3}, 9999.);
using multi::blas::herk;
using multi::blas::hermitized;
herk(1., hermitized(acu), ccu);
BOOST_REQUIRE( ccu[2][1] == 19 );
BOOST_REQUIRE( ccu[1][2] == 19 );
// NOTE(review): this hermitizes the host array `a`, not `acu` as in the call above — confirm intent before re-enabling
cuda::array<double, 2> const c_copy = herk(1., hermitized(a));
BOOST_REQUIRE( c_copy == ccu );
}
if(0){
cuda::managed::array<double, 2> const amcu = a; BOOST_REQUIRE(amcu == a);
cuda::managed::array<double, 2> cmcu({3, 3}, 9999.);
using multi::blas::herk;
using multi::blas::hermitized;
herk(1., hermitized(amcu), cmcu);
BOOST_REQUIRE( cmcu[2][1] == 19 );
BOOST_REQUIRE( cmcu[1][2] == 19 );
// NOTE(review): this hermitizes the host array `a`, not `amcu` as in the call above — confirm intent before re-enabling
cuda::managed::array<double, 2> const c_copy = herk(1., hermitized(a));
BOOST_REQUIRE( c_copy == cmcu );
}
}
#endif

View File

@ -0,0 +1,49 @@
#ifdef COMPILATION// -*-indent-tabs-mode:t;c-basic-offset:4;tab-width:4;-*-
$CXX $0 -o $0x `pkg-config --libs blas` -lcudart -lcublas -lboost_unit_test_framework&&$0x&&rm $0x;exit
#endif
// © Alfredo A. Correa 2019-2020
#define BOOST_TEST_MODULE "C++ Unit Tests for Multi BLAS/cuBLAS iamax"
#define BOOST_TEST_DYN_LINK
#include<boost/test/unit_test.hpp>
#include "../../blas/iamax.hpp"
#include "../../../array.hpp"
#include "../../../adaptors/cuda.hpp"
#include "../../../adaptors/blas/cuda.hpp"
#include<complex>
using std::cout;
namespace multi = boost::multi;
namespace blas = multi::blas;
using complex = std::complex<double>; constexpr complex I{0, 1};
// Compares blas::iamax on row 1 of a host complex matrix with std::max_element
// using an ordering by |real| + |imag|.
BOOST_AUTO_TEST_CASE(multi_adaptors_blas_iamax){
	using blas::iamax;

	multi::array<complex, 2> const A = {
		{1. + 2.*I,  2.       ,  3.        ,  4.},
		{5.       ,  6. + 3.*I,  7.        ,  8.},
		{9.       , 10.       , 11. + 4.*I , 12.}
	};

	// "less than" in the sum of absolute values of the real and imaginary parts
	auto const abs1_less = [](auto const& lhs, auto const& rhs){
		using std::abs;
		auto const lhs_norm = abs(real(lhs)) + abs(imag(lhs));
		auto const rhs_norm = abs(real(rhs)) + abs(imag(rhs));
		return lhs_norm < rhs_norm;
	};

	// reference answer, computed with the standard library
	auto const expected = std::max_element(begin(A[1]), end(A[1]), abs1_less);

	BOOST_REQUIRE( iamax(A[1]) == expected - begin(A[1]) );  // same position...
	BOOST_REQUIRE( A[1][iamax(A[1])] == *expected );         // ...and same element
}
// Runs blas::iamax on row 1 of a matrix held in a multi::cuda::array and expects index 1
// (the entry 6 + 3i).
BOOST_AUTO_TEST_CASE(multi_adaptors_blas_iamax_cuda){
	using blas::iamax;

	multi::cuda::array<complex, 2> const A = {
		{1. + 2.*I,  2.       ,  3.        ,  4.},
		{5.       ,  6. + 3.*I,  7.        ,  8.},
		{9.       , 10.       , 11. + 4.*I , 12.}
	};

	BOOST_REQUIRE( iamax(A[1]) == 1 );
}

View File

@ -0,0 +1,47 @@
#ifdef COMPILATION// -*-indent-tabs-mode:t;c-basic-offset:4;tab-width:4-*-
$CXX $0 -o $0x `pkg-config --libs blas` -lcudart -lcublas -lboost_unit_test_framework&&$0x&&rm $0x;exit
#endif
// © Alfredo A. Correa 2019-2020
#define BOOST_TEST_MODULE "C++ Unit Tests for Multi cuBLAS nrm2"
#define BOOST_TEST_DYN_LINK
#include<boost/test/unit_test.hpp>
#include "../../blas.hpp"
#include "../../../array.hpp"
#include "../../../adaptors/cuda.hpp"
#include "../../../adaptors/blas/cuda.hpp"
#include<complex>
namespace multi = boost::multi;
using complex = std::complex<double>; constexpr complex I{0,1};
// Exercises blas::nrm2 on host and CUDA arrays, plus a complex blas::dot sanity check.
BOOST_AUTO_TEST_CASE(multi_blas_nrm2){
	namespace blas = multi::blas;

	multi::array<double, 2> const A = {
		{1.,  2.,  3.,  4.},
		{5.,  6.,  7.,  8.},
		{9., 10., 11., 12.}
	};

	// host: the norm of a row equals the square root of its dot product with itself
	BOOST_REQUIRE( blas::nrm2(A[1]) == std::sqrt(blas::dot(A[1], A[1])) );

	{	// complex dot product, compared with the sum of plain (unconjugated) products
		multi::array<complex, 1> Z = {1.+I, 3.+2.*I, 3.+4.*I};
		BOOST_REQUIRE( blas::dot(Z, Z)() == (1.+I)*(1.+I) + (3.+2.*I)*(3.+2.*I) + (3.+4.*I)*(3.+4.*I) );
	}
	{	// device input, result stored in a zero-dimensional device array (only checks that the call works)
		multi::cuda::array<double, 2> const A_device = A;
		multi::cuda::static_array<double, 0> norm_device = 1.2;
		blas::nrm2(A_device[1], norm_device);
	}
	{	// device input, result stored in a host scalar
		multi::cuda::array<double, 2> A_device = A;
		double norm_out = 99.;
		blas::nrm2(A_device[1], norm_out); // cuda supports putting scalar results in CPU
		double norm_ret{blas::nrm2(A_device[1])};
		BOOST_REQUIRE( norm_out == norm_ret );  // out-parameter and returned value agree
	}
}

View File

@ -0,0 +1,253 @@
// -*-indent-tabs-mode:t;c-basic-offset:4;tab-width:4;-*-
// © Alfredo A. Correa 2019-2021
#define BOOST_TEST_MODULE "C++ Unit Tests for Multi BLAS numeric"
#define BOOST_TEST_DYN_LINK
#include<boost/test/unit_test.hpp>
#include "config.hpp"
#include "../../../array.hpp"
#include "../../blas/numeric.hpp"
#include "../../blas/operations.hpp"
#include<complex>
namespace multi = boost::multi;
// blas::real and blas::imag of a 1D complex array give element-wise real/imaginary parts.
BOOST_AUTO_TEST_CASE(multi_adaptors_blas_test_numeric_imag){
	namespace blas = multi::blas;
	using complex = std::complex<double>;
	constexpr complex I{0, 1};

	multi::array<complex, 1> v = { 1. + 2.*I, 3. + 5.*I, 9. + 2.*I };

	// last element is 9 + 2i
	BOOST_REQUIRE( blas::imag(v)[2] == 2. );
	BOOST_REQUIRE( blas::real(v)[2] == 9. );
}
// Tests the conjugating pointer (blas::make_conjugater) and the conjugated array view (blas::conj):
// conversion between the pointer types obtained from mutable and const data, the
// is_complex_array / is_conjugated traits, element access and double conjugation.
BOOST_AUTO_TEST_CASE(multi_blas_numeric_real_conjugated){
using complex = std::complex<double>; complex const I{0, 1};
// mutable 3x2 complex matrix
multi::array<complex, 2> B = {
{1. - 3.*I, 6. + 2.*I},
{8. + 2.*I, 2. + 4.*I},
{2. - 1.*I, 1. + 1.*I}
};
BOOST_REQUIRE( B[0][0] == 1. - 3.*I );
// const matrix with the same values; only the type of its data pointer is used below
multi::array<complex, 2> const Bconst = {
{1. - 3.*I, 6. + 2.*I},
{8. + 2.*I, 2. + 4.*I},
{2. - 1.*I, 1. + 1.*I}
};
BOOST_REQUIRE( Bconst[0][0] == 1. - 3.*I );
namespace blas = multi::blas;
// conjugating pointer built from the mutable data
auto BdataC = blas::make_conjugater(B.data_elements());
// `ppp` has the pointer type obtained from the const data; it is default-constructed first
// and then assigned from the pointer obtained from mutable data, so both operations must compile
decltype(blas::make_conjugater(Bconst.data_elements())) ppp;// = BdataC;
ppp = BdataC;
// dereferencing yields the conjugate of B[0][0]
BOOST_REQUIRE( *ppp == 1. + 3.*I );
// static_assert( multi::blas::is_complex_array<multi::array<thrust::complex<double>, 2>>{}, "!");
static_assert( blas::is_complex_array<decltype(B)>{} );
static_assert(not blas::is_conjugated<decltype(B)>{} );
// Bconj is bound by forwarding reference to whatever blas::conj(B) returns
auto&& Bconj = blas::conj(B);
static_assert( blas::is_conjugated<decltype(Bconj)>{} );
BOOST_REQUIRE( Bconj[0][0] == 1. + 3.*I );
// dereferencing base(Bconj) also yields the conjugated value (imaginary part +3, not -3)
BOOST_REQUIRE( imag(*base(Bconj)) == +3 );
// BOOST_TEST_REQUIRE( base(Bconj)->imag() == +3 );
// rotating the conjugated view swaps the two indices
BOOST_REQUIRE( rotated(Bconj)[1][0] == Bconj[0][1] );
// BOOST_REQUIRE( base(Bconj) == -3.*I );
static_assert( blas::is_complex_array<decltype(Bconj)>{} );
// conjugating twice gives back the original values
BOOST_REQUIRE( blas::conj(Bconj) == B );
BOOST_REQUIRE( blas::conj(B)[1][0] == std::conj(B[1][0]) );
}
// Checks that the BLAS views of a 3x2 complex matrix (conj, transposed, hermitized, real, imag,
// real_doubled) agree element by element with the corresponding std:: operations, and that
// blas::conj(B) can be copied ("decayed") into a plain array.
BOOST_AUTO_TEST_CASE(multi_blas_numeric_decay){
	using complex = std::complex<double>; complex const I{0, 1};

	// 3 rows x 2 columns: valid indices are B[0..2][0..1]
	multi::array<complex, 2> B = {
		{ 1. - 3.*I, 6. + 2.*I},
		{ 8. + 2.*I, 2. + 4.*I},
		{ 2. - 1.*I, 1. + 1.*I}
	};

	namespace blas = multi::blas;

	// copy of the conjugated view into an owning array
	multi::array<complex, 2> conjB = blas::conj(B);

	BOOST_REQUIRE( conjB[2][1] == std::conj(B[2][1]) );
	BOOST_REQUIRE( blas::conj(B)[2][1] == std::conj(B[2][1]) );

	BOOST_REQUIRE( blas::transposed(B)[1][2] == B[2][1] );
	BOOST_REQUIRE( blas::transposed(B) == ~B );

	// hermitized(B) is conj(transposed(B)) (checked on the next line), i.e. 2x3, so the
	// in-range statement is hermitized(B)[j][i] == conj(B)[i][j]. The previous indices
	// ([2][1] on the 2x3 view, [1][2] on the 3x2 view) were both out of range.
	BOOST_REQUIRE( blas::hermitized(B)[1][2] == blas::conj(B)[2][1] );
	BOOST_REQUIRE( blas::hermitized(B) == blas::conj(blas::transposed(B)) );

	BOOST_REQUIRE( blas::real(B)[2][1] == std::real(B[2][1]) );
	BOOST_REQUIRE( blas::imag(B)[2][1] == std::imag(B[2][1]) );

	// real_doubled: every complex entry becomes two consecutive reals (re, im) in the same row
	multi::array<double, 2> B_real_doubled = {
		{ 1., -3., 6., 2.},
		{ 8.,  2., 2., 4.},
		{ 2., -1., 1., 1.}
	};
	BOOST_REQUIRE( blas::real_doubled(B) == B_real_doubled );
}
#if defined(CUDA_FOUND) and CUDA_FOUND
#include<thrust/complex.h>
// Same "decay" check as the std::complex test, with thrust::complex elements: the conjugated
// view can be copied into an owning array and holds the conjugated values.
BOOST_AUTO_TEST_CASE(multi_blas_numeric_decay_thrust){
	using complex = thrust::complex<double>; complex const I{0, 1};

	// 3 rows x 2 columns: valid indices are B[0..2][0..1]
	multi::array<complex, 2> B = {
		{1. - 3.*I, 6. + 2.*I},
		{8. + 2.*I, 2. + 4.*I},
		{2. - 1.*I, 1. + 1.*I}
	};

	namespace blas = multi::blas;
	multi::array<complex, 2> conjB = blas::conj(B);

	// [2][1] is the last element; the previous index [1][2] was out of range for a 3x2 matrix
	BOOST_REQUIRE( conjB[2][1] == conj(B[2][1]) );
}
#endif
//#if defined(CUDA_FOUND) and CUDA_FOUND
//#include "../../blas/cuda.hpp"
//#include "../../../adaptors/cuda.hpp"
//namespace cuda = multi::cuda;
//BOOST_AUTO_TEST_CASE(multi_adaptors_blas_test_numeric_imag_cuda){
// cuda::array<complex, 1> a = { 1. + 2.*I, 3. + 5.*I, 9. + 2.*I };
// namespace blas = multi::blas;
// BOOST_REQUIRE( blas::imag(a)[2] == 2. );
//}
//BOOST_AUTO_TEST_CASE(multi_adaptors_blas_test_numeric_imag_cuda_managed){
// cuda::managed::array<complex, 1> a = { 1. + 2.*I, 3. + 5.*I, 9. + 2.*I };
// using multi::blas::imag;
// BOOST_REQUIRE( imag(a)[2] == 2. );
//}
//BOOST_AUTO_TEST_CASE(multi_adaptors_blas_test_numeric_hermitized_cuda){
// cuda::array<complex, 2> const a = {
// { 1. + 2.*I, 3. + 5.*I, 9. + 2.*I },
// { 1. + 2.*I, 3. + 5.*I, 9. + 2.*I },
// { 1. + 2.*I, 3. + 5.*I, 9. + 2.*I },
// };
// using multi::blas::hermitized;
// hermitized(a);
//}
//#endif
// Checks real-to-complex array conversion, the blas::real / blas::imag views of a complex matrix,
// and that assigning through a blas::hermitized view stores the conjugate in the original.
BOOST_AUTO_TEST_CASE(multi_blas_numeric_real_imag_part){
	using complex = std::complex<double>;
	complex const I{0., 1.};

	// a real array converts element-wise into a complex one
	multi::array<double, 2> A = {
		{1., 3., 4.},
		{9., 7., 1.}
	};
	multi::array<complex, 2> Acplx = A;
	BOOST_REQUIRE( Acplx[1][1] == A[1][1] );

	multi::array<complex, 2> B = {
		{1. - 3.*I, 6. + 2.*I},
		{8. + 2.*I, 2. + 4.*I},
		{2. - 1.*I, 1. + 1.*I}
	};
	// expected real and imaginary parts of B
	multi::array<double, 2> expected_real = {
		{1., 6.},
		{8., 2.},
		{2., 1.}
	};
	multi::array<double, 2> expected_imag = {
		{-3., +2.},
		{+2., +4.},
		{-1., +1.}
	};

	using multi::blas::real;
	using multi::blas::imag;

	// comparison is tested with the view on either side
	BOOST_REQUIRE( expected_real == real(B) );
	BOOST_REQUIRE( real(B) == expected_real );
	BOOST_REQUIRE( imag(B) == expected_imag );

	BOOST_REQUIRE( B[1][0] == 8. + 2.*I );
	BOOST_REQUIRE( B[1][0].imag() == 2. );

	namespace blas = multi::blas;

	// reading through the hermitized view conjugates and swaps the indices...
	BOOST_REQUIRE( blas::hermitized(B)[1][2] == std::conj( B[2][1] ) );

	// ...and writing through it stores the conjugate into B
	blas::hermitized(B)[1][2] = 20. + 30.*I;
	BOOST_REQUIRE( B[2][1] == 20. - 30.*I );

//	using multi::blas::hermitized;
//	BOOST_REQUIRE( hermitized(B)[0][1] == 8. - 2.*I );
//	BOOST_REQUIRE( imag(hermitized(B)[0][1]) == -2. );
}
// Disabled fragment: nothing between `#if 0` and `#endif` is compiled.
// It refers to `A`, `B` and `complex`, none of which is declared at this scope, and it ends
// with an unmatched `}`, so it reads as the leftover tail of an earlier version of a test case.
#if 0
namespace cuda = multi::cuda;
{
// real/imag views of a device copy of B compared with those of the host array
cuda::array<complex, 2> Bgpu = B;
using multi::blas::imag;
BOOST_REQUIRE( imag(Bgpu)[1][1] == imag(B)[1][1] );
BOOST_REQUIRE( real(Bgpu)[1][1] == real(B)[1][1] );
}
{
// same check with a managed-memory copy
cuda::managed::array<complex, 2> Bgpu = B;
using multi::blas::imag;
BOOST_REQUIRE( imag(Bgpu)[1][1] == imag(B)[1][1] );
BOOST_REQUIRE( real(Bgpu)[1][1] == real(B)[1][1] );
}
// view of B's storage as doubles, with twice as many columns
multi::array_ref<double, 2> rB(reinterpret_cast<double*>(data_elements(B)), {size(B), 2*size(*begin(B))});
// view of B through a conjugating pointer
auto&& Bconj = multi::static_array_cast<complex, multi::blas::detail::conjugater<complex*>>(B);
assert( size(Bconj) == size(B) );
// NOTE(review): if B is the 3x2 matrix used by the tests in this file, index [1][2] is out of range — confirm before re-enabling
assert( conj(B[1][2]) == Bconj[1][2] );
// auto&& BH = multi::blas::hermitized(B);
// assert( BH[1][2] == conj(B[2][1]) );
// std::cout << BH[1][2] << " " << B[2][1] << std::endl;
// auto&& BH1 = multi::static_array_cast<complex, multi::blas::detail::conjugater<complex*>>(rotated(B));
// auto&& BH2 = rotated(multi::static_array_cast<complex, multi::blas::detail::conjugater<complex*>>(B));
// what( BH1, BH2 );
// using multi::blas::imag;
// assert( real(A)[1][2] == 1. );
// assert( imag(A)[1][2] == -3. );
// print(A) <<"--\n";
// print(real(A)) <<"--\n";
// print(imag(A)) <<"--\n";
multi::array<complex, 2> C({2, 2});
// view of C's storage as doubles, with twice as many columns
multi::array_ref<double, 2> rC(reinterpret_cast<double*>(data_elements(C)), {size(C), 2*size(*begin(C))});
// gemm('T', 'T', 1., A, B, 0., C);
// gemm('T', 'T', 1., A, B, 0., C);
// gemm('T', 'T', 1., real(A), B, 0., C);
}
#endif

View File

@ -0,0 +1,51 @@
#ifdef COMPILATION// -*-indent-tabs-mode:t;c-basic-offset:4;tab-width:4;-*-
$CXX $0 -o $0x `pkg-config --libs blas` -lcudart -lcublas -lboost_unit_test_framework&&$0x&&rm $0x;exit
#endif
// © Alfredo A. Correa 2019-2020
#define BOOST_TEST_MODULE "C++ Unit Tests for Multi BLAS operations and cuda"
#define BOOST_TEST_DYN_LINK
#include<boost/test/unit_test.hpp>
#include "../../blas/dot.hpp"
#include "../../../array.hpp"
#include "../../blas/cuda.hpp"
#include "../../../adaptors/cuda.hpp"
#include "../../../complex.hpp"
#include<complex>
#include<cassert>
#include<numeric>
using std::cout;
namespace multi = boost::multi;
namespace blas = multi::blas;
using complex = std::complex<double>; constexpr complex I{0, 1};
// blas::C (conjugated view) of a 1D complex array, checked on a host array and on a cuda::array.
BOOST_AUTO_TEST_CASE(blas_conjugated_cpu){
	namespace cuda = multi::cuda;

	// host array
	multi::array<complex, 1> const host = {5. + 2.*I, 6. + 6.*I, 7. + 2.*I, 8. - 3.*I};
	BOOST_REQUIRE( blas::C(host)[1] == conj(host[1]) );

	// device array with the same values
	cuda::array<complex, 1> const device = {5. + 2.*I, 6. + 6.*I, 7. + 2.*I, 8. - 3.*I};
	BOOST_REQUIRE( blas::C(device)[1] == conj(device[1]) );
}
// Placeholder test case: the whole body is excluded by `#if 0`, so this test currently
// runs no checks and always passes.
BOOST_AUTO_TEST_CASE(blas_conjugated_gpu){
#if 0
// NOTE(review): `cuda` is only aliased inside the previous test case, so this body would need
// its own `namespace cuda = multi::cuda;` to compile if re-enabled.
cuda::array<complex, 1> const acu = {1. + I, 2. + 3.*I, 3. + 2.*I, 4. - 9.*I};
cuda::array<complex, 1> const bcu = {5. + 2.*I, 6. + 6.*I, 7. + 2.*I, 8. - 3.*I};
{
// dot product of two device arrays stored in a zero-dimensional device array
cuda::array<complex, 0> ccu;
blas::dot(acu, bcu, ccu);
BOOST_REQUIRE( ccu() == 19. - 27.*I );
}
// NOTE(review): bcu[1] is 6 + 6i, whereas 2 - 3i is the conjugate of acu[1] — confirm which array was meant
BOOST_REQUIRE( blas::C(bcu)[1] == 2. - 3.*I );
#endif
}

View File

@ -0,0 +1,153 @@
// -*-indent-tabs-mode:t;c-basic-offset:4;tab-width:4;-*-
// © Alfredo A. Correa 2019-2020
#define BOOST_TEST_MODULE "C++ Unit Tests for Multi BLAS scal"
#define BOOST_TEST_DYN_LINK
#include<boost/test/unit_test.hpp>
#include "../../blas/scal.hpp"
#include "../../../array.hpp"
#include<complex>
namespace multi = boost::multi;
namespace blas = multi::blas;
// blas::scal_n (iterator + count) scales the last row in place and leaves other rows alone.
BOOST_AUTO_TEST_CASE(multi_adaptors_blas_test_scal_n){
	multi::array<double, 2> A = {
		{1.,  2.,  3.,  4.},
		{5.,  6.,  7.,  8.},
		{9., 10., 11., 12.}
	};

	// values before scaling
	BOOST_REQUIRE( A[0][2] == 3. );
	BOOST_REQUIRE( A[2][2] == 11. );

	blas::scal_n(2., A[2].begin(), A[2].size());

	// row 0 untouched, row 2 doubled
	BOOST_REQUIRE( A[0][2] == 3. );
	BOOST_REQUIRE( A[2][2] == 11.*2. );
}
// blas::scal (iterator range) scales the last row in place and leaves other rows alone.
BOOST_AUTO_TEST_CASE(multi_adaptors_blas_test_scal_it){
	multi::array<double, 2> A = {
		{1.,  2.,  3.,  4.},
		{5.,  6.,  7.,  8.},
		{9., 10., 11., 12.}
	};

	// values before scaling
	BOOST_REQUIRE( A[0][2] ==  3. );
	BOOST_REQUIRE( A[2][2] == 11. );

	blas::scal(2., A[2].begin(), A[2].end());

	// row 0 untouched, row 2 doubled
	BOOST_REQUIRE( A[0][2] ==  3.     );
	BOOST_REQUIRE( A[2][2] == 11.*2.  );
}
// Deleted function template: any call `what(expr)` is a compile error.
// NOTE(review): presumably a debugging aid that makes the compiler report the deduced type T;
// no call to it is visible in this test file.
template<class T> void what(T&&) = delete;
// blas::scal on a whole row (range interface): value, identity of the returned object,
// and in-place effect.
BOOST_AUTO_TEST_CASE(multi_adaptors_blas_test_scal_real){
	multi::array<double, 2> A = {
		{1.,  2.,  3.,  4.},
		{5.,  6.,  7.,  8.},
		{9., 10., 11., 12.}
	};

	BOOST_REQUIRE( A[0][2] ==  3. );
	BOOST_REQUIRE( A[2][2] == 11. );

	// scaling by one: the result compares equal to the row, refers to the same row,
	// and can be turned into a copy with unary plus
	BOOST_REQUIRE(  blas::scal(1., A[2]) ==  A[2] );
	BOOST_REQUIRE( &blas::scal(1., A[2]) == &A[2] );
	BOOST_REQUIRE( +blas::scal(1., A[2]) ==  A[2] );

	// scaling by two modifies only the selected row
	blas::scal(2., A[2]);
	BOOST_REQUIRE( A[0][2] == 3. );
	BOOST_REQUIRE( A[2][2] == 11.*2. );

	BOOST_REQUIRE( &blas::scal(1., A[2]) == &A[2] );
}
//BOOST_AUTO_TEST_CASE(multi_adaptors_blas_test_scal_complex_real_case){
// using complex = std::complex<double>;
// multi::array<complex, 2> A = {
// {1., 2., 3., 4.},
// {5., 6., 7., 8.},
// {9., 10., 11., 12.}
// };
// BOOST_TEST( A[0][2] == 3. );
// BOOST_TEST( A[2][2] == 11. );
// blas::scal(2., A[2]); // zscal (2. is promoted to complex later)
// BOOST_TEST( A[0][2] == 3. );
// BOOST_REQUIRE( A[2][2] == 11.*2. );
// blas::scal(1./2, A[2]); // zdscal
// BOOST_TEST( A[0][2] == 3. );
// BOOST_TEST( A[2][1] == 10. );
// BOOST_TEST( A[2][2] == 11. );
//}
//BOOST_AUTO_TEST_CASE(multi_adaptors_blas_test_scal_complex){
// multi::array<complex, 2> A = {
// {1. + 2.*I, 2. + 3.*I, 3. + 4.*I, 4. + 5.*I},
// {5. + 2.*I, 6. + 3.*I, 7. + 4.*I, 8. + 5.*I},
// {1. + 1.*I, 2. + 2.*I, 3. + 3.*I, 4. + 4.*I}
// };
// blas::scal(2., A[1]); // zscal (2. is promoted to complex later)
// BOOST_TEST( A[1][2] == 14. + 8.*I );
// blas::scal(3.*I, A[0]);
// BOOST_TEST( A[0][1] == (2. + 3.*I)*3.*I );
// blas::scal(2., blas::imag(A[2]));
// assert( A[2][1] == 2. + 4.*I );
//}
////BOOST_AUTO_TEST_CASE(multi_adaptors_blas_test_scal_cuda_noconst){
//// namespace cuda = multi::cuda;
//// cuda::array<complex, 2> A = {
//// {1. + 2.*I, 2. + 3.*I, 3. + 4.*I, 4. + 5.*I},
//// {5. + 2.*I, 6. + 3.*I, 7. + 4.*I, 8. + 5.*I},
//// {1. + 1.*I, 2. + 2.*I, 3. + 3.*I, 4. + 4.*I}
//// };
//// blas::scal(2., A[1]); // zscal (2. is promoted to complex later)
//// BOOST_REQUIRE( A[1][2] == 14. + 8.*I );
//// cuda::array<complex, 1> a = {1. + 10.*I, 2. + 20.*I, 3. + 30.*I};
//// blas::scal(2., a);
//// BOOST_REQUIRE(( a[1] == complex{4, 40} ));
////// blas::scal(3., blas::imag(a)); // gives internal compilation error in gcc
////// BOOST_REQUIRE(( a[1] == complex{4, 120} ));
////}
////BOOST_AUTO_TEST_CASE(multi_adaptors_blas_test_scal_cuda_const){
//// namespace cuda = multi::cuda;
//// cuda::array<complex, 2> const A = {
//// {1. + 2.*I, 2. + 3.*I, 3. + 4.*I, 4. + 5.*I},
//// {5. + 2.*I, 6. + 3.*I, 7. + 4.*I, 8. + 5.*I},
//// {1. + 1.*I, 2. + 2.*I, 3. + 3.*I, 4. + 4.*I}
//// };
//// auto A1cpy = blas::scal(2., A[1]); // zscal (2. is promoted to complex later)
//// BOOST_REQUIRE( A1cpy[2] == 14. + 8.*I );
////// cuda::array<complex, 1> a = {1. + 10.*I, 2. + 20.*I, 3. + 30.*I};
////// blas::scal(2., a);
////// BOOST_REQUIRE(( a[1] == complex{4, 40} ));
////// blas::scal(3., blas::imag(a));
////// BOOST_REQUIRE(( a[1] == complex{4, 120} ));
////}
//#if 0
//BOOST_AUTO_TEST_CASE(multi_adaptors_blas_test_scal_cuda_managed){
// cuda::managed::array<complex, 2> A = {
// {1. + 2.*I, 2. + 3.*I, 3. + 4.*I, 4. + 5.*I},
// {5. + 2.*I, 6. + 3.*I, 7. + 4.*I, 8. + 5.*I},
// {1. + 1.*I, 2. + 2.*I, 3. + 3.*I, 4. + 4.*I}
// };
// using blas::scal;
// scal(2., A[1]);
// BOOST_REQUIRE( A[1][2] == 14. + 8.*I );
// scal(2., blas::imag(A[1]));
// BOOST_REQUIRE( A[1][2] == 14. + 16.*I );
//}
//#endif

View File

@ -0,0 +1,90 @@
#ifdef COMPILATION// -*-indent-tabs-mode:t;c-basic-offset:4;tab-width:4;-*-
$CXX $0 -o $0x `pkg-config --libs blas` -lboost_unit_test_framework&&$0x&&rm $0x; exit
#endif
// © Alfredo A. Correa 2019-2020
#define BOOST_TEST_MODULE "C++ Unit Tests for Multi BLAS swap"
#define BOOST_TEST_DYN_LINK
#include<boost/test/unit_test.hpp>
#include "../../blas.hpp"
#include "../../../array.hpp"
#include<complex>
#include<cassert>
using std::cout;
namespace multi = boost::multi;
// Tests multi::blas::swap against the built-in swap / std::swap_ranges on rows and columns.
// NOTE(review): the test-case name says "lapack_potrf" but every check below is about swap;
// the name is kept because it is what test filters see.
BOOST_AUTO_TEST_CASE(lapack_potrf, *boost::unit_test::tolerance(0.00001) ){
	{	// rows of a real matrix: swap with BLAS, then swap back with the built-in swap
		multi::array<double, 2> A = {
			{1.,  2.,  3.,  4.},
			{5.,  6.,  7.,  8.},
			{9., 10., 11., 12.}
		};
		BOOST_REQUIRE( A[0][2] ==  3. );
		BOOST_REQUIRE( A[2][2] == 11. );

		multi::blas::swap(A[0], A[2]); // blas swap
		BOOST_REQUIRE( A[0][2] == 11. );
		BOOST_REQUIRE( A[2][2] ==  3. );

		swap(A[0], A[2]); // built-in swap
		BOOST_REQUIRE( A[0][2] ==  3. );
		BOOST_REQUIRE( A[2][2] == 11. );
	}
	{	// columns of a real matrix, accessed as rows of the rotated view
		multi::array<double, 2> A = {
			{1.,  2.,  3.,  4.},
			{5.,  6.,  7.,  8.},
			{9., 10., 11., 12.}
		};
		BOOST_REQUIRE( A[0][0] == 1. );
		BOOST_REQUIRE( A[0][3] == 4. );

		multi::blas::swap(rotated(A)[0], rotated(A)[3]); // blas swap (deep)
		BOOST_REQUIRE( A[0][0] == 4. );
		BOOST_REQUIRE( A[0][3] == 1. );

		swap(rotated(A)[0], rotated(A)[3]); // built-in swap (deep)
		BOOST_REQUIRE( A[0][0] == 1. );
		BOOST_REQUIRE( A[0][3] == 4. );
	}
	{	// columns of a complex matrix
		using complex = std::complex<double>;
		complex const I{0, 1};
		multi::array<complex, 2> A = {
			{1.+ 2.*I,  2.,  3.,  4. + 3.*I},
			{5.      ,  6.,  7.,  8.       },
			{9.      , 10., 11., 12.       }
		};
		BOOST_REQUIRE( A[0][0] == 1.+ 2.*I );
		BOOST_REQUIRE( A[0][3] == 4. + 3.*I );

		multi::blas::swap(rotated(A)[0], rotated(A)[3]); // blas swap (deep)
		BOOST_REQUIRE( A[0][0] == 4. + 3.*I );
		BOOST_REQUIRE( A[0][3] == 1.+ 2.*I );

		swap(rotated(A)[0], rotated(A)[3]); // built-in swap (deep)
		BOOST_REQUIRE( A[0][0] == 1.+ 2.*I );
		BOOST_REQUIRE( A[0][3] == 4. + 3.*I );
	}
	{	// iterator interface: swap all but the last element of row 0 with row 2
		multi::array<double, 2> A = {
			{1.,  2.,  3.,  4.},
			{5.,  6.,  7.,  8.},
			{9., 10., 11., 12.}
		};
		BOOST_REQUIRE( A[0][2] ==  3. );
		BOOST_REQUIRE( A[2][2] == 11. );

		auto swapped_end = multi::blas::swap(begin(A[0]), end(A[0]) - 1, begin(A[2])); // blas swap
		// the returned iterator points one past the last swapped element of the second range
		BOOST_REQUIRE( swapped_end == end(A[2]) - 1 );
		BOOST_REQUIRE( A[0][2] == 11. );
		BOOST_REQUIRE( A[2][2] ==  3. );

		using std::swap_ranges;
		swap_ranges(begin(A[0]), end(A[0]), begin(A[2])); // built-in swap
		BOOST_REQUIRE( A[0][2] ==  3. );
		BOOST_REQUIRE( A[2][2] == 11. );
	}
}

View File

@ -0,0 +1,33 @@
#ifdef COMPILATION// -*-indent-tabs-mode:t;c-basic-offset:4;tab-width:4;-*-
$CXXX $CXXFLAGS $0 -o $0.$X `pkg-config --cflags --libs blas cuda-11.0` -lboost_unit_test_framework&&$0.$X&&rm $0.$X;exit
#endif
#include "../../blas/traits.hpp"
#define BOOST_TEST_MODULE "C++ Unit Tests for Multi BLAS traits"
#define BOOST_TEST_DYN_LINK
#include<boost/test/unit_test.hpp>
#include "./config.hpp"
#include<complex>
namespace multi = boost::multi;
namespace blas = multi::blas;
// Compile-time checks of the BLAS type-classification traits (s, d, c, z) for the
// standard scalar types; there are no run-time assertions.
BOOST_AUTO_TEST_CASE(multi_adaptors_blas_traits){
	// real types
	static_assert( blas::is_s<float >{} );
	static_assert( blas::is_d<double>{} );

	// complex types
	static_assert( blas::is_c<std::complex<float>>{} );
	static_assert( blas::is_z<std::complex<double>>{} );
}
#if CUDA_FOUND
#include<thrust/complex.h>
// Compile-time checks that thrust::complex types are classified like std::complex
// (c for float, z for double).
BOOST_AUTO_TEST_CASE(multi_adaptors_blas_traits_thrust){
	using thrust_cfloat  = thrust::complex<float>;
	using thrust_cdouble = thrust::complex<double>;

	static_assert( blas::is_c<thrust_cfloat>{} );
	static_assert( blas::is_z<thrust_cdouble>{} );
}
#endif

View File

@ -0,0 +1,604 @@
#ifdef COMPILATION// -*-indent-tabs-mode:t;c-basic-offset:4;tab-width:4;autowrap:nil;-*-
$CXX $0 -o $0x -lcudart -lcublas `pkg-config --libs blas` -lboost_unit_test_framework&&$0x&&rm $0x;exit
#endif
// © Alfredo A. Correa 2019-2021
#define BOOST_TEST_MODULE "C++ Unit Tests for Multi BLAS trsm"
#define BOOST_TEST_DYN_LINK
#include<boost/test/unit_test.hpp>
//#include "../../../memory/adaptors/cuda/managed/ptr.hpp"
#include "../../../adaptors/blas/gemm.hpp"
#include "../../../adaptors/blas/trsm.hpp"
//#include "../../../adaptors/blas/cuda.hpp"
//#include "../../../adaptors/cuda.hpp"
#include "../../../array.hpp"
#include <config.hpp>
namespace multi = boost::multi;
// Returns a copy of `m` with one triangle zeroed out:
//  - filling::upper: elements strictly below the diagonal are set to zero,
//  - filling::lower: elements strictly above the diagonal are set to zero.
// The diagonal itself is never modified. Used by the tests to build an explicit triangular
// matrix from one whose ignored triangle holds NaN.
template<class Matrix>
auto triangular(multi::blas::filling f, Matrix const& m){
	auto tri = +m;  // owning copy of the input

	auto const nrows = size( tri);
	auto const ncols = size(~tri);

	if(f == multi::blas::filling::upper){
		// row by row: clear the columns to the left of the diagonal
		for(multi::size_type row = 0; row != nrows; ++row){
			for(multi::size_type col = 0; col != std::min(row, ncols); ++col){
				tri[row][col] = 0.;
			}
		}
	}else if(f == multi::blas::filling::lower){
		// column by column: clear the rows above the diagonal
		for(multi::size_type col = 0; col != ncols; ++col){
			for(multi::size_type row = 0; row != std::min(col, nrows); ++row){
				tri[row][col] = 0.;
			}
		}
	}
	return tri;
}
// Degenerate case: trsm with default-constructed (empty) matrices must simply not fail.
BOOST_AUTO_TEST_CASE(multi_blas_trsm_double_0x0){//, *utf::tolerance(0.00001)){
	namespace blas = multi::blas;

	multi::array<double, 2> const A;  // empty coefficient matrix
	{
		multi::array<double, 2> B;  // empty right-hand side, overwritten with the solution
		// B=Solve(A.X=alpha*B, X) B=A⁻¹B, B=B.(A)⁻¹, A upper triangular (implicit zeros below)
		blas::trsm(
			blas::side::left, blas::filling::upper, blas::diagonal::general,
			1., A, B
		);
	}
}
// trsm with a 1x1 coefficient matrix: the solve reduces to dividing (alpha times) the
// right-hand side by A[0][0]. Each result is also verified by multiplying back with gemm.
BOOST_AUTO_TEST_CASE(multi_blas_trsm_double_1x1){//, *utf::tolerance(0.00001)){
	namespace blas = multi::blas;

	multi::array<double, 2> const A = {
		{10.,},
	};
	{	// 1x1 right-hand side, alpha = 1
		multi::array<double, 2> X = {
			{3.,},
		};
		auto const rhs = X;  // saved before X is overwritten with the solution
		// B=Solve(A.X=alpha*B, X) B=A⁻¹B, B=B.(A)⁻¹, A upper triangular (implicit zeros below)
		blas::trsm(blas::side::left, blas::filling::upper, blas::diagonal::general, 1., A, X);
		BOOST_REQUIRE_CLOSE( X[0][0] , 3./10. , 0.00001 );
		BOOST_REQUIRE_CLOSE( (+blas::gemm(1., A, X))[0][0] , rhs[0][0] , 0.00001 );
	}
	{	// 1x1 right-hand side, alpha = 2
		multi::array<double, 2> X = {
			{3.,},
		};
		auto const rhs = X;
		// B=Solve(A.X=alpha*B, X) B=A⁻¹B, B=B.(A)⁻¹, A upper triangular (implicit zeros below)
		blas::trsm(blas::side::left, blas::filling::upper, blas::diagonal::general, 2., A, X);
		BOOST_REQUIRE_CLOSE( X[0][0] , 2.*3./10. , 0.00001 );
		BOOST_REQUIRE_CLOSE( (+blas::gemm(1., A, X))[0][0] , 2.*rhs[0][0] , 0.00001 );
	}
	{	// 1x3 right-hand side, alpha = 1
		multi::array<double, 2> X = {
			{3., 4., 5.},
		};
		auto const rhs = X;
		// B=Solve(A.X=alpha*B, X) B=A⁻¹B, B=B.(A)⁻¹, A upper triangular (implicit zeros below)
		blas::trsm(blas::side::left, blas::filling::upper, blas::diagonal::general, 1., A, X);
		BOOST_REQUIRE_CLOSE( X[0][0] , 3./10. , 0.00001 );
		BOOST_REQUIRE_CLOSE( X[0][1] , 4./10. , 0.00001 );
		BOOST_REQUIRE_CLOSE( X[0][2] , 5./10. , 0.00001 );
		BOOST_REQUIRE_CLOSE( (+blas::gemm(1., A, X))[0][1] , rhs[0][1] , 0.00001 );
	}
}
// trsm with a 3x3 real upper-triangular matrix whose lower triangle holds NaN (it must never be
// read), for every combination of plain/transposed storage of A and B. Where a gemm
// multiply-back check is made, it uses a copy of A with the ignored triangle zeroed.
// The multiply-back checks compare with a tolerance: the values come out of a floating-point
// solve followed by a product, so bitwise equality depends on the BLAS implementation.
BOOST_AUTO_TEST_CASE(multi_blas_trsm_real_square){//, *utf::tolerance(0.00001)){
	namespace blas = multi::blas;
	multi::array<double, 2> const A = {
		{ 1.,  3.,  4.},
		{ NAN,  7.,  1.},
		{ NAN,  NAN,  8.}
	};
	auto const A_cpy = triangular(blas::filling::upper, A);  // NaNs replaced by explicit zeros
	{	// A and B both in plain storage
		multi::array<double, 2> B = {
			{1., 3., 4.},
			{2., 7., 1.},
			{3., 4., 2.}
		};
		auto const B_cpy = B;
		// B=Solve(A.X=alpha*B, X) B=A⁻¹B, B=B.(A)⁻¹, A upper triangular (implicit zeros below)
		blas::trsm(blas::side::left, blas::filling::upper, 1., A, B);
		BOOST_REQUIRE_CLOSE( B[1][2] , 0.107143 , 0.001 );
		BOOST_REQUIRE_CLOSE( (+blas::gemm(1., A_cpy, B))[1][2] , B_cpy[1][2] , 0.00001 );
	}
	{	// A given as the transposed view of its stored transpose
		auto const AT =+ ~A;
		auto const AT_cpy = triangular(blas::filling::lower, AT);
		multi::array<double, 2> B = {
			{1., 3., 4.},
			{2., 7., 1.},
			{3., 4., 2.}
		};
		auto const B_cpy = B;
		blas::trsm(blas::side::left, blas::filling::upper, 1., blas::T(AT), B);
		BOOST_REQUIRE_CLOSE( B[1][2] , 0.107143 , 0.001 );
		BOOST_REQUIRE_CLOSE( (+blas::gemm(1., blas::T(AT_cpy), B))[1][2] , B_cpy[1][2] , 0.00001 );
	}
	{	// both A and B given as transposed views
		auto const AT =+ ~A;
		auto const AT_cpy = triangular(blas::filling::lower, AT);
		multi::array<double, 2> const B = {
			{1., 3., 4.},
			{2., 7., 1.},
			{3., 4., 2.}
		};
		auto BT =+ ~B;
		blas::trsm(blas::side::left, blas::filling::upper, 1., blas::T(AT), blas::T(BT));
		BOOST_REQUIRE_CLOSE( blas::T(BT)[1][2] , 0.107143 , 0.001 );
		BOOST_REQUIRE_CLOSE( (+blas::gemm(1., blas::T(AT_cpy), blas::T(BT)))[1][2] , B[1][2] , 0.00001 );
	}
	{	// only B given as a transposed view
	//	auto const AT =+ ~A;
		multi::array<double, 2> const B = {
			{1., 3., 4.},
			{2., 7., 1.},
			{3., 4., 2.}
		};
		auto BT =+ ~B;
		blas::trsm(blas::side::left, blas::filling::upper, 1., A, blas::T(BT));
		BOOST_REQUIRE_CLOSE( (~BT)[1][2] , 0.107143 , 0.001 );
	}
}
// Complex trsm with the hermitized view of an upper-triangular matrix (hence "lower" filling)
// and a complex scaling factor; right-hand side is 3x3.
BOOST_AUTO_TEST_CASE(multi_blas_trsm_complex){//, *utf::tolerance(0.00001)){
	namespace blas = multi::blas;
	using complex = std::complex<double>;
	complex const I{0, 1};

	// upper triangular; the NaN entries below the diagonal must not be read
	multi::array<complex, 2> const A = {
		{ 1. + 2.*I,  3. - 1.*I,  4. + 9.*I},
		{NAN       ,  7. + 4.*I,  1. + 8.*I},
		{NAN       , NAN       ,  8. + 2.*I}
	};
	// right-hand side, overwritten with the solution
	multi::array<complex, 2> X = {
		{1. - 9.*I, 3. + 2.*I, 4. + 3.*I},
		{2. - 2.*I, 7. - 2.*I, 1. - 1.*I},
		{3. + 1.*I, 4. + 8.*I, 2. + 7.*I}
	};

	// B=alpha Inv[A†].B, B†=B†.Inv[A], Solve(A†.X=B, X), Solve(X†.A=B†, X), A is upper triangular (with implicit zeros below)
	blas::trsm(blas::side::left, blas::filling::lower, 2.+1.*I, blas::H(A), X);

	BOOST_REQUIRE_CLOSE( real(X[1][2]) ,  2.33846   , 0.0001 );
	BOOST_REQUIRE_CLOSE( imag(X[1][2]) , -0.0923077 , 0.0001 );
}
// Complex trsm with the hermitized view of an upper-triangular matrix and a rectangular
// (3x2) right-hand side.
BOOST_AUTO_TEST_CASE(multi_blas_trsm_complex_rectangular){//, *utf::tolerance(0.00001)){
	namespace blas = multi::blas;
	using complex = std::complex<double>;
	complex const I{0, 1};

	// upper triangular; the NaN entries below the diagonal must not be read
	multi::array<complex, 2> const A = {
		{ 1. + 2.*I,  3. - 1.*I,  4. + 9.*I},
		{NAN       ,  7. + 4.*I,  1. + 8.*I},
		{NAN       , NAN       ,  8. + 2.*I}
	};
	// right-hand side, overwritten with the solution
	multi::array<complex, 2> X = {
		{1. - 9.*I, 3. + 2.*I},
		{2. - 2.*I, 7. - 2.*I},
		{3. + 1.*I, 4. + 8.*I}
	};

	// B=alpha Inv[A†].B, B†=B†.Inv[A], Solve(A†.X=B, X), Solve(X†.A=B†, X), A is upper triangular (with implicit zeros below)
	blas::trsm(blas::side::left, blas::filling::lower, 2.+1.*I, blas::H(A), X);

	BOOST_REQUIRE_CLOSE( real(X[2][0]) , -4.16471 , 0.0001 );
	BOOST_REQUIRE_CLOSE( imag(X[2][0]) ,  8.25882 , 0.0001 );
}
// Complex trsm with the hermitized view of an upper-triangular matrix and a single-column
// (3x1) right-hand side.
BOOST_AUTO_TEST_CASE(multi_blas_trsm_complex_column){//, *utf::tolerance(0.00001)){
	namespace blas = multi::blas;
	using complex = std::complex<double>;
	complex const I{0, 1};

	// upper triangular; the NaN entries below the diagonal must not be read
	multi::array<complex, 2> const A = {
		{ 1. + 2.*I,  3. - 1.*I,  4. + 9.*I},
		{NAN       ,  7. + 4.*I,  1. + 8.*I},
		{NAN       , NAN       ,  8. + 2.*I}
	};
	// right-hand side column, overwritten with the solution
	multi::array<complex, 2> X = {
		{1. - 9.*I},
		{2. - 2.*I},
		{3. + 1.*I}
	};

	// B=alpha Inv[A†].B, B†=B†.Inv[A], Solve(A†.X=B, X), Solve(X†.A=B†, X), A is upper triangular (with implicit zeros below)
	blas::trsm(blas::side::left, blas::filling::lower, 2.+1.*I, blas::H(A), X);

	BOOST_REQUIRE_CLOSE( real(X[2][0]) , -4.16471 , 0.0001);
	BOOST_REQUIRE_CLOSE( imag(X[2][0]) ,  8.25882 , 0.0001);
}
// Host-array variant of the single-column complex trsm check (same data and expected
// values as multi_blas_trsm_complex_column).
BOOST_AUTO_TEST_CASE(multi_blas_trsm_complex_column_cpu){//, *utf::tolerance(0.00001)){
	namespace blas = multi::blas;
	using complex = std::complex<double>;
	complex const I{0, 1};

	// upper triangular; the NaN entries below the diagonal must not be read
	multi::array<complex, 2> const A = {
		{ 1. + 2.*I,  3. - 1.*I,  4. + 9.*I},
		{NAN       ,  7. + 4.*I,  1. + 8.*I},
		{NAN       , NAN       ,  8. + 2.*I}
	};
	// right-hand side column, overwritten with the solution
	multi::array<complex, 2> X = {
		{1. - 9.*I},
		{2. - 2.*I},
		{3. + 1.*I}
	};

	// B=alpha Inv[A†].B, B†=B†.Inv[A], Solve(A†.X=B, X), Solve(X†.A=B†, X), A is upper triangular (with implicit zeros below)
	blas::trsm(blas::side::left, blas::filling::lower, 2.+1.*I, blas::H(A), X);

	BOOST_REQUIRE_CLOSE( real(X[2][0]) , -4.16471 , 0.0001 );
	BOOST_REQUIRE_CLOSE( imag(X[2][0]) ,  8.25882 , 0.0001 );
}
// 1x1 coefficient matrix with a 1x3 right-hand side, given either directly or as the
// transposed view of a 3x1 array: every element must be divided by A[0][0].
BOOST_AUTO_TEST_CASE(multi_blas_trsm_hydrogen_inq_case_real){//, *utf::tolerance(0.00001)){
	namespace blas = multi::blas;

	multi::array<double, 2> const A = {{2.,},};
	{	// right-hand side stored as a row
		multi::array<double, 2> X = {{1., 2., 3.},};
		auto const rhs = X;  // saved before the in-place solve
		blas::trsm(blas::side::left, blas::filling::lower, 1., A, X);
		BOOST_REQUIRE( X[0][1] == rhs[0][1]/A[0][0] );
	}
	{	// right-hand side stored as a column and passed transposed
		multi::array<double, 2> X = {
			{1.},
			{2.},
			{3.},
		};
		auto const rhs = X;
		blas::trsm(blas::side::left, blas::filling::lower, 1., A, blas::T(X));
		BOOST_REQUIRE( blas::T(X)[0][1] == blas::T(rhs)[0][1]/A[0][0] );
	}
}
// Complex version of the 1x1 case. Besides the direct solve, it checks that solving from the
// left with the hermitized right-hand side gives the same array as solving from the right
// with the hermitized coefficient matrix.
BOOST_AUTO_TEST_CASE(multi_blas_trsm_hydrogen_inq_case_complex){//, *utf::tolerance(0.00001)){
	namespace blas = multi::blas;
	using complex = std::complex<double>;

	multi::array<complex, 2> const A = {{2.,},};
	{	// right-hand side stored as a row
		multi::array<complex, 2> X = {{1., 2., 3.},};
		auto const rhs = X;  // saved before the in-place solve
		blas::trsm(blas::side::left, blas::filling::lower, 1., A, X);
		BOOST_REQUIRE( X[0][1] == rhs[0][1]/A[0][0] );
	}

	// two identical 3x1 right-hand sides, solved in two different ways below
	multi::array<complex, 2> B1 = {
		{1.},
		{2.},
		{3.},
	};
	multi::array<complex, 2> B2 = {
		{1.},
		{2.},
		{3.},
	};
	{	// left solve acting on the hermitized view of B1
	//	auto const B_cpy = B1;
		blas::trsm(blas::side::left, blas::filling::lower, 1., A, blas::H(B1));
	//	BOOST_REQUIRE( (+blas::gemm(1., A, blas::H(B1)))[0][1] == blas::H(B_cpy)[0][1] );
	}
	{	// right solve with the hermitized coefficient matrix acting on B2 directly
		auto const B2_before = B2;
		blas::trsm(blas::side::right, blas::filling::upper, 1., blas::H(A), B2);
	//	BOOST_REQUIRE( (+blas::gemm(1., A, blas::H(B)))[0][1] == blas::H(B_cpy)[0][1] );
		BOOST_REQUIRE( (+blas::gemm(1., B2, blas::H(A)))[1][0] == B2_before[1][0] );
	}
	// both routes must give the same result
	BOOST_REQUIRE( B1 == B2 );
}
// Real trsm with a 3x3 upper-triangular matrix (NaN in the ignored triangle) and non-square
// right-hand sides (3x4 and 3x1), in plain and transposed storage. Results are checked against
// reference values and by multiplying back with gemm using a NaN-free copy of A.
BOOST_AUTO_TEST_CASE(multi_blas_trsm_real_nonsquare){//, *utf::tolerance(0.00001)){
	namespace blas = multi::blas;

	multi::array<double, 2> const A = {
		{ 1.,  3.,  4.},
		{ NAN,  7.,  1.},
		{ NAN,  NAN,  8.}
	};
	auto const A_zeroed = triangular(blas::filling::upper, A);  // NaNs replaced by explicit zeros
	{	// 3x4 right-hand side: plain, then as the transposed view of its stored transpose
		multi::array<double, 2> X = {
			{1., 3., 4., 8.},
			{2., 7., 1., 9.},
			{3., 4., 2., 1.},
		};
		auto const rhs = +X;
		multi::array<double, 2> XT = +~X;
		BOOST_REQUIRE( XT == ~X );

		blas::trsm(blas::side::left, blas::filling::upper, 1., A, X); // B=Solve(A.X=alpha*B, X) B=A⁻¹B, B=B.(A)⁻¹, A upper triangular (implicit zeros below)
		BOOST_REQUIRE_CLOSE( X[1][2] , 0.107143 , 0.001);
		BOOST_REQUIRE_CLOSE( (+blas::gemm(1., A_zeroed, X))[1][2] , rhs[1][2] , 0.001);

		auto const rhsT = XT;
		blas::trsm(blas::side::left, blas::filling::upper, 1., A, blas::T(XT));
		BOOST_REQUIRE_CLOSE( blas::T(XT)[1][2], 0.107143, 0.001 );
		BOOST_REQUIRE_CLOSE( (+blas::gemm(1., A_zeroed, blas::T(XT)))[1][2] , blas::T(rhsT)[1][2] , 0.00001 );
	}
	{	// 3x4 right-hand side with A passed as a transposed view
		multi::array<double, 2> X = {
			{1., 3., 4., 8.},
			{2., 7., 1., 9.},
			{3., 4., 2., 1.},
		};
		multi::array<double, 2> AT = ~A;
		multi::array<double, 2> XT = ~X;

		blas::trsm(blas::side::left, blas::filling::upper, 1., blas::T(AT), X); // B=Solve(A.X=alpha*B, X) B=A⁻¹B, B=B.(A)⁻¹, A upper triangular (implicit zeros below)
		BOOST_REQUIRE_CLOSE( X[1][2] , 0.107143 , 0.001 );

		blas::trsm(blas::side::left, blas::filling::upper, 1., blas::T(AT), blas::T(XT));
		BOOST_REQUIRE_CLOSE( (~XT)[1][2] , 0.107143, 0.001 );
	}
	{	// single column, alpha = 1
		multi::array<double, 2> X = {
			{1.},
			{2.},
			{3.},
		};
		auto const rhs = +X;
		blas::trsm(blas::side::left, blas::filling::upper, 1., A, X); // B=Solve(A.X=alpha*B, X) B=A⁻¹B, B=B.(A)⁻¹, A upper triangular (implicit zeros below)
		BOOST_REQUIRE_CLOSE( X[2][0] , 0.375 , 0.00001 );
		BOOST_REQUIRE_CLOSE( (+blas::gemm(1., A_zeroed, X))[1][0] , rhs[1][0] , 0.00001 );
	}
	{	// single column, alpha = 1.2: multiplying back gives alpha times the right-hand side
		multi::array<double, 2> X = {
			{1.},
			{2.},
			{3.},
		};
		auto const rhs = +X;
		blas::trsm(blas::side::left, blas::filling::upper, 1.2, A, X);
		BOOST_REQUIRE_CLOSE( (+blas::gemm(1., A_zeroed, X))[1][0] , 1.2*rhs[1][0] , 0.00001 );
		BOOST_REQUIRE_CLOSE( (+blas::gemm(1./1.2, A_zeroed, X))[1][0] , rhs[1][0] , 0.00001 );
	}
	{	// single column stored as a row and passed transposed
		multi::array<double, 2> X = {
			{1.},
			{2.},
			{3.},
		};
		multi::array<double, 2> XT = rotated(X);
		blas::trsm(blas::side::left, blas::filling::upper, 1., A, blas::T(XT));
		BOOST_REQUIRE_CLOSE( (~XT)[2][0] , 0.375 , 0.00001);
	}
}
// In-place complex trsm on a non-const B that is passed through the blas::H view;
// afterwards one entry of B itself is compared against a reference number.
BOOST_AUTO_TEST_CASE(multi_blas_trsm_complex_nonsquare_default_diagonal_hermitized_gemm_check_no_const){//, *utf::tolerance(0.00001)){
	namespace blas = multi::blas;
	using multi::blas::trsm;
	using multi::blas::filling;
	using multi::blas::hermitized;
	using complex = std::complex<double>;
	complex const I{0, 1};

	multi::array<complex, 2> const A = {
		{ 1. + 4.*I, 3., 4.- 10.*I},
		{ 0., 7.- 3.*I, 1.},
		{ 0., 0., 8.- 2.*I}
	};
	multi::array<complex, 2> B = {
		{1. + 1.*I, 2. + 1.*I, 3. + 1.*I},
		{5. + 3.*I, 9. + 3.*I, 1. - 1.*I}
	};

	// B†←A⁻¹.B†, B←B.A⁻¹†, B←(A⁻¹.B†)†
	blas::trsm(blas::side::left, blas::filling::upper, 1., A, blas::H(B));

	auto const checked_entry = imag(B[1][2]);
	BOOST_REQUIRE_CLOSE( checked_entry , -0.147059 , 0.001);
}
// Complex trsm where either A or B goes through the blas::H view.
// Three independent cases, each comparing selected entries with reference numbers.
BOOST_AUTO_TEST_CASE(multi_blas_trsm_complex_nonsquare_default_diagonal_hermitized_gemm_check){//, *utf::tolerance(0.00001)){
	namespace blas = multi::blas;
	using complex = std::complex<double>;
	complex const I{0, 1};

	multi::array<complex, 2> const A = {
		{ 1. + 4.*I, 3., 4.- 10.*I},
		{ 0., 7.- 3.*I, 1.},
		{ 0., 0., 8.- 2.*I}
	};

	{ // H(A) as a lower-filled matrix, plain 3x2 right-hand side
		multi::array<complex, 2> rhs = {
			{1. + 1.*I, 5. + 3.*I},
			{2. + 1.*I, 9. + 3.*I},
			{3. + 1.*I, 1. - 1.*I},
		};
		// S = A⁻¹†.B, S† = B†.A⁻¹
		auto solved = blas::trsm(blas::side::left, blas::filling::lower, 1., blas::H(A), rhs);
		BOOST_REQUIRE_CLOSE( real(solved[2][1]) , 1.71608 , 0.001 );
	}
	{ // plain A, H(B) as right-hand side, alpha = 1
		multi::array<complex, 2> rhs = {
			{1. + 1.*I, 2. + 1.*I, 3. + 1.*I},
			{5. + 3.*I, 9. + 3.*I, 1. - 1.*I}
		};
		// S = A⁻¹B†, S†=B.A⁻¹†, S=(B.A⁻¹)†, B <- S†, B <- B.A⁻¹†
		auto solved = +blas::trsm(blas::side::left, blas::filling::upper, 1., A, blas::H(rhs));
		BOOST_REQUIRE_CLOSE( imag(solved[2][1]) , +0.147059 , 0.001);
		BOOST_REQUIRE_CLOSE( imag(rhs[1][2]) , -0.147059 , 0.001);
	}
	{ // plain A, H(B) as right-hand side, alpha = 2
		multi::array<complex, 2> rhs = {
			{1. + 1.*I, 2. + 1.*I, 3. + 1.*I},
			{5. + 3.*I, 9. + 3.*I, 1. - 1.*I}
		};
		// S = A⁻¹B†, S†=B.A⁻¹†, S=(B.A⁻¹)†, B <- S†, B <- B.A⁻¹†
		auto solved = +blas::trsm(blas::side::left, blas::filling::upper, 2., A, blas::H(rhs));
		BOOST_REQUIRE_CLOSE( imag(solved[2][1]) , +0.147059*2. , 0.001 );
		BOOST_REQUIRE_CLOSE( imag(rhs[1][2]) , -0.147059*2. , 0.001 );
	}
}
// Degenerate 1x1 real system: the solution must equal alpha*B/A exactly.
BOOST_AUTO_TEST_CASE(multi_blas_trsm_real_1x1_check){//, *utf::tolerance(0.00001)){
	namespace blas = multi::blas;
	multi::array<double, 2> const A = {
		{ 4.},
	};

	{ // explicit diagonal::general, alpha = 3
		multi::array<double, 2> rhs = {
			{5.},
		};
		auto const solved = +blas::trsm(blas::side::left, blas::filling::upper, blas::diagonal::general, 3., A, rhs);
		BOOST_REQUIRE( solved[0][0] == 3.*5./4. );
	}
	{ // diagonal argument omitted, alpha = 1
		multi::array<double, 2> rhs = {
			{5.},
		};
		auto const solved = +blas::trsm(blas::side::left, blas::filling::upper, 1., A, rhs);
		BOOST_REQUIRE( solved[0][0] == 1.*5./4. );
	}
	{ // same call as the previous case, repeated on a fresh right-hand side
		multi::array<double, 2> rhs = {
			{5.},
		};
		auto const solved = +blas::trsm(blas::side::left, blas::filling::upper, 1., A, rhs);
		BOOST_REQUIRE( solved[0][0] == 1.*5./4. );
	}
}
// Degenerate 1x1 complex system with a complex alpha, verified by multiplying the solution back with gemm.
BOOST_AUTO_TEST_CASE(multi_blas_trsm_complex_1x1_check){//, *utf::tolerance(0.00001)){
	namespace blas = multi::blas;
	using complex = std::complex<double>;
	complex const I = complex{0, 1};

	multi::array<complex, 2> const A = {
		{ 4. + 2.*I},
	};
	{
		multi::array<complex, 2> B = {
			{5. + 1.*I},
		};
		complex const alpha    = 3.+5.*I;
		complex const b_before = B[0][0]; // value of the right-hand side before it is overwritten

		blas::trsm(blas::side::left, blas::filling::upper, alpha, A, B);

		// A.X must reproduce alpha*B ...
		complex const product = (+blas::gemm(1., A, B))[0][0];
		BOOST_REQUIRE_CLOSE( real(product) , real(alpha*b_before) , 0.00001 );
		BOOST_REQUIRE_CLOSE( imag(product) , imag(alpha*b_before) , 0.00001 );

		// ... and (1/alpha).A.X must reproduce B.
		complex const rescaled = (+blas::gemm(1./alpha, A, B))[0][0];
		BOOST_REQUIRE_CLOSE( real(rescaled) , real(b_before) , 0.00001 );
		BOOST_REQUIRE_CLOSE( imag(rescaled) , imag(b_before) , 0.00001 );
	}
}
#if defined(CUDA_FOUND) and CUDA_FOUND
#include<thrust/complex.h>
// Same scenarios as the std::complex hermitized test, but with thrust::complex<double> as element type.
// Besides the reference numbers, each case also checks that the returned value agrees with the overwritten B.
BOOST_AUTO_TEST_CASE(multi_blas_trsm_complex_thrust_nonsquare_default_diagonal_hermitized_gemm_check){//, *utf::tolerance(0.00001)){
namespace blas = multi::blas;
using complex = thrust::complex<double>; complex const I{0, 1};
multi::array<complex, 2> const A = {
{ 1. + 4.*I, 3. , 4.- 10.*I},
{ 0. , 7.- 3.*I, 1. },
{ 0. , 0. , 8.- 2.*I}
};
{
{
// H(A) as lower-filled matrix with a plain 3x2 right-hand side.
multi::array<complex, 2> B = {
{1. + 1.*I, 5. + 3.*I},
{2. + 1.*I, 9. + 3.*I},
{3. + 1.*I, 1. - 1.*I},
};
auto S = blas::trsm(blas::side::left, blas::filling::lower, 1., blas::H(A), B); // S = A⁻¹†.B, S† = B†.A⁻¹
BOOST_REQUIRE_CLOSE( S[2][1].real() , 1.71608 , 0.001 );
BOOST_REQUIRE( S == B );
}
{
// Plain A with H(B) as right-hand side, alpha = 1; unary + copies the result before it is compared.
multi::array<complex, 2> B = {
{1. + 1.*I, 2. + 1.*I, 3. + 1.*I},
{5. + 3.*I, 9. + 3.*I, 1. - 1.*I}
};
auto S =+ blas::trsm(blas::side::left, blas::filling::upper, 1., A, blas::H(B)); // S = A⁻¹B†, S†=B.A⁻¹†, S=(B.A⁻¹)†, B <- S†, B <- B.A⁻¹†
BOOST_REQUIRE_CLOSE( B[1][2].imag() , -0.147059 , 0.001 );
BOOST_REQUIRE( S == blas::H(B) );
}
{
// As above with alpha = 2.
multi::array<complex, 2> B = {
{1. + 1.*I, 2. + 1.*I, 3. + 1.*I},
{5. + 3.*I, 9. + 3.*I, 1. - 1.*I}
};
auto S =+ blas::trsm(blas::side::left, blas::filling::upper, 2., A, blas::H(B)); // S = A⁻¹B†, S†=B.A⁻¹†, S=(B.A⁻¹)†, B <- S†, B <- B.A⁻¹†
BOOST_REQUIRE_CLOSE( B[1][2].imag() , -0.147059*2. , 0.001 );
BOOST_REQUIRE( S == blas::H(B) );
}
}
}
//BOOST_AUTO_TEST_CASE(multi_blas_trsm_complex_column_cuda, *utf::tolerance(0.00001)){
// namespace cuda = multi::cuda;
// cuda::array<complex, 2> A = {
// { 1., 3., 4.},
// {NAN, 7., 1.},
// {NAN, NAN, 8.}
// };
//// multi::cuda::array<complex, 2> const B = {
//// {1.},
//// {2.},
//// {3.}
//// };
// namespace blas = multi::blas;
//// auto Bcpy = blas::trsm(blas::filling::upper, 1., A, B); // B ⬅ α Inv[A].B, B† ⬅ B†.Inv[A], Solve(A†.X=B, X), Solve(X†.A=B†, X), A is upper triangular (with implicit zeros below)
//// multi::array<complex, 2> Bcpu = Bcpy;
//// BOOST_TEST_REQUIRE( std::real(Bcpu[2][0]) == 0.375 );
//// BOOST_TEST_REQUIRE( std::imag(Bcpu[2][0]) == 0. );
//}
#endif
// Everything up to the matching #endif is excluded from compilation.
// The tests below call trsm without a blas::side argument and use `complex`, `I` and `utf`,
// none of which is declared in the visible part of this file.
// NOTE(review): presumably written against an older interface; confirm before re-enabling.
#if 0
//template<class T> void what(T&&) = delete;
// Real 3x3 upper-triangular A on a cuda array, single-column B solved in place.
BOOST_AUTO_TEST_CASE(multi_blas_trsm_double_column_cuda, *utf::tolerance(0.00001)){
multi::cuda::array<double, 2> const A = {
{ 1., 3., 4.},
{NAN, 7., 1.},
{NAN, NAN, 8.}
};
multi::cuda::array<double, 2> B = {
{1.},
{2.},
{3.}
};
namespace blas = multi::blas;
using blas::filling;
using blas::hermitized;
trsm(filling::upper, 1., A, B); // B=alpha Inv[A†].B, B†=B†.Inv[A], Solve(A†.X=B, X), Solve(X†.A=B†, X), A is upper triangular (with implicit zeros below)
BOOST_REQUIRE( B[2][0] == 0.375 );
}
// Complex A through hermitized(), complex alpha, single-column B; the result is copied to a host array for the checks.
BOOST_AUTO_TEST_CASE(multi_blas_trsm_complex_column_cuda2, *utf::tolerance(0.00001)){
multi::cuda::array<complex, 2> const A = {
{ 1. + 2.*I, 3. - 1.*I, 4. + 9.*I},
{NAN , 7. + 4.*I, 1. + 8.*I},
{NAN , NAN , 8. + 2.*I}
};
multi::cuda::array<complex, 2> B = {
{1. - 9.*I},
{2. - 2.*I},
{3. + 1.*I}
};
namespace blas = multi::blas;
using blas::filling;
using blas::hermitized;
trsm(filling::lower, 2.+1.*I, hermitized(A), B); // B=alpha Inv[A†].B, B†=B†.Inv[A], Solve(A†.X=B, X), Solve(X†.A=B†, X), A is upper triangular (with implicit zeros below)
multi::array<complex, 2> Bcpu = B;
BOOST_REQUIRE( real(Bcpu[2][0]) == -4.16471 );
BOOST_REQUIRE( imag(Bcpu[2][0]) == 8.25882 );
}
// Complex 3x3 A and const 3x3 B on cuda arrays; only checks that the call is accepted, the result C is not inspected.
BOOST_AUTO_TEST_CASE(multi_blas_cuda_trsm_complex, *utf::tolerance(0.00001)){
multi::cuda::array<complex, 2> const A = {
{ 1. + 2.*I, 3. - 1.*I, 4. + 9.*I},
{NAN , 7. + 4.*I, 1. + 8.*I},
{NAN , NAN , 8. + 2.*I}
};
multi::cuda::array<complex, 2> const B = {
{1. - 9.*I, 3. + 2.*I, 4. + 3.*I},
{2. - 2.*I, 7. - 2.*I, 1. - 1.*I},
{3. + 1.*I, 4. + 8.*I, 2. + 7.*I}
};
namespace blas = multi::blas;
using blas::filling;
using blas::hermitized;
// auto C = trsm(filling::lower, 2.+1.*I, hermitized(A), B); // B=alpha Inv[A†].B, B†=B†.Inv[A], Solve(A†.X=B, X), Solve(X†.A=B†, X), A is upper triangular (with implicit zeros below)
auto C = trsm(filling::lower, 1., hermitized(A), B); // B=alpha Inv[A†].B, B†=B†.Inv[A], Solve(A†.X=B, X), Solve(X†.A=B†, X), A is upper triangular (with implicit zeros below)
}
// Same as the previous test but on cuda::managed arrays and with a complex alpha; the result C is not inspected.
BOOST_AUTO_TEST_CASE(multi_blas_cuda_managed_trsm_complex, *utf::tolerance(0.00001)){
multi::cuda::managed::array<complex, 2> const A = {
{ 1. + 2.*I, 3. - 1.*I, 4. + 9.*I},
{NAN , 7. + 4.*I, 1. + 8.*I},
{NAN , NAN , 8. + 2.*I}
};
multi::cuda::managed::array<complex, 2> const B = {
{1. - 9.*I, 3. + 2.*I, 4. + 3.*I},
{2. - 2.*I, 7. - 2.*I, 1. - 1.*I},
{3. + 1.*I, 4. + 8.*I, 2. + 7.*I}
};
namespace blas = multi::blas;
using blas::filling;
using blas::hermitized;
auto C = trsm(filling::lower, 2.+1.*I, hermitized(A), B); // B=alpha Inv[A†].B, B†=B†.Inv[A], Solve(A†.X=B, X), Solve(X†.A=B†, X), A is upper triangular (with implicit zeros below)
}
#endif

View File

@ -0,0 +1,111 @@
#ifdef COMPILATION// -*-indent-tabs-mode:t;c-basic-offset:4;tab-width:4;-*-
$CXX $0 -o $0x -lcudart -lcublas -lboost_unit_test_framework `pkg-config --libs blas`&&$0x&&rm $0x;exit
#endif
// © Alfredo A. Correa 2019-2020
#define BOOST_TEST_MODULE "C++ Unit Tests for Multi cuBLAS trsv"
#define BOOST_TEST_DYN_LINK
#include<boost/test/unit_test.hpp>
#include "../../../memory/adaptors/cuda/managed/ptr.hpp"
#include "../../../adaptors/blas/trsv.hpp"
#include "../../../adaptors/blas/cuda.hpp"
#include "../../../adaptors/cuda.hpp"
#include "../../../array.hpp"
namespace multi = boost::multi;
// Debug helper: streams a two-dimensional container to std::cout, one row per line with
// space-separated elements, followed by an empty line. Returns the stream.
template<class M> decltype(auto) print(M const& C){
	using multi::size;
	using std::cout;
	for(int row = 0; row != size(C); ++row){
		auto&& line = C[row];
		for(int col = 0; col != size(line); ++col){
			cout<< line[col] <<' ';
		}
		cout<<std::endl;
	}
	return cout<<std::endl;
}
namespace utf = boost::unit_test;
using complex = std::complex<double>;
complex const I{0, 1};
namespace multi = boost::multi;
namespace blas = multi::blas;
// Host-memory case. Despite "trsm" in the test name, the call exercised here is blas::trsv:
// a complex upper-triangular 3x3 system solved in place for a vector.
BOOST_AUTO_TEST_CASE(multi_blas_trsm_complex_cpu, *utf::tolerance(0.00001)){
	multi::array<complex, 2> const upper = {
		{ 1. + 1.*I, 3. - 2.*I, 4. + 1.*I},
		{NAN , 7. - 10.*I, 1. + 2.*I},
		{NAN , NAN , 8. + 1.*I}
	};
	multi::array<complex, 1> rhs = {1. + 2.*I, 3. + 1.*I, 4. + 5.*I};

	blas::trsv(blas::filling::upper, blas::diagonal::general, upper, rhs);

	BOOST_TEST_REQUIRE( real(rhs[0]) == -1.37259 );
	BOOST_TEST_REQUIRE( real(rhs[1]) == 0.2127 );
	BOOST_TEST_REQUIRE( real(rhs[2]) == 0.569231 );
}
// Same system as the host case, stored in cuda::managed arrays (the test name says only "cuda");
// the solution is read back directly from the managed vector.
BOOST_AUTO_TEST_CASE(multi_blas_trsm_complex_cuda, *utf::tolerance(0.0001)){
	namespace cuda = multi::cuda;
	cuda::managed::array<complex, 2> const upper = {
		{ 1. + 1.*I, 3. - 2.*I, 4. + 1.*I},
		{NAN , 7. - 10.*I, 1. + 2.*I},
		{NAN , NAN , 8. + 1.*I}
	};
	cuda::managed::array<complex, 1> rhs = {1. + 2.*I, 3. + 1.*I, 4. + 5.*I};

	blas::trsv(blas::filling::upper, blas::diagonal::general, upper, rhs);

	BOOST_TEST_REQUIRE( real(rhs[0]) == -1.37259 );
	BOOST_TEST_REQUIRE( real(rhs[1]) == 0.2127 );
	BOOST_TEST_REQUIRE( real(rhs[2]) == 0.569231 );
}
// Managed-memory case using the trsv overload without a diagonal argument;
// the solution is copied to a host array before being checked.
BOOST_AUTO_TEST_CASE(multi_blas_trsm_complex_cuda_managed, *utf::tolerance(0.00001)){
	namespace cuda = multi::cuda;
	cuda::managed::array<complex, 2> const upper = {
		{ 1. + 1.*I, 3. - 2.*I, 4. + 1.*I},
		{NAN , 7. - 10.*I, 1. + 2.*I},
		{NAN , NAN , 8. + 1.*I}
	};
	cuda::managed::array<complex, 1> rhs = {1. + 2.*I, 3. + 1.*I, 4. + 5.*I};

	// this operation happens in GPU when #include "adaptors/blas/cuda.hpp"
	blas::trsv(blas::filling::upper, upper, rhs);

	multi::array<complex, 1> const on_host = rhs;
	BOOST_TEST_REQUIRE( real(on_host[0]) == -1.37259 );
	BOOST_TEST_REQUIRE( real(on_host[1]) == 0.2127 );
	BOOST_TEST_REQUIRE( real(on_host[2]) == 0.569231 );
}
// Real-valued managed-memory case: upper-triangular 3x3 system solved in place with trsv,
// then copied to a host array for the checks.
BOOST_AUTO_TEST_CASE(multi_blas_trsm_double_cuda_managed, *utf::tolerance(0.00001)){
	namespace cuda = multi::cuda;
	cuda::managed::array<double, 2> const upper = {
		{ 1., 3., 4.},
		{NAN, 7., 1.},
		{NAN, NAN, 8.}
	};
	cuda::managed::array<double, 1> rhs = {1., 3., 4.};

	// this operation happens in GPU when #include "adaptors/blas/cuda.hpp"
	blas::trsv(blas::filling::upper, upper, rhs);

	multi::array<double, 1> const on_host = rhs;
	BOOST_TEST_REQUIRE( on_host[0] == -2.07143 );
	BOOST_TEST_REQUIRE( on_host[1] == 0.357143 );
	BOOST_TEST_REQUIRE( on_host[2] == 0.5 );
}
// Complex system stored in multi::cuda::array (not the managed variant);
// the solution elements are read straight from that array.
BOOST_AUTO_TEST_CASE(multi_blas_trsm_complex_cuda2, *utf::tolerance(0.00001)){
	namespace blas = multi::blas;
	multi::cuda::array<complex, 2> const upper = {
		{ 1. + 1.*I, 3. - 2.*I, 4. + 1.*I},
		{NAN , 7. - 10.*I, 1. + 2.*I},
		{NAN , NAN , 8. + 1.*I}
	};
	multi::cuda::array<complex, 1> rhs = {1. + 2.*I, 3. + 1.*I, 4. + 5.*I};

	blas::trsv(blas::filling::upper, blas::diagonal::general, upper, rhs);

	BOOST_TEST_REQUIRE( real(rhs[0]) == -1.37259 );
	BOOST_TEST_REQUIRE( real(rhs[1]) == 0.2127 );
	BOOST_TEST_REQUIRE( real(rhs[2]) == 0.569231 );
}

View File

@ -0,0 +1,41 @@
// -*-indent-tabs-mode:t;c-basic-offset:4;tab-width:4;autowrap:nil;-*-
// © Alfredo A. Correa 2019-2020
#ifndef MULTI_ADAPTORS_BLAS_TRAITS_HPP
#define MULTI_ADAPTORS_BLAS_TRAITS_HPP
#include<complex>
#include<type_traits>
namespace boost{
namespace multi::blas{

// Structural traits that classify an element type by the BLAS precision it matches.
// Each is_X<T> derives from std::true_type or std::false_type and exposes `archetype`,
// the standard type associated with that precision.
//
// The tests are deliberately structural (size plus arithmetic behaviour) rather than a
// list of exact types. Integral types are excluded explicitly: `int` has the size of
// `float` and `int/int` converts to `float`, so without the exclusion is_s<int> (and
// likewise is_d<long long>) would be true.

// is_s: size of float, not integral, and T/T is convertible to float.
template<class F, class=std::enable_if_t<
	sizeof(F)==sizeof(float)
	and not std::is_integral<std::decay_t<F>>::value
	and std::is_convertible<decltype(std::declval<F&&>()/std::declval<F&&>()), float>{}
>>
std::true_type  is_s_aux(F&&);
std::false_type is_s_aux(...);
template<class T> struct is_s : decltype(is_s_aux(std::declval<T>())){using archetype = float;};

// is_d: size of double, not integral, and T/T is convertible to double.
template<class D, class=std::enable_if_t<
	sizeof(D)==sizeof(double)
	and not std::is_integral<std::decay_t<D>>::value
	and std::is_convertible<decltype(std::declval<D&&>()/std::declval<D&&>()), double>{}
>>
std::true_type  is_d_aux(D&&);
std::false_type is_d_aux(...);
template<class T> struct is_d : decltype(is_d_aux(std::declval<T>())){using archetype = double;};

// is_c: size of std::complex<float>, and both .real() and .imag() yield an is_s type.
template<class C, class=std::enable_if_t<
	sizeof(C)==sizeof(std::complex<float>)
	and is_s<decltype(std::declval<C>().real())>{}
	and is_s<decltype(std::declval<C>().imag())>{}
>>
std::true_type  is_c_aux(C&&);
std::false_type is_c_aux(...);
template<class C> struct is_c : decltype(is_c_aux(std::declval<C>())){using archetype = std::complex<float>;};

// is_z: size of std::complex<double>, and both .real() and .imag() yield an is_d type.
template<class Z, class=std::enable_if_t<
	sizeof(Z)==sizeof(std::complex<double>)
	and is_d<decltype(std::declval<Z>().real())>{}
	and is_d<decltype(std::declval<Z>().imag())>{}
>>
std::true_type  is_z_aux(Z&&);
std::false_type is_z_aux(...);
template<class Z> struct is_z : decltype(is_z_aux(std::declval<Z>())){using archetype = std::complex<double>;};

}
}
#endif

View File

@ -0,0 +1,93 @@
#ifdef COMPILATION// -*-indent-tabs-mode:t;c-basic-offset:4;tab-width:4;autowrap:nil;-*-
$CXXX $CXXFLAGS $0 -o $0.$X -lboost_unit_test_framework `pkg-config --cflags --libs blas` -lboost_timer&&$0.$X&&rm $0.$X;exit
#endif
// © Alfredo A. Correa 2019-2020
#ifndef MULTI_ADAPTORS_BLAS_TRSM_HPP
#define MULTI_ADAPTORS_BLAS_TRSM_HPP
#include "../blas/core.hpp"
#include "../blas/operations.hpp" // uplo
#include "../blas/filling.hpp"
#include "../blas/side.hpp"
namespace boost{
namespace multi::blas{
// Tells whether the triangular matrix has an implicit unit diagonal. The enumerator values
// are the character codes that get cast to `char` and handed to the BLAS call as its `diag` argument.
// The definition sits behind a guard macro so that any other header providing this same enum
// under the same macro can share a translation unit with this one without a redefinition error.
#ifndef MULTI_ADAPTORS_BLAS_DIAGONAL_DEFINED
#define MULTI_ADAPTORS_BLAS_DIAGONAL_DEFINED
enum class diagonal : char{
	unit     = 'U',
	non_unit = 'N', general = non_unit
};
#endif
using core::trsm;
// Triangular solve with multiple right-hand sides, executed through `ctxt->trsm(...)`; `b` is
// overwritten and returned (forwarded).
//
//  ctxt    object whose `->trsm(side, uplo, trans, diag, m, n, alpha, a_ptr, lda, b_ptr, ldb)` is called
//  a_side  side on which `a` appears (blas::side::left / right)
//  a_fill  which triangle of `a` is referenced
//  a_diag  unit / non-unit diagonal, passed through as a char
//  alpha   scalar factor; passed as conj(alpha) when `b` is a conjugated view
//  a, b    two-dimensional arrays or views; each must have unit stride along one of its two indices
//
// Preconditions (assert only): for side::left size(~a) >= size(b), for side::right size(a) >= size(~b);
// `a` and `b` each have a unit stride in one direction.
// Nothing is sent to the context when size(b) == 0.
//
// Dispatch: the four `if constexpr` cases select on whether `a` and/or `b` are conjugated views; inside
// each case the runtime strides decide which index is contiguous, and from that the side (possibly
// swapped), the fill (with unary - or + applied) and the transposition flag ('N', 'T' or 'C') are chosen.
// Conjugated operands are passed as underlying(base(.)). Combinations for which no call is written end
// in assert(0), i.e. they do nothing in builds where assert is disabled.
// (swap(side) and unary -/+ on filling are defined outside this chunk; presumably '-' flips upper/lower.)
//
// A std::logic_error escaping the body is rethrown as a std::logic_error whose message also describes
// the requested layout.
template<class Context, class A2D, class B2D>
decltype(auto) trsm(Context&& ctxt, blas::side a_side, blas::filling a_fill, blas::diagonal a_diag, typename A2D::element_type alpha, A2D const& a, B2D&& b) try{
	;;;; if(a_side == blas::side::left ) assert(size(~a) >= size( b));
	else if(a_side == blas::side::right) assert(size( a) >= size(~b));
	assert( stride( a) == 1 or stride(~a) == 1 );
	assert( stride( b) == 1 or stride(~b) == 1 );
	if(size(b)!=0){
		// the context is forwarded at most once: exactly one of the calls below is executed
		#define CTXT std::forward<Context>(ctxt)
		;;;; if constexpr(not is_conjugated<A2D>{} and not is_conjugated<B2D>{}){
			;;;; if(stride( a)==1 and stride( b)==1) CTXT->trsm((char) (a_side), (char)-a_fill, 'N', (char)a_diag, size( b), size(~b), alpha , base(a) , stride(~a), base(b) , stride(~b));
			else if(stride(~a)==1 and stride(~b)==1) CTXT->trsm((char)swap(a_side), (char)+a_fill, 'N', (char)a_diag, size(~b), size( b), alpha , base(a) , stride( a), base(b) , stride( b));
			else if(stride( a)==1 and stride(~b)==1) CTXT->trsm((char)swap(a_side), (char)-a_fill, 'T', (char)a_diag, size(~b), size( b), alpha , base(a) , stride(~a), base(b) , stride( b));
			else if(stride(~a)==1 and stride( b)==1) CTXT->trsm((char) (a_side), (char)+a_fill, 'T', (char)a_diag, size( b), size(~b), alpha , base(a) , stride( a), base(b) , stride(~b));
			else assert(0 && "not implemented in blas");
		}else if constexpr( is_conjugated<A2D>{} and not is_conjugated<B2D>{}){
			;;;; if(stride( a)==1 and stride(~b)==1) CTXT->trsm((char)swap(a_side), (char)-a_fill, 'C', (char)a_diag, size(~b), size( b), alpha , underlying(base(a)), stride(~a), base(b) , stride( b));
			else if(stride(~a)==1 and stride( b)==1) CTXT->trsm((char) (a_side), (char)+a_fill, 'C', (char)a_diag, size( b), size(~b), alpha , underlying(base(a)), stride( a), base(b) , stride(~b));
			else if(stride( a)==1 and stride( b)==1) assert(0 && "not implemented in blas");
			else if(stride(~a)==1 and stride(~b)==1) assert(0 && "not implemented in blas");
			else assert(0 && "not implemented in blas");
		}else if constexpr(not is_conjugated<A2D>{} and is_conjugated<B2D>{}){
			;;;; if(stride(~a)==1 and stride( b)==1) CTXT->trsm((char) (a_side), (char)+a_fill, 'C', (char)a_diag, size( b), size(~b), conj(alpha), base(a) , stride( a), underlying(base(b)), stride(~b));
			else if(stride( a)==1 and stride(~b)==1) CTXT->trsm((char)swap(a_side), (char)-a_fill, 'C', (char)a_diag, size(~b), size( b), conj(alpha), base(a) , stride(~a), underlying(base(b)), stride( b));
			else if(stride(~a)==1 and stride(~b)==1) assert(0);
			else if(stride( a)==1 and stride( b)==1) assert(0);
			else assert(0 && "not implemented in blas");
		}else if constexpr( is_conjugated<A2D>{} and is_conjugated<B2D>{}){
			;;;; if(stride( a)==1 and stride(~b)==1) CTXT->trsm((char)swap(a_side), (char)-a_fill, 'T', (char)a_diag, size(~b), size( b), conj(alpha), underlying(base(a)), stride(~a), underlying(base(b)), stride( b));
			else if(stride(~a)==1 and stride( b)==1) CTXT->trsm((char) (a_side), (char)+a_fill, 'T', (char)a_diag, size( b), size(~b), conj(alpha), underlying(base(a)), stride( a), underlying(base(b)), stride(~b));
			else if(stride(~a)==1 and stride(~b)==1) assert(0 && "not implemented in blas");
			else if(stride( a)==1 and stride( b)==1) assert(0 && "not implemented in blas");
			else assert(0 && "not implemented in blas");
		}
		#undef CTXT
	}
	return std::forward<B2D>(b);
}catch(std::logic_error const& le){ // caught by const reference: the handler only reads the exception
	using std::to_string;
	throw std::logic_error{
		"couldn't do "+std::string(__PRETTY_FUNCTION__)+" of layout a_side="+ (char)a_side +" a_fill="+ (char)a_fill +" a_diag="+(char)a_diag+" alpha=xx"
		+" a_conj="+ to_string(is_conjugated<A2D>{}) +" a_strides="+to_string(stride(a)) +","+ to_string(stride(~a))+" a_sizes="+to_string(size(a)) +","+ to_string(size(~a))
		+" b_conj="+ to_string(is_conjugated<B2D>{}) +" b_strides="+to_string(stride(b)) +","+ to_string(stride(~b))+" b_sizes="+to_string(size(b)) +","+ to_string(size(~b))
		+" because " + le.what()
	};
}
// Context-free overload with explicit diagonal: obtains a default context from the base of `a`
// (from the underlying base when `a` is a conjugated view) and delegates to the context-taking overload.
template<class A2D, class B2D>
decltype(auto) trsm(blas::side a_side, blas::filling a_fill, blas::diagonal a_diag, typename A2D::element_type alpha, A2D const& a, B2D&& b){
	if constexpr(is_conjugated<A2D>{}){
		return trsm(default_context_of(underlying(a.base())), a_side, a_fill, a_diag, alpha, a, std::forward<B2D>(b));
	}else{
		return trsm(default_context_of(a.base()), a_side, a_fill, a_diag, alpha, a, std::forward<B2D>(b));
	}
}
// Context-taking overload without a diagonal argument: forwards with blas::diagonal::general.
// The trailing return type removes this overload from consideration when the forwarded call is ill-formed.
template<class Context, class A2D, class B2D>
auto trsm(Context&& ctxt, blas::side a_side, blas::filling a_fill, typename A2D::element_type alpha, A2D const& a, B2D&& b)
->decltype(trsm(std::forward<Context>(ctxt), a_side, a_fill, blas::diagonal::general, alpha, a, std::forward<B2D>(b)))
{
	return trsm(std::forward<Context>(ctxt), a_side, a_fill, blas::diagonal::general, alpha, a, std::forward<B2D>(b));
}
// Context-free overload without a diagonal argument: picks a default context from the base of `a`
// (from the underlying base when `a` is a conjugated view) and delegates to the context-taking overload.
// EDG based compilers (e.g. nvcc) need option: -Xcudafe \"--diag_suppress=implicit_return_from_non_void_function\""
template<class A2D, class B2D>
decltype(auto) trsm(blas::side a_side, blas::filling a_fill, typename A2D::element_type alpha, A2D const& a, B2D&& b){
	if constexpr(is_conjugated<A2D>{}){
		return trsm(default_context_of(underlying(a.base())), a_side, a_fill, alpha, a, std::forward<B2D>(b));
	}else{
		return trsm(default_context_of(a.base()), a_side, a_fill, alpha, a, std::forward<B2D>(b));
	}
}
}}
#endif

View File

@ -0,0 +1,609 @@
#ifdef COMPILATION// -*-indent-tabs-mode:t;c-basic-offset:4;tab-width:4;-*-
$CXX $0 -o $0x `pkg-config --libs blas` -lcudart -lboost_unit_test_framework&&$0x&&rm $0x;exit
#endif
// © Alfredo A. Correa 2019-2020
#ifndef MULTI_ADAPTORS_BLAS_TRSV_HPP
#define MULTI_ADAPTORS_BLAS_TRSV_HPP
#include "../blas/core.hpp"
#include "../blas/operations.hpp" // uplo
#include "../blas/filling.hpp"
#include "../blas/side.hpp"
#include "../../config/NODISCARD.hpp"
namespace boost{
namespace multi{namespace blas{
// Tells whether the triangular matrix has an implicit unit diagonal. The enumerator values
// are the character codes that get cast to `char` and handed to the BLAS call as its `diag` argument.
// The definition sits behind a guard macro so that any other header providing this same enum
// under the same macro can share a translation unit with this one without a redefinition error.
#ifndef MULTI_ADAPTORS_BLAS_DIAGONAL_DEFINED
#define MULTI_ADAPTORS_BLAS_DIAGONAL_DEFINED
//enum DIAG : char{U='U', N='N'};
enum class diagonal : char{//typename std::underlying_type<char>::type{
	unit     = 'U',
	non_unit = 'N', general = non_unit
};
#endif
using core::trsv;
// Returns the base pointer to hand to BLAS: base(a) for an ordinary array or view ...
template<class A, std::enable_if_t<not is_conjugated<A>{}, int> = 0>
auto trsv_base(A&& a){
	return base(a);
}

// ... and underlying(base(a)) when `a` is a conjugated view.
template<class A, std::enable_if_t<is_conjugated<A>{}, int> = 0>
auto trsv_base(A&& a){
	return underlying(base(a));
}
// Triangular solve for a single vector: overwrites `x` through core trsv and returns it (forwarded).
//
//  a_nonzero_side  which triangle of `a` is referenced
//  a_diag          unit / non-unit diagonal, passed through as a char
//  a               two-dimensional array or view with unit stride along one of its indices
//  x               one-dimensional array or view
//
// When `a` is not conjugated, the contiguous index selects between the 'N' call (with the fill
// flipped) and the 'T' call. When `a` is conjugated, only the layout with stride(rotated(a))==1 is
// served (as 'C'). Every other layout ends in assert(0).
template<class A2D, class X1D>
auto trsv(filling a_nonzero_side, diagonal a_diag, A2D const& a, X1D&& x)
->decltype(trsv(static_cast<char>(flip(a_nonzero_side)), 'N', static_cast<char>(a_diag), size(x), trsv_base(a), stride(rotated(a)), trsv_base(x), stride(x)), std::forward<X1D>(x))
{
//	if(is_conjugated(x)) trsv(a_nonzero_side, a_diag, conjugated(a), conjugated(std::forward<X1D>(x)));
	auto a_ptr = trsv_base(a);
	auto x_ptr = trsv_base(x);

	char const diag_char = static_cast<char>(a_diag);
	bool const first_index_contiguous  = (stride(a) == 1);
	bool const second_index_contiguous = (stride(rotated(a)) == 1);

	if(is_conjugated<A2D>{}){
		if(first_index_contiguous){
			assert(0); //TODO fallback to trsm?
		}else if(second_index_contiguous){
			trsv(static_cast<char>(a_nonzero_side), 'C', diag_char, size(x), a_ptr, stride(a), x_ptr, stride(x));
		}else{
			assert(0);
		}
	}else{
		if(first_index_contiguous){
			trsv(static_cast<char>(flip(a_nonzero_side)), 'N', diag_char, size(x), a_ptr, stride(rotated(a)), x_ptr, stride(x));
		}else if(second_index_contiguous){
			trsv(static_cast<char>(a_nonzero_side), 'T', diag_char, size(x), a_ptr, stride(a), x_ptr, stride(x));
		}else{
			assert(0);
		}
	}
	return std::forward<X1D>(x);
}
// Overload without a diagonal argument: forwards with diagonal::general.
// The trailing return type removes this overload from consideration when the forwarded call is ill-formed.
template<class A2D, class X1D>
auto trsv(filling a_nonzero_side, A2D const& a, X1D&& x)
->decltype(trsv(a_nonzero_side, diagonal::general, a, std::forward<X1D>(x)))
{
	return trsv(a_nonzero_side, diagonal::general, a, std::forward<X1D>(x));
}
// Disabled (#if 0): overloads that take `x` by const reference, build a temporary of
// X1D::decay_type from it (`Ret{x}`), pass that to the in-place overloads above and return the result by value.
// The trailing defaulted `void*` parameter is unused in the bodies.
#if 0
#if 1
// With explicit diagonal.
template<class A2D, class X1D, class Ret = typename X1D::decay_type>
Ret trsv(filling a_nonzero_side, diagonal a_diag, A2D const& a, X1D const& x, void* = 0){
return trsv(a_nonzero_side, a_diag, a, Ret{x});}
// Without diagonal argument.
template<class A2D, class X1D, class Ret = typename X1D::decay_type>
Ret trsv(filling a_nonzero_side, A2D const& a, X1D const& x, void* = 0){
return trsv(a_nonzero_side, a, Ret{x});}
#endif
#endif
}}}
#if not __INCLUDE_LEVEL__ // _TEST_MULTI_ADAPTORS_BLAS_TRSV
#define BOOST_TEST_MODULE "C++ Unit Tests for Multi.BLAS trsv"
#define BOOST_TEST_DYN_LINK
#include<boost/test/unit_test.hpp>
#include "../blas/gemm.hpp"
#include "../../array.hpp"
#include<iostream>
namespace multi = boost::multi;
// Debug helper: streams the elements of a one-dimensional container to std::cout separated by
// spaces, ends the line, and returns the stream (the same return convention as the two-dimensional
// `print` helper in this file; previously the deduced return type was void).
template<class M> decltype(auto) print_1D(M const& C){
	using boost::multi::size; using std::cout;
	for(int i = 0; i != size(C); ++i){
		cout<< C[i] <<' ';
	}
	return cout<<std::endl;
}
// Debug helper: streams a two-dimensional container to std::cout, one row per line with
// space-separated elements, followed by an empty line. Returns the stream.
template<class M> decltype(auto) print(M const& C){
	using boost::multi::size;
	using std::cout;
	for(int row = 0; row != size(C); ++row){
		auto&& line = C[row];
		for(int col = 0; col != size(line); ++col){
			cout<< line[col] <<' ';
		}
		cout<<std::endl;
	}
	return cout<<std::endl;
}
namespace utf = boost::unit_test;
namespace blas = multi::blas;
// Real-valued trsv on a 3x3 matrix: once as an upper-triangular system, once on the transposed
// view treated as lower-triangular. Comparisons use the 0.0001 tolerance attached to the test case.
BOOST_AUTO_TEST_CASE(multi_blas_trsv_real_square, *utf::tolerance(0.0001)){
{
// Upper-triangular solve; the NAN entries lie in the triangle that filling::upper does not select.
multi::array<double, 2> const A = {
{ 1., 3., 4.},
{ NAN, 7., 1.},
{ NAN, NAN, 8.}
};
multi::array<double, 1> b = {1., 3., 4.};
blas::trsv(blas::filling::upper, blas::diagonal::general, A, b); // B<-Solve(A.X==B), B<-A⁻¹.B, B<-(A⁻¹.B), B<-B.A⁻¹
BOOST_TEST( b[0] == -2.07143 );
BOOST_TEST( b[1] == 0.357143 );
BOOST_TEST( b[2] == 0.5 );
}
{
// Transposed view solved as a lower-triangular system: b equals the first column of T(A), so the solution is (1, 0, 0).
multi::array<double, 2> const A = {
{ 1., 3., 4.},
{ NAN, 7., 1.},
{ NAN, NAN, 8.}
};
multi::array<double, 1> b = {1., 3., 4.};
blas::trsv(blas::filling::lower, blas::diagonal::general, blas::T(A), b); // B<-Solve(A.X==B), B<-A⁻¹.B, B<-(A⁻¹.B), B<-B.A⁻¹
BOOST_TEST( b[0] == 1. );
BOOST_TEST( b[1] == 0. );
BOOST_TEST( b[2] == 0. );
}
// Disabled sub-case; its only statements besides the declaration of b are already commented out.
#if 0
{
multi::array<double, 1> b = {3., 3., 1.};
// trsv(filling::lower, diagonal::general, hermitized(A), b); // B<-Solve(A.X==B), B<-A⁻¹.B, B<-(A⁻¹.B), B<-B.A⁻¹
// BOOST_TEST( b[0] == 3. );
// BOOST_TEST( b[1] == -0.857143 );
// BOOST_TEST( b[2] == -1.26786 );
}
#endif
}
#if 0
using complex = std::complex<double>;
// Not compiled: this test sits inside a `#if 0` region.
// Complex element type with purely real values, covering in-place solves, solves on a const `b`
// that are expected to return a copy, and a transposed view. The last two sub-cases are commented out.
BOOST_AUTO_TEST_CASE(multi_blas_trsv_complex_real_case_square, *utf::tolerance(0.00001)){
multi::array<complex, 2> const A = {
{ 1., 3., 4.},
{NAN, 7., 1.},
{NAN, NAN, 8.}
};
using blas::filling;
using blas::diagonal;
using blas::transposed;
using blas::hermitized;
using blas::conjugated;
using blas::trsv;
{
// In-place solve.
multi::array<complex, 1> b = {1., 3., 4.};
blas::trsv(filling::upper, diagonal::general, A, b); // B<-Solve(A.X==B), B<-A⁻¹.B, B<-(A⁻¹.B), B<-B.A⁻¹
BOOST_TEST( real(b[0]) == -2.07143 );
BOOST_TEST( real(b[1]) == 0.357143 );
BOOST_TEST( real(b[2]) == 0.5 );
}
{
// const b, no diagonal argument: b must stay unchanged and the solution is expected in the returned copy.
multi::array<complex, 1> const b = {1., 3., 4.};
auto b_copy = blas::trsv(filling::upper, A, b); // B<-Solve(A.X==B), B<-A⁻¹.B, B<-(A⁻¹.B), B<-B.A⁻¹
BOOST_TEST( real(b[0]) == 1. );
BOOST_TEST( real(b_copy[0]) == -2.07143 );
BOOST_TEST( real(b_copy[1]) == 0.357143 );
BOOST_TEST( real(b_copy[2]) == 0.5 );
}
{
// const b with explicit diagonal.
multi::array<complex, 1> const b = {1., 3., 4.};
auto b_copy = blas::trsv(filling::upper, diagonal::general, A, b); // B<-Solve(A.X==B), B<-A⁻¹.B, B<-(A⁻¹.B), B<-B.A⁻¹
BOOST_TEST( real(b[0]) == 1. );
BOOST_TEST( real(b_copy[0]) == -2.07143 );
BOOST_TEST( real(b_copy[1]) == 0.357143 );
BOOST_TEST( real(b_copy[2]) == 0.5 );
}
{
// Transposed view of A solved as a lower-triangular system.
multi::array<complex, 1> b = {3., 3., 1.};
trsv(filling::lower, diagonal::general, transposed(A), b); // B<-Solve(A.X==B), B<-A⁻¹.B, B<-(A⁻¹.B), B<-B.A⁻¹
BOOST_TEST( real(b[0]) == 3. );
BOOST_TEST( real(b[1]) == -0.857143 );
BOOST_TEST( real(b[2]) == -1.26786 );
}
{
multi::array<complex, 1> b = {3., 3., 1.};
// trsv(filling::lower, diagonal::general, hermitized(A), b); // B<-Solve(A.X==B), B<-A⁻¹.B, B<-(A⁻¹.B), B<-B.A⁻¹
// BOOST_TEST( real(b[0]) == 3. );
// BOOST_TEST( real(b[1]) == -0.857143 );
// BOOST_TEST( real(b[2]) == -1.26786 );
}
{
multi::array<complex, 1> b = {3., 3., 1.};
// trsv(filling::lower, diagonal::general, hermitized(A), conjugated(b)); // B<-Solve(A.X==B), B<-A⁻¹.B, B<-(A⁻¹.B), B<-B.A⁻¹
// BOOST_TEST( real(b[0]) == 3. );
// BOOST_TEST( real(b[1]) == -0.857143 );
// BOOST_TEST( real(b[2]) == -1.26786 );
}
}
// Not compiled: this test sits inside a `#if 0` region.
// Genuinely complex 3x3 system solved as-is, through transposed(A), through blas::H(A), and
// through blas::H(A) with a conjugated right-hand side.
// NOTE(review): `I` is used below but no declaration of it is visible in this part of the file;
// it would need one before this test can be re-enabled.
BOOST_AUTO_TEST_CASE(multi_blas_trsv_complex_square, *utf::tolerance(0.00001)){
namespace blas = multi::blas;
multi::array<complex, 2> const A = {
{ 1. + 1.*I, 3. - 2.*I, 4. + 1.*I},
{NAN , 7. - 10.*I, 1. + 2.*I},
{NAN , NAN , 8. + 1.*I}
};
using blas::filling;
using blas::diagonal;
using blas::transposed;
using blas::hermitized;
using blas::conjugated;
using blas::trsv;
{
// Upper-triangular solve; only real parts are checked.
multi::array<complex, 1> b = {1. + 2.*I, 3. + 1.*I, 4. + 5.*I};
blas::trsv(filling::upper, diagonal::general, A, b); // B<-Solve(A.X==B), B<-A⁻¹.B, B<-(A⁻¹.B), B<-B.A⁻¹
BOOST_TEST( real(b[0]) == -1.37259 );
BOOST_TEST( real(b[1]) == 0.2127 );
BOOST_TEST( real(b[2]) == 0.569231 );
}
{
// Transposed view solved as a lower-triangular system.
multi::array<complex, 1> b = {1. + 2.*I, 3. + 1.*I, 4. + 5.*I};
trsv(filling::lower, diagonal::general, transposed(A), b); // B<-Solve(A.X==B), B<-A⁻¹.B, B<-(A⁻¹.B), B<-B.A⁻¹
BOOST_TEST( real(b[0]) == 1.5 ); BOOST_TEST( imag(b[0]) == 0.5 );
BOOST_TEST( real(b[1]) == -0.285235 ); BOOST_TEST( imag(b[1]) == -0.0503356 );
BOOST_TEST( real(b[2]) == -0.129272 ); BOOST_TEST( imag(b[2]) == 0.28126 );
}
{
// Hermitian-conjugate view with filling::upper; also prints the result.
multi::array<complex, 1> b = {1. + 2.*I, 3. + 1.*I, 4. + 5.*I};
trsv(filling::upper, diagonal::general, blas::H(A), b); // B<-Solve(A.X==B), B<-A⁻¹.B, B<-(A⁻¹.B), B<-B.A⁻¹
print_1D(b);
BOOST_TEST( real(b[0]) == -0.661693 ); BOOST_TEST( imag(b[0]) == -1.13934 );
BOOST_TEST( real(b[1]) == 0.135261 ); BOOST_TEST( imag(b[1]) == -0.0283944 );
BOOST_TEST( real(b[2]) == 0.415385 ); BOOST_TEST( imag(b[2]) == 0.676923 );
}
{
// Conjugated right-hand side; the call is made but its checks are commented out.
multi::array<complex, 1> b = {1. - 2.*I, 3. - 1.*I, 4. - 5.*I};
trsv(filling::upper, diagonal::general, blas::H(A), blas::conj(b)); // B<-Solve(A.X==B), B<-A⁻¹.B, B<-(A⁻¹.B), B<-B.A⁻¹
// print_1D(b);
// BOOST_TEST( real(conjugated(b)[0]) == -0.661693 ); BOOST_TEST( imag(conjugated(b)[0]) == -1.13934 );
// BOOST_TEST( real(conjugated(b)[1]) == 0.135261 ); BOOST_TEST( imag(conjugated(b)[1]) == -0.0283944 );
// BOOST_TEST( real(conjugated(b)[2]) == 0.415385 ); BOOST_TEST( imag(conjugated(b)[2]) == 0.676923 );
}
{
multi::array<complex, 1> b = {1. + 2.*I, 3. + 1.*I, 4. + 5.*I};
// trsv(filling::lower, diagonal::general, hermitized(A), b); // B<-Solve(A.X==B), B<-A⁻¹.B, B<-(A⁻¹.B), B<-B.A⁻¹
// BOOST_TEST( real(b[0]) == -0.5 ); BOOST_TEST( imag(b[0]) == 1.5 );
// BOOST_TEST( real(b[1]) == 0.184564 ); BOOST_TEST( imag(b[1]) == -0.620805 );
// BOOST_TEST( real(b[2]) == 0.691791 ); BOOST_TEST( imag(b[2]) == 0.0227155 );
}
}
#if 0
// Not compiled: this test sits inside nested `#if 0` regions.
// 1x1 real A against 1x1 and 1x3 right-hand sides, with alpha = 1 and alpha = 2.
// NOTE(review): trsm is called here without a blas::side argument and no trsm header is included in
// the visible part of this file; presumably a leftover from an older interface.
BOOST_AUTO_TEST_CASE(multi_blas_trsm_double_1x1, *utf::tolerance(0.00001)){
multi::array<double, 2> const A = {
{10.,},
};
using multi::blas::side;
using multi::blas::filling;
using multi::blas::diagonal;
{
multi::array<double, 2> B = {
{3.,},
};
trsm(filling::upper, diagonal::general, 1., A, B); // B=Solve(A.X=alpha*B, X) B=A⁻¹B, B=B.(A)⁻¹, A upper triangular (implicit zeros below)
BOOST_TEST( B[0][0] == 3./10. );
}
{
multi::array<double, 2> B = {
{3.,},
};
trsm(filling::upper, diagonal::general, 2., A, B); // B=Solve(A.X=alpha*B, X) B=A⁻¹B, B=B.(A)⁻¹, A upper triangular (implicit zeros below)
BOOST_TEST( B[0][0] == 2.*3./10. );
}
{
multi::array<double, 2> B = {
{3., 4., 5.},
};
trsm(filling::upper, diagonal::general, 1., A, B); // B=Solve(A.X=alpha*B, X) B=A⁻¹B, B=B.(A)⁻¹, A upper triangular (implicit zeros below)
BOOST_TEST( B[0][1] == 4./10. );
}
}
// Not compiled: this test sits inside nested `#if 0` regions.
// Calls trsm on default-constructed A and B; there are no assertions, so it only checks that the call is accepted.
BOOST_AUTO_TEST_CASE(multi_blas_trsm_double_0x0, *utf::tolerance(0.00001)){
multi::array<double, 2> const A;
using multi::blas::side;
using multi::blas::filling;
using multi::blas::diagonal;
{
multi::array<double, 2> B;
trsm(filling::upper, diagonal::general, 1., A, B); // B=Solve(A.X=alpha*B, X) B=A⁻¹B, B=B.(A)⁻¹, A upper triangular (implicit zeros below)
}
}
// Not compiled: this test sits inside nested `#if 0` regions.
// Real 3x3 upper-triangular A (explicit zeros below the diagonal) against 3x4 and 3x1 right-hand
// sides, with A and/or B also supplied as rotated views of rotated copies.
BOOST_AUTO_TEST_CASE(multi_blas_trsm_real_nonsquare, *utf::tolerance(0.00001)){
multi::array<double, 2> const A = {
{ 1., 3., 4.},
{ 0., 7., 1.},
{ 0., 0., 8.}
};
using multi::blas::side;
using multi::blas::filling;
using multi::blas::diagonal;
{
// 3x4 right-hand side: direct, then through rotated(BT).
multi::array<double, 2> B = {
{1., 3., 4., 8.},
{2., 7., 1., 9.},
{3., 4., 2., 1.},
};
multi::array<double, 2> BT = rotated(B);
trsm(filling::upper, diagonal::general, 1., A, B); // B=Solve(A.X=alpha*B, X) B=A⁻¹B, B=B.(A)⁻¹, A upper triangular (implicit zeros below)
BOOST_TEST( B[1][2] == 0.107143 );
trsm(filling::upper, diagonal::general, 1., A, rotated(BT));
BOOST_TEST( rotated(BT)[1][2] == 0.107143 );
}
{
// Same, with A supplied as rotated(AT).
multi::array<double, 2> B = {
{1., 3., 4., 8.},
{2., 7., 1., 9.},
{3., 4., 2., 1.},
};
multi::array<double, 2> AT = rotated(A);
multi::array<double, 2> BT = rotated(B);
trsm(filling::upper, diagonal::general, 1., rotated(AT), B); // B=Solve(A.X=alpha*B, X) B=A⁻¹B, B=B.(A)⁻¹, A upper triangular (implicit zeros below)
BOOST_TEST( B[1][2] == 0.107143 );
trsm(filling::upper, diagonal::general, 1., rotated(AT), rotated(BT));
print(rotated(BT));
BOOST_TEST( rotated(BT)[1][2] == 0.107143 );
}
{
// Single-column right-hand side.
multi::array<double, 2> B = {
{1.},
{2.},
{3.},
};
trsm(filling::upper, diagonal::general, 1., A, B); // B=Solve(A.X=alpha*B, X) B=A⁻¹B, B=B.(A)⁻¹, A upper triangular (implicit zeros below)
BOOST_TEST( B[2][0] == 0.375 );
}
{
// Single column supplied as rotated(BT).
multi::array<double, 2> B = {
{1.},
{2.},
{3.},
};
multi::array<double, 2> BT = rotated(B);
trsm(filling::upper, diagonal::general, 1., A, rotated(BT));
BOOST_TEST( rotated(BT)[2][0] == 0.375 );
}
}
// Not compiled: this test sits inside nested `#if 0` regions.
// Round-trip checks on a const single-column B: the value returned by trsm is multiplied back with
// gemm and compared element-wise with B, for several combinations of rotated inputs.
// The last two sub-cases have their checks commented out.
BOOST_AUTO_TEST_CASE(multi_blas_trsm_real_nonsquare_default_diagonal_gemm_check, *utf::tolerance(0.00001)){
multi::array<double, 2> const A = {
{ 1., 3., 4.},
{ 0., 7., 1.},
{ 0., 0., 8.}
};
using multi::blas::side;
using multi::blas::filling;
using multi::blas::diagonal;
{
multi::array<double, 2> const B = {
{1.},// 3., 4.},
{2.},// 7., 1.},
{3.},// 4., 2.},
};
using multi::blas::gemm;
{
// Plain A and B.
auto S = trsm(filling::upper, diagonal::general, 1., A, B);
BOOST_REQUIRE( S[2][0] == 0.375 );
auto Bck=gemm(1., A, S);
BOOST_REQUIRE( Bck[2][0] == 3. );
for(int i{};i<3;++i)for(int j{};j<size(rotated(B));++j) BOOST_CHECK_SMALL(Bck[i][j]-B[i][j], 0.00001);
}
{
// B supplied as rotated(BT).
multi::array<double, 2> const BT = rotated(B);
auto Bck=gemm(1., A, trsm(filling::upper, diagonal::general, 1., A, rotated(BT)));
for(int i{};i<3;++i)for(int j{};j<size(rotated(B));++j) BOOST_CHECK_SMALL(Bck[i][j]-B[i][j], 0.00001);
}
{
// A supplied as rotated(AT).
auto const AT = rotated(A);
auto Bck=gemm(1., rotated(AT), trsm(filling::upper, diagonal::general, 1., rotated(AT), B));
for(int i{};i<3;++i)for(int j{};j<size(rotated(B));++j) BOOST_CHECK_SMALL(Bck[i][j]-B[i][j], 0.00001);
}
{
// Both supplied as rotated views; `=*` applies unary * to the rotated view (presumably to obtain an owning copy — TODO confirm).
auto const AT =* rotated(A);
auto const BT =* rotated(B);
auto const Bck=gemm(1., A, trsm(filling::upper, diagonal::general, 1., rotated(AT), rotated(BT)));
for(int i{};i<3;++i)for(int j{};j<size(rotated(B));++j) BOOST_REQUIRE_SMALL(Bck[i][j]-B[i][j], 0.00001);
}
{
auto const AT =* rotated(A);
auto const BT =* rotated(B);
using multi::blas::trsm;
// auto const Bck=gemm(A, trsm(rotated(AT), rotated(BT)));
// for(int i{};i<3;++i)for(int j{};j<size(rotated(B));++j) BOOST_CHECK_SMALL(Bck[i][j]-B[i][j], 0.00001);
}
{
using multi::blas::trsm;
// auto const Bck=gemm(A, trsm(A, B));
// for(int i{};i<3;++i)for(int j{};j<size(rotated(B));++j) BOOST_CHECK_SMALL(Bck[i][j]-B[i][j], 0.00001);
}
}
}
// Smallest possible real case (1x1): the result must be alpha*B/A.
// Also exercises the overloads that omit the diagonal argument and the scaling factor.
BOOST_AUTO_TEST_CASE(multi_blas_trsm_real_1x1_check, *utf::tolerance(0.00001)){
multi::array<double, 2> const A = {
{ 4.},
};
using multi::blas::side;
using multi::blas::filling;
using multi::blas::diagonal;
{
multi::array<double, 2> const B = {
{5.},
};
{
// explicit diagonal kind and alpha = 3
auto S = trsm(filling::upper, diagonal::general, 3., A, B);
BOOST_REQUIRE( S[0][0] == 3.*5./4. );
}
{
// diagonal argument omitted
auto S = trsm(filling::upper, 1., A, B);
BOOST_REQUIRE( S[0][0] == 1.*5./4. );
}
{
// diagonal and alpha both omitted; the expected value corresponds to alpha = 1
auto S = trsm(filling::upper, A, B);
BOOST_REQUIRE( S[0][0] == 1.*5./4. );
}
}
}
// Smallest possible complex case (1x1) with a complex scaling factor:
// real and imaginary parts of the result must match alpha*B/A.
BOOST_AUTO_TEST_CASE(multi_blas_trsm_complex_1x1_check, *utf::tolerance(0.00001)){
using complex = std::complex<double>; complex const I = complex{0, 1};
multi::array<complex, 2> const A = {
{ 4. + 2.*I},
};
using multi::blas::side;
using multi::blas::filling;
using multi::blas::diagonal;
{
multi::array<complex, 2> const B = {
{5. + 1.*I},
};
using multi::blas::gemm;
{
// real and imaginary parts are compared separately so the test-case tolerance applies to each
auto S = trsm(filling::upper, diagonal::general, 3.+5.*I, A, B);
BOOST_TEST( real(S[0][0]) == real((3.+5.*I)*B[0][0]/A[0][0]) );
BOOST_TEST( imag(S[0][0]) == imag((3.+5.*I)*B[0][0]/A[0][0]) );
}
}
}
// Complex triangular solve with a single-column (3x1) right-hand side.
// The same solution entry is expected for all four combinations of plain / rotated-view operands.
BOOST_AUTO_TEST_CASE(multi_blas_trsm_complex_nonsquare_default_diagonal_one_check, *utf::tolerance(0.00001)){
using complex = std::complex<double>; complex const I{0, 1};
multi::array<complex, 2> const A = {
{ 1. + 4.*I, 3., 4.- 10.*I},
{ 0., 7.- 3.*I, 1.},
{ 0., 0., 8.- 2.*I}
};
using multi::blas::side;
using multi::blas::filling;
using multi::blas::diagonal;
{
multi::array<complex, 2> const B = {
{1. + 1.*I},
{2. + 1.*I},
{3. + 1.*I},
};
using multi::blas::gemm;
{
// plain A, plain B
auto S = trsm(filling::upper, diagonal::general, 1., A, B);
BOOST_TEST( real(S[2][0]) == 0.323529 );
}
{
// plain A, B as a rotated view
// NOTE(review): unary `+` on rotated(...) presumably materializes the view into a new array — confirm
auto const BT = +rotated(B);
auto S = trsm(filling::upper, diagonal::general, 1., A, rotated(BT));
BOOST_TEST( real(S[2][0]) == 0.323529 );
}
{
// A as a rotated view, plain B
auto const AT = +rotated(A);
auto S = trsm(filling::upper, diagonal::general, 1., rotated(AT), B);
BOOST_TEST( real(S[2][0]) == 0.323529 );
}
{
// both operands as rotated views
auto const AT = +rotated(A);
auto const BT = +rotated(B);
auto S = trsm(filling::upper, diagonal::general, 1., rotated(AT), rotated(BT));
BOOST_TEST( real(S[2][0]) == 0.323529 );
}
}
}
// Complex triangular solve with a two-column (3x2) right-hand side.
// The same solution entry is expected for all four combinations of plain / rotated-view operands.
// NOTE(review): despite the name, no gemm round trip is performed in this test case.
BOOST_AUTO_TEST_CASE(multi_blas_trsm_complex_nonsquare_default_diagonal_gemm_check, *utf::tolerance(0.00001)){
using complex = std::complex<double>; complex const I{0, 1};
multi::array<complex, 2> const A = {
{ 1. + 4.*I, 3., 4.- 10.*I},
{ 0., 7.- 3.*I, 1.},
{ 0., 0., 8.- 2.*I}
};
using multi::blas::side;
using multi::blas::filling;
using multi::blas::diagonal;
{
multi::array<complex, 2> const B = {
{1. + 1.*I, 5. + 3.*I},
{2. + 1.*I, 9. + 3.*I},
{3. + 1.*I, 1. - 1.*I},
};
using multi::blas::gemm;
{
// plain A, plain B
auto S = trsm(filling::upper, diagonal::general, 1., A, B); // S = Ainv.B
BOOST_TEST( real(S[2][1]) == 0.147059 );
}
{
// plain A, B as a rotated view
auto const BT = +rotated(B);
auto S = trsm(filling::upper, diagonal::general, 1., A, rotated(BT));
BOOST_TEST( real(S[2][1]) == 0.147059 );
}
{
// A as a rotated view, plain B
auto const AT = +rotated(A);
auto S = trsm(filling::upper, diagonal::general, 1., rotated(AT), B);
BOOST_TEST( real(S[2][1]) == 0.147059 );
}
{
// both operands as rotated views
auto const AT = +rotated(A);
auto const BT = +rotated(B);
auto S = trsm(filling::upper, diagonal::general, 1., rotated(AT), rotated(BT));
BOOST_TEST( real(S[2][1]) == 0.147059 );
}
}
}
// Complex triangular solve where one operand is a `hermitized` view:
// first the triangular matrix, then the right-hand side.
BOOST_AUTO_TEST_CASE(multi_blas_trsm_complex_nonsquare_default_diagonal_hermitized_gemm_check, *utf::tolerance(0.00001)){
using complex = std::complex<double>; complex const I{0, 1};
multi::array<complex, 2> const A = {
{ 1. + 4.*I, 3., 4.- 10.*I},
{ 0., 7.- 3.*I, 1.},
{ 0., 0., 8.- 2.*I}
};
using multi::blas::filling;
using multi::blas::diagonal;
{
multi::array<complex, 2> const B = {
{1. + 1.*I, 5. + 3.*I},
{2. + 1.*I, 9. + 3.*I},
{3. + 1.*I, 1. - 1.*I},
};
using multi::blas::hermitized;
{
// hermitized(A) is passed with filling::lower (A itself is stored as upper triangular)
auto S = trsm(filling::lower, diagonal::general, 1., hermitized(A), B); // S = A⁻¹†.B, S† = B†.A⁻¹
BOOST_TEST( real(S[2][1]) == 1.71608 );
}
{
// this 2x3 B shadows the outer 3x2 B; it is passed as hermitized(B)
multi::array<complex, 2> const B = {
{1. + 1.*I, 2. + 1.*I, 3. + 1.*I},
{5. + 3.*I, 9. + 3.*I, 1. - 1.*I}
};
auto S =* trsm(filling::upper, 1., A, hermitized(B)); // S = A⁻¹B†, S†=B.A⁻¹†, S=(B.A⁻¹)†, B <- S†, B <- B.A⁻¹†
BOOST_TEST( imag(S[2][1]) == +0.147059 );
// NOTE(review): B is declared const, yet this check expects it to hold the solution — confirm how trsm treats a const operand
BOOST_TEST( imag(B[1][2]) == -0.147059 );
}
{
// same as the previous case with alpha = 2; the expected values scale accordingly
multi::array<complex, 2> const B = {
{1. + 1.*I, 2. + 1.*I, 3. + 1.*I},
{5. + 3.*I, 9. + 3.*I, 1. - 1.*I}
};
auto S =* trsm(filling::upper, 2., A, hermitized(B)); // S = A⁻¹B†, S†=B.A⁻¹†, S=(B.A⁻¹)†, B <- S†, B <- B.A⁻¹†
BOOST_TEST( imag(S[2][1]) == +0.147059*2. );
BOOST_TEST( imag(B[1][2]) == -0.147059*2. );
}
}
}
// In-place variant of the hermitized right-hand-side case: B is mutable and is passed as hermitized(B);
// the solution is then read back from B itself.
BOOST_AUTO_TEST_CASE(multi_blas_trsm_complex_nonsquare_default_diagonal_hermitized_gemm_check_no_const, *utf::tolerance(0.00001)){
using complex = std::complex<double>; complex const I{0, 1};
multi::array<complex, 2> const A = {
{ 1. + 4.*I, 3., 4.- 10.*I},
{ 0., 7.- 3.*I, 1.},
{ 0., 0., 8.- 2.*I}
};
multi::array<complex, 2> B = {
{1. + 1.*I, 2. + 1.*I, 3. + 1.*I},
{5. + 3.*I, 9. + 3.*I, 1. - 1.*I}
};
using multi::blas::trsm;
using multi::blas::filling;
using multi::blas::hermitized;
// diagonal kind and alpha are omitted here
trsm(filling::upper, A, hermitized(B)); // B†←A⁻¹.B†, B←B.A⁻¹†, B←(A⁻¹.B†)†
BOOST_TEST( imag(B[1][2]) == -0.147059 );
}
#endif
#endif
#endif
#endif

View File

@ -0,0 +1,283 @@
#ifdef COMPILATION// -*-indent-tabs-mode:t;c-basic-offset:4;tab-width:4-*-
$CXX $0 -o $0x -lcudart -lboost_unit_test_framework -lboost_timer -ldl&&$0x&&rm $0x;exit
#endif
// © Alfredo A. Correa 2019-2020
#ifndef MULTI_ADAPTORS_CUDA_HPP
#define MULTI_ADAPTORS_CUDA_HPP
#include "../memory/adaptors/cuda/allocator.hpp"
#include "../memory/adaptors/cuda/managed/allocator.hpp"
#include "../adaptors/cuda/algorithms/copy.hpp"
#include "../array.hpp"
namespace boost{
namespace multi{
namespace cuda{

// Convenience aliases that bind the generic multi containers to CUDA device memory.

// Allocator and pointer types for device memory.
template<class T>
using allocator = multi::memory::cuda::allocator<T>;
template<class T> using ptr = multi::memory::cuda::ptr<T>;

// Owning D-dimensional array whose elements are allocated with the device allocator.
template<class T, multi::dimensionality_type D>
using array = multi::array<T, D, cuda::allocator<T>>;

// Non-owning D-dimensional view addressed through a device pointer.
template<class T, multi::dimensionality_type D>
using array_ref = multi::array_ref<T, D, cuda::ptr<T>>;

// static_array counterpart of `array`, using the device allocator.
template<class T, multi::dimensionality_type D>
using static_array = multi::static_array<T, D, cuda::allocator<T>>;

//	template<class A> auto raw_array_cast(A&& a)
//	->decltype(static_array_cast<typename A::element_type, decltype(raw_pointer_cast(base(std::forward<A>(a))))>(std::forward<A>(a))){
//		return static_array_cast<typename A::element_type, decltype(raw_pointer_cast(base(std::forward<A>(a))))>(std::forward<A>(a));}

// Re-expresses array `a` through the pointer type produced by raw_pointer_cast(base(a)),
// using the array's member static_array_cast; the element type is unchanged.
template<class A> auto raw_array_cast(A&& a)
->decltype(std::forward<A>(a).template static_array_cast<typename A::element_type, decltype(raw_pointer_cast(base(std::forward<A>(a))))>()){
	return std::forward<A>(a).template static_array_cast<typename A::element_type, decltype(raw_pointer_cast(base(std::forward<A>(a))))>();}

namespace managed{

	// Same set of aliases as in the enclosing namespace, bound to CUDA managed memory.

	template<class T>
	using allocator = multi::memory::cuda::managed::allocator<T>;
	template<class T> using ptr = multi::memory::cuda::managed::ptr<T>;

	template<class T, multi::dimensionality_type D>
	using array = multi::array<T, D, cuda::managed::allocator<T>>;

	// These two previously aliased multi::array<T, D, managed::ptr<T>>, i.e. an owning array with a
	// pointer type in the allocator slot; they now mirror the non-managed aliases above.
	template<class T, multi::dimensionality_type D>
	using array_ref = multi::array_ref<T, D, cuda::managed::ptr<T>>;

	template<class T, multi::dimensionality_type D>
	using static_array = multi::static_array<T, D, cuda::managed::allocator<T>>;

}

}
/*
auto copy(const double* first, const double* last, boost::multi::array_iterator<double, 1, boost::multi::memory::cuda::managed::ptr<double, double*>, double&> d_first){
return copy(
boost::multi::array_iterator<double, 1, double const*, double const&>(first),
boost::multi::array_iterator<double, 1, double const*, double const&>(last),
d_first
);
}*/
}}
#if defined(__INCLUDE_LEVEL__) and not __INCLUDE_LEVEL__
#define BOOST_TEST_MODULE "C++ Unit Tests for Multi CUDA adaptor"
#define BOOST_TEST_DYN_LINK
#include<boost/test/unit_test.hpp>
#include<boost/timer/timer.hpp>
#include<chrono>
#include<numeric>
// Optimization barrier for benchmarks: the empty asm statement lists `value` as a
// read-write memory operand, so the compiler must treat the object as used and possibly modified.
template <class T>
__attribute__((always_inline)) inline void DoNotOptimize(T const& value){
	T& writable = const_cast<T&>(value);
	asm volatile("" : "+m"(writable));
}
// Scope timer: records the time at construction and, on destruction, writes
// "<label>: <elapsed seconds> sec" to std::cerr.
struct watch : private std::chrono::high_resolution_clock{
	std::string label_; // text printed in front of the elapsed time
	time_point start_;  // clock reading taken in the constructor
	watch(std::string label ="") : label_{label}, start_{now()}{}
	~watch(){
		std::chrono::duration<double> const elapsed = now() - start_;
		std::cerr<< label_<<": "<< elapsed.count() <<" sec"<<std::endl;
	}
};
namespace multi = boost::multi;
namespace cuda = multi::cuda;
namespace utf = boost::unit_test::framework;
// A 1D device array constructed from a host array has the same extensions and compares equal to it.
BOOST_AUTO_TEST_CASE(multi_adaptors_cuda_construct_1d){
multi::array<double, 1> A(4, 99.);
cuda::array<double, 1> Agpu{A};
BOOST_REQUIRE( extensions(A) == extensions(Agpu) );
BOOST_REQUIRE( Agpu == A );
}
// Assigning a host range into the matching range of a pre-sized 1D device array makes the two arrays equal.
BOOST_AUTO_TEST_CASE(multi_adaptors_cuda_copy_1d){
multi::array<double, 1> A(4, 99.);
cuda::array<double, 1> Agpu(4);
BOOST_REQUIRE( extensions(A) == extensions(Agpu) );
// the range {0, 4} spans all four elements
Agpu({0, 4}) = A({0, 4});
BOOST_REQUIRE( Agpu == A );
}
// A 2D device array constructed from a host array has the same extensions and compares equal to it.
BOOST_AUTO_TEST_CASE(multi_adaptors_cuda_construct_2d){
multi::array<double, 2> A({4, 6}, 99.);
cuda::array<double, 2> Agpu{A};
BOOST_REQUIRE( extensions(A) == extensions(Agpu) );
BOOST_REQUIRE( Agpu == A );
// a single device element can be assigned to a host element
A[1][1] = Agpu[1][1];
}
// Copying a sub-block from host to device leaves the arrays different (column 0 is not copied);
// whole-array assignment then makes them equal.
BOOST_AUTO_TEST_CASE(multi_adaptors_cuda_copy_2d){
// host array filled with 1, 2, 3, ... ; device array filled with 99
multi::array<double, 2> A({4, 6}); std::iota(A.data_elements(), A.data_elements() + A.num_elements(), 1.);
cuda::array<double, 2> Agpu({4, 6}, 99.);
BOOST_REQUIRE( extensions(A) == extensions(Agpu) );
// all rows, columns 1 to 5 only
Agpu({0, 4}, {1, 6}) = A({0, 4}, {1, 6});
BOOST_REQUIRE( Agpu != A );
Agpu = A;
BOOST_REQUIRE( Agpu == A );
}
// A 1D device array can be initialized from a braced list and its elements compared with host values.
BOOST_AUTO_TEST_CASE(multi_adaptors_cuda_1d_initializer_list){
cuda::array<double, 1> Bgpu = {1., 2., 3., 4.};
BOOST_REQUIRE( Bgpu[1] == 2. );
}
// A 2D device array can be initialized from nested braced lists: three rows, element [1][1] equal to 6.
BOOST_AUTO_TEST_CASE(multi_adaptors_cuda_2d_initializer_list){
cuda::array<double, 2> Bgpu = {
{1., 2., 3., 4.},
{5., 6., 7., 8.},
{9., 10., 11., 12.},
};
BOOST_REQUIRE( size(Bgpu) == 3 );
BOOST_REQUIRE( Bgpu[1][1] == 6. );
}
// Same sub-block copy / full assignment check as the 2D copy test, on 3x4 arrays.
// NOTE(review): despite the name, no initializer list is used in this test case.
BOOST_AUTO_TEST_CASE(multi_adaptors_cuda_2d_initializer_list_bis){
// host array filled with 1, 2, 3, ... ; device array filled with 99
multi::array<double, 2> A({3, 4}); std::iota(A.data_elements(), A.data_elements() + A.num_elements(), 1.);
cuda::array<double, 2> Agpu({3, 4}, 99.);
BOOST_REQUIRE( extensions(A) == extensions(Agpu) );
// all rows, columns 1 to 3 only; column 0 keeps the value 99
Agpu({0, 3}, {1, 4}) = A({0, 3}, {1, 4});
BOOST_REQUIRE( Agpu != A );
Agpu = A;
BOOST_REQUIRE( Agpu == A );
}
// Copy construction versus move construction of a 30x100x100x100 device array of doubles.
// Each step runs inside a lambda whose closure owns a `watch`; the watch prints the elapsed time
// to std::cerr when the closure is destroyed.
BOOST_AUTO_TEST_CASE(multi_adaptors_cuda_copy_vs_move){
cuda::array<double, 4> Agpu({30, 100, 100, 100}, 99.);
[&, _ = watch{utf::current_test_case().full_name()+" COPY"}]{
// a copy has its own storage but equal element values
cuda::array<double, 4> Agpu_cpy = Agpu;
BOOST_REQUIRE( &Agpu_cpy[1][2][3][4] != &Agpu[1][2][3][4] );
BOOST_REQUIRE( Agpu_cpy[1][2][3][4] == Agpu[1][2][3][4] );
}();
[&, _ = watch{utf::current_test_case().full_name()+" MOVE"}]{
// a move leaves the source empty and the destination with the original size
cuda::array<double, 4> Agpu_mov = std::move(Agpu);
BOOST_REQUIRE( Agpu.empty() );
BOOST_REQUIRE( Agpu_mov.size() == 30 );
}();
}
// Copy construction versus move construction of a 30x100x100x100 device array of std::complex<double>.
// Each step runs inside a lambda whose closure owns a `watch`; the watch prints the elapsed time
// to std::cerr when the closure is destroyed.
BOOST_AUTO_TEST_CASE(multi_adaptors_cuda_copy_vs_move_complex){
using complex = std::complex<double>;
cuda::array<complex, 4> Agpu({30, 100, 100, 100}, 99.);
[&, _ = watch{utf::current_test_case().full_name()+" COPY"}]{
// a copy has its own storage but equal element values
cuda::array<complex, 4> Agpu_cpy = Agpu;
BOOST_REQUIRE( &Agpu_cpy[1][2][3][4] != &Agpu[1][2][3][4] );
BOOST_REQUIRE( Agpu_cpy[1][2][3][4] == Agpu[1][2][3][4] );
}();
[&, _ = watch{utf::current_test_case().full_name()+" MOVE"}]{
// a move leaves the source empty and the destination with the original size
cuda::array<complex, 4> Agpu_mov = std::move(Agpu);
BOOST_REQUIRE( Agpu.empty() );
BOOST_REQUIRE( Agpu_mov.size() == 30 );
}();
}
// Construction-only test for 4D managed-memory arrays of double; nothing is asserted afterwards.
BOOST_AUTO_TEST_CASE(multi_adaptors_cuda_managed_double){
cuda::managed::array<double, 4> A({2,3,4,5}); // extensions only, no fill value given
cuda::managed::array<double, 4> B({2,3,4,5}, 0.); // filled with 0
cuda::managed::array<double, 4> C({2,3,4,5}, 5.); // filled with 5
}
// Construction-only test for 4D managed-memory arrays whose element type is std::array<int, 3>;
// nothing is asserted afterwards.
BOOST_AUTO_TEST_CASE(multi_adaptors_cuda_managed_ai3){
using ai3 = std::array<int, 3>;
cuda::managed::array<ai3, 4> A({2,3,4,5}); // default initialize elements
cuda::managed::array<ai3, 4> B({2,3,4,5}, ai3{} ); // value initialize elements
cuda::managed::array<ai3, 4> C({2,3,4,5}, ai3{11, 22, 33} ); // value initialize elements
}
// A row of a 2D device array can be used to construct an owning 1D device array.
// Compile/run-only: nothing is asserted.
BOOST_AUTO_TEST_CASE(multi_adaptor_cuda_decay){
cuda::array<double, 2> A = {
{1., 2., 3., 4.},
{5., 6., 7., 8.},
{1., 2., 3., 4.}
};
cuda::array<double, 1> A1 = A[1];
// unfinished complex variant, left commented out:
// cuda::array<complex, 2> A = {
// {1. + 2.*I, 2. + 3.*I, 3. + 4.*I, 4. + 5.*I},
// {5. + 2.*I, 6. + 3.*I, 7. + 4.*I, 8. + 5.*I},
// {1. + 1.*I, 2. + 2.*I, 3. + 3.*I, 4. + 4.*I}
// };
// cuda::array<complex
}
#if 0
// Disabled by the enclosing `#if 0`: copy-initialization of a 2D device array from a host array.
BOOST_AUTO_TEST_CASE(multi_adaptors_cuda_copy){
multi::array<double, 2> A({4, 5}, 99.);
cuda::array<double, 2> Agpu = A;
}
// Disabled by the enclosing `#if 0`: a collection of checks on host/device/managed arrays,
// on get_allocator for pointers and arrays, and on move assignment of a managed array.
BOOST_AUTO_TEST_CASE(multi_adaptors_cuda){
multi::array<double, 2> A({4, 5}, 99.);
cuda::array<double, 2> Agpu = A;
assert( Agpu == A );
cuda::managed::array<double, 2> Amng = A;
assert( Amng == Agpu );
// non-owning view over the device array's elements
cuda::array_ref<double, 2> Rgpu(data_elements(Agpu), extensions(Agpu));
{std::allocator<double> a = get_allocator(A);}
{
// get_allocator on a device pointer must be convertible to the device allocator
cuda::ptr<double> p;
using multi::get_allocator;
cuda::allocator<double> a = get_allocator(p); (void)a;
}
{
// get_allocator on a managed pointer must be convertible to the managed allocator
cuda::managed::ptr<double> p;
using multi::get_allocator;
cuda::managed::allocator<double> a = get_allocator(p); (void)a;
}
{
// get_allocator on a raw pointer must be convertible to std::allocator
double* p = nullptr;
using multi::get_allocator;
std::allocator<double> a = get_allocator(p); (void)a;
}
{
multi::array<double, 2> arr;
std::allocator<double> a = get_allocator(arr);
}
{
cuda::array<double, 2> arr;
cuda::allocator<double> a = get_allocator(arr); (void)a;
}
{
// cuda::array<double, 0> arr = 45.;
// BOOST_REQUIRE( arr() == 45. );
}
{
// cuda::managed::array<double, 0> arr = 45.;
// BOOST_REQUIRE( arr() == 45. );
}
{
cuda::managed::array<double, 1> arr = {1.2, 3.4, 4.5};
}
{
// move assignment transfers the 1000 rows to `b` and leaves `a` with size 0
using complex = std::complex<double>;
cuda::managed::array<complex, 2> a({1000, 1000}, 99.);
BOOST_REQUIRE( size(a) == 1000 );
cuda::managed::array<complex, 2> b;
b = std::move(a);
BOOST_REQUIRE( size(b) == 1000 );
BOOST_REQUIRE( size(a) == 0 );
}
}
#endif
#endif
#endif

View File

@ -0,0 +1,8 @@
# Default ignored files
/shelf/
/workspace.xml
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml
# Editor-based HTTP Client requests
/httpRequests/

View File

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="CPP_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>

View File

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/cuda.iml" filepath="$PROJECT_DIR$/.idea/cuda.iml" />
</modules>
</component>
</project>

View File

@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="$PROJECT_DIR$/../.." vcs="Git" />
</component>
</project>

View File

@ -0,0 +1,20 @@
# Build script for the tests of the CUDA adaptor (thrust and cublas test subdirectories).
cmake_minimum_required(VERSION 3.11)
project(boost-multi-adaptor-cuda VERSION 0.1 LANGUAGES CXX)
set(CMAKE_VERBOSE_MAKEFILE ON)
# host code: C++17, required, without compiler extensions
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)
# CUDA is enabled after project() rather than being listed in LANGUAGES
enable_language(CUDA)
# device compiler flags: C++17, extended lambdas, relaxed constexpr, plus front-end diagnostic options passed via -Xcudafe
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -std=c++17 --extended-lambda --expt-relaxed-constexpr -Xcudafe \"--display_error_number --set_error_limit=2 --verbose_diagnostics --emit_warnings_as_errors --diag_suppress=implicit_return_from_non_void_function\"")
find_package(Boost REQUIRED COMPONENTS unit_test_framework)
enable_testing()
list(APPEND CMAKE_CTEST_ARGUMENTS "--output-on-failure") # CMAKE_CTEST_ARGUMENTS needs cmake 3.17; older versions ignore it
add_subdirectory(thrust/test)
add_subdirectory(cublas/test)

View File

@ -0,0 +1,235 @@
#ifdef COMPILATION_INSTRUCTIONS//-*-indent-tabs-mode: t; c-basic-offset: 4; tab-width: 4;-*-
nvcc -D_TEST_MULTI_ADAPTORS_CUDA_ALGORITHMS_COPY -x cu $0 -o $0x -lboost_unit_test_framework -lboost_timer&&$0x&&
clang++ -D_TEST_MULTI_ADAPTORS_CUDA_ALGORITHMS_COPY -x cuda --cuda-gpu-arch=sm_61 -std=c++14 $0 -o $0x -lcudart -lboost_unit_test_framework -lboost_timer&&$0x&&
rm $0x; exit
#endif
#ifndef MULTI_ADAPTORS_CUDA_ALGORITHMS_COPY_HPP
#define MULTI_ADAPTORS_CUDA_ALGORITHMS_COPY_HPP
#include<cassert>
#include<iostream>
#include "../../../adaptors/cuda.hpp"
//#include "../algorithms/for_each.hpp"
//#include "/home/correaa/prj/alf/boost/iterator/zipper.hpp"
// HD expands to `__host__ __device__` when __CUDACC__ is defined and to nothing otherwise.
// A definition of HD made before this point is left untouched.
#ifndef HD
#if defined(__CUDACC__)
#define HD __host__ __device__
#else
#define HD
#endif
#endif
namespace boost{
namespace multi{namespace cuda{

// The overloads below are compiled out (`#if 0`) and additionally guarded by assert(0).
// They copy 1D strided ranges with cudaMemcpy (unit strides) or cudaMemcpy2D (general strides).
#if 0
// Device -> host copy of the range [f, l) into d; returns the iterator past the last written element.
template<typename From, typename To, typename = std::enable_if_t<std::is_trivially_assignable<To&, From>{}> >
array_iterator<To, 1, To*> copy(
	array_iterator<From, 1, memory::cuda::ptr<To>> f,
	array_iterator<From, 1, memory::cuda::ptr<To>> l,
	array_iterator<To, 1, To*> d
){
	assert(0);
	assert(f.stride() == l.stride()); static_assert(sizeof(From) == sizeof(To), "!");
	auto n = std::distance(f, l);
	if(f.stride()==1 and d.stride()==1){
		// contiguous on both sides: one flat transfer
		auto s = cudaMemcpy(d.data(), raw_pointer_cast(f.data()), n*sizeof(To), cudaMemcpyDeviceToHost); assert( s == cudaSuccess );
	}else{
		// strided: n rows of one element each, the pitches being the strides in bytes
		auto s = cudaMemcpy2D(d.data(), d.stride()*sizeof(To), raw_pointer_cast(f.data()), f.stride()*sizeof(To), sizeof(To), n, cudaMemcpyDeviceToHost);
		assert( s == cudaSuccess );
	}
	return d + n;
}

// Device -> device copy of the range [f, l) into d; returns the iterator past the last written element.
// NOTE(review): the declared return type uses To* while `d` is based on memory::cuda::ptr<To2>;
// confirm that `d + n` converts before re-enabling this overload.
template<typename From, typename From2, typename To, typename To2, typename = std::enable_if_t<std::is_trivially_assignable<To&, From>{}> >
array_iterator<To, 1, To*> copy(
	array_iterator<From, 1, memory::cuda::ptr<From2>> f,
	array_iterator<From, 1, memory::cuda::ptr<From2>> l,
	array_iterator<To , 1, memory::cuda::ptr<To2> > d
){
	assert(0);
	assert(f.stride() == l.stride()); static_assert(sizeof(From) == sizeof(To), "!");
	auto n = std::distance(f, l);
	if(f.stride()==1 and d.stride()==1){
		// source and destination are both device pointers, so the transfer kind must be device-to-device
		// (this branch used cudaMemcpyDeviceToHost, unlike the strided branch below)
		auto s = cudaMemcpy(raw_pointer_cast(d.data()), raw_pointer_cast(f.data()), n*sizeof(To), cudaMemcpyDeviceToDevice); assert( s == cudaSuccess );
	}else{
		auto s = cudaMemcpy2D(raw_pointer_cast(d.data()), d.stride()*sizeof(To), raw_pointer_cast(f.data()), f.stride()*sizeof(To), sizeof(To), n, cudaMemcpyDeviceToDevice);
		assert( s == cudaSuccess );
	}
	return d + n;
}
#endif

}}
}
#ifdef _TEST_MULTI_ADAPTORS_CUDA_ALGORITHMS_COPY
#define BOOST_TEST_MODULE "C++ Unit Tests for Multi CUDA copy"
#define BOOST_TEST_DYN_LINK
#include<boost/test/unit_test.hpp>
#include "../../../adaptors/cuda.hpp"
#include <thrust/for_each.h>
#include <thrust/execution_policy.h>
#include <boost/timer/timer.hpp>
#if __cpp_lib_parallel_algorithm >= 201603
#include<execution>
#endif
namespace multi = boost::multi;
namespace cuda = multi::memory::cuda;
// Deleted function templates; nothing in the tests below calls them.
// NOTE(review): presumably debugging aids that make the compiler spell out a deduced type in its error message — confirm.
template<class T> __device__ void WHAT(T&&) = delete;
template<class T> __device__ void WHAT(int) = delete;
template<class T> T&& what(T&&) = delete;
// Device-to-device assignment through the views returned by operator() (`B() = A()`).
// Only checks that the assignment compiles and runs; the comparison is commented out.
BOOST_AUTO_TEST_CASE(copy_by_iterator){
// 198x23 host array filled with std::rand values
auto const A_cpu = []{
multi::array<double, 2> r({198, 23});
std::generate(r.data_elements(), r.data_elements()+r.num_elements(), &std::rand);
return r;
}();
multi::cuda::array<double, 2> A = A_cpu;
multi::cuda::array<double, 2> B(extensions(A));
B() = A();
// BOOST_REQUIRE( A[13] == B[13] );
}
// Device-to-device whole-array assignment (`B = A`).
// Only checks that the assignment compiles and runs; the comparison is commented out.
BOOST_AUTO_TEST_CASE(copy_by_pointer){
// 198x23 host array filled with std::rand values
auto const A_cpu = []{
multi::array<double, 2> r({198, 23});
std::generate(r.data_elements(), r.data_elements()+r.num_elements(), &std::rand);
return r;
}();
multi::cuda::array<double, 2> A = A_cpu;
multi::cuda::array<double, 2> B(extensions(A));
B = A;
// BOOST_REQUIRE( A[13] == B[13] );
}
// Timing comparison of three ways to copy 2^27 doubles between device arrays:
// thrust::copy_n, an unqualified copy_n call, and multi::adl::copy_n. Each is timed by an auto_cpu_timer.
// No result is asserted in the enabled part.
BOOST_AUTO_TEST_CASE(cuda_copy){
multi::cuda::array<double, 1> A(1<<27); CUDA_SLOW( A[10] = 99. );
multi::cuda::array<double, 1> B(size(A));
{
boost::timer::auto_cpu_timer t{"thrust copy_n cuda::ptr %ws wall, %us user + %ss system = %ts CPU (%p%)\n"};
thrust::copy_n(thrust::device, A.data_elements(), A.num_elements(), B.data_elements());
}
{
// unqualified call: the overload is chosen by argument-dependent lookup on the pointer type
boost::timer::auto_cpu_timer t{"cuda copy_n cuda::ptr copy_n %ws wall, %us user + %ss system = %ts CPU (%p%)\n"};
copy_n(A.data_elements(), A.num_elements(), B.data_elements());
}
{
// NOTE(review): this timer label is identical to the previous one although the call goes through multi::adl::copy_n
boost::timer::auto_cpu_timer t{"cuda copy_n cuda::ptr copy_n %ws wall, %us user + %ss system = %ts CPU (%p%)\n"};
multi::adl::copy_n(A.data_elements(), A.num_elements(), B.data_elements());
}
// further timing variants and value checks, currently compiled out
#if 0
{
boost::timer::auto_cpu_timer t{"cuda ptr copy_n %ws wall, %us user + %ss system = %ts CPU (%p%)\n"};
/*multi::cuda::*/copy_n(A.data_elements(), A.num_elements(), B.data_elements());
}
{
boost::timer::auto_cpu_timer t{"indirect cuda ptr copy_n %ws wall, %us user + %ss system = %ts CPU (%p%)\n"};
B = A;
}
{
boost::timer::auto_cpu_timer t{"indirect cuda ptr uninitialized_copy_n %ws wall, %us user + %ss system = %ts CPU (%p%)\n"};
multi::cuda::array<double, 1> C = A;
BOOST_REQUIRE( CUDA_SLOW( C[10] == 99. ) );
}
{
boost::timer::auto_cpu_timer t{"indirect cuda ptr uninitialized_copy_n %ws wall, %us user + %ss system = %ts CPU (%p%)\n"};
multi::cuda::array<double, 1> C = A;//();
BOOST_REQUIRE( CUDA_SLOW( C[10] == 99. ) );
}
BOOST_REQUIRE( CUDA_SLOW( B[10] == 99. ) );
CUDA_SLOW( B[10] = 10. );
{
boost::timer::auto_cpu_timer t{"thrust copy_n %ws wall, %us user + %ss system = %ts CPU (%p%)\n"};
thrust::copy_n(thrust::device, begin(A), size(A), begin(B));
}
BOOST_REQUIRE( CUDA_SLOW( B[10] == 99. ) );
#endif
/* multi::cuda::for_each_n(
boost::iterators::zip(begin(A), begin(B)),
size(A),
[]__device__(auto&& e){
std::get<1>(e) = std::get<0>(e);
printf( "**** %f %f\n", static_cast<double const&>(std::get<0>(e)), static_cast<double const&>(std::get<1>(e)) );
}
);*/
// auto l =
// BOOST_REQUIRE( l == end(B) );
// std::cout << B[8] << std::endl;
// multi::cuda::array<double, 1> A(10, 99.);
// BOOST_REQUIRE( CUDA_SLOW( A[5] == 99. ) );
// int uno = 1.;
// for_each(begin(A), end(A), [uno]__device__(auto&& e){e = uno;});
// BOOST_REQUIRE( CUDA_SLOW( A[5] == 1. ) );
}
#if 0
// Disabled by the enclosing `#if 0`: a device lambda passed to for_each overwrites every element
// of a 10-element device array (99 before, 1 after).
BOOST_AUTO_TEST_CASE(cuda_for_each){
multi::cuda::array<double, 1> A(10, 99.);
BOOST_REQUIRE( CUDA_SLOW( A[5] == 99. ) );
int uno = 1.;
for_each(begin(A), end(A), [uno]__device__(auto&& e){e = uno;});
BOOST_REQUIRE( CUDA_SLOW( A[5] == 1. ) );
}
// Disabled by the enclosing `#if 0`: times several ways of filling a 2^29-element managed array
// (multi::cuda::for_each, thrust::for_each, std::for_each, parallel std::for_each when available).
// After each fill, the element ten positions before the end is checked against the value just written.
BOOST_AUTO_TEST_CASE(cuda_timing){
multi::cuda::managed::array<double, 1> A(1<<29); //std::cout << A.size()*8 << std::endl;
{
boost::timer::auto_cpu_timer t{"cuda cold %ws wall, %us user + %ss system = %ts CPU (%p%)\n"};
multi::cuda::for_each(begin(A), end(A), []__device__(auto&& e){e = 11.;});
} BOOST_REQUIRE( CUDA_SLOW( A[size(A) - 10] == 11.) );
{
boost::timer::auto_cpu_timer t{"cuda %ws wall, %us user + %ss system = %ts CPU (%p%)\n"};
multi::cuda::for_each(begin(A), end(A), []__device__(auto&& e){e = 22.;});
} BOOST_REQUIRE( CUDA_SLOW( A[size(A) - 10] == 22.) );
{
boost::timer::auto_cpu_timer t{"thrust %ws wall, %us user + %ss system = %ts CPU (%p%)\n"};
thrust::for_each(thrust::device, begin(A), end(A), []__device__(auto&& e){e = 222.;});
} BOOST_REQUIRE( CUDA_SLOW( A[size(A) - 10] == 222.) );
{
// host-side fill without a timer
std::for_each(begin(A), end(A), [](auto&& e){e = 55.;});
} BOOST_REQUIRE( CUDA_SLOW( A[size(A) - 10] == 55.) );
#if __cpp_lib_parallel_algorithm >= 201603
{
boost::timer::auto_cpu_timer t{"par %ws wall, %us user + %ss system = %ts CPU (%p%)\n"};
std::for_each(std::execution::par_unseq, begin(A), end(A), [](auto&& e){e = 33.;});
} BOOST_REQUIRE( CUDA_SLOW( A[size(A) - 10] == 33.) );
#endif
{
boost::timer::auto_cpu_timer t{"seq %ws wall, %us user + %ss system = %ts CPU (%p%)\n"};
std::for_each(begin(A), end(A), [](auto&& e){e = 55.;});
} BOOST_REQUIRE( CUDA_SLOW( A[size(A) - 10] == 55.) );
{
// device fill again, after the array was last written from the host
boost::timer::auto_cpu_timer t{"cuda cold %ws wall, %us user + %ss system = %ts CPU (%p%)\n"};
multi::cuda::for_each(begin(A), end(A), []__device__(auto&& e){e = 66.;});
} BOOST_REQUIRE( CUDA_SLOW( A[size(A) - 10] == 66.) );
{
boost::timer::auto_cpu_timer t{"cuda %ws wall, %us user + %ss system = %ts CPU (%p%)\n"};
multi::cuda::for_each(begin(A), end(A), []__device__(auto&& e){e = 77.;});
} BOOST_REQUIRE( CUDA_SLOW( A[size(A) - 10] == 77.) );
}
#endif
#endif
#endif

View File

@ -0,0 +1,7 @@
// -*-indent-tabs-mode:t;c-basic-offset:4;tab-width:4;autowrap:nil;-*-
// © Alfredo A. Correa 2021
#pragma once
#include "./cublas/context.hpp"

View File

@ -0,0 +1,22 @@
#ifndef MULTI_ADAPTORS_CUDA_CUBLAS_CALL_HPP
#define MULTI_ADAPTORS_CUDA_CUBLAS_CALL_HPP
#include "../cublas/error.hpp"
#include<cuda_runtime.h> // cudaDeviceSynchronize
namespace boost{
namespace multi::cuda::cublas{
template<auto Function, class... Args> // needs C++17
void call(Args... args){
auto e = static_cast<enum cublas::error>(Function(args...));
if(e != cublas::error::success) throw std::system_error{e, "cannot call function "+ std::string{__PRETTY_FUNCTION__}};
}
#define CUBLAS_(F) call<F>
}
}
#endif

View File

@ -0,0 +1,212 @@
// -*-indent-tabs-mode:t;c-basic-offset:4;tab-width:4;autowrap:nil;-*-
// © Alfredo A. Correa 2020-2021
#ifndef MULTI_ADAPTORS_CUDA_CUBLAS_CONTEXT_HPP
#define MULTI_ADAPTORS_CUDA_CUBLAS_CONTEXT_HPP
#include "../../../config/MARK.hpp"
#include "../../../adaptors/cuda/cublas/call.hpp"
#include "../../../adaptors/blas/traits.hpp"
#include "../../../adaptors/blas/core.hpp"
#include "../../../memory/adaptors/cuda/ptr.hpp"
#include "../../../memory/adaptors/cuda/managed/ptr.hpp"
#include <thrust/system/cuda/memory.h>
#include<mutex>
namespace boost{
namespace multi::cuda::cublas{
// Maps the BLAS transposition flag ('N', 'T' or 'C') to cublasOperation_t.
// Any other character trips the assert; when asserts are compiled out it yields a value-initialized cublasOperation_t.
class operation{
	cublasOperation_t impl_;
public:
	operation(char trans) : impl_{[trans]{
		if(trans == 'N'){return CUBLAS_OP_N;}
		if(trans == 'T'){return CUBLAS_OP_T;}
		if(trans == 'C'){return CUBLAS_OP_C;}
		assert(0);
		return cublasOperation_t{};
	}()}{}
	operator cublasOperation_t() const{return impl_;}
};
// Maps the BLAS side flag ('L' or 'R') to cublasSideMode_t.
// Any other character trips the assert; when asserts are compiled out it yields a value-initialized cublasSideMode_t.
class side{
	cublasSideMode_t impl_;
public:
	side(char trans) : impl_{[trans]{
		if(trans == 'L'){return CUBLAS_SIDE_LEFT;}
		if(trans == 'R'){return CUBLAS_SIDE_RIGHT;}
		assert(0);
		return cublasSideMode_t{};
	}()}{}
	operator cublasSideMode_t() const{return impl_;}
};
// Maps the BLAS triangle flag ('L' or 'U') to cublasFillMode_t.
// Any other character trips the assert; when asserts are compiled out it yields a value-initialized cublasFillMode_t.
class filling{
	cublasFillMode_t impl_;
public:
	filling(char trans) : impl_{[trans]{
		if(trans == 'L'){return CUBLAS_FILL_MODE_LOWER;}
		if(trans == 'U'){return CUBLAS_FILL_MODE_UPPER;}
		assert(0);
		return cublasFillMode_t{};
	}()}{}
	operator cublasFillMode_t() const{return impl_;}
};
// Maps the BLAS diagonal flag ('N' = non-unit, 'U' = unit) to cublasDiagType_t.
// Any other character trips the assert; when asserts are compiled out it yields a value-initialized cublasDiagType_t.
class diagonal{
	cublasDiagType_t impl_;
public:
	diagonal(char trans) : impl_{[trans]{
		if(trans == 'N'){return CUBLAS_DIAG_NON_UNIT;}
		if(trans == 'U'){return CUBLAS_DIAG_UNIT;}
		assert(0);
		return cublasDiagType_t{};
	}()}{}
	operator cublasDiagType_t() const{return impl_;}
};
using blas::is_z;
using blas::is_d;
using std::is_assignable;
using std::is_convertible_v;
class context : private std::unique_ptr<std::decay_t<decltype(*cublasHandle_t{})>, decltype(&cublasDestroy)>{
using pimpl_t = std::unique_ptr<std::decay_t<decltype(*cublasHandle_t{})>, decltype(&cublasDestroy)>;
// Queries the handle (cublasGetStream) for the CUDA stream it is bound to; throws via cublas::call on failure.
cudaStream_t stream() const{
	cudaStream_t bound;
	cublas::call<cublasGetStream>(this->get(), &bound);
	return bound;
}
template<auto Function, class... Args> // needs C++17
void sync_call(Args... args){
call<Function>(this->get(), args...);
this->synchronize();
}
public:
using pimpl_t::get;
// Returns the calling thread's context; it is a function-local thread_local,
// created the first time the thread calls this function.
static context& get_instance(){
	thread_local context per_thread;
	return per_thread;
}
context() : pimpl_t{[]{cublasHandle_t h; cublasCreate(&h); return h;}(), &cublasDestroy}{}
using ssize_t = int;
// Returns the value reported by cublasGetVersion; the query is made with a null handle.
static int version(){
	int number;
	cublas::call<cublasGetVersion>(nullptr, &number);
	return number;
}
void synchronize(){
// cudaError_t e = cudaDeviceSynchronize();
cudaError_t e = cudaStreamSynchronize(stream());
if(e != cudaSuccess) throw std::runtime_error{"cannot synchronize stream in cublas context"};
}
// Double-precision axpy through cublasDaxpy on n elements, followed by a stream synchronization.
// Enabled only when both element types satisfy is_d.
// NOTE(review): `alpha` is reinterpreted as double const* without a constraint on ALPHA — confirm callers only pass double.
template<class ALPHA, class XP, class X = typename std::pointer_traits<XP>::element_type, class YP, class Y = typename std::pointer_traits<YP>::element_type,
	std::enable_if_t<is_d<X>{} and is_d<Y>{}, int> = 0
//	std::enable_if_t<is_d<X>{} and is_d<Y>{} and is_assignable<Y&, ALPHA{}*X{} + Y{}>{} and is_convertible_v<XP, thrust::cuda::pointer<X>> and is_convertible_v<YP, thrust::cuda::pointer<Y>>, int> = 0
>
void axpy(ssize_t n, ALPHA const* alpha, XP x, ssize_t incx, YP y, ssize_t incy){
	double const* const alpha_raw = (double const*)alpha;
	double const* const x_raw     = (double const*)raw_pointer_cast(x);
	double*       const y_raw     = (double*)raw_pointer_cast(y);
	sync_call<cublasDaxpy>(n, alpha_raw, x_raw, incx, y_raw, incy);
}
// Double-complex matrix product through cublasZgemm, followed by a stream synchronization.
// Enabled only when the three element types and both scalars satisfy is_z and the matrix pointers
// convert to memory::cuda::ptr. Arguments follow the BLAS gemm convention and are forwarded unchanged.
template<class ALPHA, class AAP, class AA = typename std::pointer_traits<AAP>::element_type, class BBP, class BB = typename std::pointer_traits<BBP>::element_type, class BETA, class CCP, class CC = typename std::pointer_traits<CCP>::element_type,
	std::enable_if_t<
		is_z<AA>{} and is_z<BB>{} and is_z<CC>{} and is_z<ALPHA>{} and is_z<BETA>{} and is_assignable<CC&, decltype(ALPHA{}*AA{}*BB{})>{} and
		std::is_convertible_v<AAP, memory::cuda::ptr<AA>> and std::is_convertible_v<BBP, memory::cuda::ptr<BB>> and std::is_convertible_v<CCP, memory::cuda::ptr<CC>>
	,int> =0
>
void gemm(char transA, char transB, ssize_t m, ssize_t n, ssize_t k, ALPHA const* alpha, AAP aa, ssize_t lda, BBP bb, ssize_t ldb, BETA const* beta, CCP cc, ssize_t ldc){
	MULTI_MARK_SCOPE("cublasZgemm");
	using z_in = cuDoubleComplex const*; // read-only operands as cuBLAS expects them
	sync_call<cublasZgemm>(
		cublas::operation{transA}, cublas::operation{transB},
		m, n, k,
		(z_in)alpha,
		(z_in)raw_pointer_cast(aa), lda,
		(z_in)raw_pointer_cast(bb), ldb,
		(z_in)beta,
		(cuDoubleComplex*)raw_pointer_cast(cc), ldc
	);
}
// Double-precision matrix product through cublasDgemm, followed by a stream synchronization.
// Enabled only when the three element types satisfy is_d and the matrix pointers convert to memory::cuda::ptr.
// NOTE(review): alpha and beta are reinterpreted as double const* without a constraint on ALPHA/BETA — confirm callers only pass double.
template<class ALPHA, class AAP, class AA = typename std::pointer_traits<AAP>::element_type, class BBP, class BB = typename std::pointer_traits<BBP>::element_type, class BETA, class CCP, class CC = typename std::pointer_traits<CCP>::element_type,
	std::enable_if_t<
		is_d<AA>{} and is_d<BB>{} and is_d<CC>{} and is_assignable<CC&, decltype(ALPHA{}*AA{}*BB{})>{} and
		std::is_convertible_v<AAP, memory::cuda::ptr<AA>> and std::is_convertible_v<BBP, memory::cuda::ptr<BB>> and std::is_convertible_v<CCP, memory::cuda::ptr<CC>>
	,int> =0
>
void gemm(char transA, char transB, ssize_t m, ssize_t n, ssize_t k, ALPHA const* alpha, AAP aa, ssize_t lda, BBP bb, ssize_t ldb, BETA const* beta, CCP cc, ssize_t ldc){
	MULTI_MARK_SCOPE("cublasDgemm");
	using d_in = double const*; // read-only operands as cuBLAS expects them
	sync_call<cublasDgemm>(
		cublas::operation{transA}, cublas::operation{transB},
		m, n, k,
		(d_in)alpha,
		(d_in)raw_pointer_cast(aa), lda,
		(d_in)raw_pointer_cast(bb), ldb,
		(d_in)beta,
		(double*)raw_pointer_cast(cc), ldc
	);
}
// Double-complex triangular solve through cublasZtrsm, followed by a stream synchronization.
// side/ul/transA/diag are the BLAS character flags; m, n, lda, ldb follow the BLAS trsm convention;
// `bb` is overwritten by cuBLAS with the result.
//
// The constraints do not require ALPHA to be complex (a real scalar also satisfies the assignability
// checks), while cuBLAS reads a full cuDoubleComplex through the alpha pointer. A non-complex alpha is
// therefore widened to (alpha, 0) first; previously its address was reinterpreted directly, which read
// past the end of the scalar.
template<class ALPHA, class AAP, class AA = typename pointer_traits<AAP>::element_type, class BBP, class BB = typename pointer_traits<BBP>::element_type,
	std::enable_if_t<
		is_z<AA>{} and is_z<BB>{} and is_assignable<BB&, decltype(AA{}*BB{}/ALPHA{})>{} and is_assignable<BB&, decltype(ALPHA{}*BB{}/AA{})>{} and
		is_convertible_v<AAP, memory::cuda::ptr<AA>> and is_convertible_v<BBP, memory::cuda::ptr<BB>>
	,int> =0
>
void trsm(char side, char ul, char transA, char diag, ssize_t m, ssize_t n, ALPHA alpha, AAP aa, ssize_t lda, BBP bb, ssize_t ldb){
	cuDoubleComplex const alpha_z = [&]{
		if constexpr(is_z<ALPHA>{}){
			// already a double-complex value: reinterpret it as before
			return *(cuDoubleComplex const*)&alpha;
		}else{
			// real (or otherwise double-convertible) scalar: zero imaginary part
			return cuDoubleComplex{static_cast<double>(alpha), 0.};
		}
	}();
	sync_call<cublasZtrsm>(cublas::side{side}, cublas::filling{ul}, cublas::operation{transA}, cublas::diagonal{diag}, m, n, (cuDoubleComplex const*)&alpha_z, (cuDoubleComplex const*)raw_pointer_cast(aa), lda, (cuDoubleComplex*)raw_pointer_cast(bb), ldb);
}
// Double-precision dot product: forwards to cublasDdot through sync_call.
// Participates in overload resolution only when XX, YY and RR satisfy is_d, XX*YY is assignable to RR,
// the input pointers are convertible to memory::cuda::ptr and the result pointer is convertible to a raw RR*.
//   n          number of elements
//   xx, incx   first vector and its stride
//   yy, incy   second vector and its stride
//   rr         raw pointer receiving the result
// In debug builds the handle is checked to be in CUBLAS_POINTER_MODE_HOST before the call.
template<
	class XXP, class XX = typename std::pointer_traits<XXP>::element_type,
	class YYP, class YY = typename std::pointer_traits<YYP>::element_type,
	class RRP, class RR = typename std::pointer_traits<RRP>::element_type,
	std::enable_if_t<
		is_d<XX>{} and is_d<YY>{} and is_d<RR>{} and is_assignable<RR&, decltype(XX{}*YY{})>{} and
		is_convertible_v<XXP, memory::cuda::ptr<XX>> and is_convertible_v<YYP, memory::cuda::ptr<YY>> and is_convertible_v<RRP, RR*>
	, int> =0
>
void dot(int n, XXP xx, int incx, YYP yy, int incy, RRP rr){
	cublasPointerMode_t mode;
	auto s = cublasGetPointerMode(get(), &mode); assert( s == CUBLAS_STATUS_SUCCESS );
	assert( mode == CUBLAS_POINTER_MODE_HOST );
	(void)s; // only read by the assert above, which disappears under NDEBUG (avoids an unused-variable warning)
	sync_call<cublasDdot>(n, raw_pointer_cast(xx), incx, raw_pointer_cast(yy), incy, rr);
}
// Complex conjugated dot product: forwards to cublasZdotc through sync_call.
// Participates in overload resolution only when XX, YY and RR satisfy is_z, XX*YY is assignable to RR,
// the input pointers are convertible to memory::cuda::ptr and the result pointer is convertible to a raw RR*.
//   n          number of elements
//   xx, incx   first vector and its stride
//   yy, incy   second vector and its stride
//   rr         raw pointer receiving the result
// In debug builds the handle is checked to be in CUBLAS_POINTER_MODE_HOST before the call.
template<
	class XXP, class XX = typename std::pointer_traits<XXP>::element_type,
	class YYP, class YY = typename std::pointer_traits<YYP>::element_type,
	class RRP, class RR = typename std::pointer_traits<RRP>::element_type,
	std::enable_if_t<
		is_z<XX>{} and is_z<YY>{} and is_z<RR>{} and is_assignable<RR&, decltype(XX{}*YY{})>{} and
		is_convertible_v<XXP, memory::cuda::ptr<XX>> and is_convertible_v<YYP, memory::cuda::ptr<YY>> and is_convertible_v<RRP, RR*>
	, int> =0
>
void dotc(int n, XXP xx, int incx, YYP yy, int incy, RRP rr){
	cublasPointerMode_t mode;
	auto s = cublasGetPointerMode(get(), &mode); assert( s == CUBLAS_STATUS_SUCCESS );
	assert( mode == CUBLAS_POINTER_MODE_HOST );
	(void)s; // only read by the assert above, which disappears under NDEBUG (avoids an unused-variable warning)
	sync_call<cublasZdotc>(n, (cuDoubleComplex const*)raw_pointer_cast(xx), incx, (cuDoubleComplex const*)raw_pointer_cast(yy), incy, (cuDoubleComplex*)rr);
}
// template<class ALPHA, class AAP, class AA = typename pointer_traits<AAP>::element_type, class BETA, class CCP, class CC = typename pointer_traits<CCP>::element_type,
// std::enable_if_t<
// is_z<AA>{} and is_z<CC>{} and is_d<ALPHA>{} and is_d<BETA>{} and is_assignable<CC&, decltype(ALPHA{}*AA{}*AA{})>{} and
// is_convertible_v<AAP, AA*> and is_convertible_v<CCP, CC*>
// , int
// > =0
// >
// void herk(char ul, char transA, ssize_t n, ssize_t k, ALPHA const* alpha, AAP aa, ssize_t lda, BETA const* beta, CCP cc, ssize_t ldc){
// MULTI_MARK_SCOPE("cublasZherk");
// cublas::call<cublasZherk>(this->get(), cublas::filling{ul}, cublas::operation{transA}, n, k, (double const*)&alpha, (cuDoubleComplex const*)raw_pointer_cast(aa), lda, (double const*)&beta, (cuDoubleComplex*)raw_pointer_cast(cc), ldc);
// }
};
}
}
namespace boost::multi::blas{

// Mark the cuBLAS context, by value and by lvalue reference, as a BLAS context type.
template<> struct is_context<boost::multi::cuda::cublas::context > : std::true_type{};
template<> struct is_context<boost::multi::cuda::cublas::context&> : std::true_type{};

// Context used by default for any pointer convertible to memory::cuda::ptr<T>:
// the object returned by cublas::context::get_instance().
template<
	class Ptr,
	class T = typename std::pointer_traits<Ptr>::element_type,
	std::enable_if_t<std::is_convertible<Ptr, multi::memory::cuda::ptr<T>>::value, int> = 0
>
boost::multi::cuda::cublas::context* default_context_of(Ptr const&){
	return &boost::multi::cuda::cublas::context::get_instance();
}

// Same default for managed-memory pointers.
template<class T>
boost::multi::cuda::cublas::context* default_context_of(boost::multi::memory::cuda::managed::ptr<T> const&){
	return &boost::multi::cuda::cublas::context::get_instance();
}

}
#endif

View File

@ -0,0 +1,93 @@
#ifdef COMPILATION// -*-indent-tabs-mode:t;c-basic-offset:4;tab-width:4;autowrap:nil;-*-
$CXXX $CXXFLAGS $0 -o $0.$X `pkg-config --cflags --libs cudart-11.0 cublas-11.0 blas` -lboost_unit_test_framework&&$0.$X&&rm $0.$X;exit
#endif
// © Alfredo A. Correa 2020
#ifndef MULTI_ADAPTORS_CUDA_CUBLAS_ERROR_HPP
#define MULTI_ADAPTORS_CUDA_CUBLAS_ERROR_HPP
#include<cublas_v2.h> // cublasStatus_t
#include<string>
#include<system_error> // std::error_category
#include<type_traits> // std::underlying_type
namespace boost{
namespace multi::cuda::cublas{
// Scoped mirror of cublasStatus_t: same underlying type, and each enumerator takes the value
// of the CUBLAS_STATUS_* constant it is initialized from.
enum class error : typename std::underlying_type<cublasStatus_t>::type{
success = CUBLAS_STATUS_SUCCESS,
not_initialized = CUBLAS_STATUS_NOT_INITIALIZED,
allocation_failed = CUBLAS_STATUS_ALLOC_FAILED,
invalid_value = CUBLAS_STATUS_INVALID_VALUE,
architecture_mismatch = CUBLAS_STATUS_ARCH_MISMATCH,
mapping_error = CUBLAS_STATUS_MAPPING_ERROR,
execution_failed = CUBLAS_STATUS_EXECUTION_FAILED,
internal_error = CUBLAS_STATUS_INTERNAL_ERROR,
not_supported = CUBLAS_STATUS_NOT_SUPPORTED,
license_error = CUBLAS_STATUS_LICENSE_ERROR
};
std::string inline error_string(enum cublas::error err){ //https://stackoverflow.com/questions/13041399/equivalent-of-cudageterrorstring-for-cublas
switch(err){
case cublas::error::success : return "CUBLAS_STATUS_SUCCESS" ;
case cublas::error::not_initialized : return "CUBLAS_STATUS_NOT_INITIALIZED" ;
case cublas::error::allocation_failed : return "CUBLAS_STATUS_ALLOC_FAILED" ;
case cublas::error::invalid_value : return "CUBLAS_STATUS_INVALID_VALUE" ;
case cublas::error::architecture_mismatch: return "CUBLAS_STATUS_ARCH_MISMATCH" ;
case cublas::error::mapping_error : return "CUBLAS_STATUS_MAPPING_ERROR" ;
case cublas::error::execution_failed : return "CUBLAS_STATUS_EXECUTION_FAILED";
case cublas::error::internal_error : return "CUBLAS_STATUS_INTERNAL_ERROR" ;
case cublas::error::not_supported : return "CUBLAS_STATUS_NOT_SUPPORTED" ;
case cublas::error::license_error : return "CUBLAS_STATUS_LICENSE_ERROR" ;
}
return "cublas status <unknown>";
}
struct error_category : std::error_category{
char const* name() const noexcept override{return "cublas wrapper";}
std::string message(int err) const override{return error_string(static_cast<enum cublas::error>(err));}
static error_category& instance(){static cublas::error_category instance; return instance;}
};
// Builds a std::error_code from a cublas::error, tagged with the cublas error_category.
inline std::error_code make_error_code(cublas::error err) noexcept{
	auto const value = static_cast<int>(err);
	return {value, cublas::error_category::instance()};
}
}
}
// Opt cublas::error into the std::error_code machinery (implicit conversion via make_error_code).
namespace std{
template<> struct is_error_code_enum<::boost::multi::cuda::cublas::error> : true_type{};
}
#if not __INCLUDE_LEVEL__ // _TEST_MULTI_ADAPTORS_BLAS_CUDA
#define BOOST_TEST_MODULE "C++ Unit Tests for Multi cuBLAS"
#define BOOST_TEST_DYN_LINK
#include<boost/test/unit_test.hpp>
//#include "../../array.hpp"
//#include "../../utility.hpp"
//#include "../../adaptors/cuda.hpp"
//#include "../../adaptors/blas.hpp"
//#include "../../adaptors/blas/cuda.hpp"
#include<cassert>
namespace multi = boost::multi;
// Checks that a std::system_error can be built from a cublas error code and is caught as such.
BOOST_AUTO_TEST_CASE(multi_cublas_error){
BOOST_CHECK_THROW(
throw (std::system_error{multi::cuda::cublas::make_error_code(multi::cuda::cublas::error::not_initialized), "error test"}),
std::system_error
);
}
#endif
#endif

View File

@ -0,0 +1,71 @@
# Build configuration for the cuBLAS adaptor tests.
# target_link_directories(), used for every test target in this file, requires CMake 3.13.
cmake_minimum_required(VERSION 3.13)
set(CMAKE_VERBOSE_MAKEFILE ON)
project(boost-multi-adaptors-cuda-cublas-test VERSION 0.1 LANGUAGES CXX CUDA)

find_package(Boost REQUIRED COMPONENTS unit_test_framework)

# Host BLAS: libraries from FindBLAS, header location searched by hand.
find_package(BLAS REQUIRED)
find_path(BLAS_INCLUDE_DIRS cblas.h
  /usr/include
  /usr/local/include
  $ENV{BLAS_HOME}/include)
link_libraries(${BLAS_LIBRARIES})
# include_directories() is directory-scoped: it takes neither a target nor a PRIVATE keyword
# (those would be treated as directory names), and TEST_EXE is not defined at this point.
include_directories(${BLAS_INCLUDE_DIRS})

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)

if(ENABLE_CUDA OR DEFINED CXXCUDA)
  enable_language(CUDA)
  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr -Xcudafe \"--display_error_number --diag_suppress=implicit_return_from_non_void_function\"")
endif()

# Provides the CUDA_INCLUDE_DIRS / CUDA_LIBRARIES variables consumed by the test targets.
find_package(CUDA)

enable_testing()
list(APPEND CMAKE_CTEST_ARGUMENTS "--output-on-failure") # needs cmake 3.17
include(CTest)

include_directories(${CMAKE_BINARY_DIR})
#file(GLOB TEST_SRCS RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} *.cpp)
# Test sources are listed explicitly; each one becomes an executable "<file>.x" and a CTest test of the same name.
set(TEST_SRCS
#  herk.cu
  gemm.cu
)

foreach(TEST_FILE ${TEST_SRCS})
  set(TEST_EXE "${TEST_FILE}.x")
  add_executable            (${TEST_EXE} ${TEST_FILE})
  if(ENABLE_CUDA OR DEFINED CXXCUDA)
    # Compile the source with the CUDA compiler when CUDA is requested.
    set_source_files_properties(${TEST_FILE} PROPERTIES LANGUAGE CUDA)
    target_compile_options  (${TEST_EXE} PRIVATE -std=c++17)
  endif()
#  target_compile_features   (${TEST_EXE} PUBLIC cxx_std_17)
  target_compile_definitions(${TEST_EXE} PRIVATE "BOOST_PP_VARIADICS")
  target_compile_definitions(${TEST_EXE} PRIVATE ${Boost_DEFINITIONS})
  target_include_directories(${TEST_EXE} PRIVATE ${Boost_INCLUDE_DIRS})
  target_include_directories(${TEST_EXE} PRIVATE ${CUDA_INCLUDE_DIRS})
  target_link_libraries     (${TEST_EXE} PRIVATE ${Boost_LIBRARIES})
  target_link_directories   (${TEST_EXE} PRIVATE ${Boost_LIBRARY_DIRS})
  target_link_libraries     (${TEST_EXE} PRIVATE ${CUDA_LIBRARIES})
  target_link_directories   (${TEST_EXE} PRIVATE ${CUDA_LIBRARY_DIRS})
#  if(NOT ENABLE_CUDA)
#    target_compile_options  (${TEST_EXE} PRIVATE
#       $<$<CXX_COMPILER_ID:GNU>:
#          -Werror -Wall -Wextra -fno-common -Wpedantic -Wformat-truncation -fstack-usage>#-Wconversion
#       $<$<OR:$<CXX_COMPILER_ID:Clang>,$<CXX_COMPILER_ID:AppleClang>>:
#          -Werror -Wall -Wextra -fno-common -Wpedantic -Wmove>
#       $<$<CXX_COMPILER_ID:Intel>:
#          -Werror -Wall -Wextra -fno-common -wd161 -diag-disable=remark -Warray-bounds -Wchar-subscripts -Wcomment -Wenum-compare -Wformat -Wuninitialized -Wmaybe-uninitialized -Wmain -Wnarrowing -Wnonnull -Wparentheses -Wpointer-sign -Wreorder -Wno-return-type -Wsign-compare -Wsequence-point -Wtrigraphs -Wunused-function -Wunused-but-set-variable -Wunused-variable -Wwrite-strings -Werror -diag-error:3846
#       >
#       $<$<CXX_COMPILER_ID:MSVC>:
#          /W4>)
#  endif()
  # Name the target instead of "./<exe>": CTest then substitutes the actual location of the built
  # executable, which also works with multi-config generators and custom output directories.
  add_test(NAME ${TEST_EXE} COMMAND ${TEST_EXE})
endforeach()

View File

@ -0,0 +1,75 @@
// Boost.Test set-up and headers for the cuBLAS gemm tests.
// (The module label previously said "herk", copied from the sibling test file.)
#define BOOST_TEST_MODULE "C++ Unit Tests for Multi CUBLAS gemm"
#define BOOST_TEST_DYN_LINK
#include<boost/test/unit_test.hpp>
//#include "../../../../adaptors/cuda.hpp" // multi::cuda ns
#include "../../../../adaptors/blas/gemm.hpp"
#include "../../../../adaptors/cuda/cublas.hpp"
// Same number of "../" as the two includes above, which address the same adaptors/ tree.
#include "../../../../adaptors/cuda/thrust.hpp"
//#include "../../../complex.hpp"
#include<thrust/complex.h>
namespace multi = boost::multi;
// Placeholder: only constructs a 2x3 host array of doubles; the GPU copy is still commented out
// and nothing is asserted.
BOOST_AUTO_TEST_CASE(multi_cublas_gemm_double){
multi::array<double, 2> const a = {
{ 1., 3., 4.},
{ 9., 7., 1.}
};
// multi::thrust::cuda::array<double, 2> const a_gpu = a;
}
// Placeholder: only constructs a 2x3 host array of std::complex<double>; the GPU copy is still
// commented out and nothing is asserted.
BOOST_AUTO_TEST_CASE(multi_cublas_gemm_complex){
using complex = std::complex<double>; complex const I{0, 1};
multi::array<complex, 2> const a = {
{ 1. + 3.*I, 3.- 2.*I, 4.+ 1.*I},
{ 9. + 1.*I, 7.- 8.*I, 1.- 3.*I}
};
// multi::thrust::cuda::array<complex, 2> const a_gpu = a;
}
//BOOST_AUTO_TEST_CASE(multi_cublas_gemm_thrust_complex){
// using complex = thrust::complex<double>; complex const I{0, 1};
// multi::array<complex, 2> const a = {
// { 1. + 3.*I, 3.- 2.*I, 4.+ 1.*I},
// { 9. + 1.*I, 7.- 8.*I, 1.- 3.*I}
// };
//// multi::thrust::cuda::array<complex, 2> const a_gpu = a;
//}
// Currently an empty test case: the whole body below is commented out, so it passes trivially.
// TODO: re-enable once the GPU gemm/herk paths sketched here are available.
BOOST_AUTO_TEST_CASE(multi_cublas_gemm_complex2){
// using complex = std::complex<double>; complex const I{0, 1};
// multi::array<complex, 2> const a = {
// {1. + 2.*I, 5. + 2.*I},
// {9. - 1.*I, 9. + 1.*I},
// {1. + 1.*I, 2. + 2.*I}
// };
// multi::array<complex, 2> const b = {
// { 11. - 2.*I, 5. + 2.*I},
// { 7. - 3.*I, 2. + 1.*I},
// { 8. - 1.*I, 1. + 1.*I}
// };
//// multi::thrust::cuda::array<complex, 2> const a_gpu = a;
//// multi::thrust::cuda::array<complex, 2> const b_gpu = b;
// namespace blas = multi::blas;
// {
// multi::array<complex, 2> c({3, 3}, 9999.);
// // blas::gemm(1., a, blas::H(b), 0., c);
// // multi::thrust::cuda::array<complex, 2> const c_gpu;
// // blas::gemm(1., a_gpu, b_gpu, c_gpu);
// // BOOST_REQUIRE( c == c_gpu );
// }
// {
// multi::array<complex, 2> c({3, 3}, 9999.);
// blas::herk(1., blas::H(a), c);
// BOOST_REQUIRE( c[2][1] == complex(41, +2) );
// BOOST_REQUIRE( c[1][2] == complex(41, -2) );
// multi::array<complex, 2> const c_copy = blas::herk(1., blas::H(a));
// BOOST_REQUIRE( c_copy == c );
// }
}

View File

@ -0,0 +1,39 @@
#define BOOST_TEST_MODULE "C++ Unit Tests for Multi CUBLAS herk"
#define BOOST_TEST_DYN_LINK
#include<boost/test/unit_test.hpp>
#include "../../../../adaptors/cuda.hpp" // multi::cuda ns
#include "../../../../adaptors/blas/herk.hpp"
namespace multi = boost::multi;
using complex = std::complex<double>;
complex const I{0, 1};
// Exercises blas::herk on a 2x3 complex host array, both directly and through blas::H(a).
// Each variant is run in its in-place form (result written to c) and its value-returning form,
// and the two results are required to agree.
BOOST_AUTO_TEST_CASE(multi_blas_herk){
multi::array<complex, 2> const a = {
{ 1. + 3.*I, 3.- 2.*I, 4.+ 1.*I},
{ 9. + 1.*I, 7.- 8.*I, 1.- 3.*I}
};
// NOTE(review): a_gpu is constructed but never used below; every herk call operates on the host array — confirm intent.
multi::cuda::array<complex, 2> const a_gpu = a;
namespace blas = multi::blas;
{
// herk on a: 2x2 result, pre-filled with a sentinel value.
multi::array<complex, 2> c({2, 2}, 9999.);
blas::herk(1., a, c);
BOOST_REQUIRE( c[1][0] == complex(50., -49.) );
BOOST_REQUIRE( c[0][1] == complex(50., +49.) );
multi::array<complex, 2> const c_copy = blas::herk(1., a);
BOOST_REQUIRE( c == c_copy );
}
{
// herk on blas::H(a): 3x3 result, pre-filled with a sentinel value.
multi::array<complex, 2> c({3, 3}, 9999.);
blas::herk(1., blas::H(a), c);
BOOST_REQUIRE( c[2][1] == complex(41, +2) );
BOOST_REQUIRE( c[1][2] == complex(41, -2) );
multi::array<complex, 2> const c_copy = blas::herk(1., blas::H(a));
BOOST_REQUIRE( c_copy == c );
}
}

View File

@ -0,0 +1,66 @@
#pragma once
#include<driver_types.h> // cudaError_t
#include<cuda_runtime_api.h> // cudaGetErrorString
#include<system_error>
#include<type_traits> // underlying_type
namespace boost{
namespace multi{
namespace cuda{
namespace runtime{
enum /*class*/ error : std::underlying_type<cudaError_t>::type{
success = cudaSuccess, // = 0 The API call returned with no errors. In the case of query calls, this also means that the operation being queried is complete (see cudaEventQuery() and cudaStreamQuery()).
missing_configuration = cudaErrorMissingConfiguration,
// invalid_value /*invalid_argument*/ = cudaErrorInvalidValue, // = 1, This indicates that one or more of the parameters passed to the API call is not within an acceptable range of values.
memory_allocation = cudaErrorMemoryAllocation, // = 2 // The API call failed because it was unable to allocate enough memory to perform the requested operation.
initialization_error = cudaErrorInitializationError,
lauch_failure = cudaErrorLaunchFailure,
lauch_timeout = cudaErrorLaunchTimeout,
lauch_out_of_resources = cudaErrorLaunchOutOfResources,
invalid_device_function = cudaErrorInvalidDeviceFunction,
invalid_configuration = cudaErrorInvalidConfiguration,
invalid_device = cudaErrorInvalidDevice,
invalid_value = cudaErrorInvalidValue, ///*invalid_argument*/ = cudaErrorInvalidValue, // = 1 This indicates that one or more of the parameters passed to the API call is not within an acceptable range of values.
invalid_pitch_value = cudaErrorInvalidPitchValue,
invalid_symbol = cudaErrorInvalidSymbol,
unmap_buffer_object_failed = cudaErrorUnmapBufferObjectFailed,
invalid_device_pointer = cudaErrorInvalidDevicePointer,
invalid_texture = cudaErrorInvalidTexture,
invalid_texture_binding = cudaErrorInvalidTextureBinding,
invalid_channel_descriptor = cudaErrorInvalidChannelDescriptor,
invalid_memcpy_direction = cudaErrorInvalidMemcpyDirection,
invalud_filter_setting = cudaErrorInvalidFilterSetting,
invalid_norm_setting = cudaErrorInvalidNormSetting,
unknown = cudaErrorUnknown,
invalid_resource_handle = cudaErrorInvalidResourceHandle,
insuffient_driver = cudaErrorInsufficientDriver,
no_device = cudaErrorNoDevice,
set_on_active_process = cudaErrorSetOnActiveProcess,
startup_failure = cudaErrorStartupFailure,
invalid_ptx = cudaErrorInvalidPtx,
no_kernel_image_for_device = cudaErrorNoKernelImageForDevice,
jit_compiler_not_found = cudaErrorJitCompilerNotFound
};
inline std::string string(enum error e){return cudaGetErrorString(static_cast<cudaError_t>(e));}
struct error_category : std::error_category{
char const* name() const noexcept override{return "cuda wrapper";}
std::string message(int e) const override{return string(static_cast<error>(e));}
static error_category& instance(){
static error_category instance;
return instance;
}
};
// Builds a std::error_code from a runtime error value, tagged with the cuda error_category.
inline std::error_code make_error_code(error err) noexcept{
	auto const value = static_cast<int>(err);
	return std::error_code(value, error_category::instance());
}
}}}}
// Opt runtime::error into the std::error_code machinery (implicit conversion via make_error_code).
namespace std{template<> struct is_error_code_enum<boost::multi::cuda::runtime::error> : true_type{};}

View File

@ -0,0 +1,8 @@
#include "../../adaptors/thrust.hpp"
// Intentionally empty: this translation unit only checks that the header included above compiles.
int main(){
}

View File

@ -0,0 +1,95 @@
#ifdef COMPILATION_INSTRUCTIONS//-*-indent-tabs-mode: t; c-basic-offset: 4; tab-width: 4;-*-
$CXX $0 -o $0x -lcudart -lboost_timer -lboost_unit_test_framework&&$0x&&rm $0x;exit
#endif
// © Alfredo A. Correa 2020
#define BOOST_TEST_MODULE "C++ Unit Tests for Multi cuda adaptor"
#define BOOST_TEST_DYN_LINK
#include<boost/test/unit_test.hpp>
namespace utf = boost::unit_test;
#include <boost/timer/timer.hpp>
#include "../../../adaptors/cuda.hpp"
#include<complex>
namespace boost::multi::memory::cuda{
// Stub overload of copy for 1-dimensional array iterators over cuda pointers.
// It performs no copy and trips assert(0) if it is ever called.
// NOTE(review): presumably a tripwire to detect that this overload gets selected — confirm.
template<class T1, class P1, class T2, class P2>
void copy(array_iterator<T1, 1, boost::multi::memory::cuda::ptr<P1>>, array_iterator<T1, 1, boost::multi::memory::cuda::ptr<P1>>, array_iterator<T2, 1, boost::multi::memory::cuda::ptr<P2>>){
assert(0);
}
//std::copy<boost::multi::array_iterator<double, 1, boost::multi::memory::cuda::ptr<const double, const double *>, boost::multi::memory::cuda::ref<const double> >, boost::multi::array_iterator<double, 1, boost::multi::memory::cuda::ptr<double, double *>, boost::multi::memory::cuda::ref<double> > >
}
namespace multi = boost::multi;
// Copies a 32x90x98x96 array of doubles between host, device and managed arrays, timing each step
// with boost::timer::auto_cpu_timer (which prints when it goes out of scope). The case has a 10 s timeout.
BOOST_AUTO_TEST_CASE(cudart_double, *utf::tolerance(0.00001)*utf::timeout(10)){
// Host input filled with std::rand values.
auto const in = []{
multi::array<double, 4> r({32, 90, 98, 96});
std::generate(data_elements(r), data_elements(r)+num_elements(r), &std::rand);
return r;
}();
std::cout<<"memory size "<< in.num_elements()*sizeof(decltype(in)::element)/1e6 <<" MB\n";
{
// Round trip host -> gpu -> host; the copy must compare equal to the input.
boost::timer::auto_cpu_timer t{"%ws wall, CPU (%p%)\n"};
multi::cuda::array<double, 4> const in_gpu = in;
multi::array<double, 4> const in_cpy = in_gpu;
BOOST_REQUIRE( in == in_cpy );
}
{
// Host -> gpu construction only.
boost::timer::auto_cpu_timer t{"%ws wall, CPU (%p%)\n"};
multi::cuda::array<double, 4> const in_gpu = in;
}
{
// Gpu -> gpu assignment; the timer starts after both arrays exist.
multi::cuda::array<double, 4> const in_gpu = in;
multi::cuda::array<double, 4> out_gpu = in;
boost::timer::auto_cpu_timer t{"copy assign gpu____ %ws wall, CPU (%p%)\n"};
out_gpu = in_gpu;
// Read one element back so the result is actually touched.
auto c = static_cast<double>(out_gpu[1][2][3][4]); (void)c;
// NOTE(review): `<< 1` presumably yields a rotated view of the array — confirm.
(out_gpu << 1) = (in_gpu << 1);
}
{
// Managed -> managed assignment, timed three times: first, repeated ("hot"), and through the () views.
multi::cuda::managed::array<double, 4> const in_mng = in;
multi::cuda::managed::array<double, 4> out_mng = in;
{
boost::timer::auto_cpu_timer t{"copy assign mng____ %ws wall, CPU (%p%)\n"};
out_mng = in_mng;
auto c = static_cast<double>(out_mng[1][2][3][4]); (void)c;
}
{
boost::timer::auto_cpu_timer t{"copy assign mng_hot %ws wall, CPU (%p%)\n"};
out_mng = in_mng;
auto c = static_cast<double>(out_mng[1][2][3][4]); (void)c;
}
{
boost::timer::auto_cpu_timer t{"copy assign mng loop %ws wall, CPU (%p%)\n"};
out_mng() = in_mng();
auto c = static_cast<double>(out_mng[1][2][3][4]); (void)c;
}
}
}
// Times the host -> gpu construction of a 32x90x98x96 array of std::complex<double>, twice in a row.
// Nothing is asserted; the case only has to finish within its 10 s timeout.
BOOST_AUTO_TEST_CASE(cudart_complex, *utf::tolerance(0.00001)*utf::timeout(10)){
using complex = std::complex<double>;
// Host input filled with std::rand values.
auto const in = []{
multi::array<complex, 4> r({32, 90, 98, 96});
std::generate(data_elements(r), data_elements(r)+num_elements(r), &std::rand);
return r;
}();
std::cout<<"memory size "<< in.num_elements()*sizeof(decltype(in)::element)/1e6 <<" MB\n";
{
boost::timer::auto_cpu_timer t{"%ws wall, CPU (%p%)\n"};
multi::cuda::array<complex, 4> const in_gpu = in;
}
{
// Same operation again, timed separately.
boost::timer::auto_cpu_timer t{"%ws wall, CPU (%p%)\n"};
multi::cuda::array<complex, 4> const in_gpu = in;
}
}

View File

@ -0,0 +1,47 @@
// -*-indent-tabs-mode:t;c-basic-offset:4;tab-width:4;autowrap:nil;-*-
// © Alfredo A. Correa 2021
#pragma once
#include "../../array.hpp"
#include "./thrust/cuda/managed.hpp"
#include <thrust/device_allocator.h>
#include <thrust/system/cuda/memory.h> // ::thrust::cuda::allocator
// Convenience aliases of multi::array for use with Thrust allocators.
namespace boost{
namespace multi{
namespace thrust{
// multi::array using ::thrust::device_allocator<T>.
template<class T, multi::dimensionality_type D> using device_array = multi::array<T, D, ::thrust::device_allocator<T>>;
// multi::array with its default allocator.
template<class T, multi::dimensionality_type D> using host_array = multi::array<T, D >;
// The same two aliases, reachable as thrust::device::array and thrust::host::array.
namespace device{
template<class T, multi::dimensionality_type D> using array = device_array<T, D>;
}
namespace host{
template<class T, multi::dimensionality_type D> using array = host_array<T, D>;
}
namespace cuda{
// multi::array using ::thrust::cuda::allocator<T>.
template<class T, multi::dimensionality_type D> using array = multi::array<T, D, ::thrust::cuda::allocator<T>>;
namespace managed{
// multi::array using the managed allocator from ./thrust/cuda/managed.hpp.
template<class T, multi::dimensionality_type D> using array = multi::array<T, D, boost::multi::thrust::cuda::managed::allocator<T>>;
}
}
}}}

View File

@ -0,0 +1,15 @@
# Top-level build script for the Thrust adaptor tests; the actual test targets live in the two subdirectories added below.
cmake_minimum_required(VERSION 3.11)
project(boost-multi-adaptor-cuda-thrust-test VERSION 0.1 LANGUAGES CXX CUDA)
# nvcc flags: relaxed constexpr, numbered front-end diagnostics, and two suppressed front-end diagnostics.
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr -Xcudafe \"--display_error_number --diag_suppress=implicit_return_from_non_void_function --diag_suppress=class_and_member_name_conflict\"")
enable_testing()
# Memory-check tool and options; these must be set before CTest is included.
find_program(MEMORYCHECK_COMMAND valgrind)
set(MEMORYCHECK_COMMAND_OPTIONS "--leak-check=full --error-exitcode=1")
include (CTest)
add_subdirectory(cuda/test)
add_subdirectory(test)

View File

@ -0,0 +1,144 @@
#pragma once
#include "../../../cuda/runtime/error.hpp"
#include <thrust/system/cuda/pointer.h>
#include<new> // bad_alloc
#include<cassert>
namespace boost{
namespace multi{
namespace thrust{
namespace cuda{
namespace managed{
template<class> class pointer;
// Reference type paired with managed::pointer: a ::thrust::cuda::reference<T> that can additionally
// be converted to a plain T& when it is an rvalue.
template<class T>
class reference : public ::thrust::cuda::reference<T>{
using base_type = ::thrust::cuda::reference<T>;
public:
constexpr explicit reference(::thrust::cuda::reference<T> const& other) : base_type{other}{}
constexpr explicit reference(T& other) : base_type{&other}{}
// Conversion to a raw reference; && -qualified, so it applies to temporaries such as the result of *p or p[i].
constexpr operator T&()&&{return raw_reference_cast(static_cast<base_type&>(*this));}
// Address-of yields a managed::pointer rather than the base class's pointer type.
constexpr pointer<T> operator&(){return pointer<T>{base_type::operator&()};}
// Assignment is inherited from the thrust reference.
using ::thrust::cuda::reference<T>::operator=;
};
// Pointer type for managed memory: wraps a ::thrust::cuda::pointer<T> and converts implicitly both
// to that thrust pointer and to a raw T*.
template<class T>
class pointer{
::thrust::cuda::pointer<T> impl_;
public:
constexpr explicit pointer(::thrust::cuda::pointer<T> const& other) : impl_{other}{}
constexpr explicit pointer(T* other) : impl_(other){}
// Iterator-style member types, mostly taken from the wrapped thrust pointer.
using difference_type = typename ::thrust::iterator_traits<::thrust::cuda::pointer<T>>::difference_type;
using value_type = typename ::thrust::iterator_traits<::thrust::cuda::pointer<T>>::value_type;
using pointer = pointer<T>; // -Xcudafe \"--diag_suppress=class_and_member_name_conflict\" //TODO
using reference = managed::reference<T>;
using iterator_category = typename ::thrust::iterator_traits<::thrust::cuda::pointer<T>>::iterator_category;
using element_type = T;
// Implicit conversions: raw pointer and wrapped thrust pointer.
constexpr operator T*() const{return raw_pointer_cast(impl_);}
constexpr operator ::thrust::cuda::pointer<T>() const{return impl_;}
// Pointer arithmetic is delegated to the wrapped thrust pointer.
constexpr pointer& operator++(){impl_.operator++(); return *this;}
constexpr pointer& operator--(){impl_.operator--(); return *this;}
constexpr auto operator++(int i){return pointer{impl_.operator++(i)};}
constexpr auto operator--(int i){return pointer{impl_.operator--(i)};}
constexpr pointer& operator+=(difference_type n){impl_.operator+=(n); return *this;}
constexpr pointer& operator-=(difference_type n){impl_.operator-=(n); return *this;}
constexpr pointer operator+(difference_type n) const{return pointer{impl_ + n};}
constexpr pointer operator-(difference_type n) const{return pointer{impl_ - n};}
// Dereference and indexing return managed::reference<T>.
constexpr reference operator*() const{return reference{impl_.operator*()};}
// NOTE(review): operator[] is not const-qualified although operator* is — confirm whether that is intentional.
constexpr reference operator[](difference_type n){return *((*this)+n);}
// Found by ADL; unwraps to whatever raw_pointer_cast gives for the thrust pointer.
friend auto raw_pointer_cast(pointer const& p){return raw_pointer_cast(p.impl_);}
};
// Allocation-failure exception of this allocator; catchable as std::bad_alloc.
struct bad_alloc : std::bad_alloc{};
// Allocator that obtains storage with cudaMallocManaged and releases it with cudaFree,
// handing out managed::pointer<T>. All instances compare equal.
template<class T = void>
class allocator{// : cuda::allocator<T>{
// T must already be a decayed type (no cv-qualifiers, references, ...).
static_assert( std::is_same<T, std::decay_t<T>>{}, "!" );
public:
using value_type = T;
using pointer = managed::pointer<T>;
using size_type = ::size_t; // as specified by CudaMalloc
// Allocates storage for n objects of type T (n*sizeof(T) bytes); the hint argument is ignored.
// Returns a null pointer for n == 0.
// Throws managed::bad_alloc when CUDA reports a memory-allocation error or hands back a null pointer,
// and std::system_error for any other CUDA error.
pointer allocate(typename allocator::size_type n, const void* = 0){
if(n == 0) return pointer{nullptr};
T* p = nullptr;
namespace cudart = boost::multi::cuda::runtime;
auto e = static_cast<cudart::error>(cudaMallocManaged(&p, n*sizeof(T)));
switch(e){
case cudart::success : break;
case cudart::memory_allocation: throw bad_alloc{};
default: throw std::system_error{e, "cannot allocate "+std::to_string(n*sizeof(T))+" bytes in '"+__PRETTY_FUNCTION__+"'"};
}
auto ret = static_cast<pointer>(p);
if(!ret) throw bad_alloc{};
return ret;
}
// Releases storage with cudaFree; the size argument is ignored.
// Throws std::system_error if cudaFree does not report success.
void deallocate(pointer p, size_type){
namespace cudart = boost::multi::cuda::runtime;
auto e = static_cast<cudart::error>(cudaFree(raw_pointer_cast(p)));
if(e!=cudart::success){
throw std::system_error{e, std::string{"cannot "}+ __PRETTY_FUNCTION__};
}
}
// Placement-constructs a T at the given location.
// NOTE(review): the non-raw-pointer overloads of construct/destroy access a member `rp_`, which
// managed::pointer (as declared in this file) does not expose — confirm which pointer types they are meant for.
template<class P, class... Args>
void construct(P p, Args&&... args){ // remove?
::new(p.rp_) T(std::forward<Args>(args)...);
}
template<class P, class... Args>
void construct(P* p, Args&&... args){ // remove?
::new(p) T(std::forward<Args>(args)...);
}
// Runs the destructor of the T at the given location.
template<class P> void destroy(P p){p.rp_->~T();} // remove?
template<class P> void destroy(P* p){p->~T();} // remove?
// Stateless: any two allocators are interchangeable.
constexpr bool operator==(allocator<T> const&) const{return true;}
constexpr bool operator!=(allocator<T> const&) const{return false;}
// Allocator-aware algorithm hooks; each forwards to an unqualified (ADL-resolved) algorithm.
template<class InputIt, class ForwardIt>
constexpr ForwardIt alloc_uninitialized_copy(InputIt first, InputIt last, ForwardIt d_first) const{
return ForwardIt{adl_uninitialized_copy(first, last, d_first)};
}
template<class InputIt, class Size, class ForwardIt>
constexpr ForwardIt alloc_uninitialized_copy_n(InputIt first, Size count, ForwardIt d_first) const{
return ForwardIt{adl_uninitialized_copy_n(first, count, d_first)};
}
template<class ForwardIt, class Size>
constexpr ForwardIt alloc_uninitialized_default_construct_n(ForwardIt first, Size n) const{
return ForwardIt{adl_uninitialized_default_construct_n(first, n)};
}
template<class ForwardIt, class Size>
constexpr ForwardIt alloc_destroy_n(ForwardIt first, Size n) const{return ForwardIt{destroy_n(first, n)};}
};
}}}
}}
// Stand-alone smoke test: compiled only when this header is itself the top-level translation unit.
#if not __INCLUDE_LEVEL__
#include<memory>
#include<iostream>
#include "../../../../array.hpp"
namespace multi = boost::multi;
namespace cuda = multi::memory::cuda;
int main(){
// NOTE(review): this uses multi::memory::cuda::managed::allocator, not the
// multi::thrust::cuda::managed::allocator defined in this header — confirm which one is intended.
multi::array<double, 1, multi::memory::cuda::managed::allocator<double> > A(32);
A[17] = 3.;
assert( A[17] == 3. );
}
#endif

View File

@ -0,0 +1,42 @@
# Builds one Boost.Test executable and one CTest test per *.cu file in this directory.
cmake_minimum_required(VERSION 3.16)

set(CMAKE_VERBOSE_MAKEFILE ON)

find_package(Boost REQUIRED COMPONENTS unit_test_framework)
include_directories(${Boost_INCLUDE_DIRS})

# Sources are discovered by glob; the target name is the file name without its extension.
file(GLOB TEST_SRCS RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} *.cu)
#set(TEST_SRCS
#  managed.cu
#)

foreach(TEST_FILE ${TEST_SRCS})
  get_filename_component(TEST_EXE ${TEST_FILE} NAME_WE)
  add_executable            (${TEST_EXE} ${TEST_FILE})
  if(ENABLE_CUDA OR DEFINED CXXCUDA)
    # Compile the source with the CUDA compiler when CUDA is requested.
    set_source_files_properties(${TEST_FILE} PROPERTIES LANGUAGE CUDA)
    target_compile_options  (${TEST_EXE} PRIVATE -std=c++17 --expt-relaxed-constexpr)
  endif()
#  target_compile_features   (${TEST_EXE} PUBLIC cxx_std_17)
  target_compile_definitions(${TEST_EXE} PRIVATE "BOOST_PP_VARIADICS")
  target_compile_definitions(${TEST_EXE} PRIVATE ${Boost_DEFINITIONS})
#  target_include_directories(${TEST_EXE} PRIVATE ${Boost_INCLUDE_DIRS})
  target_link_libraries     (${TEST_EXE} PRIVATE ${Boost_LIBRARIES})
  target_link_directories   (${TEST_EXE} PRIVATE ${Boost_LIBRARY_DIRS})
#  if(NOT ENABLE_CUDA)
#    target_compile_options  (${TEST_EXE} PRIVATE
#       -Werror -Wall -Wextra -fno-common
#       $<$<CXX_COMPILER_ID:GNU>:
#          -Wpedantic -Wformat-truncation -fstack-usage>#-Wconversion
#       $<$<OR:$<CXX_COMPILER_ID:Clang>,$<CXX_COMPILER_ID:AppleClang>>:
#          -Wpedantic -Wmove>
#       $<$<CXX_COMPILER_ID:Intel>:
#          -wd161 -diag-disable=remark -Warray-bounds -Wchar-subscripts -Wcomment -Wenum-compare -Wformat -Wuninitialized -Wmaybe-uninitialized -Wmain -Wnarrowing -Wnonnull -Wparentheses -Wpointer-sign -Wreorder -Wno-return-type -Wsign-compare -Wsequence-point -Wtrigraphs -Wunused-function -Wunused-but-set-variable -Wunused-variable -Wwrite-strings -Werror -diag-error:3846
#       >
#       $<$<CXX_COMPILER_ID:MSVC>:
#          /W4>)
#  endif()
  # Name the target instead of "./<exe>": CTest then substitutes the actual location of the built
  # executable, which also works with multi-config generators and custom output directories.
  add_test(NAME ${TEST_EXE} COMMAND ${TEST_EXE})
endforeach()

View File

@ -0,0 +1,61 @@
#define BOOST_TEST_MODULE "C++ Unit Tests for Multi CUDA thrust"
#define BOOST_TEST_DYN_LINK
#include<boost/test/unit_test.hpp>
#include "../../cuda/managed.hpp"
namespace multi = boost::multi;
// Writes 1 through a raw pointer.
void set_one(double* p)
{
	p[0] = 1.0;
}
// Writes 2 through a thrust cuda pointer.
void set_two_gpu(thrust::cuda::pointer<double> p)
{
	*p = 2.0;
}
// Writes 3 through a plain reference.
void set_three_ref(double& p)
{
	p = 3.0;
}
// No-op that is callable only with pointers whose value_type is double and that convert to
// thrust::cuda::pointer<double>.
template<
	class Pointer,
	class V = typename std::iterator_traits<Pointer>::value_type,
	class = std::enable_if_t<std::is_same<V, double>::value and std::is_convertible<Pointer, thrust::cuda::pointer<V>>::value>
>
void some_fun(Pointer p){}
// No-op that is callable only with pointers whose value_type is double and that convert to a raw double*.
template<
	class Pointer,
	class V = typename std::iterator_traits<Pointer>::value_type,
	class = std::enable_if_t<std::is_same<V, double>::value and std::is_convertible<Pointer, V*>::value>
>
void some_other_fun(Pointer p){}
// Overload-priority tag: prio<N> derives from prio<N-1> (and prio<0> from std::false_type), so a
// prio<N> argument prefers an overload taking prio<N> and falls back to lower levels.
// Declared as a struct so the base is public; with a private base the fallback conversion
// prio<N> -> prio<N-1> is chosen by overload resolution and then rejected as inaccessible.
template<int N> struct prio : std::conditional_t<N!=0, prio<N-1>, std::false_type>{};
// Low-priority candidate (prio<0>): pointers to double convertible to thrust::cuda::pointer<double>; returns 0.
template<
	class Pointer,
	class V = typename std::iterator_traits<Pointer>::value_type,
	std::enable_if_t<std::is_same<V, double>::value and std::is_convertible<Pointer, thrust::cuda::pointer<V>>::value, int> = 0
>
int overload_aux(Pointer p, prio<0>)
{
	return 0;
}
// High-priority candidate (prio<1>): pointers to double convertible to a raw double*; returns 1.
template<
	class Pointer,
	class V = typename std::iterator_traits<Pointer>::value_type,
	std::enable_if_t<std::is_same<V, double>::value and std::is_convertible<Pointer, V*>::value, int> = 0
>
int overload_aux(Pointer p, prio<1>)
{
	return 1;
}
// Entry point of the priority dispatch: starts at the highest level, prio<1>.
template<class Pointer>
int overload(Pointer p)
{
	return overload_aux(p, prio<1>{});
}
// Allocates 100 doubles with the managed allocator and checks that the resulting pointer can be
// indexed directly and passed to functions taking a raw pointer, a thrust cuda pointer and a plain reference.
BOOST_AUTO_TEST_CASE(vector){
multi::thrust::cuda::managed::allocator<double> alloc;
multi::thrust::cuda::managed::pointer<double> p = alloc.allocate(100);
p[17] = 3.;
BOOST_TEST_REQUIRE( p[17] == 3. );
// Implicit conversion to double*.
set_one(p);
BOOST_TEST_REQUIRE( p[0] == 1. );
// Implicit conversion to thrust::cuda::pointer<double>.
set_two_gpu(p);
BOOST_TEST_REQUIRE( p[0] == 2. );
// p[1] converts to double&.
set_three_ref( p[1] );
BOOST_TEST_REQUIRE( p[1] == 3. );
some_fun(p);
// Both overload_aux candidates accept p; the prio<1> (raw pointer) one must win.
BOOST_TEST_REQUIRE(overload(p) == 1);
alloc.deallocate(p, 100);
}

View File

@ -0,0 +1,48 @@
# Builds one Boost.Test executable "<file>.x" and one CTest test per listed source.
# target_link_directories(), used for every test target below, requires CMake 3.13.
cmake_minimum_required(VERSION 3.13)
project(boost-multi-adaptor-cuda-thrust-test VERSION 0.1 LANGUAGES CXX CUDA)

set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr -Xcudafe \"--display_error_number --diag_suppress=implicit_return_from_non_void_function\"")

find_package(Boost REQUIRED COMPONENTS unit_test_framework)

enable_testing()
# Memory-check tool and options; these must be set before CTest is included.
find_program(MEMORYCHECK_COMMAND valgrind)
set(MEMORYCHECK_COMMAND_OPTIONS "--leak-check=full --error-exitcode=1")
include (CTest)

#file(GLOB TEST_SRCS RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} *.cpp)
set(TEST_SRCS
  array.cu
  vector.cu
)

foreach(TEST_FILE ${TEST_SRCS})
  set(TEST_EXE "${TEST_FILE}.x")
  add_executable            (${TEST_EXE} ${TEST_FILE})
  if(ENABLE_CUDA OR DEFINED CXXCUDA)
    # Compile the source with the CUDA compiler when CUDA is requested.
    set_source_files_properties(${TEST_FILE} PROPERTIES LANGUAGE CUDA)
    target_compile_options  (${TEST_EXE} PRIVATE -std=c++17 --expt-relaxed-constexpr)
  endif()
#  target_compile_features   (${TEST_EXE} PUBLIC cxx_std_17)
  target_compile_definitions(${TEST_EXE} PRIVATE "BOOST_PP_VARIADICS")
  target_compile_definitions(${TEST_EXE} PRIVATE ${Boost_DEFINITIONS})
  target_include_directories(${TEST_EXE} PRIVATE ${Boost_INCLUDE_DIRS})
  target_link_libraries     (${TEST_EXE} PRIVATE ${Boost_LIBRARIES})
  target_link_directories   (${TEST_EXE} PRIVATE ${Boost_LIBRARY_DIRS})
#  if(NOT ENABLE_CUDA)
#    target_compile_options  (${TEST_EXE} PRIVATE
#       -Werror -Wall -Wextra -fno-common
#       $<$<CXX_COMPILER_ID:GNU>:
#          -Wpedantic -Wformat-truncation -fstack-usage>#-Wconversion
#       $<$<OR:$<CXX_COMPILER_ID:Clang>,$<CXX_COMPILER_ID:AppleClang>>:
#          -Wpedantic -Wmove>
#       $<$<CXX_COMPILER_ID:Intel>:
#          -wd161 -diag-disable=remark -Warray-bounds -Wchar-subscripts -Wcomment -Wenum-compare -Wformat -Wuninitialized -Wmaybe-uninitialized -Wmain -Wnarrowing -Wnonnull -Wparentheses -Wpointer-sign -Wreorder -Wno-return-type -Wsign-compare -Wsequence-point -Wtrigraphs -Wunused-function -Wunused-but-set-variable -Wunused-variable -Wwrite-strings -Werror -diag-error:3846
#       >
#       $<$<CXX_COMPILER_ID:MSVC>:
#          /W4>)
#  endif()
  # Name the target instead of "./<exe>": CTest then substitutes the actual location of the built
  # executable, which also works with multi-config generators and custom output directories.
  add_test(NAME ${TEST_EXE} COMMAND ${TEST_EXE})
endforeach()

View File

@ -0,0 +1,87 @@
#define BOOST_TEST_MODULE "C++ Unit Tests for Multi CUDA thrust"
#define BOOST_TEST_DYN_LINK
#include<boost/test/unit_test.hpp>
#include "../../../../adaptors/cuda/thrust.hpp"
namespace multi = boost::multi;
// Debugging aid: declared as deleted, so any call to it is a compile error
// (compilers typically print the deduced T in the diagnostic).
template<class T> void what(T&&) = delete;
// Checks element access on a thrust-backed CUDA array and several ways of
// copying the elements of a host array H into a device array C.
BOOST_AUTO_TEST_CASE(array){
	{
		// element-wise write and read-back on the device array
		multi::thrust::cuda::array<double, 2> C({2, 3});

		C[0][0] = 0. ;
		C[1][1] = 11.;
		BOOST_TEST_REQUIRE( C[1][1] == 11. );
	}
	{
		multi::array<double, 2> const H = {
			{00., 01., 02.},
			{10., 11., 12.},
		};

		BOOST_TEST_REQUIRE( H[1][1] == 11. );

		{
			// copy through thrust::copy_n
			multi::thrust::cuda::array<double, 2> C(H.extensions());
			BOOST_REQUIRE( C.num_elements() == H.num_elements() );

			thrust::copy_n(H.data_elements(), H.num_elements(), C.data_elements());
			BOOST_TEST_REQUIRE( C[1][1] == 11. );
			BOOST_REQUIRE( C == H );
		}
		{
			// copy through std::copy_n
			multi::thrust::cuda::array<double, 2> C(H.extensions());
			BOOST_REQUIRE( C.num_elements() == H.num_elements() );

			std::copy_n(H.data_elements(), H.num_elements(), C.data_elements());
			BOOST_TEST_REQUIRE( C[1][1] == 11. );
			BOOST_REQUIRE( C == H );
		}
		{
			// copy through std::uninitialized_copy_n
			multi::thrust::cuda::array<double, 2> C(H.extensions());
			BOOST_REQUIRE( C.num_elements() == H.num_elements() );

			std::uninitialized_copy_n(H.data_elements(), H.num_elements(), C.data_elements());
			BOOST_TEST_REQUIRE( C[1][1] == 11. );
			BOOST_REQUIRE( C == H );
		}
		{
			// copy through thrust::uninitialized_copy_n
			// (no call to the deleted helper `what` here: calling a deleted
			// function would make this file fail to compile)
			multi::thrust::cuda::array<double, 2> C(H.extensions());
			BOOST_REQUIRE( C.num_elements() == H.num_elements() );

			thrust::uninitialized_copy_n(H.data_elements(), H.num_elements(), C.data_elements());
			BOOST_TEST_REQUIRE( C[1][1] == 11. );
			BOOST_REQUIRE( C == H );
		}
//		{
//			multi::thrust::cuda::array<double, 2> C(H.extensions());
//			BOOST_REQUIRE( C.extensions() == H.extensions() );
//			thrust::copy_n(H.begin(), H.size(), C.begin());
//			BOOST_REQUIRE( C == H );
//		}
//		{
//			multi::thrust::cuda::array<double, 2> C(H.extensions());
//			BOOST_REQUIRE( C.extensions() == H.extensions() );
//			std::copy_n(H.begin(), H.size(), C.begin());
//			BOOST_REQUIRE( C == H );
//		}
//		{
//			multi::thrust::cuda::array<double, 2> C(H.extensions());
//			C = H;
//			BOOST_REQUIRE( C == H );
//		}
//		{
//			multi::thrust::cuda::array<double, 2> C = H;
//			BOOST_REQUIRE( C == H );
//		}
	}
}

View File

@ -0,0 +1,13 @@
#include <thrust/device_vector.h>
// Minimal program: allocates ten ints with std::allocator, stores 2 in the
// first one and exits with that value plus one (i.e. 3).
int main(){
//	thrust::device_vector<int> D(5);
//	assert( D.size() == 5 );
//	cudaDeviceSynchronize();
	std::allocator<int> alloc;
	int* p = alloc.allocate(10);
	p[0] = 2;
	int const status = p[0] + 1;
	alloc.deallocate(p, 10);  // give the storage back; it was previously leaked
	return status;
}

View File

@ -0,0 +1,43 @@
#define BOOST_TEST_MODULE "C++ Unit Tests for Multi CUDA thrust"
#define BOOST_TEST_DYN_LINK
#include<boost/test/unit_test.hpp>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
// Exercises basic thrust::host_vector / thrust::device_vector operations:
// element assignment, size query, resize, host-to-device copy and element
// access through a thrust::cuda::pointer.
BOOST_AUTO_TEST_CASE(vector){
// H has storage for 4 integers
thrust::host_vector<int> H(4);
// initialize individual elements
H[0] = 14;
H[1] = 20;
H[2] = 38;
H[3] = 46;
// H.size() returns the size of vector H
BOOST_TEST_REQUIRE( H.size() == 4 );
// check one of the stored values
BOOST_TEST_REQUIRE( H[2] == 38 );
// resize H
H.resize(2);
BOOST_REQUIRE( H.size() == 2 );
// Copy host_vector H to device_vector D
thrust::device_vector<int> D = H;
// f(D.data());
// elements of D can be modified
D[0] = 99;
D[1] = 88;
// read the first element back through the pointer returned by D.data()
thrust::cuda::pointer<int> p = D.data();
BOOST_REQUIRE( p[0] == 99 );
BOOST_TEST_REQUIRE( D[1] == 88 );
}

View File

@ -0,0 +1,739 @@
#ifdef COMPILATION// -*-indent-tabs-mode: t; c-basic-offset: 4; tab-width: 4;-*-
$CXX $0 -o $0x -lcudart -lcufft `pkg-config --libs fftw3` -lboost_unit_test_framework&&$0x&&rm $0x;exit
#endif
// © Alfredo A. Correa 2020
#ifndef MULTI_ADAPTORS_CUFFTW_HPP
#define MULTI_ADAPTORS_CUFFTW_HPP
#include "../adaptors/../utility.hpp"
#include "../adaptors/../array.hpp"
#include "../adaptors/../config/NODISCARD.hpp"
#include "../adaptors/cuda.hpp"
#include<numeric>
#include<tuple> // std::apply
#include<array>
#include<vector>
#include "../complex.hpp"
//#include<execution>
#include<future>
#include<cufft.h>
namespace boost{
namespace multi{
namespace memory{
namespace cuda{
// The overload below is disabled with `#if 0` and is never compiled.
// It sketches a strided copy between 1D array iterators over managed pointers,
// done with a single cudaMemcpy2D call (one element per "row").
#if 0
template<class T1, class T1const, class T2, class T2const>
auto copy(
array_iterator<T1, 1, managed::ptr<T1const>> first,
array_iterator<T1, 1, managed::ptr<T1const>> last,
array_iterator<T2, 1, managed::ptr<T2const>> d_first
){
assert(first.stride() == last.stride());
// NOTE(review): the source pitch is computed with sizeof(T2), not sizeof(T1) — confirm before reviving this code.
auto s = cudaMemcpy2D(raw_pointer_cast(d_first.data()), d_first.stride()*sizeof(T2), raw_pointer_cast(first.data()), first.stride()*sizeof(T2), sizeof(T2), last - first, cudaMemcpyDefault);
// every failure code ends in assert(0)
switch(s){
case cudaSuccess: break;
case cudaErrorInvalidValue: assert(0);
case cudaErrorInvalidPitchValue: assert(0);
case cudaErrorInvalidDevicePointer: assert(0);
case cudaErrorInvalidMemcpyDirection: assert(0);
default: assert(0); // unknown error
}
// iterator one past the last destination element
return d_first + (last - first);
}
#endif
}}}}
namespace boost{
namespace multi{
namespace cufft{
// Thin wrapper around the integer direction flag passed to cuFFT.
// Implicitly constructible from and convertible to int.
class sign{
	int impl_ = 0;  // initialized so a default-constructed sign never holds an indeterminate value
public:
	sign() = default;
	constexpr sign(int i) : impl_{i}{}
	constexpr operator int() const{return impl_;}
};
constexpr sign forward{CUFFT_FORWARD};
constexpr sign none{0};
constexpr sign backward{CUFFT_INVERSE};
static_assert(forward != none and none != backward and backward != forward, "!");
class plan{
using complex_type = cufftDoubleComplex;
// Input/output pointers and direction used by operator()(); the destructor
// releases only the cuFFT handle, never these buffers.
complex_type const* idata_ = nullptr;
complex_type* odata_ = nullptr;
int direction_ = 0;
// Value-initialized so that the destructor's `if(h_)` test never reads an
// indeterminate handle (e.g. when plan creation throws inside `many`).
cufftHandle h_ = {};
// Empty plan; only used internally as a starting point.
plan() = default;
plan(plan const&) = delete;
// Transfers the handle and pointers, leaving `other` empty.
plan(plan&& other) :
	idata_{std::exchange(other.idata_, nullptr)},
	odata_{std::exchange(other.odata_, nullptr)},
	direction_{std::exchange(other.direction_, 0)},
	h_{std::exchange(other.h_, {})}
{} // needed in <=C++14 for return
// Runs cufftExecZ2Z on this plan's handle with the given buffers and direction.
// Bumps the per-thread execution counter first. Throws std::runtime_error for
// any result other than CUFFT_SUCCESS. No device synchronization is done here.
void ExecZ2Z(complex_type const* idata, complex_type* odata, int direction) const{
	++tl_execute_count;
	cufftResult const status = ::cufftExecZ2Z(h_, const_cast<complex_type*>(idata), odata, direction);
	if(status == CUFFT_SUCCESS) return;  // "cuFFT successfully executed the FFT plan."
	// result codes without a dedicated message keep the generic text
	char const* reason = "cufftExecZ2Z unknown error";
	switch(status){
		case CUFFT_INVALID_PLAN  : reason = "The plan parameter is not a valid handle."; break;
		case CUFFT_INVALID_VALUE : reason = "At least one of the parameters idata, odata, and direction is not valid."; break;
		case CUFFT_INTERNAL_ERROR: reason = "Used for all internal driver errors."; break;
		case CUFFT_EXEC_FAILED   : reason = "CUFFT failed to execute an FFT on the GPU."; break;
		case CUFFT_SETUP_FAILED  : reason = "The cuFFT library failed to initialize."; break;
		default: break;
	}
	throw std::runtime_error{reason};
}
// Exchanges every data member with `other`.
void swap(plan& other){
	std::swap(h_        , other.h_        );
	std::swap(direction_, other.direction_);
	std::swap(odata_    , other.odata_    );
	std::swap(idata_    , other.idata_    );
}
public:
// Per-thread count of transform executions (incremented by ExecZ2Z).
thread_local static int tl_execute_count;
// Assignment by value-and-swap; the previous state ends up in `other` and is
// released when `other` is destroyed.
plan& operator=(plan other){
	swap(other);
	return *this;
}
// Executes the plan on the pointers and direction stored in it.
void operator()() const{
	ExecZ2Z(idata_, odata_, direction_);
}
// Executes this plan on the buffers of `i` and `o` (rather than the pointers
// stored in the plan) with an explicit direction, then returns `o` forwarded.
// NOTE(review): unlike the constructor, base(i)/base(o) are reinterpret_cast
// here without going through raw_pointer_cast — confirm this is valid for the
// pointer types in use.
template<class I, class O>
O&& execute_dft(I&& i, O&& o, int direction) const{
ExecZ2Z(
const_cast<complex_type*>(reinterpret_cast<complex_type const*>(base(i))),
const_cast<complex_type*>(reinterpret_cast<complex_type const*>(base(o))),
direction
);
return std::forward<O>(o);
}
// Same as above, using the direction stored in the plan; returns nothing.
template<class I, class O>
void execute_dft(I&& i, O&& o) const{execute_dft(std::forward<I>(i), std::forward<O>(o), direction_);}
// Destroys the cuFFT plan when the handle is non-zero.
~plan(){
	if(h_){
		cufftDestroy(h_);
	}
}
// cuFFT's plan interface takes plain ints for sizes and strides.
using size_type  = int;
using ssize_type = int;
// Builds a single (batch = 1) Z2Z plan for transforming array `i` into array `o`
// in direction `s`, and stores the raw data pointers and the direction.
// The unnamed template parameter only participates in overload resolution: it
// requires that base(i) and base(o) can go through raw_pointer_cast and that the
// output pointer can be reinterpret_cast to complex_type*.
// Throws std::runtime_error when cufftPlanMany reports an error.
template<class I, class O, //std::enable_if_t<(I::dimensionality < 4), int> =0,
dimensionality_type D = I::dimensionality,
typename = decltype(raw_pointer_cast(base(std::declval<I const&>())), reinterpret_cast<complex_type* >(raw_pointer_cast(base(std::declval<O&>()))))
>
plan(I const& i, O&& o, sign s) :
idata_{ reinterpret_cast<complex_type const*>(raw_pointer_cast(base(i))) },
odata_{const_cast<complex_type*>(reinterpret_cast<complex_type* >(raw_pointer_cast(base(o))))},
direction_{s}
{
// preconditions, checked only at run time (and only when asserts are enabled)
assert( I::dimensionality < 4 );
assert( CUFFT_FORWARD == s or CUFFT_INVERSE == s or s == 0 );
assert( sizes(i) == sizes(o) );
// using std::experimental::apply;// using std::experimental::make_array;
// sizes and strides of input/output converted to arrays of int
auto ion = std::apply([](auto... t){return std::array< size_type, D>{static_cast< size_type>(t)...};}, sizes (i));
auto istrides = std::apply([](auto... t){return std::array<ssize_type, D>{static_cast<ssize_type>(t)...};}, strides(i));
auto ostrides = std::apply([](auto... t){return std::array<ssize_type, D>{static_cast<ssize_type>(t)...};}, strides(o));
// (input stride, output stride, size) per dimension, sorted in descending
// lexicographic order so the dimension with the largest input stride comes first
// (the loop index `i` below shadows the parameter `i` inside the loops only)
std::array<std::tuple<int, int, int>, I::dimensionality> ssn;
for(std::size_t i = 0; i != ssn.size(); ++i) ssn[i] = std::make_tuple(istrides[i], ostrides[i], ion[i]);
std::sort(ssn.begin(), ssn.end(), std::greater<>{});
for(std::size_t i = 0; i != ssn.size(); ++i){
istrides[i] = std::get<0>(ssn[i]);
ostrides[i] = std::get<1>(ssn[i]);
ion[i] = std::get<2>(ssn[i]);
}// = std::tuple<int, int, int>(istrides[i], ostrides[i], ion[i]);
// after sorting, the last entry holds the smallest stride
int istride = istrides.back();
auto inembed = istrides; inembed.fill(0);
int ostride = ostrides.back();
auto onembed = ostrides; onembed.fill(0);
// nembed[k] is the ratio between consecutive strides; entry 0 is left at 0
for(std::size_t i = 1; i != onembed.size(); ++i){
assert(ostrides[i-1] >= ostrides[i]); // otherwise ordering is incompatible
assert(ostrides[i-1]%ostrides[i]==0);
onembed[i]=ostrides[i-1]/ostrides[i]; // assert( onembed[i] <= ion[i] );
assert(istrides[i-1]%istrides[i]==0);
inembed[i]=istrides[i-1]/istrides[i]; // assert( inembed[i] <= ion[i] );
}
// NOTE(review): these three assignments repeat the member initializers above
direction_ = s;
idata_ = reinterpret_cast<complex_type const*>(raw_pointer_cast(base(i))) ;
odata_ = const_cast<complex_type*>(reinterpret_cast<complex_type* >(raw_pointer_cast(base(o))));
switch(::cufftPlanMany(
/*cufftHandle *plan*/ &h_,
/*int rank*/ ion.size(),
/*int *n*/ ion.data(), // /*NX*/ last - first,
/*int *inembed*/ inembed.data(),
/*int istride*/ istride,
/*int idist*/ 1, //stride(first),
/*int *onembed*/ onembed.data(),
/*int ostride*/ ostride,
/*int odist*/ 1, //stride(d_first),
/*cufftType type*/ CUFFT_Z2Z,
/*int batch*/ 1 //BATCH
)){
case CUFFT_SUCCESS : break;// "cuFFT successfully executed the FFT plan."
case CUFFT_ALLOC_FAILED : throw std::runtime_error{"CUFFT failed to allocate GPU memory."};
case CUFFT_INVALID_VALUE : throw std::runtime_error{"At least one of the parameters idata, odata, and direction is not valid."};
case CUFFT_INTERNAL_ERROR : throw std::runtime_error{"Used for all internal driver errors."};
case CUFFT_SETUP_FAILED : throw std::runtime_error{"The cuFFT library failed to initialize."};
case CUFFT_INVALID_SIZE : throw std::runtime_error{"The user specifies an unsupported FFT size."};
default : throw std::runtime_error{"cufftPlanMany unknown error"};
}
}
// Builds a batched Z2Z plan: one transform per element of [first, last), each
// element being a sub-array of dimensionality D; results go to the sub-arrays
// starting at d_first. The distance between consecutive batches is the iterator
// stride, the batch count is `last - first`.
// The two preprocessor branches differ only in how the signature constrains the
// output iterator (trailing return type vs. an extra template parameter).
// The last (unsigned) parameter is unnamed and unused.
// Throws std::runtime_error when cufftPlanMany reports an error.
#ifndef __INTEL_COMPILER
template<class It1, class It2, dimensionality_type D = decltype(*It1{})::dimensionality>
static auto many(It1 first, It1 last, It2 d_first, int sign = 0, unsigned = 0)
->std::decay_t<decltype(const_cast<complex_type*>(reinterpret_cast<complex_type*>(raw_pointer_cast(base(d_first)))), std::declval<plan>())>
#else
template<class It1, class It2,
dimensionality_type D = decltype(*It1{})::dimensionality,
typename TT = decltype(const_cast<complex_type*>(reinterpret_cast<complex_type*>(It2{}.base().raw_pointer_cast())))
>
static auto many(It1 first, It1 last, It2 d_first, int sign = 0, unsigned = 0)
#endif
{
assert( CUFFT_FORWARD == sign or CUFFT_INVERSE == sign or sign == 0 );
assert(sizes(*first)==sizes(*d_first));
// sizes and strides of one batch element converted to arrays of int
auto ion = std::apply([](auto... t){return std::array< size_type, D>{static_cast< size_type>(t)...};}, sizes (* first));
// NOTE(review): this dereferences `last`, which callers pass as an end iterator — confirm that is acceptable for these iterators
assert(strides(*first) == strides(*last));
auto istrides = std::apply([](auto... t){return std::array<ssize_type, D>{static_cast<ssize_type>(t)...};}, strides(* first));
auto ostrides = std::apply([](auto... t){return std::array<ssize_type, D>{static_cast<ssize_type>(t)...};}, strides(*d_first));
// (input stride, output stride, size) per dimension, sorted in descending lexicographic order
std::array<std::tuple<int, int, int>, std::decay_t<decltype(*It1{})>::dimensionality> ssn;
for(std::size_t i = 0; i != ssn.size(); ++i) ssn[i] = std::make_tuple(istrides[i], ostrides[i], ion[i]);
std::sort(ssn.begin(), ssn.end(), std::greater<>{});
for(std::size_t i = 0; i != ssn.size(); ++i){
istrides[i] = std::get<0>(ssn[i]);
ostrides[i] = std::get<1>(ssn[i]);
ion[i] = std::get<2>(ssn[i]);
}
// after sorting, the last entry holds the smallest stride
int istride = istrides.back();
auto inembed = istrides; inembed.fill(0);
int ostride = ostrides.back();
auto onembed = ostrides; onembed.fill(0);
// nembed[k] is the ratio between consecutive strides; entry 0 is left at 0
for(std::size_t i = 1; i != onembed.size(); ++i){
assert(ostrides[i-1] >= ostrides[i]); // otherwise ordering is incompatible
assert(ostrides[i-1]%ostrides[i]==0);
onembed[i]=ostrides[i-1]/ostrides[i]; // assert( onembed[i] <= ion[i] );
assert(istrides[i-1]%istrides[i]==0);
inembed[i]=istrides[i-1]/istrides[i]; // assert( inembed[i] <= ion[i] );
}
// start from an empty plan and fill in direction, pointers and handle
plan ret;
ret.direction_ = sign;
ret.idata_ = reinterpret_cast<complex_type const*>( first.base().raw_pointer_cast()) ;
ret.odata_ = const_cast<complex_type*>(reinterpret_cast<complex_type* >(d_first.base().raw_pointer_cast()));
switch(::cufftPlanMany(
/*cufftHandle *plan*/ &ret.h_,
/*int rank*/ ion.size(),
/*int *n*/ ion.data(), // /*NX*/ last - first,
/*int *inembed*/ inembed.data(),
/*int istride*/ istride,
/*int idist*/ stride(first),
/*int *onembed*/ onembed.data(),
/*int ostride*/ ostride,
/*int odist*/ stride(d_first),
/*cufftType type*/ CUFFT_Z2Z,
/*int batch*/ last - first //BATCH
)){
case CUFFT_SUCCESS : break;// "cuFFT successfully executed the FFT plan."
// case CUFFT_INVALID_PLAN : throw std::runtime_error{"The plan parameter is not a valid handle."};
case CUFFT_ALLOC_FAILED : throw std::runtime_error{"CUFFT failed to allocate GPU memory."};
// case CUFFT_INVALID_TYPE : throw std::runtime_error{"The user requests an unsupported type."};
case CUFFT_INVALID_VALUE : throw std::runtime_error{"At least one of the parameters idata, odata, and direction is not valid."};
case CUFFT_INTERNAL_ERROR : throw std::runtime_error{"Used for all internal driver errors."};
// case CUFFT_EXEC_FAILED : throw std::runtime_error{"CUFFT failed to execute an FFT on the GPU."};
case CUFFT_SETUP_FAILED : throw std::runtime_error{"The cuFFT library failed to initialize."};
case CUFFT_INVALID_SIZE : throw std::runtime_error{"The user specifies an unsupported FFT size."};
// case CUFFT_UNALIGNED_DATA : throw std::runtime_error{"Unaligned data."};
// case CUFFT_INCOMPLETE_PARAMETER_LIST: throw std::runtime_error{"Incomplete parameter list."};
// case CUFFT_INVALID_DEVICE : throw std::runtime_error{"Invalid device."};
// case CUFFT_PARSE_ERROR : throw std::runtime_error{"Parse error."};
// case CUFFT_NO_WORKSPACE : throw std::runtime_error{"No workspace."};
// case CUFFT_NOT_IMPLEMENTED: throw std::runtime_error{"Not implemented."};
// case CUFFT_LICENSE_ERROR : throw std::runtime_error{"License error."};
// case CUFFT_NOT_SUPPORTED : throw std::runtime_error{"CUFFT_NOT_SUPPORTED"};
default : throw std::runtime_error{"cufftPlanMany unknown error"};
}
return ret;
}
};
// Out-of-class definition of the per-thread execution counter, starting at 0.
// NOTE(review): this is a non-inline definition inside a header; including the
// header from more than one translation unit may yield duplicate definitions — confirm.
thread_local int plan::tl_execute_count = 0;
// Out-of-place transform of the whole array: builds a temporary plan for (i, o)
// with direction s, executes it, and returns `o` forwarded.
// Only participates in overload resolution when such a plan can be constructed.
template<typename In, class Out>
auto dft(In const& i, Out&& o, int s)
->decltype(cufft::plan{i, o, s}(), std::forward<Out>(o)){
return cufft::plan{i, o, s}(), std::forward<Out>(o);}
// Transform of the whole array into a newly created array with the same
// extensions and allocator as the input; the new array is returned.
template<typename In, typename R = multi::array<typename In::element_type, In::dimensionality, decltype(get_allocator(std::declval<In>()))>>
NODISCARD("when first argument is const")
R dft(In const& i, int s){
	using element = typename In::element_type;
	static_assert(std::is_trivially_default_constructible<element>::value, "!");
	R result(extensions(i), get_allocator(i));
	cufft::dft(i, result, s);  // fills `result` out of place
	return result;
}
// Batched transform: builds a plan over the sub-arrays in [first, last) writing
// to the sub-arrays starting at d_first, executes it, and returns the iterator
// one past the last output written.
// (The definition used to be repeated verbatim in both branches of an
// `#ifndef __INTEL_COMPILER` conditional; a single copy is equivalent.)
template<typename It1, typename It2>
auto many_dft(It1 first, It1 last, It2 d_first, sign s)
->decltype(plan::many(first, last, d_first, s)(), d_first + (last - first)){
	return plan::many(first, last, d_first, s)(), d_first + (last - first);}
// One-dimensional case of the selective transform: transforms when which[0] is
// set, otherwise just assigns the input to the output. Returns `o` forwarded.
template<typename In, class Out, std::size_t D = In::dimensionality, std::enable_if_t<(D==1), int> = 0>
Out&& dft(std::array<bool, D> which, In const& i, Out&& o, int s){
	if(not which[0]){
		// nothing to transform: plain copy
		return std::forward<Out>(std::forward<Out>(o) = i);
	}
	return cufft::dft(i, std::forward<Out>(o), s);
}
// Helper for array_tail: picks elements 1..N-1 of `t` through the index pack.
template <class Array, std::size_t... Ns>
constexpr auto array_tail_impl(Array const& t, std::index_sequence<Ns...>){
	using tail_type = std::array<typename Array::value_type, std::tuple_size<Array>::value - 1>;
	return tail_type{std::get<Ns + 1>(t)...};
}
// Returns a std::array holding every element of `t` except the first.
template<class Array>
constexpr auto array_tail(Array const& t)
->decltype(array_tail_impl(t, std::make_index_sequence<std::tuple_size<Array>::value - 1>())){
	return array_tail_impl(t, std::make_index_sequence<std::tuple_size<Array>::value - 1>());
}
// Selective transform for D > 1: which[k] tells whether dimension k is
// transformed. Dispatches to a whole-array plan, to a batched plan, or
// recurses on rotated/flattened/sub-array views. Returns `o` forwarded.
template<typename In, class Out, std::size_t D = In::dimensionality, std::enable_if_t<(D>1), int> = 0>
auto dft(std::array<bool, D> which, In const& i, Out&& o, int s)
->decltype(many_dft(i.begin(), i.end(), o.begin(), s),std::forward<Out>(o))
{
assert(extension(i) == extension(o));
// first dimension after the leading one that is NOT transformed
auto ff = std::find(begin(which)+1, end(which), false);
if(which[0] == true){
// every dimension is transformed: a single plan over the whole array
if(ff==end(which)) cufft::dft(i, std::forward<Out>(o), s);
else{
// rotate so that the first non-transformed dimension becomes the leading one, then recurse
auto n = ff - which.begin();
std::rotate(begin(which), ff, end(which));
dft(which, i<<n, o<<n, s);
}
}else if(which[0]==false){
// no dimension is transformed: only a copy may be needed
if(D==1 or std::none_of(begin(which)+1, end(which), [](auto e){return e;})){
if(base(o) != base(i)) std::forward<Out>(o) = i;
// same base pointer but different layout: assign from `+i`
// (presumably a copy of i — confirm against the array's unary operator+)
else if(o.layout() != i.layout()) std::forward<Out>(o) = +i;
}
// all dimensions but the leading one are transformed: one batched plan
else if(ff==end(which)) many_dft(i.begin(), i.end(), o.begin(), s);
else{
std::array<bool, D-1> tail = array_tail(which);
// two leading non-transformed dimensions that can be merged into one
if(which[1] == false and i.is_flattable() and o.is_flattable()) cufft::dft(tail, i.flatted(), o.flatted(), s);
else{
// among rotations d whose tail[d] is false, pick the one with the smallest leading size
// NOTE(review): `d` is a signed int compared against the unsigned D - 1
auto d_min = 0; auto n_min = size(i);
for(auto d = 0; d != D - 1; ++d){
if((size(i<<d) < n_min) and (tail[d]==false)){
n_min = size(i<<d);
d_min = d;
}
}
if( d_min!=0 ){
std::rotate(which.begin(), which.begin()+d_min, which.end());
dft(which, i<<d_min, o<<d_min, s);
}else
{
// loop over the leading index, transforming each sub-array with `tail`;
// when input and output share the base pointer but differ in layout,
// the input is first taken through `+i` into a temporary
if(base(i) == base(o) and i.layout() != o.layout()){
auto const tmp = +i;
for(auto idx : extension(i)) cufft::dft(tail, tmp[idx], o[idx], s);
}else for(auto idx : extension(i)) cufft::dft(tail, i[idx], o[idx], s);
}
}
}
}
return std::forward<Out>(o);
}
// Selective transform into a new array: constructs a temporary of
// In::decay_type with the extensions and allocator of `i`, transforms into it,
// and returns the result by value (std::decay_t of the call's type).
template<typename In, std::size_t D = In::dimensionality>
NODISCARD("when passing a const argument")
auto dft(std::array<bool, D> which, In const& i, int sign)->std::decay_t<decltype(
dft(which, i, typename In::decay_type(extensions(i), get_allocator(i)), sign))>{return
dft(which, i, typename In::decay_type(extensions(i), get_allocator(i)), sign);}
// Selective transform in place: `i` is used as both input and output, and is
// returned forwarded.
template<typename In, std::size_t D = In::dimensionality>
auto dft(std::array<bool, D> which, In&& i, int sign)
->decltype(dft(which, i, i, sign), std::forward<In>(i)){
return dft(which, i, i, sign), std::forward<In>(i);}
//template<typename... A> auto dft_forward(A&&... a)
//->decltype(cufft::dft(std::forward<A>(a)..., cufft::forward)){
// return cufft::dft(std::forward<A>(a)..., cufft::forward);}
// dft_forward overloads: each forwards to cufft::dft with cufft::forward as the direction.
// (arr, a): passes `arr` and a const array `a` on to cufft::dft and returns its result.
template<typename Array, typename A> NODISCARD("when passing a const argument")
auto dft_forward(Array arr, A const& a)
->decltype(cufft::dft(arr, a, cufft::forward)){
return cufft::dft(arr, a, cufft::forward);}
// (arr, rvalue cuda array): calls cufft::dft(arr, a, forward) with `a` as an lvalue, then returns `a` moved out.
template<typename Array, dimensionality_type D> NODISCARD("when passing a const argument")
auto dft_forward(Array arr, multi::cuda::array<std::complex<double>, D>&& a)
->decltype(cufft::dft(arr, a, cufft::forward), multi::cuda::array<std::complex<double>, D>{}){//assert(0);
return cufft::dft(arr, a, cufft::forward), std::move(a);}
// (a): whole-array transform; returns whatever cufft::dft(a, forward) returns.
template<typename A> NODISCARD("when passing a const argument")
auto dft_forward(A const& a)
->decltype(cufft::dft(a, cufft::forward)){
return cufft::dft(a, cufft::forward);}
// dft_backward overloads: each forwards to cufft::dft with cufft::backward as the direction.
// Generic form: perfectly forwards all arguments and appends the direction.
template<typename... A> auto dft_backward(A&&... a)
->decltype(cufft::dft(std::forward<A>(a)..., cufft::backward)){
return cufft::dft(std::forward<A>(a)..., cufft::backward);}
// (arr, a): passes `arr` and a const array `a` on to cufft::dft and returns its result.
template<typename Array, typename A> NODISCARD("when passing a const argument")
auto dft_backward(Array arr, A const& a)
->decltype(cufft::dft(arr, a, cufft::backward)){
return cufft::dft(arr, a, cufft::backward);}
// (a): whole-array transform; returns whatever cufft::dft(a, backward) returns.
template<typename A> NODISCARD("when passing a const argument")
auto dft_backward(A const& a)
->decltype(cufft::dft(a, cufft::backward)){
return cufft::dft(a, cufft::backward);}
}
}}
#if not __INCLUDE_LEVEL__ // TEST BELOW
#define BOOST_TEST_MODULE "C++ Unit Tests for Multi cuFFT adaptor"
#define BOOST_TEST_DYN_LINK
#include<boost/test/unit_test.hpp>
#include <boost/timer/timer.hpp>
#include "../adaptors/cuda.hpp"
#include "../adaptors/fftw.hpp"
#include "../adaptors/cufft.hpp"
//#include "../adaptors/fft.hpp"
#include<complex>
#include<thrust/complex.h>
#include "../complex.hpp"
#include<cuda_runtime.h> // cudaDeviceSynchronize
#include<iostream>
namespace multi = boost::multi;
using complex = std::complex<double>;
namespace utf = boost::unit_test;
// Passes `value` to an empty asm statement as a read-write memory operand.
// The statement emits no instructions and the value itself is not modified.
template <class T>
__attribute__((always_inline)) inline void DoNotOptimize(T const& value) {
	T& target = const_cast<T&>(value);
	asm volatile("" : "+m"(target));
}
// Scope timer: synchronizes the device and records the time on construction;
// on destruction synchronizes again and prints "<label>: <seconds> sec" to std::cerr.
struct watch : private std::chrono::high_resolution_clock{
	std::string label_;
	time_point  start_;
	watch(std::string label ="") : label_{label}, start_{}{
		cudaDeviceSynchronize();  // wait for pending device work before starting the clock
		start_ = now();
	}
	~watch(){
		cudaDeviceSynchronize();  // include device work issued during the timed scope
		std::chrono::duration<double> const elapsed = now() - start_;
		auto const seconds = elapsed.count();
		std::cerr<< label_<<": "<< seconds <<" sec"<<std::endl;
	}
};
// Imaginary unit, used to write the complex literals in the tests below.
constexpr complex I{0, 1};
#if 1
// Compares 2D forward transforms computed on device and managed arrays against
// an FFTW reference computed on the host (only imaginary parts of selected
// elements are compared, within the test-case tolerance).
BOOST_AUTO_TEST_CASE(cufft_2D, *boost::unit_test::tolerance(0.0001)){
multi::array<complex, 2> const in_cpu = {
{ 1. + 2.*I, 9. - 1.*I, 2. + 4.*I},
{ 3. + 3.*I, 7. - 4.*I, 1. + 9.*I},
{ 4. + 1.*I, 5. + 3.*I, 2. + 4.*I},
{ 3. - 1.*I, 8. + 7.*I, 2. + 1.*I},
{ 31. - 1.*I, 18. + 7.*I, 2. + 10.*I}
};
// host reference
multi::array<complex, 2> fw_cpu(extensions(in_cpu));
multi::fftw::dft(in_cpu, fw_cpu, multi::fftw::forward);
// device arrays, out-of-place into a preallocated output
multi::cuda::array<complex, 2> const in_gpu = in_cpu;
multi::cuda::array<complex, 2> fw_gpu(extensions(in_gpu));
multi::cufft::dft(in_gpu, fw_gpu, multi::cufft::forward);
BOOST_TEST( imag(static_cast<complex>(fw_gpu[3][2]) - fw_cpu[3][2]) == 0. );
// device arrays, result returned as a new array
auto fw2_gpu = multi::cufft::dft(in_gpu, multi::cufft::forward);
BOOST_TEST( imag(static_cast<complex>(fw2_gpu[3][1]) - fw_cpu[3][1]) == 0. );
// managed arrays
// NOTE(review): fw_mng takes its extensions from in_gpu rather than in_mng (both are copies of in_cpu)
multi::cuda::managed::array<complex, 2> const in_mng = in_cpu;
multi::cuda::managed::array<complex, 2> fw_mng(extensions(in_gpu));
multi::cufft::dft(in_mng, fw_mng, multi::cufft::forward);
BOOST_TEST( imag(fw_mng[3][2] - fw_cpu[3][2]) == 0. );
// NOTE(review): this call goes through fftw::dft on a managed array, unlike fw2_gpu above which uses cufft::dft — confirm it is intentional
auto fw2_mng = multi::fftw::dft(in_mng, multi::fftw::forward);
BOOST_TEST( imag(fw2_mng[3][1] - fw_cpu[3][1]) == 0. );
}
// Runs a 300x300x300 forward transform on host (FFTW), device and managed
// arrays. Inputs are filled with 10. and outputs with 99.; the host and device
// cases check that one output element was overwritten.
BOOST_AUTO_TEST_CASE(cufft_3D_timing, *boost::unit_test::tolerance(0.0001)){
	auto x = std::make_tuple(300, 300, 300);
	{
		// host reference with FFTW
		multi::array<complex, 3> const in_cpu(x, 10.);
		BOOST_ASSERT( in_cpu.num_elements()*sizeof(complex) < 2e9 );
		multi::array<complex, 3> fw_cpu(extensions(in_cpu), 99.);
		{
		//	boost::timer::auto_cpu_timer t;  // 1.041691s wall, 1.030000s user + 0.000000s system = 1.030000s CPU (98.9%)
			multi::fftw::dft(in_cpu, fw_cpu, multi::fftw::forward);
			BOOST_TEST( fw_cpu[8][9][10] != 99. );
		}
	}
	{
		// device memory
		multi::cuda::array<complex, 3> const in_gpu(x, 10.);
		multi::cuda::array<complex, 3> fw_gpu(extensions(in_gpu), 99.);
		{
		//	boost::timer::auto_cpu_timer t; //  0.208237s wall, 0.200000s user + 0.010000s system = 0.210000s CPU (100.8%)
			// use the cuFFT direction constant with the cuFFT entry point,
			// like every other cufft::dft call in this file
			multi::cufft::dft(in_gpu, fw_gpu, multi::cufft::forward);
			BOOST_TEST( static_cast<complex>(fw_gpu[8][9][10]) != 99. );
		}
	}
	{
		// managed memory; the same transform is run twice in a row
		multi::cuda::managed::array<complex, 3> const in_gpu(x, 10.);
		multi::cuda::managed::array<complex, 3> fw_gpu(extensions(in_gpu), 99.);
		{
		//	boost::timer::auto_cpu_timer t; //  0.208237s wall, 0.200000s user + 0.010000s system = 0.210000s CPU (100.8%)
			multi::cufft::dft(in_gpu, fw_gpu, multi::cufft::forward);
		//	BOOST_TEST( fw_gpu[8][9][10].operator complex() != 99. );
		}
		{
		//	boost::timer::auto_cpu_timer t; //  0.208237s wall, 0.200000s user + 0.010000s system = 0.210000s CPU (100.8%)
			multi::cufft::dft(in_gpu, fw_gpu, multi::cufft::forward);
		//	BOOST_TEST( fw_gpu[8][9][10].operator complex() != 99. );
		}
	}
}
// For several choices of which dimensions to transform, runs the 4D transform
// through FFTW on the host and through cuFFT on device and managed arrays, in
// out-of-place, in-place, moved-from and new-array variants. One element of
// each checked result is compared with the host out-of-place result `out`.
// Each variant runs inside an immediately-invoked lambda whose init-capture
// holds a `watch`, so the elapsed time is printed when the lambda object is destroyed.
BOOST_AUTO_TEST_CASE(cufft_combinations, *utf::tolerance(0.00001)){
// random input, generated once
auto const in = []{
multi::array<complex, 4> ret({32, 90, 98, 96});
std::generate(ret.data_elements(), ret.data_elements() + ret.num_elements(),
[](){return complex{std::rand()*1./RAND_MAX, std::rand()*1./RAND_MAX};}
);
return ret;
}();
std::clog<<"memory size "<< in.num_elements()*sizeof(complex)/1e6 <<" MB\n";
multi::cuda::array<complex, 4> const in_gpu = in;
multi::cuda::managed::array<complex, 4> const in_mng = in;
using std::clog;
// each entry flags, per dimension, whether that dimension is transformed
for(auto c : std::vector<std::array<bool, 4>>{
{false, true , true , true },
{false, true , true , false},
{true , false, false, false},
{true , true , false, false},
{false, false, true , false},
{false, false, false, false},
}){
std::clog<<"case "; copy(begin(c), end(c), std::ostream_iterator<bool>{std::clog,", "}); std::clog<<std::endl;
multi::array<complex, 4> out = in;
multi::array<complex, 4> in_rw = in;
// host, out of place: produces the reference `out`
[&, _ = watch{"cpu_opl "}]{
multi::fftw::dft_forward(c, in, out);
}();
// host, in place
[&, _ = watch{"cpu_ipl "}]{
multi::fftw::dft(c, in_rw, multi::fftw::forward);
BOOST_TEST( abs( static_cast<multi::complex<double>>(in_rw[5][4][3][1]) - multi::complex<double>(out[5][4][3][1]) ) == 0. );
}();
{
// host, input passed as an rvalue; the source is required to be empty afterwards
multi::array<complex, 4> in_rw2 = in;
[&, _ = watch{"cpu_mov "}]{
multi::array<complex, 4> const out_mov = multi::fftw::dft_forward(c, std::move(in_rw2));
// what(out_mov);
BOOST_TEST( abs( static_cast<multi::complex<double>>(out_mov[5][4][3][1]) - multi::complex<double>(out[5][4][3][1]) ) == 0. );
BOOST_REQUIRE( is_empty(in_rw2) );
BOOST_REQUIRE( extensions(out_mov) == extensions(in) );
}();
}
// host, result returned as a new array
[&, _ = watch{"cpu_new "}]{
auto const out_cpy = multi::fftw::dft_forward(c, in);
BOOST_TEST( abs( static_cast<multi::complex<double>>(out_cpy[5][4][3][1]) - multi::complex<double>(out[5][4][3][1]) ) == 0. );
}();
// device, out of place
multi::cuda::array<complex, 4> out_gpu(extensions(in_gpu));
[&, _ = watch{"gpu_opl "}]{
multi::cufft::dft(c, in_gpu , out_gpu, multi::cufft::forward);
BOOST_TEST( abs( static_cast<complex>(out_gpu[5][4][3][1]) - out[5][4][3][1] ) == 0. );
}();
{
// device, in place
multi::cuda::array<complex, 4> in_rw_gpu = in_gpu;
[&, _ = watch{"gpu_ipl "}]{
multi::cufft::dft(c, in_rw_gpu, multi::cufft::forward);
BOOST_TEST( abs( static_cast<complex>(in_rw_gpu[5][4][3][1]) - out[5][4][3][1] ) == 0. );
}();
}
{
// device, input passed as an rvalue (result checks are commented out)
multi::cuda::array<complex, 4> in_rw_gpu = in_gpu;
[&, _ = watch{"gpu_mov "}]{
multi::cuda::array<complex, 4> const out_mov = multi::cufft::dft_forward(c, std::move(in_rw_gpu));
// BOOST_REQUIRE( in_rw_gpu.empty() );
// BOOST_TEST( abs( static_cast<complex>(out_mov[5][4][3][1]) - out[5][4][3][1] ) == 0. );
}();
}
{
// device, explicit move followed by an in-place transform
// (prints under the same "gpu_mov " label as the block above)
multi::cuda::array<complex, 4> in_rw_gpu = in_gpu;
[&, _ = watch{"gpu_mov "}]{
multi::cuda::array<complex, 4> out_mov = std::move(in_rw_gpu);
multi::cufft::dft(c, out_mov, multi::cufft::forward);
// BOOST_REQUIRE( in_rw_gpu.empty() );
// BOOST_TEST( abs( static_cast<complex>(out_mov[5][4][3][1]) - out[5][4][3][1] ) == 0. );
}();
}
cudaDeviceSynchronize();
// device, result returned as a new array (not checked)
[&, _ = watch{"gpu_new "}]{
multi::cuda::array<complex, 4> const out_cpy = multi::cufft::dft(c, in_gpu, multi::cufft::forward);
}();
// managed memory: the same out-of-place call is made twice ("cld" then "hot")
multi::cuda::managed::array<complex, 4> out_mng(extensions(in_mng));
[&, _ = watch{"mng_cld "}]{
multi::cufft::dft(c, in_mng, out_mng, multi::cufft::forward);
BOOST_TEST( abs( out_mng[5][4][3][1] - out[5][4][3][1] ) == 0. );
}();
[&, _ = watch{"mng_hot "}]{
multi::cufft::dft(c, in_mng , out_mng, multi::cufft::forward);
BOOST_TEST( abs( out_mng[5][4][3][1] - out[5][4][3][1] ) == 0. );
}();
// managed memory, result returned as a new array (the local out_mng shadows the outer one)
[&, _ = watch{"mng_new "}]{
auto const out_mng = multi::cufft::dft(c, in_mng, multi::cufft::forward);
BOOST_TEST( abs( out_mng[5][4][3][1] - out[5][4][3][1] ) == 0. );
}();
}
std::clog<<std::endl;
}
// Sets up random 4D host data and device input/output arrays for a batched
// transform. The transform and its comparison are disabled with `#if 0`, so as
// compiled the test only constructs the arrays.
BOOST_AUTO_TEST_CASE(cufft_many_3D, *utf::tolerance(0.00001) ){
auto const in_cpu = []{
multi::array<complex, 4> ret({45, 18, 32, 16});
std::generate(
ret.data_elements(), ret.data_elements() + ret.num_elements(),
[](){return complex{std::rand()*1./RAND_MAX, std::rand()*1./RAND_MAX};}
);
return ret;
}();
multi::cuda::array<complex, 4> const in = in_cpu;
multi::cuda::array<complex, 4> out(extensions(in));
// disabled: batched device transform compared against a host one
#if 0
multi::cufft::many_dft(begin(unrotated(in)), end(unrotated(in)), begin(unrotated(out)), +1);
multi::array<complex, 4> out_cpu(extensions(in));
multi::fft::many_dft(begin(unrotated(in_cpu)), end(unrotated(in_cpu)), begin(unrotated(out_cpu)), +1);
BOOST_TEST( imag( static_cast<complex>(out[5][4][3][2]) - out_cpu[5][4][3][2]) == 0. );
#endif
}
#if 0
// Disabled test (inside an enclosing `#if 0`): batched transform over a rotated
// 10x10x10 array, host versus device.
// NOTE(review): it calls multi::fft::many_dft while the include of
// "../adaptors/fft.hpp" is commented out in this file — would need attention if re-enabled.
BOOST_AUTO_TEST_CASE(cufft_4D, *utf::tolerance(0.00001)){
auto const in = []{
multi::array<complex, 3> ret({10, 10, 10});
std::generate(ret.data_elements(), ret.data_elements() + ret.num_elements(),
[](){return complex{std::rand()*1./RAND_MAX, std::rand()*1./RAND_MAX};}
);
return ret;
}();
multi::array<complex, 3> out(extensions(in));
// multi::fftw::dft({true, false, true}, in, out, multi::fftw::forward);
multi::fft::many_dft(begin(in<<1), end(in<<1), begin(out<<1), multi::fftw::forward);
multi::cuda::array<complex, 3> in_gpu = in;
multi::cuda::array<complex, 3> out_gpu(extensions(in));
// multi::cufft::dft({true, false, true}, in_gpu, out_gpu, multi::fft::forward);//multi::cufft::forward);
multi::cufft::many_dft(begin(in_gpu<<1), end(in_gpu<<1), begin(out_gpu<<1), multi::fftw::forward);
BOOST_TEST( imag( static_cast<complex>(out_gpu[5][4][3]) - out[5][4][3]) == 0. );
}
//BOOST_AUTO_TEST_CASE(cu
#if 0
// Disabled test (inside enclosing `#if 0` blocks): transforms the first two
// dimensions of a 10x10x10 host array holding a single non-zero element and
// prints the result. The device part is disabled by a further nested `#if 0`.
// NOTE(review): shares its name with the other disabled cufft_4D test case;
// one of them must be renamed if both are ever re-enabled.
BOOST_AUTO_TEST_CASE(cufft_4D){
	auto const in = []{
		multi::array<complex, 3> ret({10, 10, 10});
		ret[2][3][4] = 99.;
		return ret;
	}();
	multi::array<complex, 3> out(extensions(in));
	multi::fftw::dft({true, true, false}, in, out, multi::fftw::forward);
//	auto fwd = multi::fftw::dft({true, true, true, true}, in, out, multi::fftw::forward);
//	BOOST_REQUIRE(in[2][3][4][5] == 99.);
	std::cout << out[9][1][2] << std::endl;
	// `std::cout` (was `std:cout`, which parses as a label followed by an undeclared `cout`)
	for(auto i = 0; i != out.num_elements(); ++i) std::cout << (out.data_elements()[i]) <<' ';
#if 0
	multi::cuda::array<complex, 3> in_gpu = in;//[]{
//		multi::cuda::array<complex, 4> ret({10, 10, 10, 10});
//		ret[2][3][4][5] = 99.;
//		return ret;
//	}();
	multi::cuda::array<complex, 3> out_gpu(extensions(in));
	multi::cufft::dft({true, true, false}, in_gpu, out_gpu, multi::cufft::forward);
	std::cout << out_gpu[5][4][3].operator complex() << std::endl;
//	multi::cufft::dft({true, true, true, true}, in_gpu, out_gpu, multi::cufft::forward);
//	multi::cufft::dft({true, true, true, true}, in_gpu, out_gpu, multi::cufft::forward);
#endif
}
#endif
#endif
#endif
#endif
#endif

View File

@ -0,0 +1,131 @@
#ifdef COMPILATION// -*-indent-tabs-mode:t;c-basic-offset:4;tab-width:4;autowrap:nil;-*-
$CXX $0 -o $0x -lcudart -lcufft `pkg-config --libs fftw3` -lboost_timer -lboost_unit_test_framework&&$0x&&rm $0x;exit
#endif
// © Alfredo A. Correa 2020
#ifndef MULTI_ADAPTORS_FFT_HPP
#define MULTI_ADAPTORS_FFT_HPP
#include "../adaptors/fftw.hpp"
#include "../adaptors/cufft.hpp"
namespace boost{
namespace multi{
namespace fft{
// Direction constants of the unified fft interface; forward and backward take
// their values from the fftw adaptor, none is 0. All three must be distinct.
static constexpr int forward = fftw::forward;//FFTW_FORWARD;
static constexpr int none = 0;
static constexpr int backward = fftw::backward;//FFTW_BACKWARD;
static_assert( forward != none and none != backward and backward != forward, "!");
// Overload-ranking tag: priority<I> derives from priority<I-1>, and priority<0>
// derives from std::true_type, so a priority<N> argument binds best to the
// overload taking the highest priority not above N.
template<std::size_t I> struct priority : std::conditional_t<(I != 0), struct priority<I-1>, std::true_type>{};
// dft dispatch: the public overloads call dft_aux_ with priority<1>, so the
// cufft overload is an exact match on the tag and the fftw overload needs a
// derived-to-base conversion.
// (DECLRETURN is defined elsewhere; presumably it expands to a trailing
// decltype return plus the return statement, so an overload drops out when its
// expression is ill-formed — confirm against the macro.)
template<class... Args> auto dft_aux_(priority<0>, Args&&... args) DECLRETURN( fftw::dft(std::forward<Args>(args)...))
template<class... Args> auto dft_aux_(priority<1>, Args&&... args) DECLRETURN(cufft ::dft(std::forward<Args>(args)...))
template<class... Args> auto dft(Args&&... args) DECLRETURN(dft_aux_(priority<1>{}, std::forward<Args>(args)...))
// overload taking an explicit std::array<bool, D> `which` as first argument, D being the dimensionality of `in`
template<class In, class... Args> auto dft(std::array<bool, std::decay_t<In>::dimensionality> which, In&& in, Args&&... args) DECLRETURN(dft_aux_(priority<1>{}, which, std::forward<In>(in), std::forward<Args>(args)...))
template<class... Args> auto many_dft_aux_(priority<0>, Args&&... args) DECLRETURN( fftw::many_dft(std::forward<Args>(args)...))
template<class... Args> auto many_dft_aux_(priority<1>, Args&&... args) DECLRETURN(cufft ::many_dft(std::forward<Args>(args)...))
template<class... Args> auto many_dft(Args&&... args) DECLRETURN(many_dft_aux_(priority<1>{}, std::forward<Args>(args)...))
template<class... Args> auto dft_forward_aux_(priority<0>, Args&&... args) DECLRETURN( fftw::dft_forward(std::forward<Args>(args)...))
template<class... Args> auto dft_forward_aux_(priority<1>, Args&&... args) DECLRETURN(cufft ::dft_forward(std::forward<Args>(args)...))
template<class... Args> auto dft_forward(Args&&... args) DECLRETURN(dft_forward_aux_(priority<1>{}, std::forward<Args>(args)...))
template<class In, class... Args> auto dft_forward(std::array<bool, std::decay_t<In>::dimensionality> which, In&& in, Args&&... args) DECLRETURN(dft_forward_aux_(priority<1>{}, which, std::forward<In>(in), std::forward<Args>(args)...))
template<class... Args> auto dft_backward_aux_(priority<0>, Args&&... args) DECLRETURN( fftw::dft_backward(std::forward<Args>(args)...))
template<class... Args> auto dft_backward_aux_(priority<1>, Args&&... args) DECLRETURN(cufft ::dft_backward(std::forward<Args>(args)...))
template<class... Args> auto dft_backward(Args&&... args) DECLRETURN(dft_backward_aux_(priority<1>{}, std::forward<Args>(args)...))
template<class In, class... Args> auto dft_backward(std::array<bool, std::decay_t<In>::dimensionality> which, In&& in, Args&&... args) DECLRETURN(dft_backward_aux_(priority<1>{}, which, std::forward<In>(in), std::forward<Args>(args)...))
}}}
#if not __INCLUDE_LEVEL__
#define BOOST_TEST_MODULE "C++ Unit Tests for Multi FFT adaptor"
#define BOOST_TEST_DYN_LINK
#include<boost/test/unit_test.hpp>
#include <boost/timer/timer.hpp>
#include <boost/config.hpp>
namespace utf = boost::unit_test;
using complex = std::complex<double>;
namespace multi = boost::multi;
using std::cout;
// Runs multi::fft::dft on the same random 4D input held in host, cuda and cuda-managed arrays,
// for several choices of transformed dimensions, printing timings and comparing one output element.
BOOST_AUTO_TEST_CASE(fft_combinations, *utf::tolerance(0.00001)){
cout<< "# threads is " << multi::fftw::plan::with_nthreads() <<"\n";
cout<<"=========================================================\n";
cout<< BOOST_PLATFORM <<' '<< BOOST_COMPILER <<' '<< __DATE__<<'\n';
// Host input: 32x90x98x96 complex values with real and imaginary parts drawn from std::rand()/RAND_MAX.
auto const in = []{
multi::array<complex, 4> ret({32, 90, 98, 96});
std::generate(ret.data_elements(), ret.data_elements() + ret.num_elements(),
[](){return complex{std::rand()/1./RAND_MAX, std::rand()/1./RAND_MAX};}
);
return ret;
}();
std::cout<<"memory size "<< in.num_elements()*sizeof(complex)/1e6 <<" MB\n";
// Copies of the same input in the two cuda array types.
multi::cuda::array<complex, 4> const in_gpu = in;
multi::cuda::managed::array<complex, 4> const in_mng = in;
// Each entry selects which of the four dimensions are transformed.
std::vector<std::array<bool, 4>> cases = {
{false, true , true , true },
{false, true , true , false},
{true , false, false, false},
{true , true , false, false},
{false, false, true , false},
{false, false, false, false},
};
for(auto c : cases){
cout<<"case: "<<std::boolalpha;
copy(begin(c), end(c), std::ostream_iterator<bool>{cout,", "}); cout<<"\n";
multi::array<complex, 4> out(extensions(in));
{
// First host run; the flops figure is read from a separately constructed fftw plan.
cout<<"flops "<< multi::fftw::plan(c, in, out, multi::fft::forward).flops() <<"\n";
boost::timer::auto_cpu_timer t{"cpu____ %ws wall, CPU (%p%)\n"};
multi::fft::dft(c, in, out, multi::fft::forward);
}
{
// Second host run of the same transform.
boost::timer::auto_cpu_timer t{"cpu_hot %ws wall, CPU (%p%)\n"};
multi::fft::dft(c, in, out, multi::fft::forward);
}
multi::cuda::array<complex, 4> out_gpu(extensions(in_gpu));
{
// First run on the cuda arrays; one element is compared against the host result.
boost::timer::auto_cpu_timer t{"gpu_cld %ws wall, CPU (%p%)\n"};
multi::fft::dft(c, in_gpu , out_gpu , multi::fft::forward);
BOOST_TEST( abs( static_cast<complex>(out_gpu[5][4][3][1]) - out[5][4][3][1] ) == 0. );
}
{
// Second run on the cuda arrays (timing only, the check is commented out).
boost::timer::auto_cpu_timer t{"gpu_hot %ws wall, CPU (%p%)\n"};
multi::fft::dft(c, in_gpu , out_gpu , multi::fft::forward);
// BOOST_TEST( abs( static_cast<complex>(out_gpu[5][4][3][1]) - out[5][4][3][1] ) == 0. );
}
multi::cuda::managed::array<complex, 4> out_mng(extensions(in_mng));
{
// First run on the managed arrays; cudaDeviceSynchronize() is called before the element is read.
boost::timer::auto_cpu_timer t{"mng_cld %ws wall, CPU (%p%)\n"};
multi::fft::dft(c, in_mng , out_mng , multi::fft::forward);
cudaDeviceSynchronize();
BOOST_TEST( abs( out_mng[5][4][3][1] - out[5][4][3][1] ) == 0. );
}
{
// Second managed run, this time passing in_mng() / out_mng(); the timer is commented out.
/// boost::timer::auto_cpu_timer t{"mng_hot %ws wall, CPU (%p%)\n"};
multi::fft::dft(c, in_mng() , out_mng() , multi::fft::forward);
cudaDeviceSynchronize();
BOOST_TEST( abs( out_mng[5][4][3][1] - out[5][4][3][1] ) == 0. );
}
}
}
#endif
#endif

View File

@ -0,0 +1,934 @@
#ifdef COMPILATION// -*-indent-tabs-mode:t;c-basic-offset:4;tab-width:4-*-
$CXXX $CXXFLAGS $0 -o $0x$OXX `pkg-config --cflags --libs fftw3 cuda-11.0` -lboost_timer -lboost_unit_test_framework&&$0x$OXX&&rm $0x$OXX;exit
#endif
// © Alfredo A. Correa 2018-2020
#ifndef MULTI_ADAPTORS_FFTW_HPP
#define MULTI_ADAPTORS_FFTW_HPP
#include "../adaptors/../array.hpp"
#include "../adaptors/../config/NODISCARD.hpp"
#include<algorithm> // sort
#include<complex>
#include<numeric> // accumulate
#if HAVE_FFTW3_THREADS
#include <thread>
#endif
#include<fftw3.h> // external fftw3 library
namespace boost{
namespace multi{
namespace fftw{
// template<class T> auto alignment_of(T* p){return ::fftw_alignment_of((double*)p);}
#if __cpp_lib_as_const >= 201510
using std::as_const;
#else
// Local replacement for std::as_const: yields a const lvalue reference to the argument.
template<class T>
constexpr auto as_const(T& t) noexcept -> typename std::add_const<T>::type& {
	return t;
}
#endif
}
#if 0
template<typename Size>
auto fftw_plan_dft_1d(
Size N,
std::complex<double> const* in, std::complex<double>* out, int sign,
unsigned flags = FFTW_ESTIMATE
){
#ifndef NDEBUG
auto check = in[N/3]; // check that const data will not been overwritten
#endif
assert( fftw::alignment_of(in) == fftw::alignment_of(out) );
auto ret=::fftw_plan_dft_1d(N, (fftw_complex*)in, (fftw_complex*)out, sign, flags | FFTW_PRESERVE_INPUT );
assert(check == in[N/3]); // check that const data has not been overwritten
return ret;
}
template<typename Size>
auto fftw_plan_dft_1d(
Size N,
std::complex<double>* in, std::complex<double>* out, int sign,
unsigned flags = FFTW_ESTIMATE
){
assert( fftw::alignment_of(in) == fftw::alignment_of(out) );
return ::fftw_plan_dft_1d(N, (fftw_complex*)in, (fftw_complex*)out, sign, flags);
}
template<typename Size>
auto fftw_plan_dft_2d(
Size N1, Size N2,
std::complex<double> const* in, std::complex<double>* out, int sign,
unsigned flags = FFTW_ESTIMATE
){
assert( fftw::alignment_of(in) == fftw::alignment_of(out) );
#ifndef NDEBUG
auto check = in[N1*N2/3]; // check that const data will not been overwritten
#endif
auto ret = ::fftw_plan_dft_2d(N1, N2, (fftw_complex*)in, (fftw_complex*)out, sign, flags | FFTW_PRESERVE_INPUT);
assert( check == in[N1*N2/3] ); // check that const data has not been overwritten
return ret;
}
template<typename Size>
auto fftw_plan_dft_2d(
Size N1, Size N2,
std::complex<double>* in, std::complex<double>* out, int sign,
unsigned flags = FFTW_ESTIMATE
){
assert(fftw_alignment_of((double*)in) == fftw_alignment_of((double*)out));
return ::fftw_plan_dft_2d(N1, N2, (fftw_complex*)in, (fftw_complex*)out, sign, flags);
}
template<typename Size>
auto fftw_plan_dft_3d(
Size N1, Size N2, Size N3,
std::complex<double>* in, std::complex<double>* out, int sign,
unsigned flags = FFTW_ESTIMATE
){
assert(fftw_alignment_of((double*)in) == fftw_alignment_of((double*)out));
return ::fftw_plan_dft_3d(N1, N2, N3, (fftw_complex*)in, (fftw_complex*)out, sign, flags);
}
template<typename Size>
auto fftw_plan_dft_3d(
Size N1, Size N2, Size N3,
std::complex<double> const* in, std::complex<double>* out, int sign,
unsigned flags = FFTW_ESTIMATE
){
assert( flags & FFTW_PRESERVE_INPUT );
assert(fftw_alignment_of((double*)in) == fftw_alignment_of((double*)out));
return ::fftw_plan_dft_3d(N1, N2, N3, (fftw_complex*)in, (fftw_complex*)out, sign, flags | FFTW_PRESERVE_INPUT);
}
#endif
#if 0
template<typename Rank>
auto fftw_plan_dft(
Rank r, int* ns,
std::complex<double>* in, std::complex<double>* out,
int sign, unsigned flags = FFTW_ESTIMATE
){
assert(fftw_alignment_of((double*)in) == fftw_alignment_of((double*)out));
return ::fftw_plan_dft(r, ns, (fftw_complex*)in, (fftw_complex*)out, sign, flags);
}
template<typename RankType>
auto fftw_plan_dft(
RankType r, int* ns,
std::complex<double> const* in, std::complex<double>* out,
int sign, unsigned flags = FFTW_ESTIMATE | FFTW_PRESERVE_INPUT
){
assert( flags & FFTW_PRESERVE_INPUT );
assert(fftw::alignment_of(in) == fftw::alignment_of(out));
#ifndef NDEBUG
size_t ne = 1; for(RankType i = 0; i != r; ++i) ne*=ns[i];
auto check = in[ne/3]; // check that const data will not been overwritten
#endif
auto ret=::fftw_plan_dft(r, ns, (fftw_complex*)in, (fftw_complex*)out, sign, flags);
assert(check == in[ne/3]); // check that const data has not been overwritten
return ret;
}
#endif
#if 0
template<typename In, typename Out>
auto fftw_plan_dft_1d(
In&& in, Out&& out, int sign, unsigned flags = FFTW_ESTIMATE
){
static_assert(in.dimensionality == 1, "!"); assert(size(in) == size(out));
assert( in.is_compact() ); assert( out.is_compact() );
return multi::fftw_plan_dft_1d(size(in), data_elements(in), data_elements(out), sign, flags);
}
template<class In, class Out>
auto fftw_plan_dft_2d(
In&& in, Out&& out, int sign, unsigned flags = FFTW_ESTIMATE
){
static_assert(in.dimensionality == 2, "!"); assert(in.sizes() == out.sizes());
assert( in.is_compact() ); assert( out.is_compact() );
return multi::fftw_plan_dft_2d(
sizes(in)[0], sizes(in)[1],
data_elements(in), data_elements(out), sign, flags
);
}
template<class In, class Out>
auto fftw_plan_dft_3d(
In&& in, Out&& out, int sign, unsigned flags = FFTW_ESTIMATE
){
static_assert(in.dimensionality == 3, "!"); assert(in.sizes() == out.sizes());
assert( in.is_compact() ); assert( out.is_compact() );
return multi::fftw_plan_dft_3d(
sizes(in)[0], sizes(in)[1], sizes(in)[2],
data(in), data(out),
sign, flags
);
}
#endif
// Converts a tuple-like object into whatever detail::to_array_impl<T> builds from its elements,
// handing it one index per tuple element.
template<class T, class Tpl>
constexpr auto to_array(Tpl const& t){
	using indices = std::make_index_sequence<std::tuple_size<Tpl>::value>;
	return detail::to_array_impl<T>(t, indices{});
}
#if(__cpp_if_constexpr>=201606)
//https://stackoverflow.com/a/35110453/225186
// Identity helper; it is only used inside noexcept(...) below as a probe on the condition expression.
template<class T> constexpr std::remove_reference_t<T> _constx(T&&t){return t;}
// logic_assert(C, M): expands to a static_assert when the noexcept probe is true, otherwise to a run-time assert.
// NOTE(review): the expansion is an unbraced if/else that already ends in ';' -- take care when nesting it in another if/else.
#define logic_assert(C, M) \
if constexpr(noexcept(_constx(C))) static_assert((C), M); else assert((C)&& M);
#else
// Without `if constexpr` the check is always a run-time assert.
#define logic_assert(ConditioN, MessagE) assert(ConditioN && MessagE);
#endif
// Builds an FFTW "many" plan that transforms each element of [first, last) into the
// corresponding element starting at d_first. `sign` and `flags` are passed through to FFTW.
// Only participates in overload resolution when base(It2{}) is a pointer or converts to std::complex<double>*.
template<typename It1, class It2, std::enable_if_t<std::is_pointer<decltype(base(It2{}))>{} or std::is_convertible<decltype(base(It2{})), std::complex<double>*>{}, int> = 0
>
auto fftw_plan_many_dft(It1 first, It1 last, It2 d_first, int sign, unsigned flags = FFTW_ESTIMATE)
->decltype(reinterpret_cast<fftw_complex*>(/*static_cast<std::complex<double>*>*/(base(d_first))), fftw_plan{}){
// The element type must consist of exactly a real and an imaginary part and have the size of fftw_complex.
static_assert( sizeof(*base( first)) == sizeof(real(*base( first))) + sizeof(imag(*base( first))) and sizeof(*base( first)) == sizeof(fftw_complex),
"input must have complex pod layout" );
static_assert( sizeof(*base(d_first)) == sizeof(real(*base(d_first))) + sizeof(imag(*base(d_first))) and sizeof(*base(d_first)) == sizeof(fftw_complex),
"output must have complex pod layout");
assert(sizes(*first)==sizes(*d_first));
auto ion = to_array<int>(sizes(*first));
// NOTE(review): this dereferences `last`; presumably that only forms a view for these iterators -- confirm.
assert(strides(*first) == strides(*last));
auto istrides = to_array<int>(strides(*first));
auto ostrides = to_array<int>(strides(*d_first));
// Sort the (input stride, output stride, size) triplets in descending order so that
// the largest strides come first and the last entry carries the smallest ones.
std::array<std::array<int, 3>, std::decay_t<decltype(*It1{})>::rank::value> ssn;
for(std::size_t i = 0; i != ssn.size(); ++i) ssn[i] = {istrides[i], ostrides[i], ion[i]};
std::sort(ssn.begin(), ssn.end(), std::greater<>{});
for(std::size_t i = 0; i != ssn.size(); ++i){
istrides[i] = std::get<0>(ssn[i]);
ostrides[i] = std::get<1>(ssn[i]);
ion[i] = std::get<2>(ssn[i]);
}
// The element stride handed to FFTW is the last (smallest) stride after sorting.
int istride = istrides.back();
auto inembed = istrides; inembed.fill(0);
int ostride = ostrides.back();
auto onembed = ostrides; onembed.fill(0);
// Entries 1.. of the nembed arrays are the ratios of consecutive strides; entry 0 stays 0.
for(std::size_t i = 1; i != onembed.size(); ++i){
assert(ostrides[i-1] >= ostrides[i]); // otherwise ordering is incompatible
assert(ostrides[i-1]%ostrides[i]==0);
onembed[i]=ostrides[i-1]/ostrides[i]; // assert( onembed[i] <= ion[i] );
assert(istrides[i-1]%istrides[i]==0);
inembed[i]=istrides[i-1]/istrides[i]; // assert( inembed[i] <= ion[i] );
}
// The distance between consecutive transforms is the stride of the iterators themselves.
auto ret = ::fftw_plan_many_dft(
/*int rank*/ ion.size(),
/*const int* n*/ ion.data(),
/*int howmany*/ last - first,
/*fftw_complex * in */ reinterpret_cast<fftw_complex*>(const_cast<std::complex<double>*>(static_cast<std::complex<double> const*>(base(first)))),
/*const int *inembed*/ inembed.data(),
/*int*/ istride,
/*int idist*/ stride(first),
/*fftw_complex * out */ reinterpret_cast<fftw_complex*>(static_cast<std::complex<double>*>(base(d_first))),
/*const int *onembed*/ onembed.data(),
/*int*/ ostride,
/*int odist*/ stride(d_first),
/*int*/ sign, /*unsigned*/ flags
);
assert(ret);
return ret;
}
// Creates an FFTW plan (guru64 interface) that transforms only the dimensions i for which
// which[i] is true; the remaining dimensions are passed to FFTW as "howmany" (loop) dimensions.
//   which  one flag per dimension of `in`/`out`
//   in     input array (its base pointer is handed to FFTW with const removed)
//   out    output array, same sizes as `in`
//   sign   must be -1 or +1 (asserted)
//   flags  FFTW planner flags
// Returns the raw fftw_plan; a null plan trips an assert.
template<
	class In, class Out, dimensionality_type D = std::decay_t<In>::dimensionality,
	class=std::enable_if_t<D==std::decay_t<Out>::dimensionality>,
	class=decltype(reinterpret_cast<fftw_complex*>(/*static_cast<std::complex<double> *>*/(base(std::declval<Out&>()))))
>
fftw_plan fftw_plan_dft(std::array<bool, +D> which, In&& in, Out&& out, int sign, unsigned flags = FFTW_ESTIMATE){
	static_assert( sizeof(*base(in )) == sizeof((*base(in )).real()) + sizeof((*base(in )).imag()) and sizeof(*base(in )) == sizeof(fftw_complex),
		"input must have complex pod layout" );
	// the imaginary part checked here is the one of the *output* element (it used to read the input's)
	static_assert( sizeof(*base(out)) == sizeof((*base(out)).real()) + sizeof((*base(out)).imag()) and sizeof(*base(out)) == sizeof(fftw_complex),
		"output must have complex pod layout" );

	using multi::sizes;
	assert(sizes(in) == sizes(out));

	using multi::strides;
	auto ion      = to_array<ptrdiff_t>(in.sizes());
	auto istrides = to_array<ptrdiff_t>(in.strides());
	auto ostrides = to_array<ptrdiff_t>(out.strides());

	// Split the dimensions into transformed ones (dims) and looped ones (howmany);
	// l_dims / l_howmany point one past the last filled entry of each array.
	std::array<fftw_iodim64, D> dims   ;
	auto l_dims = dims.begin();
	std::array<fftw_iodim64, D> howmany;
	auto l_howmany = howmany.begin();
	for(int i=0; i!=D; ++i) *(which[i]?l_dims:l_howmany)++ = {ion[i], istrides[i], ostrides[i]};
	assert( D == l_dims - dims.begin() + l_howmany - howmany.begin() );

	assert(in.base()); assert(out.base()); assert( in.extensions() == out.extensions() );
	assert( (sign == -1) or (sign == +1) );
	fftw_plan ret = fftw_plan_guru64_dft(
		/*int rank*/ l_dims - dims.begin(),
		/*const fftw_iodim64 *dims*/ dims.data(),
		/*int howmany_rank*/ l_howmany - howmany.begin(),
		/*const fftw_iodim *howmany_dims*/ howmany.data(), //nullptr, //howmany_dims.data(), //;//nullptr,
		/*fftw_complex *in*/ const_cast<fftw_complex*>(reinterpret_cast<fftw_complex const*>(/*static_cast<std::complex<double> const *>*/(in.base()))),
		/*fftw_complex *out*/ reinterpret_cast<fftw_complex*>(/*static_cast<std::complex<double> *>*/(out.base())),
		sign, flags// | FFTW_ESTIMATE
	);
	assert(ret &&"fftw lib returned a null plan, if you are using MKL check the limitations of their fftw interface");
	//https://software.intel.com/content/www/us/en/develop/documentation/mkl-developer-reference-c/top/appendix-d-fftw-interface-to-intel-math-kernel-library/fftw3-interface-to-intel-math-kernel-library/using-fftw3-wrappers.html
	return ret;
}
// Cast that is only available when From is implicitly convertible to To.
template<
	class To, class From,
	typename std::enable_if<std::is_convertible<From, To>::value, int>::type = 0
>
To implicit_cast(From&& f){
	return static_cast<To>(f);
}
// Creates an FFTW guru64 plan over all D dimensions of `in` and `out` (no "howmany" dimensions).
// `s` is forwarded to FFTW as the sign; `flags` are the FFTW planner flags.
// Only participates in overload resolution when base(out) converts implicitly to std::complex<double>*.
template<class In, class Out, dimensionality_type D = In::dimensionality, typename = decltype(reinterpret_cast<fftw_complex*>(implicit_cast<std::complex<double>*>(base(std::declval<Out&>()))))>
auto fftw_plan_dft(In const& in, Out&& out, int s, unsigned flags = FFTW_ESTIMATE){
static_assert( D == std::decay_t<Out>::dimensionality , "!");
using multi::sizes; using multi::strides; assert(sizes(in) == sizes(out));
auto
ion = to_array<ptrdiff_t>(sizes(in)),
istrides = to_array<ptrdiff_t>(strides(in)),
ostrides = to_array<ptrdiff_t>(strides(out))
;
// One fftw_iodim64 {size, input stride, output stride} per dimension.
std::array<fftw_iodim64, D> dims;
for(int i=0; i!=D; ++i) dims[i] = {ion[i], istrides[i], ostrides[i]};
// NOTE(review): when s == 0 the plan is requested with rank 0 and howmany_rank 0 -- confirm this is the intended "no transform" behavior.
auto ret = fftw_plan_guru64_dft(
/*int rank*/ s?D:0,
/*const fftw_iodim64 *dims*/ dims.data(),
/*int howmany_rank*/ 0,
/*const fftw_iodim *howmany_dims*/ nullptr, //howmany_dims.data(), //;//nullptr,
/*fftw_complex *in*/ const_cast<fftw_complex*>(reinterpret_cast<fftw_complex const*>(static_cast<std::complex<double> const*>(base(in)))),
/*fftw_complex *out*/ reinterpret_cast<fftw_complex*>(implicit_cast<std::complex<double>*>(base(out))),
s, flags
);
assert(ret);
return ret;
}
namespace fftw{
// Initializes FFTW's threading support when HAVE_FFTW3_THREADS is set; otherwise a no-op.
// Declared inline because this header defines the function body: a non-inline definition
// would be repeated in every translation unit that includes the header.
#if HAVE_FFTW3_THREADS
inline void initialize_threads(){int good = fftw_init_threads(); assert(good); (void)good;}
#else
inline void initialize_threads(){}
#endif
// Calls fftw_cleanup(). Declared inline because the definition lives in a header and would
// otherwise be repeated in every translation unit that includes it.
inline void cleanup(){fftw_cleanup();}
// Scope guard: destroying an `environment` object calls cleanup().
struct environment{
	~environment(){
		cleanup();
	}
};
// Move-only owner of an fftw_plan; the plan is released through fftw_destroy_plan.
// A plan is built either from arguments accepted by fftw_plan_dft (constructor) or from
// arguments accepted by fftw_plan_many_dft (plan::many), and is run with operator().
class plan{
	plan() : impl_{nullptr, &fftw_destroy_plan}{} // empty plan, filled in by many()
	std::unique_ptr<std::remove_pointer_t<fftw_plan>, decltype(&fftw_destroy_plan)> impl_;
public:
	plan(plan const&) = delete;//default;
	plan(plan&&) = default;
	// Enabled only for argument lists for which fftw_plan_dft(as...) is well-formed.
	template<typename... As,
		typename = decltype(fftw_plan_dft(std::declval<As&&>()...))
	> plan(As&&... as) : impl_{fftw_plan_dft(std::forward<As>(as)...), &fftw_destroy_plan}{
		assert(impl_);
	}
	// Enabled only for argument lists for which fftw_plan_many_dft(as...) is well-formed.
	template<typename... As>
	static auto many(As&&... as)
	->std::decay_t<decltype(fftw_plan_many_dft(std::forward<As>(as)...) , std::declval<plan>())>
	{
		plan r; r.impl_.reset(fftw_plan_many_dft(std::forward<As>(as)...)); return r; // this produces a compilation error in icc++17
	}
private:
	void execute() const{fftw_execute(impl_.get());}
	// Runs the plan on other arrays, passing their base pointers to fftw_execute_dft.
	template<class I, class O>
	void execute_dft(I&& i, O&& o) const{
		::fftw_execute_dft(impl_.get(), const_cast<fftw_complex*>(reinterpret_cast<fftw_complex const*>(static_cast<std::complex<double> const*>(base(i)))), reinterpret_cast<fftw_complex*>(static_cast<std::complex<double>*>(base(o))));
	}
	template<class I, class O> void execute(I&& i, O&& o) const{execute_dft(std::forward<I>(i), std::forward<O>(o));}
	friend void execute(plan const& p){p.execute();}
public:
	plan& operator=(plan&&) = default;
	plan& operator=(plan const&) = delete;//default;
	void operator()() const{execute();} // http://www.fftw.org/fftw3_doc/Thread-safety.html#Thread-safety
	template<class I, class O> void operator()(I&& i, O&& o) const{return execute(std::forward<I>(i), std::forward<O>(o));}
	double cost() const{return fftw_cost(impl_.get());}
	// Returns the add/mul/fma counts reported by fftw_flops; the result converts to double as add + mul + 2*fma.
	auto flops() const{
		struct{double add; double mul; double fma; operator double() const{return add + mul + 2*fma;}} r;
		fftw_flops(impl_.get(), &r.add, &r.mul, &r.fma);
		return r;
	}
	//std::string string_print() const{
	//	return std::unique_ptr<char>{fftw_sprint_plan(impl_.get())}.get();
	//}
	//friend std::ostream& operator<<(std::ostream& os, plan const& p){return os<<p.string_print()<<'\n';}
#if HAVE_FFTW3_THREADS
public:
	static void make_thread_safe(){
		fftw_make_planner_thread_safe(); // needs linking to -lfftw3_threads, requires FFTW-3.3.6 or greater
		is_thread_safe_ = true;
	}
	// Sets the number of threads used by subsequently created plans and records it.
	static int with_nthreads(int n){fftw_plan_with_nthreads(n); nthreads_ = n; return n;}
	// Uses std::thread::hardware_concurrency(), falling back to 2 when it reports 0.
	static int with_nthreads(){
		int n=std::thread::hardware_concurrency(); return with_nthreads(n?n:2);
	}
	static bool is_thread_safe(){return is_thread_safe_;}
	// Returns the recorded thread count (the return type was `bool`, which collapsed any count to 0/1).
	static int nthreads(){return nthreads_;}
private:
	static bool is_thread_safe_;
	static int nthreads_;
	static bool initialized_threads_;
#else
	static constexpr bool is_thread_safe(){return false;}
	// A build without thread support always reports one thread (as an int, not a bool).
	static constexpr int nthreads(){return 1;}
	static constexpr int with_nthreads(){return 1;}
#endif
};
#if HAVE_FFTW3_THREADS
// Definitions of plan's static members; their initializers run make_thread_safe(),
// initialize_threads() and with_nthreads() during static initialization.
// NOTE(review): these are non-inline definitions in a header, so including it from more than
// one translation unit would define them more than once -- confirm single-TU use.
bool plan::is_thread_safe_ = (plan::make_thread_safe(), true);
int plan::nthreads_ = (initialize_threads(), with_nthreads());
#endif
// Transform direction: the FFTW_FORWARD / FFTW_BACKWARD values, plus `none` (0).
using sign = int;
constexpr sign forward = FFTW_FORWARD;
constexpr sign none = 0;
constexpr sign backward = FFTW_BACKWARD;
static_assert( forward != none and none != backward and backward != forward, "!");
// Planner strategies mapped onto the corresponding FFTW flags.
enum strategy: decltype(FFTW_ESTIMATE){ estimate = FFTW_ESTIMATE, measure = FFTW_MEASURE };
// Plans and immediately executes a transform from `i` into `o` with sign `s`,
// then hands `o` back with its original value category.
template<class In, class Out>
auto dft(In const& i, Out&& o, int s)
->decltype(fftw::plan{i, o, s}(), std::forward<Out>(o)){
	fftw::plan{i, o, s}();
	return std::forward<Out>(o);
}
using std::decay_t;
// Same as the three-argument dft, but only the dimensions flagged in `which` are transformed.
template<class In, class Out, std::size_t D=In::dimensionality>
auto dft(std::array<bool, +D> which, In const& i, Out&& o, sign s)
->decltype(plan{which, i, o, s}(), std::forward<Out>(o)){
	plan{which, i, o, s}();
	return std::forward<Out>(o);
}
template<typename In, class Out, dimensionality_type D=In::dimensionality, dimensionality_type=std::decay_t<Out>::dimensionality>
auto dft(std::array<sign, +D> w, In const& i, Out&& o){
std::array<bool, D> fwd, /*non,*/ bwd;
std::transform(begin(w), end(w), begin(fwd), [](auto e){return e==FFTW_FORWARD;});
dft(fwd, i, o, fftw::forward);
std::transform(begin(w), end(w), begin(bwd), [](auto e){return e==FFTW_BACKWARD;});
if(std::accumulate(begin(bwd), end(bwd), false)) dft(bwd, o, o, FFTW_BACKWARD);
return std::forward<Out>(o);
}
// Transforms every element of [first, last) into the range starting at d_first using a single
// "many" plan, and returns the iterator one past the last written output element.
template<typename It1, typename It2>
auto many_dft(It1 first, It1 last, It2 d_first, int sign)
->decltype(plan::many(first, last, d_first, sign)(), d_first + (last - first)){
	plan::many(first, last, d_first, sign)();
	return d_first + (last - first);
}
// Out-of-place transform returning a new array: a temporary R with the extensions and
// allocator of `i` is created, filled by dft(i, out, s) and returned by value.
template<typename In, class R=typename In::decay_type>
NODISCARD("when first argument is const")
auto dft(In const& i, sign s)
->std::decay_t<decltype(dft(i, R(extensions(i), get_allocator(i)), s))>{
return dft(i, R(extensions(i), get_allocator(i)), s);}
// Reshapes `i` to the extensions of its rotated view and refills it through fftw::dft with
// sign fftw::none, reading from `before`, a reference to the same data in its old layout.
// The unnamed int parameter is not used.
// NOTE(review): source and destination refer to the same memory -- confirm this is supported.
template<typename T, dimensionality_type D, class... Args>
decltype(auto) rotate(multi::array<T, D, Args...>& i, int = 1){
multi::array_ref<T, D, typename multi::array<T, D, Args...>::element_ptr> before(data_elements(i), extensions(i));
i.reshape(extensions(rotated(before) ));
fftw::dft(before, i, fftw::none);
return i;
}
// Out-of-place transform of the dimensions flagged in `which`, returning a new array
// created with the extensions and allocator of `i`.
template<typename In, dimensionality_type D = In::dimensionality, class R=typename In::decay_type>
NODISCARD("when first argument is const")
auto dft(std::array<bool, +D> which, In const& i, sign s)
->std::decay_t<decltype(fftw::dft(which, i, R(extensions(i), get_allocator(i)), s))>{
return fftw::dft(which, i, R(extensions(i), get_allocator(i)), s);}
// In-place variant: `i` is used as both input and output, then returned with its original value category.
template<typename In, multi::dimensionality_type D = std::decay_t<In>::dimensionality>
auto dft(std::array<bool, +D> which, In&& i, sign s)
->decltype(dft(which, i, i, s), std::forward<In>(i)){
return dft(which, i, i, s), std::forward<In>(i);}
// Deleted on purpose: calling dft with a selector and an array but no direction is a compile error.
template<typename In, std::size_t D = In::dimensionality, class R=typename In::decay_type>
void dft(std::array<bool, +D> which, In const& i) = delete;
// Variant taking the rank as an explicit template argument; Rank must not exceed In::dimensionality.
// NOTE(review): the body calls dft<Rank>(i, out, s); no three-argument overload with a leading
// Rank template parameter is visible in this part of the file -- confirm it exists elsewhere.
template<dimensionality_type Rank /*not deduced*/, typename In, class R=typename In::decay_type>
NODISCARD("when second argument is const")
R dft(In const& i, sign s){
static_assert( Rank <= In::dimensionality, "!" );
return dft<Rank>(i, R(extensions(i), get_allocator(i)), s);
}
// dft_forward overload set: every overload forwards to fftw::dft with fftw::forward appended.
// Generic form: forwards any argument list.
template<typename... A> auto dft_forward(A&&... a)
->decltype(fftw::dft(std::forward<A>(a)..., fftw::forward)){
return fftw::dft(std::forward<A>(a)..., fftw::forward);}
// Selector of any type plus a read-only input; the result is what fftw::dft(which, a, forward) returns.
template<typename BoolArray, typename A>
NODISCARD("when input argument is read only")
auto dft_forward(BoolArray which, A const& a)
->decltype(fftw::dft(which, a, fftw::forward)){
return fftw::dft(which, a, fftw::forward);}
// Same, with the selector spelled as std::array<bool, D>, D taken from the input type.
template<class A, multi::dimensionality_type D = A::dimensionality>
NODISCARD("when input argument is read only")
auto dft_forward(std::array<bool, +D> which, A const& a)
->decltype(fftw::dft(which, a, fftw::forward)){
return fftw::dft(which, a, fftw::forward);}
// Selector, input and explicit output.
template<class A, class O, multi::dimensionality_type D = A::dimensionality>
auto dft_forward(std::array<bool, +D> which, A const& a, O&& o)
->decltype(fftw::dft(which, a, std::forward<O>(o), fftw::forward)){
return fftw::dft(which, a, std::forward<O>(o), fftw::forward);}
// Read-only input only.
template<typename A>
NODISCARD("when input argument is read only")
auto dft_forward(A const& a)
->decltype(fftw::dft(a, fftw::forward)){
return fftw::dft(a, fftw::forward);}
// Forwards any argument list to dft with fftw::backward appended.
// Note that `dft` is called unqualified here, whereas dft_forward calls fftw::dft.
template<typename... A> auto dft_backward(A&&... a)
->decltype(dft(std::forward<A>(a)..., fftw::backward)){
return dft(std::forward<A>(a)..., fftw::backward);}
// Transforms `i` in place: a plan with `i` as both source and destination is built and
// executed at once, then `i` is returned with its original value category.
template<class In>
In&& dft_inplace(In&& i, sign s){
	fftw::plan{i, i, static_cast<int>(s)}();//(i, i);
	return std::forward<In>(i);
}
// Copies `i` into `o` by calling dft with an all-false selector (no dimension is transformed).
template<class In, class Out, dimensionality_type D = In::dimensionality>
auto copy(In const& i, Out&& o)
->decltype(dft(std::array<bool, D>{}, i, std::forward<Out>(o), fftw::forward)){
return dft(std::array<bool, D>{}, i, std::forward<Out>(o), fftw::forward);}
// Returns a new array R, created with the extensions and allocator of `i`, filled by the two-argument copy.
template<typename In, class R=typename In::decay_type>
NODISCARD("when argument is const")
R copy(In const& i)
{//->decltype(copy(i, R(extensions(i), get_allocator(i))), R()){
return copy(i, R(extensions(i), get_allocator(i)));}
// Produces an array R from `in`.
// When `in` is compact, its elements are first copied onto a reference `ref` to the same base
// pointer with the same extensions, then R is constructed from a move-iterator view of that
// memory while in.mbase() is set to 0. Otherwise it falls back to copy(in).
template<typename In, class R=typename std::decay_t<In>::decay_type>
auto move(In&& in){
	if(in.is_compact()){
		multi::array_ref<typename In::element, In::dimensionality, typename In::element_ptr> ref(
			in.base(), extensions(in)
		);
		copy(in, ref);
		// The second template argument is the rank value In::dimensionality
		// (it used to name the type In::dimensionality_type, which cannot be a rank).
		// NOTE(review): in.mbase() is read in one constructor argument and zeroed in the other;
		// the order in which function arguments are evaluated is not fixed -- confirm this is safe.
		return R(
			multi::array_ref<typename In::element, In::dimensionality, std::move_iterator<typename In::element_ptr>>(std::make_move_iterator(in.mbase()), ((in.mbase()=0), extensions(ref)))
		);
	}else return copy(std::forward<In>(in));
}
// copy overload for rvalue arrays whose element pointer is a multi::move_ptr.
// Compact case: copies through plain T* views onto a reference to the same underlying memory
// (a.base().base()), then casts the result back to move_ptr elements before converting to R.
// Non-compact case: copies from a view cast to the pointer type P.
template<typename T, dimensionality_type D, class P, class R=typename multi::array<T, D>>
R copy(multi::basic_array<T, D, multi::move_ptr<T, P>>&& a){
if(a.is_compact()){
return
fftw::copy(
a.template static_array_cast<T, T*>(),
multi::array_ref<T, D, T*>(a.base().base(), a.extensions())
).template static_array_cast<T, multi::move_ptr<T>>()
;
}else return fftw::copy(a.template static_array_cast<T, P>());
}
// Transposes `a`: `r` references the data of `a` with its current extensions, `a` is reshaped
// to the transposed extensions, and the transposed view of `r` is copied into it via fftw::copy.
// NOTE(review): source and destination refer to the same memory -- confirm this is supported.
template<class Array>
auto transpose(Array& a)
->decltype(fftw::copy(transposed(a), a.reshape(extensions(layout(a).transpose())))){
multi::array_ref<typename Array::element, Array::dimensionality, typename Array::element_ptr> r(a.base(), extensions(a));
return fftw::copy(r.transposed(), a.reshape(layout(a).transpose().extensions()));
}
#if 0
// TODO investigate why this doesn't work as expected
template<class Array>
auto rotate(Array& a)
->decltype(fftw::copy(rotated(a), a.reshape(extensions(layout(a).transpose())))){
multi::array_ref<typename Array::element, Array::dimensionality, typename Array::element_ptr> r(a.base(), extensions(a));
auto&& ro = r.rotated();
return fftw::copy(ro, a.reshape(layout(a).rotate().extensions()));
}
#endif
}}}
////////////////////////////////////////////////////////////////////////////////
#if not __INCLUDE_LEVEL__
#define BOOST_TEST_MODULE "C++ Unit Tests for Multi FFTW adaptor"
#define BOOST_TEST_DYN_LINK
#include<boost/test/unit_test.hpp>
#include "../array.hpp"
#include "../adaptors/../complex.hpp"
#include<chrono>
#include<random>
#include<thrust/complex.h>
// File-local helpers for the unit tests below.
namespace{
namespace multi = boost::multi;
namespace fftw = multi::fftw;
// I is the imaginary unit used to spell the test data.
using complex = std::complex<double>; MAYBE_UNUSED complex const I{0, 1};
// power(x): std::norm(x) when that expression is valid for x ...
template<class M> auto power(M const& m)->decltype(std::norm(m)){return std::norm(m);}
// ... otherwise the sum of power() over the elements of m (DELETE is a macro defined outside this chunk).
template<class M, DELETE((M::dimensionality < 1))> double power(M const& m){return accumulate(begin(m), end(m), 0., [](auto const& a, auto const& b){return a + power(b);});}
// Function-object form of the accumulation step: a + power(b).
struct sum_power{
template<class A, class B> auto operator()(A const& a, B const& b) const{return a+power(b);}
};
MAYBE_UNUSED constexpr int N = 16;
}
// Scoped stopwatch: records the clock at construction and, on destruction, writes
// "<label>: <elapsed seconds> sec" to std::cerr.
struct watch : private std::chrono::high_resolution_clock{
	std::string label_;
	time_point  start_;

	watch(std::string label ="") : label_{label}, start_{now()}{}

	~watch(){
		std::chrono::duration<double> const elapsed = now() - start_;
		std::cerr<< label_<<": "<< elapsed.count() <<" sec"<<std::endl;
	}
};
// Fills a scalar, or recursively every element of a range, with normally distributed random values.
// The generator is a function-local static seeded once from std::random_device.
template<class T> struct randomizer{
	// Range form: apply the randomizer to each element (which may itself be a range).
	template<class M> void operator()(M&& m) const{
		for(auto&& elem : m){
			(*this)(elem);
		}
	}
	// Scalar form: one draw from the normal distribution.
	void operator()(T& e) const{
		static std::random_device seed_source;
		static std::mt19937 engine{seed_source()};
		static std::normal_distribution<T> gaussian;
		e = gaussian(engine);
	}
};

// Specialization for complex values: real and imaginary parts are drawn independently.
template<class T> struct randomizer<std::complex<T>>{
	template<class M> void operator()(M&& m) const{
		for(auto&& elem : m){
			(*this)(elem);
		}
	}
	void operator()(std::complex<T>& e) const{
		static std::random_device seed_source;
		static std::mt19937 engine{seed_source()};
		static std::normal_distribution<T> gaussian;
		e = std::complex<T>(gaussian(engine), gaussian(engine));
	}
};
// Global Boost.Test fixture. It derives from fftw::environment, so that base class's
// destructor runs when the fixture object is destroyed; setup/teardown themselves do nothing.
struct fftw_fixture : fftw::environment{
void setup(){}
void teardown(){}//fftw_cleanup();}
};
BOOST_TEST_GLOBAL_FIXTURE( fftw_fixture );
// Out-of-place forward transform of a 10x10x10 array; checks that the input element
// set before the call is unchanged afterwards.
BOOST_AUTO_TEST_CASE(fftw_3D){
	using complex = std::complex<double>; //TODO make it work with thrust

	multi::array<complex, 3> in({10, 10, 10});
	in[2][3][4] = 99.;

	auto spectrum = multi::fftw::dft(in, fftw::forward);

	BOOST_REQUIRE(in[2][3][4] == 99.);
}
// 1D transforms of a const input: the result has the input's size, the input is left
// untouched, and element 2 has the expected value for both directions.
BOOST_AUTO_TEST_CASE(fftw_1D_const){
	multi::array<complex, 1> const in = {1. + 2.*I, 2. + 3. *I, 4. + 5.*I, 5. + 6.*I};

	auto fwd = multi::fftw::dft(in, fftw::forward); // Fourier[in, FourierParameters -> {1, -1}]
	BOOST_REQUIRE( size(fwd) == size(in) );
	BOOST_REQUIRE( fwd[2] == -2. - 2.*I  );
	BOOST_REQUIRE( in[1]  == +2. + 3.*I  );

	// This used to call fftw::forward again, so the backward direction was never exercised.
	// For 4 points, element 2 is x0 - x1 + x2 - x3 in either direction, so the expected value is unchanged.
	auto bwd = multi::fftw::dft(in, fftw::backward); // InverseFourier[in, FourierParameters -> {-1, -1}]
	BOOST_REQUIRE( bwd[2] == -2. - 2.*I  );
}
// With an all-false selector no dimension is transformed, so `out` must equal `in`.
// NOTE(review): both checks use BOOST_REQUIRE on a bool, so they are exact comparisons; presumably the tolerance decorator has no effect here -- confirm.
BOOST_AUTO_TEST_CASE(fftw_2D_identity_2, *boost::unit_test::tolerance(0.0001)){
multi::array<complex, 2> const in = {
{ 1. + 2.*I, 9. - 1.*I, 2. + 4.*I},
{ 3. + 3.*I, 7. - 4.*I, 1. + 9.*I},
{ 4. + 1.*I, 5. + 3.*I, 2. + 4.*I},
{ 3. - 1.*I, 8. + 7.*I, 2. + 1.*I},
{ 31. - 1.*I, 18. + 7.*I, 2. + 10.*I}
};
multi::array<complex, 2> out(extensions(in));
multi::fftw::dft({false, false}, in, out, fftw::forward); // out = in;
BOOST_REQUIRE( power(in) == power(out) );
BOOST_REQUIRE( out == in );
}
// Same identity check through the array-returning overload: an empty braced selector
// (all flags false) must give back a copy of the input.
BOOST_AUTO_TEST_CASE(fftw_2D_identity, *boost::unit_test::tolerance(0.0001)){
multi::array<complex, 2> const in = {
{ 1. + 2.*I, 9. - 1.*I, 2. + 4.*I},
{ 3. + 3.*I, 7. - 4.*I, 1. + 9.*I},
{ 4. + 1.*I, 5. + 3.*I, 2. + 4.*I},
{ 3. - 1.*I, 8. + 7.*I, 2. + 1.*I},
{ 31. - 1.*I, 18. + 7.*I, 2. + 10.*I}
};
auto fwd = multi::fftw::dft({}, in, fftw::forward);
BOOST_REQUIRE( fwd == in );
}
// Full 2D forward transform checked against reference values for one element, followed by a
// check that transforming row 0 of `in` equals transforming a 1D array with the same values.
BOOST_AUTO_TEST_CASE(fftw_2D, *boost::unit_test::tolerance(0.0001)){
multi::array<complex, 2> const in = {
{ 1. + 2.*I, 9. - 1.*I, 2. + 4.*I},
{ 3. + 3.*I, 7. - 4.*I, 1. + 9.*I},
{ 4. + 1.*I, 5. + 3.*I, 2. + 4.*I},
{ 3. - 1.*I, 8. + 7.*I, 2. + 1.*I},
{ 31. - 1.*I, 18. + 7.*I, 2. + 10.*I}
};
namespace fftw = multi::fftw;
auto fwd = fftw::dft_forward(in);
BOOST_TEST_REQUIRE( fwd[3][1].real() == -19.0455 ); // Fourier[in, FourierParameters -> {1, -1}][[4]][[2]]
BOOST_TEST_REQUIRE( fwd[3][1].imag() == - 2.22717 );
multi::array<complex, 1> const in0 = {1. + 2.*I, 9. - 1.*I, 2. + 4.*I};
// `b` and `a` are computed but not used by any check below.
auto b = multi::fftw::dft_forward(in0);
auto a = multi::fftw::dft_forward(in[0]);
BOOST_REQUIRE( fftw::dft_forward(in[0]) == fftw::dft_forward(in0) );
}
// Transforms of rotated views: the first element of rotated(in) must transform like the
// equivalent 1D array, and transforming rotated(in) must equal the rotated full transform.
BOOST_AUTO_TEST_CASE(fftw_2D_rotated, *boost::unit_test::tolerance(0.0001)){
using multi::array;
array<complex, 2> const in = {
{ 1. + 2.*I, 9. - 1.*I, 2. + 4.*I},
{ 3. + 3.*I, 7. - 4.*I, 1. + 9.*I},
{ 4. + 1.*I, 5. + 3.*I, 2. + 4.*I},
{ 3. - 1.*I, 8. + 7.*I, 2. + 1.*I},
{ 31. - 1.*I, 18. + 7.*I, 2. + 10.*I}
};
using multi::fftw::dft_forward;
auto fwd = dft_forward(in);
BOOST_REQUIRE(
dft_forward(rotated(in)[0])
== dft_forward(array<complex, 1>{1.+2.*I, 3.+3.*I, 4. + 1.*I, 3. - 1.*I, 31. - 1.*I})
);
BOOST_REQUIRE( dft_forward(rotated(in)) == rotated(fwd) );
}
// Exercises the partial-transform entry points on a 2D array: the per-dimension sign array,
// the bool selector on rotated views, the all-false selector (plain copy) and many_dft over rows.
BOOST_AUTO_TEST_CASE(fftw_2D_many, *boost::unit_test::tolerance(0.0001)){
multi::array<complex, 2> const in = {
{ 1. + 2.*I, 9. - 1.*I, 2. + 4.*I},
{ 3. + 3.*I, 7. - 4.*I, 1. + 9.*I},
{ 4. + 1.*I, 5. + 3.*I, 2. + 4.*I},
{ 3. - 1.*I, 8. + 7.*I, 2. + 1.*I},
{ 31. - 1.*I, 18. + 7.*I, 2. + 10.*I}
};
multi::array<complex, 2> out(extensions(in));
using multi::fftw::dft_forward;
// Only the second dimension is transformed, so row 0 of `out` is the 1D transform of row 0 of `in`.
multi::fftw::dft({fftw::none, fftw::forward}, in, out);
BOOST_REQUIRE( dft_forward(in[0]) == out[0] );
multi::fftw::dft({false, true}, rotated(in), rotated(out), fftw::forward);
BOOST_REQUIRE( dft_forward(rotated(in)[0]) == rotated(out)[0] );
// All-false selector: nothing is transformed, so `out` ends up equal to `in`.
multi::fftw::dft_forward({false, false}, rotated(in), rotated(out));
BOOST_REQUIRE( in == out );
multi::fftw::many_dft(begin(in), end(in), begin(out), fftw::forward);
BOOST_REQUIRE( dft_forward(in[0]) == out[0] );
}
// A batched transform over the last dimension of a random 3x10 array must match
// transforming each of its rows individually.
BOOST_AUTO_TEST_CASE(fftw_many1_from_2){
	multi::array<complex, 2> in({3, 10});
	randomizer<complex>{}(in);

	multi::array<complex, 2> batched({3, 10});
	fftw::dft({false, true}, in, batched, fftw::forward);

	multi::array<complex, 2> row_by_row({3, 10});
	for(int row = 0; row != size(in); ++row){
		fftw::dft(in[row], row_by_row[row], fftw::forward);
	}

	BOOST_REQUIRE(row_by_row == batched);
}
// A batched transform over the last two dimensions of a random 3x5x6 array must match
// transforming each 5x6 sub-array individually.
BOOST_AUTO_TEST_CASE(fftw_many2_from_3){
	multi::array<complex, 3> in({3, 5, 6});
	randomizer<complex>{}(in);

	multi::array<complex, 3> batched({3, 5, 6});
	fftw::dft({false, true, true}, in, batched, FFTW_FORWARD);

	multi::array<complex, 3> slab_by_slab({3, 5, 6});
	for(int slab = 0; slab != size(in); ++slab){
		fftw::dft(in[slab], slab_by_slab[slab], FFTW_FORWARD);
	}

	BOOST_REQUIRE(slab_by_slab == batched);
}
// Selecting every dimension explicitly must give the same result as the overload
// that takes no dimension selection at all.
BOOST_AUTO_TEST_CASE(fftw_many2_from_2){
	multi::array<complex, 2> in({5, 6});
	randomizer<complex>{}(in);

	multi::array<complex, 2> with_flags({5, 6});
	fftw::dft({true, true}, in, with_flags, FFTW_FORWARD);

	multi::array<complex, 2> without_flags({5, 6});
	fftw::dft(in, without_flags, FFTW_FORWARD);

	BOOST_REQUIRE(without_flags == with_flags);
}
// Runs an out-of-place forward DFT over all four dimensions of a 10x10x10x10 array and
// checks that the marked input element is left unchanged. The result `fwd` is not inspected.
BOOST_AUTO_TEST_CASE(fftw_4D){
multi::array<complex, 4> const in = []{
// NOTE(review): only one element is assigned here; whether the remaining elements are
// initialized depends on how multi::array constructs `complex` — confirm.
multi::array<complex, 4> in({10, 10, 10, 10}); in[2][3][4][5] = 99.; return in;
}();
auto fwd = multi::fftw::dft({true, true, true, true}, in, fftw::forward);
BOOST_REQUIRE(in[2][3][4][5] == 99.);
}
// Transforms the first three dimensions of a zero-filled 97x95x101x10 array (one element set to 99)
// and checks that `many_dft` over the unrotated views produces the same result.
BOOST_AUTO_TEST_CASE(fftw_4D_many){
auto const in = []{
multi::array<complex, 4> in({97, 95, 101, 10}, 0.);
in[2][3][4][5] = 99.; return in;
}();
auto fwd = multi::fftw::dft({true, true, true, false}, in, fftw::forward);
// The out-of-place transform must not modify its input.
BOOST_REQUIRE( in[2][3][4][5] == 99. );
multi::array<complex, 4> out(extensions(in));
// NOTE(review): unrotated(...) presumably brings the last (untransformed) dimension to the front so
// that many_dft iterates over it — confirm against the array API.
multi::fftw::many_dft(begin(unrotated(in)), end(unrotated(in)), begin(unrotated(out)), fftw::forward);
BOOST_REQUIRE( out == fwd );
}
// Compares `many_dft` over `(array<<1)` views with `dft({true, false, true}, ...)` on a random
// 10x10x10 array. Despite the test name, only multi::fftw functions are called here.
BOOST_AUTO_TEST_CASE(cufft_many_2D){
auto const in = []{
multi::array<complex, 3> ret({10, 10, 10});
// Fill with pseudo-random values; both components are scaled by 1/RAND_MAX.
std::generate(ret.data_elements(), ret.data_elements() + ret.num_elements(),
[](){return complex{std::rand()*1./RAND_MAX, std::rand()*1./RAND_MAX};}
);
return ret;
}();
multi::array<complex, 3> out(extensions(in));
// NOTE(review): `<<1` is presumably a one-step rotation of the index order — confirm.
multi::fftw::many_dft((in<<1).begin(), (in<<1).end(), (out<<1).begin(), multi::fftw::forward);
multi::array<complex, 3> out2(extensions(in));
multi::fftw::dft({true, false, true}, in, out2, multi::fftw::forward);
BOOST_REQUIRE( out == out2 );
}
// Runs an out-of-place forward DFT on a 4x5x6x7x8 array and checks that the marked input
// element is left unchanged. The result `fwd` is not inspected.
BOOST_AUTO_TEST_CASE(fftw_5D){
// NOTE(review): only one element is assigned; confirm how the other elements are initialized.
multi::array<complex, 5> in({4, 5, 6, 7, 8});
in[2][3][4][5][6] = 99.;
auto fwd = multi::fftw::dft(in, fftw::forward);
BOOST_REQUIRE(in[2][3][4][5][6] == 99.);
}
// Builds a raw FFTW plan for a 1D array of `N` elements (N is defined outside this block),
// executes it, and compares power(in) with power(out)/num_elements(out).
BOOST_AUTO_TEST_CASE(fftw_1D_power){
multi::array<complex, 1> in(N, 0.); assert( size(in) == N );
std::iota(begin(in), end(in), 1.);
multi::array<complex, 1> out(extensions(in));
static_assert(dimensionality(in)==dimensionality(out), "!");
// The plan is created, executed and destroyed by hand through the C API.
auto p = multi::fftw_plan_dft(in, out, fftw::forward, FFTW_PRESERVE_INPUT);
fftw_execute(p);
fftw_destroy_plan(p);
// NOTE(review): the check is one-sided (no std::abs), so any negative difference passes.
BOOST_REQUIRE( (power(in) - power(out)/num_elements(out)) < 1e-17 );
}
// Same power comparison as the 1D case, on an NxN array (N is defined outside this block),
// using a raw FFTW plan.
BOOST_AUTO_TEST_CASE(fftw_2D_power){
multi::array<complex, 2> in({N, N});
std::iota(data_elements(in), data_elements(in) + num_elements(in), 1.2);
multi::array<complex, 2> out(extensions(in));
auto p = multi::fftw_plan_dft(in, out, fftw::forward, FFTW_PRESERVE_INPUT);
fftw_execute(p); fftw_destroy_plan(p);
// NOTE(review): one-sided check (no std::abs); a negative difference of any size passes.
BOOST_REQUIRE( power(in) - power(out)/num_elements(out) < 1e-12 );
}
// Power comparison on a 16x16 array using the multi::fftw::plan wrapper instead of the raw C plan.
BOOST_AUTO_TEST_CASE(fftw_2D_power_plan){
multi::array<complex, 2> in({16, 16});
std::iota(data_elements(in), data_elements(in) + num_elements(in), 1.2);
multi::array<complex, 2> out(extensions(in));
multi::fftw::plan const p{in, out, fftw::forward, FFTW_PRESERVE_INPUT};
// Calling the plan object executes it.
p(); //execute(p); //p.execute();
// NOTE(review): one-sided check (no std::abs); a negative difference of any size passes.
BOOST_REQUIRE( power(in) - power(out)/num_elements(out) < 1e-8 );
}
// Power comparison on a 16x16 array using the three-argument fftw::dft(in, out, direction).
BOOST_AUTO_TEST_CASE(fftw_2D_power_dft){
	multi::array<complex, 2> in({16, 16});
	std::iota(data_elements(in), data_elements(in) + num_elements(in), 1.2);

	multi::array<complex, 2> out(extensions(in));
	multi::fftw::dft(in, out, fftw::forward);

	auto const power_gap = power(in) - power(out)/num_elements(out);
	BOOST_REQUIRE( power_gap < 1e-8 );
}
// Power comparison on a 16x16 array using the value-returning fftw::dft(in, direction).
BOOST_AUTO_TEST_CASE(fftw_2D_power_dft_out){
	multi::array<complex, 2> in({16, 16});
	std::iota(data_elements(in), data_elements(in) + num_elements(in), 1.2);

	auto out = multi::fftw::dft(in, fftw::forward);

	auto const power_gap = power(in) - power(out)/num_elements(out);
	BOOST_REQUIRE( power_gap < 1e-8 );
}
// Power comparison on a 16x16 array using the value-returning fftw::dft(in, direction).
// NOTE(review): the body performs the same calls as fftw_2D_power_dft_out; the "default"
// in the name presumably referred to an overload with a defaulted argument — confirm.
BOOST_AUTO_TEST_CASE(fftw_2D_power_dft_out_default){
	multi::array<complex, 2> in({16, 16});
	std::iota(data_elements(in), data_elements(in) + num_elements(in), 1.2);

	auto out = multi::fftw::dft(in, fftw::forward);

	auto const power_gap = power(in) - power(out)/num_elements(out);
	BOOST_REQUIRE( power_gap < 1e-8 );
}
// Two-sided power comparison for an out-of-place forward DFT of a 4x4x4 array.
BOOST_AUTO_TEST_CASE(fftw_3D_power){
	multi::array<complex, 3> in({4, 4, 4});
	std::iota(in.data_elements(), in.data_elements() + in.num_elements(), 1.2);

	multi::array<complex, 3> out = fftw::dft(in, fftw::forward);

	auto const power_gap = power(in) - power(out)/num_elements(out);
	BOOST_REQUIRE( std::abs(power_gap) < 1e-10 );
}
// In-place forward DFT of a 4x4x4 array: the power measured before the transform must match
// the power of the transformed data divided by the number of elements.
BOOST_AUTO_TEST_CASE(fftw_3D_power_in_place){
	multi::array<complex, 3> io({4, 4, 4});
	std::iota(io.data_elements(), io.data_elements() + io.num_elements(), 1.2);
	auto powerin = power(io); // must be captured before the data is overwritten
	fftw::dft_inplace(io, fftw::forward);
	// Two-sided check, as in fftw_3D_power: without std::abs any negative difference,
	// however large, would satisfy the inequality.
	BOOST_REQUIRE( std::abs(powerin - power(io)/num_elements(io)) < 1e-10 );
}
// In-place forward DFT applied through a temporary array_ref that views the storage of a
// 4x4x4 array; the power before and after (normalized by element count) must agree.
BOOST_AUTO_TEST_CASE(fftw_3D_power_in_place_over_ref_inplace){
	multi::array<complex, 3> io({4, 4, 4});
	std::iota(io.data_elements(), io.data_elements() + io.num_elements(), 1.2);
	auto powerin = power(io); // must be captured before the data is overwritten
	fftw::dft_inplace(multi::array_ref<complex, 3>(data_elements(io), extensions(io)), fftw::forward);
	// Two-sided check, as in fftw_3D_power: without std::abs any negative difference,
	// however large, would satisfy the inequality.
	BOOST_REQUIRE( std::abs(powerin - power(io)/num_elements(io)) < 1e-10 );
}
// Out-of-place forward DFT where the input is read through an array_cref and the result is
// assigned through an array_ref that views the storage of `out`.
BOOST_AUTO_TEST_CASE(fftw_3D_power_out_of_place_over_ref){
	multi::array<complex, 3> in({4, 4, 4});
	std::iota(data_elements(in), data_elements(in)+num_elements(in), 1.2);
	multi::array<complex, 3> out({4, 4, 4});
	multi::array_ref<complex, 3>(data_elements(out), extensions(out)) = fftw::dft(multi::array_cref<complex, 3>(data_elements(in), extensions(in)), fftw::forward);
	// Two-sided check, as in fftw_3D_power: without std::abs any negative difference,
	// however large, would satisfy the inequality.
	BOOST_REQUIRE( std::abs(power(in) - power(out)/num_elements(out)) < 1e-10 );
}
// Forward DFT where the input is a temporary array returned by a lambda; the input power is
// recorded inside the lambda because the temporary is not accessible afterwards.
BOOST_AUTO_TEST_CASE(fftw_3D_power_out_of_place_over_temporary){
double powerin;
// `f` builds the 4x4x4 input, stores its power in `powerin` (captured by reference) and returns it by value.
auto f = [&](){
multi::array<complex, 3> in({4, 4, 4});
std::iota(data_elements(in), data_elements(in)+num_elements(in), 1.2);
powerin = power(in);
return in;
};
auto out = fftw::dft(f(), fftw::forward);
BOOST_REQUIRE( std::abs(powerin - power(out)/num_elements(out)) < 1e-10 );
}
// Uses multi::fftw::copy with the same 2x2 array as source and (rotated) destination and checks
// that the value originally at [1][0] ends up at [0][1], i.e. the array was transposed in place.
BOOST_AUTO_TEST_CASE(fftw_2D_transposition_square_inplace){
multi::array<complex, 2> in = {
{11., 12.},
{21., 22.}
};
BOOST_REQUIRE( in[1][0] == 21. );
multi::fftw::copy(in, rotated(in));
BOOST_TEST( in[0][1].real() == 21. );
BOOST_TEST( in[0][1].imag() == 0. );
}
// Transforms only the two middle dimensions of a 50x100x137x1 array and checks that the input
// power equals the output power divided by the sizes of the two transformed dimensions.
BOOST_AUTO_TEST_CASE(fftw_4D_inq_poisson){
multi::array<complex, 4> const in = []{
multi::array<complex, 4> in({50, 100, 137, 1});
std::iota(data_elements(in), data_elements(in)+num_elements(in), 1.2);
return in;
}();
multi::array<complex, 4> out(extensions(in));
// No direction argument is passed here. NOTE(review): confirm which direction this overload defaults to.
multi::fftw::dft({0, 1, 1, 0}, in, out);
// Normalization uses sizes 1 and 2 only, matching the dimensions flagged with 1 above.
BOOST_TEST( power(in) == power(out)/std::get<1>(sizes(out))/std::get<2>(sizes(out)) , boost::test_tools::tolerance(1e-10) );
}
#endif
#endif

View File

@ -0,0 +1,21 @@
# Build script for the FFTW adaptor tests of boost-multi.
cmake_minimum_required(VERSION 3.11)
# Echo every compiler/linker command line during the build.
set(CMAKE_VERBOSE_MAKEFILE ON)
# Make the Find modules in the local cmake/ directory visible to find_package.
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
project(boost-multi-adaptors-fftw VERSION 0.1 LANGUAGES CXX)
# NOTE(review): confirm that the Find module used here actually handles COMPONENTS DOUBLE_LIB.
find_package(FFTW REQUIRED COMPONENTS DOUBLE_LIB)
# Directory-scope settings: they apply to every target created below, including those in test/.
include_directories(${FFTW_INCLUDE_DIRS})
link_libraries(${FFTW_LIBRARIES})
# NOTE(review): BLAS_LIBRARIES is not set anywhere in this file; it expands to nothing unless an
# enclosing project or the cache defines it — confirm whether find_package(BLAS) is missing.
link_libraries(${BLAS_LIBRARIES})
# Require C++17 without compiler-specific extensions.
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)
include_directories(${CMAKE_BINARY_DIR})
add_subdirectory(test)

View File

@ -0,0 +1,68 @@
# ==================================================================================================
# This file is part of the CodeVault project. The project is licensed under Apache Version 2.0.
# CodeVault is part of the EU-project PRACE-4IP (WP7.3.C).
#
# Author(s):
# Cedric Nugteren <cedric.nugteren@surfsara.nl>
#
# ==================================================================================================
#
# Defines the following variables:
# FFTW_FOUND Boolean holding whether or not the FFTW3 library was found
# FFTW_INCLUDE_DIRS The FFTW3 include directory
# FFTW_LIBRARIES The FFTW3 library
#
# In case FFTW3 is not installed in the default directory, set the FFTW_ROOT variable to point to
# the root of FFTW3, such that 'fftw3.h' can be found in $FFTW_ROOT/include. This can either be done
# using an environmental variable (e.g. export FFTW_ROOT=/path/to/fftw3) or using a CMake variable
# (e.g. cmake -DFFTW_ROOT=/path/to/fftw3 ..).
#
# ==================================================================================================
# Candidate install locations.
# FFTW_HINTS holds the user-supplied roots (CMake variable first, then environment variable);
# either may be empty, in which case it contributes nothing to the list.
set(FFTW_HINTS
${FFTW_ROOT}
$ENV{FFTW_ROOT}
)
# Fallback prefixes, passed as PATHS to the find commands below.
set(FFTW_PATHS
/usr
/usr/local
)
# Finds the include directory: the directory that contains fftw3.h
find_path(FFTW_INCLUDE_DIRS
NAMES fftw3.h
HINTS ${FFTW_HINTS}
PATH_SUFFIXES include api inc include/x86_64 include/x64
PATHS ${FFTW_PATHS}
DOC "FFTW3 include header fftw3.h"
)
mark_as_advanced(FFTW_INCLUDE_DIRS)
# Finds the library (only the one named fftw3 is searched for)
find_library(FFTW_LIBRARIES
NAMES fftw3
HINTS ${FFTW_HINTS}
PATH_SUFFIXES lib lib64 lib/x86_64 lib/x64 lib/x86 lib/Win32
PATHS ${FFTW_PATHS}
DOC "FFTW3 library"
)
mark_as_advanced(FFTW_LIBRARIES)
# ==================================================================================================
# Notification messages (informational only; the found/not-found decision is made further down)
if(NOT FFTW_INCLUDE_DIRS)
message(STATUS "Could NOT find 'fftw3.h', install FFTW3 or set FFTW_ROOT")
endif()
if(NOT FFTW_LIBRARIES)
message(STATUS "Could NOT find the FFTW3 library, install it or set FFTW_ROOT")
endif()
# Determines whether or not FFTW3 was found: both variables listed below must be set
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(FFTW DEFAULT_MSG FFTW_INCLUDE_DIRS FFTW_LIBRARIES)
# ==================================================================================================

View File

@ -0,0 +1,174 @@
#if COMPILATION// -*-indent-tabs-mode:t;c-basic-offset:4;tab-width:4;-*-
$CXXX $CXXFLAGS $0 -o $0x -lfftw3 -lfftw3_mpi&&$0x&&rm $0x;exit
#endif
// © Alfredo A. Correa 2020
// apt-get install libfftw3-mpi-dev
// compile with: mpicc simple_mpi_example.c -Wl,-rpath=/usr/local/lib -lfftw3_mpi -lfftw3 -o simple_mpi_example
#ifndef MULTI_ADAPTOR_FFTW_MEMORY_HPP
#define MULTI_ADAPTOR_FFTW_MEMORY_HPP
#include <fftw3.h>
#include "../../config/NODISCARD.hpp"
#include<cassert>
#include<complex>
#include<cstddef>
#include<limits>
#include<memory>
#include<new>
#include<type_traits>
#include<utility>
namespace boost{
namespace multi{
namespace fftw{
/// Stateless C++ allocator that obtains storage with `fftw_malloc` and releases it with `fftw_free`.
///
/// `allocate` follows the usual allocator contract: it throws `std::bad_alloc` instead of
/// returning a null pointer when the request cannot be satisfied, so containers never write
/// through a null pointer.
template<class T>
class allocator{
public:
	using value_type = T;
#if 1
	using pointer = value_type*;
	using const_pointer = typename std::pointer_traits<pointer>::template
		rebind<value_type const>;
	using void_pointer = typename std::pointer_traits<pointer>::template
		rebind<void>;
	using const_void_pointer = typename std::pointer_traits<pointer>::template
		rebind<const void>;
	using difference_type = typename std::pointer_traits<pointer>::difference_type;
	using size_type = std::make_unsigned_t<difference_type>;
	template <class U> struct rebind {typedef allocator<U> other;};
#endif
	allocator() noexcept {} // not required, unless used
	/// Converting constructor; there is no state to copy.
	template <class U> allocator(allocator<U> const&) noexcept {}

	/// Allocates uninitialized storage for `n` objects of type `T`.
	/// Throws `std::bad_alloc` if `n*sizeof(T)` would overflow or if `fftw_malloc` fails.
	NODISCARD("to avoid memory leak")
	value_type* allocate(std::size_t n) const{
		// guard the multiplication below against wrap-around
		if(n > std::numeric_limits<std::size_t>::max()/sizeof(T)) throw std::bad_alloc{};
		void* const raw = fftw_malloc(sizeof(T)*n);
		// a null result is only an error for non-empty requests
		if(raw == nullptr and n != 0) throw std::bad_alloc{};
		return static_cast<value_type*>(raw);
	}
	/// Releases storage previously returned by `allocate`; the size argument is ignored.
	void deallocate(value_type* p, std::size_t){fftw_free(p);}
	/// Forwards to `fftw_alignment_of` for the given address.
	static int alignment_of(value_type* p){return fftw_alignment_of(reinterpret_cast<double*>(p));}
#if 1
	/// Hinted allocation; the hint is ignored.
	value_type* allocate(std::size_t n, const_void_pointer){return allocate(n);}
	template <class U, class ...Args>
	void construct(U* p, Args&& ...args){::new(p) U(std::forward<Args>(args)...);}
	template <class U> void destroy(U* p) noexcept{p->~U();}
	/// Largest `n` for which `allocate(n)` does not overflow the byte count.
	std::size_t max_size() const noexcept{return std::numeric_limits<size_type>::max()/sizeof(value_type);}
	allocator select_on_container_copy_construction() const{return *this;}
	using propagate_on_container_copy_assignment = std::false_type;
	using propagate_on_container_move_assignment = std::false_type;
	using propagate_on_container_swap = std::false_type;
	using is_always_equal = std::is_empty<allocator>;
#endif
};
/// Any two fftw allocators compare equal, whatever their value types.
template <class T, class U>
bool operator==(allocator<T> const& /*lhs*/, allocator<U> const& /*rhs*/) noexcept{
	return true;
}
/// Negation of operator== above.
template <class T, class U>
bool operator!=(allocator<T> const& lhs, allocator<U> const& rhs) noexcept{
	return not (lhs == rhs);
}
#if 0
template<typename T>
struct allocator{
using value_type = T;
using pointer = value_type*;
using size_type = std::size_t;
using difference_type = std::ptrdiff_t;
using propagate_on_container_move_assignment = std::true_type;
// NODISCARD("to avoid memory leak")
pointer allocate(size_type n) const{return static_cast<pointer>(fftw_malloc(sizeof(T)*n));}
void deallocate(pointer data, size_type){fftw_free(data);}
};
#endif
//template<> allocator<std::complex<double>>::pointer allocator<std::complex<double>>::allocate(size_type n){return reinterpret_cast<std::complex<double>*>(fftw_alloc_complex(n));}
//template<> allocator< double >::pointer allocator< double >::allocate(size_type n){return fftw_alloc_real(n) ;}
#if 0
template<>
struct allocator<std::complex<double>>{
using value_type = std::complex<double>;
using pointer = value_type*;
using size_type = std::size_t;
using difference_type = std::ptrdiff_t;
using propagate_on_container_move_assignment = std::true_type;
NODISCARD("to avoid memory leak")
pointer allocate(size_type n){return reinterpret_cast<std::complex<double>*>(fftw_alloc_complex(n));}
void deallocate(pointer data, size_type){fftw_free(data);}
};
template<>
struct allocator<double>{
using value_type = double;
using pointer = value_type*;
using size_type = std::size_t;
using difference_type = std::ptrdiff_t;
using propagate_on_container_move_assignment = std::true_type;
NODISCARD("to avoid memory leak")
pointer allocate(size_type n){return fftw_alloc_real(n);}
void deallocate(pointer data, size_type){fftw_free(data);}
};
#endif
}}}
#if 0//__NVCC__
namespace std{
template<class T> struct allocator_traits<boost::multi::fftw::allocator<T>> : std::allocator_traits<std::allocator<T>>{
using base = std::allocator_traits<std::allocator<T>>;
template<class U> using rebind_alloc = boost::multi::fftw::allocator<U>;
template<class A>
static auto allocate(A& a, typename base::size_type n){return a.allocate(n);}
};
}
#endif
#if 0 //def __NVCC__
namespace std{
template<class T> struct allocator_traits<boost::multi::fftw::allocator<T>> : std::allocator_traits<std::allocator<T>>{
template<class U> using rebind_alloc = boost::multi::fftw::allocator<U>;
};
}
#endif
#if not __INCLUDE_LEVEL__
#include "../../array.hpp"
#include<vector>
namespace multi = boost::multi;
// Stand-alone smoke test, compiled only when this header is the top-level translation unit:
// instantiates std::vector with the fftw allocator for double and std::complex<double>.
int main(){
{
std::vector<double, multi::fftw::allocator<double>> v(100);
// NOTE(review): this multi::array does not name the fftw allocator, so it presumably uses the
// library's default allocator — confirm whether the fftw allocator was meant to be passed.
multi::array<double, 2> arr({10, 20});
}
{
std::vector<std::complex<double>, multi::fftw::allocator<std::complex<double>>> v(100);
multi::array<std::complex<double>, 2> arr({10, 20});
}
}
#endif
#endif

View File

@ -0,0 +1,207 @@
#if COMPILATION// -*-indent-tabs-mode:t;c-basic-offset:4;tab-width:4;-*-
ln -sf $0 $0.cpp;mpicxx -g -I$HOME/prj/alf $0.cpp -o $0x -lfftw3 -lfftw3_mpi&&time mpirun -n 4 $0x&&rm $0x $0.cpp;exit
#ln -sf $0 $0.cpp;mpicxx -g -I$HOME/prj/alf $0.cpp -o $0x -lfftw3 -lfftw3_mpi&&time mpirun -n 4 valgrind --leak-check=full --track-origins=yes --show-leak-kinds=all --suppressions=$HOME/prj/alf/boost/mpi3/test/communicator_main.cpp.openmpi.supp --error-exitcode=1 $0x&&rm $0x $0.cpp;exit
#endif
// © Alfredo A. Correa 2020
// apt-get install libfftw3-mpi-dev
// compile with: mpicc simple_mpi_example.c -Wl,-rpath=/usr/local/lib -lfftw3_mpi -lfftw3 -o simple_mpi_example
#include "../../array.hpp"
#include "../../config/NODISCARD.hpp"
#include<boost/mpi3/communicator.hpp>
#include<boost/mpi3/environment.hpp>
#include "../fftw.hpp"
#include <fftw3-mpi.h>
namespace boost{
namespace multi{
namespace fftw{
/// std::allocator-derived allocator whose storage comes from fftw_malloc / fftw_free.
template<typename T>
struct allocator : std::allocator<T>{
	template <typename U> struct rebind{using other = fftw::allocator<U>;};

	/// Requests room for `n` objects of type T from FFTW; the result is returned unchecked.
	NODISCARD("to avoid memory leak")
	T* allocate(std::size_t n){
		void* const raw = fftw_malloc(sizeof(T)*n);
		return static_cast<T*>(raw);
	}

	/// Hands the block back to FFTW; the element count is not needed.
	void deallocate(T* data, std::size_t){
		fftw_free(data);
	}
};
namespace mpi{
/// Scope guard for the FFTW-MPI library: calls fftw_mpi_init() on construction and
/// fftw_mpi_cleanup() on destruction.
struct environment{
	environment(){fftw_mpi_init();}
	// Non-copyable: a copy would run fftw_mpi_cleanup() a second time when it is destroyed.
	environment(environment const&) = delete;
	environment& operator=(environment const&) = delete;
	~environment(){fftw_mpi_cleanup();}
};
template<class T, multi::dimensionality_type D, class Alloc = fftw::allocator<T>>
struct array;
namespace bmpi3 = boost::mpi3;
/// Two-dimensional array distributed over the ranks of an MPI communicator.
/// The share owned by each rank is the one reported by `fftw_mpi_local_size_2d`; it is
/// accessible through `local_cutout()` and its storage is obtained from `Alloc`.
template<class T, class Alloc>
struct array<T, multi::dimensionality_type{2}, Alloc>{
	using element_type = T;

	mutable bmpi3::communicator comm_; // mutable: used from const members (comm(), operator==, conversion)
	Alloc alloc_;
	typename std::allocator_traits<Alloc>::size_type local_count_; // element count passed to allocate/deallocate
	array_ptr<T, 2, typename std::allocator_traits<Alloc>::pointer> local_ptr_; // this rank's share of the data
	ptrdiff_t n0_; // global size of the first dimension

	/// Asks FFTW for this rank's share of a global array with extensions `ext`.
	/// Returns {number of elements to allocate, local extensions}; the local extensions cover
	/// rows [local_0_start, local_0_start + local_n0) and the full second extension.
	static std::pair<typename std::allocator_traits<Alloc>::size_type, multi::extensions_type_<2>>
	local_2d(multi::extensions_type_<2> ext, boost::mpi3::communicator const& comm){
		ptrdiff_t local_n0, local_0_start;
		auto count = fftw_mpi_local_size_2d(std::get<0>(ext).size(), std::get<1>(ext).size(), comm.get(), &local_n0, &local_0_start);
		assert( count >= local_n0*std::get<1>(ext).size() );
		return {count, {{local_0_start, local_0_start + local_n0}, std::get<1>(ext)}};
	}
	/// Allocation count part of `local_2d`.
	static auto local_count_2d(multi::extensions_type_<2> ext, boost::mpi3::communicator const& comm){
		return local_2d(ext, comm).first;
	}
	/// Local extensions part of `local_2d`.
	static auto local_extension_2d(multi::extensions_type_<2> ext, boost::mpi3::communicator const& comm){
		return local_2d(ext, comm).second;
	}

	/// Allocates the local share of a global array with extensions `ext` on `comm`.
	array(multi::extensions_type_<2> ext, bmpi3::communicator comm = mpi3::environment::self(), Alloc alloc = {}) :
		comm_{std::move(comm)},
		alloc_{alloc},
		local_count_{local_count_2d(ext, comm_)},
		local_ptr_ {alloc_.allocate(local_count_), local_extension_2d(ext, comm_)},
		n0_{multi::layout_t<2>(ext).size()}
	{
		if(not std::is_trivially_default_constructible<element_type>{})
			adl_alloc_uninitialized_default_construct_n(alloc_, local_ptr_->base(), local_ptr_->num_elements());
	}
	bmpi3::communicator& comm() const&{return comm_;}

	/// Copy: allocates a new local share and copies the local elements of `other`.
	array(array const& other) :
		comm_ {other.comm_},
		alloc_ {other.alloc_},
		local_count_{other.local_count_},
		local_ptr_ {alloc_.allocate(local_count_), local_extension_2d(other.extensions(), comm_)},
		n0_{multi::layout_t<2>(other.extensions()).size()}
	{
		local_cutout() = other.local_cutout();
	}
	/// Move: takes over the local storage of `other`.
	array(array&& other) :
		comm_ {std::move(other.comm_)},
		alloc_ {std::move(other.alloc_)},
		local_count_{std::exchange(other.local_count_, 0)},
		local_ptr_ {std::exchange(other.local_ptr_, nullptr)},
		// Read the stored size directly: members are initialized in declaration order, so by now
		// other.local_ptr_ has been reset and other.extensions() would dereference it.
		n0_{other.n0_}
	{}
	/// Builds a distributed array from a full (replicated) array by copying this rank's rows.
	explicit array(multi::array<T, 2> const& other, bmpi3::communicator comm = mpi3::environment::self(), Alloc alloc = {}) :
		array(other.extensions(), comm, alloc)
	{
		local_cutout() = other.stenciled(std::get<0>(local_cutout().extensions()), std::get<1>(local_cutout().extensions()));
	}

	/// True when the global array has no elements.
	bool empty() const{return num_elements() == 0;}
	array_ref <T, 2> local_cutout() &{return *local_ptr_;}
	array_cref<T, 2> local_cutout() const&{return *local_ptr_;}
	ptrdiff_t local_count() const&{return local_count_;}
	/// Global extensions: the stored first size combined with the local second extension.
	multi::extensions_type_<2> extensions() const&{return {n0_, std::get<1>(local_cutout().extensions())};}
	ptrdiff_t num_elements() const&{return multi::layout_t<2>(extensions()).num_elements();}

	/// Gathers the local shares of all ranks into a full array on every rank.
	operator multi::array<T, 2>() const&{ static_assert( std::is_trivially_copy_assignable<T>{}, "!" );
		multi::array<T, 2> ret(extensions(), alloc_);
		comm_.all_gatherv_n(local_cutout().data_elements(), local_cutout().num_elements(), ret.data_elements());
		return ret;
	}
	/// Assigns from a full array: copies this rank's rows when the extensions match.
	array& operator=(multi::array<T, 2> const& other) &{
		if(other.extensions() == extensions()) local_cutout() = other.stenciled(std::get<0>(local_cutout().extensions()), std::get<1>(local_cutout().extensions()));
		else{
			// NOTE(review): `tmp` is built on the default communicator rather than comm_, and
			// std::swap goes through the copy assignment below, which asserts on differing
			// extensions — this branch needs a dedicated swap/move assignment to be usable.
			array tmp{other};
			std::swap(*this, tmp);
		}
		return *this;
	}
	/// Compares against a full array; the per-rank results are combined with `comm_ &=`.
	bool operator==(multi::array<T, 2> const& other) const&{
		if(other.extensions() != extensions()) return false;
		return comm_&=(local_cutout() == other.stenciled(std::get<0>(local_cutout().extensions()), std::get<1>(local_cutout().extensions())));
	}
	friend bool operator==(multi::array<T, 2> const& other, array const& self){
		return self.operator==(other);
	}
	/// Compares against another distributed array on the same communicator.
	bool operator==(array<T, 2> const& other) const&{assert(comm_==other.comm_);
		return comm_&=(local_cutout() == other.local_cutout());
	}
	/// Copy assignment; only supported between arrays with equal extensions and communicators.
	array& operator=(array const& other)&{
		// compare the other array's communicator with our own (not with itself)
		if(other.extensions() == this->extensions() and other.comm_ == comm_)
			local_cutout() = other.local_cutout();
		else assert(0);
		return *this;
	}
	~array() noexcept{alloc_.deallocate(local_cutout().data_elements(), local_count_);}
};
/// Distributed 2D complex DFT of `A` into `B` with sign `s`; returns `B`.
/// Both arrays must have the same global extensions and communicator (checked with assert).
/// Declared inline because this file is meant to be included (see the __INCLUDE_LEVEL__ guard
/// below), and a non-inline definition would be duplicated in every including translation unit.
inline array<std::complex<double>, 2>& dft(array<std::complex<double>, 2> const& A, array<std::complex<double>, 2>& B, fftw::sign s){
	assert( A.extensions() == B.extensions() );
	assert( A.comm() == B.comm() );
	fftw_plan p = fftw_mpi_plan_dft_2d(
		std::get<0>(A.extensions()).size(), std::get<1>(A.extensions()).size(),
		(fftw_complex *)A.local_cutout().data_elements(), (fftw_complex *)B.local_cutout().data_elements(),
		A.comm().get(),
		s, FFTW_ESTIMATE
	);
	assert( p ); // the planner returns a null plan on failure; executing it would crash
	fftw_execute(p);
	fftw_destroy_plan(p);
	return B;
}
/// Forward distributed DFT of `A` into `B`; returns `B`.
/// Inline so that including this file from several translation units does not violate the ODR.
inline array<std::complex<double>, 2>& dft_forward(array<std::complex<double>, 2> const& A, array<std::complex<double>, 2>& B){
	return dft(A, B, fftw::forward);
}
/// Forward distributed DFT of `A` returned as a new array with the extensions of `A`.
/// Inline so that including this file from several translation units does not violate the ODR.
inline array<std::complex<double>, 2> dft_forward(array<std::complex<double>,2> const& A){
	array<std::complex<double>, 2> ret(A.extensions());
	dft_forward(A, ret);
	return ret;
}
}}}}
#if not __INCLUDE_LEVEL__
#include<boost/mpi3/main.hpp>
#include<boost/mpi3/environment.hpp>
#include<boost/mpi3/ostream.hpp>
#include "../fftw.hpp"
namespace mpi3 = boost::mpi3;
namespace multi = boost::multi;
// Stand-alone driver: fills a distributed 41x321 array, gathers it into a full array, transforms
// both (distributed and local) in place and prints every local element whose difference exceeds 1e-12.
int mpi3::main(int, char*[], mpi3::communicator world){
multi::fftw::mpi::environment fenv;
multi::fftw::mpi::array<std::complex<double>, 2> A({41, 321}, world);
mpi3::ostream os{world};
os<< "global sizes" << std::get<0>(A.extensions()) <<'x'<< std::get<1>(A.extensions()) <<' '<< A.num_elements() <<std::endl;
os<< A.local_cutout().extension() <<'x'<< std::get<1>(A.local_cutout().extensions()) <<"\t#="<< A.local_cutout().num_elements() <<" allocated "<< A.local_count() <<std::endl;
{
// Each local element is set to a complex number of unit modulus that depends on its indices.
auto x = A.local_cutout().extensions();
for(auto i : std::get<0>(x))
for(auto j : std::get<1>(x))
A.local_cutout()[i][j] = std::complex<double>(i + j, i + 2*j + 1)/std::abs(std::complex<double>(i + j, i + 2*j + 1));
}
// Conversion to multi::array gathers the full array on every rank.
multi::array<std::complex<double>, 2> A2 = A;
assert( A2 == A );
using multi::fftw::dft_forward;
dft_forward(A , A );
dft_forward(A2, A2);
{
auto x = A.local_cutout().extensions();
for(auto i : std::get<0>(x))
for(auto j : std::get<1>(x))
if(not( std::abs(A.local_cutout()[i][j] - A2[i][j]) < 1e-12 )){
// NOTE(review): mismatches are only printed; the program still returns 0, so this
// driver cannot fail on a wrong result.
std::cout << A.local_cutout()[i][j] - A2[i][j] <<' '<< std::abs(A.local_cutout()[i][j] - A2[i][j]) << std::endl;
}
}
return 0;
}
#endif

View File

@ -0,0 +1,157 @@
#if COMPILATION_INSTRUCTIONS
#mpicxx -I$HOME/prj/alf $0 -g -o $0x -lfftw3 -lfftw3_mpi &&mpirun -n 4 valgrind $0x;exit
$CXXX $CXXFLAGS -O2 -g `mpicxx -showme:compile|sed 's/-pthread/ /g'` -I$HOME/prj/alf $0 -o $0x `mpicxx -showme:link|sed 's/-pthread/ /g'` -lfftw3 -lfftw3_mpi -lboost_timer&&mpirun -n 4 $0x;exit
#endif
#ifndef MULTI_FFTW_MPI_DISTRIBUTION_HPP
#define MULTI_FFTW_MPI_DISTRIBUTION_HPP
#include <fftw3-mpi.h>
#include<boost/mpi3/communicator.hpp>
#include "../../../array_ref.hpp"
#include <experimental/tuple>
namespace boost{
namespace multi{
namespace fftw{
namespace mpi{
namespace bmpi3 = boost::mpi3;
using difference_type = std::ptrdiff_t;
// Describes this rank's share of a 2D array of `ElementSize`-byte elements as reported by
// fftw_mpi_local_size_many: how many elements to allocate and which rows of the first
// dimension are local.
template<std::ptrdiff_t ElementSize>
class many{
public:
using difference_type = std::ptrdiff_t;
private:
difference_type local_count_;
// The next two members are filled in through output pointers while local_count_ is being
// initialized (see the constructor). They must stay without default member initializers:
// being declared after local_count_, an initializer would run later and overwrite the values.
difference_type local_n0_;
difference_type local_0_start_;
// Converts the 2D extensions into the array of sizes handed to FFTW.
static auto sizes(boost::multi::extensions_type_<2> const& ext){
using std::experimental::apply;
return apply([](auto... e){return std::array<difference_type, 2>{e.size()...};}, ext);
}
public:
// `block0` is the block size for the first dimension; it defaults to FFTW's own choice.
many(extensions_type_<2> const& ext, bmpi3::communicator const& comm, difference_type block0 = FFTW_MPI_DEFAULT_BLOCK)
: local_count_{
// The FFTW call is made with ElementSize/sizeof(double) doubles per element and its result is
// scaled by sizeof(double)/ElementSize; the outcome is clamped to at least 1.
std::max(
difference_type(
fftw_mpi_local_size_many(
2, sizes(ext).data(), ElementSize/sizeof(double),
block0, comm.get(),
&local_n0_, &local_0_start_
)*sizeof(double)/ElementSize
),
difference_type(1)
)
}
{
static_assert( ElementSize%sizeof(double) == 0 , "!" );
}
// NOTE(review): the reported count is padded by a constant 100 elements; the reason is not
// documented here — confirm whether this works around an under-allocation.
difference_type local_count() const{return local_count_ + 100;}
// Range of first-dimension indices owned by this rank.
multi::iextension local_extension_0() const{return {local_0_start_, local_0_start_ + local_n0_};}
multi::iextension local_extension() const{return local_extension_0();}
// Two distributions are equal when all stored values agree.
bool operator==(many const& other) const{
return std::tie(this->local_count_, this->local_n0_, this->local_0_start_)
== std::tie(other.local_count_, other.local_n0_, other.local_0_start_);
}
bool operator!=(many const& other) const{return not operator==(other);}
};
// Describes this rank's share of a 2D array of `ElementSize`-byte elements as reported by
// fftw_mpi_local_size_many_transposed: the allocation size plus the local index ranges of the
// first dimension and of the second (transposed) dimension.
template<std::ptrdiff_t ElementSize>
class many_transposed{
public:
using difference_type = std::ptrdiff_t;
private:
difference_type local_count_;
// The next four members are filled in through output pointers while local_count_ is being
// initialized (see the constructor). They must stay without default member initializers:
// being declared after local_count_, an initializer would run later and overwrite the values.
difference_type local_n0_;
difference_type local_0_start_;
difference_type local_n1_;
difference_type local_1_start_;
// Converts the 2D extensions into the array of sizes handed to FFTW.
static auto sizes(boost::multi::extensions_type_<2> const& ext){
using std::experimental::apply;
return apply([](auto... e){return std::array<difference_type, 2>{e.size()...};}, ext);
}
public:
static_assert(ElementSize%sizeof(double)==0, "!");
// `block0`/`block1` are the block sizes for the two dimensions; both default to FFTW's own choice.
many_transposed(
extensions_type_<2> const& ext, boost::mpi3::communicator const& comm,
difference_type block0 = FFTW_MPI_DEFAULT_BLOCK, difference_type block1 = FFTW_MPI_DEFAULT_BLOCK
) :
local_count_{
// The FFTW call is made with ElementSize/sizeof(double) doubles per element and its result is
// scaled by sizeof(double)/ElementSize; the outcome is clamped to at least 1.
std::max(
difference_type(
fftw_mpi_local_size_many_transposed(
2, sizes(ext).data(), ElementSize/sizeof(double),
block0, block1, comm.get(),
&local_n0_, &local_0_start_,
&local_n1_, &local_1_start_
)*sizeof(double)/ElementSize
),
difference_type(1)
)
}{
static_assert( ElementSize%sizeof(double) == 0 , "!");
// FFTW_MPI_DEFAULT_BLOCK = (size + comm.size - 1)/comm.size
assert( local_count() >= local_extension0().size()*local_extension1().size() );
// assert( block0*comm.size() >= std::get<0>(ext).size() or block0 == FFTW_MPI_DEFAULT_BLOCK );
}
// NOTE(review): the reported count is padded by a constant 100 elements; the reason is not
// documented here — confirm whether this works around an under-allocation.
difference_type local_count() const{return local_count_ + 100;}
// Local index ranges of the first and second dimension, respectively.
multi::iextension local_extension0() const{return {local_0_start_, local_0_start_ + local_n0_};}
multi::iextension local_extension1() const{return {local_1_start_, local_1_start_ + local_n1_};}
// Two distributions are equal when all stored values agree.
bool operator==(many_transposed const& other) const{
return std::tie(this->local_count_, this->local_n0_, this->local_0_start_, this->local_n1_, this->local_1_start_)
== std::tie(other.local_count_, other.local_n0_, other.local_0_start_, other.local_n1_, other.local_1_start_);
}
bool operator!=(many_transposed const& other) const{return not operator==(other);}
};
}}}}
#if not __INCLUDE_LEVEL__
#include<boost/mpi3/main_environment.hpp>
#include<boost/mpi3/ostream.hpp>
#include "../../fftw/mpi/environment.hpp"
namespace bmpi3 = boost::mpi3;
namespace multi = boost::multi;
namespace mpi = multi::fftw::mpi;
// Stand-alone driver: prints the distribution of a 12x43 array obtained with an explicitly
// forced block size and with FFTW's automatic block size, then checks that both agree.
int bmpi3::main(int, char*[], mpi3::environment& env){
	multi::fftw::mpi::environment fenv(env);
	auto world = env.world();
	mpi3::ostream os{world};
	using std::endl;
	// Block size FFTW is expected to choose by default for 12 rows: ceil(12/number of ranks).
	auto const forced_block0 = (12+world.size()-1)/world.size();
	{
		os<< "forced distribution "<<endl;
		mpi::many_transposed<sizeof(double)> dist({12, 43}, world, forced_block0);//533/world.size());
		os<< "local element count "<< dist.local_count() <<endl;
		os<< "local rows "<< dist.local_extension0().size() <<endl;
		os<< "local extension "<< dist.local_extension0() <<endl;
	}
	{
		os<< "automatic distribution "<<std::endl;
		mpi::many_transposed<sizeof(double)> dist({12, 43}, world);//533/world.size());
		os<< "local element count "<< dist.local_count() <<endl;
		os<< "local rows "<< dist.local_extension0().size() <<endl;
		os<< "local extension "<< dist.local_extension0() <<endl;
	}
	// `forced` must really use the explicit block size; building both objects with identical
	// arguments would make the comparison below trivially true.
	mpi::many_transposed<sizeof(double)> forced({12, 43}, world, forced_block0);
	mpi::many_transposed<sizeof(double)> automa({12, 43}, world);
	assert( forced == automa );
	return 0;
}
#endif
#endif

View File

@ -0,0 +1,44 @@
#if COMPILATION_INSTRUCTIONS
#mpicxx -I$HOME/prj/alf $0 -g -o $0x -lfftw3 -lfftw3_mpi &&mpirun -n 2 valgrind $0x;exit
$CXXX $CXXFLAGS -O2 -g `mpicxx -showme:compile|sed 's/-pthread/ /g'` -I$HOME/prj/alf $0 -o $0x `mpicxx -showme:link|sed 's/-pthread/ /g'` -lfftw3 -lfftw3_mpi -lboost_timer&&mpirun -n 2 $0x;exit
#endif
#ifndef MULTI_FFTW_MPI_ENVIRONMENT_HPP
#define MULTI_FFTW_MPI_ENVIRONMENT_HPP
#include <fftw3-mpi.h>
#include<boost/mpi3/communicator.hpp>
#include "../../../array_ref.hpp"
#include <experimental/tuple>
namespace boost{
namespace multi{
namespace fftw{
namespace mpi{
namespace bmpi3 = boost::mpi3;
/// Scope guard for the FFTW-MPI library: calls fftw_mpi_init() on construction and
/// fftw_mpi_cleanup() on destruction. The MPI environment argument is not used; requiring it
/// only ensures that one exists when the guard is created.
struct environment{
	environment(bmpi3::environment&){fftw_mpi_init();}
	// Non-copyable: a copy would run fftw_mpi_cleanup() a second time when it is destroyed.
	environment(environment const&) = delete;
	environment& operator=(environment const&) = delete;
	~environment(){fftw_mpi_cleanup();}
};
}}}}
#if not __INCLUDE_LEVEL__
#include<boost/mpi3/main_environment.hpp>
namespace bmpi3 = boost::mpi3;
namespace multi = boost::multi;
// Stand-alone smoke test: creates and destroys the FFTW-MPI environment guard.
int bmpi3::main(int, char*[], mpi3::environment& env){
multi::fftw::mpi::environment fenv(env); // initializes FFTW-MPI here, cleans up at scope exit
return 0;
}
#endif
#endif

View File

@ -0,0 +1,566 @@
#if COMPILATION// -*- indent-tabs-mode:t;c-basic-offset:4;tab-width:4;autowrap:nil; -*-
$CXXX $CXXFLAGS `mpicxx -showme:compile|sed 's/-pthread/ /g'` -I$HOME/prj/alf $0 -o $0x `mpicxx -showme:link|sed 's/-pthread/ /g'` -lfftw3 -lfftw3_mpi&&mpirun -n 4 $0x&&rm $0x;exit
#ln -sf $0 $0.cpp;mpicxx -g -I$HOME/prj/alf $0.cpp -o $0x -lfftw3 -lfftw3_mpi&&mpirun -n 4 valgrind --leak-check=full --track-origins=yes --show-leak-kinds=all --suppressions=$HOME/prj/alf/boost/mpi3/test/communicator_main.cpp.openmpi.supp --error-exitcode=1 $0x&&rm $0x $0.cpp;exit
#endif
// © Alfredo A. Correa 2020
// apt-get install libfftw3-mpi-dev
// compile with: mpicc simple_mpi_example.c -Wl,-rpath=/usr/local/lib -lfftw3_mpi -lfftw3 -o simple_mpi_example
#ifndef MULTI_ADAPTOR_FFTW_MPI_SCATTERED_ARRAY_HPP
#define MULTI_ADAPTOR_FFTW_MPI_SCATTERED_ARRAY_HPP
#include "../mpi/distribution.hpp"
#include "boost/mpi3/process.hpp"
namespace boost{
namespace multi{
namespace fftw{
namespace mpi{
namespace bmpi3 = boost::mpi3;
template<class T, multi::dimensionality_type D, class Alloc = std::allocator<T>> // cannot use fftw::allocator<T> as default because it produces error in nvcc: `template<class _Tp> using __pointer = typename _Tp::pointer is protected within this context`
class scattered_array;
template<class T, multi::dimensionality_type D, class Alloc = std::allocator<T>> // cannot use fftw::allocator<T> as default because it produces error in nvcc: `template<class _Tp> using __pointer = typename _Tp::pointer is protected within this context`
class gathered_array;
template<class T, class Alloc>
struct array{
using local_distrubution_type = many_transposed<sizeof(T)>;
using local_allocator_type = Alloc;
using local_pointer_type = typename std::allocator_traits<local_allocator_type>::pointer;
protected:
local_distrubution_type local_distrubution_;
local_allocator_type alloc_;
local_pointer_type local_data_;
multi::iextension first_ext_;
multi::iextension second_ext_;
public:
array(
multi::extensions_type_<2> exts, bmpi3::communicator comm,
difference_type block0 = FFTW_MPI_DEFAULT_BLOCK, difference_type block1 = FFTW_MPI_DEFAULT_BLOCK,
Alloc alloc = {}
) :
local_distrubution_{exts, comm, block0, block1},
alloc_{alloc},
local_data_{alloc_.allocate(local_distrubution_.local_count())},
first_ext_{std::get<0>(exts)},
second_ext_{std::get<1>(exts)}
{}
~array() noexcept{alloc_.deallocate(local_data_, local_distrubution_.local_count());}
auto local_cutout() &{return array_ref <T, 2, local_pointer_type>(local_data_, local_distrubution_.local_extension0()*local_distrubution_.local_extension1());}//.rotated();}
auto local_cutout() const&{return array_cref<T, 2, local_pointer_type>(local_data_, local_distrubution_.local_extension0()*local_distrubution_.local_extension1());}//.rotated();}
};
// 2D "gathered" array: an `array` whose block sizes are the full global sizes of each dimension
// (see the base-class initializer), plus the communicator it was built with.
template<class T, class Alloc>
class gathered_array<T, 2, Alloc> : public array<T, Alloc>{
	bmpi3::communicator comm_;
public:
	// Forwards the extensions, communicator and allocator to the base, using the sizes of both
	// extensions as the two block sizes.
	gathered_array(multi::extensions_type_<2> exts, bmpi3::communicator comm, Alloc alloc = {}) :
		array<T, Alloc>{exts, comm, std::get<0>(exts).size(), std::get<1>(exts).size(), alloc},
		comm_{std::move(comm)}
	{}

	// Builds a scattered_array with the same global extensions and fills it by executing an
	// FFTW-MPI transpose plan (flag FFTW_MPI_TRANSPOSED_IN) from this array's local buffer.
	scattered_array<T, 2, Alloc> scatter() const{
		scattered_array<T, 2, Alloc> result({this->first_ext_, this->second_ext_}, comm_);

		auto const n_second = this->second_ext_.size();
		auto const n_first  = this->first_ext_.size();

		auto* const source = reinterpret_cast<double*>(const_cast<T*>(this->local_cutout().base()));
		auto* const target = reinterpret_cast<double*>(                result.local_cutout().base() );

		auto plan = fftw_mpi_plan_many_transpose(
			n_second, n_first,
			sizeof(T)/sizeof(double),          // each element is moved as this many doubles
			n_second, FFTW_MPI_DEFAULT_BLOCK,  // input block, output block
			source, target,
			comm_.get(), FFTW_ESTIMATE | FFTW_MPI_TRANSPOSED_IN
		);
		fftw_execute(plan);
		fftw_destroy_plan(plan);

		return result;
	}
};
template<class T, class Alloc>
class scattered_array<T, 2, Alloc>{
public:
using local_distrubution_type = many<sizeof(T)>;
using local_allocator_type = Alloc;
using local_pointer_type = typename std::allocator_traits<local_allocator_type>::pointer;
private:
local_distrubution_type local_distribution_;
local_allocator_type alloc_;
local_pointer_type local_data_;
multi::iextension first_ext_;
multi::iextension second_ext_;
mutable bmpi3::communicator comm_;
public:
scattered_array(multi::extensions_type_<2> exts, bmpi3::communicator comm, Alloc alloc = {}) :
local_distribution_{exts, comm},
alloc_{alloc},
local_data_{alloc_.allocate(local_distribution_.local_count())},
first_ext_{std::get<0>(exts)},
second_ext_{std::get<1>(exts)},
comm_{std::move(comm)}
{}
~scattered_array() noexcept{alloc_.deallocate(local_data_, local_distribution_.local_count());}
array_ref <T, 2, local_pointer_type> local_cutout() &{return array_ref <T, 2, local_pointer_type>(local_data_, local_distribution_.local_extension_0()*second_ext_);}
array_cref<T, 2, local_pointer_type> local_cutout() const&{return array_cref<T, 2, local_pointer_type>(local_data_, local_distribution_.local_extension_0()*second_ext_);}
mpi::gathered_array<T, 2> gather() const{
mpi::gathered_array<T, 2> other({first_ext_, second_ext_}, comm_);
auto p = fftw_mpi_plan_many_transpose(
first_ext_.size(), second_ext_.size(),
// std::get<0>(this->extensions()).size(), std::get<1>(this->extensions()).size(),
sizeof(T)/sizeof(double),
FFTW_MPI_DEFAULT_BLOCK, second_ext_.size(), //this->size(),
reinterpret_cast<double*>(const_cast<T*>(this->local_cutout().base())),
reinterpret_cast<double*>( other.local_cutout().base() ),
comm_.get(), FFTW_ESTIMATE | FFTW_MPI_TRANSPOSED_OUT
);
fftw_execute(p);
fftw_destroy_plan(p);
return other;
}
bool operator==(scattered_array const& other) const{return comm_&=(local_cutout() == other.local_cutout());}
bool operator!=(scattered_array const& other) const{return not operator==(other);}
};
}}}}
#if 0
// Preprocessor-disabled region (never compiled): an alternative `scattered_array<T, 2, Alloc>`
// specialization that initializes a `layout_t<T, 2>` sub-object in its constructors.
// NOTE(review): the class's closing `};` sits inside the nested `#if 0` further down, so this region
// cannot be re-enabled by flipping only the outer `#if 0`.
template<class T, class Alloc>
class scattered_array<T, multi::dimensionality_type{2}, Alloc>{
public:
using local_allocator_type = Alloc;
using local_pointer_t = typename std::allocator_traits<local_allocator_type>::pointer;
private:
using local_distrubution_type = distribution<sizeof(T)>;
using layout_type = layout_t<T, 2>;
Alloc alloc_ ;
local_pointer_t local_data_; // typename boost::multi::array_ptr<T, 2, local_pointer_t> local_ptr_;
public:
scattered_array(multi::extensions_type_<2> ext, bmpi3::communicator comm = mpi3::environment::self(), Alloc alloc = {}) :
layout_t<T, 2>(ext, comm),
alloc_{alloc},
local_data_{std::allocator_traits<Alloc>::allocate(alloc_, scattered_array::local_count())}//,
{
if(not std::is_trivially_default_constructible<typename scattered_array::element_type>{})
adl_alloc_uninitialized_default_construct_n(alloc_, local_cutout().data_elements()/*local_ptr_->base()*/, local_cutout().num_elements());//local_ptr_->num_elements());
}
// Copy constructor: allocates a new local buffer and copies the other array's local cutout into it.
scattered_array(scattered_array const& other) :
layout_t<T, 2> {other},
alloc_ {other.alloc_},
local_data_ {std::allocator_traits<Alloc>::allocate(alloc_, layout_type::local_count())}
{
scoped_barrier(other.comm());
local_cutout() = other.local_cutout();
/*
auto p1 = fftw_mpi_plan_many_transpose(
std::get<0>(this->extensions()).size(), std::get<1>(this->extensions()).size(), sizeof(T)/sizeof(double),
other.block(), this->block(),
reinterpret_cast<double*>(const_cast<T*>(other.local_cutout().data_elements())),
reinterpret_cast<double*>( this->local_cutout().data_elements() ),
this->comm().get(), FFTW_ESTIMATE
);
auto p2 = fftw_mpi_plan_many_transpose(
std::get<1>(this->extensions()).size(), std::get<0>(this->extensions()).size(), sizeof(T)/sizeof(double),
other.block(), this->block(),
reinterpret_cast<double*>( this->local_cutout().data_elements()),
reinterpret_cast<double*>( this->local_cutout().data_elements()),
this->comm().get(), FFTW_ESTIMATE
);
fftw_execute(p1);
fftw_execute(p2);
fftw_destroy_plan(p2);
fftw_destroy_plan(p1);
*/
}
// Move constructor: takes the other array's layout (replacing it with an empty-extension layout) and its data pointer;
// the asserts check that `other` then reports no extensions and a zero local count.
scattered_array(scattered_array&& other) : // intel calls this function to return from a function
layout_type{std::exchange(static_cast<layout_type&>(other), layout_type(multi::extensions_type_<2>{}, other.comm()))},
alloc_ {std::move(other.alloc_)},
local_data_{other.local_data_}
{
assert(not other.extensions());
assert(other.local_count() == 0 );
}
// Prints the local cutouts rank by rank, with a barrier after each rank's turn.
friend std::ostream& operator<<(std::ostream& os, scattered_array const& self){
for(int r = 0; r != self.comm().size(); ++r){
if(self.comm().rank() == r){
if(auto x = self.local_cutout().extensions())
for(auto i : std::get<0>(x)){
for(auto j : std::get<1>(x))
os<< self.local_cutout()[i][j] <<' ';
os<<std::endl;
}
}
self.comm().barrier();
}
return os;
}
array_ref <T, 2, local_pointer_t> local_cutout() &//{return *local_ptr_;}
{return array_ref <T, 2, local_pointer_t>(local_data_, this->local_extensions());}
array_cref<T, 2, local_pointer_t> local_cutout() const&//{return *local_ptr_;}
{return array_cref<T, 2, local_pointer_t>(local_data_, this->local_extensions());}
local_pointer_t local_data(){return local_data_;}
typename std::pointer_traits<local_pointer_t>::template rebind<T const> local_data() const{return local_data_;}
auto extensions() const{return this->global_extensions();}
// Conversion to a plain multi::array: allocates the global extensions and fills it with an all-gather of the local data.
operator multi::array<T, 2>() const&{
static_assert( std::is_trivially_copy_assignable<T>{}, "!" );
multi::array<T, 2> ret(this->global_extensions(), 1., alloc_);
this->comm().all_gatherv_n(local_data_, local_cutout().num_elements(), ret.data_elements());
return ret;
}
mpi::gathered_array<T, 2> gather() const{
mpi::gathered_array<T, 2> other(this->extensions(), this->comm());
this->comm_.gatherv_n(local_cutout().data_elements(), local_cutout().num_elements(), other.data_elements());
static_assert( std::is_trivially_copy_assignable<T>{} and sizeof(T)%sizeof(double)==0, "!");
/* {
fftw_plan p = fftw_mpi_plan_many_transpose(
std::get<0>(this->extensions()).size(), std::get<1>(this->extensions()).size(), sizeof(T)/sizeof(double),
this->block(), std::get<0>(this->extensions()).size(),
reinterpret_cast<double*>(const_cast<T*>(local_cutout().data_elements())),
reinterpret_cast<double*>(ret.data_elements()),
this->comm().get(), FFTW_ESTIMATE | FFTW_MPI_TRANSPOSED_IN | FFTW_MPI_TRANSPOSED_OUT
);
fftw_execute(p);
fftw_destroy_plan(p);
}*/
auto p1 = fftw_mpi_plan_many_transpose(
std::get<0>(this->extensions()).size(), std::get<1>(this->extensions()).size(),
sizeof(T)/sizeof(double),
FFTW_MPI_DEFAULT_BLOCK, this->size(),
reinterpret_cast<double*>(const_cast<T*>(this->local_cutout().data_elements())),
reinterpret_cast<double*>( other.data_elements() ),
this->comm().get(), FFTW_ESTIMATE
);
auto p2 = fftw_mpi_plan_many_transpose(
std::get<1>(this->extensions()).size(), std::get<0>(this->extensions()).size(),
sizeof(T)/sizeof(double),
other.block(), other.block(),
reinterpret_cast<double*>( other.data_elements()),
reinterpret_cast<double*>( other.data_elements()),
this->comm().get(), FFTW_ESTIMATE
);
fftw_execute(p1);
fftw_execute(p2);
fftw_destroy_plan(p2);
fftw_destroy_plan(p1);
return other;
}
explicit scattered_array(multi::array<T, 2> const& other, bmpi3::communicator comm = mpi3::environment::self(), Alloc alloc = {}) :
scattered_array(other.extensions(), comm, alloc)
{
local_cutout() = other.stenciled(std::get<0>(local_cutout().extensions()), std::get<1>(local_cutout().extensions()));
}
// bool operator==(array<T, 2> const& other) const&{assert(comm()==other.comm());
// return comm()&=(local_cutout() == other.local_cutout());
// }
// bool operator!=(array<T, 2> const& other) const&{return not(*this==other);}
ptrdiff_t num_elements() const&{return multi::layout_t<2>(extensions()).num_elements();}
layout_type layout() const{return *this;}
~scattered_array() noexcept{if(this->local_count()) alloc_.deallocate(local_data_, this->local_count());}
// Copy assignment: only the equal-extensions case is implemented (via a transpose plan); anything else hits assert(0).
scattered_array& operator=(scattered_array const& other)&{
assert(this->comm() == other.comm());
if(this->extensions() == other.extensions()){
fftw_plan p = fftw_mpi_plan_many_transpose(
std::get<0>(this->extensions()).size(), std::get<1>(this->extensions()).size(), sizeof(T)/sizeof(double),
other.block(), this->block(),
reinterpret_cast<double*>(const_cast<T*>(other.local_cutout().data_elements())),
reinterpret_cast<double*>( this->local_cutout().data_elements() ),
this->comm().get(), FFTW_ESTIMATE
);
fftw_execute(p);
fftw_destroy_plan(p);
}else assert(0);
return *this;
}
// Nested disabled region: members written against a class named `array` with `comm_`, `local_ptr_`, `local_count_` and `n0_` members.
#if 0
private:
typename std::allocator_traits<Alloc>::size_type
local_count_2d (multi::extensions_type_<2> ext){return local_2d(ext).first; }
auto local_extension_2d(multi::extensions_type_<2> ext){return local_2d(ext).second;}
public:
Alloc get_allocator() const{return alloc_;}
array(bmpi3::communicator comm = mpi3::environment::self(), Alloc alloc = {}) :
comm_{std::move(comm)},
alloc_{alloc},
local_count_{local_count_2d(multi::extensions_type_<2>{})},
local_ptr_ {alloc_.allocate(local_count_), local_extension_2d(multi::extensions_type_<2>{})},
n0_{multi::layout_t<2>(multi::extensions_type_<2>{}).size()}
{}
// NOTE(review): returns true when the element count is non-zero, which looks inverted for `empty()` - confirm before reviving.
bool empty() const{return extensions().num_elements();}
array_ref <T, 2> local_cutout() &{return *local_ptr_;}
array_cref<T, 2> local_cutout() const&{return *local_ptr_;}
ptrdiff_t local_count() const&{return local_count_;}
auto local_data() const&{return local_cutout().data_elements();}
multi::extensions_type_<2> extensions() const&{return {n0_, std::get<1>(local_cutout().extensions())};}
friend auto extensions(array const& self){return self.extensions();}
array& operator=(multi::array<T, 2> const& other) &{
if(other.extensions() == extensions()) local_cutout() = other.stenciled(std::get<0>(local_cutout().extensions()), std::get<1>(local_cutout().extensions()));
else{
array tmp{other};
std::swap(*this, tmp);
}
return *this;
}
template<class Array, class=std::enable_if_t<not std::is_same<Array, multi::array<T, 2>>{}> >
array& operator=(Array const& other) &{
assert( other.extensions() == this->extensions() );
static_assert( std::is_trivially_assignable<T&, T>{}, "!" );
static_assert( sizeof(T)%sizeof(double)==0, "!" );
auto options = FFTW_ESTIMATE;
if(other.layout_.is_transposed){
options |= FFTW_MPI_TRANSPOSED_IN;
n0_ = std::get<1>(other.extensions()).size();
}
fftw_plan p = fftw_mpi_plan_many_transpose(
std::get<0>(extensions()).size(), std::get<1>(extensions()).size(), sizeof(T)/sizeof(double),
FFTW_MPI_DEFAULT_BLOCK, other.layout_.block,
reinterpret_cast<double*>(const_cast<T*>(other.local_cutout().base())),
reinterpret_cast<double*>(this->local_cutout().data_elements()),
this->comm_.get(), options
);
fftw_execute(p);
fftw_destroy_plan(p);
local_ptr_ = array_ptr<T, 2, local_pointer_t>{this->local_cutout().data_elements(), local_extension_2d(other.extensions())};
return *this;
}
bool operator==(multi::array<T, 2> const& other) const&{
if(other.extensions() != extensions()) return false;
return comm_&=(local_cutout() == other.stenciled(std::get<0>(local_cutout().extensions()), std::get<1>(local_cutout().extensions())));
}
friend bool operator==(multi::array<T, 2> const& other, array const& self){
return self.operator==(other);
}
bool operator==(array<T, 2> const& other) const&{assert(comm_==other.comm_);
return comm_&=(local_cutout() == other.local_cutout());
}
array& operator=(array const& other)&{
// NOTE(review): `other.comm_ == other.comm_` compares a value with itself; presumably `this->comm_ == other.comm_` was meant - confirm.
if(other.extensions() == this->extensions() and other.comm_ == other.comm_)
local_cutout() = other.local_cutout();
else assert(0);
return *this;
}
basic_array<T, typename std::pointer_traits<local_pointer_t>::template rebind<T const>> transposed() const{
return basic_array<T, typename std::pointer_traits<local_pointer_t>::template rebind<T const>>{
layout_t{n0_, true, FFTW_MPI_DEFAULT_BLOCK}, this->local_cutout().layout().transpose(), this->local_cutout().data_elements()
};
}
};
#endif
#endif
#if 0
// Preprocessor-disabled region (never compiled): placeholder `dft` overload for scattered arrays.
// With the inner `#if 0` in place the body performs no transform; it ignores `A` and returns `B` unchanged.
boost::multi::fftw::mpi::scattered_array<std::complex<double>, 2>& dft(
boost::multi::fftw::mpi::scattered_array<std::complex<double>, 2> const& A,
boost::multi::fftw::mpi::scattered_array<std::complex<double>, 2> & B,
fftw::sign /*s*/
){
(void)A;
// assert( A.extensions() == B.extensions() );
// assert( A.comm() == B.comm() );
#if 0
fftw_plan p = fftw_mpi_plan_dft_2d(
std::get<0>(A.extensions()).size(), std::get<1>(A.extensions()).size(),
(fftw_complex *)A.local_cutout().data_elements(), (fftw_complex *)B.local_cutout().data_elements(),
A.comm().get(),
s, FFTW_ESTIMATE
);
fftw_execute(p);
fftw_destroy_plan(p);
#endif
return B;
}
#endif
#if 0
// Preprocessor-disabled region (never compiled): `dft` from an `array` into an `array_transposed`
// through a 2D FFTW-MPI plan created with FFTW_MPI_TRANSPOSED_OUT, plus `dft_forward` convenience wrappers.
array_transposed<std::complex<double>, 2>& dft(
array<std::complex<double>, 2> const& A,
array_transposed<std::complex<double>, 2>& B,
fftw::sign s
){
// http://www.fftw.org/fftw3_doc/MPI-Plan-Creation.html
// assert( A.extensions() == B.extensions() );
assert( A.comm() == B.comm() );
fftw_plan p = fftw_mpi_plan_dft_2d(
std::get<0>(A.extensions()).size(), std::get<1>(A.extensions()).size(),
(fftw_complex *)A.local_cutout().data_elements(), (fftw_complex *)B.local_cutout().data_elements(),
A.comm().get(),
s, FFTW_ESTIMATE | FFTW_MPI_TRANSPOSED_OUT
);
fftw_execute(p);
fftw_destroy_plan(p);
return B;
}
// Forward transform of A written into B.
array<std::complex<double>, 2>& dft_forward(array<std::complex<double>, 2> const& A, array<std::complex<double>, 2>& B){
return dft(A, B, fftw::forward);
}
// Forward transform of A returned in a newly constructed array with A's extensions.
array<std::complex<double>, 2> dft_forward(array<std::complex<double>,2> const& A){
array<std::complex<double>, 2> ret(A.extensions()); dft_forward(A, ret); return ret;
}
}}}}
#endif
#if not __INCLUDE_LEVEL__
#include<boost/mpi3/main_environment.hpp>
#include<boost/mpi3/ostream.hpp>
#include "../../fftw/mpi/environment.hpp"
namespace mpi3 = boost::mpi3;
namespace multi = boost::multi;
namespace fftw = multi::fftw;
namespace mpi = fftw::mpi;
// Self-test driver: builds a 14x19 scattered array of complex numbers, fills each local piece with T(i, j),
// gathers it, scatters the gathered array back and checks that the round trip reproduces the original.
// NOTE(review): several asserts below compare against a braced list (`== {14, 19}`); a braced list is not a valid
// operand of `==`, and its comma also splits the `assert` macro argument. In addition, the `array`/`gathered_array`
// classes visible in this header declare no `extensions()` member and no conversion to `multi::array`.
// This driver presumably does not compile as written - confirm before relying on it.
int mpi3::main(int, char*[], mpi3::environment& menv){
mpi::environment fenv(menv);
auto world = menv.world();
mpi3::ostream os{world};
using T = std::complex<double>;
mpi::scattered_array<T, 2> S({14, 19}, world);
using std::get;
// fill the locally stored part of S; element (i, j) holds the value T(i, j)
if(auto x = extensions(S.local_cutout()))
for(auto i : get<0>(x))
for(auto j : get<1>(x))
S.local_cutout()[i][j] = T(i, j);//std::complex<double>(i + j, i + 2*j + 1)/std::abs(std::complex<double>(i + j, i + 2*j + 1));
mpi::gathered_array<T, 2> G = S.gather();
G.local_cutout();
assert( G.extensions() == {14, 19} );
// expectation encoded by the asserts: rank 0 holds the whole 14x19 cutout, every other rank holds an empty one
if(world.rank() == 0){
assert( G.extensions() == {14, 19} );
assert( G.local_cutout().extensions() == {14, 19} );
}
if(world.rank() != 0){
assert( G.extensions() == {14, 19} );
assert( G.local_cutout().extensions() == {0, 0} );
}
multi::array<T, 2> A = S.gather();
if(world.rank() == 0) assert( A.extensions() == {14, 19} );
if(world.rank() != 0) assert( A.empty() );
world.barrier();
// root prints the gathered data, tab-separated, one row per line
if(world.root()){
std::cout<<"-------------\n";
if(auto x = extensions(G.local_cutout()))
for(auto i : get<0>(x)){
for(auto j : get<1>(x))
std::cout<< G.local_cutout()[i][j] <<'\t';
std::cout<<std::endl;
}
}else assert(G.local_cutout().empty());
// round trip: scattering the gathered array must give back S
mpi::scattered_array<T, 2> S2 = G.scatter();
assert( S2 == S );
// gathering again must reproduce G on root
mpi::gathered_array<T, 2> G2 = S2.gather();
if(world.root()){
std::cout<<"-------------\n";
if(auto x = extensions(G2.local_cutout()))
for(auto i : get<0>(x)){
for(auto j : get<1>(x))
std::cout<< G2.local_cutout()[i][j] <<'\t';
std::cout<<std::endl;
}
assert( G2.local_cutout() == G.local_cutout() );
}else assert(G2.local_cutout().empty());
// assert( S == S2 );
// if(not world.root()) assert( G.local_cutout().empty() );
// mpi::gathered_array<double, 2> G({8, 15}, world);
/*
auto const A = [&]{
os<<"global sizes"<< std::get<0>(A.extensions()) <<'x'<< std::get<1>(A.extensions()) <<' '<< A.num_elements() <<std::endl;
os<< A.local_cutout().extension() <<'x'<< std::get<1>(A.local_cutout().extensions()) <<"\t#="<< A.local_cutout().num_elements() <<" allocated "<< A.local_count() <<std::endl;
if(auto x = A.local_cutout().extensions())
for(auto i : std::get<0>(x))
for(auto j : std::get<1>(x))
A.local_cutout()[i][j] = i + j;//std::complex<double>(i + j, i + 2*j + 1)/std::abs(std::complex<double>(i + j, i + 2*j + 1));
return A;
}();
*/
/*
multi::fftw::mpi::scattered_array<std::complex<double>, 2> B(A.extensions(), world);
multi::array<std::complex<double>, 2> A2 = A;
assert( A2 == A );
using multi::fftw::dft_forward;
*/
// disabled: comparison of a distributed DFT against a replicated one, and a transposed-output DFT
#if 0
dft_forward(A , B );
dft_forward(A2, A2);
{
auto x = B.local_cutout().extensions();
for(auto i : std::get<0>(x))
for(auto j : std::get<1>(x))
if(not( std::abs(B.local_cutout()[i][j] - A2[i][j]) < 1e-12 )){
std::cout<< B.local_cutout()[i][j] - A2[i][j] <<' '<< std::abs(B.local_cutout()[i][j] - A2[i][j]) <<'\n';
}
}
multi::fftw::mpi::array_transposed<std::complex<double>, 2> AT(A.extensions(), world);
os<< "global sizes" << std::get<0>(AT.extensions()) <<'x'<< std::get<1>(AT.extensions()) <<' '<< AT.num_elements() <<std::endl;
os<< AT.local_cutout().extension() <<'x'<< std::get<1>(AT.local_cutout().extensions()) <<"\t#="<< AT.local_cutout().num_elements() <<" allocated "<< AT.local_count() <<std::endl;
dft(A, AT, multi::fftw::forward);
if(world.rank() == 0){
if(auto x = B.local_cutout().extensions()){
for(auto i : std::get<0>(x)){
for(auto j : std::get<1>(x))
std::cout<< B.local_cutout()[i][j] <<' ';
std::cout<<'\n';
}
}
if(auto x = AT.local_cutout().extensions()){
for(auto i : std::get<0>(x)){
for(auto j : std::get<1>(x))
std::cout<< AT.local_cutout()[i][j] <<' ';
std::cout<<'\n';
}
}
}
#endif
return 0;
}
#endif
#endif

View File

@ -0,0 +1,40 @@
#if COMPILATION_INSTRUCTIONS
mpic++ -I$HOME/prj/alf $0 -o $0x -lfftw3 -lfftw3_mpi&&time mpirun -n 4 $0x&&rm $0x;exit
#endif
#include "../../../fftw/mpi.hpp"
#include<boost/mpi3/main.hpp>
#include<boost/mpi3/environment.hpp>
#include<boost/mpi3/ostream.hpp>
#include "../../../fftw.hpp"
namespace mpi3 = boost::mpi3;
namespace multi = boost::multi;
// Test driver: fills a distributed 41x321 complex array, copies it into a plain multi::array,
// runs a forward DFT on both, and prints the magnitude of every local element whose two results
// are not within 1e-8 of each other.
int mpi3::main(int, char*[], mpi3::communicator world){
	multi::fftw::mpi::environment fenv;

	multi::fftw::mpi::array<std::complex<double>, 2> G({41, 321}, world);

	// each process fills only the elements it stores locally
	if(auto exts = G.local_cutout().extensions()){
		for(auto row : std::get<0>(exts)){
			for(auto col : std::get<1>(exts)){
				G.local_cutout()[row][col] = std::complex<double>(row + col, row + 2*col);
			}
		}
	}

	multi::array<std::complex<double>, 2> L = G; // world replicas
	assert( L == G );

	using multi::fftw::dft_forward;
	dft_forward(L, L); // dft in replicas
	dft_forward(G, G);

	// report every local element where the distributed and the replicated result disagree
	if(auto exts = G.local_cutout().extensions()){
		for(auto row : std::get<0>(exts)){
			for(auto col : std::get<1>(exts)){
				if(not(std::abs(G.local_cutout()[row][col] - L[row][col]) < 1e-8)){
					std::cout<< std::abs(G.local_cutout()[row][col] - L[row][col]) << std::endl;
				}
			}
		}
	}
	return 0;
}

View File

@ -0,0 +1,88 @@
# -*-indent-tabs-mode:nil;c-basic-offset:2;tab-width:4;autowrap:nil;-*-
#[=[Multi Test suite can be run like this:
  mkdir -p build
  cd build
  cmake .. [-DENABLE_CUDA=1]
  make -j
  ctest -j --output-on-failure [-T memcheck]
  exit
#]=]
cmake_minimum_required(VERSION 3.11)

# Make the bundled find modules (e.g. FindFFTW.cmake) visible to find_package().
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake")
set(CMAKE_VERBOSE_MAKEFILE ON)

project(boost-multi-adaptors-fftw-test VERSION 0.1 LANGUAGES CXX)

find_package(Boost REQUIRED COMPONENTS unit_test_framework timer)

find_package(FFTW REQUIRED COMPONENTS DOUBLE_LIB)
include_directories(${FFTW_INCLUDE_DIRS})
link_libraries(${FFTW_LIBRARIES})

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)

# CUDA builds are requested either with -DENABLE_CUDA=1 or by defining CXXCUDA.
if(ENABLE_CUDA OR DEFINED CXXCUDA)
  enable_language(CUDA)
  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr -Xcudafe \"--diag_suppress=implicit_return_from_non_void_function\"")
endif()

# The CUDA headers are added to the include path whenever a toolkit is installed.
find_package(CUDA QUIET)
if(CUDA_FOUND)
  message("CUDA found")
  include_directories(${CUDA_INCLUDE_DIRS})
else()
  message("CUDA not found")
endif()

enable_testing()
list(APPEND CMAKE_CTEST_ARGUMENTS "--output-on-failure") # needs cmake 3.17
include(CTest)

#configure_file("config.hpp.in" ${CMAKE_BINARY_DIR}/config.hpp)
include_directories(${CMAKE_BINARY_DIR})

#file(GLOB TEST_SRCS RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} *.cpp)
set(TEST_SRCS
  combinations.cpp
# copy.cpp
  core.cpp
# thrust.cpp
  transpose.cpp
  transpose_square.cpp
)

# One executable and one CTest test per source file.
foreach(TEST_FILE ${TEST_SRCS})
  set(TEST_EXE "${TEST_FILE}.x")
  add_executable(${TEST_EXE} ${TEST_FILE})

  if(ENABLE_CUDA OR DEFINED CXXCUDA)
    set_source_files_properties(${TEST_FILE} PROPERTIES LANGUAGE CUDA)
    target_compile_options(${TEST_EXE} PRIVATE -std=c++17)
  endif()

# target_compile_features   (${TEST_EXE} PUBLIC cxx_std_17)
  target_compile_definitions(${TEST_EXE} PRIVATE "BOOST_PP_VARIADICS")
  target_compile_definitions(${TEST_EXE} PRIVATE ${Boost_DEFINITIONS})
  target_include_directories(${TEST_EXE} PRIVATE ${Boost_INCLUDE_DIRS})
  target_link_libraries     (${TEST_EXE} PRIVATE ${Boost_LIBRARIES})
  target_link_directories   (${TEST_EXE} PRIVATE ${Boost_LIBRARY_DIRS})

  # Host-compiler warning flags. The guard is the exact negation of the CUDA condition used above:
  # guarding on ENABLE_CUDA alone would also hand these flags to the CUDA compiler when only
  # CXXCUDA is defined (the sources are compiled as CUDA in that case too).
  if(NOT (ENABLE_CUDA OR DEFINED CXXCUDA))
    target_compile_options(${TEST_EXE} PRIVATE
      -Werror -Wall -Wextra -fno-common
      $<$<CXX_COMPILER_ID:GNU>:
        -Wpedantic -Wformat-truncation -fstack-usage>#-Wconversion
      $<$<OR:$<CXX_COMPILER_ID:Clang>,$<CXX_COMPILER_ID:AppleClang>>:
        -Wpedantic -Wmove>
      $<$<CXX_COMPILER_ID:Intel>:
        -wd161 -diag-disable=remark -Warray-bounds -Wchar-subscripts -Wcomment -Wenum-compare -Wformat -Wuninitialized -Wmaybe-uninitialized -Wmain -Wnarrowing -Wnonnull -Wparentheses -Wpointer-sign -Wreorder -Wno-return-type -Wsign-compare -Wsequence-point -Wtrigraphs -Wunused-function -Wunused-but-set-variable -Wunused-variable -Wwrite-strings -Werror -diag-error:3846
      >
      $<$<CXX_COMPILER_ID:MSVC>:
        /W4>)
  endif()

  # Naming the target (instead of "./<exe>") lets CMake substitute the real location of the
  # executable, which also works when the binary is not placed directly in the test's working directory.
  add_test(NAME ${TEST_EXE} COMMAND ${TEST_EXE})
endforeach()

View File

@ -0,0 +1,68 @@
# ==================================================================================================
# This file is part of the CodeVault project. The project is licensed under Apache Version 2.0.
# CodeVault is part of the EU-project PRACE-4IP (WP7.3.C).
#
# Author(s):
# Cedric Nugteren <cedric.nugteren@surfsara.nl>
#
# ==================================================================================================
#
# Defines the following variables:
# FFTW_FOUND Boolean holding whether or not the FFTW3 library was found
# FFTW_INCLUDE_DIRS The FFTW3 include directory
# FFTW_LIBRARIES The FFTW3 library
#
# In case FFTW3 is not installed in the default directory, set the FFTW_ROOT variable to point to
# the root of FFTW3, such that 'fftw3.h' can be found in $FFTW_ROOT/include. This can either be done
# using an environmental variable (e.g. export FFTW_ROOT=/path/to/fftw3) or using a CMake variable
# (e.g. cmake -DFFTW_ROOT=/path/to/fftw3 ..).
#
# ==================================================================================================
# Search locations: FFTW_ROOT (as a CMake variable, then as an environment variable) is passed
# as HINTS; the conventional system prefixes are passed as PATHS.
set(FFTW_HINTS ${FFTW_ROOT} $ENV{FFTW_ROOT})
set(FFTW_PATHS /usr /usr/local)

# Header lookup: the directory that contains fftw3.h
find_path(FFTW_INCLUDE_DIRS
  NAMES         fftw3.h
  HINTS         ${FFTW_HINTS}
  PATHS         ${FFTW_PATHS}
  PATH_SUFFIXES include api inc include/x86_64 include/x64
  DOC           "FFTW3 include header fftw3.h"
)
mark_as_advanced(FFTW_INCLUDE_DIRS)

# Library lookup: the fftw3 library itself
find_library(FFTW_LIBRARIES
  NAMES         fftw3
  HINTS         ${FFTW_HINTS}
  PATHS         ${FFTW_PATHS}
  PATH_SUFFIXES lib lib64 lib/x86_64 lib/x64 lib/x86 lib/Win32
  DOC           "FFTW3 library"
)
mark_as_advanced(FFTW_LIBRARIES)

# ==================================================================================================
# Tell the user which of the two pieces is missing
if(NOT FFTW_INCLUDE_DIRS)
  message(STATUS "Could NOT find 'fftw3.h', install FFTW3 or set FFTW_ROOT")
endif()
if(NOT FFTW_LIBRARIES)
  message(STATUS "Could NOT find the FFTW3 library, install it or set FFTW_ROOT")
endif()

# Sets FFTW_FOUND once both the include directory and the library have been located
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(FFTW DEFAULT_MSG FFTW_INCLUDE_DIRS FFTW_LIBRARIES)
# ==================================================================================================

View File

@ -0,0 +1,142 @@
#ifdef COMPILATION// -*-indent-tabs-mode:t;c-basic-offset:4;tab-width:4-*-
$CXX $0 -o $0x -lfftw3 -lboost_unit_test_framework -lboost_timer&&$0x&&rm $0x;exit
#endif
// © Alfredo A. Correa 2020
#define BOOST_TEST_MODULE "C++ Unit Tests for Multi FFTW adaptor (cpu) with thrust complex"
#define BOOST_TEST_DYN_LINK
#include<boost/test/unit_test.hpp>
#include <boost/timer/timer.hpp>
#include "../../fftw.hpp"
#include<complex>
#include<chrono>
#include<thrust/complex.h>
namespace multi = boost::multi;
namespace utf = boost::unit_test::framework;
// Scope timer: remembers the clock reading taken at construction and, when destroyed, writes
// "<label>: <elapsed seconds> sec" followed by a newline to std::cerr.
struct watch : private std::chrono::high_resolution_clock{
	std::string label_;
	time_point  start_;

	watch(std::string label ="") : label_{label}, start_{now()}{}

	~watch(){
		auto const elapsed = std::chrono::duration<double>(now() - start_);
		std::cerr<< label_<<": "<< elapsed.count() <<" sec"<<std::endl;
	}
};
// Runs the forward DFT over several per-dimension on/off selections of a random 4D array and
// times the different call styles (out-of-place, in-place, pre-planned, allocating, moving).
// One element of the allocating and of the moving result is compared with the out-of-place result.
BOOST_AUTO_TEST_CASE(fft_combinations, *boost::unit_test::tolerance(0.00001)){

	using complex = std::complex<double>;

	// random input, generated once
	auto const in = []{
		multi::array<complex, 4> arr({32, 90, 98, 96});
		auto const first = arr.data_elements();
		auto const last  = arr.data_elements() + arr.num_elements();
		std::generate(first, last,
			[](){return complex{std::rand()*1./RAND_MAX, std::rand()*1./RAND_MAX};}
		);
		return arr;
	}();

	std::cout<<"memory size "<< in.num_elements()*sizeof(complex)/1e6 <<" MB\n";

	// each entry says, per dimension, whether that dimension is transformed
	std::vector<std::array<bool, 4>> cases = {
		{false, true , true , true },
		{false, true , true , false},
		{true , false, false, false},
		{true , true , false, false},
		{false, false, true , false},
		{false, false, false, false},
	};

	using std::cout;
	for(auto which : cases){
		cout<<"case "; copy(begin(which), end(which), std::ostream_iterator<bool>{cout,", "}); cout<<"\n";

		multi::array<complex, 4> out = in;
		{	// out-of-place
			boost::timer::auto_cpu_timer timer{"cpu_oplac %ws wall, CPU (%p%)\n"};
			multi::fftw::dft_forward(which, in, out);
		}
		{	// out-of-place, plan built before the timer starts
			multi::fftw::plan plan(which, in, out, multi::fftw::forward);
			boost::timer::auto_cpu_timer timer{"cpu_oplac planned %ws wall, CPU (%p%)\n"};
			plan();
		}
		{	// in-place on a writable copy
			auto inout = in;
			boost::timer::auto_cpu_timer timer{"cpu_iplac %ws wall, CPU (%p%)\n"};
			multi::fftw::dft_forward(which, inout);
		//	BOOST_TEST( abs( inout[5][4][3][1] - out[5][4][3][1] ) == 0. );
		}
		{	// in-place, plan built before the timer starts
			auto inout = in;
			multi::fftw::plan plan(which, inout, inout, multi::fftw::forward);
			boost::timer::auto_cpu_timer timer{"cpu_iplac planned %ws wall, CPU (%p%)\n"};
			plan();
		//	BOOST_TEST( abs( inout[5][4][3][1] - out[5][4][3][1] ) == 0. );
		}
		{	// same as the previous case, reported under the "measured" label
			auto inout = in;
			multi::fftw::plan plan(which, inout, inout, multi::fftw::forward);// | FFTW_MEASURE);
			boost::timer::auto_cpu_timer timer{"cpu_iplac planned measured %ws wall, CPU (%p%)\n"};
			plan();
		//	BOOST_TEST( abs( inout[5][4][3][1] - out[5][4][3][1] ) == 0. );
		}
		{	// result returned by value
			boost::timer::auto_cpu_timer timer{"cpu_alloc %ws wall, CPU (%p%)\n"};
			auto result = multi::fftw::dft_forward(which, in);
			BOOST_TEST( abs( result[5][4][3][1] - out[5][4][3][1] ) == 0. );
		}
		{	// input handed over as an rvalue; it must be empty afterwards
			auto inout = in;
			boost::timer::auto_cpu_timer timer{"cpu_move %ws wall, CPU (%p%)\n"};
			auto result = multi::fftw::dft_forward(which, std::move(inout));
			BOOST_REQUIRE( inout.empty() );
			BOOST_TEST( abs( result[5][4][3][1] - out[5][4][3][1] ) == 0. );
		}
	}
}
// Benchmark (disabled by default): times repeated forward DFTs over the last three dimensions
// of a 64x128x128x128 complex array, in-place, out-of-place, and out-of-place with allocation.
// Every timed section is a scope holding a `watch`, which prints its label and elapsed time on exit.
BOOST_AUTO_TEST_CASE(fftw_4D_power_benchmark, *boost::unit_test::disabled() ){
	using complex = std::complex<double>;
	namespace fftw = multi::fftw;

	auto x = multi::array<complex, 4>::extensions_type({64, 128, 128, 128});
	multi::array<complex, 4> in(x);
	std::iota(in.data_elements(), in.data_elements() + in.num_elements(), 1.2);

	BOOST_REQUIRE( in[0][0][0][0] == 1.2 );

	std::array<bool, 4> c = {false, true, true, true};

	// in-place, twice
	{
		watch w{utf::current_test_case().full_name()+" inplace FTTT"};
		fftw::dft(c, in, fftw::forward);
	}
	{
		watch w{utf::current_test_case().full_name()+" inplace FTTT"};
		fftw::dft(c, in, fftw::forward);
	}

	auto in0000 = in[0][0][0][0];
	BOOST_REQUIRE( in0000 != 1.2 );

	// out-of-place into a preallocated array, three times
	multi::array<complex, 4> out(x);
	{
		watch w{utf::current_test_case().full_name()+" outofplace FTTT"};
		fftw::dft(c, in, out, fftw::forward);
	}
	{
		watch w{utf::current_test_case().full_name()+" outofplace FTTT"};
		fftw::dft(c, in, out, fftw::forward);
	}
	{
		watch w{utf::current_test_case().full_name()+" outofplace FTTT"};
		fftw::dft(c, in, out, fftw::forward);
	}

	// out-of-place including allocation and release of the output, twice
	{
		watch w{utf::current_test_case().full_name()+" outofplace+alloc FTTT"};
		multi::array<complex, 4> out2(x);
		fftw::dft(c, in, out2, fftw::forward);
	}
	{
		watch w{utf::current_test_case().full_name()+" outofplace+alloc FTTT"};
		multi::array<complex, 4> out2(x);
		fftw::dft(c, in, out2, fftw::forward);
	}

	// the out-of-place runs must not have modified the input
	BOOST_REQUIRE( in0000 == in[0][0][0][0] );
}

View File

@ -0,0 +1,153 @@
#ifdef COMPILATION// -*-indent-tabs-mode:t;c-basic-offset:4;tab-width:4-*-
$CXXX $CXXFLAGS -O3 $0 -o $0x -DHAVE_FFTW3_THREADS -lfftw3 -lfftw3_threads -lboost_unit_test_framework -lboost_timer&&$0x&&rm $0x;exit
#endif
// © Alfredo A. Correa 2020
#define BOOST_TEST_MODULE "C++ Unit Tests for Multi FFTW copy"
#define BOOST_TEST_DYN_LINK
#include<boost/test/unit_test.hpp>
#include<boost/timer/timer.hpp>
#include "../../fftw.hpp"
#include<complex>
namespace multi = boost::multi;
// Times and checks several ways of copying a 4D complex array into an index-rotated layout:
// multi::fftw::copy, plain assignment, in-place variants and move-based variants.
// Every scenario asserts the same relation: out[1][2][3][4] == in[2][3][4][1].
BOOST_AUTO_TEST_CASE(fftw_copy){
using complex = std::complex<double>;
// reference input: 96^4 elements filled with pseudo-random values from std::rand
auto const in = []{
multi::array<complex, 4> ret({96, 96, 96, 96});
std::generate(ret.data_elements(), ret.data_elements() + ret.num_elements(),
[](){return complex{std::rand()*1./RAND_MAX, std::rand()*1./RAND_MAX};}
);
return ret;
}();
std::cout<<"memory size "<< in.num_elements()*sizeof(complex)/1e6 <<" MB\n";
// 1) fftw copy, out-of-place, writing through a rotated view of the destination
{
multi::array<complex, 4> out(extensions(in), 0.);
{
// the timer reports when it leaves this inner scope, so only the copy is measured
boost::timer::auto_cpu_timer t{"fftw_copy in-inorder %ws wall, CPU (%p%)\n"};
multi::fftw::copy(in, rotated(out));
}
BOOST_REQUIRE( out[1][2][3][4] == in[2][3][4][1] );
BOOST_REQUIRE( rotated(out) == in );
}
// 2) fftw copy, out-of-place, reading through an unrotated view of the source
{
multi::array<complex, 4> out(extensions(in), 0.);
{
boost::timer::auto_cpu_timer t{"fftw_copy out-inorder %ws wall, CPU (%p%)\n"};
multi::fftw::copy(unrotated(in), out);
}
BOOST_REQUIRE( out[1][2][3][4] == in[2][3][4][1] );
BOOST_REQUIRE( rotated(out) == in );
BOOST_REQUIRE( out == unrotated(in) );
}
// 3) plain assignment into a rotated view of the destination
{
multi::array<complex, 4> out(extensions(in), 0.);
{
boost::timer::auto_cpu_timer t{"assignment in-inorder %ws wall, CPU (%p%)\n"};
rotated(out) = in;
}
BOOST_REQUIRE( out[1][2][3][4] == in[2][3][4][1] );
}
// 4) plain assignment from an unrotated view of the source
{
multi::array<complex, 4> out(extensions(in), 0.);
{
boost::timer::auto_cpu_timer t{"assignment out-inorder %ws wall, CPU (%p%)\n"};
out = unrotated(in);
}
BOOST_REQUIRE( out[1][2][3][4] == in[2][3][4][1] );
}
// 5) in-place assignment: the source is a view of the destination itself
{
multi::array<complex, 4> out = in;
{
boost::timer::auto_cpu_timer t{"assignment inplace out-inorder %ws wall, CPU (%p%)\n"};
out = unrotated(out);
}
BOOST_REQUIRE( out[1][2][3][4] == in[2][3][4][1] );
BOOST_REQUIRE( out == unrotated(in) );
}
// 6) in-place assignment through a rotated view of the destination;
//    the whole-array check below is disabled in the original
{
multi::array<complex, 4> out = in;
{
boost::timer::auto_cpu_timer t{"assignment inplace in-inorder %ws wall, CPU (%p%)\n"};
rotated(out) = out;
}
BOOST_REQUIRE( out[1][2][3][4] == in[2][3][4][1] );
// BOOST_REQUIRE( rotated(out) == in );
}
// 7) in-place assignment that avoids aliasing by going through an explicit temporary copy
{
multi::array<complex, 4> out = in;
{
boost::timer::auto_cpu_timer t{"assignment inplace with copy out-inorder %ws wall, CPU (%p%)\n"};
out = unrotated(multi::array<complex, 4>{out});
}
BOOST_REQUIRE( out[1][2][3][4] == in[2][3][4][1] );
BOOST_REQUIRE( out == unrotated(in) );
}
// 8) same as 7, but assigning into the rotated view of the destination
{
multi::array<complex, 4> out = in;
{
boost::timer::auto_cpu_timer t{"assignment inplace with copy in-inorder %ws wall, CPU (%p%)\n"};
rotated(out) = multi::array<complex, 4>{out};
}
BOOST_REQUIRE( out[1][2][3][4] == in[2][3][4][1] );
BOOST_REQUIRE( out == unrotated(in) );
}
// 9) fftw copy where source and destination are the same array (destination rotated)
{
multi::array<complex, 4> out = in;
{
boost::timer::auto_cpu_timer t{"fftw copy inplace in-inorder %ws wall, CPU (%p%)\n"};
multi::fftw::copy(out, rotated(out));
}
BOOST_REQUIRE( out[1][2][3][4] == in[2][3][4][1] );
BOOST_REQUIRE( out == unrotated(in) );
}
// 10) fftw copy where source and destination are the same array (source unrotated)
{
multi::array<complex, 4> out = in;
{
boost::timer::auto_cpu_timer t{"fftw copy inplace out-inorder %ws wall, CPU (%p%)\n"};
multi::fftw::copy(unrotated(out), out);
}
BOOST_REQUIRE( out[1][2][3][4] == in[2][3][4][1] );
BOOST_REQUIRE( out == unrotated(in) );
}
// 11) move-construct from a moved, unrotated source: the checks require that the source
//     is left empty and that the result keeps the original data pointer
{
multi::array<complex, 4> out = in;
auto p = out.data_elements();
{
boost::timer::auto_cpu_timer t{"fftw move construct inplace in-inorder %ws wall, CPU (%p%)\n"};
multi::array<complex, 4> out2 = multi::fftw::copy( out.move().unrotated() );
BOOST_REQUIRE( out.empty() );
BOOST_REQUIRE( p == out2.data_elements() );
BOOST_TEST( out2[1][2][3][4].real() == in[2][3][4][1].real() );
}
}
// 12) same as 11, but move-assigning into an already declared (empty) array
{
multi::array<complex, 4> out = in;
auto p = out.data_elements();
multi::array<complex, 4> out2;
{
boost::timer::auto_cpu_timer t{"fftw move assign inplace in-inorder %ws wall, CPU (%p%)\n"};
out2 = multi::fftw::copy( out.move().unrotated() );
BOOST_REQUIRE( out.empty() );
BOOST_REQUIRE( p == out2.data_elements() );
BOOST_TEST( out2[1][2][3][4].real() == in[2][3][4][1].real() );
}
}
// 13) move self-assignment: the array must keep its data pointer and hold the rotated data
{
multi::array<complex, 4> out = in;
auto p = out.data_elements();
{
boost::timer::auto_cpu_timer t{"fftw move self-assign inplace in-inorder %ws wall, CPU (%p%)\n"};
out = multi::fftw::copy( out.move().unrotated() );
BOOST_REQUIRE( p == out.data_elements() );
BOOST_TEST( out[1][2][3][4].real() == in[2][3][4][1].real() );
}
}
}

View File

@ -0,0 +1,374 @@
// -*-indent-tabs-mode:t;c-basic-offset:4;tab-width:4-*-
// © Alfredo A. Correa 2020-2021
#define BOOST_TEST_MODULE "C++ Unit Tests for Multi FFTW transpose"
#define BOOST_TEST_DYN_LINK
#include<boost/test/unit_test.hpp>
#include "../../../array.hpp"
#include "../../../adaptors/../complex.hpp"
#include "../../../adaptors/fftw.hpp"
#include<chrono>
#include<random>
//#include<thrust/complex.h> // TODO make lib work with thrust complex
namespace{
	// short aliases shared by the test cases of this file
	namespace multi = boost::multi;
	namespace fftw = multi::fftw;

	using complex = std::complex<double>;
	MAYBE_UNUSED complex const I{0, 1}; // imaginary unit

	// power of a single element: whatever std::norm yields for it
	template<class M>
	auto power(M const& m)
	->decltype(std::norm(m)){
		return std::norm(m);
	}

	// power of a range: sum of the powers of its elements (applied recursively).
	// NOTE(review): DELETE(...) is a project macro; presumably it disables this overload
	// when the condition holds (rank < 1) - confirm against its definition.
	template<class M, DELETE((M::rank_v < 1))>
	double power(M const& m){
		auto const add_power = [](auto const& acc, auto const& elem){return acc + power(elem);};
		return accumulate(begin(m), end(m), 0., add_power);
	}

	// function-object form of the accumulation step used above
	struct sum_power{
		template<class A, class B>
		auto operator()(A const& a, B const& b) const{
			return a+power(b);
		}
	};

	MAYBE_UNUSED constexpr int N = 16;
}
// Scope timer: remembers the construction time and, on destruction,
// writes "<label>: <elapsed> sec" to std::cerr.
struct watch : private std::chrono::high_resolution_clock{
	std::string label_;
	time_point start_;

	watch(std::string label ="") : label_{label}, start_{now()}{}

	~watch(){
		std::cerr<< label_<<": ";
		// elapsed time is taken after the label has been streamed, then converted to seconds
		std::chrono::duration<double> const elapsed = now() - start_;
		std::cerr<< elapsed.count() <<" sec"<<std::endl;
	}
};
// Fills a scalar, or every element of a (possibly nested) range, with normally distributed values.
template<class T>
struct randomizer{
	// range overload: visit each element; nested ranges recurse through this same overload
	template<class M>
	void operator()(M&& m) const{
		for(auto&& element : m){
			operator()(element);
		}
	}
	// scalar overload: one draw from a function-local generator seeded once from random_device
	void operator()(T& e) const{
		static std::random_device device;
		static std::mt19937 engine{device()};
		static std::normal_distribution<T> gauss;
		e = gauss(engine);
	}
};

// Complex specialization: real and imaginary parts are drawn independently.
template<class T>
struct randomizer<std::complex<T>>{
	template<class M>
	void operator()(M&& m) const{
		for(auto&& element : m){
			operator()(element);
		}
	}
	void operator()(std::complex<T>& e) const{
		static std::random_device device;
		static std::mt19937 engine{device()};
		static std::normal_distribution<T> gauss;
		e = std::complex<T>(gauss(engine), gauss(engine));
	}
};
// Global Boost.Test fixture deriving from fftw::environment, so one environment
// object lives for the whole test run.
struct fftw_fixture : fftw::environment{
	// no extra preparation is done here
	void setup(){
	}
	// intentionally empty: the explicit fftw_cleanup() call is left disabled
	void teardown(){
	}
};
BOOST_TEST_GLOBAL_FIXTURE( fftw_fixture );
// An out-of-place forward 3D transform must leave its input untouched.
BOOST_AUTO_TEST_CASE(fftw_3D){
	using complex = std::complex<double>; //TODO make it work with thrust

	multi::array<complex, 3> input({10, 10, 10});
	input[2][3][4] = 99.;

	// the result is not inspected; only the effect on the input is checked
	auto transformed = multi::fftw::dft(input, fftw::forward);

	BOOST_REQUIRE(input[2][3][4] == 99.);
}
// 1D transforms of a const array: checks the forward result, that the input is
// preserved, and the backward result.
BOOST_AUTO_TEST_CASE(fftw_1D_const){
	multi::array<complex, 1> const in = {1. + 2.*I, 2. + 3. *I, 4. + 5.*I, 5. + 6.*I};

	auto fwd = multi::fftw::dft(in, fftw::forward); // Fourier[in, FourierParameters -> {1, -1}]
	BOOST_REQUIRE( size(fwd) == size(in) );
	BOOST_REQUIRE( fwd[2] == -2. - 2.*I );
	BOOST_REQUIRE( in[1] == +2. + 3.*I ); // out-of-place transform must not modify its input

	// The backward transform was previously requested with fftw::forward (copy-paste),
	// so the backward direction was never exercised.
	// For 4 points, output index 2 weights the inputs by (-1)^k in both directions,
	// so the expected (unnormalized) value is the same as for the forward transform.
	auto bwd = multi::fftw::dft(in, fftw::backward); // InverseFourier[in, FourierParameters -> {-1, -1}]
	BOOST_REQUIRE( bwd[2] == -2. - 2.*I );
}
// Requesting a transform along no dimension ({false, false}) must behave as a plain copy.
BOOST_AUTO_TEST_CASE(fftw_2D_identity_2, *boost::unit_test::tolerance(0.0001)){
	multi::array<complex, 2> const original = {
		{ 1. + 2.*I, 9. - 1.*I, 2. + 4.*I},
		{ 3. + 3.*I, 7. - 4.*I, 1. + 9.*I},
		{ 4. + 1.*I, 5. + 3.*I, 2. + 4.*I},
		{ 3. - 1.*I, 8. + 7.*I, 2. + 1.*I},
		{ 31. - 1.*I, 18. + 7.*I, 2. + 10.*I}
	};
	multi::array<complex, 2> result(extensions(original));

	multi::fftw::dft({false, false}, original, result, fftw::forward); // result = original;

	BOOST_REQUIRE( power(original) == power(result) );
	BOOST_REQUIRE( result == original );
}
// An empty dimension selector ({}) must return a copy of the input.
BOOST_AUTO_TEST_CASE(fftw_2D_identity, *boost::unit_test::tolerance(0.0001)){
	multi::array<complex, 2> const original = {
		{ 1. + 2.*I, 9. - 1.*I, 2. + 4.*I},
		{ 3. + 3.*I, 7. - 4.*I, 1. + 9.*I},
		{ 4. + 1.*I, 5. + 3.*I, 2. + 4.*I},
		{ 3. - 1.*I, 8. + 7.*I, 2. + 1.*I},
		{ 31. - 1.*I, 18. + 7.*I, 2. + 10.*I}
	};

	auto result = multi::fftw::dft({}, original, fftw::forward);

	BOOST_REQUIRE( result == original );
}
// Full 2D forward transform checked against a reference value, plus a consistency check:
// transforming row 0 of the matrix must equal transforming a standalone copy of that row.
BOOST_AUTO_TEST_CASE(fftw_2D, *boost::unit_test::tolerance(0.0001)){
	multi::array<complex, 2> const in = {
		{ 1. + 2.*I, 9. - 1.*I, 2. + 4.*I},
		{ 3. + 3.*I, 7. - 4.*I, 1. + 9.*I},
		{ 4. + 1.*I, 5. + 3.*I, 2. + 4.*I},
		{ 3. - 1.*I, 8. + 7.*I, 2. + 1.*I},
		{ 31. - 1.*I, 18. + 7.*I, 2. + 10.*I}
	};
	namespace fftw = multi::fftw;

	auto fwd = fftw::dft_forward(in);
	BOOST_TEST_REQUIRE( fwd[3][1].real() == -19.0455 ); // Fourier[in, FourierParameters -> {1, -1}][[4]][[2]]
	BOOST_TEST_REQUIRE( fwd[3][1].imag() == - 2.22717 );

	// same data as in[0], stored as an independent 1D array
	multi::array<complex, 1> const in0 = {1. + 2.*I, 9. - 1.*I, 2. + 4.*I};

	// The two transforms used to be computed into unused locals and then recomputed
	// inside the assertion; compute each once and compare the results.
	auto from_copy = fftw::dft_forward(in0);
	auto from_row  = fftw::dft_forward(in[0]);
	BOOST_REQUIRE( from_row == from_copy );
}
// Transforming a rotated view must agree with rotating the transform of the original.
BOOST_AUTO_TEST_CASE(fftw_2D_rotated, *boost::unit_test::tolerance(0.0001)){
	using multi::array;
	using multi::fftw::dft_forward;

	array<complex, 2> const matrix = {
		{ 1. + 2.*I, 9. - 1.*I, 2. + 4.*I},
		{ 3. + 3.*I, 7. - 4.*I, 1. + 9.*I},
		{ 4. + 1.*I, 5. + 3.*I, 2. + 4.*I},
		{ 3. - 1.*I, 8. + 7.*I, 2. + 1.*I},
		{ 31. - 1.*I, 18. + 7.*I, 2. + 10.*I}
	};
	auto transformed = dft_forward(matrix);

	// first element of the rotated view versus the first column written out explicitly
	BOOST_REQUIRE(
		dft_forward(rotated(matrix)[0])
		== dft_forward(array<complex, 1>{1.+2.*I, 3.+3.*I, 4. + 1.*I, 3. - 1.*I, 31. - 1.*I})
	);

	BOOST_REQUIRE( dft_forward(rotated(matrix)) == rotated(transformed) );
}
// Partial ("many") transforms: only selected dimensions are transformed.
// The destination is reused, so each check applies to the call immediately before it.
BOOST_AUTO_TEST_CASE(fftw_2D_many, *boost::unit_test::tolerance(0.0001)){
	using multi::fftw::dft_forward;

	multi::array<complex, 2> const source = {
		{ 1. + 2.*I, 9. - 1.*I, 2. + 4.*I},
		{ 3. + 3.*I, 7. - 4.*I, 1. + 9.*I},
		{ 4. + 1.*I, 5. + 3.*I, 2. + 4.*I},
		{ 3. - 1.*I, 8. + 7.*I, 2. + 1.*I},
		{ 31. - 1.*I, 18. + 7.*I, 2. + 10.*I}
	};
	multi::array<complex, 2> target(extensions(source));

	// transform along the second dimension only: each row is an independent 1D transform
	multi::fftw::dft({fftw::none, fftw::forward}, source, target);
	BOOST_REQUIRE( dft_forward(source[0]) == target[0] );

	// same selection applied to the rotated views
	multi::fftw::dft({false, true}, rotated(source), rotated(target), fftw::forward);
	BOOST_REQUIRE( dft_forward(rotated(source)[0]) == rotated(target)[0] );

	// no dimension selected: the call reduces to a copy
	multi::fftw::dft_forward({false, false}, rotated(source), rotated(target));
	BOOST_REQUIRE( source == target );

	// iterator-based interface, one transform per element of the range
	multi::fftw::many_dft(source.begin(), source.end(), target.begin(), fftw::forward);
	BOOST_REQUIRE( dft_forward(source[0]) == target[0] );
}
//BOOST_AUTO_TEST_CASE(fftw_many1_from_2){
// multi::array<complex, 2> in({3, 10}); randomizer<complex>{}(in);
// multi::array<complex, 2> out({3, 10});
// fftw::dft({false, true}, in, out, fftw::forward);
// multi::array<complex, 2> out2({3, 10});
// for(int i = 0; i!=size(in); ++i)
// fftw::dft(in[i], out2[i], fftw::forward);
// BOOST_REQUIRE(out2 == out);
//}
//BOOST_AUTO_TEST_CASE(fftw_many2_from_3){
// multi::array<complex, 3> in({3, 5, 6}); randomizer<complex>{}(in);
// multi::array<complex, 3> out({3, 5, 6});
// fftw::dft({false, true, true}, in, out, FFTW_FORWARD);
// multi::array<complex, 3> out2({3, 5, 6});
// for(int i = 0; i!=size(in); ++i)
// fftw::dft(in[i], out2[i], FFTW_FORWARD);
// BOOST_REQUIRE(out2 == out);
//}
//BOOST_AUTO_TEST_CASE(fftw_many2_from_2){
// multi::array<complex, 2> in({5, 6}); randomizer<complex>{}(in);
// multi::array<complex, 2> out({5, 6});
// fftw::dft({true, true}, in, out, FFTW_FORWARD);
// multi::array<complex, 2> out2({5, 6});
// fftw::dft(in, out2, FFTW_FORWARD);
// BOOST_REQUIRE(out2 == out);
//}
//BOOST_AUTO_TEST_CASE(fftw_4D){
// multi::array<complex, 4> const in = []{
// multi::array<complex, 4> in({10, 10, 10, 10}); in[2][3][4][5] = 99.; return in;
// }();
// auto fwd = multi::fftw::dft({true, true, true, true}, in, fftw::forward);
// BOOST_REQUIRE(in[2][3][4][5] == 99.);
//}
//BOOST_AUTO_TEST_CASE(fftw_4D_many){
// auto const in = []{
// multi::array<complex, 4> in({97, 95, 101, 10}, 0.);
// in[2][3][4][5] = 99.; return in;
// }();
// auto fwd = multi::fftw::dft({true, true, true, false}, in, fftw::forward);
// BOOST_REQUIRE( in[2][3][4][5] == 99. );
// multi::array<complex, 4> out(extensions(in));
// multi::fftw::many_dft(begin(unrotated(in)), end(unrotated(in)), begin(unrotated(out)), fftw::forward);
// BOOST_REQUIRE( out == fwd );
//}
//BOOST_AUTO_TEST_CASE(cufft_many_2D){
// auto const in = []{
// multi::array<complex, 3> ret({10, 10, 10});
// std::generate(ret.data_elements(), ret.data_elements() + ret.num_elements(),
// [](){return complex{std::rand()*1./RAND_MAX, std::rand()*1./RAND_MAX};}
// );
// return ret;
// }();
// multi::array<complex, 3> out(extensions(in));
// multi::fftw::many_dft((in<<1).begin(), (in<<1).end(), (out<<1).begin(), multi::fftw::forward);
// multi::array<complex, 3> out2(extensions(in));
// multi::fftw::dft({true, false, true}, in, out2, multi::fftw::forward);
// BOOST_REQUIRE( out == out2 );
//}
//BOOST_AUTO_TEST_CASE(fftw_5D){
// multi::array<complex, 5> in({4, 5, 6, 7, 8});
// in[2][3][4][5][6] = 99.;
// auto fwd = multi::fftw::dft(in, fftw::forward);
// BOOST_REQUIRE(in[2][3][4][5][6] == 99.);
//}
//BOOST_AUTO_TEST_CASE(fftw_1D_power){
// multi::array<complex, 1> in(N, 0.); assert( size(in) == N );
// std::iota(begin(in), end(in), 1.);
// multi::array<complex, 1> out(extensions(in));
// static_assert( in.dimensionality() == out.dimensionality(), "!");
// auto p = multi::fftw_plan_dft(in, out, fftw::forward, FFTW_PRESERVE_INPUT);
// fftw_execute(p);
// fftw_destroy_plan(p);
// BOOST_REQUIRE( (power(in) - power(out)/num_elements(out)) < 1e-17 );
//}
//BOOST_AUTO_TEST_CASE(fftw_2D_power){
// multi::array<complex, 2> in({N, N});
// std::iota(in.data_elements(), in.data_elements() + in.num_elements(), 1.2);
// multi::array<complex, 2> out(in.extensions());
// auto p = multi::fftw_plan_dft(in, out, fftw::forward, FFTW_PRESERVE_INPUT);
// fftw_execute(p); fftw_destroy_plan(p);
// BOOST_REQUIRE( power(in) - power(out)/num_elements(out) < 1e-12 );
//}
//BOOST_AUTO_TEST_CASE(fftw_2D_power_plan){
// multi::array<complex, 2> in({16, 16});
// std::iota(in.data_elements(), in.data_elements() + in.num_elements(), 1.2);
// multi::array<complex, 2> out(in.extensions());
// multi::fftw::plan const p{in, out, fftw::forward, FFTW_PRESERVE_INPUT};
// p(); //execute(p); //p.execute();
// BOOST_REQUIRE( power(in) - power(out)/num_elements(out) < 1e-8 );
//}
//BOOST_AUTO_TEST_CASE(fftw_2D_power_dft){
// multi::array<complex, 2> in({16, 16});
// std::iota(data_elements(in), data_elements(in) + num_elements(in), 1.2);
// multi::array<complex, 2> out(extensions(in));
// multi::fftw::dft(in, out, fftw::forward);
// BOOST_REQUIRE( power(in) - power(out)/num_elements(out) < 1e-8 );
//}
//BOOST_AUTO_TEST_CASE(fftw_2D_power_dft_out){
// multi::array<complex, 2> in({16, 16}); std::iota(data_elements(in), data_elements(in) + num_elements(in), 1.2);
// auto out = multi::fftw::dft(in, fftw::forward);
// BOOST_REQUIRE( power(in) - power(out)/num_elements(out) < 1e-8 );
//}
//BOOST_AUTO_TEST_CASE(fftw_2D_power_dft_out_default){
// multi::array<complex, 2> in({16, 16}); std::iota(data_elements(in), data_elements(in) + num_elements(in), 1.2);
// auto out = multi::fftw::dft(in, fftw::forward);
// BOOST_REQUIRE( power(in) - power(out)/num_elements(out) < 1e-8 );
//}
//BOOST_AUTO_TEST_CASE(fftw_3D_power){
// multi::array<complex, 3> in({4, 4, 4}); std::iota(in.data_elements(), in.data_elements() + in.num_elements(), 1.2);
// multi::array<complex, 3> out = fftw::dft(in, fftw::forward);
// BOOST_REQUIRE( std::abs(power(in) - power(out)/num_elements(out)) < 1e-10 );
//}
//BOOST_AUTO_TEST_CASE(fftw_3D_power_in_place){
// multi::array<complex, 3> io({4, 4, 4}); std::iota(io.data_elements(), io.data_elements() + io.num_elements(), 1.2);
// auto powerin = power(io);
// fftw::dft_inplace(io, fftw::forward);
// BOOST_REQUIRE( powerin - power(io)/num_elements(io) < 1e-10 );
//}
//BOOST_AUTO_TEST_CASE(fftw_3D_power_in_place_over_ref_inplace){
// multi::array<complex, 3> io({4, 4, 4}); std::iota(io.data_elements(), io.data_elements() + io.num_elements(), 1.2);
// auto powerin = power(io);
//// fftw::dft_inplace(multi::array_ref<complex, 3>(io.data(), io.extensions()), fftw::forward);
//// fftw::dft_inplace(multi::array_ref<complex, 3>(data_elements(io), extensions(io)), fftw::forward);
// fftw::dft_inplace(io(), fftw::forward);
// BOOST_REQUIRE( powerin - power(io)/num_elements(io) < 1e-10 );
//}
//BOOST_AUTO_TEST_CASE(fftw_3D_power_out_of_place_over_ref){
// multi::array<complex, 3> in({4, 4, 4});
// std::iota(in.data_elements(), in.data_elements()+in.num_elements(), 1.2);
// multi::array<complex, 3> out({4, 4, 4});
// out() = fftw::dft(in.protect(), fftw::forward);
// BOOST_REQUIRE( power(in) - power(out)/num_elements(out) < 1e-10 );
//}
//BOOST_AUTO_TEST_CASE(fftw_3D_power_out_of_place_over_temporary){
// double powerin;
// auto f = [&](){
// multi::array<complex, 3> in({4, 4, 4});
// std::iota(data_elements(in), data_elements(in)+num_elements(in), 1.2);
// powerin = power(in);
// return in;
// };
// auto out = fftw::dft(f(), fftw::forward);
// BOOST_REQUIRE( std::abs(powerin - power(out)/num_elements(out)) < 1e-10 );
//}
//BOOST_AUTO_TEST_CASE(fftw_2D_transposition_square_inplace){
// multi::array<complex, 2> in = {
// {11., 12.},
// {21., 22.}
// };
// BOOST_REQUIRE( in[1][0] == 21. );
// multi::fftw::copy(in, rotated(in));
// BOOST_TEST( in[0][1].real() == 21. );
// BOOST_TEST( in[0][1].imag() == 0. );
//}
//BOOST_AUTO_TEST_CASE(fftw_4D_inq_poisson){
// multi::array<complex, 4> const in = []{
// multi::array<complex, 4> in({50, 100, 137, 1});
// std::iota(data_elements(in), data_elements(in)+num_elements(in), 1.2);
// return in;
// }();
//
// multi::array<complex, 4> out(extensions(in));
// multi::fftw::dft({0, 1, 1, 0}, in, out);
// BOOST_TEST( power(in) == power(out)/std::get<1>(sizes(out))/std::get<2>(sizes(out)) , boost::test_tools::tolerance(1e-10) );
//}

View File

@ -0,0 +1,37 @@
#ifdef COMPILATION// -*-indent-tabs-mode:t;c-basic-offset:4;tab-width:4-*-
$CXX $0 -o $0x -lfftw3 -lboost_unit_test_framework -ftemplate-backtrace-limit=0&&$0x&&rm $0x;exit
#endif
// © Alfredo A. Correa 2020
#define BOOST_TEST_MODULE "C++ Unit Tests for Multi FFTW adaptor (cpu) with thrust complex"
#define BOOST_TEST_DYN_LINK
#include<boost/test/unit_test.hpp>
#include "../../fftw.hpp"
#include<complex>
#include <thrust/complex.h>
namespace multi = boost::multi;
// Runs the same 2D forward transform on an array of thrust::complex<double> and on a copy of it,
// and requires both results to be equal.
// NOTE(review): both arrays have the same element type here (complex is thrust::complex<double>);
// if the intent was to compare against std::complex, the first array's type needs revisiting.
BOOST_AUTO_TEST_CASE(fftw_2D_identity){
	using complex = thrust::complex<double>;
	complex const I{0, 1};

	multi::array<complex, 2> const source = {
		{ 1. + 2.*I, 9. - 1.*I, 2. + 4.*I},
		{ 3. + 3.*I, 7. - 4.*I, 1. + 9.*I},
		{ 4. + 1.*I, 5. + 3.*I, 2. + 4.*I},
		{ 3. - 1.*I, 8. + 7.*I, 2. + 1.*I},
		{ 31. - 1.*I, 18. + 7.*I, 2. + 10.*I}
	};
	auto transformed = multi::fftw::dft({true, true}, source, multi::fftw::forward);

	multi::array<thrust::complex<double>, 2> const source_copy = source;
	auto transformed_copy = multi::fftw::dft({true, true}, source_copy, multi::fftw::forward);

	BOOST_REQUIRE( transformed == transformed_copy );
}

View File

@ -0,0 +1,120 @@
#ifdef COMPILATION// -*-indent-tabs-mode:t;c-basic-offset:4;tab-width:4-*-
$CXXX $CXXFLAGS -O3 $0 -o $0x -DHAVE_FFTW3_THREADS -lfftw3 -lfftw3_threads -lboost_unit_test_framework -lboost_timer&&$0x&&rm $0x;exit
#endif
// © Alfredo A. Correa 2020
#define BOOST_TEST_MODULE "C++ Unit Tests for Multi FFTW transpose"
#define BOOST_TEST_DYN_LINK
#include<boost/test/unit_test.hpp>
#include<boost/timer/timer.hpp>
#include "../../fftw.hpp"
#include<complex>
namespace multi = boost::multi;
// Times several ways of transposing a large non-square complex matrix, with different
// thread counts requested through plan::with_nthreads. Every scenario checks
// out[35][79] == in[79][35]; the move-based ones also check that the data pointer is kept.
BOOST_AUTO_TEST_CASE(fftw_transpose){
// multi::fftw::initialize_threads();
multi::fftw::plan::with_nthreads(1);
using complex = std::complex<double>;
{
// reference input: 10137 x 9973 matrix filled with pseudo-random values from std::rand
auto const in = []{
multi::array<complex, 2> ret({10137, 9973});
std::generate(ret.data_elements(), ret.data_elements() + ret.num_elements(),
[](){return complex{std::rand()*1./RAND_MAX, std::rand()*1./RAND_MAX};}
);
std::cout<<"memory size "<< ret.num_elements()*sizeof(complex)/1e6 <<" MB\n";
return ret;
}();
// baseline: build the transpose in an auxiliary array, then move it back
{
multi::array<complex, 2> out = in;
{
boost::timer::auto_cpu_timer t{"transposition with aux %ws wall, CPU (%p%)\n"};
multi::array<complex, 2> aux = ~out;
out = std::move(aux);
BOOST_REQUIRE( out[35][79] == in[79][35] );
}
}
// fftw copy from a moved, transposed source into a new array (must reuse the allocation)
{
multi::array<complex, 2> out = in;
auto p = out.data_elements();
{
boost::timer::auto_cpu_timer t{"fftw trans mve 1 thread %ws wall, CPU (%p%)\n"};
multi::array<complex, 2> out2 = multi::fftw::copy( ~move(out) );
BOOST_REQUIRE( out2.data_elements() == p );
BOOST_REQUIRE( out2[35][79] == in[79][35] );
}
}
// dedicated in-place transpose function
{
multi::array<complex, 2> out = in;
auto p = out.data_elements();
{
boost::timer::auto_cpu_timer t{"fftw transpose fun thread %ws wall, CPU (%p%)\n"};
multi::fftw::transpose( out );
BOOST_REQUIRE( out.data_elements() == p );
BOOST_REQUIRE( out[35][79] == in[79][35] );
}
BOOST_REQUIRE( out == ~in );
}
// move self-assignment through fftw copy
{
multi::array<complex, 2> out = in;
auto p = out.data_elements();
{
boost::timer::auto_cpu_timer t{"fftw transpose 1 thread %ws wall, CPU (%p%)\n"};
out = multi::fftw::copy( ~move(out) );
BOOST_REQUIRE( out.data_elements() == p );
BOOST_REQUIRE( out[35][79] == in[79][35] );
}
}
// the scenarios below repeat the move-based variants after requesting 2, 3 and 4 threads
multi::fftw::plan::with_nthreads(2);
{
multi::array<complex, 2> out = in;
auto p = out.data_elements();
{
boost::timer::auto_cpu_timer t{"fftw trans mve 2 thread %ws wall, CPU (%p%)\n"};
multi::array<complex, 2> out2 = multi::fftw::copy( ~move(out) );
BOOST_REQUIRE( out2.data_elements() == p );
BOOST_REQUIRE( out2[35][79] == in[79][35] );
}
}
{
multi::array<complex, 2> out = in;
auto p = out.data_elements();
{
boost::timer::auto_cpu_timer t{"fftw transpose 2 threads %ws wall, CPU (%p%)\n"};
out = multi::fftw::copy( ~move(out) );
BOOST_REQUIRE( out.data_elements() == p );
BOOST_REQUIRE( out[35][79] == in[79][35] );
}
}
multi::fftw::plan::with_nthreads(3);
{
multi::array<complex, 2> out = in;
auto p = out.data_elements();
{
boost::timer::auto_cpu_timer t{"fftw transpose 3 threads %ws wall, CPU (%p%)\n"};
out = multi::fftw::copy( ~move(out) );
BOOST_REQUIRE( out.data_elements() == p );
BOOST_REQUIRE( out[35][79] == in[79][35] );
}
}
multi::fftw::plan::with_nthreads(4);
{
multi::array<complex, 2> out = in;
auto p = out.data_elements();
{
boost::timer::auto_cpu_timer t{"fftw transpose 4 threads %ws wall, CPU (%p%)\n"};
out = multi::fftw::copy( ~move(out) );
BOOST_REQUIRE( out.data_elements() == p );
BOOST_REQUIRE( out[35][79] == in[79][35] );
}
}
}
}

View File

@ -0,0 +1,114 @@
#ifdef COMPILATION// -*-indent-tabs-mode:t;c-basic-offset:4;tab-width:4-*-
$CXXX $CXXFLAGS -Ofast $0 -o $0x -DHAVE_FFTW3_THREADS -lfftw3 -lfftw3_threads -lboost_unit_test_framework -lboost_timer&&$0x&&rm $0x;exit
#endif
// © Alfredo A. Correa 2020
#define BOOST_TEST_MODULE "C++ Unit Tests for Multi FFTW transpose"
#define BOOST_TEST_DYN_LINK
#include<boost/test/unit_test.hpp>
#include<boost/timer/timer.hpp>
#include "../../fftw.hpp"
#include<complex>
namespace multi = boost::multi;
using complex = std::complex<double>; complex const I{0, 1};
// Times in-place transposition of a square 8192 x 8192 complex matrix using the fftw-based
// routines (with 1, 2 and 4 threads requested) and compares against an auxiliary-copy
// transpose and two hand-written swap loops. Every scenario checks out == ~in.
BOOST_AUTO_TEST_CASE(fftw_transpose){
multi::fftw::initialize_threads();
{
// reference input filled with pseudo-random values from std::rand
auto const in = []{
multi::array<complex, 2> ret({8192, 8192});
std::generate(ret.data_elements(), ret.data_elements() + ret.num_elements(),
[](){return std::rand()*1./RAND_MAX + std::rand()*1./RAND_MAX*I;}
);
std::cout<<"memory size "<< ret.num_elements()*sizeof(complex)/1e6 <<" MB\n";
return ret;
}();
multi::fftw::plan::with_nthreads(1);
// dedicated in-place transpose function; the data pointer must be unchanged
{
multi::array<complex, 2> out = in;
auto p = out.data_elements();
{
boost::timer::auto_cpu_timer t{"fftw trans mve 1 thread %ws wall, CPU (%p%)\n"};
multi::fftw::transpose( out );
BOOST_REQUIRE( out.data_elements() == p );
BOOST_REQUIRE( out[35][79] == in[79][35] );
}
BOOST_REQUIRE( out == ~in );
}
// move self-assignment through fftw copy (note: same timer label as the scenario above)
{
multi::array<complex, 2> out = in;
auto p = out.data_elements();
{
boost::timer::auto_cpu_timer t{"fftw trans mve 1 thread %ws wall, CPU (%p%)\n"};
out = multi::fftw::copy( transposed( move(out) ) );
BOOST_REQUIRE( out.data_elements() == p );
BOOST_REQUIRE( out[35][79] == in[79][35] );
}
BOOST_REQUIRE( out == ~in );
}
multi::fftw::plan::with_nthreads(2);
{
multi::array<complex, 2> out = in;
auto p = out.data_elements();
{
boost::timer::auto_cpu_timer t{"fftw trans mve 2 thread %ws wall, CPU (%p%)\n"};
out = multi::fftw::copy( ~move(out) );
BOOST_REQUIRE( out.data_elements() == p );
BOOST_REQUIRE( out[35][79] == in[79][35] );
}
BOOST_REQUIRE( out == ~in );
}
multi::fftw::plan::with_nthreads(4);
{
multi::array<complex, 2> out = in;
auto p = out.data_elements();
{
boost::timer::auto_cpu_timer t{"fftw trans mve 4 thread %ws wall, CPU (%p%)\n"};
out = multi::fftw::copy( ~move(out) );
BOOST_REQUIRE( out.data_elements() == p );
BOOST_REQUIRE( out[35][79] == in[79][35] );
}
BOOST_REQUIRE( out == ~in );
}
// baseline: assign the transposed view into a preallocated auxiliary array, then move it back
{
multi::array<complex, 2> out = in;
multi::array<complex, 2> aux(extensions(out));
{
boost::timer::auto_cpu_timer t{"auxiliary copy %ws wall, CPU (%p%)\n"};
aux = ~out;
out = std::move(aux);
BOOST_REQUIRE( out[35][79] == in[79][35] );
}
BOOST_REQUIRE( out == ~in );
}
// baseline: swap loop over the strictly lower triangle (j < i)
{
multi::array<complex, 2> out = in;
{
boost::timer::auto_cpu_timer t{"transposition with loop %ws wall, CPU (%p%)\n"};
for(auto i: extension(out))
for(auto j = 0l; j != i; ++j)
std::swap(out[i][j], out[j][i]);
BOOST_REQUIRE( out[35][79] == in[79][35] );
}
BOOST_REQUIRE( out == ~in );
}
// baseline: swap loop over the strictly upper triangle (j > i)
{
multi::array<complex, 2> out = in;
{
boost::timer::auto_cpu_timer t{"transposition with loop 2 %ws wall, CPU (%p%)\n"};
for(auto i = 0l; i != out.size(); ++i)
for(auto j = i + 1; j != out.size(); ++j)
std::swap(out[i][j], out[j][i]);
BOOST_REQUIRE( out[35][79] == in[79][35] );
}
BOOST_REQUIRE( out == ~in );
}
}
}

View File

@ -0,0 +1,4 @@
#pragma once
#include "lapack/getrf.hpp"

View File

@ -0,0 +1,228 @@
#ifdef COMPILATION_INSTRUCTIONS
(echo "#include\""$0"\"" > $0x.cpp) && clang++ `#-DNDEBUG` -O3 -std=c++14 -Wall -Wextra -Wpedantic -Wfatal-errors -D_TEST_MULTI_ADAPTORS_LAPACK_CORE -DADD_ $0x.cpp -o $0x.x -lblas -llapack && time $0x.x $@ && rm -f $0x.x $0x.cpp; exit
#endif
// Alfredo A. Correa 2019 ©
#ifndef MULTI_ADAPTORS_LAPACK_CORE_HPP
#define MULTI_ADAPTORS_LAPACK_CORE_HPP
//#include<iostream>
#include<algorithm>
#include<cassert>
#include<complex>
#include<stdexcept>
//#include <cblas/cblas.h>
#include<lapacke.h>
// Single-letter aliases for the four LAPACK type prefixes (s, d, c, z) and for void.
// These are very short macro names; they are #undef'd again further down in this header.
#define s float
#define d double
#define c std::complex<s>
#define z std::complex<d>
#define v void
// Fortran-style argument types, passed by const reference.
#define INT int
#define INTEGER INT const&
//#define N INTEGER n
#define CHARACTER char const&
#define UPLO CHARACTER
#define JOBZ CHARACTER
// Appends the trailing underscore to a routine name.
#define LAPACK(NamE) NamE##_
// Workspace arguments.
#define LWORK INTEGER lwork
#define LIWORK INTEGER liwork
#define IWORK int*
// Prototype generators for potrf / syev / syevd / heev, parameterized on the type prefix.
// Their expansions inside the extern "C" block of this header are commented out.
#define xPOTRF(T) v LAPACK(T##potrf)(UPLO, int const& N, T*, int const& LDA, int& INFO)
#define xSYEV(T) v LAPACK(T##syev) (JOBZ, UPLO, int const& N, T*, int const& LDA, T*, T*, LWORK, int& INFO)
#define xSYEVD(T) v LAPACK(T##syevd)(JOBZ, UPLO, int const& N, T*, int const& LDA, T*, T*, LWORK, IWORK, LIWORK, int& INFO)
#define xHEEV(T) v LAPACK(T##heev) (JOBZ, UPLO, int const& N, T*, int const& LDA, T*, T*, LWORK, int& INFO)
// Lower-case, Fortran-flavoured spellings used by the xGETRF / xGETRS prototype generators.
#define subroutine void
#define integer int const&
#define integer_out int&
#define integer_ptr int*
#define integer_cptr int const*
#define character char const&
// http://www.netlib.org/lapack/explore-html/dd/d9a/group__double_g_ecomputational_ga0019443faea08275ca60a734d0593e60.html
// Prototype generator for the LU factorization routine <T>getrf_ (T is the type prefix macro).
// Inputs are taken by const reference, IPIV by pointer and INFO as an output reference.
// The expansions of this macro in the extern "C" block of this header are commented out.
#define xGETRF(T) \
subroutine T##getrf_( \
integer M, /*The number of rows of the matrix A. M >= 0.*/ \
integer N, /*The number of columns of the matrix A. N >= 0.*/ \
T* A, /*On entry, the M-by-N matrix to be factored.*/ \
/*On exit, the factors L and U from the factorization*/ \
integer LDA, /*The leading dimension of the array A. LDA >= max(1,M).*/\
integer_ptr IPIV, /*The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i).*/\
integer_out INFO /*= 0: successful exit*/\
/*< 0: if INFO = -i, the i-th argument had an illegal value*/\
/*> 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations.*/\
)
// http://www.netlib.org/lapack/explore-html/d8/ddc/group__real_g_ecomputational_gaa00bcf4d83a118cb6f0b6619d6ffaa24.html
// Prototype generator for the triangular solve routine <T>getrs_ (T is the type prefix macro),
// which solves a linear system using the factors produced by <T>getrf_.
// INFO is an output of the routine, so it is declared as an output reference (integer_out),
// matching xGETRF; it was previously declared as a const reference.
#define xGETRS(T) \
subroutine T##getrs_( \
character TRANS,/*Specifies the form of the system of equations: */\
/* = 'N': A * X = B (No transpose) */\
/* = 'T': A**T* X = B (Transpose) */\
/* = 'C': A**H* X = B (Conjugate transpose; same as 'T' for real types) */\
integer N, /*The order of the matrix A. N >= 0. */\
integer NRHS, /*The number of right hand sides, i.e., the number of columns*/\
/*of the matrix B. NRHS >= 0. */\
T const* A, /* The factors L and U from the factorization A = P*L*U */\
/*as computed by xGETRF. */\
integer LDA, /*The leading dimension of the array A. LDA >= max(1,N). */\
integer_cptr IPIV, /*The pivot indices from xGETRF; for 1<=i<=N, row i of the */\
/*matrix was interchanged with row IPIV(i). */\
T* B, /*On entry, the right hand side matrix B. */\
/*On exit, the solution matrix X. */\
integer LDB, /*The leading dimension of the array B. LDB >= max(1,N). */\
integer_out INFO /*= 0: successful exit */\
/*< 0: if INFO = -i, the i-th argument had an illegal value */\
)
// TODO // http://www.netlib.org/lapack/explore-html/d7/d3b/group__double_g_esolve_ga5ee879032a8365897c3ba91e3dc8d512.html
extern "C"{
//xGETRF(s) ; xGETRF(d) ; xGETRF(c) ; xGETRF(z) ;
//xGETRS(s) ; xGETRS(d) ; xGETRS(c) ; xGETRS(z) ;
}
namespace core{
// LU factorization of a general m-by-n matrix in double precision (calls dgetrf_).
//   m, n : matrix dimensions, both >= 0
//   A    : matrix data, overwritten by the routine with the factors
//   lda  : leading dimension of A, >= max(1, m)
//   ipiv : receives the pivot indices
// Returns the LAPACK info value (0 on success, > 0 when the factor U is exactly singular).
// Negative values (illegal argument) are treated as programming errors and asserted against.
// Declared inline because this is a header: a non-inline definition breaks the one-definition
// rule as soon as two translation units include it.
// http://www.netlib.org/lapack/explore-html/da/d30/a18643_ga5b625680e6251feb29e386193914981c.html
inline int getrf(lapack_int m, lapack_int n, double* A, lapack_int lda, int* ipiv){
	assert( m >= 0 );
	assert( n >= 0 );
	assert( lda >= std::max(lapack_int{1}, m) );
	int info;
	dgetrf_(&m, &n, A, &lda, ipiv, &info);
	assert(info >= 0);
	return info;
}
// Solves a linear system using the factors computed by getrf (calls dgetrs_).
//   trans : 'N', 'T' or 'C', selects the form of the system
//   n     : order of the matrix A, >= 0
//   nrhs  : number of right-hand sides, >= 0
//   A,lda : factors from getrf and their leading dimension, lda >= max(1, n)
//   ipiv  : pivot indices from getrf
//   B,ldb : right-hand sides on entry, solution on exit, and their leading dimension
// Throws std::logic_error when the routine reports an illegal argument (info == -i means
// that the i-th argument is illegal); the messages follow that argument numbering.
inline void getrs(char trans, lapack_int const n, lapack_int const nrhs, double const* A, lapack_int const lda, int const* ipiv, double* B, lapack_int const ldb){
	assert( trans == 'T' or trans == 'N' or trans == 'C' );
	assert( n >= 0 );
	assert( nrhs >= 0 );
	assert( lda >= std::max(lapack_int{1}, n) ); // same form as the check in getrf
	int info;
	dgetrs_(&trans, &n, &nrhs, A, &lda, ipiv, B, &ldb, &info);
	switch(info){
		case -1: throw std::logic_error{"getrs: illegal argument 1, trans is not 'N', 'T', or 'C'"};
		case -2: throw std::logic_error{"getrs: illegal argument 2, n < 0"                        };
		case -3: throw std::logic_error{"getrs: illegal argument 3, nrhs < 0"                     };
		case -4: throw std::logic_error{"getrs: illegal argument 4, A"                            };
		case -5: throw std::logic_error{"getrs: illegal argument 5, lda < max(1, n)"              };
		case -6: throw std::logic_error{"getrs: illegal argument 6, ipiv"                         };
		case -7: throw std::logic_error{"getrs: illegal argument 7, B"                            };
		case -8: throw std::logic_error{"getrs: illegal argument 8, ldb < max(1, n)"              };
	}
	assert(info == 0 );
}
}
namespace lapack{
// Stateless entry point that hands every call over to the free functions in namespace core.
// The trailing decltype removes an overload from consideration when no matching core function exists.
struct context{
	template<class... Args>
	static auto getrf(Args&&... args)
	->decltype(core::getrf(args...)){
		return core::getrf(args...);
	}

	template<class... Args>
	static auto getrs(Args&&... args)
	->decltype(core::getrs(args...)){
		return core::getrs(args...);
	}
};
}
extern "C"{
//xPOTRF(s) ; xPOTRF(d) ;
//xPOTRF(c) ; xPOTRF(z) ;
//xSYEV(s) ; xSYEV(d) ;
//xSYEVD(s) ; xSYEVD(d) ;
// xHEEV(c) ; xHEEV(z) ;
}
// Retire the Fortran-flavoured helper aliases so they do not leak into code that includes
// this header. The companion aliases integer_out / integer_ptr / integer_cptr and the
// workspace aliases LWORK / LIWORK / IWORK were previously left defined.
// LAPACK(...) and the single-letter type aliases are still needed below and are kept here.
#undef subroutine
#undef integer
#undef integer_out
#undef integer_ptr
#undef integer_cptr
#undef character
#undef JOBZ
#undef UPLO
#undef INFO
#undef CHARACTER
#undef N
#undef LDA
#undef LWORK
#undef LIWORK
#undef IWORK
#undef INTEGER
#undef INT
// Generates a thin core::potrf wrapper for the element type T, forwarding to <T>potrf_.
// NOTE(review): the arguments are passed here as plain values/references, while getrf/getrs in
// this header call the underscore-suffixed routines through pointers (&m, &n, ...), and the
// reference-style prototypes (xPOTRF, ...) are commented out; confirm that these calls still
// match the declarations that are actually in scope.
#define xpotrf(T) template<class S> v potrf(char uplo, S n, T *x, S incx, int& info){LAPACK(T##potrf)(uplo, n, x, incx, info);}
namespace core{
xpotrf(s) xpotrf(d)
xpotrf(c) xpotrf(z)
}
// Generates a core::syev wrapper for the element type T, forwarding to <T>syev_.
// http://www.netlib.org/lapack/explore-html/d2/d8a/group__double_s_yeigen_ga442c43fca5493590f8f26cf42fed4044.html
#define xsyev(T) template<class S> v syev(char jobz, char uplo, S n, T* a, S lda, T* w, T* work, S lwork, int& info){LAPACK(T##syev)(jobz, uplo, n, a, lda, w, work, lwork, info);}
// Generates a core::syevd wrapper for the element type T, forwarding to <T>syevd_.
// Before the call it asserts lower bounds on the workspace sizes lwork/liwork,
// which depend on jobz ('N' or 'V') and on n.
// http://www.netlib.org/lapack/explore-html/d2/d8a/group__double_s_yeigen_ga77dfa610458b6c9bd7db52533bfd53a1.html
#define xsyevd(T) template<class S> v syevd(char jobz, char uplo, S n, T* a, S lda, T* w, T* work, S lwork, int* iwork, S liwork, int& info){ \
if(n <= 1 ){assert(lwork >= 1 ); assert(liwork >=1 );} \
if(jobz == 'N' and n > 1){assert(lwork >= 2*n+1 ); assert(liwork >= 1 );} \
if(jobz == 'V' and n > 1){assert(lwork >= 1 + 6*n + 2*n*n); assert(liwork >= 3 + 5*n);} \
LAPACK(T##syevd)(jobz, uplo, n, a, lda, w, work, lwork, iwork, liwork, info); \
}
// Generates a core::heev wrapper for the element type T, forwarding to <T>heev_.
// NOTE(review): this wrapper forwards the same argument list as syev (no separate real
// workspace, w has element type T); verify against the routine's actual signature.
#define xheev(T) template<class S> v heev(char jobz, char uplo, S n, T* a, S lda, T* w, T* work, S lwork, int& info){LAPACK(T##heev)(jobz, uplo, n, a, lda, w, work, lwork, info);}
// Instantiate the wrappers: syev/syevd for the real types, heev for the complex types.
namespace core{
xsyev (s) xsyev (d)
xsyevd(s) xsyevd(d)
xheev(c) xheev(z)
}
// The single-letter type aliases are no longer needed past this point.
#undef s
#undef d
#undef c
#undef z
#undef v
// NOTE(review): TRANS is defined here but not used in the visible remainder of this header.
#define TRANS const char& trans
///////////////////////////////////////////////////////////////////////////////
#if _TEST_MULTI_ADAPTORS_LAPACK_CORE
#include "../../array.hpp"
#include "../../utility.hpp"
#include<iostream>
#include<numeric>
#include<vector>
namespace multi = boost::multi;
using std::cout;
int main(){
using core::potrf;
std::vector<double> v = {
2., 1.,
1., 2.
};
cout
<< v[0] <<'\t'<< v[1] <<'\n'
<< v[2] <<'\t'<< v[3] <<'\n' << std::endl
;
int info;
potrf('U', 2, v.data(), 2, info);
cout << "error " << info << std::endl;
cout
<< v[0] <<'\t'<< v[1] <<'\n'
<< v[2] <<'\t'<< v[3] <<'\n'
;
cout << std::endl;
}
#endif
#endif

Some files were not shown because too many files have changed in this diff Show More