mirror of https://gitlab.com/QEF/q-e.git
Merge branch 'merge_qegpu' into HEAD
This commit is contained in:
commit
db0da8b0d9
257
.ci/cineca.yml
257
.ci/cineca.yml
|
@ -1,104 +1,19 @@
|
|||
# ====================
|
||||
# UtilXlib UnitTesting
|
||||
# ====================
|
||||
|
||||
build:cudampiomp:
|
||||
tags: [galileo]
|
||||
only:
|
||||
- /utilxlib/i
|
||||
script:
|
||||
- module load profile/advanced pgi/17.10 cuda/8.0.61
|
||||
- ./configure --enable-openmp
|
||||
- cd UtilXlib/tests
|
||||
- bash compile_and_run_tests.sh -smcn
|
||||
|
||||
build:intelmpiomp:
|
||||
tags: [galileo]
|
||||
only:
|
||||
- /utilxlib/i
|
||||
script:
|
||||
- module load profile/advanced intel intelmpi
|
||||
- ./configure --enable-openmp
|
||||
- cd UtilXlib/tests
|
||||
- bash compile_and_run_tests.sh -sm
|
||||
|
||||
build:cudampi:
|
||||
tags: [galileo]
|
||||
only:
|
||||
- /utilxlib/i
|
||||
script:
|
||||
- module load profile/advanced pgi/17.10 cuda/8.0.61
|
||||
- ./configure
|
||||
- cd UtilXlib/tests
|
||||
- bash compile_and_run_tests.sh -smcn
|
||||
|
||||
build:intelmpi:
|
||||
tags: [galileo]
|
||||
only:
|
||||
- /utilxlib/i
|
||||
script:
|
||||
- module load profile/advanced intel intelmpi
|
||||
- ./configure
|
||||
- cd UtilXlib/tests
|
||||
- bash compile_and_run_tests.sh -sm
|
||||
|
||||
build:pw:
|
||||
image: ubuntu:latest
|
||||
tags: [docker]
|
||||
image: espressofoundation/ubuntu:latest
|
||||
script:
|
||||
- apt-get update
|
||||
- apt-get install --yes build-essential gfortran wget python
|
||||
- apt-get install --yes libopenmpi-dev openmpi-bin
|
||||
- apt-get install --yes libblas-dev liblapack-dev fftw3 fftw3-dev pkg-config
|
||||
- ./configure
|
||||
- make pw
|
||||
# - export OMP_NUM_THREADS=1
|
||||
# - cd test-suite
|
||||
# - mkdir /tmp/save
|
||||
# - make run-tests-pw-serial
|
||||
# - rm -r /tmp/save
|
||||
|
||||
build:cp:
|
||||
image: ubuntu:latest
|
||||
script:
|
||||
- apt-get update
|
||||
- apt-get install --yes build-essential gfortran wget python
|
||||
- apt-get install --yes libopenmpi-dev openmpi-bin
|
||||
- apt-get install --yes libblas-dev liblapack-dev fftw3 fftw3-dev pkg-config
|
||||
- ./configure
|
||||
- make cp
|
||||
# - export OMP_NUM_THREADS=1
|
||||
# - cd test-suite
|
||||
# - mkdir /tmp/save
|
||||
# - make run-tests-cp-serial
|
||||
# - rm -r /tmp/save
|
||||
|
||||
|
||||
|
||||
#### BUILDS ON GALILEO ####
|
||||
|
||||
build:intel:
|
||||
tags: [galileo]
|
||||
intel parallel:
|
||||
tags: [intel]
|
||||
script:
|
||||
- module purge
|
||||
- module load intel/pe-xe-2018--binary intelmpi/2018--binary mkl/2018--binary
|
||||
- module list
|
||||
- module load intel/pe-xe-2018--binary intelmpi/2018--binary mkl/2018--binary python
|
||||
- ./configure --enable-openmp
|
||||
- make pw cp
|
||||
- export OMP_NUM_THREADS=2
|
||||
- cd test-suite
|
||||
- mkdir -p /tmp/save
|
||||
- make run-tests-pw-parallel
|
||||
- make run-tests-cp-parallel
|
||||
- rm -r /tmp/save
|
||||
|
||||
|
||||
pgi cuda mpi:
|
||||
tags: [k80]
|
||||
script:
|
||||
- module purge
|
||||
- module load profile/global pgi/17.10 mkl/2018--binary cuda/8.0.61
|
||||
- ./configure --enable-openmp --with-cuda-runtime=8.0 --with-cuda-cc=35 --with-cuda=$CUDA_HOME --with-scalapack=no
|
||||
- make -j pw
|
||||
- make -j pw cp
|
||||
- if [ -z ${SLURM_CPUS_PER_TASK} ]; then export OMP_NUM_THREADS=1; else export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK; fi
|
||||
- echo "Using $SLURM_NTASKS procs and $OMP_NUM_THREADS threads"
|
||||
- cd test-suite
|
||||
|
@ -107,3 +22,163 @@ pgi cuda mpi:
|
|||
- make run-tests-pw-parallel
|
||||
- sed -i 's/export PARA_POSTFIX=" "/export PARA_POSTFIX=" -pd=.true."/g' run-pw.sh
|
||||
- make clean && make run-tests-pw-parallel
|
||||
|
||||
pgi parallel k80:
|
||||
tags: [k80]
|
||||
script:
|
||||
- module purge
|
||||
- module load profile/global hpc-sdk/20.9--binary mkl/2019--binary python
|
||||
- ./configure --enable-openmp --with-cuda-runtime=10.1 --with-cuda-cc=35 --with-cuda=yes --enable-cuda-env-check=no --with-scalapack=no
|
||||
- make -j pw cp
|
||||
- if [ -z ${SLURM_CPUS_PER_TASK} ]; then export OMP_NUM_THREADS=1; else export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK; fi
|
||||
- echo "Using $SLURM_NTASKS procs and $OMP_NUM_THREADS threads"
|
||||
- cd test-suite
|
||||
- sed -i "s/TESTCODE_NPROCS=4/TESTCODE_NPROCS=$SLURM_NTASKS/" ENVIRONMENT
|
||||
- make clean
|
||||
- make run-tests-pw-parallel
|
||||
- sed -i 's/export PARA_POSTFIX=" "/export PARA_POSTFIX=" -pd=.true."/g' run-pw.sh
|
||||
- make clean && make run-tests-pw-parallel
|
||||
- cd .. && cp PW/src/pw.x ./pwgpu-mpi-cuda8-cc35-${CI_COMMIT_SHA:0:8}.x
|
||||
|
||||
|
||||
pgi parallel v100:
|
||||
tags: [galileo,v100]
|
||||
script:
|
||||
- module purge
|
||||
- module load profile/global hpc-sdk/20.9--binary mkl/2019--binary cuda/10.1 python
|
||||
- ./configure --enable-openmp --with-cuda-runtime=10.1 --with-cuda-cc=70 --with-cuda=$CUDA_HOME --with-scalapack=no
|
||||
- make -j pw cp
|
||||
- if [ -z ${SLURM_CPUS_PER_TASK} ]; then export OMP_NUM_THREADS=1; else export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK; fi
|
||||
- echo "Using $SLURM_NTASKS procs and $OMP_NUM_THREADS threads"
|
||||
- cd test-suite
|
||||
- sed -i "s/TESTCODE_NPROCS=4/TESTCODE_NPROCS=$SLURM_NTASKS/" ENVIRONMENT
|
||||
- make clean
|
||||
- make run-tests-pw-parallel
|
||||
- sed -i 's/export PARA_POSTFIX=" "/export PARA_POSTFIX=" -pd=.true."/g' run-pw.sh
|
||||
- make clean && make run-tests-pw-parallel
|
||||
|
||||
pgi serial k80:
|
||||
tags: [k80]
|
||||
script:
|
||||
- module purge
|
||||
- module load profile/global hpc-sdk/20.9--binary mkl/2019--binary python
|
||||
- ./configure --disable-parallel --enable-openmp --with-cuda-runtime=10.1 --with-cuda-cc=35 --with-cuda=yes --enable-cuda-env-check=no
|
||||
- make -j pw cp
|
||||
- if [ -z ${SLURM_CPUS_PER_TASK} ]; then export OMP_NUM_THREADS=1; else export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK; fi
|
||||
- echo "Using $OMP_NUM_THREADS threads"
|
||||
- cd test-suite
|
||||
- make clean
|
||||
- make run-tests-pw-serial
|
||||
- make run-tests-cp-serial
|
||||
|
||||
|
||||
pgi serial v100 :
|
||||
tags: [galileo,v100]
|
||||
script:
|
||||
- module purge
|
||||
- module load profile/global hpc-sdk/20.9--binary mkl/2019--binary python
|
||||
- ./configure --disable-parallel --enable-openmp --with-cuda-runtime=10.1 --with-cuda-cc=70 --with-cuda=yes --enable-cuda-env-check=no
|
||||
- make -j pw cp
|
||||
- if [ -z ${SLURM_CPUS_PER_TASK} ]; then export OMP_NUM_THREADS=1; else export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK; fi
|
||||
- echo "Using $OMP_NUM_THREADS threads"
|
||||
- cd test-suite
|
||||
- make clean
|
||||
- make run-tests-pw-serial
|
||||
- make run-tests-cp-serial
|
||||
|
||||
pgi207 power v100:
|
||||
tags: [marconi100]
|
||||
script:
|
||||
- module purge
|
||||
- module load profile/global hpc-sdk/2020--binary cuda/10.1 spectrum_mpi/10.3.1--binary python/3.8.2
|
||||
- pwd
|
||||
- cd ..
|
||||
- ./configure CC=pgcc F77=pgf90 FC=pgf90 F90=pgf90 MPIF90=mpipgifort --enable-openmp --with-cuda=$CUDA_ROOT --with-cuda-runtime=10.1 --with-cuda-cc=70
|
||||
- make -j pw cp
|
||||
- salloc --nodes=1 --ntasks-per-node=4 --ntasks-per-socket=2 --cpus-per-task=32 --gres=gpu:4 --mem=230000MB --time 01:00:00 -A $CINECA_QE_ACCOUNT -p m100_usr_prod
|
||||
- if [ -z ${SLURM_CPUS_PER_TASK} ]; then export OMP_NUM_THREADS=1; else export OMP_NUM_THREADS=8; fi
|
||||
- echo "Using $OMP_NUM_THREADS threads"
|
||||
- cd test-suite
|
||||
- make clean
|
||||
- make run-tests-pw-parallel
|
||||
- make run-tests-cp-parallel
|
||||
- exit
|
||||
|
||||
#pgi cuda9.2 p100 nollvm:
|
||||
# tags: [p100]
|
||||
# script:
|
||||
# - module purge
|
||||
# - module load profile/global pgi/19.10--binary cuda/10.0
|
||||
# - ./configure --enable-openmp --with-cuda-runtime=10.0 --with-cuda-cc=60 --with-cuda=$CUDA_HOME --with-scalapack=no
|
||||
# - sed -i 's/traditional/traditional -Uvector/' make.inc # problem in PGI compiler mis-understanding comments.
|
||||
# - sed -i 's/cuda10.0/cuda10.0,nollvm/' make.inc # this is needed for some reason yet to be clarified.
|
||||
# - make -j pw
|
||||
# - if [ -z ${SLURM_CPUS_PER_TASK} ]; then export OMP_NUM_THREADS=1; else export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK; fi
|
||||
# - echo "Using $OMP_NUM_THREADS threads"
|
||||
# - cd test-suite
|
||||
# - make clean
|
||||
# - make run-tests-pw-parallel
|
||||
#
|
||||
#pgi cuda9.2 p100:
|
||||
# tags: [p100]
|
||||
# script:
|
||||
# - module purge
|
||||
# - module load profile/global pgi/19.10--binary cuda/10.0
|
||||
# - ./configure --enable-openmp --with-cuda-runtime=10.0 --with-cuda-cc=60 --with-cuda=$CUDA_HOME --with-scalapack=no
|
||||
# - sed -i 's/traditional/traditional -Uvector/' make.inc # problem in PGI compiler mis-understanding comments.
|
||||
# - make -j pw
|
||||
# - if [ -z ${SLURM_CPUS_PER_TASK} ]; then export OMP_NUM_THREADS=1; else export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK; fi
|
||||
# - echo "Using $OMP_NUM_THREADS threads"
|
||||
# - cd test-suite
|
||||
# - make clean
|
||||
# - make run-tests-pw-parallel
|
||||
|
||||
#
|
||||
## UtilXlib UnitTesting
|
||||
#build:cudampiomp:
|
||||
# tags: [galileo]
|
||||
# script:
|
||||
# - module load profile/advanced pgi/17.10 cuda/8.0.61
|
||||
# - ./configure --enable-openmp
|
||||
# - cd UtilXlib/tests
|
||||
# - bash compile_and_run_tests.sh -smcn
|
||||
#
|
||||
#build:intelmpiomp:
|
||||
# tags: [galileo]
|
||||
# script:
|
||||
# - module load profile/advanced intel intelmpi
|
||||
# - ./configure --enable-openmp
|
||||
# - cd UtilXlib/tests
|
||||
# - bash compile_and_run_tests.sh -sm
|
||||
#
|
||||
#build:cudampi:
|
||||
# tags: [galileo]
|
||||
# script:
|
||||
# - module load profile/advanced pgi/17.10 cuda/8.0.61
|
||||
# - ./configure
|
||||
# - cd UtilXlib/tests
|
||||
# - bash compile_and_run_tests.sh -smcn
|
||||
#
|
||||
#build:intelmpi:
|
||||
# tags: [galileo]
|
||||
# script:
|
||||
# - module load profile/advanced intel intelmpi
|
||||
# - ./configure
|
||||
# - cd UtilXlib/tests
|
||||
# - bash compile_and_run_tests.sh -sm
|
||||
#
|
||||
##### BUILDS ON GALILEO ####
|
||||
#
|
||||
#build:laxlib-unittest:
|
||||
# tags: [galileo]
|
||||
# script:
|
||||
# - module load profile/advanced pgi/17.10 cuda/8.0.61
|
||||
# - ./configure && make pw
|
||||
# - cd LAXlib/tests
|
||||
# - make clean && make
|
||||
# - for file in ./*.x; do mpirun -np 4 ./$file; done
|
||||
# - cd ../../
|
||||
# - ./configure --disable-parallel && make clean && make pw
|
||||
# - cd LAXlib/tests
|
||||
# - make clean && make
|
||||
# - for file in ./*.x; do ./$file; done
|
||||
|
|
|
@ -22,12 +22,12 @@ build:cp:
|
|||
# - make run-tests-cp-serial
|
||||
# - rm -r /tmp/save
|
||||
|
||||
build:ph:
|
||||
tags: [docker]
|
||||
image: espressofoundation/ubuntu:latest
|
||||
script:
|
||||
- ./configure
|
||||
- make ph
|
||||
#build:ph:
|
||||
# tags: [docker]
|
||||
# image: espressofoundation/ubuntu:latest
|
||||
# script:
|
||||
# - ./configure
|
||||
# - make ph
|
||||
# - export OMP_NUM_THREADS=1
|
||||
# - cd test-suite
|
||||
# - mkdir /tmp/save
|
||||
|
|
|
@ -591,6 +591,12 @@ add_custom_target(gwl
|
|||
qe_gww_util_grap_exe
|
||||
qe_gww_util_abcoefftoeps_exe
|
||||
qe_gww_util_memorypw4gww_exe
|
||||
qe_gww_bse_bse_main_exe
|
||||
qe_gww_gww_exe
|
||||
qe_gww_gww_fit_exe
|
||||
qe_gww_head_exe
|
||||
qe_gww_simple_bse_exe
|
||||
qe_gww_simple_ip_exe
|
||||
COMMENT
|
||||
"GW with Lanczos chains")
|
||||
|
||||
|
|
|
@ -175,12 +175,16 @@
|
|||
|
||||
!orthonormalize c0
|
||||
|
||||
call calbec(1,nsp,eigr,c0,bec)
|
||||
call calbec(nbsp, betae,c0,bec)
|
||||
CALL gram_bgrp( betae, bec, nkb, c0, ngw )
|
||||
|
||||
!calculates phi for pcdaga
|
||||
|
||||
#if defined (__CUDA)
|
||||
CALL errore(' runcg_uspp ', ' GPU version not yet implemented', 1 )
|
||||
#else
|
||||
CALL calphi_bgrp( c0, SIZE(c0,1), bec, nkb, betae, phi, nbsp )
|
||||
#endif
|
||||
|
||||
!calculates the factors for S and K inversion in US case
|
||||
if(nkbus>0) then
|
||||
|
@ -205,13 +209,13 @@
|
|||
|
||||
|
||||
ENERGY_CHECK: if(.not. ene_ok ) then
|
||||
call calbec(1,nsp,eigr,c0,bec)
|
||||
call calbec(nbsp, betae,c0,bec)
|
||||
if(.not.tens) then
|
||||
call rhoofr(nfi,c0(:,:),irb,eigrb,bec,dbec,rhovan,rhor,drhor,rhog,drhog,rhos,enl,denl,ekin,dekin6)
|
||||
else
|
||||
|
||||
if(newscheme.or.firstiter) then
|
||||
call inner_loop_cold( nfi, tfirst, tlast, eigr, irb, eigrb, &
|
||||
call inner_loop_cold( nfi, tfirst, tlast, eigr, irb, eigrb, &
|
||||
rhor, rhog, rhos, rhoc, ei1, ei2, ei3, sfac,c0,bec,dbec,firstiter,vpot)
|
||||
firstiter=.false.
|
||||
endif
|
||||
|
@ -238,7 +242,7 @@
|
|||
!
|
||||
! put core charge (if present) in rhoc(r)
|
||||
!
|
||||
if (nlcc_any) call set_cc(irb,eigrb,rhoc)
|
||||
if (nlcc_any) call set_cc(rhoc)
|
||||
|
||||
!
|
||||
!---ensemble-DFT
|
||||
|
@ -301,7 +305,7 @@
|
|||
|
||||
!update d
|
||||
|
||||
call newd(vpot,irb,eigrb,rhovan,fion)
|
||||
call newd(vpot,rhovan,fion,.true.)
|
||||
|
||||
|
||||
call prefor(eigr,betae)!ATTENZIONE
|
||||
|
@ -341,13 +345,13 @@
|
|||
hpsi0=hpsi
|
||||
gi = hpsi
|
||||
|
||||
call calbec(1,nsp,eigr,hpsi,becm)
|
||||
call calbec(nbsp, betae,hpsi,becm)
|
||||
call xminus1(hpsi,betae,dumm,becm,s_minus1,.false.)
|
||||
! call sminus1(hpsi,becm,betae)
|
||||
|
||||
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
||||
!look if the following two lines are really needed
|
||||
call calbec(1,nsp,eigr,hpsi,becm)
|
||||
call calbec(nbsp, betae,hpsi,becm)
|
||||
call pc2(c0,bec,hpsi,becm)
|
||||
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
||||
|
||||
|
@ -357,13 +361,13 @@
|
|||
else
|
||||
call xminus1_state(gi,betae,ema0bg,becm,k_minus1,.true.,ave_ene)
|
||||
endif
|
||||
call calbec(1,nsp,eigr,gi,becm)
|
||||
call calbec(nbsp, betae,gi,becm)
|
||||
call pc2(c0,bec,gi,becm)
|
||||
|
||||
|
||||
if(tens) call calcmt( nrlx, f, z0t, fmat0 )
|
||||
|
||||
call calbec(1,nsp,eigr,hpsi,bec0)
|
||||
call calbec(nbsp, betae,hpsi,bec0)
|
||||
|
||||
! calculates gamma
|
||||
gamma=0.d0
|
||||
|
@ -496,7 +500,7 @@
|
|||
|
||||
!project hi on conduction sub-space
|
||||
|
||||
call calbec(1,nsp,eigr,hi,bec0)
|
||||
call calbec(nbsp, betae,hi,bec0)
|
||||
call pc2(c0,bec,hi,bec0)
|
||||
|
||||
|
||||
|
@ -565,7 +569,7 @@
|
|||
|
||||
!orthonormalize
|
||||
|
||||
call calbec(1,nsp,eigr,cm,becm)
|
||||
call calbec(nbsp, betae,cm,becm)
|
||||
CALL gram_bgrp( betae, becm, nkb, cm, ngw )
|
||||
|
||||
!calculate energy
|
||||
|
@ -573,7 +577,7 @@
|
|||
call rhoofr(nfi,cm(:,:),irb,eigrb,becm,dbec,rhovan,rhor,drhor,rhog,drhog,rhos,enl,denl,ekin,dekin6)
|
||||
else
|
||||
if(newscheme) then
|
||||
call inner_loop_cold( nfi, tfirst, tlast, eigr, irb, eigrb, &
|
||||
call inner_loop_cold( nfi, tfirst, tlast, eigr, irb, eigrb, &
|
||||
rhor, rhog, rhos, rhoc, ei1, ei2, ei3, sfac,cm,becm,dbec,.false., vpot )
|
||||
endif
|
||||
|
||||
|
@ -588,7 +592,7 @@
|
|||
!
|
||||
! put core charge (if present) in rhoc(r)
|
||||
!
|
||||
if (nlcc_any) call set_cc(irb,eigrb,rhoc)
|
||||
if (nlcc_any) call set_cc(rhoc)
|
||||
!
|
||||
vpot = rhor
|
||||
!
|
||||
|
@ -628,7 +632,7 @@
|
|||
cm(1,:)=0.5d0*(cm(1,:)+CONJG(cm(1,:)))
|
||||
endif
|
||||
|
||||
call calbec(1,nsp,eigr,cm,becm)
|
||||
call calbec(nbsp, betae,cm,becm)
|
||||
CALL gram_bgrp( betae, becm, nkb, cm, ngw )
|
||||
|
||||
!test on energy: check the energy has really diminished
|
||||
|
@ -638,7 +642,7 @@
|
|||
call rhoofr(nfi,cm(:,:),irb,eigrb,becm,dbec,rhovan,rhor,drhor,rhog,drhog,rhos,enl,denl,ekin,dekin6)
|
||||
else
|
||||
if(newscheme) then
|
||||
call inner_loop_cold( nfi, tfirst, tlast, eigr, irb, eigrb, &
|
||||
call inner_loop_cold( nfi, tfirst, tlast, eigr, irb, eigrb, &
|
||||
rhor, rhog, rhos, rhoc, ei1, ei2, ei3, sfac,cm,becm,dbec,.false., vpot )
|
||||
endif
|
||||
! calculation of the rotated quantities
|
||||
|
@ -652,7 +656,7 @@
|
|||
!
|
||||
! put core charge (if present) in rhoc(r)
|
||||
!
|
||||
if (nlcc_any) call set_cc(irb,eigrb,rhoc)
|
||||
if (nlcc_any) call set_cc(rhoc)
|
||||
!
|
||||
vpot = rhor
|
||||
!
|
||||
|
@ -699,7 +703,7 @@
|
|||
endif
|
||||
c0=c0+spasso*passov*hi
|
||||
restartcg=.true.
|
||||
call calbec(1,nsp,eigr,c0,bec)
|
||||
call calbec(nbsp, betae,c0,bec)
|
||||
CALL gram_bgrp( betae, bec, nkb, c0, ngw )
|
||||
ene_ok=.false.
|
||||
!if ene1 << energy < ene0; go to ene1
|
||||
|
@ -709,7 +713,7 @@
|
|||
endif
|
||||
c0=c0+spasso*passov*hi
|
||||
restartcg=.true.!ATTENZIONE
|
||||
call calbec(1,nsp,eigr,c0,bec)
|
||||
call calbec(nbsp, betae,c0,bec)
|
||||
CALL gram_bgrp( betae, bec, nkb, c0, ngw )
|
||||
!if ene > ene0,en1 do a steepest descent step
|
||||
ene_ok=.false.
|
||||
|
@ -725,14 +729,14 @@
|
|||
cm=c0+spasso*passov*hi
|
||||
! chenge the searching direction
|
||||
spasso=spasso*(-1.d0)
|
||||
call calbec(1,nsp,eigr,cm,becm)
|
||||
call calbec(nbsp, betae,cm,becm)
|
||||
CALL gram_bgrp( betae, bec, nkb, cm, ngw )
|
||||
call calbec(1,nsp,eigr,cm,becm)
|
||||
call calbec(nbsp, betae,cm,becm)
|
||||
if(.not.tens) then
|
||||
call rhoofr(nfi,cm(:,:),irb,eigrb,becm,dbec,rhovan,rhor,drhor,rhog,drhog,rhos,enl,denl,ekin,dekin6)
|
||||
else
|
||||
if(newscheme) then
|
||||
call inner_loop_cold( nfi, tfirst, tlast, eigr, irb, eigrb, &
|
||||
call inner_loop_cold( nfi, tfirst, tlast, eigr, irb, eigrb, &
|
||||
rhor, rhog, rhos, rhoc, ei1, ei2, ei3, sfac,cm,becm,dbec,.false., vpot )
|
||||
endif
|
||||
! calculation of the rotated quantities
|
||||
|
@ -746,7 +750,7 @@
|
|||
!
|
||||
! put core charge (if present) in rhoc(r)
|
||||
!
|
||||
if (nlcc_any) call set_cc(irb,eigrb,rhoc)
|
||||
if (nlcc_any) call set_cc(rhoc)
|
||||
!
|
||||
vpot = rhor
|
||||
!
|
||||
|
@ -776,10 +780,14 @@
|
|||
|
||||
if(tens.and.newscheme) enever=enever-entropy
|
||||
|
||||
if(.not. ene_ok) call calbec (1,nsp,eigr,c0,bec)
|
||||
if(.not. ene_ok) call calbec (nbsp, betae,c0,bec)
|
||||
|
||||
#if defined (__CUDA)
|
||||
CALL errore(' runcg_uspp ', ' GPU version not yet implemented', 1 )
|
||||
#else
|
||||
!calculates phi for pc_daga
|
||||
CALL calphi_bgrp( c0, SIZE(c0,1), bec, nkb, betae, phi, nbsp )
|
||||
#endif
|
||||
|
||||
!=======================================================================
|
||||
!
|
||||
|
@ -788,7 +796,7 @@
|
|||
!
|
||||
!=======================================================================
|
||||
if(tens.and. .not.newscheme) then
|
||||
call inner_loop_cold( nfi, tfirst, tlast, eigr, irb, eigrb, &
|
||||
call inner_loop_cold( nfi, tfirst, tlast, eigr, irb, eigrb, &
|
||||
rhor, rhog, rhos, rhoc, ei1, ei2, ei3, sfac,c0,bec,dbec,firstiter, vpot )
|
||||
!the following sets up the new energy
|
||||
enever=etot
|
||||
|
@ -809,7 +817,7 @@
|
|||
!calculates atomic forces and lambda
|
||||
|
||||
if(tpre) then!if pressure is need the following is written because of caldbec
|
||||
call calbec(1,nsp,eigr,c0,bec)
|
||||
call calbec(nbsp, betae,c0,bec)
|
||||
if(.not.tens) then
|
||||
call caldbec_bgrp( eigr, c0, dbec, idesc )
|
||||
call rhoofr(nfi,c0(:,:),irb,eigrb,bec,dbec,rhovan,rhor,drhor,rhog,drhog,rhos,enl,denl,ekin,dekin6)
|
||||
|
@ -827,7 +835,7 @@
|
|||
!
|
||||
! put core charge (if present) in rhoc(r)
|
||||
!
|
||||
if (nlcc_any) call set_cc(irb,eigrb,rhoc)
|
||||
if (nlcc_any) call set_cc(rhoc)
|
||||
|
||||
!
|
||||
!---ensemble-DFT
|
||||
|
@ -844,12 +852,16 @@
|
|||
|
||||
call calcmt( nrlx, f, z0t, fmat0 )
|
||||
|
||||
call newd(vpot,irb,eigrb,rhovan,fion)
|
||||
call newd(vpot,rhovan,fion,.true.)
|
||||
#if defined (__CUDA)
|
||||
CALL errore(' runcg_uspp ', ' GPU version not yet implemented', 1 )
|
||||
#else
|
||||
if (.not.tens) then
|
||||
if (tfor .or. tprnfor) call nlfq_bgrp( c0, eigr, bec, becdr, fion ) ! call nlfq(c0,eigr,bec,becdr,fion)
|
||||
if (tfor .or. tprnfor) call nlfq_bgrp( c0, betae, bec, becdr, fion )
|
||||
else
|
||||
if (tfor .or. tprnfor) call nlfq_bgrp( c0diag, eigr, becdiag, becdrdiag, fion ) ! call nlfq(c0diag,eigr,becdiag,becdrdiag,fion)
|
||||
if (tfor .or. tprnfor) call nlfq_bgrp( c0diag, betae, becdiag, becdrdiag, fion )
|
||||
endif
|
||||
#endif
|
||||
|
||||
call prefor(eigr,betae)
|
||||
do i=1,nbsp,2
|
||||
|
@ -955,9 +967,9 @@
|
|||
enddo
|
||||
enddo
|
||||
|
||||
call calbec (1,nsp,eigr,c0,bec)
|
||||
call calbec (nbsp,betae,c0,bec)
|
||||
CALL gram_bgrp( betae, bec, nkb, c0, ngw )
|
||||
call calbec(1,nsp,eigr,c0,bec)
|
||||
call calbec(nbsp, betae,c0,bec)
|
||||
|
||||
|
||||
|
||||
|
@ -998,7 +1010,7 @@
|
|||
CALL mp_sum( lambda_repl, intra_bgrp_comm )
|
||||
CALL distribute_lambda( lambda_repl, lambda( :, :, 1 ), idesc( :, 1 ) )
|
||||
cm(:,:)=c0(:,:)
|
||||
call calbec (1,nsp,eigr,cm,becm)
|
||||
call calbec (nbsp, betae,cm,becm)
|
||||
|
||||
endif
|
||||
DEALLOCATE( lambda_repl )
|
||||
|
@ -1031,7 +1043,7 @@
|
|||
!
|
||||
DEALLOCATE( lambda_dist )
|
||||
!
|
||||
call nlsm2_bgrp( ngw, nkb, eigr, c0, becdr, nbspx, nbsp )
|
||||
call nlsm2_bgrp( ngw, nkb, betae, c0, becdr, nbspx, nbsp )
|
||||
!
|
||||
endif
|
||||
!
|
||||
|
|
|
@ -6,9 +6,16 @@
|
|||
! or http://www.gnu.org/copyleft/gpl.txt .
|
||||
!
|
||||
|
||||
#if defined(__CUDA)
|
||||
#define DEVICEATTR ,DEVICE
|
||||
#else
|
||||
#define DEVICEATTR
|
||||
#endif
|
||||
|
||||
!-----------------------------------------------------------------------
|
||||
SUBROUTINE rhoofr_cp &
|
||||
( nfi, c_bgrp, irb, eigrb, bec_bgrp, dbec, rhovan, rhor, drhor, rhog, drhog, rhos, enl, denl, ekin, dekin, tstress, ndwwf )
|
||||
( nfi, c_bgrp, c_d, bec_bgrp, dbec, rhovan, rhor, drhor, rhog, drhog, rhos, &
|
||||
enl, denl, ekin, dekin, tstress, ndwwf )
|
||||
!-----------------------------------------------------------------------
|
||||
!
|
||||
! this routine computes:
|
||||
|
@ -48,7 +55,7 @@
|
|||
USE uspp, ONLY: nkb
|
||||
USE uspp_param, ONLY: nh, nhm
|
||||
USE cell_base, ONLY: omega
|
||||
USE electrons_base, ONLY: nspin, nbsp_bgrp, ispin_bgrp, f_bgrp
|
||||
USE electrons_base, ONLY: nspin, nbsp_bgrp, ispin_bgrp, f_bgrp, f_d
|
||||
USE constants, ONLY: pi, fpi
|
||||
USE mp, ONLY: mp_sum
|
||||
USE io_global, ONLY: stdout, ionode
|
||||
|
@ -60,7 +67,7 @@
|
|||
USE fft_interfaces, ONLY: fwfft, invfft
|
||||
USE fft_base, ONLY: dffts, dfftp
|
||||
USE cp_interfaces, ONLY: checkrho, ennl, calrhovan, dennl
|
||||
USE cp_main_variables, ONLY: iprint_stdout, idesc
|
||||
USE cp_main_variables, ONLY: iprint_stdout, idesc, irb, eigrb
|
||||
USE wannier_base, ONLY: iwf
|
||||
USE exx_module, ONLY: rhopr
|
||||
USE input_parameters, ONLY: tcpbo ! BS
|
||||
|
@ -68,6 +75,8 @@
|
|||
USE io_files, ONLY: restart_dir
|
||||
USE fft_rho
|
||||
USE fft_helper_subroutines, ONLY: c2psi_gamma
|
||||
USE mp, ONLY: mp_barrier
|
||||
USE mp_world, ONLY: mpime, world_comm
|
||||
!
|
||||
IMPLICIT NONE
|
||||
INTEGER nfi
|
||||
|
@ -79,11 +88,10 @@
|
|||
REAL(DP) rhos(:,:)
|
||||
REAL(DP) enl, ekin
|
||||
REAL(DP) denl(3,3), dekin(6)
|
||||
COMPLEX(DP) eigrb( :, : )
|
||||
COMPLEX(DP) rhog( :, : )
|
||||
COMPLEX(DP) drhog( :, :, :, : )
|
||||
COMPLEX(DP) c_bgrp( :, : )
|
||||
INTEGER irb( :, : )
|
||||
COMPLEX(DP) DEVICEATTR :: c_d( :, : )
|
||||
LOGICAL, OPTIONAL, INTENT(IN) :: tstress
|
||||
INTEGER, OPTIONAL, INTENT(IN) :: ndwwf
|
||||
|
||||
|
@ -119,7 +127,11 @@
|
|||
!
|
||||
! calculation of kinetic energy ekin
|
||||
!
|
||||
#if defined (__CUDA)
|
||||
ekin = enkin( c_d, f_d, nbsp_bgrp )
|
||||
#else
|
||||
ekin = enkin( c_bgrp, f_bgrp, nbsp_bgrp )
|
||||
#endif
|
||||
!
|
||||
IF( nbgrp > 1 ) &
|
||||
CALL mp_sum( ekin, inter_bgrp_comm )
|
||||
|
@ -217,6 +229,9 @@
|
|||
CALL errore( ' rhoofr ', ' c second dimension too small ', SIZE( c_bgrp, 2 ) )
|
||||
!
|
||||
c_bgrp( :, nbsp_bgrp + 1 ) = ( 0.d0, 0.d0 )
|
||||
#if defined (__CUDA)
|
||||
c_d( :, nbsp_bgrp + 1 ) = ( 0.d0, 0.d0 )
|
||||
#endif
|
||||
!
|
||||
ENDIF
|
||||
!
|
||||
|
@ -236,7 +251,11 @@
|
|||
!
|
||||
ELSE
|
||||
!
|
||||
#if defined (__CUDA)
|
||||
CALL loop_over_states_gpu()
|
||||
#else
|
||||
CALL loop_over_states()
|
||||
#endif
|
||||
!
|
||||
END IF
|
||||
!
|
||||
|
@ -260,7 +279,7 @@
|
|||
DEALLOCATE( drhovan )
|
||||
END IF
|
||||
!
|
||||
CALL rhov( irb, eigrb, rhovan, rhog, rhor )
|
||||
CALL rhov( rhovan, rhog, rhor )
|
||||
|
||||
ENDIF COMPUTE_CHARGE
|
||||
!
|
||||
|
@ -359,6 +378,10 @@
|
|||
#endif
|
||||
REAL(DP), ALLOCATABLE :: tmp_rhos(:,:)
|
||||
|
||||
IF( fftx_ntgrp(dffts) > 1 ) THEN
|
||||
CALL errore('rhoofr','Task group not supported',1)
|
||||
END IF
|
||||
|
||||
ALLOCATE( psis( dffts%nnr_tg ) )
|
||||
!
|
||||
CALL tg_get_group_nr3( dffts, tg_nr3 )
|
||||
|
@ -369,17 +392,9 @@
|
|||
|
||||
do i = 1, nbsp_bgrp, 2 * fftx_ntgrp(dffts)
|
||||
|
||||
#if defined(__MPI)
|
||||
!
|
||||
CALL c2psi_gamma_tg(dffts, psis, c_bgrp, i, nbsp_bgrp )
|
||||
|
||||
CALL invfft ('tgWave', psis, dffts )
|
||||
#else
|
||||
CALL c2psi_gamma( dffts, psis, c_bgrp(:,i), c_bgrp(:,i+1) )
|
||||
|
||||
CALL invfft('Wave', psis, dffts )
|
||||
|
||||
#endif
|
||||
!
|
||||
! Now the first proc of the group holds the first two bands
|
||||
! of the 2*nogrp bands that we are processing at the same time,
|
||||
|
@ -446,6 +461,103 @@
|
|||
RETURN
|
||||
END SUBROUTINE loop_over_states
|
||||
|
||||
#if defined (__CUDA)
|
||||
|
||||
SUBROUTINE loop_over_states_gpu
|
||||
!
|
||||
USE parallel_include
|
||||
USE fft_helper_subroutines
|
||||
USE control_flags, ONLY : many_fft
|
||||
USE cudafor
|
||||
!
|
||||
! MAIN LOOP OVER THE EIGENSTATES
|
||||
! - This loop is also parallelized within the task-groups framework
|
||||
! - Each group works on a number of eigenstates in parallel
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
INTEGER :: from, i, ig, eig_index, eig_offset, ii, tg_nr3, ioff
|
||||
!
|
||||
REAL(DP), DEVICE, ALLOCATABLE :: rhos_d(:,:)
|
||||
COMPLEX(DP), DEVICE, ALLOCATABLE :: psis(:)
|
||||
COMPLEX(DP), DEVICE, ALLOCATABLE :: ptmp(:,:)
|
||||
INTEGER, DEVICE, POINTER :: nl_d(:), nlm_d(:)
|
||||
|
||||
ALLOCATE( psis( dffts%nnr * many_fft ) ) ! dffts%nnr * many_fft
|
||||
ALLOCATE( rhos_d ( SIZE(rhos,1), SIZE(rhos,2) ) )
|
||||
!
|
||||
rhos_d = 0_DP
|
||||
nl_d => dffts%nl_d
|
||||
nlm_d => dffts%nlm_d
|
||||
|
||||
do i = 1, nbsp_bgrp, 2 * many_fft
|
||||
|
||||
psis = 0.0d0
|
||||
|
||||
ioff = 0
|
||||
DO ii = i, i + 2 * many_fft - 1, 2
|
||||
IF( ii < nbsp_bgrp ) THEN
|
||||
!$cuf kernel do(1)
|
||||
do ig = 1, dffts%ngw
|
||||
psis( nlm_d( ig ) + ioff) = CONJG( c_d( ig, ii ) ) + ci * conjg( c_d( ig, ii+1 ))
|
||||
psis( nl_d( ig ) + ioff) = c_d( ig, ii ) + ci * c_d( ig, ii+1 )
|
||||
end do
|
||||
ELSE IF( ii == nbsp_bgrp ) THEN
|
||||
!$cuf kernel do(1)
|
||||
do ig = 1, dffts%ngw
|
||||
psis( nlm_d( ig ) + ioff) = CONJG( c_d( ig, ii ) )
|
||||
psis( nl_d( ig ) + ioff) = c_d( ig, ii )
|
||||
end do
|
||||
END IF
|
||||
! CALL c2psi_gamma( dffts, psis, c_bgrp(:,ii), c_bgrp(:,ii+1) )
|
||||
ioff = ioff + dffts%nnr
|
||||
|
||||
END DO
|
||||
|
||||
CALL invfft('Wave', psis, dffts, many_fft )
|
||||
|
||||
ioff = 0
|
||||
DO ii = i, i + 2 * many_fft - 1, 2
|
||||
IF( ii < nbsp_bgrp ) THEN
|
||||
iss1=ispin_bgrp( ii )
|
||||
sa1 =f_bgrp( ii )/omega
|
||||
iss2=ispin_bgrp( ii + 1 )
|
||||
sa2 =f_bgrp( ii + 1 )/omega
|
||||
!$cuf kernel do(1)
|
||||
do ir = 1, dffts%nnr
|
||||
rhos_d(ir,iss1) = rhos_d(ir,iss1) + sa1*( real(psis(ir + ioff)))**2
|
||||
rhos_d(ir,iss2) = rhos_d(ir,iss2) + sa2*(aimag(psis(ir + ioff)))**2
|
||||
end do
|
||||
ELSE IF( ii == nbsp_bgrp ) THEN
|
||||
iss1=ispin_bgrp( ii )
|
||||
sa1 =f_bgrp( ii )/omega
|
||||
iss2=iss1
|
||||
sa2=0.0d0
|
||||
!$cuf kernel do(1)
|
||||
do ir = 1, dffts%nnr
|
||||
rhos_d(ir,iss1) = rhos_d(ir,iss1) + sa1*( real(psis(ir + ioff)))**2
|
||||
END DO
|
||||
END IF
|
||||
ioff = ioff + dffts%nnr
|
||||
END DO
|
||||
!
|
||||
END DO
|
||||
|
||||
rhos = rhos_d
|
||||
|
||||
IF( nbgrp > 1 ) THEN
|
||||
CALL mp_sum( rhos, inter_bgrp_comm )
|
||||
END IF
|
||||
|
||||
DEALLOCATE( rhos_d )
|
||||
DEALLOCATE( psis )
|
||||
DEALLOCATE( ptmp )
|
||||
|
||||
RETURN
|
||||
END SUBROUTINE loop_over_states_gpu
|
||||
#endif
|
||||
|
||||
|
||||
!-----------------------------------------------------------------------
|
||||
END SUBROUTINE rhoofr_cp
|
||||
!-----------------------------------------------------------------------
|
||||
|
@ -525,50 +637,32 @@ SUBROUTINE drhov(irb,eigrb,rhovan,drhovan,rhog,rhor,drhog,drhor)
|
|||
REAL(DP), INTENT(OUT) :: drhor(dfftp%nnr,nspin,3,3)
|
||||
COMPLEX(DP), INTENT(OUT) :: drhog(dfftp%ngm,nspin,3,3)
|
||||
! local
|
||||
INTEGER i, j, isup, isdw, iv, jv, ig, ijv, is, iss, ia, ir, ijs
|
||||
INTEGER i, j, isup, isdw, iv, jv, ig, ijv, is, iss, ia, ir, ijs, itid
|
||||
REAL(DP) :: asumt, dsumt
|
||||
COMPLEX(DP) fp, fm, ci
|
||||
#if defined(__INTEL_COMPILER)
|
||||
#if __INTEL_COMPILER >= 1300
|
||||
!dir$ attributes align: 4096 :: v, dqgbt,qv
|
||||
#endif
|
||||
#endif
|
||||
COMPLEX(DP), ALLOCATABLE :: v(:)
|
||||
COMPLEX(DP), ALLOCATABLE:: dqgbt(:,:)
|
||||
COMPLEX(DP), ALLOCATABLE :: qv(:)
|
||||
COMPLEX(DP), ALLOCATABLE :: fg1(:), fg2(:)
|
||||
!
|
||||
INTEGER :: itid, mytid, ntids
|
||||
#if defined(_OPENMP)
|
||||
INTEGER :: omp_get_thread_num, omp_get_num_threads
|
||||
EXTERNAL :: omp_get_thread_num, omp_get_num_threads
|
||||
#endif
|
||||
!
|
||||
!$omp parallel default(none), private(i,j,iss,ir,ig,mytid,ntids,itid), shared(nspin,dfftp,drhor,drhog,rhor,rhog,ainv)
|
||||
#if defined(_OPENMP)
|
||||
mytid = omp_get_thread_num() ! take the thread ID
|
||||
ntids = omp_get_num_threads() ! take the number of threads
|
||||
#else
|
||||
mytid = 0
|
||||
ntids = 1
|
||||
#endif
|
||||
itid = 0
|
||||
!$omp parallel do collapse(3) default(none), private(i,j,iss,ir,ig), shared(nspin,dfftp,drhor,drhog,rhor,rhog,ainv)
|
||||
DO j=1,3
|
||||
DO i=1,3
|
||||
DO iss=1,nspin
|
||||
IF( MOD( itid, ntids ) == mytid ) THEN
|
||||
DO ir=1,dfftp%nnr
|
||||
drhor(ir,iss,i,j)=-rhor(ir,iss)*ainv(j,i)
|
||||
END DO
|
||||
DO ig=1,dfftp%ngm
|
||||
drhog(ig,iss,i,j)=-rhog(ig,iss)*ainv(j,i)
|
||||
END DO
|
||||
END IF
|
||||
itid = itid + 1
|
||||
DO ir=1,dfftp%nnr
|
||||
drhor(ir,iss,i,j)=-rhor(ir,iss)*ainv(j,i)
|
||||
END DO
|
||||
DO ig=1,dfftp%ngm
|
||||
drhog(ig,iss,i,j)=-rhog(ig,iss)*ainv(j,i)
|
||||
END DO
|
||||
END DO
|
||||
END DO
|
||||
END DO
|
||||
!$omp end parallel
|
||||
!$omp end parallel do
|
||||
|
||||
IF ( nkbus <= 0 ) THEN
|
||||
GO TO 1000
|
||||
|
@ -589,8 +683,8 @@ SUBROUTINE drhov(irb,eigrb,rhovan,drhovan,rhog,rhor,drhog,drhor)
|
|||
|
||||
!$omp parallel default(none) &
|
||||
!$omp shared(nat, ityp, ngb, nh, eigrb, dfftb, irb, v, &
|
||||
!$omp ci, i, j, dqgb, qgb, nhm, rhovan, drhovan, upf ) &
|
||||
!$omp private(mytid, ntids, is, ia, iv, jv, ijv, ig, iss, &
|
||||
!$omp i, j, dqgb, qgb, nhm, rhovan, drhovan, upf ) &
|
||||
!$omp private( is, ia, iv, jv, ijv, ig, iss, &
|
||||
!$omp qv, fg1, fg2, itid, dqgbt, dsumt, asumt )
|
||||
|
||||
ALLOCATE( qv( dfftb%nnr ) )
|
||||
|
@ -598,12 +692,7 @@ SUBROUTINE drhov(irb,eigrb,rhovan,drhovan,rhog,rhor,drhog,drhor)
|
|||
ALLOCATE( fg1( ngb ) )
|
||||
ALLOCATE( fg2( ngb ) )
|
||||
|
||||
#if defined(_OPENMP)
|
||||
mytid = omp_get_thread_num() ! take the thread ID
|
||||
ntids = omp_get_num_threads() ! take the number of threads
|
||||
itid = 0
|
||||
#endif
|
||||
|
||||
iss=1
|
||||
|
||||
DO ia=1,nat
|
||||
|
@ -617,11 +706,11 @@ SUBROUTINE drhov(irb,eigrb,rhovan,drhovan,rhog,rhor,drhog,drhor)
|
|||
#endif
|
||||
|
||||
#if defined(_OPENMP)
|
||||
IF ( mytid /= itid ) THEN
|
||||
itid = MOD( itid + 1, ntids )
|
||||
IF ( omp_get_thread_num() /= itid ) THEN
|
||||
itid = MOD( itid + 1, omp_get_num_threads() )
|
||||
CYCLE
|
||||
ELSE
|
||||
itid = MOD( itid + 1, ntids )
|
||||
itid = MOD( itid + 1, omp_get_num_threads() )
|
||||
END IF
|
||||
#endif
|
||||
|
||||
|
@ -761,7 +850,7 @@ END SUBROUTINE drhov
|
|||
|
||||
!
|
||||
!-----------------------------------------------------------------------
|
||||
SUBROUTINE rhov(irb,eigrb,rhovan,rhog,rhor)
|
||||
SUBROUTINE rhov(rhovan,rhog,rhor)
|
||||
!-----------------------------------------------------------------------
|
||||
! Add Vanderbilt contribution to rho(r) and rho(g)
|
||||
!
|
||||
|
@ -772,7 +861,7 @@ SUBROUTINE rhov(irb,eigrb,rhovan,rhog,rhor)
|
|||
USE kinds, ONLY: dp
|
||||
USE ions_base, ONLY: nat, na, nsp, ityp
|
||||
USE io_global, ONLY: stdout
|
||||
USE mp_global, ONLY: intra_bgrp_comm
|
||||
USE mp_global, ONLY: intra_bgrp_comm, inter_bgrp_comm
|
||||
USE mp, ONLY: mp_sum
|
||||
USE uspp_param, ONLY: nh, nhm, upf
|
||||
USE uspp, ONLY: deeq, nkbus
|
||||
|
@ -786,19 +875,21 @@ SUBROUTINE rhov(irb,eigrb,rhovan,rhog,rhor)
|
|||
USE fft_interfaces, ONLY: fwfft, invfft
|
||||
USE fft_base, ONLY: dfftb, dfftp, dfftb
|
||||
USE fft_helper_subroutines, ONLY: fftx_add_threed2oned_gamma
|
||||
USE cp_main_variables, ONLY: irb, eigrb, iabox, nabox
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
REAL(DP), INTENT(IN) :: rhovan(nhm*(nhm+1)/2,nat,nspin)
|
||||
INTEGER, INTENT(in) :: irb(3,nat)
|
||||
COMPLEX(DP), INTENT(in):: eigrb(ngb,nat)
|
||||
!
|
||||
REAL(DP), INTENT(inout):: rhor(dfftp%nnr,nspin)
|
||||
COMPLEX(DP), INTENT(inout):: rhog(dfftp%ngm,nspin)
|
||||
!
|
||||
INTEGER :: isup, isdw, nfft, ifft, iv, jv, ig, ijv, is, iss, isa, ia, ir, i, j
|
||||
INTEGER, PARAMETER :: isup = 1
|
||||
INTEGER, PARAMETER :: isdw = 2
|
||||
INTEGER :: iv, jv, ig, ijv, is, iss, ia, ir, i, j, iia
|
||||
REAL(DP) :: sumrho
|
||||
COMPLEX(DP) :: ci, fp, fm, ca
|
||||
COMPLEX(DP) :: fp, fm, ca
|
||||
COMPLEX(DP), PARAMETER :: ci=(0.d0,1.d0)
|
||||
#if defined(__INTEL_COMPILER)
|
||||
#if __INTEL_COMPILER >= 1300
|
||||
!dir$ attributes align: 4096 :: qgbt, v, qv
|
||||
|
@ -809,8 +900,8 @@ SUBROUTINE rhov(irb,eigrb,rhovan,rhog,rhor)
|
|||
COMPLEX(DP), ALLOCATABLE :: qv(:)
|
||||
COMPLEX(DP), ALLOCATABLE :: fg1(:), fg2(:)
|
||||
|
||||
INTEGER :: mytid, ntids
|
||||
#if defined(_OPENMP)
|
||||
INTEGER :: itid, mytid, ntids
|
||||
INTEGER :: omp_get_thread_num, omp_get_num_threads
|
||||
EXTERNAL :: omp_get_thread_num, omp_get_num_threads
|
||||
#endif
|
||||
|
@ -822,7 +913,6 @@ SUBROUTINE rhov(irb,eigrb,rhovan,rhog,rhor)
|
|||
END IF
|
||||
|
||||
CALL start_clock( 'rhov' )
|
||||
ci=(0.d0,1.d0)
|
||||
!
|
||||
!
|
||||
ALLOCATE( v( dfftp%nnr ) )
|
||||
|
@ -830,13 +920,8 @@ SUBROUTINE rhov(irb,eigrb,rhovan,rhog,rhor)
|
|||
! private variable need to be initialized, otherwise
|
||||
! outside the parallel region they have an undetermined value
|
||||
!
|
||||
#if defined(_OPENMP)
|
||||
mytid = 0
|
||||
ntids = 1
|
||||
itid = 0
|
||||
#endif
|
||||
iss = 1
|
||||
isa = 1
|
||||
!
|
||||
IF(nspin.EQ.1) THEN
|
||||
!
|
||||
|
@ -845,12 +930,8 @@ SUBROUTINE rhov(irb,eigrb,rhovan,rhog,rhor)
|
|||
|
||||
!$omp parallel default(none) &
|
||||
!$omp shared(na, ngb, nh, rhovan, qgb, eigrb, dfftb, iverbosity, omegab, irb, v, &
|
||||
!$omp stdout, ci, rhor, dfftp, upf, nsp, ityp, nat ) &
|
||||
!$omp private(mytid, ntids, is, ia, nfft, ifft, iv, jv, ijv, sumrho, qgbt, ig, iss, isa, ca, &
|
||||
!$omp qv, fg1, fg2, itid, ir )
|
||||
|
||||
iss=1
|
||||
isa=1
|
||||
!$omp stdout, rhor, dfftp, upf, nsp, ityp, nat, nspin, iabox, nabox, inter_bgrp_comm ) &
|
||||
!$omp private(mytid, ntids, is, ia, iia, iv, jv, ijv, sumrho, qgbt, ig, ca, qv, ir )
|
||||
|
||||
!$omp workshare
|
||||
v (:) = (0.d0, 0.d0)
|
||||
|
@ -859,149 +940,96 @@ SUBROUTINE rhov(irb,eigrb,rhovan,rhog,rhor)
|
|||
#if defined(_OPENMP)
|
||||
mytid = omp_get_thread_num() ! take the thread ID
|
||||
ntids = omp_get_num_threads() ! take the number of threads
|
||||
itid = 0
|
||||
#endif
|
||||
|
||||
ALLOCATE( qgbt( ngb, 2 ) )
|
||||
ALLOCATE( qgbt( ngb, nspin ) )
|
||||
ALLOCATE( qv( dfftb%nnr ) )
|
||||
ALLOCATE( fg1( ngb ) )
|
||||
ALLOCATE( fg2( ngb ) )
|
||||
|
||||
|
||||
DO ia = 1, nat
|
||||
|
||||
is = ityp(ia)
|
||||
|
||||
IF( upf(is)%tvanp ) THEN
|
||||
|
||||
nfft = 1
|
||||
|
||||
#if defined(__MPI)
|
||||
IF ( ( dfftb%np3( ia ) <= 0 ) .OR. ( dfftb%np2( ia ) <= 0 ) ) THEN
|
||||
CYCLE
|
||||
END IF
|
||||
#endif
|
||||
#if defined(_OPENMP)
|
||||
IF ( mytid /= itid ) THEN
|
||||
itid = MOD( itid + 1, ntids )
|
||||
CYCLE
|
||||
ELSE
|
||||
itid = MOD( itid + 1, ntids )
|
||||
END IF
|
||||
#endif
|
||||
|
||||
DO ifft=1,nfft
|
||||
qgbt(:,ifft) = (0.d0, 0.d0)
|
||||
DO iv= 1,nh(is)
|
||||
DO jv=iv,nh(is)
|
||||
ijv = (jv-1)*jv/2 + iv
|
||||
sumrho=rhovan(ijv,ia+ifft-1,iss)
|
||||
IF(iv.NE.jv) sumrho=2.d0*sumrho
|
||||
DO ig=1,ngb
|
||||
qgbt(ig,ifft)=qgbt(ig,ifft) + sumrho*qgb(ig,ijv,is)
|
||||
END DO
|
||||
DO iia = 1, nabox
|
||||
IF( MOD( iia - 1, ntids ) == mytid ) THEN
|
||||
ia = iabox(iia)
|
||||
is = ityp(ia)
|
||||
qgbt(:,1) = (0.d0, 0.d0)
|
||||
DO iv= 1,nh(is)
|
||||
DO jv=iv,nh(is)
|
||||
ijv = (jv-1)*jv/2 + iv
|
||||
sumrho=rhovan(ijv,ia,1)
|
||||
IF(iv.NE.jv) sumrho=2.d0*sumrho
|
||||
DO ig=1,ngb
|
||||
qgbt(ig,1)=qgbt(ig,1) + sumrho*qgb(ig,ijv,is)
|
||||
END DO
|
||||
END DO
|
||||
END DO
|
||||
!
|
||||
! add structure factor
|
||||
!
|
||||
IF(nfft.EQ.2)THEN
|
||||
fg1 = eigrb(1:ngb,ia )*qgbt(1:ngb,1)
|
||||
fg2 = eigrb(1:ngb,ia+1 )*qgbt(1:ngb,2)
|
||||
CALL fft_oned2box( qv, fg1, fg2 )
|
||||
ELSE
|
||||
fg1 = eigrb(1:ngb,ia )*qgbt(1:ngb,1)
|
||||
CALL fft_oned2box( qv, fg1 )
|
||||
ENDIF
|
||||
qgbt(1:ngb,1) = eigrb(1:ngb,ia)*qgbt(1:ngb,1)
|
||||
CALL fft_oned2box( qv, qgbt(:,1) )
|
||||
|
||||
CALL invfft( qv, dfftb, ia )
|
||||
!
|
||||
! qv = US augmentation charge in real space on box grid
|
||||
! for atomic species is, real(qv)=atom ia, imag(qv)=atom ia+1
|
||||
|
||||
IF( iverbosity > 1 ) THEN
|
||||
ca = SUM(qv)
|
||||
WRITE( stdout,'(a,f12.8)') ' rhov: 1-atom g-sp = ', &
|
||||
& omegab*DBLE(qgbt(1,1))
|
||||
WRITE( stdout,'(a,f12.8)') ' rhov: 1-atom r-sp = ', &
|
||||
& omegab*DBLE(ca)/(dfftb%nr1*dfftb%nr2*dfftb%nr3)
|
||||
WRITE( stdout,'(a,f12.8)') ' rhov: 1-atom g-sp = ', &
|
||||
& omegab*DBLE(qgbt(1,2))
|
||||
WRITE( stdout,'(a,f12.8)') ' rhov: 1-atom r-sp = ', &
|
||||
& omegab*AIMAG(ca)/(dfftb%nr1*dfftb%nr2*dfftb%nr3)
|
||||
ENDIF
|
||||
!
|
||||
! add qv(r) to v(r), in real space on the dense grid
|
||||
!
|
||||
CALL box2grid(irb(:,ia),1,qv,v)
|
||||
IF (nfft.EQ.2) CALL box2grid(irb(:,ia+1),2,qv,v)
|
||||
!
|
||||
END IF
|
||||
END DO
|
||||
|
||||
DEALLOCATE( fg1 )
|
||||
DEALLOCATE( fg2 )
|
||||
DEALLOCATE(qv)
|
||||
DEALLOCATE(qgbt)
|
||||
!
|
||||
! rhor(r) = total (smooth + US) charge density in real space
|
||||
!
|
||||
!$omp barrier
|
||||
!$omp master
|
||||
!
|
||||
CALL mp_sum( v, inter_bgrp_comm )
|
||||
!
|
||||
!$omp end master
|
||||
!$omp barrier
|
||||
!$omp do
|
||||
DO ir=1,dfftp%nnr
|
||||
rhor(ir,1)=rhor(ir,1)+DBLE(v(ir))
|
||||
END DO
|
||||
!$omp end do
|
||||
!$omp end parallel
|
||||
|
||||
iss = 1
|
||||
|
||||
DO ir=1,dfftp%nnr
|
||||
rhor(ir,iss)=rhor(ir,iss)+DBLE(v(ir))
|
||||
END DO
|
||||
|
||||
!
|
||||
IF( iverbosity > 1 ) THEN
|
||||
ca = SUM(v)
|
||||
|
||||
CALL mp_sum( ca, intra_bgrp_comm )
|
||||
|
||||
WRITE( stdout,'(a,2f12.8)') &
|
||||
& ' rhov: int n_v(r) dr = ',omega*ca/(dfftp%nr1*dfftp%nr2*dfftp%nr3)
|
||||
ENDIF
|
||||
!
|
||||
CALL fwfft('Rho',v, dfftp )
|
||||
!
|
||||
IF( iverbosity > 1 ) THEN
|
||||
WRITE( stdout,*) ' rhov: smooth ',omega*rhog(1,iss)
|
||||
WRITE( stdout,*) ' rhov: vander ',omega*v(1)
|
||||
WRITE( stdout,*) ' rhov: all ',omega*(rhog(1,iss)+v(1))
|
||||
ENDIF
|
||||
!
|
||||
! rhog(g) = total (smooth + US) charge density in G-space
|
||||
!
|
||||
CALL fftx_add_threed2oned_gamma( dfftp, v, rhog(:,iss) )
|
||||
CALL fftx_add_threed2oned_gamma( dfftp, v, rhog(:,1) )
|
||||
|
||||
IF( iverbosity > 1 ) WRITE( stdout,'(a,2f12.8)') &
|
||||
& ' rhov: n_v(g=0) = ',omega*DBLE(rhog(1,iss))
|
||||
!
|
||||
ELSE
|
||||
!
|
||||
! nspin=2: two fft at a time, one for spin up and one for spin down
|
||||
!
|
||||
isup=1
|
||||
isdw=2
|
||||
!$omp parallel default(none) &
|
||||
!$omp shared(na, ngb, nh, rhovan, qgb, eigrb, dfftb, iverbosity, omegab, irb, v, &
|
||||
!$omp stdout, rhor, dfftp, upf, nsp, ityp, nat, nspin, iabox, nabox, inter_bgrp_comm ) &
|
||||
!$omp private(mytid, ntids, is, ia, iia, iv, jv, ijv, sumrho, qgbt, ig, ca, qv, fg1, fg2, ir )
|
||||
|
||||
|
||||
!$omp workshare
|
||||
v (:) = (0.d0, 0.d0)
|
||||
!$omp end workshare
|
||||
|
||||
#if defined(_OPENMP)
|
||||
mytid = omp_get_thread_num() ! take the thread ID
|
||||
ntids = omp_get_num_threads() ! take the number of threads
|
||||
#endif
|
||||
|
||||
ALLOCATE( qgbt( ngb, 2 ) )
|
||||
ALLOCATE( qv( dfftb%nnr ) )
|
||||
ALLOCATE( fg1( ngb ) )
|
||||
ALLOCATE( fg2( ngb ) )
|
||||
|
||||
isa=1
|
||||
DO ia=1,nat
|
||||
is = ityp(ia)
|
||||
#if defined(__MPI)
|
||||
IF ( ( dfftb%np3( ia ) <= 0 ) .OR. ( dfftb%np2( ia ) <= 0 ) ) CYCLE
|
||||
#endif
|
||||
IF( upf(is)%tvanp ) THEN
|
||||
|
||||
DO iia = 1, nabox
|
||||
IF( MOD( iia - 1, ntids ) == mytid ) THEN
|
||||
ia = iabox(iia)
|
||||
is = ityp(ia)
|
||||
DO iss=1,2
|
||||
qgbt(:,iss) = (0.d0, 0.d0)
|
||||
DO iv=1,nh(is)
|
||||
|
@ -1015,42 +1043,40 @@ SUBROUTINE rhov(irb,eigrb,rhovan,rhog,rhor)
|
|||
END DO
|
||||
END DO
|
||||
END DO
|
||||
!
|
||||
! add structure factor
|
||||
!
|
||||
!
|
||||
! add structure factor
|
||||
!
|
||||
fg1 = eigrb(1:ngb,ia)*qgbt(1:ngb,1)
|
||||
fg2 = eigrb(1:ngb,ia)*qgbt(1:ngb,2)
|
||||
CALL fft_oned2box( qv, fg1, fg2 )
|
||||
!
|
||||
!
|
||||
CALL invfft( qv,dfftb,ia)
|
||||
!
|
||||
! qv is the now the US augmentation charge for atomic species is
|
||||
! and atom ia: real(qv)=spin up, imag(qv)=spin down
|
||||
!
|
||||
IF( iverbosity > 1 ) THEN
|
||||
ca = SUM(qv)
|
||||
WRITE( stdout,'(a,f12.8)') ' rhov: up g-space = ', &
|
||||
& omegab*DBLE(qgbt(1,1))
|
||||
WRITE( stdout,'(a,f12.8)') ' rhov: up r-sp = ', &
|
||||
& omegab*DBLE(ca)/(dfftb%nr1*dfftb%nr2*dfftb%nr3)
|
||||
WRITE( stdout,'(a,f12.8)') ' rhov: dw g-space = ', &
|
||||
& omegab*DBLE(qgbt(1,2))
|
||||
WRITE( stdout,'(a,f12.8)') ' rhov: dw r-sp = ', &
|
||||
& omegab*AIMAG(ca)/(dfftb%nr1*dfftb%nr2*dfftb%nr3)
|
||||
ENDIF
|
||||
!
|
||||
! add qv(r) to v(r), in real space on the dense grid
|
||||
!
|
||||
!
|
||||
! qv is the now the US augmentation charge for atomic species is
|
||||
! and atom ia: real(qv)=spin up, imag(qv)=spin down
|
||||
!
|
||||
! add qv(r) to v(r), in real space on the dense grid
|
||||
!
|
||||
CALL box2grid(irb(:,ia),qv,v)
|
||||
|
||||
END IF
|
||||
|
||||
END IF
|
||||
END DO
|
||||
!
|
||||
DEALLOCATE(qgbt)
|
||||
DEALLOCATE( qv )
|
||||
DEALLOCATE( fg1 )
|
||||
DEALLOCATE( fg2 )
|
||||
!$omp barrier
|
||||
!$omp master
|
||||
CALL mp_sum( v, inter_bgrp_comm )
|
||||
!$omp end master
|
||||
!$omp barrier
|
||||
!$omp do
|
||||
DO ir=1,dfftp%nnr
|
||||
rhor(ir,isup)=rhor(ir,isup)+DBLE(v(ir))
|
||||
rhor(ir,isdw)=rhor(ir,isdw)+AIMAG(v(ir))
|
||||
END DO
|
||||
!$omp end do
|
||||
!$omp end parallel
|
||||
!
|
||||
IF( iverbosity > 1 ) THEN
|
||||
ca = SUM(v)
|
||||
|
@ -1078,10 +1104,6 @@ SUBROUTINE rhov(irb,eigrb,rhovan,rhog,rhor)
|
|||
& ' rhov: n_v(g=0) up = ',omega*DBLE (rhog(1,isup)), &
|
||||
& ' rhov: n_v(g=0) down = ',omega*DBLE(rhog(1,isdw))
|
||||
END IF
|
||||
DEALLOCATE(qgbt)
|
||||
DEALLOCATE( qv )
|
||||
DEALLOCATE( fg1 )
|
||||
DEALLOCATE( fg2 )
|
||||
!
|
||||
ENDIF
|
||||
|
||||
|
@ -1092,4 +1114,62 @@ SUBROUTINE rhov(irb,eigrb,rhovan,rhog,rhor)
|
|||
1000 CONTINUE
|
||||
!
|
||||
RETURN
|
||||
|
||||
CONTAINS
|
||||
|
||||
SUBROUTINE print_rhov()
|
||||
IF( nspin == 1 ) THEN
|
||||
IF( iverbosity > 1 ) THEN
|
||||
ca = SUM(qv)
|
||||
WRITE( stdout,'(a,f12.8)') ' rhov: 1-atom g-sp = ', &
|
||||
& omegab*DBLE(qgbt(1,1))
|
||||
WRITE( stdout,'(a,f12.8)') ' rhov: 1-atom r-sp = ', &
|
||||
& omegab*DBLE(ca)/(dfftb%nr1*dfftb%nr2*dfftb%nr3)
|
||||
ENDIF
|
||||
ELSE
|
||||
IF( iverbosity > 1 ) THEN
|
||||
ca = SUM(qv)
|
||||
WRITE( stdout,'(a,f12.8)') ' rhov: up g-space = ', &
|
||||
& omegab*DBLE(qgbt(1,1))
|
||||
WRITE( stdout,'(a,f12.8)') ' rhov: up r-sp = ', &
|
||||
& omegab*DBLE(ca)/(dfftb%nr1*dfftb%nr2*dfftb%nr3)
|
||||
WRITE( stdout,'(a,f12.8)') ' rhov: dw g-space = ', &
|
||||
& omegab*DBLE(qgbt(1,2))
|
||||
WRITE( stdout,'(a,f12.8)') ' rhov: dw r-sp = ', &
|
||||
& omegab*AIMAG(ca)/(dfftb%nr1*dfftb%nr2*dfftb%nr3)
|
||||
ENDIF
|
||||
ENDIF
|
||||
END SUBROUTINE
|
||||
END SUBROUTINE rhov
|
||||
|
||||
SUBROUTINE rhoofr_host &
|
||||
( nfi, c_bgrp, irb, eigrb, bec_bgrp, dbec, rhovan, rhor, drhor, rhog, drhog, rhos, &
|
||||
enl, denl, ekin, dekin, tstress, ndwwf )
|
||||
USE kinds, ONLY: DP
|
||||
#if defined (__CUDA)
|
||||
USE cudafor
|
||||
#endif
|
||||
USE cp_interfaces
|
||||
IMPLICIT NONE
|
||||
INTEGER nfi
|
||||
COMPLEX(DP) c_bgrp( :, : )
|
||||
INTEGER irb( :, : )
|
||||
COMPLEX(DP) eigrb( :, : )
|
||||
REAL(DP) bec_bgrp(:,:)
|
||||
REAL(DP) dbec(:,:,:,:)
|
||||
REAL(DP) rhovan(:, :, : )
|
||||
REAL(DP) rhor(:,:)
|
||||
REAL(DP) drhor(:,:,:,:)
|
||||
COMPLEX(DP) rhog( :, : )
|
||||
COMPLEX(DP) drhog( :, :, :, : )
|
||||
REAL(DP) rhos(:,:)
|
||||
REAL(DP) enl, ekin
|
||||
REAL(DP) denl(3,3), dekin(6)
|
||||
LOGICAL, OPTIONAL, INTENT(IN) :: tstress
|
||||
INTEGER, OPTIONAL, INTENT(IN) :: ndwwf
|
||||
COMPLEX(DP), ALLOCATABLE DEVICEATTR :: c(:,:)
|
||||
ALLOCATE( c, SOURCE=c_bgrp )
|
||||
CALL rhoofr(nfi, c_bgrp, c, bec_bgrp, dbec, rhovan, rhor, &
|
||||
drhor, rhog, drhog, rhos, enl, denl, ekin, dekin, tstress, ndwwf )
|
||||
DEALLOCATE( c )
|
||||
END SUBROUTINE rhoofr_host
|
||||
|
|
|
@ -7,6 +7,12 @@
|
|||
!
|
||||
! written by Carlo Cavazzoni
|
||||
|
||||
#if defined(__CUDA)
|
||||
#define DEVICEATTR ,DEVICE
|
||||
#else
|
||||
#define DEVICEATTR
|
||||
#endif
|
||||
|
||||
!=----------------------------------------------------------------------------=!
|
||||
MODULE cp_interfaces
|
||||
!=----------------------------------------------------------------------------=!
|
||||
|
@ -44,7 +50,6 @@
|
|||
PUBLIC :: packgam
|
||||
|
||||
PUBLIC :: ortho
|
||||
PUBLIC :: ortho_gamma
|
||||
|
||||
PUBLIC :: nlfh
|
||||
PUBLIC :: nlfl_bgrp
|
||||
|
@ -96,7 +101,6 @@
|
|||
PUBLIC :: dotcsc
|
||||
PUBLIC :: nlsm1
|
||||
PUBLIC :: nlsm2_bgrp
|
||||
PUBLIC :: calbec_bgrp
|
||||
PUBLIC :: ennl
|
||||
PUBLIC :: calrhovan
|
||||
PUBLIC :: calbec
|
||||
|
@ -104,8 +108,8 @@
|
|||
PUBLIC :: dennl
|
||||
PUBLIC :: nlfq_bgrp
|
||||
PUBLIC :: collect_bec
|
||||
PUBLIC :: beta_eigr
|
||||
PUBLIC :: nlsm1us
|
||||
PUBLIC :: dbeta_eigr
|
||||
|
||||
! ------------------------------------ !
|
||||
|
||||
|
@ -126,6 +130,22 @@
|
|||
INTEGER, INTENT(IN) :: n, nspin
|
||||
REAL(DP), OPTIONAL :: v1( ldv, * )
|
||||
END SUBROUTINE dforce_x
|
||||
#if defined (__CUDA)
|
||||
SUBROUTINE dforce_gpu_x( i, bec, vkb, c, df, da, v, ldv, ispin, f, n, nspin )
|
||||
USE kinds, ONLY: DP
|
||||
IMPLICIT NONE
|
||||
INTEGER, INTENT(IN) :: i
|
||||
REAL(DP) :: bec(:,:)
|
||||
COMPLEX(DP), DEVICE :: vkb(:,:)
|
||||
COMPLEX(DP), DEVICE :: c(:,:)
|
||||
COMPLEX(DP) :: df(:), da(:)
|
||||
INTEGER, INTENT(IN) :: ldv
|
||||
REAL(DP), DEVICE :: v( :, : )
|
||||
INTEGER :: ispin( : )
|
||||
REAL(DP) :: f( : )
|
||||
INTEGER, INTENT(IN) :: n, nspin
|
||||
END SUBROUTINE dforce_gpu_x
|
||||
#endif
|
||||
END INTERFACE
|
||||
|
||||
|
||||
|
@ -225,6 +245,26 @@
|
|||
|
||||
INTERFACE rhoofr
|
||||
SUBROUTINE rhoofr_cp &
|
||||
( nfi, c_bgrp, c_d, bec, dbec, rhovan, rhor, drhor, rhog, drhog, rhos, enl, denl, ekin, dekin, tstress, ndwwf )
|
||||
USE kinds, ONLY: DP
|
||||
IMPLICIT NONE
|
||||
INTEGER nfi
|
||||
COMPLEX(DP) :: c_bgrp( :, : )
|
||||
COMPLEX(DP) DEVICEATTR :: c_d( :, : )
|
||||
REAL(DP) bec(:,:)
|
||||
REAL(DP) dbec(:,:,:,:)
|
||||
REAL(DP) rhovan(:, :, : )
|
||||
REAL(DP) rhor(:,:)
|
||||
REAL(DP) drhor(:,:,:,:)
|
||||
COMPLEX(DP) rhog( :, : )
|
||||
COMPLEX(DP) drhog( :, :, :, : )
|
||||
REAL(DP) rhos(:,:)
|
||||
REAL(DP) enl, ekin
|
||||
REAL(DP) denl(3,3), dekin(6)
|
||||
LOGICAL, OPTIONAL, INTENT(IN) :: tstress
|
||||
INTEGER, OPTIONAL, INTENT(IN) :: ndwwf
|
||||
END SUBROUTINE rhoofr_cp
|
||||
SUBROUTINE rhoofr_host &
|
||||
( nfi, c_bgrp, irb, eigrb, bec, dbec, rhovan, rhor, drhor, rhog, drhog, rhos, enl, denl, ekin, dekin, tstress, ndwwf )
|
||||
USE kinds, ONLY: DP
|
||||
IMPLICIT NONE
|
||||
|
@ -244,7 +284,7 @@
|
|||
REAL(DP) denl(3,3), dekin(6)
|
||||
LOGICAL, OPTIONAL, INTENT(IN) :: tstress
|
||||
INTEGER, OPTIONAL, INTENT(IN) :: ndwwf
|
||||
END SUBROUTINE rhoofr_cp
|
||||
END SUBROUTINE rhoofr_host
|
||||
END INTERFACE
|
||||
|
||||
INTERFACE checkrho
|
||||
|
@ -309,7 +349,7 @@
|
|||
|
||||
INTERFACE runcp_uspp
|
||||
SUBROUTINE runcp_uspp_x &
|
||||
( nfi, fccc, ccc, ema0bg, dt2bye, rhos, bec_bgrp, c0_bgrp, cm_bgrp, fromscra, restart )
|
||||
( nfi, fccc, ccc, ema0bg, dt2bye, rhos, bec_bgrp, c0_bgrp, c0_d, cm_bgrp, cm_d, fromscra, restart )
|
||||
USE kinds, ONLY: DP
|
||||
IMPLICIT NONE
|
||||
integer, intent(in) :: nfi
|
||||
|
@ -318,6 +358,7 @@
|
|||
real(DP) :: rhos(:,:)
|
||||
real(DP) :: bec_bgrp(:,:)
|
||||
complex(DP) :: c0_bgrp(:,:), cm_bgrp(:,:)
|
||||
complex(DP) DEVICEATTR :: c0_d(:,:), cm_d(:,:)
|
||||
logical, optional, intent(in) :: fromscra
|
||||
logical, optional, intent(in) :: restart
|
||||
END SUBROUTINE
|
||||
|
@ -377,36 +418,19 @@
|
|||
|
||||
INTERFACE ortho
|
||||
SUBROUTINE ortho_x &
|
||||
( eigr, cp_bgrp, phi_bgrp, x0, idesc, diff, iter, ccc, bephi, becp_bgrp )
|
||||
( betae, cp_bgrp, phi_bgrp, x0, idesc, diff, iter, ccc, bephi, becp_bgrp )
|
||||
USE kinds, ONLY: DP
|
||||
IMPLICIT NONE
|
||||
INTEGER, INTENT(IN) :: idesc( :, : )
|
||||
COMPLEX(DP) :: eigr( :, : )
|
||||
COMPLEX(DP) :: betae( :, : )
|
||||
COMPLEX(DP) :: cp_bgrp( :, : ), phi_bgrp( :, : )
|
||||
REAL(DP) :: x0( :, :, : ), diff, ccc
|
||||
INTEGER :: iter
|
||||
REAL(DP) :: bephi(:,:)
|
||||
REAL(DP) :: becp_bgrp(:,:)
|
||||
END SUBROUTINE
|
||||
END INTERFACE
|
||||
|
||||
INTERFACE ortho_gamma
|
||||
SUBROUTINE ortho_gamma_x &
|
||||
( iopt, cp, ngwx, phi, becp_dist, qbecp, nkbx, bephi, qbephi, &
|
||||
x0, nx0, idesc, diff, iter, n, nss, istart )
|
||||
USE kinds, ONLY: DP
|
||||
IMPLICIT NONE
|
||||
INTEGER, INTENT(IN) :: iopt
|
||||
INTEGER, INTENT(IN) :: ngwx, nkbx, nx0
|
||||
INTEGER, INTENT(IN) :: n, nss, istart
|
||||
COMPLEX(DP) :: phi( ngwx, n ), cp( ngwx, n )
|
||||
REAL(DP) :: bephi( :, : )
|
||||
REAL(DP) :: becp_dist(:,:)
|
||||
REAL(DP) :: qbephi( :, : ), qbecp( :, : )
|
||||
REAL(DP) :: x0( nx0, nx0 )
|
||||
INTEGER, INTENT(IN) :: idesc( : )
|
||||
INTEGER, INTENT(OUT) :: iter
|
||||
REAL(DP), INTENT(OUT) :: diff
|
||||
#if defined (__CUDA)
|
||||
ATTRIBUTES( DEVICE ) :: becp_bgrp, bephi, cp_bgrp, phi_bgrp, betae
|
||||
#endif
|
||||
END SUBROUTINE
|
||||
END INTERFACE
|
||||
|
||||
|
@ -748,15 +772,14 @@
|
|||
|
||||
INTERFACE move_electrons
|
||||
SUBROUTINE move_electrons_x( &
|
||||
nfi, tfirst, tlast, b1, b2, b3, fion, c0_bgrp, cm_bgrp, phi_bgrp, enthal, enb, &
|
||||
nfi, tprint, tfirst, tlast, b1, b2, b3, fion, enthal, enb, &
|
||||
& enbi, fccc, ccc, dt2bye, stress,l_cprestart )
|
||||
USE kinds, ONLY: DP
|
||||
IMPLICIT NONE
|
||||
INTEGER, INTENT(IN) :: nfi
|
||||
LOGICAL, INTENT(IN) :: tfirst, tlast
|
||||
LOGICAL, INTENT(IN) :: tprint, tfirst, tlast
|
||||
REAL(DP), INTENT(IN) :: b1(3), b2(3), b3(3)
|
||||
REAL(DP) :: fion(:,:)
|
||||
COMPLEX(DP) :: c0_bgrp(:,:), cm_bgrp(:,:), phi_bgrp(:,:)
|
||||
REAL(DP), INTENT(IN) :: dt2bye
|
||||
REAL(DP) :: fccc, ccc
|
||||
REAL(DP) :: enb, enbi
|
||||
|
@ -820,6 +843,13 @@
|
|||
IMPLICIT NONE
|
||||
COMPLEX(DP) :: c_bgrp(:,:)
|
||||
END SUBROUTINE c_bgrp_expand_x
|
||||
#if defined (__CUDA)
|
||||
SUBROUTINE c_bgrp_expand_gpu_x( c_bgrp )
|
||||
USE kinds, ONLY: DP
|
||||
IMPLICIT NONE
|
||||
COMPLEX(DP), DEVICE :: c_bgrp(:,:)
|
||||
END SUBROUTINE c_bgrp_expand_gpu_x
|
||||
#endif
|
||||
END INTERFACE
|
||||
INTERFACE c_bgrp_pack
|
||||
SUBROUTINE c_bgrp_pack_x( c_bgrp )
|
||||
|
@ -827,6 +857,13 @@
|
|||
IMPLICIT NONE
|
||||
COMPLEX(DP) :: c_bgrp(:,:)
|
||||
END SUBROUTINE c_bgrp_pack_x
|
||||
#if defined (__CUDA)
|
||||
SUBROUTINE c_bgrp_pack_gpu_x( c_bgrp )
|
||||
USE kinds, ONLY: DP
|
||||
IMPLICIT NONE
|
||||
COMPLEX(DP), DEVICE :: c_bgrp(:,:)
|
||||
END SUBROUTINE c_bgrp_pack_gpu_x
|
||||
#endif
|
||||
END INTERFACE
|
||||
|
||||
INTERFACE vofrho
|
||||
|
@ -856,6 +893,16 @@
|
|||
REAL(DP), INTENT(IN) :: f( : )
|
||||
REAL(DP) :: enkin_x
|
||||
END FUNCTION enkin_x
|
||||
#if defined (__CUDA)
|
||||
FUNCTION enkin_gpu_x( c, f, n )
|
||||
USE kinds, ONLY: dp
|
||||
IMPLICIT NONE
|
||||
INTEGER, INTENT(IN) :: n
|
||||
COMPLEX(DP), DEVICE, INTENT(IN) :: c( :, : )
|
||||
REAL(DP), DEVICE, INTENT(IN) :: f( : )
|
||||
REAL(DP) :: enkin_gpu_x
|
||||
END FUNCTION enkin_gpu_x
|
||||
#endif
|
||||
END INTERFACE
|
||||
|
||||
INTERFACE newinit
|
||||
|
@ -889,44 +936,45 @@
|
|||
END INTERFACE
|
||||
|
||||
INTERFACE dotcsc
|
||||
SUBROUTINE dotcsc_x( eigr, cp, ngw, n )
|
||||
SUBROUTINE dotcsc_x( betae, cp, ngw, n )
|
||||
USE kinds, ONLY: dp
|
||||
IMPLICIT NONE
|
||||
INTEGER, INTENT(IN) :: ngw, n
|
||||
COMPLEX(DP), INTENT(IN) :: eigr(:,:), cp(:,:)
|
||||
COMPLEX(DP), INTENT(IN) :: cp(:,:)
|
||||
COMPLEX(DP), INTENT(INOUT) :: betae(:,:)
|
||||
END SUBROUTINE dotcsc_x
|
||||
END INTERFACE
|
||||
|
||||
INTERFACE nlsm1
|
||||
SUBROUTINE nlsm1_x ( n, nspmn, nspmx, eigr, c, becp, pptype_ )
|
||||
SUBROUTINE nlsm1_x ( n, betae, c, becp, pptype_ )
|
||||
USE kinds, ONLY : DP
|
||||
IMPLICIT NONE
|
||||
INTEGER, INTENT(IN) :: n, nspmn, nspmx
|
||||
COMPLEX(DP), INTENT(IN) :: eigr( :, : ), c( :, : )
|
||||
INTEGER, INTENT(IN) :: n
|
||||
COMPLEX(DP), INTENT(IN) :: c( :, : )
|
||||
COMPLEX(DP), INTENT(INOUT) :: betae( :, : )
|
||||
REAL(DP), INTENT(OUT) :: becp( :, : )
|
||||
INTEGER, INTENT(IN), OPTIONAL :: pptype_
|
||||
END SUBROUTINE nlsm1_x
|
||||
END INTERFACE
|
||||
|
||||
INTERFACE nlsm2_bgrp
|
||||
SUBROUTINE nlsm2_bgrp_x( ngw, nkb, eigr, c_bgrp, becdr_bgrp, nbspx_bgrp, nbsp_bgrp )
|
||||
SUBROUTINE nlsm2_bgrp_x( ngw, nkb, betae, c_bgrp, becdr_bgrp, nbspx_bgrp, nbsp_bgrp )
|
||||
USE kinds, ONLY : DP
|
||||
IMPLICIT NONE
|
||||
INTEGER, INTENT(IN) :: ngw, nkb, nbspx_bgrp, nbsp_bgrp
|
||||
COMPLEX(DP), INTENT(IN) :: eigr( :, : ), c_bgrp( :, : )
|
||||
COMPLEX(DP), INTENT(IN) :: betae( :, : ), c_bgrp( :, : )
|
||||
REAL(DP), INTENT(OUT) :: becdr_bgrp( :, :, : )
|
||||
END SUBROUTINE nlsm2_bgrp_x
|
||||
END INTERFACE
|
||||
|
||||
INTERFACE calbec_bgrp
|
||||
SUBROUTINE calbec_bgrp_x ( nspmn, nspmx, eigr, c_bgrp, bec_bgrp, pptype_ )
|
||||
#if defined (__CUDA)
|
||||
SUBROUTINE nlsm2_bgrp_gpu_x( ngw, nkb, betae, c_bgrp, becdr_bgrp, nbspx_bgrp, nbsp_bgrp )
|
||||
USE kinds, ONLY : DP
|
||||
IMPLICIT NONE
|
||||
INTEGER, INTENT(IN) :: nspmn, nspmx
|
||||
COMPLEX(DP), INTENT(IN) :: eigr( :, : ), c_bgrp( :, : )
|
||||
REAL(DP), INTENT(OUT) :: bec_bgrp( :, : )
|
||||
INTEGER, INTENT(IN), OPTIONAL :: pptype_
|
||||
END SUBROUTINE calbec_bgrp_x
|
||||
INTEGER, INTENT(IN) :: ngw, nkb, nbspx_bgrp, nbsp_bgrp
|
||||
COMPLEX(DP), INTENT(IN), DEVICE :: betae( :, : )
|
||||
COMPLEX(DP), INTENT(IN), DEVICE :: c_bgrp( :, : )
|
||||
REAL(DP), INTENT(OUT) :: becdr_bgrp( :, :, : )
|
||||
END SUBROUTINE nlsm2_bgrp_gpu_x
|
||||
#endif
|
||||
END INTERFACE
|
||||
|
||||
INTERFACE ennl
|
||||
|
@ -950,12 +998,13 @@
|
|||
END INTERFACE
|
||||
|
||||
INTERFACE calbec
|
||||
SUBROUTINE calbec_x( nspmn, nspmx, eigr, c, bec, pptype_ )
|
||||
SUBROUTINE calbec_x( n, betae, c, bec, pptype_ )
|
||||
USE kinds, ONLY: DP
|
||||
IMPLICIT NONE
|
||||
INTEGER, INTENT(IN) :: nspmn, nspmx
|
||||
REAL(DP), INTENT(OUT) :: bec( :, : )
|
||||
COMPLEX(DP), INTENT(IN) :: c( :, : ), eigr( :, : )
|
||||
INTEGER, INTENT(IN) :: n
|
||||
REAL(DP), INTENT(OUT) :: bec( :, : )
|
||||
COMPLEX(DP), INTENT(IN) :: c( :, : )
|
||||
COMPLEX(DP), INTENT(INOUT) :: betae( :, : )
|
||||
INTEGER, INTENT(IN), OPTIONAL :: pptype_
|
||||
END SUBROUTINE calbec_x
|
||||
END INTERFACE
|
||||
|
@ -983,10 +1032,11 @@
|
|||
END INTERFACE
|
||||
|
||||
INTERFACE nlfq_bgrp
|
||||
SUBROUTINE nlfq_bgrp_x( c_bgrp, eigr, bec_bgrp, becdr_bgrp, fion )
|
||||
SUBROUTINE nlfq_bgrp_x( c_bgrp, betae, bec_bgrp, becdr_bgrp, fion )
|
||||
USE kinds, ONLY: DP
|
||||
IMPLICIT NONE
|
||||
COMPLEX(DP), INTENT(IN) :: c_bgrp( :, : ), eigr( :, : )
|
||||
COMPLEX(DP), INTENT(IN) DEVICEATTR :: c_bgrp( :, : )
|
||||
COMPLEX(DP), INTENT(IN) DEVICEATTR :: betae( :, : )
|
||||
REAL(DP), INTENT(IN) :: bec_bgrp( :, : )
|
||||
REAL(DP), INTENT(OUT) :: becdr_bgrp( :, :, : )
|
||||
REAL(DP), INTENT(OUT) :: fion( :, : )
|
||||
|
@ -1003,28 +1053,25 @@
|
|||
END SUBROUTINE collect_bec_x
|
||||
END INTERFACE
|
||||
|
||||
INTERFACE beta_eigr
|
||||
SUBROUTINE beta_eigr_x ( beigr, nspmn, nspmx, eigr, pptype_ )
|
||||
USE kinds, ONLY : DP
|
||||
IMPLICIT NONE
|
||||
INTEGER, INTENT(IN) :: nspmn, nspmx
|
||||
COMPLEX(DP), INTENT(IN) :: eigr( :, : )
|
||||
COMPLEX(DP), INTENT(OUT) :: beigr( :, : )
|
||||
INTEGER, INTENT(IN), OPTIONAL :: pptype_
|
||||
END SUBROUTINE beta_eigr_x
|
||||
END INTERFACE
|
||||
|
||||
INTERFACE nlsm1us
|
||||
SUBROUTINE nlsm1us_x ( n, beigr, c, becp )
|
||||
SUBROUTINE nlsm1us_x ( n, betae, c, becp )
|
||||
USE kinds, ONLY : DP
|
||||
IMPLICIT NONE
|
||||
INTEGER, INTENT(IN) :: n
|
||||
COMPLEX(DP), INTENT(IN) :: beigr( :, : )
|
||||
COMPLEX(DP), INTENT(IN) :: c( :, : )
|
||||
REAL(DP), INTENT(OUT) :: becp( :, : )
|
||||
COMPLEX(DP) DEVICEATTR , INTENT(INOUT) :: betae( :, : )
|
||||
COMPLEX(DP) DEVICEATTR , INTENT(IN) :: c( :, : )
|
||||
REAL(DP) DEVICEATTR , INTENT(OUT) :: becp( :, : )
|
||||
END SUBROUTINE nlsm1us_x
|
||||
END INTERFACE
|
||||
|
||||
INTERFACE dbeta_eigr
|
||||
SUBROUTINE dbeta_eigr_x( dbeigr, eigr )
|
||||
USE kinds, ONLY : DP
|
||||
IMPLICIT NONE
|
||||
COMPLEX(DP), INTENT(IN) :: eigr( :, : )
|
||||
COMPLEX(DP), INTENT(OUT) :: dbeigr( :, :, :, :)
|
||||
END SUBROUTINE dbeta_eigr_x
|
||||
END INTERFACE
|
||||
|
||||
!=----------------------------------------------------------------------------=!
|
||||
|
||||
|
|
|
@ -1092,6 +1092,7 @@ subroutine nlinit
|
|||
use uspp, ONLY : aainit, beta, qq_nt, dvan, nhtol, nhtolm, indv,&
|
||||
dbeta
|
||||
use uspp_param, ONLY : upf, lmaxq, nbetam, lmaxkb, nhm, nh, ish
|
||||
use uspp_gpum, ONLY : using_qq_nt, using_qq_nt_d, qq_nt_d
|
||||
use atom, ONLY : rgrid
|
||||
use qgb_mod, ONLY : qgb, dqgb
|
||||
use smallbox_gvec, ONLY : ngb
|
||||
|
@ -1503,7 +1504,7 @@ end subroutine dylmr2_
|
|||
|
||||
|
||||
!-----------------------------------------------------------------------
|
||||
SUBROUTINE dotcsc_x( eigr, cp, ngw, n )
|
||||
SUBROUTINE dotcsc_x( betae, cp, ngw, n )
|
||||
!-----------------------------------------------------------------------
|
||||
!
|
||||
USE kinds, ONLY: DP
|
||||
|
@ -1514,13 +1515,14 @@ end subroutine dylmr2_
|
|||
USE uspp_param, ONLY: nh, ish, upf
|
||||
USE mp, ONLY: mp_sum
|
||||
USE mp_global, ONLY: intra_bgrp_comm, nbgrp, inter_bgrp_comm
|
||||
USE cp_interfaces, ONLY: nlsm1
|
||||
USE cp_interfaces, ONLY: calbec
|
||||
USE electrons_base, ONLY: ispin, ispin_bgrp, nbspx_bgrp, ibgrp_g2l, iupdwn, nupdwn, nbspx
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
INTEGER, INTENT(IN) :: ngw, n
|
||||
COMPLEX(DP), INTENT(IN) :: eigr(:,:), cp(:,:)
|
||||
COMPLEX(DP), INTENT(IN) :: cp(:,:)
|
||||
COMPLEX(DP), INTENT(INOUT) :: betae(:,:)
|
||||
! local variables
|
||||
REAL(DP) rsum, csc(n) ! automatic array
|
||||
COMPLEX(DP) temp(ngw) ! automatic array
|
||||
|
@ -1536,7 +1538,7 @@ end subroutine dylmr2_
|
|||
! < beta | phi > is real. only the i lowest:
|
||||
!
|
||||
|
||||
CALL nlsm1( nbspx_bgrp, 1, nsp, eigr, cp, becp, 2 )
|
||||
CALL calbec( nbspx_bgrp, betae, cp, becp, 2 )
|
||||
|
||||
nnn = MIN( 12, n )
|
||||
|
||||
|
@ -1640,30 +1642,73 @@ end subroutine dylmr2_
|
|||
! local
|
||||
|
||||
INTEGER :: ig, i
|
||||
REAL(DP) :: sk(n) ! automatic array
|
||||
REAL(DP) :: sk, rsum
|
||||
!
|
||||
sk = 0.0d0
|
||||
!$omp parallel do reduction(+:sk) default(none) &
|
||||
!$omp shared(c,g2kin,gstart,ngw,n,f) private(i,ig,rsum)
|
||||
DO i=1,n
|
||||
sk(i)=0.0d0
|
||||
rsum = 0.0d0
|
||||
DO ig=gstart,ngw
|
||||
sk(i)=sk(i)+DBLE(CONJG(c(ig,i))*c(ig,i))*g2kin(ig)
|
||||
rsum = rsum + DBLE(CONJG(c(ig,i))*c(ig,i)) * g2kin(ig)
|
||||
END DO
|
||||
sk = sk + f(i) * rsum
|
||||
END DO
|
||||
!$omp end parallel do
|
||||
|
||||
CALL mp_sum( sk(1:n), intra_bgrp_comm )
|
||||
|
||||
enkin_x=0.0d0
|
||||
DO i=1,n
|
||||
enkin_x=enkin_x+f(i)*sk(i)
|
||||
END DO
|
||||
CALL mp_sum( sk, intra_bgrp_comm )
|
||||
|
||||
! ... reciprocal-space vectors are in units of alat/(2 pi) so a
|
||||
! ... multiplicative factor (2 pi/alat)**2 is required
|
||||
|
||||
enkin_x = enkin_x * tpiba2
|
||||
enkin_x = tpiba2 * sk
|
||||
!
|
||||
RETURN
|
||||
END FUNCTION enkin_x
|
||||
|
||||
#if defined (__CUDA)
|
||||
!-----------------------------------------------------------------------
|
||||
FUNCTION enkin_gpu_x( c, f, n )
|
||||
!-----------------------------------------------------------------------
|
||||
!
|
||||
USE kinds, ONLY: DP
|
||||
USE constants, ONLY: pi, fpi
|
||||
USE gvecw, ONLY: ngw
|
||||
USE gvect, ONLY: gstart
|
||||
USE gvecw, ONLY: g2kin_d
|
||||
USE mp, ONLY: mp_sum
|
||||
USE mp_global, ONLY: intra_bgrp_comm
|
||||
USE cell_base, ONLY: tpiba2
|
||||
USE cudafor
|
||||
|
||||
IMPLICIT NONE
|
||||
|
||||
REAL(DP) :: enkin_gpu_x
|
||||
|
||||
INTEGER, INTENT(IN) :: n
|
||||
COMPLEX(DP), DEVICE, INTENT(IN) :: c( :, : )
|
||||
REAL(DP), DEVICE, INTENT(IN) :: f( : )
|
||||
!
|
||||
! local
|
||||
|
||||
INTEGER :: ig, i
|
||||
REAL(DP) :: sk
|
||||
!
|
||||
sk=0.0d0
|
||||
!$cuf kernel do(2) <<<*,*>>>
|
||||
DO i=1,n
|
||||
DO ig=gstart,ngw
|
||||
sk = sk + f(i) * DBLE(CONJG(c(ig,i))*c(ig,i)) * g2kin_d(ig)
|
||||
END DO
|
||||
END DO
|
||||
|
||||
CALL mp_sum( sk, intra_bgrp_comm )
|
||||
|
||||
enkin_gpu_x = tpiba2 * sk
|
||||
!
|
||||
RETURN
|
||||
END FUNCTION enkin_gpu_x
|
||||
#endif
|
||||
|
||||
!-------------------------------------------------------------------------
|
||||
SUBROUTINE nlfl_bgrp_x( bec_bgrp, becdr_bgrp, lambda, idesc, fion )
|
||||
!-----------------------------------------------------------------------
|
||||
|
@ -1932,25 +1977,34 @@ end subroutine dylmr2_
|
|||
USE gvecw, ONLY : ngw
|
||||
USE uspp, ONLY : beta, nhtol, indv_ijkb0
|
||||
USE uspp_param, ONLY : nh, upf
|
||||
USE gvect, ONLY : gstart
|
||||
!
|
||||
IMPLICIT NONE
|
||||
COMPLEX(DP), INTENT(IN) :: eigr( :, : )
|
||||
COMPLEX(DP), INTENT(OUT) :: betae( :, : )
|
||||
!
|
||||
INTEGER :: is, iv, ia, inl, ig, isa
|
||||
COMPLEX(DP), PARAMETER, DIMENSION(4) :: cfact = & ! (l == 0), (l == 1), (l == 2), (l == 3)
|
||||
[( 1.0_dp , 0.0_dp ), ( 0.0_dp , -1.0_dp ), ( -1.0_dp , 0.0_dp ), ( 0.0_dp , 1.0_dp )]
|
||||
COMPLEX(DP) :: ci
|
||||
!
|
||||
CALL start_clock( 'prefor' )
|
||||
!$omp parallel do default(shared) private(ia,is,iv,ci,inl,ig)
|
||||
DO ia=1,nat
|
||||
is=ityp(ia)
|
||||
DO iv=1,nh(is)
|
||||
ci=(0.0d0,-1.0d0)**nhtol(iv,is)
|
||||
ci=cfact( nhtol(iv,is) + 1 )
|
||||
inl = indv_ijkb0(ia) + iv
|
||||
DO ig=1,ngw
|
||||
betae(ig,inl)=ci*beta(ig,iv,is)*eigr(ig,ia)
|
||||
END DO
|
||||
!beigr(1,inl)=betae(1,inl)
|
||||
!DO ig=gstart,ngw
|
||||
! beigr(ig,inl)=2.0d0 * betae(ig,inl)
|
||||
!END DO
|
||||
END DO
|
||||
END DO
|
||||
!$omp end parallel do
|
||||
CALL stop_clock( 'prefor' )
|
||||
!
|
||||
RETURN
|
||||
|
|
|
@ -24,6 +24,7 @@ SUBROUTINE cprmain( tau_out, fion_out, etot_out )
|
|||
USE core, ONLY : rhoc
|
||||
USE uspp_param, ONLY : nhm, nh, ish
|
||||
USE uspp, ONLY : nkb, vkb, becsum, deeq, okvan, nlcc_any
|
||||
USE uspp_gpum, ONLY : vkb_d
|
||||
USE energies, ONLY : eht, epseu, exc, etot, eself, enl, &
|
||||
ekin, atot, entropy, egrand, enthal, &
|
||||
ekincm, print_energies
|
||||
|
@ -80,7 +81,7 @@ SUBROUTINE cprmain( tau_out, fion_out, etot_out )
|
|||
electrons_nosevel, electrons_noseupd
|
||||
USE pres_ai_mod, ONLY : P_ext, P_in, P_fin, pvar, volclu, &
|
||||
surfclu, Surf_t, abivol, abisur
|
||||
USE wavefunctions, ONLY : c0_bgrp, cm_bgrp, phi_bgrp
|
||||
USE wavefunctions, ONLY : c0_bgrp, cm_bgrp, cm_d, phi, c0_d
|
||||
USE wannier_module, ONLY : allocate_wannier
|
||||
USE cp_interfaces, ONLY : printout_new, move_electrons, newinit
|
||||
USE cell_nose, ONLY : xnhh0, xnhhm, xnhhp, vnhh, temph, &
|
||||
|
@ -96,14 +97,14 @@ SUBROUTINE cprmain( tau_out, fion_out, etot_out )
|
|||
ema0bg, sfac, eigr, iprint_stdout, &
|
||||
irb, taub, eigrb, rhog, rhos, &
|
||||
rhor, bephi, becp_bgrp, nfi, idesc, &
|
||||
drhor, drhog, bec_bgrp, dbec
|
||||
drhor, drhog, bec_bgrp, dbec, bec_d, iabox, nabox
|
||||
USE autopilot, ONLY : event_step, event_index, &
|
||||
max_event_step, restart_p
|
||||
USE cell_base, ONLY : s_to_r, r_to_s
|
||||
USE wannier_subroutines, ONLY : wannier_startup, wf_closing_options, &
|
||||
ef_enthalpy
|
||||
USE cp_interfaces, ONLY : writefile, eigs, strucf, phfacs
|
||||
USE cp_interfaces, ONLY : ortho, elec_fakekine, calbec_bgrp, calbec, caldbec_bgrp
|
||||
USE cp_interfaces, ONLY : ortho, elec_fakekine, calbec, caldbec_bgrp
|
||||
USE constraints_module, ONLY : check_constraint, remove_constr_force
|
||||
USE cp_autopilot, ONLY : pilot
|
||||
USE ions_nose, ONLY : ions_nose_allocate, ions_nose_shiftvar
|
||||
|
@ -117,6 +118,7 @@ SUBROUTINE cprmain( tau_out, fion_out, etot_out )
|
|||
USE london_module, ONLY : energy_london, force_london, stres_london
|
||||
USE input_parameters, ONLY : tcpbo
|
||||
USE xc_lib, ONLY : xclib_dft_is, start_exx, exx_is_active
|
||||
USE device_memcpy_m, ONLY : dev_memcpy
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
|
@ -266,7 +268,7 @@ SUBROUTINE cprmain( tau_out, fion_out, etot_out )
|
|||
!
|
||||
IF ( (okvan .or. nlcc_any ) .AND. (tfor .OR. thdyn .OR. tfirst) ) THEN
|
||||
!
|
||||
CALL initbox( tau0, alat, at, ainv, taub, irb )
|
||||
CALL initbox( tau0, alat, at, ainv, taub, irb, iabox, nabox )
|
||||
!
|
||||
CALL phbox( taub, iverbosity, eigrb )
|
||||
!
|
||||
|
@ -288,13 +290,15 @@ SUBROUTINE cprmain( tau_out, fion_out, etot_out )
|
|||
!
|
||||
END IF
|
||||
!
|
||||
! ... why this call ??? from Paolo Umari
|
||||
IF( force_pairing ) THEN
|
||||
c0_bgrp(:,iupdwn(2):nbsp) = c0_bgrp(:,1:nupdwn(2))
|
||||
cm_bgrp(:,iupdwn(2):nbsp) = cm_bgrp(:,1:nupdwn(2))
|
||||
!phi(:,iupdwn(2):nbsp) = phi(:,1:nupdwn(2))
|
||||
CALL dev_memcpy(phi(:,iupdwn(2):), phi, [1, ubound(phi)], 1, [1, nbsp])
|
||||
lambda(:,:, 2) = lambda(:,:, 1)
|
||||
ENDIF
|
||||
!
|
||||
IF ( tefield .or. tefield2 ) THEN
|
||||
!
|
||||
CALL calbec( 1, nsp, eigr, c0_bgrp, bec_bgrp ) ! ATTENZIONE
|
||||
!
|
||||
END IF
|
||||
CALL dev_memcpy( c0_d, c0_bgrp )
|
||||
!
|
||||
! Autopilot (Dynamic Rules) Implimentation
|
||||
!
|
||||
|
@ -322,13 +326,6 @@ SUBROUTINE cprmain( tau_out, fion_out, etot_out )
|
|||
!
|
||||
!=======================================================================
|
||||
!
|
||||
IF( force_pairing ) THEN
|
||||
c0_bgrp(:,iupdwn(2):nbsp) = c0_bgrp(:,1:nupdwn(2))
|
||||
cm_bgrp(:,iupdwn(2):nbsp) = cm_bgrp(:,1:nupdwn(2))
|
||||
phi_bgrp(:,iupdwn(2):nbsp) = phi_bgrp(:,1:nupdwn(2))
|
||||
lambda(:,:, 2) = lambda(:,:, 1)
|
||||
ENDIF
|
||||
!
|
||||
! ... fake electronic kinetic energy
|
||||
!
|
||||
IF ( .NOT. tcg ) THEN
|
||||
|
@ -339,9 +336,8 @@ SUBROUTINE cprmain( tau_out, fion_out, etot_out )
|
|||
!
|
||||
END IF
|
||||
!
|
||||
CALL move_electrons( nfi, tfirst, tlast, bg(:,1), bg(:,2), bg(:,3), &
|
||||
fion, c0_bgrp, cm_bgrp, phi_bgrp, &
|
||||
enthal, enb, enbi, fccc, ccc, dt2bye, stress, .false. )
|
||||
CALL move_electrons( nfi, tprint, tfirst, tlast, bg(:,1), bg(:,2), bg(:,3), &
|
||||
fion, enthal, enb, enbi, fccc, ccc, dt2bye, stress, .false. )
|
||||
!
|
||||
IF (lda_plus_u) fion = fion + forceh
|
||||
!
|
||||
|
@ -537,6 +533,7 @@ SUBROUTINE cprmain( tau_out, fion_out, etot_out )
|
|||
! ... prefor calculates vkb
|
||||
!
|
||||
CALL prefor( eigr, vkb )
|
||||
CALL dev_memcpy( vkb_d, vkb )
|
||||
!
|
||||
END IF
|
||||
!
|
||||
|
@ -553,13 +550,17 @@ SUBROUTINE cprmain( tau_out, fion_out, etot_out )
|
|||
!
|
||||
IF ( tortho ) THEN
|
||||
!
|
||||
CALL ortho( eigr, cm_bgrp, phi_bgrp, lambda, idesc, bigr, iter, ccc, bephi, becp_bgrp )
|
||||
#if defined (__CUDA)
|
||||
CALL ortho( vkb_d, cm_d, phi, lambda, idesc, bigr, iter, ccc, bephi, becp_bgrp )
|
||||
#else
|
||||
CALL ortho( vkb, cm_bgrp, phi, lambda, idesc, bigr, iter, ccc, bephi, becp_bgrp )
|
||||
#endif
|
||||
!
|
||||
ELSE
|
||||
!
|
||||
CALL gram_bgrp( vkb, bec_bgrp, nkb, cm_bgrp, ngw )
|
||||
!
|
||||
IF ( iverbosity > 2 ) CALL dotcsc( eigr, cm_bgrp, ngw, nbsp_bgrp )
|
||||
IF ( iverbosity > 2 ) CALL dotcsc( vkb, cm_bgrp, ngw, nbsp_bgrp )
|
||||
!
|
||||
END IF
|
||||
!
|
||||
|
@ -568,23 +569,31 @@ SUBROUTINE cprmain( tau_out, fion_out, etot_out )
|
|||
IF ( iverbosity > 1 ) CALL laxlib_print_matrix( lambda, idesc, nbsp, 9, nudx, 1.D0, ionode, stdout )
|
||||
!
|
||||
IF ( tortho ) THEN
|
||||
CALL updatc( ccc, lambda, phi_bgrp, bephi, becp_bgrp, bec_bgrp, cm_bgrp, idesc )
|
||||
#if defined (__CUDA)
|
||||
CALL updatc( ccc, lambda, phi, bephi, becp_bgrp, bec_d, cm_d, idesc )
|
||||
CALL dev_memcpy( bec_bgrp, bec_d )
|
||||
CALL dev_memcpy( cm_bgrp, cm_d )
|
||||
#else
|
||||
CALL updatc( ccc, lambda, phi, bephi, becp_bgrp, bec_bgrp, cm_bgrp, idesc )
|
||||
#endif
|
||||
END IF
|
||||
!
|
||||
IF( force_pairing ) THEN
|
||||
c0_bgrp(:,iupdwn(2):nbsp) = c0_bgrp(:,1:nupdwn(2))
|
||||
cm_bgrp(:,iupdwn(2):nbsp) = cm_bgrp(:,1:nupdwn(2))
|
||||
phi_bgrp(:,iupdwn(2):nbsp) = phi_bgrp(:,1:nupdwn(2))
|
||||
c0_bgrp(:,iupdwn(2):nbsp) = c0_bgrp(:,1:nupdwn(2))
|
||||
cm_bgrp(:,iupdwn(2):nbsp) = cm_bgrp(:,1:nupdwn(2))
|
||||
!phi(:,iupdwn(2):nbsp) = phi(:,1:nupdwn(2))
|
||||
CALL dev_memcpy(phi(:,iupdwn(2):), phi, [1, ubound(phi)], 1, [1, nbsp])
|
||||
lambda(:,:, 2) = lambda(:,:, 1)
|
||||
ENDIF
|
||||
!
|
||||
CALL calbec_bgrp( 1, nsp, eigr, cm_bgrp, bec_bgrp, 1 )
|
||||
! the following compute only on NC pseudo components
|
||||
CALL calbec( nbsp_bgrp, vkb, cm_bgrp, bec_bgrp, 1 )
|
||||
!
|
||||
IF ( tpre ) THEN
|
||||
CALL caldbec_bgrp( eigr, cm_bgrp, dbec, idesc )
|
||||
END IF
|
||||
!
|
||||
IF ( iverbosity > 1 ) CALL dotcsc( eigr, cm_bgrp, ngw, nbsp_bgrp )
|
||||
IF ( iverbosity > 1 ) CALL dotcsc( vkb, cm_bgrp, ngw, nbsp_bgrp )
|
||||
!
|
||||
END IF
|
||||
!
|
||||
|
@ -801,7 +810,7 @@ SUBROUTINE cprmain( tau_out, fion_out, etot_out )
|
|||
! ... restart with CP
|
||||
!
|
||||
IF ( okvan .or. nlcc_any ) THEN
|
||||
CALL initbox( tau0, alat, at, ainv, taub, irb )
|
||||
CALL initbox( tau0, alat, at, ainv, taub, irb, iabox, nabox )
|
||||
CALL phbox( taub, iverbosity, eigrb )
|
||||
END IF
|
||||
CALL r_to_s( tau0, taus, nat, ainv )
|
||||
|
@ -816,9 +825,8 @@ SUBROUTINE cprmain( tau_out, fion_out, etot_out )
|
|||
!
|
||||
lambdam = lambda
|
||||
!
|
||||
CALL move_electrons( nfi, tfirst, tlast, bg(:,1), bg(:,2), bg(:,3),&
|
||||
fion, c0_bgrp, cm_bgrp, phi_bgrp, enthal, enb,&
|
||||
enbi, fccc, ccc, dt2bye, stress,.true. )
|
||||
CALL move_electrons( nfi, tprint, tfirst, tlast, bg(:,1), bg(:,2), bg(:,3),&
|
||||
fion, enthal, enb, enbi, fccc, ccc, dt2bye, stress,.true. )
|
||||
!
|
||||
END IF
|
||||
!
|
||||
|
@ -1094,6 +1102,7 @@ SUBROUTINE terminate_run()
|
|||
CALL print_clock( 'nlfq' )
|
||||
CALL print_clock( 'nlsm1' )
|
||||
CALL print_clock( 'nlsm2' )
|
||||
CALL print_clock( 'nlsm1us' )
|
||||
CALL print_clock( 'fft' )
|
||||
CALL print_clock( 'ffts' )
|
||||
CALL print_clock( 'fftw' )
|
||||
|
@ -1114,6 +1123,7 @@ SUBROUTINE terminate_run()
|
|||
CALL print_clock( 'new_ns' )
|
||||
CALL print_clock( 'strucf' )
|
||||
CALL print_clock( 'calbec' )
|
||||
CALL print_clock( 'exch_corr' )
|
||||
!==============================================================
|
||||
IF (ts_vdw) THEN
|
||||
WRITE( stdout, '(/5x,"Called by tsvdw:")' )
|
||||
|
|
|
@ -15,11 +15,13 @@ SUBROUTINE deallocate_modules_var()
|
|||
!
|
||||
USE core, ONLY : deallocate_core
|
||||
USE uspp, ONLY : deallocate_uspp
|
||||
USE uspp_gpum, ONLY : deallocate_uspp_gpu
|
||||
USE electrons_base, ONLY : deallocate_elct
|
||||
USE efield_module, ONLY : deallocate_efield
|
||||
USE ensemble_dft, ONLY : deallocate_ensemble_dft
|
||||
USE cg_module, ONLY : deallocate_cg
|
||||
USE gvect, ONLY : deallocate_gvect
|
||||
USE gvect_gpum, ONLY : deallocate_gvect_gpu
|
||||
USE gvecw, ONLY : deallocate_gvecw
|
||||
USE smallbox_gvec, ONLY : deallocate_smallbox_gvec
|
||||
USE local_pseudo, ONLY : deallocate_local_pseudo
|
||||
|
@ -59,8 +61,10 @@ SUBROUTINE deallocate_modules_var()
|
|||
CALL deallocate_cg( )
|
||||
CALL deallocate_core()
|
||||
CALL deallocate_uspp()
|
||||
CALL deallocate_uspp_gpu()
|
||||
CALL deallocate_gvect(.TRUE.) ! Value .true. is hard coded in init.f90:195,
|
||||
! here it prevents double free of gg variable.
|
||||
CALL deallocate_gvect_gpu()
|
||||
CALL deallocate_gvecw()
|
||||
CALL deallocate_smallbox_gvec( )
|
||||
CALL deallocate_local_pseudo()
|
||||
|
|
|
@ -63,6 +63,8 @@
|
|||
complex(DP), ALLOCATABLE :: self_rhog( :,: )
|
||||
LOGICAL :: ttsic
|
||||
real(DP) :: detmp(3,3)
|
||||
|
||||
CALL start_clock( 'exch_corr' )
|
||||
!
|
||||
! filling of gradr with the gradient of rho using fft's
|
||||
!
|
||||
|
@ -246,6 +248,8 @@
|
|||
|
||||
IF( ALLOCATED( gradr ) ) DEALLOCATE( gradr )
|
||||
|
||||
CALL stop_clock( 'exch_corr' )
|
||||
|
||||
5555 format(1x,f12.5,1x,f12.5,1x,f12.5/ &
|
||||
& 1x,f12.5,1x,f12.5,1x,f12.5/ &
|
||||
& 1x,f12.5,1x,f12.5,1x,f12.5//)
|
||||
|
@ -511,29 +515,29 @@ subroutine exch_corr_cp(nnr,nspin,grhor,rhor,etxc)
|
|||
! h contains D(rho*Exc)/D(|grad rho|) * (grad rho) / |grad rho|
|
||||
!
|
||||
!$omp parallel default(none), shared(nnr,grhor,h), private(ipol,k)
|
||||
!$omp do collapse(2)
|
||||
do ipol = 1, 3
|
||||
!$omp do
|
||||
do k = 1, nnr
|
||||
grhor (ipol, k, 1) = h (k, 1, 1) * grhor (ipol, k, 1)
|
||||
enddo
|
||||
!$omp end do
|
||||
end do
|
||||
!$omp end do
|
||||
!$omp end parallel
|
||||
!
|
||||
!
|
||||
else
|
||||
!
|
||||
!$omp parallel default(none), shared(nnr,grhor,h), private(ipol,k,grup,grdw)
|
||||
!$omp do collapse(2)
|
||||
do ipol = 1, 3
|
||||
!$omp do
|
||||
do k = 1, nnr
|
||||
grup = grhor (ipol, k, 1)
|
||||
grdw = grhor (ipol, k, 2)
|
||||
grhor (ipol, k, 1) = h (k, 1, 1) * grup + h (k, 1, 2) * grdw
|
||||
grhor (ipol, k, 2) = h (k, 2, 2) * grdw + h (k, 2, 1) * grup
|
||||
enddo
|
||||
!$omp end do
|
||||
enddo
|
||||
!$omp end do
|
||||
!$omp end parallel
|
||||
!
|
||||
end if
|
||||
|
|
|
@ -80,21 +80,14 @@
|
|||
!=======================================================================
|
||||
|
||||
nogrp_ = fftx_ntgrp(dffts)
|
||||
IF( nogrp_ > 1 ) THEN
|
||||
CALL errore('dforce','Task group not supported',1)
|
||||
END IF
|
||||
ALLOCATE( psi( dffts%nnr_tg ) )
|
||||
!
|
||||
#if defined(__MPI)
|
||||
|
||||
CALL c2psi_gamma_tg( dffts, psi, c, i, n )
|
||||
|
||||
CALL invfft('tgWave', psi, dffts)
|
||||
|
||||
#else
|
||||
|
||||
CALL c2psi_gamma( dffts, psi, c(:,i), c(:,i+1) )
|
||||
!
|
||||
CALL invfft( 'Wave', psi, dffts )
|
||||
|
||||
#endif
|
||||
!
|
||||
! the following avoids a potential out-of-bounds error
|
||||
!
|
||||
|
@ -213,11 +206,7 @@
|
|||
!
|
||||
END IF
|
||||
!
|
||||
#if defined(__MPI)
|
||||
CALL fwfft( 'tgWave', psi, dffts )
|
||||
#else
|
||||
CALL fwfft( 'Wave', psi, dffts )
|
||||
#endif
|
||||
!
|
||||
! note : the factor 0.5 appears
|
||||
! in the kinetic energy because it is defined as 0.5*g**2
|
||||
|
@ -343,3 +332,231 @@
|
|||
!
|
||||
RETURN
|
||||
END SUBROUTINE dforce_x
|
||||
|
||||
#if defined (__CUDA)
|
||||
!-------------------------------------------------------------------------
|
||||
SUBROUTINE dforce_gpu_x ( i, bec, vkb, c, df, da, v, ldv, ispin, f, n, nspin )
|
||||
!-----------------------------------------------------------------------
|
||||
!computes: the generalized force df=cmplx(dfr,dfi,kind=DP) acting on the i-th
|
||||
! electron state at the gamma point of the brillouin zone
|
||||
! represented by the vector c=cmplx(cr,ci,kind=DP)
|
||||
!
|
||||
! d_n(g) = f_n { 0.5 g^2 c_n(g) + [vc_n](g) +
|
||||
! sum_i,ij d^q_i,ij (-i)**l beta_i,i(g)
|
||||
! e^-ig.r_i < beta_i,j | c_n >}
|
||||
!
|
||||
USE parallel_include
|
||||
USE kinds, ONLY: dp
|
||||
USE control_flags, ONLY: iprint
|
||||
USE uspp, ONLY: nhsa=>nkb, dvan, deeq, indv_ijkb0
|
||||
USE uspp_param, ONLY: nhm, nh
|
||||
USE constants, ONLY: pi, fpi
|
||||
USE ions_base, ONLY: nsp, na, nat, ityp
|
||||
USE gvecw, ONLY: ngw, g2kin_d
|
||||
USE cell_base, ONLY: tpiba2
|
||||
USE ensemble_dft, ONLY: tens
|
||||
USE xc_lib, ONLY: xclib_dft_is, exx_is_active
|
||||
USE fft_base, ONLY: dffts
|
||||
USE fft_interfaces, ONLY: fwfft, invfft
|
||||
USE mp_global, ONLY: me_bgrp
|
||||
USE control_flags, ONLY: many_fft
|
||||
USE fft_helper_subroutines
|
||||
USE device_memcpy_m, ONLY : dev_memcpy
|
||||
USE cudafor
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
INTEGER, INTENT(IN) :: i
|
||||
REAL(DP) :: bec(:,:)
|
||||
COMPLEX(DP), DEVICE :: vkb(:,:)
|
||||
COMPLEX(DP), DEVICE :: c(:,:)
|
||||
COMPLEX(DP) :: df(:), da(:)
|
||||
INTEGER, INTENT(IN) :: ldv
|
||||
REAL(DP), DEVICE :: v( :, : )
|
||||
INTEGER :: ispin( : )
|
||||
REAL(DP) :: f( : )
|
||||
INTEGER, INTENT(IN) :: n, nspin
|
||||
!
|
||||
! local variables
|
||||
!
|
||||
INTEGER :: iv, jv, ia, is, iss1, iss2, ir, ig, inl, jnl
|
||||
INTEGER :: igno, igrp, ierr, ii
|
||||
INTEGER :: idx, ioff
|
||||
REAL(DP) :: fi, fip, dd, dv
|
||||
COMPLEX(DP) :: fp, fm
|
||||
complex(DP), parameter :: ci=(0.0d0,1.0d0)
|
||||
|
||||
REAL(DP), ALLOCATABLE :: af( :, : ), aa( :, : )
|
||||
REAL(DP), ALLOCATABLE, DEVICE :: af_d( :, : ), aa_d( :, : )
|
||||
COMPLEX(DP), ALLOCATABLE, DEVICE :: psi(:)
|
||||
COMPLEX(DP), ALLOCATABLE, DEVICE :: df_d(:)
|
||||
COMPLEX(DP), ALLOCATABLE, DEVICE :: da_d(:)
|
||||
INTEGER, DEVICE, POINTER :: nl_d(:), nlm_d(:)
|
||||
!
|
||||
CALL start_clock( 'dforce' )
|
||||
!
|
||||
IF(xclib_dft_is('hybrid').AND.exx_is_active()) THEN
|
||||
CALL errore(' dforce ', ' dft_is_hybrid and exx_is_active NOT implemented ', 1 )
|
||||
END IF
|
||||
|
||||
ALLOCATE( psi( dffts%nnr * many_fft ) )
|
||||
ALLOCATE( df_d( SIZE( df ) ) )
|
||||
ALLOCATE( da_d( SIZE( da ) ) )
|
||||
!
|
||||
psi = 0.0d0
|
||||
nl_d => dffts%nl_d
|
||||
nlm_d => dffts%nlm_d
|
||||
|
||||
ioff = 0
|
||||
DO ii = i, i + 2 * many_fft - 1, 2
|
||||
IF( ii < n ) THEN
|
||||
!$cuf kernel do(1)
|
||||
do ig = 1, dffts%ngw
|
||||
psi( nlm_d( ig ) + ioff) = CONJG( c( ig, ii ) ) + ci * conjg( c( ig, ii+1))
|
||||
psi( nl_d( ig ) + ioff) = c( ig, ii ) + ci * c( ig, ii+1)
|
||||
end do
|
||||
ELSE IF( ii == n ) THEN
|
||||
!$cuf kernel do(1)
|
||||
do ig = 1, dffts%ngw
|
||||
psi( nlm_d( ig ) + ioff) = CONJG( c( ig, ii ) )
|
||||
psi( nl_d( ig ) + ioff) = c( ig, ii )
|
||||
end do
|
||||
END IF
|
||||
ioff = ioff + dffts%nnr
|
||||
END DO
|
||||
!
|
||||
CALL invfft( 'Wave', psi, dffts, many_fft )
|
||||
!
|
||||
ioff = 0
|
||||
DO ii = i, i + 2 * many_fft - 1, 2
|
||||
IF( ii < n ) THEN
|
||||
iss1=ispin( ii )
|
||||
iss2=ispin( ii + 1 )
|
||||
!$cuf kernel do(1)
|
||||
DO ir=1,dffts%nnr
|
||||
psi(ir+ioff)=CMPLX( v(ir,iss1)* DBLE(psi(ir+ioff)), &
|
||||
v(ir,iss2)*AIMAG(psi(ir+ioff)) ,kind=DP)
|
||||
END DO
|
||||
ELSE IF( ii == n ) THEN
|
||||
iss1=ispin( ii )
|
||||
iss2=iss1
|
||||
!$cuf kernel do(1)
|
||||
DO ir=1,dffts%nnr
|
||||
psi(ir+ioff)=CMPLX( v(ir,iss1)* DBLE(psi(ir+ioff)), &
|
||||
v(ir,iss2)*AIMAG(psi(ir+ioff)) ,kind=DP)
|
||||
END DO
|
||||
END IF
|
||||
ioff = ioff + dffts%nnr
|
||||
END DO
|
||||
|
||||
CALL fwfft( 'Wave', psi, dffts, many_fft )
|
||||
|
||||
igno = 0
|
||||
ioff = 0
|
||||
DO idx = 1, 2 * many_fft, 2
|
||||
|
||||
IF( idx + i - 1 <= n ) THEN
|
||||
if (tens) then
|
||||
fi = -0.5d0
|
||||
fip = -0.5d0
|
||||
else
|
||||
fi = -0.5d0*f(i+idx-1)
|
||||
fip = -0.5d0*f(i+idx)
|
||||
endif
|
||||
CALL fftx_psi2c_gamma_gpu( dffts, psi( 1+ioff : ioff+dffts%nnr ), df_d(1+igno:igno+ngw), da_d(1+igno:igno+ngw))
|
||||
!$cuf kernel do(1)
|
||||
DO ig=1,ngw
|
||||
df_d(ig+igno)= fi*(tpiba2*g2kin_d(ig)* c(ig,idx+i-1)+df_d(ig+igno))
|
||||
da_d(ig+igno)=fip*(tpiba2*g2kin_d(ig)* c(ig,idx+i )+da_d(ig+igno))
|
||||
END DO
|
||||
END IF
|
||||
|
||||
igno = igno + ngw
|
||||
ioff = ioff + dffts%nnr
|
||||
|
||||
ENDDO
|
||||
|
||||
!
|
||||
|
||||
IF( nhsa > 0 ) THEN
|
||||
!
|
||||
! aa_i,i,n = sum_j d_i,ij <beta_i,j|c_n>
|
||||
!
|
||||
ALLOCATE( af( nhsa, many_fft ), aa( nhsa, many_fft ) )
|
||||
ALLOCATE( af_d( nhsa, many_fft ), aa_d( nhsa, many_fft ) )
|
||||
!
|
||||
!$omp parallel do default(none), &
|
||||
!$omp shared(many_fft,i,n,tens,f,nat,ityp,nh,dvan,indv_ijkb0,deeq,af,aa,bec,ispin), &
|
||||
!$omp private(idx,igrp,fi,fip,ia,is,iv,jv,inl,jnl,dv,dd,iss1,iss2)
|
||||
DO idx = 1, 2*many_fft , 2
|
||||
|
||||
igrp = idx/2+1
|
||||
af(:,igrp) = 0.0d0
|
||||
aa(:,igrp) = 0.0d0
|
||||
|
||||
IF( idx + i - 1 <= n ) THEN
|
||||
|
||||
IF( i + idx - 1 /= n ) THEN
|
||||
fi = f(i+idx-1)
|
||||
fip= f(i+idx)
|
||||
iss1=ispin( i+idx-1 )
|
||||
iss2=ispin( i+idx )
|
||||
ELSE
|
||||
fi = f(i+idx-1)
|
||||
iss1=ispin( i+idx-1 )
|
||||
END IF
|
||||
IF (tens) THEN
|
||||
fi = 1.0d0
|
||||
fip= 1.0d0
|
||||
ENDIF
|
||||
!
|
||||
DO ia = 1, nat
|
||||
is = ityp(ia)
|
||||
DO iv = 1, nh(is)
|
||||
DO jv = 1, nh(is)
|
||||
dv = dvan(iv,jv,is)
|
||||
inl = indv_ijkb0(ia) + iv
|
||||
jnl = indv_ijkb0(ia) + jv
|
||||
IF( i + idx - 1 /= n ) THEN
|
||||
dd = deeq(iv,jv,ia,iss1) + dv
|
||||
af(inl,igrp) = af(inl,igrp) - fi * dd * bec(jnl,i+idx-1)
|
||||
dd = deeq(iv,jv,ia,iss2) + dv
|
||||
aa(inl,igrp) = aa(inl,igrp) - fip * dd * bec(jnl,i+idx)
|
||||
ELSE
|
||||
dd = deeq(iv,jv,ia,iss1) + dv
|
||||
af(inl,igrp) = af(inl,igrp) - fi * dd * bec(jnl,i+idx-1)
|
||||
END IF
|
||||
END DO
|
||||
END DO
|
||||
END DO
|
||||
END IF
|
||||
END DO
|
||||
!$omp end parallel do
|
||||
|
||||
CALL dev_memcpy( af_d, af )
|
||||
CALL dev_memcpy( aa_d, aa )
|
||||
|
||||
IF( ngw > 0 ) THEN
|
||||
CALL MYDGEMM ( 'N', 'N', 2*ngw, many_fft , nhsa, 1.0d0, vkb, 2*ngw, af_d, nhsa, 1.0d0, df_d, 2*ngw)
|
||||
CALL MYDGEMM ( 'N', 'N', 2*ngw, many_fft , nhsa, 1.0d0, vkb, 2*ngw, aa_d, nhsa, 1.0d0, da_d, 2*ngw)
|
||||
END IF
|
||||
!
|
||||
DEALLOCATE( aa, af )
|
||||
DEALLOCATE( aa_d, af_d )
|
||||
!
|
||||
ENDIF
|
||||
|
||||
CALL dev_memcpy( df, df_d )
|
||||
CALL dev_memcpy( da, da_d )
|
||||
!
|
||||
DEALLOCATE( df_d )
|
||||
DEALLOCATE( da_d )
|
||||
DEALLOCATE( psi )
|
||||
NULLIFY(nl_d)
|
||||
NULLIFY(nlm_d)
|
||||
!
|
||||
CALL stop_clock( 'dforce' )
|
||||
!
|
||||
RETURN
|
||||
END SUBROUTINE dforce_gpu_x
|
||||
#endif
|
||||
|
|
|
@ -30,6 +30,7 @@ SUBROUTINE from_scratch( )
|
|||
USE energies, ONLY : dft_energy_type, debug_energies
|
||||
USE dener, ONLY : denl, denl6, dekin6, detot
|
||||
USE uspp, ONLY : vkb, becsum, deeq, nkb, okvan, nlcc_any
|
||||
USE uspp_gpum, ONLY : vkb_d
|
||||
USE io_global, ONLY : stdout, ionode
|
||||
USE core, ONLY : rhoc
|
||||
USE gvecw, ONLY : ngw
|
||||
|
@ -43,21 +44,24 @@ SUBROUTINE from_scratch( )
|
|||
USE cp_interfaces, ONLY : runcp_uspp, runcp_uspp_force_pairing, &
|
||||
strucf, phfacs, nlfh, vofrho, nlfl_bgrp, prefor
|
||||
USE cp_interfaces, ONLY : rhoofr, ortho, wave_rand_init, elec_fakekine
|
||||
USE cp_interfaces, ONLY : compute_stress, dotcsc, calbec_bgrp, caldbec_bgrp
|
||||
USE cp_interfaces, ONLY : compute_stress, dotcsc, calbec, caldbec_bgrp
|
||||
USE cp_interfaces, ONLY : nlfq_bgrp
|
||||
USE printout_base, ONLY : printout_pos
|
||||
USE orthogonalize_base, ONLY : updatc, calphi_bgrp
|
||||
USE wave_base, ONLY : wave_steepest
|
||||
USE wavefunctions, ONLY : c0_bgrp, cm_bgrp, phi_bgrp
|
||||
USE wavefunctions, ONLY : c0_bgrp, cm_bgrp, c0_d, phi, cm_d
|
||||
USE fft_base, ONLY : dfftp, dffts
|
||||
USE time_step, ONLY : delt
|
||||
USE cp_main_variables, ONLY : idesc, bephi, becp_bgrp, nfi, &
|
||||
sfac, eigr, taub, irb, eigrb, bec_bgrp, &
|
||||
USE cp_main_variables, ONLY : idesc, bephi, becp_bgrp, nfi, iabox, nabox, &
|
||||
sfac, eigr, taub, irb, eigrb, bec_bgrp, bec_d, &
|
||||
lambda, lambdam, lambdap, ema0bg, rhog, rhor, rhos, &
|
||||
vpot, ht0, edft, becdr_bgrp, dbec, drhor, drhog
|
||||
USE mp_global, ONLY : inter_bgrp_comm, nbgrp, me_bgrp
|
||||
USE mp, ONLY : mp_sum
|
||||
USE mp_world, ONLY : mpime, world_comm
|
||||
USE mp, ONLY : mp_sum, mp_barrier
|
||||
USE matrix_inversion
|
||||
USE device_memcpy_m, ONLY : dev_memcpy
|
||||
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
|
@ -115,7 +119,7 @@ SUBROUTINE from_scratch( )
|
|||
CALL strucf( sfac, eigts1, eigts2, eigts3, mill, dffts%ngm )
|
||||
!
|
||||
IF ( okvan .OR. nlcc_any ) THEN
|
||||
CALL initbox ( tau0, alat, at, ainv, taub, irb )
|
||||
CALL initbox ( tau0, alat, at, ainv, taub, irb, iabox, nabox )
|
||||
CALL phbox( taub, iverbosity, eigrb )
|
||||
END IF
|
||||
!
|
||||
|
@ -133,6 +137,7 @@ SUBROUTINE from_scratch( )
|
|||
! ... prefor calculates vkb (used by gram)
|
||||
!
|
||||
CALL prefor( eigr, vkb )
|
||||
CALL dev_memcpy( vkb_d, vkb )
|
||||
!
|
||||
nspin_wfc = nspin
|
||||
IF( force_pairing ) nspin_wfc = 1
|
||||
|
@ -141,7 +146,9 @@ SUBROUTINE from_scratch( )
|
|||
|
||||
IF( force_pairing ) cm_bgrp(:,iupdwn(2):iupdwn(2)+nupdwn(2)-1) = cm_bgrp(:,1:nupdwn(2))
|
||||
!
|
||||
if( iverbosity > 1 ) CALL dotcsc( eigr, cm_bgrp, ngw, nbsp )
|
||||
if( iverbosity > 1 ) CALL dotcsc( vkb, cm_bgrp, ngw, nbsp )
|
||||
!
|
||||
CALL dev_memcpy( cm_d, cm_bgrp )
|
||||
!
|
||||
! ... initialize bands
|
||||
!
|
||||
|
@ -188,11 +195,11 @@ SUBROUTINE from_scratch( )
|
|||
!
|
||||
IF( .NOT. tcg ) THEN
|
||||
!
|
||||
CALL calbec_bgrp ( 1, nsp, eigr, cm_bgrp, bec_bgrp )
|
||||
CALL calbec ( nbsp_bgrp, vkb, cm_bgrp, bec_bgrp, 0 )
|
||||
!
|
||||
if ( tstress ) CALL caldbec_bgrp( eigr, cm_bgrp, dbec, idesc )
|
||||
!
|
||||
CALL rhoofr( nfi, cm_bgrp, irb, eigrb, bec_bgrp, dbec, becsum, rhor, drhor, rhog, drhog, rhos, enl, denl, ekin, dekin6 )
|
||||
CALL rhoofr( nfi, cm_bgrp, cm_d, bec_bgrp, dbec, becsum, rhor, drhor, rhog, drhog, rhos, enl, denl, ekin, dekin6 )
|
||||
!
|
||||
edft%enl = enl
|
||||
edft%ekin = ekin
|
||||
|
@ -201,7 +208,7 @@ SUBROUTINE from_scratch( )
|
|||
!
|
||||
! put core charge (if present) in rhoc(r)
|
||||
!
|
||||
if ( nlcc_any ) CALL set_cc( irb, eigrb, rhoc )
|
||||
if ( nlcc_any ) CALL set_cc( rhoc )
|
||||
!
|
||||
IF( .NOT. tcg ) THEN
|
||||
|
||||
|
@ -229,7 +236,7 @@ SUBROUTINE from_scratch( )
|
|||
if( iverbosity > 1 ) &
|
||||
CALL printout_pos( stdout, fion, nat, ityp, head = ' fion ' )
|
||||
|
||||
CALL newd( vpot, irb, eigrb, becsum, fion )
|
||||
CALL newd( vpot, becsum, fion, .true. )
|
||||
!
|
||||
IF( force_pairing ) THEN
|
||||
!
|
||||
|
@ -240,26 +247,42 @@ SUBROUTINE from_scratch( )
|
|||
!
|
||||
ELSE
|
||||
!
|
||||
CALL runcp_uspp( nfi, fccc, ccc, ema0bg, dt2bye, rhos, bec_bgrp, cm_bgrp, c0_bgrp, fromscra = .TRUE. )
|
||||
CALL runcp_uspp( nfi, fccc, ccc, ema0bg, dt2bye, rhos, bec_bgrp, cm_bgrp, cm_d, c0_bgrp, c0_d, fromscra = .TRUE. )
|
||||
!
|
||||
ENDIF
|
||||
!
|
||||
CALL dev_memcpy( c0_d, c0_bgrp ) ! c0 contains the updated wave functions
|
||||
!
|
||||
! nlfq needs deeq bec
|
||||
!
|
||||
IF( ttforce ) THEN
|
||||
CALL nlfq_bgrp( cm_bgrp, eigr, bec_bgrp, becdr_bgrp, fion )
|
||||
#if defined (__CUDA)
|
||||
CALL nlfq_bgrp( cm_d, vkb_d, bec_bgrp, becdr_bgrp, fion )
|
||||
#else
|
||||
CALL nlfq_bgrp( cm_bgrp, vkb, bec_bgrp, becdr_bgrp, fion )
|
||||
#endif
|
||||
END IF
|
||||
!
|
||||
! calphi calculates phi
|
||||
! the electron mass rises with g**2
|
||||
!
|
||||
CALL calphi_bgrp( cm_bgrp, ngw, bec_bgrp, nkb, vkb, phi_bgrp, nbspx_bgrp, ema0bg )
|
||||
#if defined (__CUDA)
|
||||
CALL calphi_bgrp( cm_d, ngw, bec_bgrp, nkb, vkb_d, phi, nbspx_bgrp, ema0bg )
|
||||
#else
|
||||
CALL calphi_bgrp( cm_bgrp, ngw, bec_bgrp, nkb, vkb, phi, nbspx_bgrp, ema0bg )
|
||||
#endif
|
||||
!
|
||||
IF( force_pairing ) THEN
|
||||
! phi( :, iupdwn(2):(iupdwn(2)+nupdwn(2)-1) ) = phi( :, 1:nupdwn(2))
|
||||
CALL dev_memcpy(phi(:,iupdwn(2):), phi(:,1:), [1, ngw], 1 , [1, nupdwn(2)], 1)
|
||||
END IF
|
||||
!
|
||||
IF( force_pairing ) &
|
||||
& phi_bgrp( :, iupdwn(2):(iupdwn(2)+nupdwn(2)-1) ) = phi_bgrp( :, 1:nupdwn(2))
|
||||
|
||||
if( tortho ) then
|
||||
CALL ortho( eigr, c0_bgrp, phi_bgrp, lambda, idesc, bigr, iter, ccc, bephi, becp_bgrp )
|
||||
#if defined (__CUDA)
|
||||
CALL ortho( vkb_d, c0_d, phi, lambda, idesc, bigr, iter, ccc, bephi, becp_bgrp )
|
||||
#else
|
||||
CALL ortho( vkb, c0_bgrp, phi, lambda, idesc, bigr, iter, ccc, bephi, becp_bgrp )
|
||||
#endif
|
||||
else
|
||||
CALL gram_bgrp( vkb, bec_bgrp, nkb, c0_bgrp, ngw )
|
||||
endif
|
||||
|
@ -275,23 +298,30 @@ SUBROUTINE from_scratch( )
|
|||
if ( tstress ) CALL nlfh( stress, bec_bgrp, dbec, lambda, idesc )
|
||||
!
|
||||
IF ( tortho ) THEN
|
||||
CALL updatc( ccc, lambda, phi_bgrp, bephi, becp_bgrp, bec_bgrp, c0_bgrp, idesc )
|
||||
#if defined (__CUDA)
|
||||
CALL updatc( ccc, lambda, phi, bephi, becp_bgrp, bec_d, c0_d, idesc )
|
||||
CALL dev_memcpy( c0_bgrp, c0_d )
|
||||
CALL dev_memcpy( bec_bgrp, bec_d )
|
||||
#else
|
||||
CALL updatc( ccc, lambda, phi, bephi, becp_bgrp, bec_bgrp, c0_bgrp, idesc )
|
||||
#endif
|
||||
END IF
|
||||
!
|
||||
IF( force_pairing ) THEN
|
||||
!
|
||||
c0_bgrp ( :, iupdwn(2):(iupdwn(2)+nupdwn(2)-1) ) = c0_bgrp( :, 1:nupdwn(2))
|
||||
phi_bgrp( :, iupdwn(2):(iupdwn(2)+nupdwn(2)-1) ) = phi_bgrp( :, 1:nupdwn(2))
|
||||
CALL dev_memcpy(phi(:,iupdwn(2):), phi(:,1:), [1, ngw], 1 , [1, nupdwn(2)], 1)
|
||||
!phi( :, iupdwn(2):(iupdwn(2)+nupdwn(2)-1) ) = phi( :, 1:nupdwn(2))
|
||||
lambda(:,:,2) = lambda(:,:,1)
|
||||
!
|
||||
ENDIF
|
||||
!
|
||||
!
|
||||
CALL calbec_bgrp ( 1, nsp, eigr, c0_bgrp, bec_bgrp, 1 )
|
||||
! the following compute only on NC pseudo components
|
||||
CALL calbec ( nbsp_bgrp, vkb, c0_bgrp, bec_bgrp, 1 )
|
||||
!
|
||||
if ( tstress ) CALL caldbec_bgrp( eigr, cm_bgrp, dbec, idesc )
|
||||
|
||||
if ( iverbosity > 1 ) CALL dotcsc( eigr, c0_bgrp, ngw, nbsp_bgrp )
|
||||
if ( iverbosity > 1 ) CALL dotcsc( vkb, c0_bgrp, ngw, nbsp_bgrp )
|
||||
!
|
||||
xnhp0 = 0.0d0
|
||||
xnhpm = 0.0d0
|
||||
|
@ -325,3 +355,10 @@ SUBROUTINE from_scratch( )
|
|||
RETURN
|
||||
!
|
||||
END SUBROUTINE from_scratch
|
||||
|
||||
subroutine hangup
|
||||
USE mp_world, ONLY : mpime, world_comm
|
||||
USE mp, ONLY : mp_sum, mp_barrier
|
||||
call mp_barrier(world_comm)
|
||||
CALL stop_cp_run()
|
||||
end subroutine
|
||||
|
|
242
CPV/src/gram.f90
242
CPV/src/gram.f90
|
@ -16,33 +16,47 @@ SUBROUTINE gram_bgrp( betae, bec_bgrp, nkbx, cp_bgrp, ngwx )
|
|||
USE gvecw, ONLY : ngw
|
||||
USE electrons_base, ONLY : nbspx_bgrp, ibgrp_g2l, nupdwn, iupdwn, nbspx, iupdwn_bgrp, nspin
|
||||
USE kinds, ONLY : DP
|
||||
USE mp_global, ONLY : inter_bgrp_comm
|
||||
USE mp, ONLY : mp_sum
|
||||
USE gvect, ONLY : gstart
|
||||
USE mp_global, ONLY : intra_bgrp_comm, inter_bgrp_comm, me_bgrp, nproc_bgrp
|
||||
USE mp_world, ONLY : mpime
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
INTEGER, INTENT(IN) :: nkbx, ngwx
|
||||
REAL(DP) :: bec_bgrp( nkbx, nbspx_bgrp )
|
||||
COMPLEX(DP) :: cp_bgrp( ngwx, nbspx_bgrp ), betae( ngwx, nkbx )
|
||||
COMPLEX(DP) :: cp_bgrp( ngwx, nbspx_bgrp )
|
||||
COMPLEX(DP), INTENT(IN) :: betae( ngwx, nkbx )
|
||||
!
|
||||
REAL(DP) :: anorm
|
||||
REAL(DP), ALLOCATABLE :: csc( : )
|
||||
COMPLEX(DP), ALLOCATABLE :: ctmp( : )
|
||||
REAL(DP), ALLOCATABLE :: temp(:)
|
||||
COMPLEX(DP), ALLOCATABLE :: cp_tmp(:)
|
||||
REAL(DP), ALLOCATABLE :: bec_tmp(:)
|
||||
REAL(DP), ALLOCATABLE :: csc2( : )
|
||||
INTEGER :: i,k,j, ig, ibgrp_k, ibgrp_i, nbgrp_im1, iss
|
||||
REAL(DP), PARAMETER :: one = 1.d0
|
||||
REAL(DP), PARAMETER :: mone = -1.d0
|
||||
REAL(DP) :: g0
|
||||
!
|
||||
CALL start_clock( 'gram' )
|
||||
|
||||
g0 = 0.0d0
|
||||
IF (gstart == 2) g0 = 1.0d0
|
||||
|
||||
ALLOCATE( csc( nbspx ) )
|
||||
ALLOCATE( ctmp( ngwx ) )
|
||||
ALLOCATE( cp_tmp( ngwx ) )
|
||||
ALLOCATE( bec_tmp( nkbx ) )
|
||||
ALLOCATE( csc2( SIZE( csc ) ) )
|
||||
!
|
||||
DO iss = 1, nspin
|
||||
DO i = iupdwn(iss), iupdwn(iss) + nupdwn(iss) - 1
|
||||
!
|
||||
ibgrp_i = ibgrp_g2l( i )
|
||||
!
|
||||
CALL gracsc_bgrp( bec_bgrp, betae, cp_bgrp, i, csc, iss, nbgrp_im1 )
|
||||
CALL gracsc_bgrp( i, csc, iss, nbgrp_im1 )
|
||||
!
|
||||
! calculate orthogonalized cp(i) : |cp(i)>=|cp(i)>-\sum_k<i csc(k)|cp(k)>
|
||||
!
|
||||
|
@ -60,14 +74,19 @@ SUBROUTINE gram_bgrp( betae, bec_bgrp, nkbx, cp_bgrp, ngwx )
|
|||
IF( ibgrp_i > 0 ) THEN
|
||||
cp_bgrp( :, ibgrp_i ) = ctmp
|
||||
anorm = cscnorm( bec_bgrp, cp_bgrp, ibgrp_i, nbspx_bgrp )
|
||||
CALL dscal( 2*ngw, 1.0d0/anorm, cp_bgrp(1,ibgrp_i), 1 )
|
||||
CALL dscal( nkbx, 1.0d0/anorm, bec_bgrp(1,ibgrp_i), 1 )
|
||||
cp_bgrp(:,ibgrp_i) = cp_bgrp(:,ibgrp_i) / anorm
|
||||
bec_bgrp(:,ibgrp_i) = bec_bgrp(:,ibgrp_i) / anorm
|
||||
!CALL dscal( 2*ngw, 1.0d0/anorm, cp_bgrp(1,ibgrp_i), 1 )
|
||||
!CALL dscal( nkbx, 1.0d0/anorm, bec_bgrp(1,ibgrp_i), 1 )
|
||||
END IF
|
||||
END DO
|
||||
END DO
|
||||
!
|
||||
DEALLOCATE( ctmp )
|
||||
DEALLOCATE( csc )
|
||||
DEALLOCATE( csc2 )
|
||||
DEALLOCATE( bec_tmp )
|
||||
DEALLOCATE( cp_tmp )
|
||||
|
||||
CALL stop_clock( 'gram' )
|
||||
!
|
||||
|
@ -84,7 +103,6 @@ CONTAINS
|
|||
!
|
||||
USE ions_base, ONLY: nat, ityp
|
||||
USE gvecw, ONLY: ngw
|
||||
USE gvect, ONLY: gstart
|
||||
USE uspp_param, ONLY: nh, upf
|
||||
USE uspp, ONLY: qq_nt, indv_ijkb0
|
||||
USE mp, ONLY: mp_sum
|
||||
|
@ -97,47 +115,39 @@ CONTAINS
|
|||
REAL(DP), INTENT(IN) :: bec( :, : )
|
||||
COMPLEX(DP), INTENT(IN) :: cp( :, : )
|
||||
!
|
||||
REAL(DP) :: cscnorm
|
||||
REAL(DP) :: cscnorm, ddot
|
||||
!
|
||||
INTEGER ig, is, iv, jv, ia, inl, jnl
|
||||
INTEGER :: is, iv, jv, ia, indv
|
||||
REAL(DP) rsum
|
||||
REAL(DP), ALLOCATABLE:: temp(:)
|
||||
!
|
||||
ALLOCATE(temp(ngw))
|
||||
rsum = 2.d0 * ddot(2*ngw,cp(1,i),1,cp(1,i),1) - g0 * REAL(CONJG(cp(1,i))*cp(1,i), DP)
|
||||
!
|
||||
DO ig=1,ngw
|
||||
temp(ig)=DBLE(CONJG(cp(ig,i))*cp(ig,i))
|
||||
DO ia=1,nat
|
||||
IF ( MOD( ia, nproc_bgrp ) == me_bgrp ) THEN
|
||||
is = ityp(ia)
|
||||
IF( upf(is)%tvanp ) THEN
|
||||
indv = indv_ijkb0(ia)
|
||||
DO iv=1,nh(is)
|
||||
DO jv=1,nh(is)
|
||||
IF(ABS(qq_nt(iv,jv,is)).GT.1.e-5) THEN
|
||||
rsum = rsum + qq_nt(iv,jv,is)*bec(indv+iv,i)*bec(indv+jv,i)
|
||||
ENDIF
|
||||
END DO
|
||||
END DO
|
||||
END IF
|
||||
END IF
|
||||
END DO
|
||||
rsum=2.d0*SUM(temp)
|
||||
IF (gstart == 2) rsum=rsum-temp(1)
|
||||
|
||||
CALL mp_sum( rsum, intra_bgrp_comm )
|
||||
!
|
||||
DO ia=1,nat
|
||||
is = ityp(ia)
|
||||
IF( upf(is)%tvanp ) THEN
|
||||
DO iv=1,nh(is)
|
||||
inl=indv_ijkb0(ia) + iv
|
||||
DO jv=1,nh(is)
|
||||
jnl=indv_ijkb0(ia) + jv
|
||||
IF(ABS(qq_nt(iv,jv,is)).GT.1.e-5) THEN
|
||||
rsum = rsum + qq_nt(iv,jv,is)*bec(inl,i)*bec(jnl,i)
|
||||
ENDIF
|
||||
END DO
|
||||
END DO
|
||||
END IF
|
||||
END DO
|
||||
!
|
||||
cscnorm=SQRT(rsum)
|
||||
|
||||
DEALLOCATE(temp)
|
||||
!
|
||||
RETURN
|
||||
END FUNCTION cscnorm
|
||||
!
|
||||
!
|
||||
!-------------------------------------------------------------------------
|
||||
SUBROUTINE gracsc_bgrp( bec_bgrp, betae, cp_bgrp, i, csc, iss, nk )
|
||||
SUBROUTINE gracsc_bgrp( i, csc, iss, nk )
|
||||
!-----------------------------------------------------------------------
|
||||
! requires in input the updated bec(k) for k<i
|
||||
! on output: bec(i) is recalculated
|
||||
|
@ -148,7 +158,6 @@ CONTAINS
|
|||
USE electrons_base, ONLY: ispin, ispin_bgrp, nbspx_bgrp, ibgrp_g2l, iupdwn, nupdwn, nbspx
|
||||
USE gvecw, ONLY: ngw
|
||||
USE mp, ONLY: mp_sum
|
||||
USE mp_global, ONLY: intra_bgrp_comm, inter_bgrp_comm, me_bgrp, nproc_bgrp
|
||||
USE kinds, ONLY: DP
|
||||
USE gvect, ONLY: gstart
|
||||
!
|
||||
|
@ -156,97 +165,67 @@ CONTAINS
|
|||
!
|
||||
INTEGER, INTENT(IN) :: i, iss
|
||||
INTEGER, INTENT(OUT) :: nk
|
||||
COMPLEX(DP) :: betae( :, : )
|
||||
REAL(DP) :: bec_bgrp( :, : )
|
||||
COMPLEX(DP) :: cp_bgrp( :, : )
|
||||
REAL(DP) :: csc( : )
|
||||
INTEGER :: k, kmax,ig, is, iv, jv, ia, inl, jnl, ibgrp_k, ibgrp_i
|
||||
REAL(DP) :: rsum
|
||||
REAL(DP), ALLOCATABLE :: temp(:)
|
||||
COMPLEX(DP), ALLOCATABLE :: cp_tmp(:)
|
||||
REAL(DP), ALLOCATABLE :: bec_tmp(:)
|
||||
REAL(DP), ALLOCATABLE :: csc2( : )
|
||||
#if defined(_OPENMP)
|
||||
INTEGER :: mytid, ntids, omp_get_thread_num, omp_get_num_threads
|
||||
#endif
|
||||
INTEGER :: k, kmax_bgrp, kmax,ig, is, iv, jv, ia, inl, jnl, ibgrp_k, ibgrp_i
|
||||
REAL(DP) :: rsum, ddot
|
||||
INTEGER :: omp_get_thread_num, omp_get_num_threads
|
||||
!
|
||||
! calculate csc(k)=<cp(i)|cp(k)>, k<i
|
||||
!
|
||||
kmax = i - 1
|
||||
!
|
||||
ALLOCATE( cp_tmp( ngwx ) )
|
||||
ALLOCATE( bec_tmp( nkbx ) )
|
||||
ALLOCATE( csc2( SIZE( csc ) ) )
|
||||
|
||||
|
||||
cp_tmp = 0.0d0
|
||||
csc = 0.0d0
|
||||
|
||||
ibgrp_i = ibgrp_g2l( i )
|
||||
IF( ibgrp_i > 0 ) cp_tmp = cp_bgrp( :, ibgrp_i )
|
||||
IF( ibgrp_i > 0 ) THEN
|
||||
cp_tmp = cp_bgrp( :, ibgrp_i )
|
||||
ELSE
|
||||
cp_tmp = 0.0d0
|
||||
END IF
|
||||
|
||||
CALL mp_sum( cp_tmp, inter_bgrp_comm )
|
||||
|
||||
!$omp parallel default(none), &
|
||||
!$omp shared(iupdwn,kmax,ispin,ibgrp_g2l,ngw,cp_bgrp,cp_tmp,csc,betae,bec_bgrp,i,iss,gstart), &
|
||||
!$omp shared(upf,nat,ityp,indv_ijkb0,nh), &
|
||||
!$omp private( temp, k, ig, inl, ibgrp_k, ibgrp_i, is, ia )
|
||||
ALLOCATE( temp( ngw ) )
|
||||
!$omp do
|
||||
kmax_bgrp = 0
|
||||
nk = 0
|
||||
DO k = iupdwn( iss ), kmax
|
||||
IF ( ispin(i) .EQ. ispin(k) ) THEN
|
||||
ibgrp_k = ibgrp_g2l( k )
|
||||
IF( ibgrp_k > 0 ) THEN
|
||||
DO ig = 1, ngw
|
||||
temp(ig) = DBLE( cp_bgrp(ig,ibgrp_k) * CONJG(cp_tmp(ig)) )
|
||||
END DO
|
||||
csc(k) = 2.0d0 * SUM(temp)
|
||||
IF (gstart == 2) csc(k) = csc(k) - temp(1)
|
||||
END IF
|
||||
ENDIF
|
||||
IF( ibgrp_g2l( k ) > 0 ) THEN
|
||||
kmax_bgrp = ibgrp_g2l( k )
|
||||
nk = nk + 1
|
||||
END IF
|
||||
END DO
|
||||
!$omp end do
|
||||
!
|
||||
!
|
||||
! calculate bec(i)=<cp(i)|beta>
|
||||
!
|
||||
ibgrp_i = ibgrp_g2l( i )
|
||||
!
|
||||
kmax_bgrp = kmax_bgrp - iupdwn_bgrp(iss) + 1
|
||||
|
||||
IF( kmax_bgrp > 0 .AND. ngw > 0 ) &
|
||||
CALL dgemv( 'T', 2*ngw, kmax_bgrp, 1.0d0, cp_bgrp(1,iupdwn_bgrp(iss)), 2*ngwx, cp_tmp, 1, 0.0d0, csc2, 1 )
|
||||
|
||||
nk = 0
|
||||
DO k = iupdwn( iss ), kmax
|
||||
ibgrp_k = ibgrp_g2l( k )
|
||||
IF( ibgrp_k > 0 ) THEN
|
||||
nk = nk + 1
|
||||
csc(k) = 2.0d0 * csc2(nk) - g0 * DBLE( cp_bgrp(1,ibgrp_k) * CONJG(cp_tmp(1)) )
|
||||
END IF
|
||||
END DO
|
||||
|
||||
IF( ibgrp_i > 0 ) THEN
|
||||
!$omp do
|
||||
DO ia = 1, nat
|
||||
is = ityp(ia)
|
||||
IF( upf(is)%tvanp ) THEN
|
||||
DO iv=1,nh(is)
|
||||
inl=indv_ijkb0(ia)+iv
|
||||
DO ig=1,ngw
|
||||
temp(ig) = DBLE( cp_bgrp(ig,ibgrp_i) * CONJG(betae(ig,inl)) )
|
||||
END DO
|
||||
bec_bgrp(inl,ibgrp_i)=2.d0*SUM(temp)
|
||||
IF (gstart == 2) bec_bgrp(inl,ibgrp_i)= bec_bgrp(inl,ibgrp_i)-temp(1)
|
||||
bec_tmp(inl) = 2.d0 * DDOT( 2*ngw, cp_bgrp(1,ibgrp_i), 1, betae(1,inl), 1) &
|
||||
- g0 * DBLE(cp_bgrp(1,ibgrp_i) * CONJG(betae(1,inl)))
|
||||
END DO
|
||||
ELSE
|
||||
inl= indv_ijkb0(ia)
|
||||
bec_tmp( inl + 1: inl + nh(is) ) = 0.0d0
|
||||
END IF
|
||||
END DO
|
||||
!$omp end do
|
||||
CALL mp_sum( bec_tmp, intra_bgrp_comm ) ! parallel sum over G vectors within a band group
|
||||
bec_bgrp( : , ibgrp_i ) = bec_tmp( : )
|
||||
ELSE
|
||||
bec_tmp = 0.0d0
|
||||
END IF
|
||||
DEALLOCATE( temp )
|
||||
!$omp end parallel
|
||||
|
||||
CALL mp_sum( csc, intra_bgrp_comm )
|
||||
CALL mp_sum( csc, inter_bgrp_comm )
|
||||
|
||||
IF( ibgrp_i > 0 ) THEN
|
||||
DO ia = 1, nat
|
||||
is = ityp(ia)
|
||||
IF( upf(is)%tvanp ) THEN
|
||||
inl=indv_ijkb0(ia)
|
||||
CALL mp_sum( bec_bgrp( inl + 1: inl + nh(is), ibgrp_i ), intra_bgrp_comm )
|
||||
END IF
|
||||
END DO
|
||||
END IF
|
||||
|
||||
bec_tmp = 0.0d0
|
||||
IF( ibgrp_i > 0 ) bec_tmp = bec_bgrp(:,ibgrp_i )
|
||||
|
||||
CALL mp_sum( bec_tmp, inter_bgrp_comm )
|
||||
!
|
||||
|
@ -254,62 +233,46 @@ CONTAINS
|
|||
!
|
||||
csc2 = 0.0d0
|
||||
|
||||
!$omp parallel default(none), &
|
||||
!$omp shared(iupdwn,iss,kmax,nproc_bgrp,me_bgrp,ispin,i,ibgrp_g2l,nh), &
|
||||
!$omp parallel if( (kmax - iupdwn( iss )) > omp_get_num_threads() ) default(none), &
|
||||
!$omp shared(iupdwn,iss,kmax,nproc_bgrp,me_bgrp,nbspx,i,ibgrp_g2l,nh), &
|
||||
!$omp shared(indv_ijkb0,qq_nt,na,bec_tmp,bec_bgrp,csc2,nat,ityp,upf), &
|
||||
!$omp private( k, is, iv, jv, ia, inl, jnl, rsum, ibgrp_k, ntids, mytid )
|
||||
#if defined(_OPENMP)
|
||||
mytid = omp_get_thread_num() ! take the thread ID
|
||||
ntids = omp_get_num_threads() ! take the number of threads
|
||||
#endif
|
||||
|
||||
DO k=iupdwn(iss), kmax
|
||||
IF ( MOD( k, nproc_bgrp ) /= me_bgrp ) CYCLE
|
||||
#if defined(_OPENMP)
|
||||
! distribute bands round robin to threads
|
||||
IF( MOD( k / nproc_bgrp, ntids ) /= mytid ) CYCLE
|
||||
#endif
|
||||
IF (ispin(i).EQ.ispin(k)) THEN
|
||||
rsum=0.d0
|
||||
ibgrp_k = ibgrp_g2l( k )
|
||||
IF( ibgrp_k > 0 ) THEN
|
||||
DO ia = 1, nat
|
||||
!$omp private( k, is, iv, jv, ia, inl, jnl, rsum, ibgrp_k )
|
||||
!$omp do
|
||||
DO k = iupdwn( iss ), kmax
|
||||
rsum=0.d0
|
||||
ibgrp_k = ibgrp_g2l( k )
|
||||
IF( ibgrp_k > 0 ) THEN
|
||||
DO ia = 1, nat
|
||||
IF ( MOD( ia-1, nproc_bgrp ) == me_bgrp ) THEN
|
||||
is=ityp(ia)
|
||||
IF( upf(is)%tvanp ) THEN
|
||||
inl = indv_ijkb0(ia)
|
||||
DO iv=1,nh(is)
|
||||
inl=indv_ijkb0(ia)+iv
|
||||
DO jv=1,nh(is)
|
||||
jnl=indv_ijkb0(ia)+jv
|
||||
IF(ABS(qq_nt(iv,jv,is)).GT.1.e-5) THEN
|
||||
rsum = rsum + qq_nt(iv,jv,is)*bec_tmp(inl)*bec_bgrp(jnl,ibgrp_k)
|
||||
rsum = rsum + qq_nt(iv,jv,is)*bec_tmp(inl+iv)*bec_bgrp(inl+jv,ibgrp_k)
|
||||
ENDIF
|
||||
END DO
|
||||
END DO
|
||||
END IF
|
||||
END DO
|
||||
END IF
|
||||
csc2(k)=csc2(k)+rsum
|
||||
END IF
|
||||
END DO
|
||||
ENDIF
|
||||
csc2(k)=csc2(k)+rsum
|
||||
END DO
|
||||
!$omp end do
|
||||
!$omp end parallel
|
||||
!
|
||||
! orthogonalized cp(i) : |cp(i)>=|cp(i)>-\sum_k<i csc(k)|cp(k)>
|
||||
!
|
||||
! corresponing bec: bec(i)=<cp(i)|beta>-csc(k)<cp(k)|beta>
|
||||
!
|
||||
CALL mp_sum( csc, intra_bgrp_comm )
|
||||
CALL mp_sum( csc2, intra_bgrp_comm )
|
||||
CALL mp_sum( csc, inter_bgrp_comm )
|
||||
CALL mp_sum( csc2, inter_bgrp_comm )
|
||||
csc = csc + csc2
|
||||
|
||||
bec_tmp = 0.0d0
|
||||
DO k = iupdwn(iss), kmax
|
||||
ibgrp_k = ibgrp_g2l( k )
|
||||
IF( ibgrp_k > 0 ) THEN
|
||||
DO inl=1,nkbx
|
||||
bec_tmp(inl)=bec_tmp(inl)-csc(k)*bec_bgrp(inl,ibgrp_k)
|
||||
END DO
|
||||
END IF
|
||||
END DO
|
||||
nk = 0
|
||||
DO k = iupdwn(iss), kmax
|
||||
ibgrp_k = ibgrp_g2l( k )
|
||||
|
@ -318,12 +281,15 @@ CONTAINS
|
|||
csc( nk ) = csc( k )
|
||||
END IF
|
||||
END DO
|
||||
|
||||
IF( nk > 0 .AND. ngw > 0 ) THEN
|
||||
CALL dgemv( 'N', nkbx, nk, -1.0d0, bec_bgrp(1,iupdwn_bgrp(iss)), nkbx, csc, 1, 0.0d0, bec_tmp, 1 )
|
||||
ELSE
|
||||
bec_tmp = 0.0d0
|
||||
END IF
|
||||
|
||||
CALL mp_sum( bec_tmp, inter_bgrp_comm )
|
||||
IF( ibgrp_i > 0 ) bec_bgrp(:,ibgrp_i ) = bec_bgrp(:,ibgrp_i ) + bec_tmp
|
||||
|
||||
DEALLOCATE( csc2 )
|
||||
DEALLOCATE( bec_tmp )
|
||||
DEALLOCATE( cp_tmp )
|
||||
!
|
||||
RETURN
|
||||
END SUBROUTINE gracsc_bgrp
|
||||
|
|
|
@ -31,6 +31,7 @@
|
|||
USE gvect, ONLY : mill_g, eigts1,eigts2,eigts3, g, gg, &
|
||||
ecutrho, gcutm, gvect_init, mill, &
|
||||
ig_l2g, gstart, ngm, ngm_g, gshells
|
||||
USE gvect_gpum, ONLY : using_g, using_g_d
|
||||
use gvecs, only : gcutms, gvecs_init, ngms
|
||||
use gvecw, only : gkcut, gvecw_init, g2kin_init
|
||||
USE smallbox_subs, ONLY : ggenb
|
||||
|
@ -195,6 +196,11 @@
|
|||
!
|
||||
END IF
|
||||
|
||||
CALL using_g(2)
|
||||
#if defined (__CUDA)
|
||||
CALL using_g_d(0)
|
||||
#endif
|
||||
|
||||
CALL gshells (.TRUE.)
|
||||
!
|
||||
! ... allocate and generate (modified) kinetic energy
|
||||
|
@ -401,6 +407,7 @@
|
|||
cell_base_reinit
|
||||
USE gvecw, ONLY : g2kin_init
|
||||
USE gvect, ONLY : g, gg, ngm, mill
|
||||
USE gvect_gpum, ONLY : using_g, using_g_d
|
||||
USE fft_base, ONLY : dfftp, dfftb
|
||||
USE small_box, ONLY : small_box_set
|
||||
USE smallbox_subs, ONLY : gcalb
|
||||
|
@ -425,10 +432,15 @@
|
|||
!
|
||||
! re-calculate G-vectors and kinetic energy
|
||||
!
|
||||
CALL using_g(2)
|
||||
!
|
||||
do ig = 1, dfftp%ngm
|
||||
g(:,ig)= mill(1,ig)*bg(:,1) + mill(2,ig)*bg(:,2) + mill(3,ig)*bg(:,3)
|
||||
gg(ig)=g(1,ig)**2 + g(2,ig)**2 + g(3,ig)**2
|
||||
enddo
|
||||
#if defined (__CUDA)
|
||||
CALL using_g_d(0)
|
||||
#endif
|
||||
!
|
||||
call g2kin_init ( gg, tpiba2 )
|
||||
!
|
||||
|
|
|
@ -27,8 +27,9 @@ SUBROUTINE init_run()
|
|||
USE fft_base, ONLY : dfftp, dffts
|
||||
USE electrons_base, ONLY : nspin, nbsp, nbspx, nupdwn, f
|
||||
USE uspp, ONLY : nkb, vkb, deeq, becsum,nkbus
|
||||
USE uspp_gpum, ONLY : vkb_d
|
||||
USE core, ONLY : rhoc
|
||||
USE wavefunctions, ONLY : c0_bgrp, cm_bgrp, phi_bgrp
|
||||
USE wavefunctions, ONLY : c0_bgrp, cm_bgrp, allocate_cp_wavefunctions
|
||||
USE ensemble_dft, ONLY : tens, z0t
|
||||
USE cg_module, ONLY : tcg
|
||||
USE electrons_base, ONLY : nudx
|
||||
|
@ -80,6 +81,9 @@ SUBROUTINE init_run()
|
|||
USE cell_base, ONLY : ref_tpiba2, init_tpiba2
|
||||
USE tsvdw_module, ONLY : tsvdw_initialize
|
||||
USE exx_module, ONLY : exx_initialize
|
||||
#if defined (__CUDA)
|
||||
USE cudafor
|
||||
#endif
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
|
@ -171,10 +175,7 @@ SUBROUTINE init_run()
|
|||
!
|
||||
! initialize wave functions descriptors and allocate wf
|
||||
!
|
||||
IF(lwfpbe0nscf) ALLOCATE(cv0( ngw, vnbsp ) ) ! Lingzhu Kong
|
||||
ALLOCATE( c0_bgrp( ngw, nbspx ) )
|
||||
ALLOCATE( cm_bgrp( ngw, nbspx ) )
|
||||
ALLOCATE( phi_bgrp( ngw, nbspx ) )
|
||||
CALL allocate_cp_wavefunctions( ngw, nbspx, vnbsp, lwfpbe0nscf )
|
||||
!
|
||||
IF ( iverbosity > 1 ) THEN
|
||||
!
|
||||
|
@ -203,6 +204,9 @@ SUBROUTINE init_run()
|
|||
ALLOCATE( deeq( nhm, nhm, nat, nspin ) )
|
||||
!
|
||||
ALLOCATE( vkb( ngw, nkb ) )
|
||||
#if defined(_CUDA)
|
||||
ALLOCATE( vkb_d( ngw, nkb ) )
|
||||
#endif
|
||||
!
|
||||
IF ( xclib_dft_is('meta') .AND. tens ) &
|
||||
CALL errore( ' init_run ', 'ensemble_dft not implemented for metaGGA', 1 )
|
||||
|
@ -264,11 +268,6 @@ SUBROUTINE init_run()
|
|||
!
|
||||
hnew = h
|
||||
!
|
||||
IF(lwfpbe0nscf) cv0=( 0.D0, 0.D0 ) ! Lingzhu Kong
|
||||
cm_bgrp = ( 0.D0, 0.D0 )
|
||||
c0_bgrp = ( 0.D0, 0.D0 )
|
||||
phi_bgrp = ( 0.D0, 0.D0 )
|
||||
!
|
||||
IF ( tens ) then
|
||||
CALL id_matrix_init( idesc, nspin )
|
||||
CALL h_matrix_init( idesc, nspin )
|
||||
|
|
|
@ -7,7 +7,7 @@
|
|||
!
|
||||
!
|
||||
!====================================================================
|
||||
SUBROUTINE inner_loop_cold( nfi, tfirst, tlast, eigr, irb, eigrb, &
|
||||
SUBROUTINE inner_loop_cold( nfi, tfirst, tlast, eigr, irb, eigrb, &
|
||||
rhor, rhog, rhos, rhoc, ei1, ei2, ei3, &
|
||||
sfac, c0, bec, dbec, firstiter, vpot )
|
||||
!====================================================================
|
||||
|
@ -109,7 +109,7 @@
|
|||
|
||||
! calculates the overlaps bec between the wavefunctions c0
|
||||
! and the beta functions
|
||||
CALL calbec( 1, nsp, eigr, c0, bec )
|
||||
CALL calbec( n, betae, c0, bec )
|
||||
|
||||
! rotates the wavefunctions c0 and the overlaps bec
|
||||
! (the occupation matrix f_ij becomes diagonal f_i)
|
||||
|
@ -119,7 +119,7 @@
|
|||
! calculates the electronic charge density
|
||||
CALL rhoofr( nfi, c0diag, irb, eigrb, becdiag, dbec, rhovan, &
|
||||
rhor, drhor, rhog, drhog, rhos, enl, denl, ekin, dekin6 )
|
||||
IF(nlcc_any) CALL set_cc( irb, eigrb, rhoc )
|
||||
IF(nlcc_any) CALL set_cc( rhoc )
|
||||
|
||||
! calculates the SCF potential, the total energy
|
||||
! and the ionic forces
|
||||
|
@ -141,7 +141,7 @@
|
|||
! the augmentation charges and the
|
||||
! corresponding contribution to the ionic force
|
||||
|
||||
CALL newd( vpot, irb, eigrb, rhovan, fion2 )
|
||||
CALL newd( vpot, rhovan, fion2, .true. )
|
||||
|
||||
! operates the Hamiltonian on the wavefunction c0
|
||||
h0c0( :, : )= 0.D0
|
||||
|
@ -260,7 +260,7 @@
|
|||
! calculates the electronic charge density
|
||||
CALL rhoofr( nfi, c0diag, irb, eigrb, becdiag, dbec, rhovan, &
|
||||
rhor, drhor, rhog, drhog, rhos, enl, denl, ekin, dekin6 )
|
||||
IF(nlcc_any) CALL set_cc( irb, eigrb, rhoc )
|
||||
IF(nlcc_any) CALL set_cc( rhoc )
|
||||
|
||||
! calculates the SCF potential, the total energy
|
||||
! and the ionic forces
|
||||
|
@ -409,7 +409,7 @@
|
|||
! calculates the electronic charge density
|
||||
CALL rhoofr( nfi, c0diag, irb, eigrb, becdiag, dbec, rhovan, &
|
||||
rhor, drhor, rhog, drhog, rhos, enl, denl, ekin, dekin6 )
|
||||
IF(nlcc_any) CALL set_cc( irb, eigrb, rhoc )
|
||||
IF(nlcc_any) CALL set_cc( rhoc )
|
||||
|
||||
! calculates the SCF potential, the total energy
|
||||
! and the ionic forces
|
||||
|
|
|
@ -119,10 +119,11 @@
|
|||
USE step_penalty, ONLY: penalty_e, penalty_f
|
||||
USE mp_pools, ONLY: intra_pool_comm, me_pool, nproc_pool
|
||||
USE mp_bands, only: nbgrp
|
||||
USE cp_interfaces, only: nlsm1, nlsm2_bgrp
|
||||
USE cp_interfaces, only: calbec, nlsm2_bgrp
|
||||
!
|
||||
implicit none
|
||||
complex(DP), intent(in) :: c(ngw,nx), eigr(ngw,nat), betae(ngw,nkb)
|
||||
complex(DP), intent(in) :: c(ngw,nx), eigr(ngw,nat)
|
||||
complex(DP), intent(inout) :: betae(ngw,nkb)
|
||||
complex(DP), intent(out) :: hpsi(ngw,nx)
|
||||
real(DP), INTENT(OUT) :: forceh(3,nat)
|
||||
!
|
||||
|
@ -243,10 +244,10 @@
|
|||
allocate(dns(ldmx,ldmx,nspin,nat))
|
||||
allocate (spsi(ngw,n))
|
||||
!
|
||||
call nlsm1 ( n, 1, nsp, eigr, c, bp )
|
||||
call calbec ( n, betae, c, bp )
|
||||
call s_wfc ( n, bp, betae, c, spsi )
|
||||
call nlsm2_bgrp( ngw, nkb, eigr, c, dbp, nx, n )
|
||||
call nlsm2_bgrp( ngw, nkb, eigr, wfcU, wdb, nwfcU, nwfcU )
|
||||
call nlsm2_bgrp( ngw, nkb, betae, c, dbp, nx, n )
|
||||
call nlsm2_bgrp( ngw, nkb, betae, wfcU, wdb, nwfcU, nwfcU )
|
||||
!
|
||||
! poor-man parallelization over bands
|
||||
! - if nproc_pool=1 : nb_s=1, nb_e=n, mykey=0
|
||||
|
@ -620,12 +621,13 @@
|
|||
USE gvect, ONLY: gstart
|
||||
USE ions_base, ONLY: nsp, nat
|
||||
USE uspp, ONLY: nkb
|
||||
USE cp_interfaces, only: nlsm1
|
||||
USE cp_interfaces, only: calbec
|
||||
!
|
||||
IMPLICIT NONE
|
||||
INTEGER, INTENT(IN) :: nx, n, nwfcU, offset(nat), &
|
||||
Hubbard_l(nsp)
|
||||
COMPLEX(DP), INTENT(IN) :: c( ngw, nx ), eigr(ngw,nat), betae(ngw,nkb)
|
||||
COMPLEX(DP), INTENT(IN) :: c( ngw, nx ), eigr(ngw,nat)
|
||||
COMPLEX(DP), INTENT(INOUT) :: betae(ngw,nkb)
|
||||
!
|
||||
COMPLEX(DP), INTENT(OUT):: wfcU(ngw, nwfcU), &
|
||||
& swfc(ngw, nwfcU)
|
||||
|
@ -641,7 +643,7 @@
|
|||
!
|
||||
! calculate bec = <beta|wfc>
|
||||
!
|
||||
CALL nlsm1( nwfcU, 1, nsp, eigr, wfcU, becwfc )
|
||||
CALL calbec( nwfcU, betae, wfcU, becwfc )
|
||||
!
|
||||
! calculate swfc = S|wfc>
|
||||
!
|
||||
|
|
|
@ -42,6 +42,8 @@ MODULE cp_main_variables
|
|||
REAL(DP), ALLOCATABLE :: taub(:,:)
|
||||
COMPLEX(DP), ALLOCATABLE :: eigrb(:,:)
|
||||
INTEGER, ALLOCATABLE :: irb(:,:)
|
||||
INTEGER, ALLOCATABLE :: iabox(:)
|
||||
INTEGER :: nabox
|
||||
!
|
||||
! ... nonlocal projectors:
|
||||
! ... bec = scalar product of projectors and wave functions
|
||||
|
@ -53,8 +55,12 @@ MODULE cp_main_variables
|
|||
REAL(DP), ALLOCATABLE :: bephi(:,:) ! distributed (orhto group)
|
||||
REAL(DP), ALLOCATABLE :: becp_bgrp(:,:) ! distributed becp (band group)
|
||||
REAL(DP), ALLOCATABLE :: bec_bgrp(:,:) ! distributed bec (band group)
|
||||
REAL(DP), ALLOCATABLE :: bec_d(:,:) ! distributed bec (band group)
|
||||
REAL(DP), ALLOCATABLE :: becdr_bgrp(:,:,:) ! distributed becdr (band group)
|
||||
REAL(DP), ALLOCATABLE :: dbec(:,:,:,:) ! derivative of bec distributed(ortho group)
|
||||
#if defined (__CUDA)
|
||||
ATTRIBUTES( DEVICE ) :: becp_bgrp, bephi, bec_d
|
||||
#endif
|
||||
!
|
||||
! ... mass preconditioning
|
||||
!
|
||||
|
@ -148,6 +154,10 @@ MODULE cp_main_variables
|
|||
ALLOCATE( irb( 3, nat ), STAT=ierr )
|
||||
IF( ierr /= 0 ) &
|
||||
CALL errore( ' allocate_mainvar ', ' unable to allocate irb ', ierr )
|
||||
ALLOCATE( iabox( nat ), STAT=ierr )
|
||||
IF( ierr /= 0 ) &
|
||||
CALL errore( ' allocate_mainvar ', ' unable to allocate iabox ', ierr )
|
||||
nabox = 0
|
||||
!
|
||||
IF ( xclib_dft_is('meta') ) THEN
|
||||
!
|
||||
|
@ -249,6 +259,11 @@ MODULE cp_main_variables
|
|||
ALLOCATE( bec_bgrp( nhsa, nbspx_bgrp ), STAT=ierr )
|
||||
IF( ierr /= 0 ) &
|
||||
CALL errore( ' allocate_mainvar ', ' unable to allocate bec_bgrp ', ierr )
|
||||
#if defined (__CUDA)
|
||||
ALLOCATE( bec_d( nhsa, nbspx_bgrp ), STAT=ierr )
|
||||
IF( ierr /= 0 ) &
|
||||
CALL errore( ' allocate_mainvar ', ' unable to allocate bec_d ', ierr )
|
||||
#endif
|
||||
ALLOCATE( bephi( nhsa, nspin*nrcx ), STAT=ierr )
|
||||
IF( ierr /= 0 ) &
|
||||
CALL errore( ' allocate_mainvar ', ' unable to allocate becphi ', ierr )
|
||||
|
@ -281,12 +296,14 @@ MODULE cp_main_variables
|
|||
IF( ALLOCATED( sfac ) ) DEALLOCATE( sfac )
|
||||
IF( ALLOCATED( eigrb ) ) DEALLOCATE( eigrb )
|
||||
IF( ALLOCATED( irb ) ) DEALLOCATE( irb )
|
||||
IF( ALLOCATED( iabox ) ) DEALLOCATE( iabox )
|
||||
IF( ALLOCATED( rhor ) ) DEALLOCATE( rhor )
|
||||
IF( ALLOCATED( rhos ) ) DEALLOCATE( rhos )
|
||||
IF( ALLOCATED( rhog ) ) DEALLOCATE( rhog )
|
||||
IF( ALLOCATED( drhog ) ) DEALLOCATE( drhog )
|
||||
IF( ALLOCATED( drhor ) ) DEALLOCATE( drhor )
|
||||
IF( ALLOCATED( bec_bgrp ) ) DEALLOCATE( bec_bgrp )
|
||||
IF( ALLOCATED( bec_d ) ) DEALLOCATE( bec_d )
|
||||
IF( ALLOCATED( becdr_bgrp ) ) DEALLOCATE( becdr_bgrp )
|
||||
IF( ALLOCATED( bephi ) ) DEALLOCATE( bephi )
|
||||
IF( ALLOCATED( becp_bgrp ) ) DEALLOCATE( becp_bgrp )
|
||||
|
|
|
@ -7,8 +7,8 @@
|
|||
!
|
||||
!
|
||||
!----------------------------------------------------------------------------
|
||||
SUBROUTINE move_electrons_x( nfi, tfirst, tlast, b1, b2, b3, fion, c0_bgrp, &
|
||||
cm_bgrp, phi_bgrp, enthal, enb, enbi, fccc, ccc, dt2bye, stress, l_cprestart )
|
||||
SUBROUTINE move_electrons_x( nfi, tprint, tfirst, tlast, b1, b2, b3, fion, &
|
||||
enthal, enb, enbi, fccc, ccc, dt2bye, stress, l_cprestart )
|
||||
!----------------------------------------------------------------------------
|
||||
!
|
||||
! ... this routine updates the electronic degrees of freedom
|
||||
|
@ -17,10 +17,11 @@ SUBROUTINE move_electrons_x( nfi, tfirst, tlast, b1, b2, b3, fion, c0_bgrp, &
|
|||
USE control_flags, ONLY : lwf, tfor, tprnfor, thdyn
|
||||
USE cg_module, ONLY : tcg
|
||||
USE cp_main_variables, ONLY : eigr, irb, eigrb, rhog, rhos, rhor, drhor, &
|
||||
drhog, sfac, ema0bg, bec_bgrp, becdr_bgrp, &
|
||||
drhog, sfac, ema0bg, bec_bgrp, becdr_bgrp, &
|
||||
taub, lambda, lambdam, lambdap, vpot, dbec, idesc
|
||||
USE cell_base, ONLY : omega, ibrav, h, press
|
||||
USE uspp, ONLY : becsum, vkb, nkb, nlcc_any
|
||||
USE uspp_gpum, ONLY : vkb_d
|
||||
USE energies, ONLY : ekin, enl, entropy, etot
|
||||
USE electrons_base, ONLY : nbsp, nspin, f, nudx, nupdwn, nbspx_bgrp, nbsp_bgrp
|
||||
USE core, ONLY : rhoc
|
||||
|
@ -43,16 +44,16 @@ SUBROUTINE move_electrons_x( nfi, tfirst, tlast, b1, b2, b3, fion, c0_bgrp, &
|
|||
USE electrons_module, ONLY : distribute_c, collect_c, distribute_b
|
||||
USE gvect, ONLY : eigts1, eigts2, eigts3
|
||||
USE control_flags, ONLY : lwfpbe0nscf ! exx_wf related
|
||||
USE wavefunctions, ONLY : cv0 ! Lingzhu Kong
|
||||
USE wavefunctions, ONLY : cv0, c0_bgrp, cm_bgrp, phi, c0_d, cm_d
|
||||
USE xc_lib, ONLY : xclib_dft_is, exx_is_active
|
||||
USE device_memcpy_m, ONLY : dev_memcpy
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
INTEGER, INTENT(IN) :: nfi
|
||||
LOGICAL, INTENT(IN) :: tfirst, tlast
|
||||
LOGICAL, INTENT(IN) :: tprint, tfirst, tlast
|
||||
REAL(DP), INTENT(IN) :: b1(3), b2(3), b3(3)
|
||||
REAL(DP) :: fion(:,:)
|
||||
COMPLEX(DP) :: c0_bgrp(:,:), cm_bgrp(:,:), phi_bgrp(:,:)
|
||||
REAL(DP), INTENT(IN) :: dt2bye
|
||||
REAL(DP) :: fccc, ccc
|
||||
REAL(DP) :: enb, enbi
|
||||
|
@ -67,10 +68,14 @@ SUBROUTINE move_electrons_x( nfi, tfirst, tlast, b1, b2, b3, fion, c0_bgrp, &
|
|||
CALL start_clock('move_electrons')
|
||||
electron_dynamic: IF ( tcg ) THEN
|
||||
!
|
||||
#if defined (__CUDA)
|
||||
CALL errore(' move_electrons ', ' GPU version of runcg not yet implemented ', 1 )
|
||||
#else
|
||||
CALL runcg_uspp( nfi, tfirst, tlast, eigr, bec_bgrp, irb, eigrb, &
|
||||
rhor, rhog, rhos, rhoc, eigts1, eigts2, eigts3, sfac, &
|
||||
fion, ema0bg, becdr_bgrp, lambdap, lambda, SIZE(lambda,1), vpot, c0_bgrp, &
|
||||
cm_bgrp, phi_bgrp, dbec, l_cprestart )
|
||||
cm_bgrp, phi, dbec, l_cprestart )
|
||||
#endif
|
||||
!
|
||||
CALL compute_stress( stress, detot, h, omega )
|
||||
!
|
||||
|
@ -80,7 +85,7 @@ SUBROUTINE move_electrons_x( nfi, tfirst, tlast, b1, b2, b3, fion, c0_bgrp, &
|
|||
CALL get_wannier_center( tfirst, cm_bgrp, bec_bgrp, eigr, &
|
||||
eigrb, taub, irb, ibrav, b1, b2, b3 )
|
||||
!
|
||||
CALL rhoofr( nfi, c0_bgrp, irb, eigrb, bec_bgrp, dbec, becsum, rhor, &
|
||||
CALL rhoofr( nfi, c0_bgrp, c0_d, bec_bgrp, dbec, becsum, rhor, &
|
||||
drhor, rhog, drhog, rhos, enl, denl, ekin, dekin6 )
|
||||
!
|
||||
!=================================================================
|
||||
|
@ -105,7 +110,7 @@ SUBROUTINE move_electrons_x( nfi, tfirst, tlast, b1, b2, b3, fion, c0_bgrp, &
|
|||
!=================================================================
|
||||
! ... put core charge (if present) in rhoc(r)
|
||||
!
|
||||
IF ( nlcc_any ) CALL set_cc( irb, eigrb, rhoc )
|
||||
IF ( nlcc_any ) CALL set_cc( rhoc )
|
||||
!
|
||||
IF ( lwf ) THEN
|
||||
!
|
||||
|
@ -154,9 +159,10 @@ SUBROUTINE move_electrons_x( nfi, tfirst, tlast, b1, b2, b3, fion, c0_bgrp, &
|
|||
!
|
||||
!=======================================================================
|
||||
!
|
||||
CALL newd( vpot, irb, eigrb, becsum, fion )
|
||||
CALL newd( vpot, becsum, fion, tprint )
|
||||
!
|
||||
CALL prefor( eigr, vkb )
|
||||
CALL dev_memcpy( vkb_d, vkb )
|
||||
!
|
||||
IF( force_pairing ) THEN
|
||||
!
|
||||
|
@ -165,23 +171,29 @@ SUBROUTINE move_electrons_x( nfi, tfirst, tlast, b1, b2, b3, fion, c0_bgrp, &
|
|||
!
|
||||
ELSE
|
||||
!
|
||||
CALL runcp_uspp( nfi, fccc, ccc, ema0bg, dt2bye, rhos, bec_bgrp, c0_bgrp, cm_bgrp )
|
||||
CALL runcp_uspp( nfi, fccc, ccc, ema0bg, dt2bye, rhos, bec_bgrp, c0_bgrp, c0_d, cm_bgrp, cm_d )
|
||||
!
|
||||
ENDIF
|
||||
!
|
||||
CALL dev_memcpy( cm_d, cm_bgrp ) ! cm contains the updated wavefunctions
|
||||
!
|
||||
!----------------------------------------------------------------------
|
||||
! contribution to fion due to lambda
|
||||
!----------------------------------------------------------------------
|
||||
!
|
||||
! ... nlfq needs deeq bec
|
||||
!
|
||||
IF ( tfor .OR. tprnfor ) THEN
|
||||
CALL nlfq_bgrp( c0_bgrp, eigr, bec_bgrp, becdr_bgrp, fion )
|
||||
IF ( tfor .OR. ( tprnfor .AND. tprint ) ) THEN
|
||||
#if defined (__CUDA)
|
||||
CALL nlfq_bgrp( c0_d, vkb_d, bec_bgrp, becdr_bgrp, fion )
|
||||
#else
|
||||
CALL nlfq_bgrp( c0_bgrp, vkb, bec_bgrp, becdr_bgrp, fion )
|
||||
#endif
|
||||
END IF
|
||||
!
|
||||
IF ( (tfor.or.tprnfor) .AND. tefield ) &
|
||||
IF ( (tfor.or.(tprnfor.AND.tprint)) .AND. tefield ) &
|
||||
CALL bforceion( fion, .TRUE. , ipolp, qmat, bec_bgrp, becdr_bgrp, gqq, evalue )
|
||||
IF ( (tfor.or.tprnfor) .AND. tefield2 ) &
|
||||
IF ( (tfor.or.(tprnfor.AND.tprint)) .AND. tefield2 ) &
|
||||
CALL bforceion( fion, .TRUE. , ipolp2, qmat2, bec_bgrp, becdr_bgrp, gqq2, evalue2 )
|
||||
!
|
||||
IF( force_pairing ) THEN
|
||||
|
@ -199,13 +211,17 @@ SUBROUTINE move_electrons_x( nfi, tfirst, tlast, b1, b2, b3, fion, c0_bgrp, &
|
|||
! ... calphi calculates phi
|
||||
! ... the electron mass rises with g**2
|
||||
!
|
||||
CALL calphi_bgrp( c0_bgrp, ngw, bec_bgrp, nkb, vkb, phi_bgrp, nbspx_bgrp, ema0bg )
|
||||
#if defined (__CUDA)
|
||||
CALL calphi_bgrp( c0_d, ngw, bec_bgrp, nkb, vkb_d, phi, nbspx_bgrp, ema0bg )
|
||||
#else
|
||||
CALL calphi_bgrp( c0_bgrp, ngw, bec_bgrp, nkb, vkb, phi, nbspx_bgrp, ema0bg )
|
||||
#endif
|
||||
!
|
||||
! ... begin try and error loop (only one step!)
|
||||
!
|
||||
! ... nlfl and nlfh need: lambda (guessed) becdr
|
||||
!
|
||||
IF ( tfor .OR. tprnfor ) THEN
|
||||
IF ( tfor .OR. (tprnfor .AND. tprint) ) THEN
|
||||
CALL nlfl_bgrp( bec_bgrp, becdr_bgrp, lambda, idesc, fion )
|
||||
END IF
|
||||
!
|
||||
|
|
374
CPV/src/newd.f90
374
CPV/src/newd.f90
|
@ -6,7 +6,7 @@
|
|||
! or http://www.gnu.org/copyleft/gpl.txt .
|
||||
!
|
||||
!-------------------------------------------------------------------------
|
||||
SUBROUTINE newd(vr,irb,eigrb,rhovan,fion)
|
||||
SUBROUTINE newd( vr, rhovan, fion, tprint )
|
||||
!-----------------------------------------------------------------------
|
||||
!
|
||||
! this routine calculates array deeq:
|
||||
|
@ -32,28 +32,28 @@
|
|||
my_bgrp_id, nbgrp
|
||||
USE fft_interfaces, ONLY: invfft
|
||||
USE fft_base, ONLY: dfftb, dfftp
|
||||
USE cp_main_variables,ONLY: irb, eigrb, iabox, nabox
|
||||
!
|
||||
IMPLICIT NONE
|
||||
! input
|
||||
INTEGER irb(3,nat)
|
||||
REAL(DP) rhovan(nhm*(nhm+1)/2,nat,nspin)
|
||||
COMPLEX(DP) eigrb(ngb,nat)
|
||||
REAL(DP), INTENT(IN) :: rhovan(nhm*(nhm+1)/2,nat,nspin)
|
||||
REAL(DP) vr(dfftp%nnr,nspin)
|
||||
LOGICAL, INTENT(IN) :: tprint
|
||||
! output
|
||||
REAL(DP) fion(3,nat)
|
||||
! local
|
||||
INTEGER isup,isdw,iss, iv,ijv,jv, ik, nfft, ia, is, ig
|
||||
REAL(DP) fac, fac1, fac2, res
|
||||
COMPLEX(DP) ci, facg1, facg2
|
||||
INTEGER :: isup,isdw,iss, iv,ijv,jv, ik, nfft, ia, iia, is, ig
|
||||
REAL(DP) :: fac, fac1, fac2, res
|
||||
COMPLEX(DP), PARAMETER :: ci = (0.d0,1.d0)
|
||||
COMPLEX(DP) :: facg1, facg2
|
||||
COMPLEX(DP), ALLOCATABLE :: qv(:), fg1(:), fg2(:)
|
||||
REAL(DP), ALLOCATABLE :: fvan(:,:)
|
||||
INTEGER :: na_bgrp, ia_bgrp
|
||||
|
||||
INTEGER :: mytid, ntids
|
||||
#if defined(_OPENMP)
|
||||
INTEGER :: itid, mytid, ntids, omp_get_thread_num, omp_get_num_threads
|
||||
INTEGER :: omp_get_thread_num, omp_get_num_threads
|
||||
EXTERNAL :: omp_get_thread_num, omp_get_num_threads
|
||||
#endif
|
||||
|
||||
!
|
||||
IF ( dfftb%nr1==0 .OR. dfftb%nr2==0 .OR. dfftb%nr3==0 ) THEN
|
||||
RETURN
|
||||
|
@ -62,263 +62,151 @@
|
|||
CALL start_clock( 'newd' )
|
||||
|
||||
ALLOCATE( fvan( 3, nat ) )
|
||||
ci=(0.d0,1.d0)
|
||||
fac=omegab/DBLE(dfftb%nr1*dfftb%nr2*dfftb%nr3)
|
||||
deeq (:,:,:,:) = 0.d0
|
||||
fvan (:,:) = 0.d0
|
||||
|
||||
fac=omegab/DBLE(dfftb%nr1*dfftb%nr2*dfftb%nr3)
|
||||
|
||||
!$omp parallel default(none) &
|
||||
!$omp shared(ngb, nh, qgb, eigrb, dfftb, irb, vr, ci, deeq, &
|
||||
!$omp fac, nspin, my_bgrp_id, nbgrp, ityp, upf, nat ) &
|
||||
!$omp private(mytid, ntids, is, ia, nfft, iv, jv, ijv, ig, qv, fg1, fg2, itid, res, iss )
|
||||
!$omp shared(ngb, nh, qgb, eigrb, dfftb, irb, vr, deeq, tfor, thdyn, tprnfor, tprint, nabox, &
|
||||
!$omp fac, nspin, my_bgrp_id, nbgrp, ityp, upf, nat, fvan, tpibab, gxb, rhovan, iabox ) &
|
||||
!$omp private(mytid, ntids, is, ia, iia, nfft, iv, jv, ijv, ig, qv, fg1, fg2, res, &
|
||||
!$omp iss, isup, isdw, fac2, facg1, fac1 )
|
||||
|
||||
#if defined(_OPENMP)
|
||||
mytid = omp_get_thread_num() ! take the thread ID
|
||||
ntids = omp_get_num_threads() ! take the number of threads
|
||||
itid = 0
|
||||
#else
|
||||
mytid = 0
|
||||
ntids = 1
|
||||
#endif
|
||||
|
||||
!$omp workshare
|
||||
deeq (:,:,:,:) = 0.d0
|
||||
fvan (:,:) = 0.d0
|
||||
!$omp end workshare
|
||||
|
||||
ALLOCATE( qv( dfftb%nnr ) )
|
||||
ALLOCATE( fg1( ngb ) )
|
||||
ALLOCATE( fg2( ngb ) )
|
||||
!
|
||||
! calculation of deeq_i,lm = \int V_eff(r) q_i,lm(r) dr
|
||||
!
|
||||
DO ia = 1, nat
|
||||
|
||||
is = ityp(ia)
|
||||
|
||||
IF( .NOT. upf(is)%tvanp ) &
|
||||
CYCLE
|
||||
|
||||
#if defined(__MPI)
|
||||
IF( ( dfftb%np3( ia ) <= 0 ) .OR. ( dfftb%np2( ia ) <= 0 ) .OR. ( my_bgrp_id /= MOD( ia, nbgrp ) ) ) THEN
|
||||
CYCLE
|
||||
!
|
||||
! calculation of deeq_i,lm = \int V_eff(r) q_i,lm(r) dr
|
||||
!
|
||||
DO iia = 1, nabox
|
||||
IF( MOD( iia - 1, ntids ) == mytid ) THEN
|
||||
ia = iabox( iia )
|
||||
is = ityp(ia)
|
||||
nfft = 1
|
||||
DO iv=1,nh(is)
|
||||
DO jv=iv,nh(is)
|
||||
ijv = (jv-1)*jv/2 + iv
|
||||
fg1 = eigrb(1:ngb,ia )*qgb(1:ngb,ijv,is)
|
||||
CALL fft_oned2box( qv, fg1 )
|
||||
CALL invfft( qv, dfftb, ia )
|
||||
DO iss=1,nspin
|
||||
res = boxdotgrid(irb(:,ia),1,qv,vr(:,iss))
|
||||
deeq(iv,jv,ia,iss) = fac * res
|
||||
IF (iv.NE.jv) &
|
||||
& deeq(jv,iv,ia,iss)=deeq(iv,jv,ia,iss)
|
||||
END DO
|
||||
END DO
|
||||
END DO
|
||||
END IF
|
||||
#endif
|
||||
|
||||
nfft = 1
|
||||
|
||||
#if defined(_OPENMP)
|
||||
IF ( mytid /= itid ) THEN
|
||||
itid = MOD( itid + 1, ntids )
|
||||
CYCLE
|
||||
ELSE
|
||||
itid = MOD( itid + 1, ntids )
|
||||
END IF
|
||||
#endif
|
||||
|
||||
DO iv=1,nh(is)
|
||||
DO jv=iv,nh(is)
|
||||
ijv = (jv-1)*jv/2 + iv
|
||||
IF (nfft.EQ.2) THEN
|
||||
fg1 = eigrb(1:ngb,ia )*qgb(1:ngb,ijv,is)
|
||||
fg2 = eigrb(1:ngb,ia+1)*qgb(1:ngb,ijv,is)
|
||||
CALL fft_oned2box( qv, fg1, fg2 )
|
||||
ELSE
|
||||
fg1 = eigrb(1:ngb,ia )*qgb(1:ngb,ijv,is)
|
||||
CALL fft_oned2box( qv, fg1 )
|
||||
END IF
|
||||
!
|
||||
CALL invfft( qv, dfftb, ia )
|
||||
!
|
||||
DO iss=1,nspin
|
||||
res = boxdotgrid(irb(:,ia),1,qv,vr(:,iss))
|
||||
deeq(iv,jv,ia,iss) = fac * res
|
||||
IF (iv.NE.jv) &
|
||||
& deeq(jv,iv,ia,iss)=deeq(iv,jv,ia,iss)
|
||||
IF (nfft.EQ.2) THEN
|
||||
res = boxdotgrid(irb(:,ia+1),2,qv,vr(:,iss))
|
||||
deeq(iv,jv,ia+1,iss) = fac * res
|
||||
IF (iv.NE.jv) &
|
||||
& deeq(jv,iv,ia+1,iss)=deeq(iv,jv,ia+1,iss)
|
||||
END IF
|
||||
END DO
|
||||
END DO
|
||||
END DO
|
||||
END DO
|
||||
|
||||
IF ( tfor .OR. thdyn .OR. (tprnfor.AND.tprint) ) THEN
|
||||
!
|
||||
! calculation of fion_i = \int V_eff(r) \sum_lm rho_lm (dq_i,lm(r)/dR_i) dr
|
||||
!
|
||||
IF( nspin == 1 ) THEN
|
||||
!
|
||||
! case nspin=1: two ffts at the same time, on two atoms (if possible)
|
||||
!
|
||||
iss=1
|
||||
nfft=1
|
||||
|
||||
DO iia = 1, nabox
|
||||
IF( MOD( iia - 1, ntids ) == mytid ) THEN
|
||||
ia = iabox( iia )
|
||||
is = ityp(ia)
|
||||
DO ik=1,3
|
||||
qv(:) = (0.d0, 0.d0)
|
||||
DO iv=1,nh(is)
|
||||
DO jv=iv,nh(is)
|
||||
ijv = (jv-1)*jv/2 + iv
|
||||
IF(iv.NE.jv) THEN
|
||||
fac1=2.d0*fac*tpibab*rhovan(ijv,ia,iss)
|
||||
ELSE
|
||||
fac1= fac*tpibab*rhovan(ijv,ia,iss)
|
||||
ENDIF
|
||||
DO ig=1,ngb
|
||||
facg1 = CMPLX(0.d0,-gxb(ik,ig),kind=DP) * qgb(ig,ijv,is)*fac1
|
||||
fg1(ig) = eigrb(ig,ia )*facg1
|
||||
END DO
|
||||
CALL fft_add_oned2box( qv, fg1 )
|
||||
END DO
|
||||
END DO
|
||||
CALL invfft( qv, dfftb, ia)
|
||||
res = boxdotgrid(irb(:,ia),1,qv,vr(:,iss))
|
||||
fvan(ik,ia) = res
|
||||
END DO
|
||||
END IF
|
||||
END DO
|
||||
|
||||
ELSE
|
||||
!
|
||||
! case nspin=2: up and down spin fft's combined into a single fft
|
||||
!
|
||||
isup=1
|
||||
isdw=2
|
||||
|
||||
DO iia = 1, nabox
|
||||
IF( MOD( iia - 1, ntids ) == mytid ) THEN
|
||||
ia = iabox( iia )
|
||||
is = ityp(ia)
|
||||
DO ik=1,3
|
||||
qv(:) = (0.d0, 0.d0)
|
||||
DO iv=1,nh(is)
|
||||
DO jv=iv,nh(is)
|
||||
ijv = (jv-1)*jv/2 + iv
|
||||
IF(iv.NE.jv) THEN
|
||||
fac1=2.d0*fac*tpibab*rhovan(ijv,ia,isup)
|
||||
fac2=2.d0*fac*tpibab*rhovan(ijv,ia,isdw)
|
||||
ELSE
|
||||
fac1= fac*tpibab*rhovan(ijv,ia,isup)
|
||||
fac2= fac*tpibab*rhovan(ijv,ia,isdw)
|
||||
END IF
|
||||
DO ig=1,ngb
|
||||
fg1(ig) = fac1 * CMPLX(0.d0,-gxb(ik,ig),kind=DP) * &
|
||||
& qgb(ig,ijv,is) * eigrb(ig,ia)
|
||||
fg2(ig) = fac2 * CMPLX(0.d0,-gxb(ik,ig),kind=DP) * &
|
||||
& qgb(ig,ijv,is) * eigrb(ig,ia)
|
||||
END DO
|
||||
CALL fft_add_oned2box( qv, fg1, fg2 )
|
||||
END DO
|
||||
END DO
|
||||
CALL invfft( qv, dfftb, ia)
|
||||
fvan(ik,ia) = &
|
||||
& boxdotgrid(irb(:,ia),isup,qv,vr(:,isup)) + &
|
||||
& boxdotgrid(irb(:,ia),isdw,qv,vr(:,isdw))
|
||||
END DO
|
||||
END IF
|
||||
END DO
|
||||
END IF
|
||||
END IF
|
||||
|
||||
DEALLOCATE( qv )
|
||||
DEALLOCATE( fg1 )
|
||||
DEALLOCATE( fg2 )
|
||||
|
||||
!$omp end parallel
|
||||
|
||||
CALL mp_sum( deeq, intra_bgrp_comm )
|
||||
CALL mp_sum( deeq, inter_bgrp_comm )
|
||||
|
||||
IF (.NOT.( tfor .OR. thdyn .OR. tprnfor ) ) go to 10
|
||||
!
|
||||
! calculation of fion_i = \int V_eff(r) \sum_lm rho_lm (dq_i,lm(r)/dR_i) dr
|
||||
!
|
||||
|
||||
IF( nspin == 1 ) THEN
|
||||
|
||||
! =================================================================
|
||||
! case nspin=1: two ffts at the same time, on two atoms (if possible)
|
||||
! -----------------------------------------------------------------
|
||||
|
||||
!$omp parallel default(none) &
|
||||
!$omp shared(nat, ngb, nh, qgb, eigrb, dfftb, irb, vr, ci, deeq, &
|
||||
!$omp fac, nspin, rhovan, tpibab, gxb, fvan, my_bgrp_id, nbgrp, ityp, upf ) &
|
||||
!$omp private(mytid, ntids, is, ia, ik, nfft, iv, jv, ijv, ig, qv, itid, res, iss, &
|
||||
!$omp fac1, fac2, facg1, facg2, fg1, fg2 )
|
||||
|
||||
|
||||
ALLOCATE( qv( dfftb%nnr ) )
|
||||
ALLOCATE( fg1( ngb ) )
|
||||
ALLOCATE( fg2( ngb ) )
|
||||
|
||||
iss=1
|
||||
|
||||
#if defined(_OPENMP)
|
||||
mytid = omp_get_thread_num() ! take the thread ID
|
||||
ntids = omp_get_num_threads() ! take the number of threads
|
||||
itid = 0
|
||||
#endif
|
||||
|
||||
DO ia = 1, nat
|
||||
|
||||
is = ityp(ia)
|
||||
|
||||
IF( .NOT. upf(is)%tvanp ) &
|
||||
CYCLE
|
||||
|
||||
#if defined(__MPI)
|
||||
IF ( ( dfftb%np3( ia ) <= 0 ) .OR. ( dfftb%np2( ia ) <= 0 ) .OR. ( my_bgrp_id /= MOD( ia, nbgrp ) ) ) THEN
|
||||
CYCLE
|
||||
END IF
|
||||
#endif
|
||||
nfft=1
|
||||
|
||||
#if defined(_OPENMP)
|
||||
IF ( mytid /= itid ) THEN
|
||||
itid = MOD( itid + 1, ntids )
|
||||
CYCLE
|
||||
ELSE
|
||||
itid = MOD( itid + 1, ntids )
|
||||
END IF
|
||||
#endif
|
||||
DO ik=1,3
|
||||
qv(:) = (0.d0, 0.d0)
|
||||
DO iv=1,nh(is)
|
||||
DO jv=iv,nh(is)
|
||||
ijv = (jv-1)*jv/2 + iv
|
||||
IF(iv.NE.jv) THEN
|
||||
fac1=2.d0*fac*tpibab*rhovan(ijv,ia,iss)
|
||||
IF (nfft.EQ.2) fac2=2.d0*fac*tpibab* &
|
||||
& rhovan(ijv,ia+1,iss)
|
||||
ELSE
|
||||
fac1= fac*tpibab*rhovan(ijv,ia,iss)
|
||||
IF (nfft.EQ.2) fac2= fac*tpibab* &
|
||||
& rhovan(ijv,ia+1,iss)
|
||||
ENDIF
|
||||
IF (nfft.EQ.2) THEN
|
||||
DO ig=1,ngb
|
||||
facg1 = CMPLX(0.d0,-gxb(ik,ig),kind=DP) * qgb(ig,ijv,is) * fac1
|
||||
facg2 = CMPLX(0.d0,-gxb(ik,ig),kind=DP) * qgb(ig,ijv,is) * fac2
|
||||
fg1(ig) = eigrb(ig,ia )*facg1
|
||||
fg2(ig) = eigrb(ig,ia+1)*facg2
|
||||
END DO
|
||||
CALL fft_add_oned2box( qv, fg1, fg2 )
|
||||
ELSE
|
||||
DO ig=1,ngb
|
||||
facg1 = CMPLX(0.d0,-gxb(ik,ig),kind=DP) * qgb(ig,ijv,is)*fac1
|
||||
fg1(ig) = eigrb(ig,ia )*facg1
|
||||
END DO
|
||||
CALL fft_add_oned2box( qv, fg1 )
|
||||
END IF
|
||||
END DO
|
||||
END DO
|
||||
!
|
||||
CALL invfft( qv, dfftb, ia)
|
||||
!
|
||||
res = boxdotgrid(irb(:,ia),1,qv,vr(:,iss))
|
||||
fvan(ik,ia) = res
|
||||
!
|
||||
IF (nfft.EQ.2) THEN
|
||||
res = boxdotgrid(irb(:,ia+1),2,qv,vr(:,iss))
|
||||
fvan(ik,ia+1) = res
|
||||
END IF
|
||||
END DO
|
||||
END DO
|
||||
|
||||
DEALLOCATE( qv )
|
||||
DEALLOCATE( fg1 )
|
||||
DEALLOCATE( fg2 )
|
||||
|
||||
!$omp end parallel
|
||||
|
||||
ELSE
|
||||
|
||||
! =================================================================
|
||||
! case nspin=2: up and down spin fft's combined into a single fft
|
||||
! -----------------------------------------------------------------
|
||||
|
||||
ALLOCATE( qv( dfftb%nnr ) )
|
||||
ALLOCATE( fg1( ngb ) )
|
||||
ALLOCATE( fg2( ngb ) )
|
||||
isup=1
|
||||
isdw=2
|
||||
|
||||
DO ia = 1, nat
|
||||
|
||||
is = ityp(ia)
|
||||
|
||||
IF( .NOT. upf(is)%tvanp ) &
|
||||
CYCLE
|
||||
|
||||
#if defined(__MPI)
|
||||
IF ( ( dfftb%np3( ia ) <= 0 ) .OR. ( dfftb%np2( ia ) <= 0 ) .OR. ( my_bgrp_id /= MOD( ia, nbgrp ) ) ) THEN
|
||||
CYCLE
|
||||
END IF
|
||||
#endif
|
||||
|
||||
DO ik=1,3
|
||||
qv(:) = (0.d0, 0.d0)
|
||||
!
|
||||
DO iv=1,nh(is)
|
||||
DO jv=iv,nh(is)
|
||||
ijv = (jv-1)*jv/2 + iv
|
||||
IF(iv.NE.jv) THEN
|
||||
fac1=2.d0*fac*tpibab*rhovan(ijv,ia,isup)
|
||||
fac2=2.d0*fac*tpibab*rhovan(ijv,ia,isdw)
|
||||
ELSE
|
||||
fac1= fac*tpibab*rhovan(ijv,ia,isup)
|
||||
fac2= fac*tpibab*rhovan(ijv,ia,isdw)
|
||||
END IF
|
||||
DO ig=1,ngb
|
||||
fg1(ig) = fac1 * CMPLX(0.d0,-gxb(ik,ig),kind=DP) * &
|
||||
& qgb(ig,ijv,is) * eigrb(ig,ia)
|
||||
fg2(ig) = fac2 * CMPLX(0.d0,-gxb(ik,ig),kind=DP) * &
|
||||
& qgb(ig,ijv,is) * eigrb(ig,ia)
|
||||
END DO
|
||||
CALL fft_add_oned2box( qv, fg1, fg2 )
|
||||
END DO
|
||||
END DO
|
||||
!
|
||||
CALL invfft( qv, dfftb, ia)
|
||||
!
|
||||
fvan(ik,ia) = &
|
||||
& boxdotgrid(irb(:,ia),isup,qv,vr(:,isup)) + &
|
||||
& boxdotgrid(irb(:,ia),isdw,qv,vr(:,isdw))
|
||||
END DO
|
||||
END DO
|
||||
|
||||
DEALLOCATE( qv )
|
||||
DEALLOCATE( fg1 )
|
||||
DEALLOCATE( fg2 )
|
||||
|
||||
IF ( tfor .OR. thdyn .OR. (tprnfor.AND.tprint) ) THEN
|
||||
CALL mp_sum( fvan, intra_bgrp_comm )
|
||||
CALL mp_sum( fvan, inter_bgrp_comm )
|
||||
fion(:,:) = fion(:,:) - fvan(:,:)
|
||||
END IF
|
||||
|
||||
CALL mp_sum( fvan, intra_bgrp_comm )
|
||||
CALL mp_sum( fvan, inter_bgrp_comm )
|
||||
|
||||
fion(:,:) = fion(:,:) - fvan(:,:)
|
||||
|
||||
10 CONTINUE
|
||||
CALL mp_sum( deeq, intra_bgrp_comm )
|
||||
CALL mp_sum( deeq, inter_bgrp_comm )
|
||||
!
|
||||
CALL stop_clock( 'newd' )
|
||||
!
|
||||
|
|
|
@ -5,108 +5,15 @@
|
|||
! in the root directory of the present distribution,
|
||||
! or http://www.gnu.org/copyleft/gpl.txt .
|
||||
!
|
||||
!
|
||||
!-----------------------------------------------------------------------
|
||||
SUBROUTINE beta_eigr_x ( beigr, nspmn, nspmx, eigr, pptype_ )
|
||||
!-----------------------------------------------------------------------
|
||||
|
||||
! computes: the array becp
|
||||
! beigr(ig,iv)=
|
||||
! = [(-i)**l beta(g,iv,is) e^(-ig.r_ia)]^*
|
||||
!
|
||||
! routine makes use of c*(g)=c(-g) (g> see routine ggen)
|
||||
! input : beta(ig,l,is), eigr, c
|
||||
! output: becp as parameter
|
||||
!
|
||||
USE kinds, ONLY : DP
|
||||
USE ions_base, only : nat, nsp, ityp
|
||||
USE gvecw, only : ngw
|
||||
USE uspp, only : nkb, nhtol, beta, indv_ijkb0
|
||||
USE uspp_param, only : nh, upf, nhm
|
||||
!
|
||||
USE gvect, ONLY : gstart
|
||||
!
|
||||
implicit none
|
||||
|
||||
integer, intent(in) :: nspmn, nspmx
|
||||
complex(DP), intent(in) :: eigr( :, : )
|
||||
complex(DP), intent(out) :: beigr( :, : )
|
||||
INTEGER, INTENT(IN), OPTIONAL :: pptype_
|
||||
! pptype_: pseudo type to process: 0 = all, 1 = norm-cons, 2 = ultra-soft
|
||||
!
|
||||
integer :: ig, is, iv, ia, l, inl
|
||||
complex(DP) :: cfact
|
||||
integer :: pptype
|
||||
!
|
||||
call start_clock( 'beta_eigr' )
|
||||
|
||||
IF( PRESENT( pptype_ ) ) THEN
|
||||
pptype = pptype_
|
||||
ELSE
|
||||
pptype = 0
|
||||
END IF
|
||||
|
||||
!allocate( wrk2( ngw, nkb ) )
|
||||
|
||||
beigr = 0.0d0
|
||||
|
||||
!$omp parallel default(none), &
|
||||
!$omp shared(nat,ngw,nh,nhtol,beigr,beta,eigr,ityp,pptype,nspmn,nspmx,upf,gstart,indv_ijkb0), &
|
||||
!$omp private(is,ia,iv,inl,l,cfact,ig)
|
||||
!$omp do
|
||||
DO ia = 1, nat
|
||||
is = ityp(ia)
|
||||
!
|
||||
IF( pptype == 2 .AND. .NOT. upf(is)%tvanp ) CYCLE
|
||||
IF( pptype == 1 .AND. upf(is)%tvanp ) CYCLE
|
||||
IF( is >= nspmn .AND. is <= nspmx ) THEN
|
||||
!
|
||||
inl = indv_ijkb0(ia)
|
||||
|
||||
do iv = 1, nh( is )
|
||||
!
|
||||
l = nhtol( iv, is )
|
||||
!
|
||||
if (l == 0) then
|
||||
cfact = cmplx( 1.0_dp , 0.0_dp )
|
||||
else if (l == 1) then
|
||||
cfact = - cmplx( 0.0_dp , 1.0_dp )
|
||||
else if (l == 2) then
|
||||
cfact = - cmplx( 0.0_dp , 1.0_dp )
|
||||
cfact = cfact * cfact
|
||||
else if (l == 3) then
|
||||
cfact = - cmplx( 0.0_dp , 1.0_dp )
|
||||
cfact = cfact * cfact * cfact
|
||||
endif
|
||||
!
|
||||
! q = 0 component (with weight 1.0)
|
||||
!
|
||||
if (gstart == 2) then
|
||||
beigr( 1, iv + inl ) = cfact * beta(1,iv,is) * eigr(1,ia)
|
||||
end if
|
||||
!
|
||||
! q > 0 components (with weight 2.0)
|
||||
!
|
||||
do ig = gstart, ngw
|
||||
beigr( ig, iv + inl ) = 2.0d0 * cfact * beta(ig,iv,is) * eigr(ig,ia)
|
||||
end do
|
||||
!
|
||||
end do
|
||||
!
|
||||
END IF
|
||||
END DO
|
||||
!$omp end do
|
||||
!$omp end parallel
|
||||
|
||||
call stop_clock( 'beta_eigr' )
|
||||
|
||||
RETURN
|
||||
END SUBROUTINE beta_eigr_x
|
||||
!-----------------------------------------------------------------------
|
||||
#if defined(__CUDA)
|
||||
#define DEVICEATTR ,DEVICE
|
||||
#else
|
||||
#define DEVICEATTR
|
||||
#endif
|
||||
!
|
||||
!
|
||||
!-----------------------------------------------------------------------
|
||||
subroutine nlsm1us_x ( n, beigr, c, becp )
|
||||
subroutine nlsm1us_x ( n, betae, c, becp )
|
||||
!-----------------------------------------------------------------------
|
||||
|
||||
! computes: the array becp
|
||||
|
@ -124,17 +31,36 @@
|
|||
USE mp_global, ONLY : nproc_bgrp, intra_bgrp_comm
|
||||
USE gvecw, only : ngw
|
||||
USE uspp, only : nkb
|
||||
USE gvect, ONLY : gstart
|
||||
#if defined(__CUDA)
|
||||
USE cudafor
|
||||
USE cublas
|
||||
#endif
|
||||
!
|
||||
implicit none
|
||||
|
||||
integer, intent(in) :: n
|
||||
complex(DP), intent(in) :: beigr( :, : ), c( :, : )
|
||||
real(DP), intent(out) :: becp( :, : )
|
||||
complex(DP) DEVICEATTR , intent(in) :: c( :, : )
|
||||
complex(DP) DEVICEATTR , intent(inout) :: betae( :, : )
|
||||
real(DP) DEVICEATTR , intent(out) :: becp( :, : )
|
||||
INTEGER :: i
|
||||
!
|
||||
call start_clock( 'nlsm1us' )
|
||||
|
||||
IF( ngw > 0 .AND. nkb > 0 ) THEN
|
||||
CALL dgemm( 'T', 'N', nkb, n, 2*ngw, 1.0d0, beigr, 2*ngw, c, 2*ngw, 0.0d0, becp, nkb )
|
||||
IF( gstart > 1 ) THEN
|
||||
!$cuf kernel do(1) <<<*,*>>>
|
||||
DO i = 1, nkb
|
||||
betae( 1, i ) = 0.5d0 * betae( 1, i )
|
||||
END DO
|
||||
END IF
|
||||
CALL MYDGEMM( 'T', 'N', nkb, n, 2*ngw, 2.0d0, betae, 2*ngw, c, 2*ngw, 0.0d0, becp, nkb )
|
||||
IF( gstart > 1 ) THEN
|
||||
!$cuf kernel do(1) <<<*,*>>>
|
||||
DO i = 1, nkb
|
||||
betae( 1, i ) = 2.0d0 * betae( 1, i )
|
||||
END DO
|
||||
END IF
|
||||
END IF
|
||||
|
||||
IF( nproc_bgrp > 1 ) THEN
|
||||
|
@ -149,7 +75,7 @@
|
|||
!
|
||||
!
|
||||
!-----------------------------------------------------------------------
|
||||
subroutine nlsm1_x ( n, nspmn, nspmx, eigr, c, becp, pptype_ )
|
||||
subroutine nlsm1_x ( n, betae, c, becp, pptype_ )
|
||||
!-----------------------------------------------------------------------
|
||||
|
||||
! computes: the array becp
|
||||
|
@ -170,21 +96,20 @@
|
|||
USE uspp, only : nkb, nhtol, beta, indv_ijkb0
|
||||
USE uspp_param, only : nh, upf, nhm
|
||||
USE gvect, ONLY : gstart
|
||||
USE cp_interfaces, only : beta_eigr
|
||||
!
|
||||
implicit none
|
||||
|
||||
integer, intent(in) :: n, nspmn, nspmx
|
||||
complex(DP), intent(in) :: eigr( :, : ), c( :, : )
|
||||
real(DP), intent(out) :: becp( :, : )
|
||||
integer, intent(in) :: n
|
||||
complex(DP), intent(in) :: c( :, : )
|
||||
complex(DP), intent(inout) :: betae( :, : )
|
||||
real(DP), intent(out) :: becp( :, : )
|
||||
INTEGER, INTENT(IN), OPTIONAL :: pptype_
|
||||
! pptype_: pseudo type to process: 0 = all, 1 = norm-cons, 2 = ultra-soft
|
||||
!
|
||||
integer :: ig, is, iv, ia, l, inl
|
||||
real(DP), allocatable :: becps( :, : )
|
||||
complex(DP), allocatable :: wrk2( :, : )
|
||||
complex(DP) :: cfact
|
||||
integer :: ig, is, iv, ia, l, inl
|
||||
integer :: pptype
|
||||
real(DP), allocatable :: becps( :, : )
|
||||
LOGICAL :: nothing_to_do
|
||||
!
|
||||
call start_clock( 'nlsm1' )
|
||||
|
||||
|
@ -194,21 +119,40 @@
|
|||
pptype = 0
|
||||
END IF
|
||||
|
||||
allocate( wrk2( ngw, nkb ) )
|
||||
allocate( becps( SIZE(becp,1), SIZE(becp,2) ) )
|
||||
|
||||
CALL beta_eigr ( wrk2, nspmn, nspmx, eigr, pptype_ )
|
||||
|
||||
IF( ngw > 0 .AND. nkb > 0 ) THEN
|
||||
CALL dgemm( 'T', 'N', nkb, n, 2*ngw, 1.0d0, wrk2, 2*ngw, c, 2*ngw, 0.0d0, becps, nkb )
|
||||
IF( pptype == 1 ) THEN
|
||||
nothing_to_do = .TRUE.
|
||||
do is = 1, nsp
|
||||
IF( .NOT. upf(is)%tvanp ) THEN
|
||||
nothing_to_do = .FALSE.
|
||||
END IF
|
||||
END DO
|
||||
IF( nothing_to_do ) GO TO 100
|
||||
END IF
|
||||
|
||||
DEALLOCATE( wrk2 )
|
||||
IF( pptype == 0 ) THEN
|
||||
IF( ngw > 0 .AND. nkb > 0 ) THEN
|
||||
IF( gstart > 1 ) betae( 1, : ) = 0.5d0 * betae( 1, : )
|
||||
CALL dgemm( 'T', 'N', nkb, n, 2*ngw, 2.0d0, betae, 2*ngw, c, 2*ngw, 0.0d0, becp, SIZE(becp,1) )
|
||||
IF( gstart > 1 ) betae( 1, : ) = 2.0d0 * betae( 1, : )
|
||||
END IF
|
||||
IF( nproc_bgrp > 1 ) THEN
|
||||
CALL mp_sum( becp, intra_bgrp_comm )
|
||||
END IF
|
||||
GO TO 100
|
||||
END IF
|
||||
|
||||
allocate( becps( SIZE(becp,1), SIZE(becp,2) ) )
|
||||
|
||||
IF( ngw > 0 .AND. nkb > 0 ) THEN
|
||||
IF( gstart > 1 ) betae( 1, : ) = 0.5d0 * betae( 1, : )
|
||||
CALL dgemm( 'T', 'N', nkb, n, 2*ngw, 2.0d0, betae, 2*ngw, c, 2*ngw, 0.0d0, becps, nkb )
|
||||
IF( gstart > 1 ) betae( 1, : ) = 2.0d0 * betae( 1, : )
|
||||
END IF
|
||||
|
||||
IF( nproc_bgrp > 1 ) THEN
|
||||
CALL mp_sum( becps, intra_bgrp_comm )
|
||||
END IF
|
||||
do is = nspmn, nspmx
|
||||
do is = 1, nsp
|
||||
IF( pptype == 2 .AND. .NOT. upf(is)%tvanp ) CYCLE
|
||||
IF( pptype == 1 .AND. upf(is)%tvanp ) CYCLE
|
||||
DO ia = 1, nat
|
||||
|
@ -218,21 +162,21 @@
|
|||
becp(inl+iv,:) = becps( inl+iv, : )
|
||||
end do
|
||||
END IF
|
||||
end do
|
||||
end do
|
||||
end do
|
||||
!
|
||||
|
||||
DEALLOCATE( becps )
|
||||
|
||||
100 CONTINUE
|
||||
|
||||
call stop_clock( 'nlsm1' )
|
||||
|
||||
return
|
||||
end subroutine nlsm1_x
|
||||
!-----------------------------------------------------------------------
|
||||
|
||||
|
||||
!-------------------------------------------------------------------------
|
||||
subroutine nlsm2_bgrp_x( ngw, nkb, eigr, c_bgrp, becdr_bgrp, nbspx_bgrp, nbsp_bgrp )
|
||||
subroutine nlsm2_bgrp_x( ngw, nkb, betae, c_bgrp, becdr_bgrp, nbspx_bgrp, nbsp_bgrp )
|
||||
!-----------------------------------------------------------------------
|
||||
|
||||
! computes: the array becdr
|
||||
|
@ -245,9 +189,6 @@
|
|||
!
|
||||
|
||||
USE kinds, ONLY : DP
|
||||
use ions_base, only : nsp, ityp, nat
|
||||
use uspp, only : nhtol, beta, indv_ijkb0
|
||||
use uspp_param, only : nh, upf
|
||||
use cell_base, only : tpiba
|
||||
use mp, only : mp_sum
|
||||
use mp_global, only : nproc_bgrp, intra_bgrp_comm
|
||||
|
@ -256,69 +197,40 @@
|
|||
implicit none
|
||||
|
||||
integer, intent(in) :: ngw, nkb, nbspx_bgrp, nbsp_bgrp
|
||||
complex(DP), intent(in) :: eigr(:,:), c_bgrp(:,:)
|
||||
complex(DP), intent(in) :: betae(:,:), c_bgrp(:,:)
|
||||
real(DP), intent(out) :: becdr_bgrp(:,:,:)
|
||||
!
|
||||
complex(DP), allocatable :: wrk2(:,:)
|
||||
!
|
||||
integer :: ig, is, iv, ia, k, l, inl
|
||||
complex(DP) :: cfact
|
||||
integer :: ig, iv, k, info
|
||||
complex(DP) :: cfact1, cfact2
|
||||
!
|
||||
call start_clock( 'nlsm2' )
|
||||
|
||||
allocate( wrk2( ngw, nkb ) )
|
||||
cfact2 = - cmplx( 0.0_dp , 1.0_dp ) * tpiba * 2.0d0
|
||||
cfact1 = - cmplx( 0.0_dp , 1.0_dp ) * tpiba
|
||||
|
||||
allocate( wrk2, MOLD = betae, STAT = info )
|
||||
IF( info /= 0 ) &
|
||||
CALL errore( ' nlsm2 ', ' allocating wrk2', ABS( info ) )
|
||||
!
|
||||
DO k = 1, 3
|
||||
!
|
||||
DO ia = 1, nat
|
||||
|
||||
is = ityp(ia)
|
||||
|
||||
inl = indv_ijkb0(ia)
|
||||
|
||||
do iv=1,nh(is)
|
||||
!
|
||||
! order of states: s_1 p_x1 p_z1 p_y1 s_2 p_x2 p_z2 p_y2
|
||||
!
|
||||
l=nhtol(iv,is)
|
||||
|
||||
! compute (-i)^(l+1)
|
||||
!
|
||||
if (l == 0) then
|
||||
cfact = - cmplx( 0.0_dp , 1.0_dp )
|
||||
else if (l == 1) then
|
||||
cfact = - cmplx( 0.0_dp , 1.0_dp )
|
||||
cfact = cfact * cfact
|
||||
else if (l == 2) then
|
||||
cfact = - cmplx( 0.0_dp , 1.0_dp )
|
||||
cfact = cfact * cfact * cfact
|
||||
else if (l == 3) then
|
||||
cfact = cmplx( 1.0_dp , 0.0_dp )
|
||||
endif
|
||||
|
||||
cfact = cfact * tpiba
|
||||
|
||||
! q = 0 component (with weight 1.0)
|
||||
if (gstart == 2) then
|
||||
wrk2(1,iv+inl) = cfact * g(k,1) * beta(1,iv,is) * eigr(1,ia)
|
||||
end if
|
||||
! q > 0 components (with weight 2.0)
|
||||
do ig=gstart,ngw
|
||||
wrk2(ig,iv+inl) = cfact * 2.0d0 * g(k,ig) * beta(ig,iv,is) * eigr(ig,ia)
|
||||
end do
|
||||
!
|
||||
end do
|
||||
|
||||
do iv=1,nkb
|
||||
wrk2(1,iv) = cfact1 * g(k,1) * betae(1,iv)
|
||||
end do
|
||||
|
||||
!$omp parallel do default(shared) private(iv,ig) collapse(2)
|
||||
do iv=1,nkb
|
||||
do ig=gstart,ngw
|
||||
wrk2(ig,iv) = cfact2 * g(k,ig) * betae(ig,iv)
|
||||
end do
|
||||
end do
|
||||
!$omp end parallel do
|
||||
IF( ngw > 0 .AND. nkb > 0 ) THEN
|
||||
CALL dgemm( 'T', 'N', nkb, nbsp_bgrp, 2*ngw, 1.0d0, wrk2, 2*ngw, &
|
||||
CALL dgemm( 'T', 'N', nkb, nbsp_bgrp, 2*ngw, 1.0d0, wrk2(1,1), 2*ngw, &
|
||||
c_bgrp, 2*ngw, 0.0d0, becdr_bgrp( 1, 1, k ), nkb )
|
||||
END IF
|
||||
|
||||
end do
|
||||
|
||||
|
||||
deallocate( wrk2 )
|
||||
|
||||
IF( nproc_bgrp > 1 ) THEN
|
||||
|
@ -331,7 +243,86 @@
|
|||
end subroutine nlsm2_bgrp_x
|
||||
!-----------------------------------------------------------------------
|
||||
|
||||
#if defined (__CUDA)
|
||||
!-------------------------------------------------------------------------
|
||||
subroutine nlsm2_bgrp_gpu_x( ngw, nkb, betae, c_bgrp, becdr_bgrp, nbspx_bgrp, nbsp_bgrp )
|
||||
!-----------------------------------------------------------------------
|
||||
|
||||
! computes: the array becdr
|
||||
! becdr(ia,n,iv,is,k)
|
||||
! =2.0 sum_g> g_k beta(g,iv,is) re[ (i)**(l+1) e^(ig.r_ia) c(g,n)]
|
||||
!
|
||||
! routine makes use of c*(g)=c(-g) (g> see routine ggen)
|
||||
! input : eigr, c
|
||||
! output: becdr
|
||||
!
|
||||
|
||||
USE kinds, ONLY : DP
|
||||
use cell_base, only : tpiba
|
||||
use mp, only : mp_sum
|
||||
use mp_global, only : nproc_bgrp, intra_bgrp_comm
|
||||
use gvect, only : gstart
|
||||
use gvect_gpum, only : g_d
|
||||
USE device_memcpy_m, ONLY : dev_memcpy
|
||||
USE cudafor
|
||||
USE cublas
|
||||
!
|
||||
implicit none
|
||||
|
||||
integer, intent(in) :: ngw, nkb, nbspx_bgrp, nbsp_bgrp
|
||||
complex(DP), intent(in), DEVICE :: c_bgrp(:,:)
|
||||
complex(DP), intent(in), DEVICE :: betae(:,:)
|
||||
real(DP), intent(out) :: becdr_bgrp(:,:,:)
|
||||
!
|
||||
complex(DP), allocatable, DEVICE :: wrk2(:,:)
|
||||
real(DP), allocatable, DEVICE :: becdr_d(:,:)
|
||||
!
|
||||
integer :: ig, iv, k, info
|
||||
complex(DP) :: cfact1, cfact2
|
||||
!
|
||||
call start_clock( 'nlsm2' )
|
||||
|
||||
ALLOCATE( wrk2, MOLD=betae, STAT = info )
|
||||
IF( info /= 0 ) &
|
||||
CALL errore( ' nlsm2 ', ' allocating wrk2', ABS( info ) )
|
||||
ALLOCATE( becdr_d( SIZE( becdr_bgrp, 1 ), SIZE( becdr_bgrp, 2 ) ), STAT=info )
|
||||
IF( info /= 0 ) &
|
||||
CALL errore( ' nlsm2 ', ' allocating becdr_d ', ABS( info ) )
|
||||
|
||||
cfact2 = - cmplx( 0.0_dp , 1.0_dp ) * tpiba * 2.0d0
|
||||
cfact1 = - cmplx( 0.0_dp , 1.0_dp ) * tpiba
|
||||
|
||||
DO k = 1, 3
|
||||
!$cuf kernel do(1) <<<*,*>>>
|
||||
do iv=1,nkb
|
||||
wrk2(1,iv) = cfact1 * g_d(k,1) * betae(1,iv)
|
||||
end do
|
||||
!$cuf kernel do(2) <<<*,*>>>
|
||||
do iv=1,nkb
|
||||
do ig=gstart,ngw
|
||||
wrk2(ig,iv) = cfact2 * g_d(k,ig) * betae(ig,iv)
|
||||
end do
|
||||
end do
|
||||
IF( ngw > 0 .AND. nkb > 0 ) THEN
|
||||
CALL MYDGEMM( 'T', 'N', nkb, nbsp_bgrp, 2*ngw, 1.0d0, wrk2(1,1), 2*ngw, &
|
||||
c_bgrp, 2*ngw, 0.0d0, becdr_d, nkb )
|
||||
CALL dev_memcpy( becdr_bgrp(:,:,k), becdr_d )
|
||||
END IF
|
||||
end do
|
||||
|
||||
DEALLOCATE( becdr_d )
|
||||
deallocate( wrk2 )
|
||||
|
||||
IF( nproc_bgrp > 1 ) THEN
|
||||
CALL mp_sum( becdr_bgrp, intra_bgrp_comm )
|
||||
END IF
|
||||
|
||||
call stop_clock( 'nlsm2' )
|
||||
!
|
||||
return
|
||||
end subroutine nlsm2_bgrp_gpu_x
|
||||
!-----------------------------------------------------------------------
|
||||
#endif
|
||||
|
||||
!-----------------------------------------------------------------------
|
||||
SUBROUTINE ennl_x( ennl_val, rhovan, bec_bgrp )
|
||||
|
@ -356,35 +347,40 @@
|
|||
! local
|
||||
!
|
||||
real(DP) :: sumt, sums(2), ennl_t
|
||||
integer :: is, iv, jv, ijv, inl, jnl, ia, iss, i
|
||||
integer :: is, iv, jv, ijv, inl, jnl, ia, iss, i, indv
|
||||
INTEGER :: omp_get_num_threads
|
||||
!
|
||||
ennl_t = 0.d0
|
||||
!
|
||||
do is = 1, nsp
|
||||
!$omp parallel num_threads(min(4,omp_get_num_threads())) default(none) &
|
||||
!$omp shared(nat,ityp,indv_ijkb0,nh,nbsp_bgrp,ispin_bgrp,f_bgrp,bec_bgrp,rhovan,dvan,nspin,ennl_t) &
|
||||
!$omp private(ia,is,indv,iv,inl,jv,ijv,jnl,sums,iss,i,sumt)
|
||||
!$omp do reduction(+:ennl_t)
|
||||
do ia = 1, nat
|
||||
is = ityp(ia)
|
||||
indv = indv_ijkb0(ia)
|
||||
do iv = 1, nh(is)
|
||||
inl = indv + iv
|
||||
do jv = iv, nh(is)
|
||||
ijv = (jv-1)*jv/2 + iv
|
||||
do ia = 1, nat
|
||||
IF( ityp(ia) == is ) THEN
|
||||
inl = indv_ijkb0(ia) + iv
|
||||
jnl = indv_ijkb0(ia) + jv
|
||||
sums = 0.d0
|
||||
do i = 1, nbsp_bgrp
|
||||
iss = ispin_bgrp(i)
|
||||
sums(iss) = sums(iss) + f_bgrp(i) * bec_bgrp(inl,i) * bec_bgrp(jnl,i)
|
||||
end do
|
||||
sumt = 0.d0
|
||||
do iss = 1, nspin
|
||||
rhovan( ijv, ia, iss ) = sums( iss )
|
||||
sumt = sumt + sums( iss )
|
||||
end do
|
||||
if( iv .ne. jv ) sumt = 2.d0 * sumt
|
||||
ennl_t = ennl_t + sumt * dvan( jv, iv, is)
|
||||
END IF
|
||||
jnl = indv + jv
|
||||
sums = 0.d0
|
||||
do i = 1, nbsp_bgrp
|
||||
iss = ispin_bgrp(i)
|
||||
sums(iss) = sums(iss) + f_bgrp(i) * bec_bgrp(inl,i) * bec_bgrp(jnl,i)
|
||||
end do
|
||||
sumt = 0.d0
|
||||
do iss = 1, nspin
|
||||
rhovan( ijv, ia, iss ) = sums( iss )
|
||||
sumt = sumt + sums( iss )
|
||||
end do
|
||||
if( iv .ne. jv ) sumt = 2.d0 * sumt
|
||||
ennl_t = ennl_t + sumt * dvan( jv, iv, is)
|
||||
end do
|
||||
end do
|
||||
end do
|
||||
!$omp end do
|
||||
!$omp end parallel
|
||||
!
|
||||
ennl_val = ennl_t
|
||||
!
|
||||
|
@ -435,10 +431,8 @@
|
|||
end subroutine calrhovan_x
|
||||
!-----------------------------------------------------------------------
|
||||
|
||||
|
||||
|
||||
!-----------------------------------------------------------------------
|
||||
subroutine calbec_x ( nspmn, nspmx, eigr, c, bec, pptype_ )
|
||||
subroutine calbec_x ( n, betae, c, bec, pptype_ )
|
||||
!-----------------------------------------------------------------------
|
||||
|
||||
! this routine calculates array bec
|
||||
|
@ -450,59 +444,79 @@
|
|||
!
|
||||
|
||||
USE kinds, ONLY : DP
|
||||
use electrons_base, only : nbsp
|
||||
use cp_interfaces, only : nlsm1
|
||||
!
|
||||
implicit none
|
||||
!
|
||||
integer, intent(in) :: nspmn, nspmx
|
||||
real(DP), intent(out) :: bec( :, : )
|
||||
complex(DP), intent(in) :: c( :, : ), eigr( :, : )
|
||||
INTEGER, INTENT(IN) :: n
|
||||
real(DP), intent(out) :: bec( :, : )
|
||||
complex(DP), intent(in) :: c( :, : )
|
||||
complex(DP), intent(inout) :: betae( :, : )
|
||||
INTEGER, INTENT(IN), OPTIONAL :: pptype_
|
||||
|
||||
! local variables
|
||||
!
|
||||
call start_clock( 'calbec' )
|
||||
!
|
||||
call nlsm1( nbsp, nspmn, nspmx, eigr, c, bec, pptype_ )
|
||||
!
|
||||
call nlsm1( n, betae, c, bec, pptype_ )
|
||||
call stop_clock( 'calbec' )
|
||||
!
|
||||
return
|
||||
end subroutine calbec_x
|
||||
!-----------------------------------------------------------------------
|
||||
|
||||
!-----------------------------------------------------------------------
|
||||
subroutine calbec_bgrp_x ( nspmn, nspmx, eigr, c_bgrp, bec_bgrp, pptype_ )
|
||||
!-----------------------------------------------------------------------
|
||||
|
||||
! this routine calculates array bec
|
||||
!
|
||||
! < psi_n | beta_i,i > = c_n(0) beta_i,i(0) +
|
||||
! 2 sum_g> re(c_n*(g) (-i)**l beta_i,i(g) e^-ig.r_i)
|
||||
!
|
||||
! routine makes use of c(-g)=c*(g) and beta(-g)=beta*(g)
|
||||
!
|
||||
!-----------------------------------------------------------------------
|
||||
SUBROUTINE dbeta_eigr_x( dbeigr, eigr )
|
||||
!-----------------------------------------------------------------------
|
||||
!
|
||||
USE kinds, ONLY : DP
|
||||
use ions_base, only : nat, ityp
|
||||
use uspp, only : nhtol, nkb, dbeta, indv_ijkb0
|
||||
use uspp_param, only : nh, nhm
|
||||
use gvect, only : gstart
|
||||
use gvecw, only : ngw
|
||||
!
|
||||
implicit none
|
||||
!
|
||||
include 'laxlib.fh'
|
||||
!
|
||||
complex(DP), intent(out) :: dbeigr( :, :, :, : )
|
||||
complex(DP), intent(in) :: eigr(:,:)
|
||||
!
|
||||
integer :: ig, is, iv, ia, l, inl, i, j
|
||||
complex(DP) :: cfact(4)
|
||||
!
|
||||
!if (l == 0) then
|
||||
cfact(1) = cmplx( 1.0_dp , 0.0_dp )
|
||||
!else if (l == 1) then
|
||||
cfact(2) = - cmplx( 0.0_dp , 1.0_dp )
|
||||
!else if (l == 2) then
|
||||
cfact(3) = - cmplx( 0.0_dp , 1.0_dp )
|
||||
cfact(3) = cfact(3) * cfact(3)
|
||||
!else if (l == 3) then
|
||||
cfact(4) = - cmplx( 0.0_dp , 1.0_dp )
|
||||
cfact(4) = cfact(4) * cfact(4) * cfact(4)
|
||||
!endif
|
||||
|
||||
USE kinds, ONLY : DP
|
||||
use electrons_base, only : nbsp_bgrp
|
||||
use cp_interfaces, only : nlsm1
|
||||
!
|
||||
implicit none
|
||||
!
|
||||
integer, intent(in) :: nspmn, nspmx
|
||||
real(DP), intent(out) :: bec_bgrp( :, : )
|
||||
complex(DP), intent(in) :: c_bgrp( :, : ), eigr( :, : )
|
||||
INTEGER, INTENT(IN), OPTIONAL :: pptype_
|
||||
!
|
||||
call start_clock( 'calbec' )
|
||||
!
|
||||
call nlsm1( nbsp_bgrp, nspmn, nspmx, eigr, c_bgrp, bec_bgrp, pptype_ )
|
||||
!
|
||||
call stop_clock( 'calbec' )
|
||||
!
|
||||
return
|
||||
end subroutine calbec_bgrp_x
|
||||
do j=1,3
|
||||
do i=1,3
|
||||
do ia = 1, nat
|
||||
is = ityp(ia)
|
||||
inl = indv_ijkb0(ia)
|
||||
do iv=1,nh(is)
|
||||
l=nhtol(iv,is)
|
||||
! q = 0 component (with weight 1.0)
|
||||
dbeigr(1,iv+inl,i,j)= cfact(l+1)*dbeta(1,iv,is,i,j)*eigr(1,ia)
|
||||
! q > 0 components (with weight 2.0)
|
||||
do ig = gstart, ngw
|
||||
dbeigr(ig,iv+inl,i,j) = 2.0d0*cfact(l+1)*dbeta(ig,iv,is,i,j)*eigr(ig,ia)
|
||||
end do
|
||||
end do
|
||||
end do
|
||||
end do
|
||||
end do
|
||||
!
|
||||
return
|
||||
end subroutine dbeta_eigr_x
|
||||
!-----------------------------------------------------------------------
|
||||
|
||||
|
||||
!-----------------------------------------------------------------------
|
||||
|
@ -526,8 +540,9 @@ SUBROUTINE caldbec_bgrp_x( eigr, c_bgrp, dbec, idesc )
|
|||
use uspp_param, only : nh, nhm
|
||||
use gvect, only : gstart
|
||||
use gvecw, only : ngw
|
||||
use electrons_base, only : nspin, iupdwn, nupdwn, nbspx_bgrp, iupdwn_bgrp, nupdwn_bgrp, &
|
||||
ibgrp_g2l, i2gupdwn_bgrp, nbspx, nbsp_bgrp
|
||||
use electrons_base, only : nspin, iupdwn, nupdwn, nbspx_bgrp, iupdwn_bgrp, nupdwn_bgrp, &
|
||||
ibgrp_g2l, i2gupdwn_bgrp, nbspx, nbsp_bgrp
|
||||
use cp_interfaces, only : dbeta_eigr
|
||||
!
|
||||
implicit none
|
||||
!
|
||||
|
@ -538,7 +553,7 @@ SUBROUTINE caldbec_bgrp_x( eigr, c_bgrp, dbec, idesc )
|
|||
real(DP), intent(out) :: dbec( :, :, :, : )
|
||||
integer, intent(in) :: idesc( :, : )
|
||||
!
|
||||
complex(DP), allocatable :: wrk2(:,:)
|
||||
complex(DP), allocatable :: wrk2(:,:,:,:)
|
||||
real(DP), allocatable :: dwrk_bgrp(:,:)
|
||||
!
|
||||
integer :: ig, is, iv, ia, l, inl, i, j, ii, iw, iss, nr, ir, istart, nss
|
||||
|
@ -549,47 +564,23 @@ SUBROUTINE caldbec_bgrp_x( eigr, c_bgrp, dbec, idesc )
|
|||
!
|
||||
dbec = 0.0d0
|
||||
!
|
||||
allocate( wrk2( ngw, nkb, 3, 3 ) )
|
||||
allocate( dwrk_bgrp( nkb, nbspx_bgrp ) )
|
||||
!
|
||||
CALL dbeta_eigr( wrk2, eigr )
|
||||
!
|
||||
do j=1,3
|
||||
do i=1,3
|
||||
IF( ngw > 0 .AND. nkb > 0 ) THEN
|
||||
CALL dgemm( 'T', 'N', nkb, nbsp_bgrp, 2*ngw, 1.0d0, wrk2(1,1,i,j), 2*ngw, &
|
||||
c_bgrp, 2*ngw, 0.0d0, dwrk_bgrp(1,1), nkb )
|
||||
END IF
|
||||
if( nproc_bgrp > 1 ) then
|
||||
call mp_sum( dwrk_bgrp, intra_bgrp_comm )
|
||||
end if
|
||||
do ia = 1, nat
|
||||
is = ityp(ia)
|
||||
allocate( wrk2( ngw, nh(is) ) )
|
||||
allocate( dwrk_bgrp( nh(is), nbspx_bgrp ) )
|
||||
|
||||
do iv=1,nh(is)
|
||||
l=nhtol(iv,is)
|
||||
if (l == 0) then
|
||||
cfact = cmplx( 1.0_dp , 0.0_dp )
|
||||
else if (l == 1) then
|
||||
cfact = - cmplx( 0.0_dp , 1.0_dp )
|
||||
else if (l == 2) then
|
||||
cfact = - cmplx( 0.0_dp , 1.0_dp )
|
||||
cfact = cfact * cfact
|
||||
else if (l == 3) then
|
||||
cfact = - cmplx( 0.0_dp , 1.0_dp )
|
||||
cfact = cfact * cfact * cfact
|
||||
else
|
||||
CALL errore(' caldbec ', ' l not implemented ', ABS( l ) )
|
||||
endif
|
||||
!
|
||||
! q = 0 component (with weight 1.0)
|
||||
if (gstart == 2) then
|
||||
wrk2(1,iv)= cfact*dbeta(1,iv,is,i,j)*eigr(1,ia)
|
||||
end if
|
||||
! q > 0 components (with weight 2.0)
|
||||
do ig = gstart, ngw
|
||||
wrk2(ig,iv) = 2.0d0*cfact*dbeta(ig,iv,is,i,j)*eigr(ig,ia)
|
||||
end do
|
||||
end do
|
||||
IF( ngw > 0 .AND. nh(is) > 0 ) THEN
|
||||
CALL dgemm( 'T', 'N', nh(is), nbsp_bgrp, 2*ngw, 1.0d0, wrk2, 2*ngw, &
|
||||
c_bgrp, 2*ngw, 0.0d0, dwrk_bgrp(1,1), nh(is) )
|
||||
END IF
|
||||
|
||||
if( nproc_bgrp > 1 ) then
|
||||
call mp_sum( dwrk_bgrp, intra_bgrp_comm )
|
||||
end if
|
||||
|
||||
inl = indv_ijkb0(ia)
|
||||
do iss=1,nspin
|
||||
IF( idesc( LAX_DESC_ACTIVE_NODE, iss ) > 0 ) THEN
|
||||
nr = idesc( LAX_DESC_NR, iss )
|
||||
|
@ -600,19 +591,18 @@ SUBROUTINE caldbec_bgrp_x( eigr, c_bgrp, dbec, idesc )
|
|||
ibgrp_i = ibgrp_g2l( ii + ir - 1 + istart - 1 )
|
||||
IF( ibgrp_i > 0 ) THEN
|
||||
do iw = 1, nh(is)
|
||||
dbec( indv_ijkb0(ia) + iw, ii + (iss-1)*nrcx, i, j ) = dwrk_bgrp( iw, ibgrp_i )
|
||||
dbec( inl + iw, ii + (iss-1)*nrcx, i, j ) = dwrk_bgrp( inl + iw, ibgrp_i )
|
||||
end do
|
||||
END IF
|
||||
end do
|
||||
END IF
|
||||
end do
|
||||
|
||||
deallocate( wrk2 )
|
||||
deallocate( dwrk_bgrp )
|
||||
end do
|
||||
end do
|
||||
end do
|
||||
|
||||
deallocate( wrk2 )
|
||||
deallocate( dwrk_bgrp )
|
||||
if( nbgrp > 1 ) then
|
||||
CALL mp_sum( dbec, inter_bgrp_comm )
|
||||
end if
|
||||
|
@ -659,6 +649,11 @@ subroutine dennl_x( bec_bgrp, dbec, drhovan, denl, idesc )
|
|||
denl=0.d0
|
||||
drhovan=0.0d0
|
||||
|
||||
!$omp parallel default(none) &
|
||||
!$omp shared(nat,ityp,indv_ijkb0,nh,nbsp_bgrp,ispin_bgrp,f_bgrp,bec_bgrp,drhovan,dvan,nspin,denl) &
|
||||
!$omp shared(idesc,iupdwn,nupdwn,ibgrp_g2l,nrcx,dbec) &
|
||||
!$omp private(ia,is,iv,inl,jv,ijv,jnl,dsums,iss,i,dsum,ii,ir,k,j,nr,istart,nss,ibgrp)
|
||||
!$omp do reduction(+:denl)
|
||||
do ia=1,nat
|
||||
is = ityp(ia)
|
||||
do iv=1,nh(is)
|
||||
|
@ -701,6 +696,8 @@ subroutine dennl_x( bec_bgrp, dbec, drhovan, denl, idesc )
|
|||
end do
|
||||
end do
|
||||
end do
|
||||
!$omp end do
|
||||
!$omp end parallel
|
||||
|
||||
CALL mp_sum( denl, intra_bgrp_comm )
|
||||
CALL mp_sum( drhovan, intra_bgrp_comm )
|
||||
|
@ -720,7 +717,7 @@ end subroutine dennl_x
|
|||
|
||||
|
||||
!-----------------------------------------------------------------------
|
||||
subroutine nlfq_bgrp_x( c_bgrp, eigr, bec_bgrp, becdr_bgrp, fion )
|
||||
subroutine nlfq_bgrp_x( c_bgrp, betae, bec_bgrp, becdr_bgrp, fion )
|
||||
!-----------------------------------------------------------------------
|
||||
!
|
||||
! contribution to fion due to nonlocal part
|
||||
|
@ -739,12 +736,16 @@ subroutine nlfq_bgrp_x( c_bgrp, eigr, bec_bgrp, becdr_bgrp, fion )
|
|||
!
|
||||
implicit none
|
||||
!
|
||||
COMPLEX(DP), INTENT(IN) :: c_bgrp( :, : ), eigr( :, : )
|
||||
COMPLEX(DP), INTENT(IN) :: c_bgrp( :, : )
|
||||
COMPLEX(DP), INTENT(IN) :: betae( :, : )
|
||||
#if defined (__CUDA)
|
||||
ATTRIBUTES( DEVICE ) :: c_bgrp, betae
|
||||
#endif
|
||||
REAL(DP), INTENT(IN) :: bec_bgrp( :, : )
|
||||
REAL(DP), INTENT(OUT) :: becdr_bgrp( :, :, : )
|
||||
REAL(DP), INTENT(OUT) :: fion( :, : )
|
||||
!
|
||||
integer :: k, is, ia, inl, iv, jv, i
|
||||
integer :: k, is, ia, inl, jnl, iv, jv, i
|
||||
real(DP) :: temp
|
||||
real(DP) :: sum_tmpdr
|
||||
!
|
||||
|
@ -758,7 +759,7 @@ subroutine nlfq_bgrp_x( c_bgrp, eigr, bec_bgrp, becdr_bgrp, fion )
|
|||
!
|
||||
! nlsm2 fills becdr
|
||||
!
|
||||
call nlsm2_bgrp( ngw, nkb, eigr, c_bgrp, becdr_bgrp, nbspx_bgrp, nbsp_bgrp )
|
||||
call nlsm2_bgrp( ngw, nkb, betae, c_bgrp, becdr_bgrp, nbspx_bgrp, nbsp_bgrp )
|
||||
!
|
||||
allocate ( fion_loc( 3, nat ) )
|
||||
!
|
||||
|
@ -767,17 +768,17 @@ subroutine nlfq_bgrp_x( c_bgrp, eigr, bec_bgrp, becdr_bgrp, fion )
|
|||
!$omp parallel default(none), &
|
||||
!$omp shared(becdr_bgrp,bec_bgrp,fion_loc,f_bgrp,deeq,dvan,nbsp_bgrp,indv_ijkb0,nh, &
|
||||
!$omp nat,nhm,nbspx_bgrp,ispin_bgrp,nproc_bgrp,me_bgrp,ityp), &
|
||||
!$omp private(tmpbec,tmpdr,is,ia,iv,jv,k,inl,temp,i,mytid,ntids,sum_tmpdr)
|
||||
!$omp private(tmpbec,tmpdr,is,ia,iv,jv,k,inl,jnl,temp,i,mytid,ntids,sum_tmpdr)
|
||||
|
||||
#if defined(_OPENMP)
|
||||
mytid = omp_get_thread_num() ! take the thread ID
|
||||
ntids = omp_get_num_threads() ! take the number of threads
|
||||
#endif
|
||||
|
||||
allocate ( tmpbec( nhm, nbspx_bgrp ), tmpdr( nhm, nbspx_bgrp ) )
|
||||
allocate ( tmpbec( nbspx_bgrp, nhm ), tmpdr( nbspx_bgrp, nhm ) )
|
||||
|
||||
DO k = 1, 3
|
||||
DO ia=1,nat
|
||||
DO ia = 1, nat
|
||||
is = ityp(ia)
|
||||
|
||||
! better if we distribute to MPI tasks too!
|
||||
|
@ -790,29 +791,27 @@ subroutine nlfq_bgrp_x( c_bgrp, eigr, bec_bgrp, becdr_bgrp, fion )
|
|||
IF( MOD( ( ia + (k-1)*nat ) / nproc_bgrp, ntids ) /= mytid ) CYCLE
|
||||
#endif
|
||||
tmpbec = 0.d0
|
||||
tmpdr = 0.d0
|
||||
|
||||
do iv=1,nh(is)
|
||||
do jv=1,nh(is)
|
||||
inl = indv_ijkb0(ia) + jv
|
||||
do jv=1,nh(is)
|
||||
jnl = indv_ijkb0(ia) + jv
|
||||
do iv=1,nh(is)
|
||||
do i = 1, nbsp_bgrp
|
||||
temp = dvan(iv,jv,is) + deeq(jv,iv,ia,ispin_bgrp( i ) )
|
||||
tmpbec(iv,i) = tmpbec(iv,i) + temp * bec_bgrp(inl,i)
|
||||
tmpbec(i,iv) = tmpbec(i,iv) + temp * bec_bgrp(jnl,i)
|
||||
end do
|
||||
end do
|
||||
end do
|
||||
|
||||
do iv=1,nh(is)
|
||||
do iv = 1, nh(is)
|
||||
inl = indv_ijkb0(ia) + iv
|
||||
do i = 1, nbsp_bgrp
|
||||
tmpdr(iv,i) = f_bgrp( i ) * becdr_bgrp( inl, i, k )
|
||||
tmpdr(i,iv) = f_bgrp( i ) * becdr_bgrp( inl, i, k )
|
||||
end do
|
||||
end do
|
||||
|
||||
sum_tmpdr = 0.0d0
|
||||
do i = 1, nbsp_bgrp
|
||||
do iv = 1, nh(is)
|
||||
sum_tmpdr = sum_tmpdr + tmpdr(iv,i)*tmpbec(iv,i)
|
||||
do iv = 1, nh(is)
|
||||
do i = 1, nbsp_bgrp
|
||||
sum_tmpdr = sum_tmpdr + tmpdr(i,iv)*tmpbec(i,iv)
|
||||
end do
|
||||
end do
|
||||
|
||||
|
@ -823,7 +822,6 @@ subroutine nlfq_bgrp_x( c_bgrp, eigr, bec_bgrp, becdr_bgrp, fion )
|
|||
deallocate ( tmpbec, tmpdr )
|
||||
|
||||
!$omp end parallel
|
||||
|
||||
!
|
||||
CALL mp_sum( fion_loc, intra_bgrp_comm )
|
||||
IF( nbgrp > 1 ) THEN
|
||||
|
|
|
@ -305,7 +305,7 @@
|
|||
|
||||
!
|
||||
!-----------------------------------------------------------------------
|
||||
subroutine set_cc( irb, eigrb, rhoc )
|
||||
subroutine set_cc( rhoc )
|
||||
!-----------------------------------------------------------------------
|
||||
!
|
||||
! Calculate core charge contribution in real space, rhoc(r)
|
||||
|
@ -320,91 +320,76 @@
|
|||
use core, only: rhocb
|
||||
use fft_interfaces, only: invfft
|
||||
use fft_base, only: dfftb, dfftp
|
||||
USE cp_main_variables, ONLY: irb, eigrb
|
||||
USE mp_global, ONLY: nproc_bgrp, me_bgrp, inter_bgrp_comm, my_bgrp_id, nbgrp
|
||||
USE mp, ONLY: mp_sum
|
||||
|
||||
implicit none
|
||||
! input
|
||||
integer, intent(in) :: irb(3,nat)
|
||||
complex(dp), intent(in):: eigrb(ngb,nat)
|
||||
! output
|
||||
real(dp), intent(out) :: rhoc(dfftp%nnr)
|
||||
! local
|
||||
integer nfft, ig, is, ia, isa
|
||||
complex(dp) ci
|
||||
integer :: ig, is, ia, isa, iia
|
||||
INTEGER :: nabox, iabox( nat )
|
||||
complex(dp), PARAMETER :: ci = (0.d0,1.d0)
|
||||
complex(dp), allocatable :: wrk1(:)
|
||||
complex(dp), allocatable :: qv(:), fg1(:), fg2(:)
|
||||
|
||||
INTEGER :: mytid, ntids
|
||||
#if defined(_OPENMP)
|
||||
INTEGER :: itid, mytid, ntids, omp_get_thread_num, omp_get_num_threads
|
||||
INTEGER :: omp_get_thread_num, omp_get_num_threads
|
||||
EXTERNAL :: omp_get_thread_num, omp_get_num_threads
|
||||
#endif
|
||||
!
|
||||
call start_clock( 'set_cc' )
|
||||
ci=(0.d0,1.d0)
|
||||
|
||||
allocate( wrk1 ( dfftp%nnr ) )
|
||||
wrk1 (:) = (0.d0, 0.d0)
|
||||
|
||||
nabox = 0
|
||||
DO ia = 1, nat
|
||||
IF( .NOT. upf(ityp(ia))%nlcc ) CYCLE
|
||||
IF( ( dfftb%np3( ia ) <= 0 ) .OR. ( dfftb%np2 ( ia ) <= 0 ) .OR. ( my_bgrp_id /= MOD( ia, nbgrp ) ) ) CYCLE
|
||||
nabox = nabox + 1
|
||||
iabox( nabox ) = ia
|
||||
END DO
|
||||
!
|
||||
!$omp parallel default(none) &
|
||||
!$omp shared(nsp, na, ngb, eigrb, dfftb, irb, ci, rhocb, &
|
||||
!$omp nat, upf, wrk1, ityp ) &
|
||||
!$omp private(mytid, ntids, is, ia, nfft, ig, isa, qv, fg1, fg2, itid )
|
||||
!$omp shared( nsp, na, ngb, eigrb, dfftb, irb, rhocb, &
|
||||
!$omp nat, wrk1, ityp, nabox, iabox ) &
|
||||
!$omp private( mytid, ntids, is, ia, iia, ig, qv, fg1 )
|
||||
|
||||
allocate( qv ( dfftb%nnr ) )
|
||||
allocate( fg1 ( ngb ) )
|
||||
allocate( fg2 ( ngb ) )
|
||||
!
|
||||
isa = 0
|
||||
|
||||
!$omp workshare
|
||||
wrk1 = (0.d0, 0.d0)
|
||||
!$omp end workshare
|
||||
!
|
||||
#if defined(_OPENMP)
|
||||
mytid = omp_get_thread_num() ! take the thread ID
|
||||
ntids = omp_get_num_threads() ! take the number of threads
|
||||
itid = 0
|
||||
#else
|
||||
mytid = 0
|
||||
ntids = 1
|
||||
#endif
|
||||
|
||||
do ia = 1, nat
|
||||
!
|
||||
is = ityp(ia)
|
||||
|
||||
if (.not.upf(is)%nlcc) then
|
||||
cycle
|
||||
end if
|
||||
!
|
||||
#if defined(__MPI)
|
||||
nfft=1
|
||||
if ( ( dfftb%np3( ia ) <= 0 ) .OR. ( dfftb%np2 ( ia ) <= 0 ) ) cycle
|
||||
#endif
|
||||
|
||||
#if defined(_OPENMP)
|
||||
IF ( mytid /= itid ) THEN
|
||||
itid = MOD( itid + 1, ntids )
|
||||
CYCLE
|
||||
ELSE
|
||||
itid = MOD( itid + 1, ntids )
|
||||
END IF
|
||||
#endif
|
||||
|
||||
if(nfft.eq.2)then
|
||||
fg1 = eigrb(1:ngb,ia )*rhocb(1:ngb,is)
|
||||
fg2 = eigrb(1:ngb,ia+1)*rhocb(1:ngb,is)
|
||||
CALL fft_oned2box( qv, fg1, fg2 )
|
||||
else
|
||||
do iia = 1, nabox
|
||||
IF( MOD( iia - 1, ntids ) == mytid ) THEN
|
||||
ia = iabox(iia)
|
||||
is = ityp(ia)
|
||||
fg1 = eigrb(1:ngb,ia )*rhocb(1:ngb,is)
|
||||
CALL fft_oned2box( qv, fg1 )
|
||||
endif
|
||||
!
|
||||
call invfft( qv, dfftb, ia )
|
||||
!
|
||||
call box2grid(irb(:,ia),1,qv,wrk1)
|
||||
if (nfft.eq.2) call box2grid(irb(:,ia+1),2,qv,wrk1)
|
||||
!
|
||||
call invfft( qv, dfftb, ia )
|
||||
call box2grid(irb(:,ia),1,qv,wrk1)
|
||||
END IF
|
||||
end do
|
||||
!
|
||||
deallocate( qv )
|
||||
deallocate( fg1 )
|
||||
deallocate( fg2 )
|
||||
deallocate( qv )
|
||||
|
||||
!$omp end parallel
|
||||
|
||||
CALL mp_sum( wrk1, inter_bgrp_comm )
|
||||
|
||||
call dcopy( dfftp%nnr, wrk1, 2, rhoc, 1 )
|
||||
|
||||
deallocate( wrk1 )
|
||||
|
|
|
@ -6,17 +6,208 @@
|
|||
! or http://www.gnu.org/copyleft/gpl.txt .
|
||||
!
|
||||
|
||||
#if defined(__CUDA)
|
||||
#define DGEMMDRV cublasDgemm
|
||||
#define DEVICEATTR ,DEVICE
|
||||
#else
|
||||
#define DGEMMDRV dgemm
|
||||
#define DEVICEATTR
|
||||
#endif
|
||||
|
||||
MODULE ortho_module
|
||||
!
|
||||
#if defined(__CUDA)
|
||||
USE cudafor
|
||||
#endif
|
||||
USE kinds, ONLY: DP
|
||||
IMPLICIT NONE
|
||||
SAVE
|
||||
|
||||
REAL(DP), ALLOCATABLE DEVICEATTR :: s(:,:), sig(:,:), tau(:,:), stmp(:,:)
|
||||
REAL(DP), ALLOCATABLE DEVICEATTR :: wrk(:,:), rhoa(:,:), rhos(:,:), rhod(:)
|
||||
REAL(DP), ALLOCATABLE DEVICEATTR :: xloc(:,:)
|
||||
|
||||
CONTAINS
|
||||
|
||||
SUBROUTINE allocate_local_ortho_memory( nss, nx0 )
|
||||
INTEGER, INTENT(IN) :: nss, nx0
|
||||
INTEGER :: info
|
||||
IF( ALLOCATED( rhos ) ) THEN
|
||||
IF( nx0 == SIZE( rhos, 1 ) ) THEN
|
||||
RETURN
|
||||
ELSE
|
||||
DEALLOCATE( rhos, rhoa, s, sig, tau, rhod )
|
||||
IF(ALLOCATED(wrk)) DEALLOCATE(wrk)
|
||||
IF(ALLOCATED(stmp)) DEALLOCATE(stmp)
|
||||
END IF
|
||||
END IF
|
||||
ALLOCATE( rhos( nx0, nx0 ), STAT = info )
|
||||
IF( info /= 0 ) &
|
||||
CALL errore( ' ortho_gamma ', ' allocating rhos ', ABS( info ) )
|
||||
rhos = 0
|
||||
ALLOCATE( rhoa( nx0, nx0 ), STAT = info ) ! antisymmetric part of rho
|
||||
IF( info /= 0 ) &
|
||||
CALL errore( ' ortho_gamma ', ' allocating rhoa ', ABS( info ) )
|
||||
rhoa = 0
|
||||
ALLOCATE( s( nx0, nx0 ), STAT = info )
|
||||
IF( info /= 0 ) &
|
||||
CALL errore( ' ortho_gamma ', ' allocating s ', ABS( info ) )
|
||||
s = 0
|
||||
ALLOCATE( sig( nx0, nx0 ), STAT = info )
|
||||
IF( info /= 0 ) &
|
||||
CALL errore( ' ortho_gamma ', ' allocating sig ', ABS( info ) )
|
||||
sig = 0
|
||||
ALLOCATE( tau( nx0, nx0 ), STAT = info )
|
||||
IF( info /= 0 ) &
|
||||
CALL errore( ' ortho_gamma ', ' allocating tau ', ABS( info ) )
|
||||
tau = 0
|
||||
ALLOCATE( rhod( nss ), STAT = info )
|
||||
IF( info /= 0 ) &
|
||||
CALL errore( ' ortho_gamma ', ' allocating tau ', ABS( info ) )
|
||||
rhod = 0
|
||||
#if defined(__CUDA)
|
||||
ALLOCATE( wrk( nss, nss ), STAT = info )
|
||||
IF( info /= 0 ) CALL errore( ' ortho_gamma ', ' allocating wrk ', 1 )
|
||||
ALLOCATE( stmp( nss, nss ), STAT = info )
|
||||
IF( info /= 0 ) CALL errore( ' ortho_gamma ', ' allocating stmp ', 1 )
|
||||
#endif
|
||||
END SUBROUTINE
|
||||
|
||||
SUBROUTINE deallocate_local_ortho_memory()
|
||||
IF(ALLOCATED(s)) DEALLOCATE(s)
|
||||
IF(ALLOCATED(sig)) DEALLOCATE(sig)
|
||||
IF(ALLOCATED(tau)) DEALLOCATE(tau)
|
||||
IF(ALLOCATED(stmp)) DEALLOCATE(stmp)
|
||||
IF(ALLOCATED(wrk)) DEALLOCATE(wrk)
|
||||
IF(ALLOCATED(rhoa)) DEALLOCATE(rhoa)
|
||||
IF(ALLOCATED(rhos)) DEALLOCATE(rhos)
|
||||
IF(ALLOCATED(rhod)) DEALLOCATE(rhod)
|
||||
IF(ALLOCATED(xloc)) DEALLOCATE(xloc)
|
||||
END SUBROUTINE deallocate_local_ortho_memory
|
||||
|
||||
SUBROUTINE x0_to_xloc( x0, nx0, ccc_, idesc )
|
||||
REAL(DP), INTENT(IN) :: x0(:,:)
|
||||
REAL(DP), INTENT(IN) :: ccc_
|
||||
INTEGER, INTENT(IN) :: nx0
|
||||
INTEGER, INTENT(IN) :: idesc(:)
|
||||
include 'laxlib.fh'
|
||||
INTEGER :: i, j, info
|
||||
REAL(DP) DEVICEATTR :: ccc
|
||||
IF( ALLOCATED(xloc) ) THEN
|
||||
IF( SIZE(x0,1) /= SIZE(xloc,1) .OR. SIZE(x0,2) /= SIZE(xloc,2) ) THEN
|
||||
DEALLOCATE(xloc)
|
||||
END IF
|
||||
END IF
|
||||
IF( .NOT. ALLOCATED(xloc) ) THEN
|
||||
ALLOCATE( xloc, MOLD=x0, STAT = info )
|
||||
IF( info /= 0 ) &
|
||||
CALL errore( ' x0_to_xloc ', ' allocating xloc ', ABS( info ) )
|
||||
END IF
|
||||
IF( idesc(LAX_DESC_ACTIVE_NODE) < 0 ) THEN
|
||||
RETURN
|
||||
ENDIF
|
||||
xloc = x0
|
||||
ccc = ccc_
|
||||
!$cuf kernel do(2) <<<*,*>>>
|
||||
DO j = 1, SIZE(xloc,2)
|
||||
DO i = 1, SIZE(xloc,1)
|
||||
xloc(i,j) = xloc(i,j) * ccc
|
||||
END DO
|
||||
END DO
|
||||
END SUBROUTINE x0_to_xloc
|
||||
|
||||
SUBROUTINE xloc_to_x0( x0, nx0, ccc_, idesc )
|
||||
REAL(DP), INTENT(OUT) :: x0(:,:)
|
||||
INTEGER, INTENT(IN) :: nx0
|
||||
REAL(DP), INTENT(IN) :: ccc_
|
||||
INTEGER, INTENT(IN) :: idesc(:)
|
||||
include 'laxlib.fh'
|
||||
INTEGER :: i, j
|
||||
REAL(DP) DEVICEATTR :: byccc
|
||||
IF( idesc(LAX_DESC_ACTIVE_NODE) < 0 ) THEN
|
||||
RETURN
|
||||
ENDIF
|
||||
IF( .NOT. ALLOCATED(xloc) ) THEN
|
||||
CALL errore( ' xloc_to_x0 ', ' xloc not allocated ', 1 )
|
||||
END IF
|
||||
byccc = 1.0d0 / ccc_
|
||||
!$cuf kernel do(2) <<<*,*>>>
|
||||
DO j = 1, SIZE(xloc,2)
|
||||
DO i = 1, SIZE(xloc,1)
|
||||
xloc(i,j) = xloc(i,j) * byccc
|
||||
END DO
|
||||
END DO
|
||||
x0 = xloc
|
||||
END SUBROUTINE xloc_to_x0
|
||||
|
||||
SUBROUTINE distribute_matrix( a, b, ir, nr, ic, nc, comm )
|
||||
USE mp, ONLY: mp_bcast
|
||||
REAL(DP) DEVICEATTR :: a(:,:), b(:,:)
|
||||
INTEGER, INTENT(IN) :: ir, nr, ic, nc, comm
|
||||
INTEGER :: i, j, info
|
||||
CALL mp_bcast( a, 0, comm )
|
||||
!$cuf kernel do(2) <<<*,*>>>
|
||||
DO j = 1, nc
|
||||
DO i = 1, nr
|
||||
b( i, j ) = a( i + ir - 1, j + ic - 1 )
|
||||
END DO
|
||||
END DO
|
||||
RETURN
|
||||
END SUBROUTINE
|
||||
|
||||
SUBROUTINE collect_matrix( a, b, ir, nr, ic, nc, comm )
|
||||
USE mp, ONLY: mp_sum
|
||||
REAL(DP) DEVICEATTR :: a(:,:), b(:,:)
|
||||
INTEGER, INTENT(IN) :: ir, nr, ic, nc, comm
|
||||
INTEGER :: i, j, info
|
||||
a = 0.0d0
|
||||
!$cuf kernel do(2) <<<*,*>>>
|
||||
DO j = 1, nc
|
||||
DO i = 1, nr
|
||||
a( ir + i - 1, ic + j - 1 ) = b( i, j )
|
||||
END DO
|
||||
END DO
|
||||
CALL mp_sum( a, comm )
|
||||
RETURN
|
||||
END SUBROUTINE
|
||||
|
||||
SUBROUTINE consistency_check( a, idesc )
|
||||
REAL(DP) DEVICEATTR, INTENT(IN) :: a(:,:)
|
||||
INTEGER, INTENT(IN) :: idesc(:)
|
||||
INTEGER :: i, j
|
||||
include 'laxlib.fh'
|
||||
!
|
||||
! on some machines (IBM RS/6000 for instance) the following test allows
|
||||
! to distinguish between Numbers and Sodium Nitride (NaN, Not a Number).
|
||||
! If a matrix of Not-Numbers is passed to rs, the most likely outcome is
|
||||
! that the program goes on forever doing nothing and writing nothing.
|
||||
!
|
||||
#if ! defined(__CUDA)
|
||||
IF( idesc(LAX_DESC_ACTIVE_NODE) > 0 ) THEN
|
||||
DO j = 1, idesc(LAX_DESC_NC)
|
||||
DO i = 1, idesc(LAX_DESC_NR)
|
||||
IF (a(i,j) /= a(i,j)) &
|
||||
CALL errore(' ortho ',' ortho went bananas ',1)
|
||||
END DO
|
||||
END DO
|
||||
END IF
|
||||
#endif
|
||||
RETURN
|
||||
END SUBROUTINE
|
||||
|
||||
!=----------------------------------------------------------------------------=!
|
||||
SUBROUTINE ortho_gamma_x( iopt, cp, ngwx, phi, becp_dist, qbecp, nkbx, bephi, qbephi, &
|
||||
x0, nx0, idesc, diff, iter, n, nss, istart )
|
||||
SUBROUTINE ortho_gamma( cp, ngwx, phi, becp_dist, qbecp, nkbx, bephi, qbephi, &
|
||||
nx0, idesc, diff, iter, n, nss, istart )
|
||||
!=----------------------------------------------------------------------------=!
|
||||
!
|
||||
USE kinds, ONLY: DP
|
||||
USE orthogonalize_base, ONLY: rhoset, sigset, tauset, ortho_iterate, &
|
||||
ortho_alt_iterate, use_parallel_diag
|
||||
use_parallel_diag
|
||||
USE control_flags, ONLY: diagonalize_on_host
|
||||
USE mp_global, ONLY: nproc_bgrp, me_bgrp, intra_bgrp_comm, my_bgrp_id, inter_bgrp_comm, nbgrp
|
||||
USE mp, ONLY: mp_sum, mp_bcast
|
||||
USE mp_world, ONLY: mpime
|
||||
USE device_memcpy_m
|
||||
|
||||
IMPLICIT NONE
|
||||
|
||||
|
@ -24,23 +215,21 @@
|
|||
|
||||
! ... Arguments
|
||||
|
||||
INTEGER, INTENT(IN) :: iopt
|
||||
INTEGER, INTENT(IN) :: ngwx, nkbx, nx0
|
||||
INTEGER, INTENT(IN) :: n, nss, istart
|
||||
COMPLEX(DP) :: phi( ngwx, n ), cp( ngwx, n )
|
||||
REAL(DP) :: bephi( :, : )
|
||||
REAL(DP) :: becp_dist( :, : )
|
||||
REAL(DP) :: qbephi( :, : ), qbecp( :, : )
|
||||
REAL(DP) :: x0( nx0, nx0 )
|
||||
COMPLEX(DP) DEVICEATTR :: phi( :, : ), cp( :, : )
|
||||
REAL(DP) DEVICEATTR :: bephi( :, : )
|
||||
REAL(DP) DEVICEATTR :: becp_dist( :, : )
|
||||
REAL(DP) DEVICEATTR :: qbephi( :, : ), qbecp( :, : )
|
||||
INTEGER, INTENT(IN) :: idesc(:)
|
||||
INTEGER, INTENT(OUT) :: iter
|
||||
REAL(DP), INTENT(OUT) :: diff
|
||||
|
||||
! ... Locals
|
||||
|
||||
REAL(DP), ALLOCATABLE :: s(:,:), sig(:,:), tau(:,:), rhot(:,:)
|
||||
REAL(DP), ALLOCATABLE :: wrk(:,:), rhoa(:,:), rhos(:,:), rhod(:)
|
||||
INTEGER :: i, j, info, nr, nc, ir, ic
|
||||
INTEGER, SAVE :: icnt = 1
|
||||
REAL(DP), ALLOCATABLE :: rhos_h(:,:), s_h(:,:), rhod_h(:)
|
||||
!
|
||||
! ... Subroutine body
|
||||
!
|
||||
|
@ -66,98 +255,13 @@
|
|||
!
|
||||
END IF
|
||||
!
|
||||
ALLOCATE( rhos( nx0, nx0 ), STAT = info )
|
||||
IF( info /= 0 ) &
|
||||
CALL errore( ' ortho_gamma ', ' allocating rhos ', ABS( info ) )
|
||||
ALLOCATE( rhoa( nx0, nx0 ), STAT = info ) ! antisymmetric part of rho
|
||||
IF( info /= 0 ) &
|
||||
CALL errore( ' ortho_gamma ', ' allocating rhoa ', ABS( info ) )
|
||||
ALLOCATE( s( nx0, nx0 ), STAT = info )
|
||||
IF( info /= 0 ) &
|
||||
CALL errore( ' ortho_gamma ', ' allocating s ', ABS( info ) )
|
||||
ALLOCATE( sig( nx0, nx0 ), STAT = info )
|
||||
IF( info /= 0 ) &
|
||||
CALL errore( ' ortho_gamma ', ' allocating sig ', ABS( info ) )
|
||||
ALLOCATE( tau( nx0, nx0 ), STAT = info )
|
||||
IF( info /= 0 ) &
|
||||
CALL errore( ' ortho_gamma ', ' allocating tau ', ABS( info ) )
|
||||
!
|
||||
ALLOCATE( rhod( nss ), STAT = info )
|
||||
IF( info /= 0 ) &
|
||||
CALL errore( ' ortho_gamma ', ' allocating tau ', ABS( rhod ) )
|
||||
CALL allocate_local_ortho_memory(nss, nx0)
|
||||
!
|
||||
! rho = <s'c0|s|cp>
|
||||
!
|
||||
CALL start_clock( 'rhoset' )
|
||||
!
|
||||
CALL rhoset( cp, ngwx, phi, bephi, nkbx, qbecp, n, nss, istart, rhos, nx0, idesc )
|
||||
!
|
||||
IF( idesc(LAX_DESC_ACTIVE_NODE) > 0 ) THEN
|
||||
!
|
||||
ALLOCATE( rhot( nx0, nx0 ), STAT = info ) ! transpose of rho
|
||||
IF( info /= 0 ) &
|
||||
CALL errore( ' ortho_gamma ', ' allocating rhot ', ABS( rhod ) )
|
||||
!
|
||||
! distributed array rhos contains "rho",
|
||||
! now transpose rhos and store the result in distributed array rhot
|
||||
!
|
||||
CALL sqr_tr_cannon( nss, rhos, nx0, rhot, nx0, idesc )
|
||||
!
|
||||
! Compute the symmetric part of rho
|
||||
!
|
||||
DO j = 1, nc
|
||||
DO i = 1, nr
|
||||
rhos( i, j ) = 0.5d0 * ( rhos( i, j ) + rhot( i, j ) )
|
||||
END DO
|
||||
END DO
|
||||
!
|
||||
! distributed array rhos now contains symmetric part of "rho",
|
||||
!
|
||||
CALL consistency_check( rhos )
|
||||
!
|
||||
! Antisymmetric part of rho, alredy distributed across ortho procs.
|
||||
!
|
||||
DO j = 1, nc
|
||||
DO i = 1, nr
|
||||
rhoa( i, j ) = rhos( i, j ) - rhot( i, j )
|
||||
END DO
|
||||
END DO
|
||||
!
|
||||
DEALLOCATE( rhot )
|
||||
!
|
||||
END IF
|
||||
|
||||
CALL rhoset( cp, ngwx, phi, bephi, nkbx, qbecp, n, nss, istart, rhos, rhoa, nx0, idesc )
|
||||
CALL stop_clock( 'rhoset' )
|
||||
|
||||
CALL start_clock( 'rsg' )
|
||||
!
|
||||
! ... Diagonalize symmetric part of rho (rhos)
|
||||
! ... "s" is the matrix of eigenvectors, "rhod" is the array of eigenvalues
|
||||
!
|
||||
IF( use_parallel_diag ) THEN
|
||||
!
|
||||
CALL laxlib_diagonalize( nss, rhos, rhod, s, idesc )
|
||||
!
|
||||
ELSE
|
||||
!
|
||||
IF( idesc(LAX_DESC_ACTIVE_NODE) > 0 ) THEN
|
||||
!
|
||||
ALLOCATE( wrk( nss, nss ), STAT = info )
|
||||
IF( info /= 0 ) CALL errore( ' ortho_gamma ', ' allocating wrk ', 1 )
|
||||
!
|
||||
CALL collect_matrix( wrk, rhos )
|
||||
!
|
||||
CALL laxlib_diagonalize( nss, wrk, rhod )
|
||||
!
|
||||
CALL distribute_matrix( wrk, s )
|
||||
!
|
||||
DEALLOCATE( wrk )
|
||||
!
|
||||
END IF
|
||||
!
|
||||
END IF
|
||||
!
|
||||
CALL stop_clock( 'rsg' )
|
||||
!
|
||||
! sig = 1-<cp|s|cp>
|
||||
!
|
||||
|
@ -171,6 +275,74 @@
|
|||
CALL tauset( phi, ngwx, bephi, nkbx, qbephi, n, nss, istart, tau, nx0, idesc )
|
||||
CALL stop_clock( 'tauset' )
|
||||
!
|
||||
CALL consistency_check(rhos,idesc)
|
||||
|
||||
CALL start_clock( 'rsg' )
|
||||
!
|
||||
! ... Diagonalize symmetric part of rho (rhos)
|
||||
! ... "s" is the matrix of eigenvectors, "rhod" is the array of eigenvalues
|
||||
!
|
||||
IF( use_parallel_diag ) THEN
|
||||
!
|
||||
#if defined(__CUDA)
|
||||
IF( idesc(LAX_DESC_NR) == idesc(LAX_DESC_NC) .AND. idesc(LAX_DESC_NR) == idesc(LAX_DESC_N) ) THEN
|
||||
IF( idesc(LAX_DESC_ACTIVE_NODE) > 0 ) THEN
|
||||
CALL laxlib_diagonalize( nss, rhos, rhod, s, info )
|
||||
END IF
|
||||
ELSE IF( idesc(LAX_DESC_ACTIVE_NODE) > 0 ) THEN
|
||||
IF( diagonalize_on_host ) THEN ! tune here
|
||||
ALLOCATE( rhos_h, SOURCE = rhos )
|
||||
ALLOCATE( rhod_h, MOLD = rhod )
|
||||
ALLOCATE( s_h, MOLD = s )
|
||||
CALL laxlib_diagonalize( nss, rhos_h, rhod_h, s_h, idesc )
|
||||
CALL dev_memcpy( s, s_h )
|
||||
CALL dev_memcpy( rhod, rhod_h )
|
||||
DEALLOCATE( rhos_h, rhod_h, s_h )
|
||||
ELSE
|
||||
CALL collect_matrix( wrk, rhos, ir, nr, ic, nc, idesc(LAX_DESC_COMM) )
|
||||
IF( idesc(LAX_DESC_IC) == 1 .AND. idesc(LAX_DESC_IR) == 1 ) THEN
|
||||
CALL laxlib_diagonalize( nss, wrk, rhod, stmp, info )
|
||||
END IF
|
||||
CALL distribute_matrix( stmp, s, ir, nr, ic, nc, idesc(LAX_DESC_COMM) )
|
||||
CALL mp_bcast( rhod, 0, idesc(LAX_DESC_COMM) )
|
||||
END IF
|
||||
END IF
|
||||
#else
|
||||
CALL laxlib_diagonalize( nss, rhos, rhod, s, idesc )
|
||||
#endif
|
||||
!
|
||||
ELSE
|
||||
!
|
||||
IF( idesc(LAX_DESC_ACTIVE_NODE) > 0 ) THEN
|
||||
!
|
||||
IF( idesc(LAX_DESC_NR) == idesc(LAX_DESC_NC) .AND. idesc(LAX_DESC_NR) == idesc(LAX_DESC_N) ) THEN
|
||||
!
|
||||
! rhos and s matrixes, are replicated, no need of collect them
|
||||
#if defined(__CUDA)
|
||||
CALL laxlib_diagonalize( nss, rhos, rhod, s, info )
|
||||
#else
|
||||
s = rhos
|
||||
CALL laxlib_diagonalize( nss, s, rhod )
|
||||
#endif
|
||||
ELSE
|
||||
!
|
||||
CALL collect_matrix( wrk, rhos, ir, nr, ic, nc, idesc(LAX_DESC_COMM) )
|
||||
!
|
||||
#if defined(__CUDA)
|
||||
CALL laxlib_diagonalize( nss, wrk, rhod, s, info )
|
||||
#else
|
||||
CALL laxlib_diagonalize( nss, wrk, rhod )
|
||||
#endif
|
||||
!
|
||||
CALL distribute_matrix( wrk, s, ir, nr, ic, nc, idesc(LAX_DESC_COMM) )
|
||||
!
|
||||
END IF
|
||||
!
|
||||
END IF
|
||||
!
|
||||
END IF
|
||||
!
|
||||
CALL stop_clock( 'rsg' )
|
||||
CALL start_clock( 'ortho_iter' )
|
||||
!
|
||||
IF( my_bgrp_id == 0 ) THEN
|
||||
|
@ -180,15 +352,7 @@
|
|||
! group are enough. Moreover replicating the computation across groups could leads
|
||||
! to small numerical differences and weird numerical effects.
|
||||
!
|
||||
IF( iopt == 0 ) THEN
|
||||
!
|
||||
CALL ortho_iterate( iter, diff, s, nx0, rhod, x0, nx0, sig, rhoa, rhos, tau, nss, idesc)
|
||||
!
|
||||
ELSE
|
||||
!
|
||||
CALL ortho_alt_iterate( iter, diff, s, nx0, rhod, x0, nx0, sig, rhoa, tau, nss, idesc)
|
||||
!
|
||||
END IF
|
||||
CALL ortho_iterate( iter, diff, s, nx0, rhod, xloc, nx0, sig, rhoa, rhos, tau, nss, idesc)
|
||||
!
|
||||
END IF
|
||||
!
|
||||
|
@ -197,74 +361,137 @@
|
|||
! All groups must have the same lambda matrix, in order to avoid weird
|
||||
! numerical side effects.
|
||||
!
|
||||
CALL mp_bcast( x0, 0, inter_bgrp_comm )
|
||||
CALL mp_bcast( xloc, 0, inter_bgrp_comm )
|
||||
CALL mp_bcast( iter, 0, inter_bgrp_comm )
|
||||
CALL mp_bcast( diff, 0, inter_bgrp_comm )
|
||||
!
|
||||
END IF
|
||||
!
|
||||
CALL consistency_check( xloc,idesc )
|
||||
|
||||
CALL stop_clock( 'ortho_iter' )
|
||||
!
|
||||
DEALLOCATE( rhoa, rhos, rhod, s, sig, tau )
|
||||
!
|
||||
IF( idesc(LAX_DESC_ACTIVE_NODE) > 0 ) CALL consistency_check( x0 )
|
||||
|
||||
RETURN
|
||||
END SUBROUTINE ortho_gamma
|
||||
|
||||
CONTAINS
|
||||
!
|
||||
|
||||
SUBROUTINE distribute_matrix( a, b )
|
||||
REAL(DP) :: a(:,:), b(:,:)
|
||||
INTEGER :: i, j
|
||||
IF( idesc(LAX_DESC_ACTIVE_NODE) > 0 ) THEN
|
||||
DO j = 1, nc
|
||||
DO i = 1, nr
|
||||
b( i, j ) = a( i + ir - 1, j + ic - 1 )
|
||||
END DO
|
||||
END DO
|
||||
END IF
|
||||
RETURN
|
||||
END SUBROUTINE
|
||||
SUBROUTINE compute_qs_times_betas( bephi, bec_row, qbephi, qbecp, idesc )
|
||||
USE uspp, ONLY: nkb, qq_nt, indv_ijkb0, nkbus
|
||||
USE uspp_gpum, ONLY: qq_nt_d, using_qq_nt
|
||||
USE uspp_param, ONLY: nh, upf
|
||||
USE electrons_base, ONLY: nspin, nbsp_bgrp, iupdwn_bgrp, nupdwn_bgrp, nbsp, nupdwn, iupdwn
|
||||
USE ions_base, ONLY: na, nat, nsp, ityp
|
||||
#if defined (__CUDA)
|
||||
USE cublas
|
||||
#endif
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
include 'laxlib.fh'
|
||||
!
|
||||
REAL(DP), INTENT(OUT) DEVICEATTR :: qbephi(:,:,:), qbecp(:,:,:)
|
||||
REAL(DP), INTENT(IN) DEVICEATTR :: bephi(:,:)
|
||||
REAL(DP), INTENT(IN) DEVICEATTR :: bec_row(:,:)
|
||||
INTEGER, INTENT(IN) :: idesc(:,:)
|
||||
|
||||
SUBROUTINE collect_matrix( a, b )
|
||||
REAL(DP) :: a(:,:), b(:,:)
|
||||
INTEGER :: i, j
|
||||
a = 0.0d0
|
||||
IF( idesc(LAX_DESC_ACTIVE_NODE) > 0 ) THEN
|
||||
DO j = 1, nc
|
||||
DO i = 1, nr
|
||||
a( ir + i - 1, ic + j - 1 ) = b( i, j )
|
||||
END DO
|
||||
END DO
|
||||
END IF
|
||||
CALL mp_sum( a, idesc(LAX_DESC_COMM) )
|
||||
RETURN
|
||||
END SUBROUTINE
|
||||
REAL(DP), ALLOCATABLE DEVICEATTR :: bec_col(:,:), bephi_col(:,:)
|
||||
|
||||
SUBROUTINE consistency_check( a )
|
||||
REAL(DP) :: a(:,:)
|
||||
INTEGER :: i, j
|
||||
INTEGER :: nkbx, info, nrcx, iss, is, jv, iv, ia
|
||||
INTEGER :: i, j, inl, jnl, indv, nc, nhs
|
||||
|
||||
nkbx = nkb
|
||||
nrcx = idesc( LAX_DESC_NRCX, 1 )
|
||||
IF( nspin > 1 ) nrcx = MAX( nrcx, idesc( LAX_DESC_NRCX, 2 ) )
|
||||
|
||||
qbephi = 0.d0
|
||||
qbecp = 0.d0
|
||||
|
||||
CALL using_qq_nt(0)
|
||||
!
|
||||
IF( nkbus > 0 ) THEN
|
||||
!
|
||||
! on some machines (IBM RS/6000 for instance) the following test allows
|
||||
! to distinguish between Numbers and Sodium Nitride (NaN, Not a Number).
|
||||
! If a matrix of Not-Numbers is passed to rs, the most likely outcome is
|
||||
! that the program goes on forever doing nothing and writing nothing.
|
||||
ALLOCATE( bec_col ( nkbx, nrcx*nspin ), STAT = info )
|
||||
IF( info /= 0 ) &
|
||||
CALL errore( ' compute_qs_times_betas ', ' allocating bec_col ', ABS( info ) )
|
||||
ALLOCATE( bephi_col ( nkbx, nrcx*nspin ), STAT = info )
|
||||
IF( info /= 0 ) &
|
||||
CALL errore( ' compute_qs_times_betas ', ' allocating bephi_col ', ABS( info ) )
|
||||
!
|
||||
DO j = 1, nc
|
||||
DO i = 1, nr
|
||||
IF (a(i,j) /= a(i,j)) CALL errore(' ortho ',' ortho went bananas ',1)
|
||||
END DO
|
||||
CALL redist_row2col( nupdwn(1), bephi, bephi_col, nkbx, nrcx, idesc(:,1) )
|
||||
CALL redist_row2col( nupdwn(1), bec_row, bec_col, nkbx, nrcx, idesc(:,1) )
|
||||
IF( nspin == 2 ) THEN
|
||||
CALL redist_row2col( nupdwn(2), bephi(:,nrcx+1:), bephi_col(:,nrcx+1:), nkbx, nrcx, idesc(:,2) )
|
||||
CALL redist_row2col( nupdwn(2), bec_row(:,nrcx+1:), bec_col(:,nrcx+1:), nkbx, nrcx, idesc(:,2) )
|
||||
END IF
|
||||
!
|
||||
DO iss = 1, nspin
|
||||
IF( idesc( LAX_DESC_ACTIVE_NODE, iss ) > 0 ) THEN
|
||||
nc = idesc( LAX_DESC_NC, iss )
|
||||
DO ia = 1, nat
|
||||
is = ityp(ia)
|
||||
IF( upf(is)%tvanp ) THEN
|
||||
indv = indv_ijkb0(ia)
|
||||
nhs = nh(is)
|
||||
#if defined (__CUDA)
|
||||
CALL DGEMMDRV('N', 'N', nhs, nc, nhs, 1.0d0, qq_nt_d(1,1,is), SIZE(qq_nt_d,1), &
|
||||
bephi_col(indv+1,(iss-1)*nrcx+1), SIZE(bephi_col,1), 0.0d0, qbephi(indv+1,1,iss), SIZE(qbephi,1))
|
||||
CALL DGEMMDRV('N', 'N', nhs, nc, nhs, 1.0d0, qq_nt_d(1,1,is), SIZE(qq_nt_d,1), &
|
||||
bec_col(indv+1,(iss-1)*nrcx+1), SIZE(bec_col,1), 0.0d0, qbecp(indv+1,1,iss), SIZE(qbecp,1))
|
||||
#else
|
||||
CALL DGEMMDRV('N', 'N', nhs, nc, nhs, 1.0d0, qq_nt(1,1,is), SIZE(qq_nt,1), &
|
||||
bephi_col(indv+1,(iss-1)*nrcx+1), SIZE(bephi_col,1), 0.0d0, qbephi(indv+1,1,iss), SIZE(qbephi,1))
|
||||
CALL DGEMMDRV('N', 'N', nhs, nc, nhs, 1.0d0, qq_nt(1,1,is), SIZE(qq_nt,1), &
|
||||
bec_col(indv+1,(iss-1)*nrcx+1), SIZE(bec_col,1), 0.0d0, qbecp(indv+1,1,iss), SIZE(qbecp,1))
|
||||
#endif
|
||||
!!$cuf kernel do (2)
|
||||
! DO iv=1,nhs
|
||||
! DO i = 1, nc
|
||||
! DO jv = 1, nhs
|
||||
! qbephi(indv+iv,i,iss) = qbephi(indv+iv,i,iss) + &
|
||||
! qq_nt_d(iv,jv,is) * bephi_col(indv+jv,i+(iss-1)*nrcx)
|
||||
! qbecp(indv+iv,i,iss) = qbecp(indv+iv,i,iss) + &
|
||||
! qq_nt_d(iv,jv,is) * bec_col(indv+jv,i+(iss-1)*nrcx)
|
||||
! END DO
|
||||
! END DO
|
||||
! END DO
|
||||
END IF
|
||||
END DO
|
||||
ENDIF
|
||||
END DO
|
||||
RETURN
|
||||
END SUBROUTINE
|
||||
|
||||
END SUBROUTINE ortho_gamma_x
|
||||
DEALLOCATE( bec_col )
|
||||
DEALLOCATE( bephi_col )
|
||||
END IF
|
||||
END SUBROUTINE compute_qs_times_betas
|
||||
|
||||
SUBROUTINE keep_only_us(wrk)
|
||||
USE uspp, ONLY: indv_ijkb0
|
||||
USE uspp_param, ONLY: nh, upf
|
||||
USE ions_base, ONLY: na, nat, nsp, ityp
|
||||
#if defined (__CUDA)
|
||||
USE cublas
|
||||
#endif
|
||||
IMPLICIT NONE
|
||||
COMPLEX(DP) DEVICEATTR, INTENT(OUT) :: wrk(:,:)
|
||||
INTEGER :: ia, is, inl, nhs, iv
|
||||
DO ia = 1, nat
|
||||
is = ityp(ia)
|
||||
inl = indv_ijkb0(ia)
|
||||
nhs = nh(is)
|
||||
IF( .NOT. upf(is)%tvanp ) THEN
|
||||
!$cuf kernel do (1)
|
||||
DO iv = 1, nhs
|
||||
wrk(:,iv+inl) = 0.0d0
|
||||
END DO
|
||||
END IF
|
||||
END DO
|
||||
END SUBROUTINE
|
||||
|
||||
END MODULE ortho_module
|
||||
|
||||
|
||||
!=----------------------------------------------------------------------------=!
|
||||
SUBROUTINE ortho_x( eigr, cp_bgrp, phi_bgrp, x0, idesc, diff, iter, ccc, bephi, becp_bgrp )
|
||||
SUBROUTINE ortho_x( betae, cp_bgrp, phi_bgrp, x0, idesc, diff, iter, ccc, bephi, becp_bgrp )
|
||||
!=----------------------------------------------------------------------------=!
|
||||
!
|
||||
! input = cp (non-orthonormal), beta
|
||||
|
@ -279,72 +506,63 @@
|
|||
! for vanderbilt pseudo pot - kl & ap
|
||||
!
|
||||
USE kinds, ONLY: DP
|
||||
USE ions_base, ONLY: na, nat, nsp, ityp
|
||||
USE uspp, ONLY: nkb, qq_nt, indv_ijkb0, nkbus
|
||||
USE uspp_param, ONLY: nh, upf
|
||||
USE electrons_base, ONLY: f, nbsp_bgrp, iupdwn_bgrp, nupdwn_bgrp, i2gupdwn_bgrp, nbsp, nspin, nupdwn, iupdwn
|
||||
USE uspp, ONLY: nkb, nkbus
|
||||
USE ions_base, ONLY: nsp
|
||||
USE electrons_base, ONLY: f, nbsp_bgrp, iupdwn_bgrp, nupdwn_bgrp, nbsp, nspin, nupdwn, iupdwn
|
||||
USE gvecw, ONLY: ngw
|
||||
USE control_flags, ONLY: iprint, iverbosity, ortho_max
|
||||
USE control_flags, ONLY: force_pairing
|
||||
USE io_global, ONLY: stdout, ionode
|
||||
USE cp_interfaces, ONLY: ortho_gamma, c_bgrp_expand, c_bgrp_pack, nlsm1, collect_bec, beta_eigr, nlsm1us
|
||||
USE mp_global, ONLY: nproc_bgrp, me_bgrp, intra_bgrp_comm, inter_bgrp_comm ! DEBUG
|
||||
USE cp_interfaces, ONLY: c_bgrp_expand, c_bgrp_pack, nlsm1, collect_bec, nlsm1us
|
||||
USE mp_global, ONLY: nproc_bgrp, me_bgrp, intra_bgrp_comm, inter_bgrp_comm! DEBUG
|
||||
USE mp_world, ONLY: mpime
|
||||
USE orthogonalize_base, ONLY: bec_bgrp2ortho
|
||||
USE mp, ONLY : mp_sum
|
||||
USE ortho_module
|
||||
USE device_memcpy_m
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
include 'laxlib.fh'
|
||||
!
|
||||
INTEGER, INTENT(IN) :: idesc(:,:)
|
||||
COMPLEX(DP) :: eigr(:,:)
|
||||
COMPLEX(DP) :: cp_bgrp(:,:), phi_bgrp(:,:)
|
||||
COMPLEX(DP) DEVICEATTR :: betae(:,:)
|
||||
COMPLEX(DP) DEVICEATTR :: cp_bgrp(:,:), phi_bgrp(:,:)
|
||||
REAL(DP) :: x0(:,:,:), diff, ccc
|
||||
INTEGER :: iter
|
||||
REAL(DP) :: bephi(:,:)
|
||||
REAL(DP) :: becp_bgrp(:,:)
|
||||
REAL(DP) DEVICEATTR :: bephi(:,:)
|
||||
REAL(DP) DEVICEATTR :: becp_bgrp(:,:)
|
||||
!
|
||||
REAL(DP), ALLOCATABLE :: xloc(:,:), becp_dist(:,:)
|
||||
REAL(DP), ALLOCATABLE :: qbephi(:,:,:), qbecp(:,:,:), bec_col(:,:)
|
||||
COMPLEX(DP), ALLOCATABLE :: beigr(:,:)
|
||||
REAL(DP), ALLOCATABLE DEVICEATTR :: bec_row(:,:)
|
||||
REAL(DP), ALLOCATABLE DEVICEATTR :: qbephi(:,:,:), qbecp(:,:,:)
|
||||
COMPLEX(DP), ALLOCATABLE DEVICEATTR :: wrk2(:,:)
|
||||
|
||||
INTEGER :: nkbx
|
||||
INTEGER :: info, i, j, iss, iv, jv, ia, is, inl, jnl
|
||||
INTEGER :: n1, n2, m1, m2
|
||||
INTEGER :: nspin_sub, nx0, ngwx, nrcx
|
||||
REAL(DP) :: qqf, dum
|
||||
INTEGER :: nkbx, info, iss, nspin_sub, nx0, ngwx, nrcx
|
||||
!
|
||||
CALL start_clock( 'ortho' )
|
||||
!
|
||||
nkbx = nkb
|
||||
ngwx = SIZE( cp_bgrp, 1 )
|
||||
!
|
||||
nx0 = SIZE( x0, 1 )
|
||||
nx0 = SIZE( x0, 1 )
|
||||
nrcx = MAXVAL( idesc( LAX_DESC_NRCX, : ) )
|
||||
!
|
||||
! calculation of becp and bephi
|
||||
!
|
||||
CALL start_clock( 'ortho' )
|
||||
|
||||
nrcx = MAXVAL( idesc( LAX_DESC_NRCX, : ) )
|
||||
|
||||
ALLOCATE( becp_dist( nkbx, nrcx*nspin ), STAT = info )
|
||||
ALLOCATE( bec_row( nkbx, nrcx*nspin ), STAT = info )
|
||||
IF( info /= 0 ) &
|
||||
CALL errore( ' ortho ', ' allocating becp_dist ', ABS( info ) )
|
||||
CALL errore( ' ortho ', ' allocating bec_row ', ABS( info ) )
|
||||
|
||||
IF( nkbus > 0 ) THEN
|
||||
!
|
||||
ALLOCATE( beigr(ngw,nkb))
|
||||
becp_bgrp = 0.0d0
|
||||
!
|
||||
CALL beta_eigr ( beigr, 1, nsp, eigr, 2 )
|
||||
CALL nlsm1us ( nbsp_bgrp, beigr, phi_bgrp, becp_bgrp )
|
||||
!CALL nlsm1 ( nbsp_bgrp, 1, nsp, eigr, phi_bgrp, becp_bgrp, 2 )
|
||||
ALLOCATE( wrk2, MOLD = betae )
|
||||
CALL dev_memcpy( wrk2, betae )
|
||||
CALL keep_only_us( wrk2 )
|
||||
CALL nlsm1us ( nbsp_bgrp, wrk2, phi_bgrp, becp_bgrp )
|
||||
CALL bec_bgrp2ortho( becp_bgrp, bephi, nrcx, idesc )
|
||||
!
|
||||
becp_bgrp = 0.0d0
|
||||
!
|
||||
CALL nlsm1us ( nbsp_bgrp, beigr, cp_bgrp, becp_bgrp )
|
||||
!CALL nlsm1 ( nbsp_bgrp, 1, nsp, eigr, cp_bgrp, becp_bgrp, 2 )
|
||||
CALL bec_bgrp2ortho( becp_bgrp, becp_dist, nrcx, idesc )
|
||||
DEALLOCATE( beigr )
|
||||
CALL nlsm1us ( nbsp_bgrp, wrk2, cp_bgrp, becp_bgrp )
|
||||
CALL bec_bgrp2ortho( becp_bgrp, bec_row, nrcx, idesc )
|
||||
DEALLOCATE( wrk2 )
|
||||
!
|
||||
END IF
|
||||
!
|
||||
|
@ -353,95 +571,27 @@
|
|||
ALLOCATE( qbephi( nkbx, nx0, nspin ), STAT = info )
|
||||
IF( info /= 0 ) &
|
||||
CALL errore( ' ortho ', ' allocating qbephi ', ABS( info ) )
|
||||
!
|
||||
IF( nkbus > 0 ) THEN
|
||||
ALLOCATE( bec_col ( nkbx, nrcx*nspin ), STAT = info )
|
||||
IF( info /= 0 ) &
|
||||
CALL errore( ' ortho ', ' allocating bec_col ', ABS( info ) )
|
||||
CALL redist_row2col( nupdwn(1), bephi, bec_col, nkbx, nrcx, idesc(:,1) )
|
||||
IF( nspin == 2 ) THEN
|
||||
CALL redist_row2col( nupdwn(2), bephi(:,nrcx+1:), bec_col(:,nrcx+1:), nkbx, nrcx, idesc(:,2) )
|
||||
END IF
|
||||
END IF
|
||||
!
|
||||
qbephi = 0.d0
|
||||
!
|
||||
DO iss = 1, nspin
|
||||
IF( idesc( LAX_DESC_ACTIVE_NODE, iss ) > 0 ) THEN
|
||||
DO ia = 1, nat
|
||||
is = ityp(ia)
|
||||
IF( upf(is)%tvanp ) THEN
|
||||
DO iv=1,nh(is)
|
||||
inl = indv_ijkb0(ia) + iv
|
||||
DO jv=1,nh(is)
|
||||
jnl = indv_ijkb0(ia) + jv
|
||||
qqf = qq_nt(iv,jv,is)
|
||||
IF( ABS( qqf ) > 1.D-5 ) THEN
|
||||
DO i = 1, idesc( LAX_DESC_NC, iss )
|
||||
qbephi(inl,i,iss) = qbephi(inl,i,iss) + qqf * bec_col(jnl,i+(iss-1)*nrcx)
|
||||
END DO
|
||||
END IF
|
||||
END DO
|
||||
END DO
|
||||
END IF
|
||||
END DO
|
||||
ENDIF
|
||||
END DO
|
||||
!
|
||||
ALLOCATE( qbecp ( nkbx, nx0, nspin ), STAT = info )
|
||||
IF( info /= 0 ) &
|
||||
CALL errore( ' ortho ', ' allocating qbecp ', ABS( info ) )
|
||||
|
||||
qbecp = 0.d0
|
||||
|
||||
IF( nkbus > 0 ) THEN
|
||||
CALL redist_row2col( nupdwn(1), becp_dist, bec_col, nkbx, nrcx, idesc(:,1) )
|
||||
IF( nspin == 2 ) THEN
|
||||
CALL redist_row2col( nupdwn(2), becp_dist(:,nrcx+1:), bec_col(:,nrcx+1:), nkbx, nrcx, idesc(:,2) )
|
||||
END IF
|
||||
DO iss = 1, nspin
|
||||
IF( idesc( LAX_DESC_ACTIVE_NODE, iss ) > 0 ) THEN
|
||||
DO ia = 1, nat
|
||||
is = ityp(ia)
|
||||
IF( upf(is)%tvanp ) THEN
|
||||
DO iv=1,nh(is)
|
||||
inl = indv_ijkb0(ia) + iv
|
||||
DO jv=1,nh(is)
|
||||
jnl = indv_ijkb0(ia) + jv
|
||||
qqf = qq_nt(iv,jv,is)
|
||||
IF( ABS( qqf ) > 1.D-5 ) THEN
|
||||
DO i = 1, idesc( LAX_DESC_NC, iss )
|
||||
qbecp(inl,i,iss) = qbecp(inl,i,iss) + qqf * bec_col(jnl,i+(iss-1)*nrcx)
|
||||
END DO
|
||||
ENDIF
|
||||
END DO
|
||||
END DO
|
||||
END IF
|
||||
END DO
|
||||
END IF
|
||||
END DO
|
||||
DEALLOCATE( bec_col )
|
||||
END IF
|
||||
!
|
||||
CALL compute_qs_times_betas( bephi, bec_row, qbephi, qbecp, idesc )
|
||||
!
|
||||
! Expand cp and phi to contain all electronic band
|
||||
!
|
||||
CALL c_bgrp_expand( cp_bgrp )
|
||||
CALL c_bgrp_expand( phi_bgrp )
|
||||
!
|
||||
ALLOCATE( xloc( nx0, nx0 ), STAT = info )
|
||||
IF( info /= 0 ) &
|
||||
CALL errore( ' ortho ', ' allocating xloc ', ABS( info ) )
|
||||
!
|
||||
nspin_sub = nspin
|
||||
if( force_pairing ) nspin_sub = 1
|
||||
!
|
||||
DO iss = 1, nspin_sub
|
||||
|
||||
IF( idesc( LAX_DESC_ACTIVE_NODE, iss ) > 0 ) xloc = x0(:,:,iss) * ccc
|
||||
CALL x0_to_xloc( x0(:,:,iss), nx0, ccc, idesc(:,iss) )
|
||||
|
||||
CALL ortho_gamma( 0, cp_bgrp, ngwx, phi_bgrp, becp_dist(:,(iss-1)*nrcx+1:iss*nrcx), qbecp(:,:,iss), nkbx, &
|
||||
CALL ortho_gamma( cp_bgrp, ngwx, phi_bgrp, bec_row(:,(iss-1)*nrcx+1:iss*nrcx), qbecp(:,:,iss), nkbx, &
|
||||
bephi(:,((iss-1)*nrcx+1):iss*nrcx), &
|
||||
qbephi(:,:,iss), xloc, nx0, idesc(:,iss), diff, iter, nbsp, nupdwn(iss), iupdwn(iss) )
|
||||
qbephi(:,:,iss), nx0, idesc(:,iss), diff, iter, nbsp, nupdwn(iss), iupdwn(iss) )
|
||||
|
||||
IF( iter > ortho_max ) THEN
|
||||
WRITE( stdout, 100 ) diff, iter
|
||||
|
@ -452,16 +602,17 @@
|
|||
WRITE( stdout, 100 ) diff, iter
|
||||
ENDIF
|
||||
!
|
||||
IF( idesc( LAX_DESC_ACTIVE_NODE, iss ) > 0 ) x0( :, :, iss ) = xloc / ccc
|
||||
CALL xloc_to_x0( x0(:,:,iss), nx0, ccc, idesc(:,iss) )
|
||||
!
|
||||
END DO
|
||||
|
||||
IF( force_pairing ) cp_bgrp(:, iupdwn(2):iupdwn(2)+nupdwn(2)-1 ) = cp_bgrp(:,1:nupdwn(2))
|
||||
IF( force_pairing ) THEN
|
||||
CALL dev_memcpy(cp_bgrp(:,iupdwn(2):), cp_bgrp(:,1:), [1, ngw], 1 , [1, nupdwn(2)], 1)
|
||||
END IF
|
||||
!
|
||||
DEALLOCATE( xloc )
|
||||
DEALLOCATE( qbecp )
|
||||
DEALLOCATE( qbephi )
|
||||
DEALLOCATE( becp_dist )
|
||||
DEALLOCATE( bec_row )
|
||||
!
|
||||
! pack cp so that it contains only the bands in the band subgroup
|
||||
!
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -351,7 +351,11 @@
|
|||
ALLOCATE( ftmp( 3, SIZE( fion, 2 ) ) )
|
||||
|
||||
ftmp = 0.0d0
|
||||
|
||||
!$omp parallel do reduction(+:ftmp) default(none) &
|
||||
!$omp shared( gstart, dffts, sfac, rhops, screen_coul, rhoeg, nsp, gg, tpiba2, mill, g, &
|
||||
!$omp nat, ityp, vps, ei1, ei2, ei3, tscreen ) &
|
||||
!$omp private(ig, rp, is, rhet, rhog, fpibg, ig1, ig2, ig3, gxc, gyc, gzc, ia, cnvg, cvn, tx, &
|
||||
!$omp ty, tz, teigr )
|
||||
DO ig = gstart, dffts%ngm
|
||||
|
||||
RP = (0.D0,0.D0)
|
||||
|
@ -389,6 +393,8 @@
|
|||
|
||||
END DO
|
||||
!
|
||||
!$omp end parallel do
|
||||
!
|
||||
fion = fion + DBLE(ftmp) * 2.D0 * omega * tpiba
|
||||
|
||||
DEALLOCATE( ftmp )
|
||||
|
@ -444,6 +450,8 @@
|
|||
INTEGER, DIMENSION(6), PARAMETER :: ALPHA = (/ 1,2,3,2,3,3 /)
|
||||
INTEGER, DIMENSION(6), PARAMETER :: BETA = (/ 1,1,1,2,2,3 /)
|
||||
|
||||
INTEGER :: omp_get_num_threads
|
||||
|
||||
! ... SUBROUTINE BODY
|
||||
|
||||
ALLOCATE( rc( nsp, nsp ) )
|
||||
|
@ -474,9 +482,13 @@
|
|||
IA_S = gind_block( 1, nat, nproc_bgrp, me_bgrp )
|
||||
IA_E = IA_S + NA_LOC - 1
|
||||
|
||||
!$omp parallel do reduction(+:esr,desr) num_threads(min(max(1,na_loc),omp_get_num_threads())) default(none) &
|
||||
!$omp private(ia,ib,k,j,zv2_kj,rckj_m1,fact_pre,xlm,ylm,zlm,tzero,xlm0,ylm0,zlm0,ix,iy,iz,sxlm,tshift, &
|
||||
!$omp rxlm,erre2,rlm,arg,esrtzero,addesr,addpre,repand,i,fxx ) &
|
||||
!$omp shared(ia_s,ia_e,nat,ityp,zv2,rc,taus,iesr,hmat,fionloc,tstress,na_loc)
|
||||
DO ia = ia_s, ia_e
|
||||
k = ityp(ia)
|
||||
DO ib = ia, nat
|
||||
k = ityp(ia)
|
||||
j = ityp(ib)
|
||||
|
||||
zv2_kj = zv2(k,j)
|
||||
|
@ -536,15 +548,14 @@
|
|||
END DO ! IX
|
||||
END DO
|
||||
END DO
|
||||
!$omp end parallel do
|
||||
|
||||
!
|
||||
! each processor add its own contribution to the array FION
|
||||
!
|
||||
DO ia = 1, nat
|
||||
FION(1,ia) = FION(1,ia)+FIONLOC(1,ia)
|
||||
FION(2,ia) = FION(2,ia)+FIONLOC(2,ia)
|
||||
FION(3,ia) = FION(3,ia)+FIONLOC(3,ia)
|
||||
END DO
|
||||
! FION = FION+FIONLOC
|
||||
!
|
||||
CALL daxpy( 3*nat, 1.0d0, fionloc, 1, fion, 1 )
|
||||
|
||||
CALL mp_sum(esr, intra_bgrp_comm)
|
||||
|
||||
|
|
|
@ -615,6 +615,7 @@
|
|||
USE atom, ONLY: rgrid
|
||||
USE uspp, ONLY: indv
|
||||
use uspp, only: qq_nt, beta
|
||||
use uspp_gpum, ONLY: using_qq_nt, using_qq_nt_d, qq_nt_d
|
||||
USE betax, only: refg, qradx, mmx, dqradx
|
||||
use smallbox_gvec, only: ngb
|
||||
use control_flags, only: iprint, iverbosity
|
||||
|
@ -624,6 +625,9 @@
|
|||
use smallbox_gvec, only: gb, gxb
|
||||
use small_box, only: omegab, tpibab
|
||||
USE cp_interfaces, ONLY: fill_qrl
|
||||
#if defined (__CUDA)
|
||||
USE cudafor
|
||||
#endif
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
|
@ -796,6 +800,22 @@
|
|||
end do
|
||||
|
||||
end do
|
||||
|
||||
CALL using_qq_nt(2)
|
||||
#if defined (__CUDA)
|
||||
CALL using_qq_nt_d(0)
|
||||
!$cuf kernel do (3)
|
||||
DO is = 1, SIZE(qq_nt_d,3)
|
||||
DO jv=1,SIZE(qq_nt_d,2)
|
||||
DO iv=1,SIZE(qq_nt_d,1)
|
||||
IF( ABS( qq_nt_d(iv,jv,is) ) <= 1.D-5 ) THEN
|
||||
qq_nt_d(iv,jv,is) = 0.0d0
|
||||
END IF
|
||||
END DO
|
||||
END DO
|
||||
END DO
|
||||
#endif
|
||||
!
|
||||
!
|
||||
if (tpre) then
|
||||
! ---------------------------------------------------------------
|
||||
|
@ -1039,6 +1059,7 @@
|
|||
use gvecw, only: ngw
|
||||
use cell_base, only: ainv
|
||||
use uspp, only: qq_nt, nhtolm, beta
|
||||
use uspp_gpum, ONLY: using_qq_nt, using_qq_nt_d, qq_nt_d
|
||||
use constants, only: pi, fpi
|
||||
use ions_base, only: nsp
|
||||
use uspp_param, only: upf, lmaxq, lmaxkb, nbetam, nh
|
||||
|
@ -1046,6 +1067,9 @@
|
|||
use smallbox_gvec, only: gb, gxb, ngb
|
||||
use small_box, only: omegab, tpibab
|
||||
USE betax, ONLY: qradx, dqradx, refg, mmx
|
||||
#if defined (__CUDA)
|
||||
USE cudafor
|
||||
#endif
|
||||
!
|
||||
implicit none
|
||||
|
||||
|
@ -1131,6 +1155,21 @@
|
|||
end do
|
||||
|
||||
end do
|
||||
|
||||
CALL using_qq_nt(2)
|
||||
#if defined (__CUDA)
|
||||
CALL using_qq_nt_d(0)
|
||||
!$cuf kernel do (3)
|
||||
DO is = 1, SIZE(qq_nt_d,3)
|
||||
DO jv=1,SIZE(qq_nt_d,2)
|
||||
DO iv=1,SIZE(qq_nt_d,1)
|
||||
IF( ABS( qq_nt_d(iv,jv,is) ) <= 1.D-5 ) THEN
|
||||
qq_nt_d(iv,jv,is) = 0.0d0
|
||||
END IF
|
||||
END DO
|
||||
END DO
|
||||
END DO
|
||||
#endif
|
||||
!
|
||||
if (tpre) then
|
||||
! ---------------------------------------------------------------
|
||||
|
|
|
@ -28,17 +28,19 @@ SUBROUTINE from_restart( )
|
|||
USE gvect, ONLY : mill, eigts1, eigts2, eigts3
|
||||
USE printout_base, ONLY : printout_pos
|
||||
USE gvecw, ONLY : ngw
|
||||
USE cp_interfaces, ONLY : phfacs, strucf, prefor, calbec_bgrp, caldbec_bgrp
|
||||
USE cp_interfaces, ONLY : phfacs, strucf, prefor, calbec, caldbec_bgrp
|
||||
USE energies, ONLY : eself, dft_energy_type
|
||||
USE wave_base, ONLY : rande_base
|
||||
USE efield_module, ONLY : efield_berry_setup, tefield, &
|
||||
efield_berry_setup2, tefield2
|
||||
USE uspp, ONLY : okvan, vkb, nkb, nlcc_any
|
||||
USE uspp_gpum, ONLY : vkb_d
|
||||
USE cp_main_variables, ONLY : ht0, htm, lambdap, lambda, lambdam, eigr, &
|
||||
sfac, taub, irb, eigrb, edft, bec_bgrp, dbec, idesc
|
||||
sfac, taub, irb, eigrb, edft, bec_bgrp, dbec, idesc, iabox, nabox
|
||||
USE time_step, ONLY : delt
|
||||
USE fft_base, ONLY : dfftp, dffts
|
||||
USE matrix_inversion
|
||||
USE device_memcpy_m, ONLY : dev_memcpy
|
||||
!
|
||||
IMPLICIT NONE
|
||||
|
||||
|
@ -153,7 +155,7 @@ SUBROUTINE from_restart( )
|
|||
! ... to starting cell (from ndr or again standard input)
|
||||
!
|
||||
IF ( okvan .or. nlcc_any ) THEN
|
||||
CALL initbox( tau0, alat, at, ainv, taub, irb )
|
||||
CALL initbox( tau0, alat, at, ainv, taub, irb, iabox, nabox )
|
||||
CALL phbox( taub, iverbosity, eigrb )
|
||||
END IF
|
||||
!
|
||||
|
@ -162,6 +164,7 @@ SUBROUTINE from_restart( )
|
|||
CALL strucf( sfac, eigts1, eigts2, eigts3, mill, dffts%ngm )
|
||||
!
|
||||
CALL prefor( eigr, vkb )
|
||||
CALL dev_memcpy( vkb_d, vkb )
|
||||
!
|
||||
CALL formf( .TRUE. , eself )
|
||||
!
|
||||
|
@ -187,7 +190,7 @@ SUBROUTINE from_restart( )
|
|||
!
|
||||
END IF
|
||||
!
|
||||
CALL calbec_bgrp( 1, nsp, eigr, c0_bgrp, bec_bgrp )
|
||||
CALL calbec( nbsp_bgrp, vkb, c0_bgrp, bec_bgrp, 0 )
|
||||
!
|
||||
IF ( tpre ) CALL caldbec_bgrp( eigr, c0_bgrp, dbec, idesc )
|
||||
!
|
||||
|
|
|
@ -10,9 +10,15 @@
|
|||
|
||||
!=----------------------------------------------------------------------------------=!
|
||||
|
||||
#if defined(__CUDA)
|
||||
#define DEVICEATTR ,DEVICE
|
||||
#else
|
||||
#define DEVICEATTR
|
||||
#endif
|
||||
|
||||
|
||||
SUBROUTINE runcp_uspp_x &
|
||||
( nfi, fccc, ccc, ema0bg, dt2bye, rhos, bec_bgrp, c0_bgrp, cm_bgrp, fromscra, restart )
|
||||
( nfi, fccc, ccc, ema0bg, dt2bye, rhos, bec_bgrp, c0_bgrp, c0_d, cm_bgrp, cm_d, fromscra, restart )
|
||||
!
|
||||
! This subroutine performs a Car-Parrinello or Steepest-Descent step
|
||||
! on the electronic variables, computing forces on electrons
|
||||
|
@ -31,7 +37,7 @@
|
|||
USE mp, ONLY : mp_sum
|
||||
USE fft_base, ONLY : dffts
|
||||
use wave_base, only : wave_steepest, wave_verlet
|
||||
use control_flags, only : lwf, tsde
|
||||
use control_flags, only : lwf, tsde, many_fft
|
||||
use uspp, only : deeq, vkb
|
||||
use gvect, only : gstart
|
||||
use electrons_base, only : nbsp_bgrp, ispin_bgrp, f_bgrp, nspin, nupdwn_bgrp, iupdwn_bgrp
|
||||
|
@ -41,6 +47,10 @@
|
|||
USE cp_interfaces, ONLY : dforce
|
||||
USE ldaU_cp, ONLY : lda_plus_u, vupsi
|
||||
USE fft_helper_subroutines
|
||||
#if defined (__CUDA)
|
||||
USE uspp_gpum, ONLY : vkb_d
|
||||
USE cudafor
|
||||
#endif
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
|
@ -50,6 +60,7 @@
|
|||
REAL(DP) :: rhos(:,:)
|
||||
REAL(DP) :: bec_bgrp(:,:)
|
||||
COMPLEX(DP) :: c0_bgrp(:,:), cm_bgrp(:,:)
|
||||
COMPLEX(DP) DEVICEATTR :: c0_d(:,:), cm_d(:,:)
|
||||
LOGICAL, OPTIONAL, INTENT(IN) :: fromscra
|
||||
LOGICAL, OPTIONAL, INTENT(IN) :: restart
|
||||
!
|
||||
|
@ -64,11 +75,22 @@
|
|||
real(DP), allocatable :: emaver(:)
|
||||
complex(DP), allocatable :: c2(:), c3(:), c2tmp(:), c3tmp(:)
|
||||
REAL(DP), ALLOCATABLE :: tg_rhos(:,:), ftmp(:)
|
||||
#if defined (__CUDA)
|
||||
REAL(DP), ALLOCATABLE, DEVICE :: rhos_d(:,:)
|
||||
#endif
|
||||
INTEGER, ALLOCATABLE :: itmp(:)
|
||||
integer :: i, nsiz, incr, idx, idx_in, ierr
|
||||
integer :: iwfc, nwfc, is, ii, tg_rhos_siz, c2_siz
|
||||
integer :: iflag
|
||||
logical :: ttsde
|
||||
INTEGER :: omp_get_num_threads
|
||||
|
||||
#if defined (__CUDA)
|
||||
IF( dffts%has_task_groups ) THEN
|
||||
CALL errore(' runcp_uspp ', ' task groups not implemented on GPU ',1)
|
||||
END IF
|
||||
ALLOCATE( rhos_d, SOURCE = rhos )
|
||||
#endif
|
||||
|
||||
iflag = 0
|
||||
!
|
||||
|
@ -85,6 +107,9 @@
|
|||
ELSE
|
||||
tg_rhos_siz = 1
|
||||
c2_siz = ngw
|
||||
#if defined (__CUDA)
|
||||
c2_siz = c2_siz * many_fft
|
||||
#endif
|
||||
END IF
|
||||
|
||||
!
|
||||
|
@ -117,12 +142,13 @@
|
|||
emadt2, emaver, verl1, verl2 )
|
||||
ELSE
|
||||
allocate( c2( c2_siz ), c3( c2_siz ) )
|
||||
allocate( tg_rhos( tg_rhos_siz, nspin ) )
|
||||
|
||||
c2 = 0D0
|
||||
c3 = 0D0
|
||||
|
||||
IF( dffts%has_task_groups ) THEN
|
||||
|
||||
ALLOCATE( tg_rhos( tg_rhos_siz, nspin ) )
|
||||
!
|
||||
! The potential in rhos is distributed across all processors
|
||||
! We need to redistribute it so that it is completely contained in the
|
||||
|
@ -136,7 +162,11 @@
|
|||
|
||||
ELSE
|
||||
|
||||
#if defined (__CUDA)
|
||||
incr = 2 * many_fft
|
||||
#else
|
||||
incr = 2
|
||||
#endif
|
||||
|
||||
END IF
|
||||
|
||||
|
@ -230,8 +260,14 @@
|
|||
|
||||
ELSE
|
||||
|
||||
#if defined (__CUDA)
|
||||
CALL dforce( i, bec_bgrp, vkb_d, c0_d, c2, c3, rhos_d, &
|
||||
SIZE(rhos_d,1), ispin_bgrp, f_bgrp, nbsp_bgrp, nspin )
|
||||
#else
|
||||
CALL dforce( i, bec_bgrp, vkb, c0_bgrp, c2, c3, rhos, &
|
||||
SIZE(rhos,1), ispin_bgrp, f_bgrp, nbsp_bgrp, nspin )
|
||||
#endif
|
||||
|
||||
IF ( lda_plus_u ) THEN
|
||||
c2(:) = c2(:) - vupsi(:,i)
|
||||
c3(:) = c3(:) - vupsi(:,i+1)
|
||||
|
@ -256,8 +292,10 @@
|
|||
ENDDO
|
||||
END IF
|
||||
|
||||
idx_in = 1
|
||||
!$omp parallel num_threads(min(incr,omp_get_num_threads())) default(shared) private(idx_in, idx)
|
||||
!$omp do
|
||||
DO idx = 1, incr, 2
|
||||
idx_in = idx/2+1
|
||||
IF( i + idx - 1 <= nbsp_bgrp ) THEN
|
||||
IF (tsde) THEN
|
||||
CALL wave_steepest( cm_bgrp(:, i+idx-1 ), c0_bgrp(:, i+idx-1 ), emaver, c2(:), ngw, idx_in )
|
||||
|
@ -271,21 +309,26 @@
|
|||
cm_bgrp(1,i+idx ) = CMPLX(real(cm_bgrp(1,i+idx )),0.0d0,kind=dp)
|
||||
END IF
|
||||
END IF
|
||||
!
|
||||
idx_in = idx_in + 1
|
||||
!
|
||||
END DO
|
||||
!$omp end do
|
||||
!$omp end parallel
|
||||
|
||||
end do
|
||||
END DO
|
||||
|
||||
DEALLOCATE( c2 )
|
||||
DEALLOCATE( c3 )
|
||||
DEALLOCATE( tg_rhos )
|
||||
|
||||
IF( dffts%has_task_groups ) THEN
|
||||
DEALLOCATE( tg_rhos )
|
||||
END IF
|
||||
|
||||
END IF
|
||||
|
||||
DEALLOCATE( emadt2 )
|
||||
DEALLOCATE( emaver )
|
||||
#if defined (__CUDA)
|
||||
DEALLOCATE( rhos_d )
|
||||
#endif
|
||||
!
|
||||
END SUBROUTINE runcp_uspp_x
|
||||
|
||||
|
|
|
@ -6,17 +6,18 @@
|
|||
! or http://www.gnu.org/copyleft/gpl.txt .
|
||||
!
|
||||
!-----------------------------------------------------------------------
|
||||
SUBROUTINE initbox ( tau0, alat, at, ainv, taub, irb )
|
||||
SUBROUTINE initbox ( tau0, alat, at, ainv, taub, irb, iabox, nabox )
|
||||
!-----------------------------------------------------------------------
|
||||
!
|
||||
! sets the indexes irb and positions taub for the small boxes
|
||||
! around atoms
|
||||
!
|
||||
USE kinds, ONLY: DP
|
||||
USE ions_base, ONLY: nat
|
||||
USE ions_base, ONLY: nat, ityp
|
||||
USE uspp_param, ONLY: upf
|
||||
USE control_flags, ONLY: iverbosity
|
||||
USE io_global, ONLY: stdout
|
||||
USE mp_global, ONLY: nproc_bgrp, me_bgrp, intra_bgrp_comm
|
||||
USE mp_global, ONLY: nproc_bgrp, me_bgrp, intra_bgrp_comm, my_bgrp_id, nbgrp
|
||||
USE fft_base, ONLY: dfftb, dfftp, fft_type_descriptor
|
||||
USE fft_smallbox_type, ONLY: fft_box_set
|
||||
|
||||
|
@ -26,9 +27,10 @@
|
|||
! output
|
||||
INTEGER, INTENT(out) :: irb(3,nat)
|
||||
REAL(DP), INTENT(out) :: taub(3,nat)
|
||||
INTEGER, INTENT(out) :: iabox(nat), nabox
|
||||
! local
|
||||
REAL(DP) :: x(3), xmod
|
||||
INTEGER :: nr(3), nrb(3), xint, ia, i
|
||||
INTEGER :: nr(3), nrb(3), xint, ia, i, is
|
||||
!
|
||||
IF ( dfftb%nr1 < 1) CALL errore ('initbox', 'incorrect value for box grid dimensions', 1)
|
||||
IF ( dfftb%nr2 < 1) CALL errore ('initbox', 'incorrect value for box grid dimensions', 2)
|
||||
|
@ -96,6 +98,19 @@
|
|||
|
||||
CALL fft_box_set( dfftb, nat, irb, dfftp )
|
||||
|
||||
! build the local list of atom
|
||||
nabox = 0
|
||||
DO ia = 1, nat
|
||||
IF( .NOT. upf(ityp(ia))%tvanp ) CYCLE
|
||||
#if defined(__MPI)
|
||||
IF( ( dfftb%np3( ia ) <= 0 ) .OR. ( dfftb%np2( ia ) <= 0 ) .OR. ( my_bgrp_id /= MOD( ia, nbgrp ) ) ) THEN
|
||||
CYCLE
|
||||
END IF
|
||||
#endif
|
||||
nabox = nabox + 1
|
||||
iabox( nabox ) = ia
|
||||
END DO
|
||||
|
||||
IF( iverbosity > 1 ) THEN
|
||||
DO ia=1,nat
|
||||
WRITE( stdout,2000) ia, (irb(i,ia),i=1,3)
|
||||
|
|
|
@ -79,7 +79,7 @@ SUBROUTINE vofrho_x( nfi, rhor, drhor, rhog, drhog, rhos, rhoc, tfirst, &
|
|||
!
|
||||
INTEGER iss, isup, isdw, ig, ir, i, j, k, ij, is, ia, inlc
|
||||
REAL(DP) :: vtxc, vave, ebac, wz, eh, ehpre, enlc
|
||||
COMPLEX(DP) fp, fm, ci, drhop, zpseu, zh
|
||||
COMPLEX(DP) fp, fm, drhop, zpseu, zh
|
||||
COMPLEX(DP), ALLOCATABLE :: rhotmp(:), vtemp(:)
|
||||
COMPLEX(DP), ALLOCATABLE :: drhot(:,:)
|
||||
REAL(DP), ALLOCATABLE :: gagb(:,:), rhosave(:,:), newrhosave(:,:), rhocsave(:)
|
||||
|
@ -104,6 +104,7 @@ SUBROUTINE vofrho_x( nfi, rhor, drhor, rhog, drhog, rhos, rhoc, tfirst, &
|
|||
! ... dalbe(:) = delta( alpha(:), beta(:) )
|
||||
REAL(DP), DIMENSION(6), PARAMETER :: dalbe = &
|
||||
(/ 1.0_DP, 0.0_DP, 0.0_DP, 1.0_DP, 0.0_DP, 1.0_DP /)
|
||||
COMPLEX(DP), PARAMETER :: ci = ( 0.0d0, 1.0d0 )
|
||||
|
||||
|
||||
CALL start_clock( 'vofrho' )
|
||||
|
@ -130,8 +131,6 @@ SUBROUTINE vofrho_x( nfi, rhor, drhor, rhog, drhog, rhos, rhoc, tfirst, &
|
|||
!
|
||||
END IF
|
||||
!
|
||||
ci = ( 0.0d0, 1.0d0 )
|
||||
!
|
||||
! wz = factor for g.neq.0 because of c*(g)=c(-g)
|
||||
!
|
||||
wz = 2.0d0
|
||||
|
@ -442,12 +441,12 @@ SUBROUTINE vofrho_x( nfi, rhor, drhor, rhog, drhog, rhos, rhoc, tfirst, &
|
|||
END IF
|
||||
|
||||
IF( nspin == 1 ) THEN
|
||||
rhog( 1:dfftp%ngm, 1 ) = rhog( 1:dfftp%ngm, 1 ) + vtemp(1:dfftp%ngm)
|
||||
CALL zaxpy(dfftp%ngm, (1.0d0,0.0d0) , vtemp, 1, rhog(1,1), 1)
|
||||
ELSE
|
||||
isup=1
|
||||
isdw=2
|
||||
rhog( 1:dfftp%ngm, isup ) = rhog( 1:dfftp%ngm, isup ) + vtemp(1:dfftp%ngm)
|
||||
rhog( 1:dfftp%ngm, isdw ) = rhog( 1:dfftp%ngm, isdw ) + vtemp(1:dfftp%ngm)
|
||||
CALL zaxpy(dfftp%ngm, (1.0d0,0.0d0) , vtemp, 1, rhog(1,isup), 1)
|
||||
CALL zaxpy(dfftp%ngm, (1.0d0,0.0d0) , vtemp, 1, rhog(1,isdw), 1)
|
||||
IF( ttsic ) THEN
|
||||
rhog( 1:dfftp%ngm, isup ) = rhog( 1:dfftp%ngm, isup ) - self_vloc(1:dfftp%ngm)
|
||||
rhog( 1:dfftp%ngm, isdw ) = rhog( 1:dfftp%ngm, isdw ) - self_vloc(1:dfftp%ngm)
|
||||
|
|
|
@ -464,6 +464,13 @@
|
|||
!
|
||||
! ... assign random values to wave functions
|
||||
!
|
||||
! 2.519 = 4^(2/3), equivalent to keep only (ngw_g/4) values
|
||||
fac = 2.519d0
|
||||
IF( ngw_g/4 < nbsp ) fac = 1.0d0
|
||||
IF( ngw_g < nbsp ) THEN
|
||||
CALL errore(' wave_rand_init ', ' too few plane waves, linear dependent electronic states! ', 1)
|
||||
END IF
|
||||
|
||||
DO ib = 1, nbsp
|
||||
|
||||
IF( local ) THEN
|
||||
|
@ -487,7 +494,7 @@
|
|||
IF( ibgrp > 0 ) THEN
|
||||
DO ig = 1, ngw
|
||||
IF( local ) THEN
|
||||
IF( gg(ig) < ggx / 2.519d0 ) THEN ! 2.519 = 4^(2/3), equivalent to keep only (ngw_g/4) values
|
||||
IF( gg(ig) < ggx / fac ) THEN
|
||||
rranf1 = rnd( 1, mill(1,ig) ) * rnd( 2, mill(2,ig) ) * rnd( 3, mill(3,ig) )
|
||||
rranf2 = 0.0d0
|
||||
cm_bgrp( ig, ibgrp ) = ampre * CMPLX( rranf1, rranf2 ,kind=DP) / ( 1.0d0 + gg(ig) )
|
||||
|
@ -599,3 +606,65 @@
|
|||
RETURN
|
||||
END SUBROUTINE c_bgrp_pack_x
|
||||
|
||||
#if defined (__CUDA)
|
||||
SUBROUTINE c_bgrp_expand_gpu_x( c_bgrp )
|
||||
USE kinds, ONLY: DP
|
||||
USE mp, ONLY: mp_sum
|
||||
USE electrons_base, ONLY: nspin, i2gupdwn_bgrp, nupdwn, iupdwn_bgrp, iupdwn, nupdwn_bgrp
|
||||
USE mp_global, ONLY: nbgrp, inter_bgrp_comm
|
||||
USE cudafor
|
||||
IMPLICIT NONE
|
||||
COMPLEX(DP), DEVICE :: c_bgrp(:,:)
|
||||
INTEGER :: iss, n1, n2, m1, m2, i
|
||||
IF( nbgrp < 2 ) &
|
||||
RETURN
|
||||
DO iss = nspin, 1, -1
|
||||
n1 = iupdwn_bgrp(iss)
|
||||
n2 = n1 + nupdwn_bgrp(iss) - 1
|
||||
m1 = iupdwn(iss)+i2gupdwn_bgrp(iss) - 1
|
||||
m2 = m1 + nupdwn_bgrp(iss) - 1
|
||||
!$cuf kernel do(1) <<<*,*>>>
|
||||
DO i = m2, m1, -1
|
||||
c_bgrp(:,i) = c_bgrp(:,i-m1+n1)
|
||||
END DO
|
||||
END DO
|
||||
DO iss = 1, nspin
|
||||
m1 = iupdwn(iss)+i2gupdwn_bgrp(iss) - 1
|
||||
m2 = m1 + nupdwn_bgrp(iss) - 1
|
||||
!$cuf kernel do(1) <<<*,*>>>
|
||||
DO i = iupdwn(iss), m1-1
|
||||
c_bgrp(:,i) = 0.0d0
|
||||
END DO
|
||||
!$cuf kernel do(1) <<<*,*>>>
|
||||
DO i = m2+1, iupdwn(iss) + nupdwn(iss) - 1
|
||||
c_bgrp(:,i) = 0.0d0
|
||||
END DO
|
||||
END DO
|
||||
CALL mp_sum( c_bgrp, inter_bgrp_comm )
|
||||
RETURN
|
||||
END SUBROUTINE c_bgrp_expand_gpu_x
|
||||
|
||||
|
||||
SUBROUTINE c_bgrp_pack_gpu_x( c_bgrp )
|
||||
USE kinds, ONLY: DP
|
||||
USE electrons_base, ONLY: nspin, i2gupdwn_bgrp, nupdwn, iupdwn_bgrp, iupdwn, nupdwn_bgrp
|
||||
USE mp_global, ONLY: nbgrp
|
||||
USE cudafor
|
||||
IMPLICIT NONE
|
||||
COMPLEX(DP), DEVICE :: c_bgrp(:,:)
|
||||
INTEGER :: iss, n1, n2, m1, m2, i
|
||||
IF( nbgrp < 2 ) &
|
||||
RETURN
|
||||
DO iss = 1, nspin
|
||||
n1 = iupdwn_bgrp(iss)
|
||||
n2 = n1 + nupdwn_bgrp(iss) - 1
|
||||
m1 = iupdwn(iss)+i2gupdwn_bgrp(iss) - 1
|
||||
m2 = m1 + nupdwn_bgrp(iss) - 1
|
||||
!$cuf kernel do(1) <<<*,*>>>
|
||||
DO i = n1, n2
|
||||
c_bgrp(:,i) = c_bgrp(:,i-n1+m1)
|
||||
END DO
|
||||
END DO
|
||||
RETURN
|
||||
END SUBROUTINE c_bgrp_pack_gpu_x
|
||||
#endif
|
||||
|
|
|
@ -616,6 +616,9 @@ CONTAINS
|
|||
WRITE( stdout,*) ' Number of x-y planes for each processors: '
|
||||
WRITE( stdout, fmt = '( 5(" |",I4,",",I4) )' ) ( ( desc%nr2p(j), &
|
||||
desc%nr3p(i), i = 1, desc%nproc3 ), j=1,desc%nproc2 )
|
||||
|
||||
IF ( .not. desc%use_pencil_decomposition ) WRITE( stdout,*) ' Using Slab Decomposition'
|
||||
IF ( desc%use_pencil_decomposition ) WRITE( stdout,*) ' Using Pencil Decomposition'
|
||||
1000 FORMAT(3X, &
|
||||
'Global Dimensions Local Dimensions Processor Grid',/,3X, &
|
||||
'.X. .Y. .Z. .X. .Y. .Z. .X. .Y. .Z.',/, &
|
||||
|
|
|
@ -0,0 +1,30 @@
|
|||
# Makefile for FFTXlib testing
|
||||
|
||||
include ../../make.inc
|
||||
|
||||
MODFLAGS= $(MOD_FLAG).. $(MOD_FLAG).
|
||||
|
||||
SRCS = test_fft_scalar_gpu.f90 \
|
||||
test_fft_scatter_mod_gpu.f90 \
|
||||
test_fwinv_gpu.f90
|
||||
|
||||
EXECS = $(SRCS:.f90=.x)
|
||||
|
||||
all: common $(EXECS)
|
||||
|
||||
common: tester.o utils.o
|
||||
|
||||
%.x: %.o
|
||||
$(LD) $(LDFLAGS) $< utils.o tester.o -o $@ ../libqefft.a $(QELIBS)
|
||||
|
||||
clean :
|
||||
- /bin/rm -f *.o *.a *.d *.i *~ *_tmp.f90 *.mod *.L *.x
|
||||
|
||||
# .PHONY forces execution of a rule irrespective of the presence of an
|
||||
# updated file with the same name of the rule. In this way, the script
|
||||
# that generates version.f90 always runs, updating the version if you
|
||||
# execute "svn update". The update_version script takes care of not
|
||||
# changing the file if the svn version did not change
|
||||
|
||||
.PHONY: all clean
|
||||
|
|
@ -0,0 +1,273 @@
|
|||
#if defined(__CUDA)
|
||||
program test_fft_scalar_gpu
|
||||
USE tester
|
||||
IMPLICIT NONE
|
||||
!
|
||||
TYPE(tester_t) :: test
|
||||
!
|
||||
CALL test%init()
|
||||
!
|
||||
test%tolerance64 = 1.d-13
|
||||
!
|
||||
CALL save_random_seed("test_fft_scalar_gpu", 0)
|
||||
!
|
||||
CALL test_cft_2xy_gpu(test)
|
||||
!
|
||||
CALL test_cft_1z_gpu(test)
|
||||
!
|
||||
CALL test_cfft3d_gpu(test)
|
||||
!
|
||||
CALL test_cfft3ds_gpu(test)
|
||||
!
|
||||
CALL test%print()
|
||||
!
|
||||
CONTAINS
|
||||
!
|
||||
SUBROUTINE fill_random(c, c_d, n)
|
||||
USE cudafor
|
||||
USE fft_param, ONLY : DP
|
||||
implicit none
|
||||
complex(DP), device :: c_d(:)
|
||||
complex(DP) :: c(:)
|
||||
integer, intent(in) :: n
|
||||
!
|
||||
real(DP), ALLOCATABLE :: rnd_aux(:)
|
||||
!
|
||||
ALLOCATE (rnd_aux(2*n))
|
||||
CALL RANDOM_NUMBER(rnd_aux)
|
||||
c = CMPLX(rnd_aux(1:n), rnd_aux(n:2*n))
|
||||
c_d = c
|
||||
DEALLOCATE(rnd_aux)
|
||||
END SUBROUTINE fill_random
|
||||
!
|
||||
SUBROUTINE test_cft_1z_gpu(test)
|
||||
USE fft_scalar, ONLY : cft_1z, cft_2xy, cfft3d, cfft3ds
|
||||
USE fft_scalar, ONLY : cft_1z_gpu, cft_2xy_gpu, cfft3d_gpu, cfft3ds_gpu
|
||||
USE fft_param, ONLY : DP
|
||||
USE cudafor
|
||||
implicit none
|
||||
!
|
||||
TYPE(tester_t) :: test
|
||||
!
|
||||
! stream
|
||||
integer(kind = cuda_stream_kind) :: stream = 0
|
||||
! size
|
||||
integer, parameter :: nsl=100, nz=56, ldz=56
|
||||
!
|
||||
! array for random values
|
||||
real(DP) :: rnd_aux(2 * nsl * ldz)
|
||||
!
|
||||
! variables on device
|
||||
complex(DP), device :: c_d(nsl*ldz)
|
||||
complex(DP), device :: cout_d(nsl*ldz)
|
||||
! variables on host
|
||||
complex(DP) :: c(nsl*ldz)
|
||||
complex(DP) :: cout(nsl*ldz)
|
||||
!
|
||||
CALL fill_random(c, c_d, nsl*ldz)
|
||||
!
|
||||
! Check forward direction
|
||||
CALL cft_1z(c, nsl, nz, ldz, 1, cout)
|
||||
CALL cft_1z_gpu(c_d, nsl, nz, ldz, 1, cout_d, stream)
|
||||
!
|
||||
! Use c as auxiliary variable hosting GPU results
|
||||
c = (0.d0, 0.d0)
|
||||
c = cout_d
|
||||
CALL test%assert_close( c, cout )
|
||||
!
|
||||
!
|
||||
CALL fill_random(c, c_d, nsl*ldz)
|
||||
!
|
||||
! Check backward direction
|
||||
CALL cft_1z(c, nsl, nz, ldz, -1, cout)
|
||||
CALL cft_1z_gpu(c_d, nsl, nz, ldz, -1, cout_d, stream)
|
||||
!
|
||||
! Use c as auxiliary variable hosting GPU results
|
||||
c = (0.d0, 0.d0)
|
||||
c = cout_d
|
||||
CALL test%assert_close( c, cout )
|
||||
!
|
||||
! == Same as above, for inplace call ==
|
||||
!
|
||||
CALL fill_random(c, c_d, nsl*ldz)
|
||||
!
|
||||
! Check forward direction
|
||||
CALL cft_1z(c, nsl, nz, ldz, 1, cout)
|
||||
CALL cft_1z_gpu(c_d, nsl, nz, ldz, 1, cout_d, stream, in_place=.true.)
|
||||
!
|
||||
! Use c as auxiliary variable hosting GPU results
|
||||
c = (0.d0, 0.d0)
|
||||
c = c_d
|
||||
CALL test%assert_close( c, cout )
|
||||
!
|
||||
!
|
||||
CALL fill_random(c, c_d, nsl*ldz)
|
||||
!
|
||||
! Check backward direction
|
||||
CALL cft_1z(c, nsl, nz, ldz, -1, cout)
|
||||
CALL cft_1z_gpu(c_d, nsl, nz, ldz, -1, cout_d, stream, in_place=.true.)
|
||||
!
|
||||
! Use c as auxiliary variable hosting GPU results
|
||||
c = (0.d0, 0.d0)
|
||||
c = c_d
|
||||
CALL test%assert_close( c, cout )
|
||||
!
|
||||
END SUBROUTINE test_cft_1z_gpu
|
||||
!
|
||||
SUBROUTINE test_cft_2xy_gpu(test)
|
||||
USE fft_scalar, ONLY : cft_2xy
|
||||
USE fft_scalar, ONLY : cft_2xy_gpu
|
||||
USE fft_param, ONLY : DP
|
||||
USE cudafor
|
||||
implicit none
|
||||
!
|
||||
TYPE(tester_t) :: test
|
||||
!
|
||||
! stream
|
||||
integer(kind = cuda_stream_kind) :: stream = 0
|
||||
! size
|
||||
integer, parameter :: nx = 10, ny = 10, nzl = 5, ldx=10, ldy=10
|
||||
!
|
||||
! array for random values
|
||||
real(DP) :: rnd_aux(2 * nzl * ldx * ldy)
|
||||
! variables on device
|
||||
complex(DP), device :: c_d(nzl * ldx * ldy)
|
||||
complex(DP), device :: tmp_d(nzl * ldx * ldy)
|
||||
! variables on host
|
||||
complex(DP) :: c(nzl * ldx * ldy)
|
||||
complex(DP) :: tmp(nzl * ldx * ldy)
|
||||
!
|
||||
CALL fill_random(c, c_d, nzl * ldx * ldy)
|
||||
!
|
||||
CALL cft_2xy_gpu(c_d, tmp_d, nzl, nx, ny, ldx, ldy, 1, stream)
|
||||
CALL cft_2xy(c, nzl, nx, ny, ldx, ldy, 1)
|
||||
!
|
||||
! Use c as auxiliary variable hosting GPU results
|
||||
tmp = c_d
|
||||
CALL test%assert_close( c, tmp )
|
||||
!
|
||||
CALL fill_random(c, c_d, nzl * ldx * ldy)
|
||||
!
|
||||
CALL cft_2xy_gpu(c_d, tmp_d, nzl, nx, ny, ldx, ldy, -1, stream)
|
||||
CALL cft_2xy(c, nzl, nx, ny, ldx, ldy, -1)
|
||||
!
|
||||
! Use c as auxiliary variable hosting GPU results
|
||||
tmp = c_d
|
||||
CALL test%assert_close( c, tmp )
|
||||
!
|
||||
END SUBROUTINE test_cft_2xy_gpu
|
||||
!
|
||||
SUBROUTINE test_cfft3d_gpu(test)
|
||||
USE fft_scalar, ONLY : cfft3d
|
||||
USE fft_scalar, ONLY : cfft3d_gpu
|
||||
USE fft_param, ONLY : DP
|
||||
USE cudafor
|
||||
implicit none
|
||||
!
|
||||
TYPE(tester_t) :: test
|
||||
!
|
||||
! stream
|
||||
integer(kind = cuda_stream_kind) :: stream = 0
|
||||
! size
|
||||
integer, parameter :: nx = 10, ny = 10, nz = 10
|
||||
integer, parameter :: ldx= 10, ldy= 10, ldz= 10
|
||||
#if ! defined(__DFTI)
|
||||
integer, parameter :: howmany=1
|
||||
#else
|
||||
integer, parameter :: howmany=12
|
||||
#endif
|
||||
!
|
||||
! array for random values
|
||||
real(DP) :: rnd_aux(2 * howmany * ldx * ldy * ldz)
|
||||
! variables on device
|
||||
complex(DP), device :: c_d(howmany * ldx * ldy * ldz)
|
||||
complex(DP), device :: tmp_d(howmany * ldx * ldy * ldz)
|
||||
! variables on host
|
||||
complex(DP) :: c(howmany * ldx * ldy * ldz)
|
||||
complex(DP) :: tmp(howmany * ldx * ldy * ldz)
|
||||
!
|
||||
integer :: i, rs, re
|
||||
!
|
||||
#if ! defined(__DFTI)
|
||||
print *, 'The current CPU scalar driver does not support howmany. Reverting to howmany 1'
|
||||
#endif
|
||||
|
||||
CALL fill_random(c, c_d, howmany * ldx * ldy * ldz)
|
||||
!
|
||||
CALL cfft3d( c, nx, ny, nz, ldx, ldy, ldz, howmany, 1 )
|
||||
CALL cfft3d_gpu( c_d, nx, ny, nz, ldx, ldy, ldz, howmany, 1, stream )
|
||||
!
|
||||
! Use c as auxiliary variable hosting GPU results
|
||||
tmp = c_d
|
||||
DO i=0, howmany - 1
|
||||
rs = i * ldx * ldy * ldz + 1
|
||||
re = rs + nx * ny * nz - 1
|
||||
CALL test%assert_close( c(rs:re), tmp(rs:re) )
|
||||
END DO
|
||||
!
|
||||
CALL fill_random(c, c_d, howmany * ldx * ldy * ldz)
|
||||
!
|
||||
CALL cfft3d( c, nx, ny, nz, ldx, ldy, ldz, howmany, -1 )
|
||||
CALL cfft3d_gpu( c_d, nx, ny, nz, ldx, ldy, ldz, howmany, -1, stream )
|
||||
!
|
||||
! Use c as auxiliary variable hosting GPU results
|
||||
tmp = c_d
|
||||
DO i=0, howmany - 1
|
||||
rs = i * ldx * ldy * ldz + 1
|
||||
re = rs + nx * ny * nz - 1
|
||||
CALL test%assert_close( c(rs:re), tmp(rs:re) )
|
||||
END DO
|
||||
!
|
||||
END SUBROUTINE test_cfft3d_gpu
|
||||
!
|
||||
SUBROUTINE test_cfft3ds_gpu(test)
|
||||
USE fft_scalar, ONLY : cfft3ds
|
||||
USE fft_scalar, ONLY : cfft3ds_gpu
|
||||
USE fft_param, ONLY : DP
|
||||
USE cudafor
|
||||
implicit none
|
||||
!
|
||||
TYPE(tester_t) :: test
|
||||
!
|
||||
! stream
|
||||
integer(kind = cuda_stream_kind) :: stream = 0
|
||||
! size
|
||||
integer, parameter :: nx = 10, ny = 10, nz = 10
|
||||
integer, parameter :: ldx= 10, ldy= 10, ldz= 10, howmany=1
|
||||
!
|
||||
! array for random values
|
||||
real(DP) :: rnd_aux(2 * howmany * ldx * ldy * ldz)
|
||||
! variables on device
|
||||
complex(DP), device :: c_d(howmany * ldx * ldy * ldz)
|
||||
complex(DP), device :: tmp_d(howmany * ldx * ldy * ldz)
|
||||
! variables on host
|
||||
complex(DP) :: c(howmany * ldx * ldy * ldz)
|
||||
complex(DP) :: tmp(howmany * ldx * ldy * ldz)
|
||||
integer :: do_fft_y(ldx), do_fft_z(ldx*ldy)
|
||||
!
|
||||
CALL fill_random(c, c_d, howmany * ldx * ldy * ldz)
|
||||
do_fft_y = 1; do_fft_z = 1
|
||||
!
|
||||
CALL cfft3ds( c, nx, ny, nz, ldx, ldy, ldz, howmany, 1, do_fft_z, do_fft_y)
|
||||
CALL cfft3ds_gpu( c_d, nx, ny, nz, ldx, ldy, ldz, howmany, 1, do_fft_z, do_fft_y, stream)
|
||||
!
|
||||
! Use c as auxiliary variable hosting GPU results
|
||||
tmp = c_d
|
||||
CALL test%assert_close( c, tmp )
|
||||
!
|
||||
CALL fill_random(c, c_d, howmany * ldx * ldy * ldz)
|
||||
!
|
||||
CALL cfft3ds( c, nx, ny, nz, ldx, ldy, ldz, howmany, -1, do_fft_z, do_fft_y)
|
||||
CALL cfft3ds_gpu( c_d, nx, ny, nz, ldx, ldy, ldz, howmany, -1, do_fft_z, do_fft_y, stream )
|
||||
!
|
||||
! Use c as auxiliary variable hosting GPU results
|
||||
tmp = c_d
|
||||
CALL test%assert_close( c, tmp )
|
||||
!
|
||||
END SUBROUTINE test_cfft3ds_gpu
|
||||
|
||||
end program test_fft_scalar_gpu
|
||||
#else
|
||||
program test_fft_scalar_gpu
|
||||
end program test_fft_scalar_gpu
|
||||
#endif
|
|
@ -0,0 +1,406 @@
|
|||
#if defined(__CUDA)
|
||||
program test_fft_scatter_mod_gpu
|
||||
#if defined(__MPI)
|
||||
USE mpi
|
||||
#endif
|
||||
USE tester
|
||||
IMPLICIT NONE
|
||||
! MPI type
|
||||
type mpi_t
|
||||
integer :: me, n, root, comm
|
||||
end type mpi_t
|
||||
TYPE(mpi_t) :: mp
|
||||
!
|
||||
TYPE(tester_t) :: test
|
||||
!
|
||||
INTEGER :: ierr, level, i
|
||||
!
|
||||
#if defined(__MPI)
|
||||
#if defined(_OPENMP)
|
||||
CALL MPI_Init_thread(MPI_THREAD_FUNNELED,level, ierr)
|
||||
#else
|
||||
CALL MPI_Init(ierr)
|
||||
#endif
|
||||
#endif
|
||||
!
|
||||
CALL mpi_data_init(mp%me, mp%n, mp%root, mp%comm)
|
||||
!
|
||||
CALL test%init()
|
||||
!
|
||||
test%tolerance64 = 1.d-14
|
||||
!
|
||||
CALL save_random_seed("test_fft_scatter_mod_gpu", mp%me)
|
||||
!
|
||||
DO i = 1, mp%n
|
||||
IF (MOD(mp%n,i) == 0 ) THEN
|
||||
! gamma case
|
||||
CALL test_fft_scatter_xy_gpu_1(mp, test, .true., i)
|
||||
! k case
|
||||
CALL test_fft_scatter_xy_gpu_1(mp, test, .false., i)
|
||||
!
|
||||
! gamma case
|
||||
CALL test_fft_scatter_yz_gpu_1(mp, test, .true., i)
|
||||
! k case
|
||||
CALL test_fft_scatter_yz_gpu_1(mp, test, .false., i)
|
||||
END IF
|
||||
END DO
|
||||
CALL test_fft_scatter_many_yz_gpu_1(mp, test, .true., 1)
|
||||
CALL test_fft_scatter_many_yz_gpu_1(mp, test, .false., 1)
|
||||
|
||||
!
|
||||
CALL collect_results(test)
|
||||
!
|
||||
IF (mp%me == mp%root) CALL test%print()
|
||||
!
|
||||
#if defined(__MPI)
|
||||
CALL MPI_Finalize(ierr)
|
||||
#endif
|
||||
CONTAINS
|
||||
!
|
||||
SUBROUTINE mpi_data_init(mpme, npes, mproot, comm)
|
||||
implicit none
|
||||
integer, intent(out) :: mpme, npes, mproot, comm
|
||||
integer :: ierr
|
||||
mpme=0; npes=1; mproot=0; comm=0
|
||||
#if defined(__MPI)
|
||||
CALL mpi_comm_rank(MPI_COMM_WORLD, mpme, ierr)
|
||||
CALL mpi_comm_size(MPI_COMM_WORLD, npes, ierr)
|
||||
comm = MPI_COMM_WORLD
|
||||
#endif
|
||||
END SUBROUTINE mpi_data_init
|
||||
SUBROUTINE fft_desc_init(dfft, smap, flavor, gamma_only, parallel, comm, nyfft)
|
||||
USE fft_types, ONLY : fft_type_descriptor, fft_type_init
|
||||
USE stick_base, ONLY : sticks_map
|
||||
USE fft_param, ONLY : DP
|
||||
implicit none
|
||||
TYPE(fft_type_descriptor) :: dfft
|
||||
TYPE(sticks_map) :: smap
|
||||
CHARACTER(LEN=*), INTENT(IN) :: flavor
|
||||
LOGICAL :: gamma_only
|
||||
LOGICAL :: parallel
|
||||
INTEGER :: comm, nyfft
|
||||
REAL(DP), PARAMETER :: pi=4.D0*DATAN(1.D0)
|
||||
!
|
||||
REAL(DP) :: at(3,3), bg(3,3)
|
||||
!
|
||||
at = RESHAPE((/1.d0, 0.d0, 0.d0, 0.d0, 1.d0, 0.d0, 0.d0, 0.d0, 1.d0/), shape(at))
|
||||
bg = RESHAPE((/1.d0, 0.d0, 0.d0, 0.d0, 1.d0, 0.d0, 0.d0, 0.d0, 1.d0/), shape(bg))
|
||||
bg = 2.d0*pi
|
||||
!
|
||||
CALL fft_type_init(dfft, smap, flavor, gamma_only, parallel, comm, at, bg, 12.d0, 4.d0, nyfft=nyfft)
|
||||
!
|
||||
END SUBROUTINE fft_desc_init
|
||||
|
||||
SUBROUTINE fft_desc_finalize(dfft, smap)
|
||||
USE fft_types, ONLY : fft_type_descriptor, fft_type_deallocate
|
||||
USE stick_base, ONLY : sticks_map, sticks_map_deallocate
|
||||
implicit none
|
||||
TYPE(fft_type_descriptor) :: dfft
|
||||
TYPE(sticks_map) :: smap
|
||||
!
|
||||
CALL fft_type_deallocate(dfft)
|
||||
CALL sticks_map_deallocate( smap )
|
||||
END SUBROUTINE fft_desc_finalize
|
||||
!
|
||||
SUBROUTINE fill_random(c, c_d, n)
|
||||
USE cudafor
|
||||
USE fft_param, ONLY : DP
|
||||
implicit none
|
||||
complex(DP), device :: c_d(:)
|
||||
complex(DP) :: c(:)
|
||||
integer, intent(in) :: n
|
||||
!
|
||||
real(DP), ALLOCATABLE :: rnd_aux(:)
|
||||
!
|
||||
ALLOCATE (rnd_aux(2*n))
|
||||
CALL RANDOM_NUMBER(rnd_aux)
|
||||
c = CMPLX(rnd_aux(1:n), rnd_aux(n:2*n))
|
||||
c_d = c
|
||||
DEALLOCATE(rnd_aux)
|
||||
END SUBROUTINE fill_random
|
||||
!
|
||||
SUBROUTINE test_fft_scatter_xy_gpu_1(mp, test, gamma_only, ny)
|
||||
USE cudafor
|
||||
USE fft_param, ONLY : DP
|
||||
USE fft_types, ONLY : fft_type_descriptor
|
||||
USE stick_base, ONLY : sticks_map
|
||||
USE scatter_mod, ONLY : fft_scatter_xy
|
||||
USE scatter_mod_gpu, ONLY : fft_scatter_xy_gpu
|
||||
implicit none
|
||||
TYPE(mpi_t) :: mp
|
||||
TYPE(tester_t) :: test
|
||||
!
|
||||
TYPE(fft_type_descriptor) :: dfft
|
||||
TYPE(sticks_map) :: smap
|
||||
LOGICAL, INTENT(IN) :: gamma_only
|
||||
INTEGER, INTENT(IN) :: ny
|
||||
!
|
||||
LOGICAL :: parallel
|
||||
COMPLEX(DP), ALLOCATABLE :: scatter_in(:), scatter_out(:), aux(:)
|
||||
COMPLEX(DP), ALLOCATABLE, DEVICE :: scatter_in_d(:), scatter_out_d(:)
|
||||
integer(kind = cuda_stream_kind) :: stream = 0
|
||||
integer :: fft_sign = 2
|
||||
integer :: vsiz, nr1p_, compare_len, me2
|
||||
!
|
||||
parallel = mp%n .gt. 1
|
||||
|
||||
CALL fft_desc_init(dfft, smap, "wave", gamma_only, parallel, mp%comm, nyfft=ny)
|
||||
me2 = dfft%mype2 + 1
|
||||
vsiz = dfft%nnr
|
||||
compare_len = dfft%nr1x * dfft%my_nr2p * dfft%my_nr3p
|
||||
if (ny > 1) then
|
||||
! When using task groups, wave FFTs are not distributed along Y
|
||||
fft_sign = 3
|
||||
vsiz = dfft%nnr_tg
|
||||
compare_len = dfft%nr1x * dfft%nr2x * dfft%my_nr3p
|
||||
end if
|
||||
!
|
||||
! Allocate variables
|
||||
ALLOCATE(scatter_in(vsiz), scatter_out(vsiz), aux(vsiz))
|
||||
ALLOCATE(scatter_in_d(vsiz), scatter_out_d(vsiz))
|
||||
!
|
||||
! Test 1
|
||||
CALL fill_random(scatter_in, scatter_in_d, vsiz)
|
||||
!
|
||||
CALL fft_scatter_xy( dfft, scatter_in, scatter_out, vsiz, fft_sign )
|
||||
CALL fft_scatter_xy_gpu( dfft, scatter_in_d, scatter_out_d, vsiz, fft_sign, stream )
|
||||
aux(1:compare_len) = scatter_out_d(1:compare_len)
|
||||
!
|
||||
! Check
|
||||
CALL test%assert_close( scatter_out(1:compare_len), aux(1:compare_len) )
|
||||
!
|
||||
! Test 2
|
||||
CALL fill_random(scatter_in, scatter_in_d, vsiz)
|
||||
!
|
||||
CALL fft_scatter_xy( dfft, scatter_out, scatter_in, vsiz, -1*fft_sign )
|
||||
CALL fft_scatter_xy_gpu( dfft, scatter_out_d, scatter_in_d, vsiz, -1*fft_sign, stream )
|
||||
!
|
||||
compare_len = dfft%nr2x * dfft%nr1w(me2) * dfft%my_nr3p
|
||||
IF (ny > 1) compare_len = dfft%nr2x * dfft%nr1w_tg * dfft%my_nr3p
|
||||
!
|
||||
aux(1:compare_len) = scatter_out_d(1:compare_len)
|
||||
! Check
|
||||
CALL test%assert_close( scatter_out(1:compare_len), aux(1:compare_len) )
|
||||
!
|
||||
CALL fft_desc_finalize(dfft, smap)
|
||||
DEALLOCATE(scatter_in, scatter_out, aux, scatter_in_d, scatter_out_d)
|
||||
!
|
||||
END SUBROUTINE test_fft_scatter_xy_gpu_1
|
||||
!
|
||||
SUBROUTINE test_fft_scatter_yz_gpu_1(mp, test, gamma_only, ny)
|
||||
!
|
||||
! This test checks wave fft scatter, with parallel = .true. if
|
||||
! called with more than 1 MPI.
|
||||
!
|
||||
USE cudafor
|
||||
USE fft_param, ONLY : DP
|
||||
USE fft_types, ONLY : fft_type_descriptor
|
||||
USE stick_base, ONLY : sticks_map
|
||||
USE scatter_mod, ONLY : fft_scatter_yz
|
||||
USE scatter_mod_gpu, ONLY : fft_scatter_yz_gpu
|
||||
implicit none
|
||||
TYPE(mpi_t) :: mp
|
||||
TYPE(tester_t) :: test
|
||||
!
|
||||
TYPE(fft_type_descriptor) :: dfft
|
||||
TYPE(sticks_map) :: smap
|
||||
LOGICAL, INTENT(IN) :: gamma_only
|
||||
INTEGER, INTENT(IN) :: ny
|
||||
!
|
||||
LOGICAL :: parallel
|
||||
COMPLEX(DP), ALLOCATABLE :: scatter_in(:), scatter_out(:), aux(:)
|
||||
COMPLEX(DP), ALLOCATABLE, DEVICE :: scatter_in_d(:), scatter_out_d(:)
|
||||
integer(kind = cuda_stream_kind) :: stream = 0
|
||||
integer :: fft_sign = 2
|
||||
integer :: vsiz, compare_len, my_nr1p_
|
||||
!
|
||||
parallel = mp%n .gt. 1
|
||||
CALL fft_desc_init(dfft, smap, "wave", gamma_only, parallel, mp%comm, nyfft=ny)
|
||||
vsiz = dfft%nnr
|
||||
my_nr1p_ = count(dfft%ir1w > 0)
|
||||
if (ny > 1) then
|
||||
fft_sign = 3
|
||||
vsiz = dfft%nnr_tg
|
||||
my_nr1p_ = count(dfft%ir1w_tg > 0)
|
||||
end if
|
||||
!
|
||||
! Allocate variables
|
||||
ALLOCATE(scatter_in(vsiz), scatter_out(vsiz), aux(vsiz))
|
||||
ALLOCATE(scatter_in_d(vsiz), scatter_out_d(vsiz))
|
||||
!
|
||||
! Test 1
|
||||
CALL fill_random(scatter_in, scatter_in_d, vsiz)
|
||||
!
|
||||
CALL fft_scatter_yz( dfft, scatter_in, scatter_out, vsiz, fft_sign )
|
||||
CALL fft_scatter_yz_gpu( dfft, scatter_in_d, scatter_out_d, vsiz, fft_sign )
|
||||
! Set the number of elements that should be strictly equivalent in the
|
||||
! two implementations.
|
||||
compare_len = dfft%my_nr3p*my_nr1p_*dfft%nr2x
|
||||
aux(1:compare_len) = scatter_out_d(1:compare_len)
|
||||
! Check
|
||||
CALL test%assert_close( scatter_out(1:compare_len), aux(1:compare_len) )
|
||||
!
|
||||
! Test 2
|
||||
CALL fill_random(scatter_in, scatter_in_d, vsiz)
|
||||
!
|
||||
CALL fft_scatter_yz( dfft, scatter_out, scatter_in, vsiz, -1*fft_sign )
|
||||
CALL fft_scatter_yz_gpu( dfft, scatter_out_d, scatter_in_d, vsiz, -1*fft_sign )
|
||||
!
|
||||
compare_len = dfft%nsw(mp%me+1)*dfft%nr3x
|
||||
aux(1:compare_len) = scatter_out_d(1:compare_len)
|
||||
! Check
|
||||
CALL test%assert_close( scatter_out(1:compare_len), aux(1:compare_len) )
|
||||
!
|
||||
CALL fft_desc_finalize(dfft, smap)
|
||||
DEALLOCATE(scatter_in, scatter_out, aux, scatter_in_d, scatter_out_d)
|
||||
!
|
||||
END SUBROUTINE test_fft_scatter_yz_gpu_1
|
||||
!
|
||||
SUBROUTINE test_fft_scatter_many_yz_gpu_1(mp, test, gamma_only, ny)
|
||||
!
|
||||
! This test checks wave fft scatter, with parallel = .true. if
|
||||
! called with more than 1 MPI.
|
||||
!
|
||||
USE cudafor
|
||||
USE fft_param, ONLY : DP
|
||||
USE fft_types, ONLY : fft_type_descriptor
|
||||
USE stick_base, ONLY : sticks_map
|
||||
USE scatter_mod, ONLY : fft_scatter_yz
|
||||
USE scatter_mod_gpu, ONLY : fft_scatter_yz_gpu, fft_scatter_many_yz_gpu
|
||||
implicit none
|
||||
TYPE(mpi_t) :: mp
|
||||
TYPE(tester_t) :: test
|
||||
!
|
||||
TYPE(fft_type_descriptor) :: dfft
|
||||
TYPE(sticks_map) :: smap
|
||||
LOGICAL, INTENT(IN) :: gamma_only
|
||||
INTEGER, INTENT(IN) :: ny
|
||||
INTEGER, PARAMETER :: howmany = 4
|
||||
!
|
||||
LOGICAL :: parallel
|
||||
COMPLEX(DP), ALLOCATABLE :: scatter_in(:), scatter_in_cpy(:), scatter_out(:), aux(:)
|
||||
COMPLEX(DP), ALLOCATABLE, DEVICE :: scatter_in_d(:), scatter_out_d(:), aux_d(:)
|
||||
! convenient variables for slices
|
||||
integer :: i, l, start_in, end_in, start_out, end_out, nstick_zx, n3, n3x, vsiz
|
||||
integer :: start_sl, end_sl
|
||||
!integer(kind = cuda_stream_kind) :: streams(5)
|
||||
!
|
||||
parallel = mp%n .gt. 1
|
||||
IF (ny > 1) print *, 'scatter_many does not support task grouping'
|
||||
CALL fft_desc_init(dfft, smap, "wave", gamma_only, parallel, mp%comm, nyfft=ny)
|
||||
!
|
||||
! Allocate variables
|
||||
vsiz = dfft%nnr*howmany
|
||||
ALLOCATE(scatter_in(vsiz), scatter_in_cpy(vsiz), scatter_out(vsiz), aux(vsiz))
|
||||
ALLOCATE(scatter_in_d(vsiz), scatter_out_d(vsiz), aux_d(vsiz))
|
||||
!
|
||||
! How FFT allocates bunches in this case:
|
||||
nstick_zx = MAXVAL(dfft%nsw)
|
||||
n3 = dfft%nr3
|
||||
n3x = dfft%nr3x
|
||||
|
||||
! Test 1
|
||||
CALL fill_random(scatter_in, scatter_in_d, vsiz)
|
||||
scatter_in_cpy = scatter_in
|
||||
!
|
||||
!print *, 'dfft%nnr, nr3, nr2, nr1 : ', dfft%nnr, dfft%nr3, dfft%nr2, dfft%nr1
|
||||
! Test 1.1, compare scatter of slices
|
||||
DO i=0,howmany-1
|
||||
start_in = i*dfft%nnr + 1
|
||||
end_in = (i+1)*dfft%nnr
|
||||
start_out = start_in
|
||||
!
|
||||
end_out = (start_out-1) + dfft%my_nr3p*dfft%nr1w(dfft%mype2 +1)*dfft%nr2x
|
||||
!
|
||||
CALL fft_scatter_yz( dfft, scatter_in(start_in:end_in), scatter_out(start_out:end_out), dfft%nnr, 2 )
|
||||
CALL fft_scatter_yz_gpu( dfft, scatter_in_d(start_in:end_in), scatter_out_d(start_in:end_out), dfft%nnr, 2 )
|
||||
aux(start_out:end_out) = scatter_out_d(start_out:end_out)
|
||||
! Check
|
||||
CALL test%assert_close( scatter_out(start_out:end_out), aux(start_out:end_out) )
|
||||
END DO
|
||||
!
|
||||
scatter_in = scatter_in_cpy
|
||||
! Store data as expected in input
|
||||
DO i=0,howmany-1
|
||||
start_in = i*dfft%nnr + 1
|
||||
end_in = (i+1)*dfft%nnr
|
||||
start_out= i*nstick_zx*n3x+1
|
||||
end_out = (i+1)*nstick_zx*n3
|
||||
scatter_in_d( start_out : end_out ) = scatter_in(start_in:start_in+nstick_zx*n3)
|
||||
END DO
|
||||
CALL fft_scatter_many_yz_gpu ( dfft, scatter_in_d, scatter_out_d, vsiz, 2, howmany )
|
||||
|
||||
DO i=0,howmany-1
|
||||
start_out = i*dfft%nnr + 1
|
||||
end_out = (start_out-1) + dfft%my_nr3p*dfft%nr1w(dfft%mype2 +1)*dfft%nr2x
|
||||
!
|
||||
aux(start_out:end_out) = scatter_out_d(start_out:end_out)
|
||||
!
|
||||
CALL test%assert_close( scatter_out(start_out:end_out), aux(start_out:end_out) )
|
||||
END DO
|
||||
!
|
||||
!
|
||||
! Test 2
|
||||
CALL fill_random(scatter_in, scatter_in_d, vsiz)
|
||||
scatter_in_cpy = scatter_in
|
||||
!
|
||||
DO i=0,howmany-1
|
||||
! Input data for fft_scatter_yz call
|
||||
start_in = i*dfft%nnr + 1
|
||||
end_in = (i+1)*dfft%nnr
|
||||
!
|
||||
! Where to store output data
|
||||
start_out = start_in
|
||||
end_out = end_in
|
||||
CALL fft_scatter_yz( dfft, scatter_out(start_out:end_out), scatter_in(start_in:end_in), dfft%nnr, -2 )
|
||||
CALL fft_scatter_yz_gpu( dfft, scatter_out_d(start_out:end_out), scatter_in_d(start_in:end_in), dfft%nnr, -2 )
|
||||
!
|
||||
aux(start_out:end_out) = scatter_out_d(start_out:end_out)
|
||||
!
|
||||
! Check only relevant part
|
||||
end_out = start_out + dfft%nsw(mp%me+1)*n3
|
||||
CALL test%assert_close( scatter_out(start_out:end_out), aux(start_out:end_out) )
|
||||
END DO
|
||||
!
|
||||
!
|
||||
! Now repeat the test, but doing all the FFTs in a single shot
|
||||
scatter_in_d(1:vsiz) = scatter_in_cpy(1:vsiz)
|
||||
!
|
||||
CALL fft_scatter_many_yz_gpu ( dfft, scatter_out_d, scatter_in_d, vsiz, -2, howmany )
|
||||
|
||||
DO i=0,howmany-1
|
||||
! Extract data from GPU. Data are spaced by nstick_zx*n3x
|
||||
start_out = 1 + i*nstick_zx*n3x
|
||||
end_out = (i+1)*nstick_zx*n3x
|
||||
aux(start_out:end_out) = scatter_out_d(start_out:end_out)
|
||||
!
|
||||
start_in = i*dfft%nnr + 1
|
||||
end_in = i*dfft%nnr + dfft%nsw(mp%me+1)*n3x !nstick_zx*nx3!
|
||||
!
|
||||
! Extract only data tofft_scatter_yz compare the two methods. Results from the
|
||||
! previous call to fft_scatter_yz are separated by nnr, while the
|
||||
! new results are separated by nstick_zx*n3x. We read start_out
|
||||
! and add from there dfft%nsw(mp%me+1)*n3x to be compared (don't forget the -1!)
|
||||
start_sl = start_out
|
||||
end_sl = (start_out - 1) + dfft%nsw(mp%me+1)*n3x
|
||||
CALL test%assert_close( aux(start_sl:end_sl), scatter_out(start_in:end_in) )
|
||||
END DO
|
||||
!
|
||||
CALL fft_desc_finalize(dfft, smap)
|
||||
DEALLOCATE(scatter_in, scatter_in_cpy, scatter_out, aux, scatter_in_d, scatter_out_d)
|
||||
!
|
||||
END SUBROUTINE test_fft_scatter_many_yz_gpu_1
|
||||
|
||||
end program test_fft_scatter_mod_gpu
|
||||
! dummy subroutines
|
||||
subroutine stop_clock( label )
|
||||
character(len=*) :: label
|
||||
end subroutine stop_clock
|
||||
subroutine start_clock( label )
|
||||
character(len=*) :: label
|
||||
end subroutine start_clock
|
||||
!
|
||||
#else
|
||||
program test_fft_scatter_mod_gpu
|
||||
end program test_fft_scatter_mod_gpu
|
||||
#endif
|
|
@ -0,0 +1,424 @@
|
|||
#if defined(__CUDA)
|
||||
program test_fwinv_gpu
|
||||
#if defined(__MPI)
|
||||
USE mpi
|
||||
#endif
|
||||
USE tester
|
||||
IMPLICIT NONE
|
||||
! MPI type
|
||||
type mpi_t
|
||||
integer :: me, n, root, comm
|
||||
end type mpi_t
|
||||
TYPE(mpi_t) :: mp
|
||||
!
|
||||
TYPE(tester_t) :: test
|
||||
!
|
||||
INTEGER :: ierr, level, i
|
||||
!
|
||||
#if defined(__MPI)
|
||||
#if defined(_OPENMP)
|
||||
CALL MPI_Init_thread(MPI_THREAD_FUNNELED,level, ierr)
|
||||
#else
|
||||
CALL MPI_Init(ierr)
|
||||
#endif
|
||||
#endif
|
||||
!
|
||||
CALL mpi_data_init(mp%me, mp%n, mp%root, mp%comm)
|
||||
!
|
||||
CALL test%init()
|
||||
!
|
||||
test%tolerance64 = 1.d-12
|
||||
!
|
||||
CALL save_random_seed("test_fwinv_gpu", mp%me)
|
||||
!
|
||||
DO i = 1, mp%n
|
||||
IF (MOD(mp%n,i) == 0 ) THEN
|
||||
CALL test_fwfft_gpu_1(mp, test, .true., i)
|
||||
CALL test_fwfft_gpu_1(mp, test, .false., i)
|
||||
!
|
||||
CALL test_invfft_gpu_1(mp, test, .true., i)
|
||||
CALL test_invfft_gpu_1(mp, test, .false., i)
|
||||
END IF
|
||||
END DO
|
||||
CALL test_fwfft_many_gpu_1(mp, test, .true., 1)
|
||||
CALL test_fwfft_many_gpu_1(mp, test, .false., 1)
|
||||
!
|
||||
CALL test_invfft_many_gpu_1(mp, test, .true., 1)
|
||||
CALL test_invfft_many_gpu_1(mp, test, .false., 1)
|
||||
!
|
||||
CALL collect_results(test)
|
||||
!
|
||||
IF (mp%me == mp%root) CALL test%print()
|
||||
!
|
||||
#if defined(__MPI)
|
||||
CALL MPI_Finalize(ierr)
|
||||
#endif
|
||||
CONTAINS
|
||||
!
|
||||
SUBROUTINE mpi_data_init(mpme, npes, mproot, comm)
|
||||
implicit none
|
||||
integer, intent(out) :: mpme, npes, mproot, comm
|
||||
integer :: ierr
|
||||
mpme=0; npes=1; mproot=0; comm=0
|
||||
#if defined(__MPI)
|
||||
CALL mpi_comm_rank(MPI_COMM_WORLD, mpme, ierr)
|
||||
CALL mpi_comm_size(MPI_COMM_WORLD, npes, ierr)
|
||||
comm = MPI_COMM_WORLD
|
||||
#endif
|
||||
END SUBROUTINE mpi_data_init
|
||||
!
|
||||
SUBROUTINE calc_bg(at, bg)
|
||||
USE fft_param, ONLY : DP
|
||||
implicit none
|
||||
REAL(DP), PARAMETER :: pi=4.D0*DATAN(1.D0)
|
||||
REAL(DP), INTENT(IN) :: at(3,3)
|
||||
REAL(DP), INTENT(OUT) :: bg(3,3)
|
||||
REAL(DP) :: ucvol
|
||||
!
|
||||
ucvol=at(1,1)*(at(2,2)*at(3,3)-at(3,2)*at(2,3))+&
|
||||
& at(2,1)*(at(3,2)*at(1,2)-at(1,2)*at(3,3))+&
|
||||
at(3,1)*(at(1,2)*at(2,3)-at(2,2)*at(1,3))
|
||||
|
||||
!calculate reciprocal-space lattice vectors
|
||||
bg(1,1)=2.0*pi*(at(2,2)*at(3,3)-at(3,2)*at(2,3))/ucvol
|
||||
bg(2,1)=2.0*pi*(at(3,2)*at(1,3)-at(1,2)*at(3,3))/ucvol
|
||||
bg(3,1)=2.0*pi*(at(1,2)*at(2,3)-at(2,2)*at(1,3))/ucvol
|
||||
bg(1,2)=2.0*pi*(at(2,3)*at(3,1)-at(3,3)*at(2,1))/ucvol
|
||||
bg(2,2)=2.0*pi*(at(3,3)*at(1,1)-at(1,3)*at(3,1))/ucvol
|
||||
bg(3,2)=2.0*pi*(at(1,3)*at(2,1)-at(2,3)*at(1,1))/ucvol
|
||||
bg(1,3)=2.0*pi*(at(2,1)*at(3,2)-at(3,1)*at(2,2))/ucvol
|
||||
bg(2,3)=2.0*pi*(at(3,1)*at(1,2)-at(1,1)*at(3,2))/ucvol
|
||||
bg(3,3)=2.0*pi*(at(1,1)*at(2,2)-at(2,1)*at(1,2))/ucvol
|
||||
END SUBROUTINE calc_bg
|
||||
!
|
||||
SUBROUTINE fft_desc_init(dfft, smap, flavor, gamma_only, parallel, comm, nyfft)
|
||||
USE stick_base
|
||||
USE fft_types, ONLY : fft_type_descriptor, fft_type_init
|
||||
USE fft_param, ONLY : DP
|
||||
implicit none
|
||||
TYPE(fft_type_descriptor) :: dfft
|
||||
TYPE(sticks_map) :: smap
|
||||
CHARACTER(LEN=*), INTENT(IN) :: flavor
|
||||
LOGICAL :: gamma_only
|
||||
LOGICAL :: parallel
|
||||
INTEGER :: comm, nyfft
|
||||
!
|
||||
REAL(DP) :: at(3,3), bg(3,3)
|
||||
!
|
||||
at = RESHAPE((/10.d0, 0.d0, 0.d0, 0.d0, 10.d0, 0.d0, 0.d0, 0.d0, 10.d0/), shape(at))
|
||||
CALL calc_bg(at, bg)
|
||||
!
|
||||
CALL fft_type_init(dfft, smap, flavor, gamma_only, parallel, comm, at, bg, 12.d0, 6.d0, nyfft=nyfft)
|
||||
!
|
||||
END SUBROUTINE fft_desc_init
|
||||
|
||||
SUBROUTINE fft_desc_finalize(dfft, smap)
|
||||
USE fft_types, ONLY : fft_type_descriptor, fft_type_deallocate
|
||||
USE stick_base, ONLY : sticks_map, sticks_map_deallocate
|
||||
implicit none
|
||||
TYPE(fft_type_descriptor) :: dfft
|
||||
TYPE(sticks_map) :: smap
|
||||
!
|
||||
CALL fft_type_deallocate(dfft)
|
||||
CALL sticks_map_deallocate( smap )
|
||||
END SUBROUTINE fft_desc_finalize
|
||||
!
|
||||
SUBROUTINE fill_random(c, c_d, n)
|
||||
USE cudafor
|
||||
USE fft_param, ONLY : DP
|
||||
implicit none
|
||||
complex(DP), device :: c_d(:)
|
||||
complex(DP) :: c(:)
|
||||
integer, intent(in) :: n
|
||||
!
|
||||
real(DP), ALLOCATABLE :: rnd_aux(:)
|
||||
!
|
||||
ALLOCATE (rnd_aux(2*n))
|
||||
CALL RANDOM_NUMBER(rnd_aux)
|
||||
c = CMPLX(rnd_aux(1:n), rnd_aux(n:2*n))
|
||||
c_d = c
|
||||
DEALLOCATE(rnd_aux)
|
||||
END SUBROUTINE fill_random
|
||||
!
|
||||
SUBROUTINE test_fwfft_gpu_1(mp, test, gamma_only, ny)
|
||||
USE cudafor
|
||||
USE fft_param, ONLY : DP
|
||||
USE fft_types, ONLY : fft_type_descriptor
|
||||
USE stick_base, ONLY : sticks_map
|
||||
USE fft_interfaces, ONLY : fwfft
|
||||
implicit none
|
||||
TYPE(mpi_t) :: mp
|
||||
TYPE(tester_t) :: test
|
||||
!
|
||||
TYPE(fft_type_descriptor) :: dfft
|
||||
TYPE(sticks_map) :: smap
|
||||
LOGICAL, INTENT(IN) :: gamma_only
|
||||
INTEGER, INTENT(IN) :: ny
|
||||
!
|
||||
LOGICAL :: parallel
|
||||
COMPLEX(DP), ALLOCATABLE :: data_in(:), aux(:)
|
||||
COMPLEX(DP), ALLOCATABLE, DEVICE :: data_in_d(:)
|
||||
INTEGER :: i
|
||||
!
|
||||
parallel = mp%n .gt. 1
|
||||
CALL fft_desc_init(dfft, smap, 'wave', gamma_only, parallel, mp%comm, nyfft=ny)
|
||||
dfft%rho_clock_label='bla' ; dfft%wave_clock_label='bla'
|
||||
!
|
||||
! Test 1
|
||||
!
|
||||
IF ( ny .gt. 1 ) THEN
|
||||
! Allocate variables
|
||||
ALLOCATE(data_in(dfft%nnr_tg), aux(dfft%nnr_tg))
|
||||
ALLOCATE(data_in_d(dfft%nnr_tg))
|
||||
CALL fill_random(data_in, data_in_d, dfft%nnr_tg)
|
||||
!
|
||||
CALL fwfft( 'tgWave' , data_in, dfft, 1 )
|
||||
CALL fwfft( 'tgWave' , data_in_d, dfft, 1 )
|
||||
ELSE
|
||||
ALLOCATE(data_in(dfft%nnr), aux(dfft%nnr))
|
||||
ALLOCATE(data_in_d(dfft%nnr))
|
||||
CALL fill_random(data_in, data_in_d, dfft%nnr)
|
||||
!
|
||||
CALL fwfft( 'Wave' , data_in, dfft, 1 )
|
||||
CALL fwfft( 'Wave' , data_in_d, dfft, 1 )
|
||||
ENDIF
|
||||
aux = data_in_d
|
||||
! Check
|
||||
CALL test%assert_close( data_in(1:dfft%ngw), aux(1:dfft%ngw) )
|
||||
!
|
||||
! Test 2
|
||||
!
|
||||
DEALLOCATE(data_in, data_in_d, aux)
|
||||
ALLOCATE(data_in(dfft%nnr), aux(dfft%nnr))
|
||||
ALLOCATE(data_in_d(dfft%nnr))
|
||||
CALL fill_random(data_in, data_in_d, dfft%nnr)
|
||||
!
|
||||
CALL fwfft( 'Rho' , data_in, dfft, 1 )
|
||||
CALL fwfft( 'Rho' , data_in_d, dfft, 1 )
|
||||
aux = data_in_d
|
||||
! Check
|
||||
CALL test%assert_close( data_in(1:dfft%ngm), aux(1:dfft%ngm) )
|
||||
!
|
||||
CALL fft_desc_finalize(dfft, smap)
|
||||
DEALLOCATE(data_in, data_in_d, aux)
|
||||
!
|
||||
END SUBROUTINE test_fwfft_gpu_1
|
||||
!
|
||||
SUBROUTINE test_invfft_gpu_1(mp, test, gamma_only, ny)
|
||||
USE cudafor
|
||||
USE fft_param, ONLY : DP
|
||||
USE fft_types, ONLY : fft_type_descriptor
|
||||
USE stick_base, ONLY : sticks_map
|
||||
USE fft_interfaces, ONLY : invfft
|
||||
implicit none
|
||||
TYPE(mpi_t) :: mp
|
||||
TYPE(tester_t) :: test
|
||||
!
|
||||
TYPE(fft_type_descriptor) :: dfft
|
||||
TYPE(sticks_map) :: smap
|
||||
LOGICAL, INTENT(IN) :: gamma_only
|
||||
INTEGER, INTENT(IN) :: ny
|
||||
!
|
||||
LOGICAL :: parallel
|
||||
COMPLEX(DP), ALLOCATABLE :: data_in(:), aux(:)
|
||||
COMPLEX(DP), ALLOCATABLE, DEVICE :: data_in_d(:)
|
||||
!
|
||||
parallel = mp%n .gt. 1
|
||||
CALL fft_desc_init(dfft, smap, 'wave', gamma_only, parallel, mp%comm, nyfft=ny)
|
||||
dfft%rho_clock_label='bla' ; dfft%wave_clock_label='bla'
|
||||
!
|
||||
! Test 1
|
||||
!
|
||||
IF ( ny .gt. 1 ) THEN
|
||||
! Allocate variables
|
||||
ALLOCATE(data_in(dfft%nnr_tg), aux(dfft%nnr_tg))
|
||||
ALLOCATE(data_in_d(dfft%nnr_tg))
|
||||
CALL fill_random(data_in, data_in_d, dfft%nnr_tg)
|
||||
!
|
||||
CALL invfft( 'tgWave' , data_in, dfft, 1 )
|
||||
CALL invfft( 'tgWave' , data_in_d, dfft, 1 )
|
||||
ELSE
|
||||
!
|
||||
! Allocate variables
|
||||
ALLOCATE(data_in(dfft%nnr), aux(dfft%nnr))
|
||||
ALLOCATE(data_in_d(dfft%nnr))
|
||||
CALL fill_random(data_in, data_in_d, dfft%nnr)
|
||||
!
|
||||
CALL invfft( 'Wave' , data_in, dfft, 1 )
|
||||
CALL invfft( 'Wave' , data_in_d, dfft, 1 )
|
||||
ENDIF
|
||||
aux = data_in_d
|
||||
! Check
|
||||
CALL test%assert_close( data_in, aux )
|
||||
!
|
||||
! Test 2
|
||||
!
|
||||
DEALLOCATE(data_in, data_in_d, aux)
|
||||
ALLOCATE(data_in(dfft%nnr), aux(dfft%nnr))
|
||||
ALLOCATE(data_in_d(dfft%nnr))
|
||||
CALL fill_random(data_in, data_in_d, dfft%nnr)
|
||||
!
|
||||
CALL invfft( 'Rho' , data_in, dfft, 1 )
|
||||
CALL invfft( 'Rho' , data_in_d, dfft, 1 )
|
||||
aux = data_in_d
|
||||
! Check
|
||||
CALL test%assert_close( data_in, aux )
|
||||
!
|
||||
CALL fft_desc_finalize(dfft, smap)
|
||||
DEALLOCATE(data_in, data_in_d, aux)
|
||||
!
|
||||
END SUBROUTINE test_invfft_gpu_1
|
||||
!
|
||||
SUBROUTINE test_fwfft_many_gpu_1(mp, test, gamma_only, ny)
|
||||
USE cudafor
|
||||
USE fft_param, ONLY : DP
|
||||
USE fft_types, ONLY : fft_type_descriptor
|
||||
USE stick_base, ONLY : sticks_map
|
||||
USE fft_interfaces, ONLY : fwfft
|
||||
implicit none
|
||||
TYPE(mpi_t) :: mp
|
||||
TYPE(tester_t) :: test
|
||||
!
|
||||
TYPE(fft_type_descriptor) :: dfft
|
||||
TYPE(sticks_map) :: smap
|
||||
LOGICAL, INTENT(IN) :: gamma_only
|
||||
INTEGER, INTENT(IN) :: ny
|
||||
!
|
||||
LOGICAL :: parallel
|
||||
COMPLEX(DP), ALLOCATABLE :: data_in(:), aux(:)
|
||||
COMPLEX(DP), ALLOCATABLE, DEVICE :: data_in_d(:)
|
||||
integer, parameter :: howmany=4
|
||||
INTEGER :: i, start
|
||||
!
|
||||
parallel = mp%n .gt. 1
|
||||
CALL fft_desc_init(dfft, smap, 'wave', gamma_only, parallel, mp%comm, nyfft=ny)
|
||||
dfft%rho_clock_label='bla' ; dfft%wave_clock_label='bla'
|
||||
!
|
||||
! Test 1
|
||||
!
|
||||
IF ( ny .gt. 1 ) THEN
|
||||
! Not (yet?) possible
|
||||
RETURN
|
||||
ELSE
|
||||
ALLOCATE(data_in(dfft%nnr*howmany), aux(dfft%nnr*howmany))
|
||||
ALLOCATE(data_in_d(dfft%nnr*howmany))
|
||||
CALL fill_random(data_in, data_in_d, dfft%nnr*howmany)
|
||||
!
|
||||
CALL fwfft( 'Wave' , data_in_d, dfft, howmany=howmany)
|
||||
!
|
||||
DO i=0,howmany-1
|
||||
start = i*dfft%nnr
|
||||
CALL fwfft( 'Wave' , data_in(1+start:), dfft, 1 )
|
||||
aux(start+1:start+dfft%nnr) = data_in_d(start+1:start+dfft%nnr)
|
||||
! Check
|
||||
CALL test%assert_close( data_in(start+1:start+dfft%ngw), aux(start+1:start+dfft%ngw) )
|
||||
END DO
|
||||
!
|
||||
ENDIF
|
||||
!
|
||||
! Test 2
|
||||
!
|
||||
DEALLOCATE(data_in, data_in_d, aux)
|
||||
ALLOCATE(data_in(dfft%nnr*howmany), aux(dfft%nnr))
|
||||
ALLOCATE(data_in_d(dfft%nnr*howmany))
|
||||
!
|
||||
CALL fill_random(data_in, data_in_d, dfft%nnr*howmany)
|
||||
!
|
||||
CALL fwfft( 'Rho' , data_in_d, dfft, howmany)
|
||||
DO i=0,howmany-1
|
||||
start = i*dfft%nnr
|
||||
CALL fwfft( 'Rho' , data_in(1+start:), dfft, 1 )
|
||||
aux(1:dfft%nnr) = data_in_d(start+1:start+dfft%nnr)
|
||||
! Check
|
||||
CALL test%assert_close( data_in(start+1:start+dfft%ngm), aux(1:dfft%ngm) )
|
||||
END DO
|
||||
!
|
||||
CALL fft_desc_finalize(dfft, smap)
|
||||
DEALLOCATE(data_in, data_in_d, aux)
|
||||
!
|
||||
END SUBROUTINE test_fwfft_many_gpu_1
|
||||
!
|
||||
SUBROUTINE test_invfft_many_gpu_1(mp, test, gamma_only, ny)
|
||||
USE cudafor
|
||||
USE fft_param, ONLY : DP
|
||||
USE fft_types, ONLY : fft_type_descriptor
|
||||
USE stick_base, ONLY : sticks_map
|
||||
USE fft_interfaces, ONLY : invfft
|
||||
implicit none
|
||||
TYPE(mpi_t) :: mp
|
||||
TYPE(tester_t) :: test
|
||||
!
|
||||
TYPE(fft_type_descriptor) :: dfft
|
||||
TYPE(sticks_map) :: smap
|
||||
LOGICAL, INTENT(IN) :: gamma_only
|
||||
INTEGER, INTENT(IN) :: ny
|
||||
!
|
||||
LOGICAL :: parallel
|
||||
COMPLEX(DP), ALLOCATABLE :: data_in(:), aux(:)
|
||||
COMPLEX(DP), ALLOCATABLE, DEVICE :: data_in_d(:)
|
||||
INTEGER(kind = cuda_stream_kind) :: strm = 0
|
||||
integer, parameter :: howmany=4
|
||||
integer :: start, i
|
||||
!
|
||||
parallel = mp%n .gt. 1
|
||||
CALL fft_desc_init(dfft, smap, 'wave', gamma_only, parallel, mp%comm, nyfft=ny)
|
||||
dfft%rho_clock_label='bla' ; dfft%wave_clock_label='bla'
|
||||
!
|
||||
! Test 1
|
||||
!
|
||||
IF ( ny .gt. 1 ) THEN
|
||||
! Not (yet?) possible
|
||||
RETURN
|
||||
ELSE
|
||||
!
|
||||
! Allocate variables
|
||||
ALLOCATE(data_in(howmany*dfft%nnr), aux(howmany*dfft%nnr))
|
||||
ALLOCATE(data_in_d(howmany*dfft%nnr))
|
||||
CALL fill_random(data_in, data_in_d, howmany*dfft%nnr)
|
||||
!
|
||||
!CALL invfft( 'Wave' , data_in, dfft, 1 )
|
||||
CALL invfft( 'Wave' , data_in_d, dfft, howmany=howmany, stream=strm )
|
||||
DO i=0,howmany-1
|
||||
start = i*dfft%nnr
|
||||
CALL invfft( 'Wave' , data_in(1+start:), dfft, 1 )
|
||||
aux(start+1:start+dfft%nnr) = data_in_d(start+1:start+dfft%nnr)
|
||||
! Check
|
||||
CALL test%assert_close( data_in(start+1:start+dfft%nnr), aux(start+1:start+dfft%nnr) )
|
||||
END DO
|
||||
ENDIF
|
||||
!
|
||||
! Test 2
|
||||
!
|
||||
DEALLOCATE(data_in, data_in_d, aux)
|
||||
ALLOCATE(data_in(dfft%nnr*howmany), aux(dfft%nnr))
|
||||
ALLOCATE(data_in_d(dfft%nnr*howmany))
|
||||
CALL fill_random(data_in, data_in_d, dfft%nnr*howmany)
|
||||
!
|
||||
CALL invfft( 'Rho' , data_in_d, dfft, howmany )
|
||||
!
|
||||
DO i=0,howmany-1
|
||||
start = i*dfft%nnr
|
||||
CALL invfft( 'Rho' , data_in(1+start:), dfft, 1 )
|
||||
aux(1:dfft%nnr) = data_in_d(start+1:start+dfft%nnr)
|
||||
! Check
|
||||
CALL test%assert_close( data_in(start+1:start+dfft%nnr), aux(1:dfft%nnr) )
|
||||
END DO
|
||||
!
|
||||
CALL fft_desc_finalize(dfft, smap)
|
||||
DEALLOCATE(data_in, data_in_d, aux)
|
||||
!
|
||||
END SUBROUTINE test_invfft_many_gpu_1
|
||||
|
||||
end program test_fwinv_gpu
|
||||
! dummy subroutines
|
||||
subroutine stop_clock( label )
|
||||
character(len=*) :: label
|
||||
end subroutine stop_clock
|
||||
subroutine start_clock( label )
|
||||
character(len=*) :: label
|
||||
end subroutine start_clock
|
||||
!
|
||||
#else
|
||||
program test_fwinv_gpu
|
||||
end program test_fwinv_gpu
|
||||
#endif
|
|
@ -0,0 +1,862 @@
|
|||
! This file is part of fortran_tester
|
||||
! Copyright 2015 Pierre de Buyl and Peter Colberg
|
||||
! 2016 Pierre de Buyl and Stefano Szaghi
|
||||
! 2018 Pierre de Buyl and Pietro Bonfa
|
||||
! License: BSD
|
||||
|
||||
!> Routines to test Fortran programs
|
||||
!!
|
||||
!! fortran_tester is a pure-Fortran module. It provides a datatype to hold test results and
|
||||
!! routines to test for equality, closeness, and positivity of variables. The routines are
|
||||
!! overloaded and the resulting interface consists of a small number of names.
|
||||
|
||||
module tester
|
||||
use, intrinsic :: iso_fortran_env, only : int8, int16, int32, int64, real32, real64
|
||||
|
||||
implicit none
|
||||
private
|
||||
public :: tester_t
|
||||
|
||||
!> The main **tester** class.
|
||||
type :: tester_t
|
||||
integer(int32) :: n_errors=0_int32 !< Number of errors.
|
||||
integer(int32) :: n_tests=0_int32 !< Number of tests.
|
||||
real(real32) :: tolerance32=2._real32*epsilon(1._real32) !< Real tolerance, 32 bits.
|
||||
real(real64) :: tolerance64=2._real64*epsilon(1._real64) !< Real tolerance, 64 bits.
|
||||
contains
|
||||
procedure :: init !< Initialize the tester.
|
||||
procedure :: print !< Print tests results.
|
||||
generic, public :: assert_equal => &
|
||||
assert_equal_i8, &
|
||||
assert_equal_i16, &
|
||||
assert_equal_i32, &
|
||||
assert_equal_i64, &
|
||||
assert_equal_r32, &
|
||||
assert_equal_r64, &
|
||||
assert_equal_c32, &
|
||||
assert_equal_c64, &
|
||||
assert_equal_l, &
|
||||
assert_equal_i8_1, &
|
||||
assert_equal_i16_1, &
|
||||
assert_equal_i32_1, &
|
||||
assert_equal_i64_1, &
|
||||
assert_equal_r32_1, &
|
||||
assert_equal_r64_1, &
|
||||
assert_equal_c32_1, &
|
||||
assert_equal_c64_1, &
|
||||
assert_equal_l_1 !< Check if two values (integer, real, complex or logical) are equal.
|
||||
procedure, private :: assert_equal_i8 !< Check if two integers (8 bits) are equal.
|
||||
procedure, private :: assert_equal_i16 !< Check if two integers (16 bits) are equal.
|
||||
procedure, private :: assert_equal_i32 !< Check if two integers (32 bits) are equal.
|
||||
procedure, private :: assert_equal_i64 !< Check if two integers (64 bits) are equal.
|
||||
procedure, private :: assert_equal_r32 !< Check if two reals (32 bits) are equal.
|
||||
procedure, private :: assert_equal_r64 !< Check if two reals (64 bits) are equal.
|
||||
procedure, private :: assert_equal_c32 !< Check if two complex numbers (32 bits) are equal.
|
||||
procedure, private :: assert_equal_c64 !< Check if two complex numbers (64 bits) are equal.
|
||||
procedure, private :: assert_equal_l !< Check if two logicals are equal.
|
||||
procedure, private :: assert_equal_i8_1 !< Check if two integer (8 bits) arrays (rank 1) are equal.
|
||||
procedure, private :: assert_equal_i16_1 !< Check if two integer (16 bits) arrays (rank 1) are equal.
|
||||
procedure, private :: assert_equal_i32_1 !< Check if two integer (32 bits) arrays (rank 1) are equal.
|
||||
procedure, private :: assert_equal_i64_1 !< Check if two integer (64 bits) arrays (rank 1) are equal.
|
||||
procedure, private :: assert_equal_r32_1 !< Check if two real (32 bits) arrays (rank 1) are equal.
|
||||
procedure, private :: assert_equal_r64_1 !< Check if two real (64 bits) arrays (rank 1) are equal.
|
||||
procedure, private :: assert_equal_c32_1 !< Check if two complex (32 bits) arrays (rank 1) are equal.
|
||||
procedure, private :: assert_equal_c64_1 !< Check if two complex (64 bits) arrays (rank 1) are equal.
|
||||
procedure, private :: assert_equal_l_1 !< Check if two logical arrays (rank 1) are equal.
|
||||
generic, public :: assert_positive => &
|
||||
assert_positive_i8, &
|
||||
assert_positive_i16, &
|
||||
assert_positive_i32, &
|
||||
assert_positive_i64, &
|
||||
assert_positive_r32, &
|
||||
assert_positive_r64, &
|
||||
assert_positive_i8_1, &
|
||||
assert_positive_i16_1, &
|
||||
assert_positive_i32_1, &
|
||||
assert_positive_i64_1, &
|
||||
assert_positive_r32_1, &
|
||||
assert_positive_r64_1 !< Check if a number (integer or real) is positive.
|
||||
procedure, private :: assert_positive_i8 !< Check if a integer (8 bits) is positive.
|
||||
procedure, private :: assert_positive_i16 !< Check if a integer (16 bits) is positive.
|
||||
procedure, private :: assert_positive_i32 !< Check if a integer (32 bits) is positive.
|
||||
procedure, private :: assert_positive_i64 !< Check if a integer (64 bits) is positive.
|
||||
procedure, private :: assert_positive_r32 !< Check if a real (32 bits) is positive.
|
||||
procedure, private :: assert_positive_r64 !< Check if a real (64 bits) is positive.
|
||||
procedure, private :: assert_positive_i8_1 !< Check if a integer (8 bits) array (rank 1) is positive.
|
||||
procedure, private :: assert_positive_i16_1 !< Check if a integer (16 bits) array (rank 1) is positive.
|
||||
procedure, private :: assert_positive_i32_1 !< Check if a integer (32 bits) array (rank 1) is positive.
|
||||
procedure, private :: assert_positive_i64_1 !< Check if a integer (64 bits) array (rank 1) is positive.
|
||||
procedure, private :: assert_positive_r32_1 !< Check if a real (32 bits) array (rank 1) is positive.
|
||||
procedure, private :: assert_positive_r64_1 !< Check if a real (64 bits) array (rank 1) is positive.
|
||||
generic, public :: assert_close => &
|
||||
assert_close_r32, &
|
||||
assert_close_r64, &
|
||||
assert_close_c32, &
|
||||
assert_close_c64, &
|
||||
assert_close_r32_1, &
|
||||
assert_close_r64_1, &
|
||||
assert_close_c32_1, &
|
||||
assert_close_c64_1 !< Check if two values (real or complex) are close with respect a tolerance.
|
||||
procedure, private :: assert_close_r32 !< Check if two reals (32 bits) are close with respect a tolerance.
|
||||
procedure, private :: assert_close_r64 !< Check if two reals (64 bits) are close with respect a tolerance.
|
||||
procedure, private :: assert_close_c32 !< Check if two complex numbers (32 bits) are close with respect a tolerance.
|
||||
procedure, private :: assert_close_c64 !< Check if two complex numbers (64 bits) are close with respect a tolerance.
|
||||
procedure, private :: assert_close_r32_1 !< Check if two real (32 bits) arrays (rank 1) are close with respect a tolerance.
|
||||
procedure, private :: assert_close_r64_1 !< Check if two real (64 bits) arrays (rank 1) are close with respect a tolerance.
|
||||
procedure, private :: assert_close_c32_1 !< Check if two complex (32 bits) arrays (rank 1) are close with respect a tolerance.
|
||||
procedure, private :: assert_close_c64_1 !< Check if two complex (64 bits) arrays (rank 1) are close with respect a tolerance.
|
||||
end type tester_t
|
||||
|
||||
contains
|
||||
|
||||
!> Initialize the tester.
|
||||
subroutine init(this, tolerance32, tolerance64)
|
||||
class(tester_t), intent(out) :: this !< The tester.
|
||||
real(real32), intent(in), optional :: tolerance32 !< Real tolerance, 32 bits.
|
||||
real(real64), intent(in), optional :: tolerance64 !< Real tolerance, 64 bits.
|
||||
|
||||
this% n_errors = 0
|
||||
this% n_tests = 0
|
||||
|
||||
if (present(tolerance64)) then
|
||||
this% tolerance64 = tolerance64
|
||||
else
|
||||
this% tolerance64 = 2._real64*epsilon(1._real64)
|
||||
end if
|
||||
|
||||
if (present(tolerance32)) then
|
||||
this% tolerance32 = tolerance32
|
||||
else
|
||||
this% tolerance32 = 2._real32*epsilon(1._real32)
|
||||
end if
|
||||
|
||||
end subroutine init
|
||||
|
||||
!> Print tests results.
|
||||
subroutine print(this, errorstop)
|
||||
class(tester_t), intent(in) :: this !< The tester.
|
||||
logical, intent(in), optional :: errorstop !< Flag to activate error stop if one test fails.
|
||||
|
||||
logical :: do_errorstop
|
||||
if (present(errorstop)) then
|
||||
do_errorstop = errorstop
|
||||
else
|
||||
do_errorstop = .true.
|
||||
end if
|
||||
|
||||
write(*,*) 'fortran_tester:', this% n_errors, ' error(s) for', this% n_tests, 'test(s)'
|
||||
|
||||
if (this% n_errors == 0) then
|
||||
write(*,*) 'fortran_tester: all tests succeeded'
|
||||
else
|
||||
write(*,*) 'fortran_tester: tests failed'
|
||||
if (do_errorstop) then
|
||||
stop 1
|
||||
end if
|
||||
end if
|
||||
|
||||
end subroutine print
|
||||
|
||||
!> Check if two integers (8 bits) are equal.
|
||||
subroutine assert_equal_i8(this, i1, i2, fail)
|
||||
class(tester_t), intent(inout) :: this !< The tester.
|
||||
integer(int8), intent(in) :: i1 !< Value to compare.
|
||||
integer(int8), intent(in) :: i2 !< Value to compare.
|
||||
logical, intent(in), optional :: fail !< Fail flag.
|
||||
|
||||
this% n_tests = this% n_tests + 1
|
||||
if (i1 .ne. i2) then
|
||||
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
|
||||
this% n_errors = this% n_errors + 1
|
||||
end if
|
||||
end if
|
||||
|
||||
end subroutine assert_equal_i8
|
||||
|
||||
!> Check if two integers (16 bits) are equal.
|
||||
subroutine assert_equal_i16(this, i1, i2, fail)
|
||||
class(tester_t), intent(inout) :: this !< The tester.
|
||||
integer(int16), intent(in) :: i1 !< Value to compare.
|
||||
integer(int16), intent(in) :: i2 !< Value to compare.
|
||||
logical, intent(in), optional :: fail !< Fail flag.
|
||||
|
||||
this% n_tests = this% n_tests + 1
|
||||
if (i1 .ne. i2) then
|
||||
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
|
||||
this% n_errors = this% n_errors + 1
|
||||
end if
|
||||
end if
|
||||
|
||||
end subroutine assert_equal_i16
|
||||
|
||||
!> Check if two integers (32 bits) are equal.
|
||||
subroutine assert_equal_i32(this, i1, i2, fail)
|
||||
class(tester_t), intent(inout) :: this !< The tester.
|
||||
integer(int32), intent(in) :: i1 !< Value to compare.
|
||||
integer(int32), intent(in) :: i2 !< Value to compare.
|
||||
logical, intent(in), optional :: fail !< Fail flag.
|
||||
|
||||
this% n_tests = this% n_tests + 1
|
||||
if (i1 .ne. i2) then
|
||||
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
|
||||
this% n_errors = this% n_errors + 1
|
||||
end if
|
||||
end if
|
||||
|
||||
end subroutine assert_equal_i32
|
||||
|
||||
!> Check if two integers (64 bits) are equal.
|
||||
subroutine assert_equal_i64(this, i1, i2, fail)
|
||||
class(tester_t), intent(inout) :: this !< The tester.
|
||||
integer(int64), intent(in) :: i1 !< Value to compare.
|
||||
integer(int64), intent(in) :: i2 !< Value to compare.
|
||||
logical, intent(in), optional :: fail !< Fail flag.
|
||||
|
||||
this% n_tests = this% n_tests + 1
|
||||
if (i1 .ne. i2) then
|
||||
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
|
||||
this% n_errors = this% n_errors + 1
|
||||
end if
|
||||
end if
|
||||
|
||||
end subroutine assert_equal_i64
|
||||
|
||||
!> Check if two reals (32 bits) are equal.
|
||||
subroutine assert_equal_r32(this, r1, r2, fail)
|
||||
class(tester_t), intent(inout) :: this !< The tester.
|
||||
real(real32), intent(in) :: r1 !< Value to compare.
|
||||
real(real32), intent(in) :: r2 !< Value to compare.
|
||||
logical, intent(in), optional :: fail !< Fail flag.
|
||||
|
||||
this% n_tests = this% n_tests + 1
|
||||
if (r1 .ne. r2) then
|
||||
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
|
||||
this% n_errors = this% n_errors + 1
|
||||
end if
|
||||
end if
|
||||
|
||||
end subroutine assert_equal_r32
|
||||
|
||||
!> Check if two reals (64 bits) are equal.
|
||||
subroutine assert_equal_r64(this, r1, r2, fail)
|
||||
class(tester_t), intent(inout) :: this !< The tester.
|
||||
real(real64), intent(in) :: r1 !< Value to compare.
|
||||
real(real64), intent(in) :: r2 !< Value to compare.
|
||||
logical, intent(in), optional :: fail !< Fail flag.
|
||||
|
||||
this% n_tests = this% n_tests + 1
|
||||
if (r1 .ne. r2) then
|
||||
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
|
||||
this% n_errors = this% n_errors + 1
|
||||
end if
|
||||
end if
|
||||
|
||||
end subroutine assert_equal_r64
|
||||
|
||||
!> Check if two complex numbers (32 bits) are equal.
|
||||
subroutine assert_equal_c32(this, c1, c2, fail)
|
||||
class(tester_t), intent(inout) :: this !< The tester.
|
||||
complex(real32), intent(in) :: c1 !< Value to compare.
|
||||
complex(real32), intent(in) :: c2 !< Value to compare.
|
||||
logical, intent(in), optional :: fail !< Fail flag.
|
||||
|
||||
this% n_tests = this% n_tests + 1
|
||||
if (c1 .ne. c2) then
|
||||
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
|
||||
this% n_errors = this% n_errors + 1
|
||||
end if
|
||||
end if
|
||||
|
||||
end subroutine assert_equal_c32
|
||||
|
||||
!> Check if two complex numbers (64 bits) are equal.
|
||||
subroutine assert_equal_c64(this, c1, c2, fail)
|
||||
class(tester_t), intent(inout) :: this !< The tester.
|
||||
complex(real64), intent(in) :: c1 !< Value to compare.
|
||||
complex(real64), intent(in) :: c2 !< Value to compare.
|
||||
logical, intent(in), optional :: fail !< Fail flag.
|
||||
|
||||
this% n_tests = this% n_tests + 1
|
||||
if (c1 .ne. c2) then
|
||||
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
|
||||
this% n_errors = this% n_errors + 1
|
||||
end if
|
||||
end if
|
||||
|
||||
end subroutine assert_equal_c64
|
||||
|
||||
!> Check if two logicals are equal.
|
||||
subroutine assert_equal_l(this, l1, l2, fail)
|
||||
class(tester_t), intent(inout) :: this !< The tester.
|
||||
logical, intent(in) :: l1 !< Value to compare.
|
||||
logical, intent(in) :: l2 !< Value to compare.
|
||||
logical, intent(in), optional :: fail !< Fail flag.
|
||||
|
||||
this% n_tests = this% n_tests + 1
|
||||
if (l1 .neqv. l2) then
|
||||
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
|
||||
this% n_errors = this% n_errors + 1
|
||||
end if
|
||||
end if
|
||||
|
||||
end subroutine assert_equal_l
|
||||
|
||||
!> Check if two integer (8 bits) arrays (rank 1) are equal.
|
||||
subroutine assert_equal_i8_1(this, i1, i2, fail)
|
||||
class(tester_t), intent(inout) :: this !< The tester.
|
||||
integer(int8), dimension(:), intent(in) :: i1 !< Value to compare.
|
||||
integer(int8), dimension(:), intent(in) :: i2 !< Value to compare.
|
||||
logical, intent(in), optional :: fail !< Fail flag.
|
||||
|
||||
this% n_tests = this% n_tests + 1
|
||||
|
||||
if ( size(i1) .ne. size(i2) ) then
|
||||
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
|
||||
this% n_errors = this% n_errors + 1
|
||||
end if
|
||||
else
|
||||
if ( maxval(abs(i1-i2)) > 0 ) then
|
||||
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
|
||||
this% n_errors = this% n_errors + 1
|
||||
end if
|
||||
end if
|
||||
end if
|
||||
|
||||
end subroutine assert_equal_i8_1
|
||||
|
||||
!> Check if two integer (16 bits) arrays (rank 1) are equal.
|
||||
subroutine assert_equal_i16_1(this, i1, i2, fail)
|
||||
class(tester_t), intent(inout) :: this !< The tester.
|
||||
integer(int16), dimension(:), intent(in) :: i1 !< Value to compare.
|
||||
integer(int16), dimension(:), intent(in) :: i2 !< Value to compare.
|
||||
logical, intent(in), optional :: fail !< Fail flag.
|
||||
|
||||
this% n_tests = this% n_tests + 1
|
||||
|
||||
if ( size(i1) .ne. size(i2) ) then
|
||||
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
|
||||
this% n_errors = this% n_errors + 1
|
||||
end if
|
||||
else
|
||||
if ( maxval(abs(i1-i2)) > 0 ) then
|
||||
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
|
||||
this% n_errors = this% n_errors + 1
|
||||
end if
|
||||
end if
|
||||
end if
|
||||
|
||||
end subroutine assert_equal_i16_1
|
||||
|
||||
!> Check if two integer (32 bits) arrays (rank 1) are equal.
|
||||
subroutine assert_equal_i32_1(this, i1, i2, fail)
|
||||
class(tester_t), intent(inout) :: this !< The tester.
|
||||
integer(int32), dimension(:), intent(in) :: i1 !< Value to compare.
|
||||
integer(int32), dimension(:), intent(in) :: i2 !< Value to compare.
|
||||
logical, intent(in), optional :: fail !< Fail flag.
|
||||
|
||||
this% n_tests = this% n_tests + 1
|
||||
|
||||
if ( size(i1) .ne. size(i2) ) then
|
||||
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
|
||||
this% n_errors = this% n_errors + 1
|
||||
end if
|
||||
else
|
||||
if ( maxval(abs(i1-i2)) > 0 ) then
|
||||
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
|
||||
this% n_errors = this% n_errors + 1
|
||||
end if
|
||||
end if
|
||||
end if
|
||||
|
||||
end subroutine assert_equal_i32_1
|
||||
|
||||
!> Check if two integer (64 bits) arrays (rank 1) are equal.
|
||||
subroutine assert_equal_i64_1(this, i1, i2, fail)
|
||||
class(tester_t), intent(inout) :: this !< The tester.
|
||||
integer(int64), dimension(:), intent(in) :: i1 !< Value to compare.
|
||||
integer(int64), dimension(:), intent(in) :: i2 !< Value to compare.
|
||||
logical, intent(in), optional :: fail !< Fail flag.
|
||||
|
||||
this% n_tests = this% n_tests + 1
|
||||
|
||||
if ( size(i1) .ne. size(i2) ) then
|
||||
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
|
||||
this% n_errors = this% n_errors + 1
|
||||
end if
|
||||
else
|
||||
if ( maxval(abs(i1-i2)) > 0 ) then
|
||||
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
|
||||
this% n_errors = this% n_errors + 1
|
||||
end if
|
||||
end if
|
||||
end if
|
||||
|
||||
end subroutine assert_equal_i64_1
|
||||
|
||||
!> Check if two real (32 bits) arrays (rank 1) are equal.
|
||||
subroutine assert_equal_r32_1(this, r1, r2, fail)
|
||||
class(tester_t), intent(inout) :: this !< The tester.
|
||||
real(real32), dimension(:), intent(in) :: r1 !< Value to compare.
|
||||
real(real32), dimension(:), intent(in) :: r2 !< Value to compare.
|
||||
logical, intent(in), optional :: fail !< Fail flag.
|
||||
|
||||
this% n_tests = this% n_tests + 1
|
||||
|
||||
if ( size(r1) .ne. size(r2) ) then
|
||||
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
|
||||
this% n_errors = this% n_errors + 1
|
||||
end if
|
||||
else
|
||||
if ( maxval(abs(r1-r2)) > 0 ) then
|
||||
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
|
||||
this% n_errors = this% n_errors + 1
|
||||
end if
|
||||
end if
|
||||
end if
|
||||
|
||||
end subroutine assert_equal_r32_1
|
||||
|
||||
!> Check if two real (64 bits) arrays (rank 1) are equal.
|
||||
subroutine assert_equal_r64_1(this, r1, r2, fail)
|
||||
class(tester_t), intent(inout) :: this !< The tester.
|
||||
real(real64), dimension(:), intent(in) :: r1 !< Value to compare.
|
||||
real(real64), dimension(:), intent(in) :: r2 !< Value to compare.
|
||||
logical, intent(in), optional :: fail !< Fail flag.
|
||||
|
||||
this% n_tests = this% n_tests + 1
|
||||
|
||||
if ( size(r1) .ne. size(r2) ) then
|
||||
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
|
||||
this% n_errors = this% n_errors + 1
|
||||
end if
|
||||
else
|
||||
if ( maxval(abs(r1-r2)) > 0 ) then
|
||||
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
|
||||
this% n_errors = this% n_errors + 1
|
||||
end if
|
||||
end if
|
||||
end if
|
||||
|
||||
end subroutine assert_equal_r64_1
|
||||
|
||||
!> Check if two complex (32 bits) arrays (rank 1) are equal.
|
||||
subroutine assert_equal_c32_1(this, c1, c2, fail)
|
||||
class(tester_t), intent(inout) :: this !< The tester.
|
||||
complex(real32), dimension(:), intent(in) :: c1 !< Value to compare.
|
||||
complex(real32), dimension(:), intent(in) :: c2 !< Value to compare.
|
||||
logical, intent(in), optional :: fail !< Fail flag.
|
||||
|
||||
this% n_tests = this% n_tests + 1
|
||||
|
||||
if ( size(c1) .ne. size(c2) ) then
|
||||
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
|
||||
this% n_errors = this% n_errors + 1
|
||||
end if
|
||||
else
|
||||
if ( maxval(abs(c1-c2)) > 0 ) then
|
||||
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
|
||||
this% n_errors = this% n_errors + 1
|
||||
end if
|
||||
end if
|
||||
end if
|
||||
|
||||
end subroutine assert_equal_c32_1
|
||||
|
||||
!> Check if two complex (64 bits) arrays (rank 1) are equal.
|
||||
subroutine assert_equal_c64_1(this, c1, c2, fail)
|
||||
class(tester_t), intent(inout) :: this !< The tester.
|
||||
complex(real64), dimension(:), intent(in) :: c1 !< Value to compare.
|
||||
complex(real64), dimension(:), intent(in) :: c2 !< Value to compare.
|
||||
logical, intent(in), optional :: fail !< Fail flag.
|
||||
|
||||
this% n_tests = this% n_tests + 1
|
||||
|
||||
if ( size(c1) .ne. size(c2) ) then
|
||||
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
|
||||
this% n_errors = this% n_errors + 1
|
||||
end if
|
||||
else
|
||||
if ( maxval(abs(c1-c2)) > 0 ) then
|
||||
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
|
||||
this% n_errors = this% n_errors + 1
|
||||
end if
|
||||
end if
|
||||
end if
|
||||
|
||||
end subroutine assert_equal_c64_1
|
||||
|
||||
!> Check if two logical arrays (rank 1) are equal.
|
||||
subroutine assert_equal_l_1(this, l1, l2, fail)
|
||||
class(tester_t), intent(inout) :: this !< The tester.
|
||||
logical, intent(in), dimension(:) :: l1 !< Value to compare.
|
||||
logical, intent(in), dimension(:) :: l2 !< Value to compare.
|
||||
logical, intent(in), optional :: fail !< Fail flag.
|
||||
|
||||
integer :: k
|
||||
|
||||
this% n_tests = this% n_tests + 1
|
||||
|
||||
if ( size(l1) .ne. size(l2) ) then
|
||||
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
|
||||
this% n_errors = this% n_errors + 1
|
||||
end if
|
||||
else
|
||||
do k = 1, size(l1)
|
||||
if (l1(k) .neqv. l2(k)) then
|
||||
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
|
||||
this% n_errors = this% n_errors + 1
|
||||
end if
|
||||
exit
|
||||
end if
|
||||
end do
|
||||
end if
|
||||
|
||||
end subroutine assert_equal_l_1
|
||||
|
||||
!> Check if a integer (32 bits) is positive.
|
||||
subroutine assert_positive_i8(this, i, fail)
|
||||
class(tester_t), intent(inout) :: this !< The tester.
|
||||
integer(int8), intent(in) :: i !< Value to check.
|
||||
logical, intent(in), optional :: fail !< Fail flag.
|
||||
|
||||
this% n_tests = this% n_tests + 1
|
||||
if (i < 0) then
|
||||
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
|
||||
this% n_errors = this% n_errors + 1
|
||||
end if
|
||||
end if
|
||||
|
||||
end subroutine assert_positive_i8
|
||||
|
||||
!> Check if a integer (16 bits) is positive.
|
||||
subroutine assert_positive_i16(this, i, fail)
|
||||
class(tester_t), intent(inout) :: this !< The tester.
|
||||
integer(int16), intent(in) :: i !< Value to check.
|
||||
logical, intent(in), optional :: fail !< Fail flag.
|
||||
|
||||
this% n_tests = this% n_tests + 1
|
||||
if (i < 0) then
|
||||
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
|
||||
this% n_errors = this% n_errors + 1
|
||||
end if
|
||||
end if
|
||||
|
||||
end subroutine assert_positive_i16
|
||||
|
||||
!> Check if a integer (32 bits) is positive.
|
||||
subroutine assert_positive_i32(this, i, fail)
|
||||
class(tester_t), intent(inout) :: this !< The tester.
|
||||
integer(int32), intent(in) :: i !< Value to check.
|
||||
logical, intent(in), optional :: fail !< Fail flag.
|
||||
|
||||
this% n_tests = this% n_tests + 1
|
||||
if (i < 0) then
|
||||
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
|
||||
this% n_errors = this% n_errors + 1
|
||||
end if
|
||||
end if
|
||||
|
||||
end subroutine assert_positive_i32
|
||||
|
||||
!> Check if a integer (32 bits) is positive.
|
||||
subroutine assert_positive_i64(this, i, fail)
|
||||
class(tester_t), intent(inout) :: this !< The tester.
|
||||
integer(int64), intent(in) :: i !< Value to check.
|
||||
logical, intent(in), optional :: fail !< Fail flag.
|
||||
|
||||
this% n_tests = this% n_tests + 1
|
||||
if (i < 0) then
|
||||
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
|
||||
this% n_errors = this% n_errors + 1
|
||||
end if
|
||||
end if
|
||||
|
||||
end subroutine assert_positive_i64
|
||||
|
||||
!> Check if a real (32 bits) is positive.
|
||||
subroutine assert_positive_r32(this, r, fail)
|
||||
class(tester_t), intent(inout) :: this !< The tester.
|
||||
real(real32), intent(in) :: r !< Value to check.
|
||||
logical, intent(in), optional :: fail !< Fail flag.
|
||||
|
||||
this% n_tests = this% n_tests + 1
|
||||
if (r < 0) then
|
||||
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
|
||||
this% n_errors = this% n_errors + 1
|
||||
end if
|
||||
end if
|
||||
|
||||
end subroutine assert_positive_r32
|
||||
|
||||
!> Check if a real (64 bits) is positive.
|
||||
subroutine assert_positive_r64(this, r, fail)
|
||||
class(tester_t), intent(inout) :: this !< The tester.
|
||||
real(real64), intent(in) :: r !< Value to check.
|
||||
logical, intent(in), optional :: fail !< Fail flag.
|
||||
|
||||
this% n_tests = this% n_tests + 1
|
||||
if (r < 0) then
|
||||
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
|
||||
this% n_errors = this% n_errors + 1
|
||||
end if
|
||||
end if
|
||||
|
||||
end subroutine assert_positive_r64
|
||||
|
||||
!> Check if a integer (8 bits) array (rank 1) is positive.
|
||||
subroutine assert_positive_i8_1(this, i, fail)
|
||||
class(tester_t), intent(inout) :: this !< The tester.
|
||||
integer(int8), dimension(:), intent(in) :: i !< Value to check.
|
||||
logical, intent(in), optional :: fail !< Fail flag.
|
||||
|
||||
this% n_tests = this% n_tests + 1
|
||||
|
||||
if ( minval(i) < 0 ) then
|
||||
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
|
||||
this% n_errors = this% n_errors + 1
|
||||
end if
|
||||
end if
|
||||
|
||||
end subroutine assert_positive_i8_1
|
||||
|
||||
!> Check if a integer (16 bits) array (rank 1) is positive.
|
||||
subroutine assert_positive_i16_1(this, i, fail)
|
||||
class(tester_t), intent(inout) :: this !< The tester.
|
||||
integer(int16), dimension(:), intent(in) :: i !< Value to check.
|
||||
logical, intent(in), optional :: fail !< Fail flag.
|
||||
|
||||
this% n_tests = this% n_tests + 1
|
||||
|
||||
if ( minval(i) < 0 ) then
|
||||
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
|
||||
this% n_errors = this% n_errors + 1
|
||||
end if
|
||||
end if
|
||||
|
||||
end subroutine assert_positive_i16_1
|
||||
|
||||
!> Check if a integer (32 bits) array (rank 1) is positive.
|
||||
subroutine assert_positive_i32_1(this, i, fail)
|
||||
class(tester_t), intent(inout) :: this !< The tester.
|
||||
integer(int32), dimension(:), intent(in) :: i !< Value to check.
|
||||
logical, intent(in), optional :: fail !< Fail flag.
|
||||
|
||||
this% n_tests = this% n_tests + 1
|
||||
|
||||
if ( minval(i) < 0 ) then
|
||||
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
|
||||
this% n_errors = this% n_errors + 1
|
||||
end if
|
||||
end if
|
||||
|
||||
end subroutine assert_positive_i32_1
|
||||
|
||||
!> Check if a integer (64 bits) array (rank 1) is positive.
|
||||
subroutine assert_positive_i64_1(this, i, fail)
|
||||
class(tester_t), intent(inout) :: this !< The tester.
|
||||
integer(int64), dimension(:), intent(in) :: i !< Value to check.
|
||||
logical, intent(in), optional :: fail !< Fail flag.
|
||||
|
||||
this% n_tests = this% n_tests + 1
|
||||
|
||||
if ( minval(i) < 0 ) then
|
||||
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
|
||||
this% n_errors = this% n_errors + 1
|
||||
end if
|
||||
end if
|
||||
|
||||
end subroutine assert_positive_i64_1
|
||||
|
||||
!> Check if a real (32 bits) array (rank 1) is positive.
|
||||
subroutine assert_positive_r32_1(this, r, fail)
|
||||
class(tester_t), intent(inout) :: this !< The tester.
|
||||
real(real32), dimension(:), intent(in) :: r !< Value to check.
|
||||
logical, intent(in), optional :: fail !< Fail flag.
|
||||
|
||||
this% n_tests = this% n_tests + 1
|
||||
|
||||
if ( minval(r) < 0 ) then
|
||||
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
|
||||
this% n_errors = this% n_errors + 1
|
||||
end if
|
||||
end if
|
||||
|
||||
end subroutine assert_positive_r32_1
|
||||
|
||||
!> Check if a real (64 bits) array (rank 1) is positive.
|
||||
subroutine assert_positive_r64_1(this, r, fail)
|
||||
class(tester_t), intent(inout) :: this !< The tester.
|
||||
real(real64), dimension(:), intent(in) :: r !< Value to check.
|
||||
logical, intent(in), optional :: fail !< Fail flag.
|
||||
|
||||
this% n_tests = this% n_tests + 1
|
||||
|
||||
if ( minval(r) < 0 ) then
|
||||
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
|
||||
this% n_errors = this% n_errors + 1
|
||||
end if
|
||||
end if
|
||||
|
||||
end subroutine assert_positive_r64_1
|
||||
|
||||
!> Check if two reals (32 bits) are close with respect a tolerance.
|
||||
subroutine assert_close_r32(this, r1, r2, fail)
|
||||
class(tester_t), intent(inout) :: this !< The tester.
|
||||
real(real32), intent(in) :: r1 !< Value to compare.
|
||||
real(real32), intent(in) :: r2 !< Value to compare.
|
||||
logical, intent(in), optional :: fail !< Fail flag.
|
||||
|
||||
this% n_tests = this% n_tests + 1
|
||||
|
||||
if ( abs(r1-r2) > this% tolerance32 ) then
|
||||
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
|
||||
this% n_errors = this% n_errors + 1
|
||||
end if
|
||||
end if
|
||||
|
||||
end subroutine assert_close_r32
|
||||
|
||||
!> Check if two reals (64 bits) are close with respect a tolerance.
|
||||
subroutine assert_close_r64(this, r1, r2, fail)
|
||||
class(tester_t), intent(inout) :: this !< The tester.
|
||||
real(real64), intent(in) :: r1 !< Value to compare.
|
||||
real(real64), intent(in) :: r2 !< Value to compare.
|
||||
logical, intent(in), optional :: fail !< Fail flag.
|
||||
|
||||
this% n_tests = this% n_tests + 1
|
||||
|
||||
if ( abs(r1-r2) > this% tolerance64 ) then
|
||||
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
|
||||
this% n_errors = this% n_errors + 1
|
||||
end if
|
||||
end if
|
||||
|
||||
end subroutine assert_close_r64
|
||||
|
||||
!> Check if two real (32 bits) arrays (rank 1) are close with respect a tolerance.
|
||||
subroutine assert_close_r32_1(this, r1, r2, fail)
|
||||
class(tester_t), intent(inout) :: this !< The tester.
|
||||
real(real32), intent(in), dimension(:) :: r1 !< Value to compare.
|
||||
real(real32), intent(in), dimension(:) :: r2 !< Value to compare.
|
||||
logical, intent(in), optional :: fail !< Fail flag.
|
||||
|
||||
this% n_tests = this% n_tests + 1
|
||||
|
||||
if ( size(r1) .ne. size(r2) ) then
|
||||
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
|
||||
this% n_errors = this% n_errors + 1
|
||||
end if
|
||||
else
|
||||
if ( maxval(abs(r1-r2)) > this% tolerance64 ) then
|
||||
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
|
||||
this% n_errors = this% n_errors + 1
|
||||
end if
|
||||
end if
|
||||
end if
|
||||
|
||||
end subroutine assert_close_r32_1
|
||||
|
||||
!> Check if two real (64 bits) arrays (rank 1) are close with respect a tolerance.
|
||||
subroutine assert_close_r64_1(this, r1, r2, fail)
|
||||
class(tester_t), intent(inout) :: this !< The tester.
|
||||
real(real64), intent(in), dimension(:) :: r1 !< Value to compare.
|
||||
real(real64), intent(in), dimension(:) :: r2 !< Value to compare.
|
||||
logical, intent(in), optional :: fail !< Fail flag.
|
||||
|
||||
this% n_tests = this% n_tests + 1
|
||||
|
||||
if ( size(r1) .ne. size(r2) ) then
|
||||
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
|
||||
this% n_errors = this% n_errors + 1
|
||||
end if
|
||||
else
|
||||
if ( maxval(abs(r1-r2)) > this% tolerance64 ) then
|
||||
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
|
||||
this% n_errors = this% n_errors + 1
|
||||
end if
|
||||
end if
|
||||
end if
|
||||
|
||||
end subroutine assert_close_r64_1
|
||||
|
||||
!> Check if two complex numbers (32 bits) are close with respect a tolerance.
|
||||
subroutine assert_close_c32(this, c1, c2, fail)
|
||||
class(tester_t), intent(inout) :: this !< The tester.
|
||||
complex(real32), intent(in) :: c1 !< Value to compare.
|
||||
complex(real32), intent(in) :: c2 !< Value to compare.
|
||||
logical, intent(in), optional :: fail !< Fail flag.
|
||||
|
||||
this% n_tests = this% n_tests + 1
|
||||
|
||||
if ( abs(c1-c2) > this% tolerance32 ) then
|
||||
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
|
||||
this% n_errors = this% n_errors + 1
|
||||
end if
|
||||
end if
|
||||
|
||||
end subroutine assert_close_c32
|
||||
|
||||
!> Check if two complex numbers (64 bits) are close with respect a tolerance.
|
||||
subroutine assert_close_c64(this, r1, c2, fail)
|
||||
class(tester_t), intent(inout) :: this !< The tester.
|
||||
complex(real64), intent(in) :: r1 !< Value to compare.
|
||||
complex(real64), intent(in) :: c2 !< Value to compare.
|
||||
logical, intent(in), optional :: fail !< Fail flag.
|
||||
|
||||
this% n_tests = this% n_tests + 1
|
||||
|
||||
if ( abs(r1-c2) > this% tolerance64 ) then
|
||||
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
|
||||
this% n_errors = this% n_errors + 1
|
||||
end if
|
||||
end if
|
||||
|
||||
end subroutine assert_close_c64
|
||||
|
||||
!> Check if two complex (32 bits) arrays (rank 1) are close with respect a tolerance.
|
||||
subroutine assert_close_c32_1(this, c1, c2, fail)
|
||||
class(tester_t), intent(inout) :: this !< The tester.
|
||||
complex(real32), intent(in), dimension(:) :: c1 !< Value to compare.
|
||||
complex(real32), intent(in), dimension(:) :: c2 !< Value to compare.
|
||||
logical, intent(in), optional :: fail !< Fail flag.
|
||||
|
||||
this% n_tests = this% n_tests + 1
|
||||
|
||||
if ( size(c1) .ne. size(c2) ) then
|
||||
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
|
||||
this% n_errors = this% n_errors + 1
|
||||
end if
|
||||
else
|
||||
if ( maxval(abs(c1-c2)) > this% tolerance32 ) then
|
||||
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
|
||||
this% n_errors = this% n_errors + 1
|
||||
end if
|
||||
end if
|
||||
end if
|
||||
|
||||
end subroutine assert_close_c32_1
|
||||
|
||||
!> Check if two real (64 bits) arrays (rank 1) are close with respect a tolerance.
|
||||
subroutine assert_close_c64_1(this, c1, c2, fail)
|
||||
class(tester_t), intent(inout) :: this !< The tester.
|
||||
complex(real64), intent(in), dimension(:) :: c1 !< Value to compare.
|
||||
complex(real64), intent(in), dimension(:) :: c2 !< Value to compare.
|
||||
logical, intent(in), optional :: fail !< Fail flag.
|
||||
|
||||
this% n_tests = this% n_tests + 1
|
||||
|
||||
if ( size(c1) .ne. size(c2) ) then
|
||||
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
|
||||
this% n_errors = this% n_errors + 1
|
||||
end if
|
||||
else
|
||||
if ( maxval(abs(c1-c2)) > this% tolerance64 ) then
|
||||
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
|
||||
this% n_errors = this% n_errors + 1
|
||||
end if
|
||||
end if
|
||||
end if
|
||||
|
||||
end subroutine assert_close_c64_1
|
||||
|
||||
end module tester
|
|
@ -0,0 +1,94 @@
|
|||
SUBROUTINE collect_results(test)
|
||||
#if defined(__MPI)
|
||||
USE mpi
|
||||
#endif
|
||||
USE tester
|
||||
IMPLICIT NONE
|
||||
!
|
||||
TYPE(tester_t) :: test
|
||||
INTEGER :: itottests, itoterr, ierr, me
|
||||
!
|
||||
#if defined(__MPI)
|
||||
!
|
||||
CALL MPI_REDUCE(test%n_errors, itoterr, 1, MPI_INTEGER, MPI_SUM, &
|
||||
0, MPI_COMM_WORLD, ierr)
|
||||
! Fail in case MPI fails...
|
||||
IF (ierr /= 0) CALL test%assert_equal(0, ierr)
|
||||
!
|
||||
CALL MPI_REDUCE(test%n_tests, itottests, 1, MPI_INTEGER, MPI_SUM, &
|
||||
0, MPI_COMM_WORLD, ierr)
|
||||
! Fail in case MPI fails...
|
||||
IF (ierr /= 0) CALL test%assert_equal(0, ierr)
|
||||
!
|
||||
test%n_tests = itottests
|
||||
test%n_errors = itoterr
|
||||
!
|
||||
IF (ierr /= 0) CALL test%assert_equal(0, ierr)
|
||||
!
|
||||
CALL MPI_Comm_rank(MPI_COMM_WORLD, me, ierr);
|
||||
!
|
||||
IF (ierr /= 0) CALL test%assert_equal(0, ierr)
|
||||
!
|
||||
#endif
|
||||
END SUBROUTINE collect_results
|
||||
|
||||
SUBROUTINE save_random_seed(test_name, mpime)
|
||||
IMPLICIT NONE
|
||||
CHARACTER(len=*), INTENT(IN) :: test_name
|
||||
INTEGER, INTENT(IN) :: mpime
|
||||
!
|
||||
INTEGER, PARAMETER :: in_unit=20, out_unit=21
|
||||
CHARACTER(len=80) :: fname
|
||||
INTEGER :: n, istat
|
||||
INTEGER, ALLOCATABLE :: seed(:)
|
||||
!
|
||||
CALL random_seed(size = n)
|
||||
ALLOCATE(seed(n))
|
||||
|
||||
! First try if the OS provides a random number generator
|
||||
OPEN(UNIT=in_unit, file="/dev/urandom", access="stream", &
|
||||
form="unformatted", action="read", status="old", iostat=istat)
|
||||
|
||||
IF (istat == 0) THEN
|
||||
READ(in_unit) seed
|
||||
CLOSE(in_unit)
|
||||
ELSE
|
||||
! Fallback to stupid algorithm. Actually we do not really need
|
||||
! high-quality random numbers
|
||||
CALL random_seed(get=seed)
|
||||
seed = seed + mpime
|
||||
END IF
|
||||
!
|
||||
CALL random_seed(put=seed)
|
||||
!
|
||||
WRITE(fname, '("rnd_seed_",A,I4.4)') TRIM(test_name), mpime
|
||||
fname = TRIM(fname)
|
||||
!
|
||||
OPEN (UNIT=out_unit,FILE=fname,ACTION="write",STATUS="replace")
|
||||
!
|
||||
WRITE (out_unit,*) n
|
||||
WRITE (out_unit,*) seed
|
||||
CLOSE (out_unit)
|
||||
DEALLOCATE(seed)
|
||||
!
|
||||
END SUBROUTINE save_random_seed
|
||||
|
||||
|
||||
SUBROUTINE no_test
|
||||
#if defined(__MPI)
|
||||
USE mpi
|
||||
#endif
|
||||
USE tester
|
||||
IMPLICIT NONE
|
||||
!TYPE(tester_t) :: test
|
||||
INTEGER :: ierr
|
||||
!
|
||||
#if defined(__MPI)
|
||||
CALL MPI_Init(ierr)
|
||||
#endif
|
||||
!CALL test%init()
|
||||
!CALL print_results(test)
|
||||
#if defined(__MPI)
|
||||
CALL mpi_finalize(ierr)
|
||||
#endif
|
||||
END SUBROUTINE no_test
|
|
@ -9,6 +9,8 @@ CG = \
|
|||
ccgdiagg.o \
|
||||
rcgdiagg.o
|
||||
|
||||
-include make.gpu
|
||||
|
||||
all : libcg.a
|
||||
|
||||
|
||||
|
|
|
@ -0,0 +1,526 @@
|
|||
!
|
||||
! Copyright (C) 2001-2007 Quantum ESPRESSO group
|
||||
! This file is distributed under the terms of the
|
||||
! GNU General Public License. See the file `License'
|
||||
! in the root directory of the present distribution,
|
||||
! or http://www.gnu.org/copyleft/gpl.txt .
|
||||
!
|
||||
!
|
||||
#define ZERO ( 0.D0, 0.D0 )
|
||||
#define ONE ( 1.D0, 0.D0 )
|
||||
|
||||
FUNCTION KSDdot( n, A, incx, B, incy) result( res )
|
||||
!
|
||||
USE util_param, ONLY : DP
|
||||
#if defined(__CUDA)
|
||||
USE cudafor
|
||||
USE cublas
|
||||
#endif
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
INTEGER, INTENT(IN) :: incx,incy,n
|
||||
!
|
||||
#if defined(__CUDA)
|
||||
REAL(DP), DEVICE, INTENT(IN) :: A(n), B(n)
|
||||
#else
|
||||
REAL(DP), INTENT(IN) :: A(n), B(n)
|
||||
REAL(DP), EXTERNAL :: ddot
|
||||
#endif
|
||||
!
|
||||
REAL(DP) :: res
|
||||
!
|
||||
#if defined(__CUDA)
|
||||
res = cublasDdot( n, A, incx, B, incy )
|
||||
#else
|
||||
res = ddot( n, A, incx, B, incy )
|
||||
#endif
|
||||
!
|
||||
RETURN
|
||||
!
|
||||
END FUNCTION KSDdot
|
||||
|
||||
! define __VERBOSE to print a message after each eigenvalue is computed
|
||||
!
|
||||
!----------------------------------------------------------------------------
|
||||
SUBROUTINE ccgdiagg_gpu( hs_1psi_gpu, s_1psi_gpu, precondition_d, &
|
||||
npwx, npw, nbnd, npol, psi_d, e_d, btype, &
|
||||
ethr, maxter, reorder, notconv, avg_iter )
|
||||
!----------------------------------------------------------------------------
|
||||
!
|
||||
! ... "poor man" iterative diagonalization of a complex hermitian matrix
|
||||
! ... through preconditioned conjugate gradient algorithm
|
||||
! ... Band-by-band algorithm with minimal use of memory
|
||||
! ... Calls hs_1psi and s_1psi to calculate H|psi> + S|psi> and S|psi>
|
||||
! ... Works for generalized eigenvalue problem (US pseudopotentials) as well
|
||||
!
|
||||
#if defined(__CUDA)
|
||||
USE cudafor
|
||||
USE cublas
|
||||
#endif
|
||||
USE util_param, ONLY : DP
|
||||
USE mp_bands_util, ONLY : intra_bgrp_comm, inter_bgrp_comm
|
||||
USE mp, ONLY : mp_sum
|
||||
#if defined(__VERBOSE)
|
||||
USE util_param, ONLY : stdout
|
||||
#endif
|
||||
USE device_memcpy_m, ONLY : dev_memset, dev_memcpy
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
! ... Mathematical constants
|
||||
!
|
||||
REAL(DP), PARAMETER :: pi = 3.14159265358979323846_DP
|
||||
!
|
||||
! ... I/O variables
|
||||
!
|
||||
INTEGER, INTENT(IN) :: npwx, npw, nbnd, npol, maxter
|
||||
INTEGER, INTENT(IN) :: btype(nbnd)
|
||||
REAL(DP), INTENT(IN) :: precondition_d(npwx*npol), ethr
|
||||
COMPLEX(DP), INTENT(INOUT) :: psi_d(npwx*npol,nbnd)
|
||||
REAL(DP), INTENT(INOUT) :: e_d(nbnd)
|
||||
INTEGER, INTENT(OUT) :: notconv
|
||||
REAL(DP), INTENT(OUT) :: avg_iter
|
||||
#if defined(__CUDA)
|
||||
attributes(DEVICE) :: precondition_d, psi_d, e_d
|
||||
#endif
|
||||
!
|
||||
! ... local variables
|
||||
!
|
||||
INTEGER :: i, j, k, m, m_start, m_end, iter, moved
|
||||
COMPLEX(DP), ALLOCATABLE :: hpsi_d(:), spsi_d(:), g_d(:), cg_d(:)
|
||||
COMPLEX(DP), ALLOCATABLE :: scg_d(:), ppsi_d(:), g0_d(:), lagrange_d(:)
|
||||
#if defined(__CUDA)
|
||||
attributes(DEVICE) :: hpsi_d, spsi_d, g_d, cg_d, scg_d, ppsi_d, g0_d, lagrange_d
|
||||
#endif
|
||||
COMPLEX(DP), ALLOCATABLE :: lagrange(:)
|
||||
REAL(DP) :: gamma, ddot_temp, es_1, es(2)
|
||||
REAL(DP), ALLOCATABLE :: e(:)
|
||||
REAL(DP) :: a0, b0, gg0, gg, gg1, cg0, e0, psi_norm, sint, cost
|
||||
REAL(DP) :: theta, cos2t, sin2t
|
||||
LOGICAL :: reorder
|
||||
INTEGER :: kdim, kdmx, kdim2, ierr, istat
|
||||
REAL(DP) :: empty_ethr, ethr_m
|
||||
!
|
||||
! ... external functions
|
||||
!
|
||||
REAL (DP), EXTERNAL :: ksDdot
|
||||
EXTERNAL hs_1psi_gpu, s_1psi_gpu
|
||||
! hs_1psi( npwx, npw, psi, hpsi, spsi )
|
||||
! s_1psi( npwx, npw, psi, spsi )
|
||||
!
|
||||
CALL start_clock( 'ccgdiagg' )
|
||||
!
|
||||
empty_ethr = MAX( ( ethr * 5.D0 ), 1.D-5 )
|
||||
!
|
||||
IF ( npol == 1 ) THEN
|
||||
!
|
||||
kdim = npw
|
||||
kdmx = npwx
|
||||
!
|
||||
ELSE
|
||||
!
|
||||
kdim = npwx * npol
|
||||
kdmx = npwx * npol
|
||||
!
|
||||
END IF
|
||||
!
|
||||
kdim2 = 2 * kdim
|
||||
!
|
||||
ALLOCATE( hpsi_d(kdmx), STAT=ierr )
|
||||
IF( ierr /= 0 ) &
|
||||
CALL errore( ' ccgdiagg ',' cannot allocate hpsi_d ', ABS(ierr) )
|
||||
ALLOCATE( spsi_d(kdmx), STAT=ierr )
|
||||
IF( ierr /= 0 ) &
|
||||
CALL errore( ' ccgdiagg ',' cannot allocate spsi_d ', ABS(ierr) )
|
||||
ALLOCATE( g_d(kdmx), STAT=ierr )
|
||||
IF( ierr /= 0 ) &
|
||||
CALL errore( ' ccgdiagg ',' cannot allocate g_d ', ABS(ierr) )
|
||||
ALLOCATE( cg_d(kdmx), STAT=ierr )
|
||||
IF( ierr /= 0 ) &
|
||||
CALL errore( ' ccgdiagg ',' cannot allocate cg_d ', ABS(ierr) )
|
||||
ALLOCATE( scg_d(kdmx), STAT=ierr )
|
||||
IF( ierr /= 0 ) &
|
||||
CALL errore( ' ccgdiagg ',' cannot allocate scg_d ', ABS(ierr) )
|
||||
ALLOCATE( ppsi_d(kdmx), STAT=ierr )
|
||||
IF( ierr /= 0 ) &
|
||||
CALL errore( ' ccgdiagg ',' cannot allocate ppsi_d ', ABS(ierr) )
|
||||
ALLOCATE( g0_d(kdmx), STAT=ierr )
|
||||
IF( ierr /= 0 ) &
|
||||
CALL errore( ' ccgdiagg ',' cannot allocate g0_d ', ABS(ierr) )
|
||||
!
|
||||
ALLOCATE( lagrange_d(nbnd), STAT=ierr )
|
||||
IF( ierr /= 0 ) &
|
||||
CALL errore( ' ccgdiagg ',' cannot allocate lagrange_d ', ABS(ierr) )
|
||||
!
|
||||
ALLOCATE( lagrange(nbnd), STAT=ierr )
|
||||
IF( ierr /= 0 ) &
|
||||
CALL errore( ' ccgdiagg ',' cannot allocate lagrange ', ABS(ierr) )
|
||||
ALLOCATE( e(nbnd) )
|
||||
!
|
||||
avg_iter = 0.D0
|
||||
notconv = 0
|
||||
moved = 0
|
||||
!
|
||||
! ... every eigenfunction is calculated separately
|
||||
!
|
||||
DO m = 1, nbnd
|
||||
!
|
||||
IF ( btype(m) == 1 ) THEN
|
||||
!
|
||||
ethr_m = ethr
|
||||
!
|
||||
ELSE
|
||||
!
|
||||
ethr_m = empty_ethr
|
||||
!
|
||||
END IF
|
||||
!
|
||||
CALL dev_memset( spsi_d , ZERO )
|
||||
CALL dev_memset( scg_d , ZERO )
|
||||
CALL dev_memset( hpsi_d , ZERO )
|
||||
CALL dev_memset( g_d , ZERO )
|
||||
CALL dev_memset( cg_d , ZERO )
|
||||
CALL dev_memset( g0_d , ZERO )
|
||||
CALL dev_memset( ppsi_d , ZERO )
|
||||
CALL dev_memset( lagrange_d , ZERO )
|
||||
!
|
||||
! ... calculate S|psi>
|
||||
!
|
||||
CALL s_1psi_gpu( npwx, npw, psi_d(1,m), spsi_d )
|
||||
!
|
||||
! ... orthogonalize starting eigenfunction to those already calculated
|
||||
!
|
||||
call divide(inter_bgrp_comm,m,m_start,m_end); !write(*,*) m,m_start,m_end
|
||||
lagrange = ZERO
|
||||
if(m_start.le.m_end) &
|
||||
CALL ZGEMV( 'C', kdim, m_end-m_start+1, ONE, psi_d(1,m_start), &
|
||||
kdmx, spsi_d, 1, ZERO, lagrange_d(m_start), 1 )
|
||||
if(m_start.le.m_end) lagrange(m_start:m_end) = lagrange_d(m_start:m_end)
|
||||
|
||||
CALL mp_sum( lagrange( 1:m ), inter_bgrp_comm )
|
||||
!
|
||||
CALL mp_sum( lagrange( 1:m ), intra_bgrp_comm )
|
||||
!
|
||||
psi_norm = DBLE( lagrange(m) )
|
||||
lagrange_d(1:m) = lagrange(1:m)
|
||||
!
|
||||
DO j = 1, m - 1
|
||||
!$cuf kernel do(1) <<<*,*>>>
|
||||
DO i = 1, kdmx
|
||||
!
|
||||
psi_d(i,m) = psi_d(i,m) - lagrange_d(j) * psi_d(i,j)
|
||||
!
|
||||
END DO
|
||||
!
|
||||
psi_norm = psi_norm - ( DBLE( lagrange(j) )**2 + AIMAG( lagrange(j) )**2 )
|
||||
!
|
||||
END DO
|
||||
!
|
||||
psi_norm = SQRT( psi_norm )
|
||||
!
|
||||
!$cuf kernel do(1) <<<*,*>>>
|
||||
DO i = 1, kdmx
|
||||
psi_d(i,m) = psi_d(i,m) / psi_norm
|
||||
END DO
|
||||
!
|
||||
! ... calculate starting gradient (|hpsi> = H|psi>) ...
|
||||
!
|
||||
CALL hs_1psi_gpu( npwx, npw, psi_d(1,m), hpsi_d, spsi_d )
|
||||
!
|
||||
! ... and starting eigenvalue (e = <y|PHP|y> = <psi|H|psi>)
|
||||
!
|
||||
! ... NB: ddot(2*npw,a,1,b,1) = REAL( zdotc(npw,a,1,b,1) )
|
||||
!
|
||||
e(m) = ksDdot( kdim2, psi_d(1,m), 1, hpsi_d, 1 )
|
||||
!
|
||||
CALL mp_sum( e(m), intra_bgrp_comm )
|
||||
!
|
||||
! ... start iteration for this band
|
||||
!
|
||||
iterate: DO iter = 1, maxter
|
||||
!
|
||||
! ... calculate P (PHP)|y>
|
||||
! ... ( P = preconditioning matrix, assumed diagonal )
|
||||
!
|
||||
!$cuf kernel do(1) <<<*,*>>>
|
||||
DO i = 1, kdmx
|
||||
g_d(i) = hpsi_d(i) / precondition_d(i)
|
||||
ppsi_d(i) = spsi_d(i) / precondition_d(i)
|
||||
END DO
|
||||
!
|
||||
! ... ppsi is now S P(P^2)|y> = S P^2|psi>)
|
||||
!
|
||||
es(1) = ksDdot( kdim2, spsi_d(1), 1, g_d(1), 1 )
|
||||
es(2) = ksDdot( kdim2, spsi_d(1), 1, ppsi_d(1), 1 )
|
||||
!
|
||||
CALL mp_sum( es , intra_bgrp_comm )
|
||||
!
|
||||
es(1) = es(1) / es(2)
|
||||
es_1 = es(1)
|
||||
!
|
||||
!$cuf kernel do(1) <<<*,*>>>
|
||||
DO i = 1, kdmx
|
||||
g_d(i) = g_d(i) - es_1 * ppsi_d(i)
|
||||
END DO
|
||||
!
|
||||
! ... e1 = <y| S P^2 PHP|y> / <y| S S P^2|y> ensures that
|
||||
! ... <g| S P^2|y> = 0
|
||||
! ... orthogonalize to lowest eigenfunctions (already calculated)
|
||||
!
|
||||
! ... scg is used as workspace
|
||||
!
|
||||
CALL s_1psi_gpu( npwx, npw, g_d(1), scg_d(1) )
|
||||
!
|
||||
lagrange(1:m-1) = ZERO
|
||||
call divide(inter_bgrp_comm,m-1,m_start,m_end); !write(*,*) m-1,m_start,m_end
|
||||
if(m_start.le.m_end) &
|
||||
CALL ZGEMV( 'C', kdim, m_end-m_start+1, ONE, psi_d(1,m_start), &
|
||||
kdmx, scg_d, 1, ZERO, lagrange_d(m_start), 1 )
|
||||
if(m_start.le.m_end) lagrange(m_start:m_end) = lagrange_d(m_start:m_end)
|
||||
CALL mp_sum( lagrange( 1:m-1 ), inter_bgrp_comm )
|
||||
!
|
||||
CALL mp_sum( lagrange( 1:m-1 ), intra_bgrp_comm )
|
||||
!
|
||||
lagrange_d(1:m) = lagrange(1:m)
|
||||
!
|
||||
DO j = 1, ( m - 1 )
|
||||
!
|
||||
!$cuf kernel do(1) <<<*,*>>>
|
||||
DO i = 1, kdmx
|
||||
g_d(i) = g_d(i) - lagrange_d(j) * psi_d(i,j)
|
||||
scg_d(i) = scg_d(i) - lagrange_d(j) * psi_d(i,j)
|
||||
END DO
|
||||
!
|
||||
END DO
|
||||
!
|
||||
IF ( iter /= 1 ) THEN
|
||||
!
|
||||
! ... gg1 is <g(n+1)|S|g(n)> (used in Polak-Ribiere formula)
|
||||
!
|
||||
gg1 = ksDdot( kdim2, g_d(1), 1, g0_d(1), 1 )
|
||||
!
|
||||
CALL mp_sum( gg1, intra_bgrp_comm )
|
||||
!
|
||||
END IF
|
||||
!
|
||||
! ... gg is <g(n+1)|S|g(n+1)>
|
||||
!
|
||||
!$cuf kernel do(1) <<<*,*>>>
|
||||
DO i = 1, kdmx
|
||||
g0_d(i) = scg_d(i) * precondition_d(i)
|
||||
END DO
|
||||
!
|
||||
gg = ksDdot( kdim2, g_d(1), 1, g0_d(1), 1 )
|
||||
!
|
||||
CALL mp_sum( gg, intra_bgrp_comm )
|
||||
!
|
||||
IF ( iter == 1 ) THEN
|
||||
!
|
||||
! ... starting iteration, the conjugate gradient |cg> = |g>
|
||||
!
|
||||
gg0 = gg
|
||||
!
|
||||
!$cuf kernel do(1) <<<*,*>>>
|
||||
DO i = 1, kdmx
|
||||
cg_d(i) = g_d(i)
|
||||
END DO
|
||||
!
|
||||
ELSE
|
||||
!
|
||||
! ... |cg(n+1)> = |g(n+1)> + gamma(n) * |cg(n)>
|
||||
!
|
||||
! ... Polak-Ribiere formula :
|
||||
!
|
||||
gamma = ( gg - gg1 ) / gg0
|
||||
gg0 = gg
|
||||
!
|
||||
!
|
||||
!
|
||||
! See comment below
|
||||
!!DO i = 1, kdmx
|
||||
!! cg_d(i) = g_d(i) + cg_d(i) * gamma
|
||||
!!END DO
|
||||
!
|
||||
! ... The following is needed because <y(n+1)| S P^2 |cg(n+1)>
|
||||
! ... is not 0. In fact :
|
||||
! ... <y(n+1)| S P^2 |cg(n)> = sin(theta)*<cg(n)|S|cg(n)>
|
||||
!
|
||||
psi_norm = gamma * cg0 * sint
|
||||
!
|
||||
!$cuf kernel do(1) <<<*,*>>>
|
||||
DO i = 1, kdmx
|
||||
! v== this breaks the logic, done here for performance
|
||||
cg_d(i) = (g_d(i) + cg_d(i) * gamma) - psi_norm * psi_d(i,m)
|
||||
END DO
|
||||
!
|
||||
END IF
|
||||
!
|
||||
! ... |cg> contains now the conjugate gradient
|
||||
!
|
||||
! ... |scg> is S|cg>
|
||||
!
|
||||
CALL hs_1psi_gpu( npwx, npw, cg_d(1), ppsi_d(1), scg_d(1) )
|
||||
!
|
||||
cg0 = ksDdot( kdim2, cg_d(1), 1, scg_d(1), 1 )
|
||||
!
|
||||
CALL mp_sum( cg0 , intra_bgrp_comm )
|
||||
!
|
||||
cg0 = SQRT( cg0 )
|
||||
!
|
||||
! ... |ppsi> contains now HP|cg>
|
||||
! ... minimize <y(t)|PHP|y(t)> , where :
|
||||
! ... |y(t)> = cos(t)|y> + sin(t)/cg0 |cg>
|
||||
! ... Note that <y|P^2S|y> = 1, <y|P^2S|cg> = 0 ,
|
||||
! ... <cg|P^2S|cg> = cg0^2
|
||||
! ... so that the result is correctly normalized :
|
||||
! ... <y(t)|P^2S|y(t)> = 1
|
||||
!
|
||||
a0 = 2.D0 * ksDdot( kdim2, psi_d(1,m), 1, ppsi_d(1), 1 ) / cg0
|
||||
!
|
||||
CALL mp_sum( a0 , intra_bgrp_comm )
|
||||
!
|
||||
b0 = ksDdot( kdim2, cg_d(1), 1, ppsi_d(1), 1 ) / cg0**2
|
||||
!
|
||||
CALL mp_sum( b0 , intra_bgrp_comm )
|
||||
!
|
||||
e0 = e(m)
|
||||
!
|
||||
theta = 0.5D0 * ATAN( a0 / ( e0 - b0 ) )
|
||||
!
|
||||
cost = COS( theta )
|
||||
sint = SIN( theta )
|
||||
!
|
||||
cos2t = cost*cost - sint*sint
|
||||
sin2t = 2.D0*cost*sint
|
||||
!
|
||||
es(1) = 0.5D0 * ( ( e0 - b0 ) * cos2t + a0 * sin2t + e0 + b0 )
|
||||
es(2) = 0.5D0 * ( - ( e0 - b0 ) * cos2t - a0 * sin2t + e0 + b0 )
|
||||
!
|
||||
! ... there are two possible solutions, choose the minimum
|
||||
!
|
||||
IF ( es(2) < es(1) ) THEN
|
||||
!
|
||||
theta = theta + 0.5D0 * pi
|
||||
!
|
||||
cost = COS( theta )
|
||||
sint = SIN( theta )
|
||||
!
|
||||
END IF
|
||||
!
|
||||
! ... new estimate of the eigenvalue
|
||||
!
|
||||
e(m) = MIN( es(1), es(2) )
|
||||
! ... upgrade |psi>
|
||||
!
|
||||
!$cuf kernel do(1) <<<*,*>>>
|
||||
DO i = 1, kdmx
|
||||
psi_d(i,m) = cost * psi_d(i,m) + sint / cg0 * cg_d(i)
|
||||
END DO
|
||||
!
|
||||
! ... here one could test convergence on the energy
|
||||
!
|
||||
IF ( ABS( e(m) - e0 ) < ethr_m ) EXIT iterate
|
||||
!
|
||||
! ... upgrade H|psi> and S|psi>
|
||||
!
|
||||
!$cuf kernel do(1) <<<*,*>>>
|
||||
DO i = 1, kdmx
|
||||
spsi_d(i) = cost * spsi_d(i) + sint / cg0 * scg_d(i)
|
||||
!
|
||||
hpsi_d(i) = cost * hpsi_d(i) + sint / cg0 * ppsi_d(i)
|
||||
END DO
|
||||
!
|
||||
END DO iterate
|
||||
!
|
||||
#if defined(__VERBOSE)
|
||||
IF ( iter >= maxter ) THEN
|
||||
WRITE(stdout,'("e(",i4,") = ",f12.6," eV (not converged after ",i3,&
|
||||
& " iterations)")') m, e(m)*13.6058, iter
|
||||
ELSE
|
||||
WRITE(stdout,'("e(",i4,") = ",f12.6," eV (",i3," iterations)")') &
|
||||
m, e(m)*13.6058, iter
|
||||
END IF
|
||||
FLUSH (stdout)
|
||||
#endif
|
||||
IF ( iter >= maxter ) notconv = notconv + 1
|
||||
!
|
||||
avg_iter = avg_iter + iter + 1
|
||||
! ... reorder eigenvalues if they are not in the right order
|
||||
! ... ( this CAN and WILL happen in not-so-special cases )
|
||||
!
|
||||
!
|
||||
IF ( m > 1 .AND. reorder ) THEN
|
||||
!
|
||||
IF ( e(m) - e(m-1) < - 2.D0 * ethr_m ) THEN
|
||||
! ... if the last calculated eigenvalue is not the largest...
|
||||
!
|
||||
DO i = m - 2, 1, - 1
|
||||
!
|
||||
IF ( e(m) - e(i) > 2.D0 * ethr_m ) EXIT
|
||||
!
|
||||
END DO
|
||||
!
|
||||
i = i + 1
|
||||
!
|
||||
moved = moved + 1
|
||||
!
|
||||
! ... last calculated eigenvalue should be in the
|
||||
! ... i-th position: reorder
|
||||
!
|
||||
e0 = e(m)
|
||||
!
|
||||
!$cuf kernel do(1) <<<*,*>>>
|
||||
DO k = 1, kdmx
|
||||
ppsi_d(k) = psi_d(k,m)
|
||||
END DO
|
||||
!
|
||||
DO j = m, i + 1, - 1
|
||||
!
|
||||
e(j) = e(j-1)
|
||||
!
|
||||
!$cuf kernel do(1) <<<*,*>>>
|
||||
DO k = 1, kdmx
|
||||
psi_d(k,j) = psi_d(k,j-1)
|
||||
END DO
|
||||
!
|
||||
END DO
|
||||
!
|
||||
e(i) = e0
|
||||
!
|
||||
!$cuf kernel do(1) <<<*,*>>>
|
||||
DO k = 1, kdmx
|
||||
psi_d(k,i) = ppsi_d(k)
|
||||
END DO
|
||||
!
|
||||
! ... this procedure should be good if only a few inversions occur,
|
||||
! ... extremely inefficient if eigenvectors are often in bad order
|
||||
! ... ( but this should not happen )
|
||||
!
|
||||
END IF
|
||||
!
|
||||
END IF
|
||||
!
|
||||
END DO
|
||||
!
|
||||
avg_iter = avg_iter / DBLE( nbnd )
|
||||
!
|
||||
! STORING e in e_d since eigenvalues are always on the host
|
||||
CALL dev_memcpy(e_d, e)
|
||||
!
|
||||
DEALLOCATE( lagrange )
|
||||
DEALLOCATE( e )
|
||||
DEALLOCATE( lagrange_d )
|
||||
DEALLOCATE( ppsi_d )
|
||||
DEALLOCATE( g0_d )
|
||||
DEALLOCATE( cg_d )
|
||||
DEALLOCATE( g_d )
|
||||
DEALLOCATE( hpsi_d )
|
||||
DEALLOCATE( scg_d )
|
||||
DEALLOCATE( spsi_d )
|
||||
!
|
||||
CALL stop_clock( 'ccgdiagg' )
|
||||
!
|
||||
RETURN
|
||||
!
|
||||
END SUBROUTINE ccgdiagg_gpu
|
|
@ -0,0 +1,6 @@
|
|||
# Makefile for CG GPU
|
||||
|
||||
CG += \
|
||||
rcgdiagg_gpu.o \
|
||||
ccgdiagg_gpu.o
|
||||
|
|
@ -0,0 +1,543 @@
|
|||
!
|
||||
! Copyright (C) 2002-2006 Quantum ESPRESSO group
|
||||
! This file is distributed under the terms of the
|
||||
! GNU General Public License. See the file `License'
|
||||
! in the root directory of the present distribution,
|
||||
! or http://www.gnu.org/copyleft/gpl.txt .
|
||||
!
|
||||
|
||||
|
||||
SUBROUTINE cgcudaDGEMV(TRANS,M,N,ALPHA,A,LDA,X,INCX,BETA,Y,INCY)
|
||||
#if defined(__CUDA)
|
||||
use cudafor
|
||||
use cublas
|
||||
#endif
|
||||
IMPLICIT NONE
|
||||
DOUBLE PRECISION :: ALPHA,BETA
|
||||
INTEGER :: INCX,INCY,LDA,M,N
|
||||
CHARACTER :: TRANS
|
||||
DOUBLE PRECISION :: A(LDA,*),X(*),Y(*)
|
||||
#if defined(__CUDA)
|
||||
attributes(device) :: A, X, Y
|
||||
#endif
|
||||
!
|
||||
call DGEMV(TRANS,M,N,ALPHA,A,LDA,X,INCX,BETA,Y,INCY)
|
||||
!
|
||||
END SUBROUTINE cgcudaDGEMV
|
||||
|
||||
! define __VERBOSE to print a message after each eigenvalue is computed
|
||||
!----------------------------------------------------------------------------
|
||||
SUBROUTINE rcgdiagg_gpu( hs_1psi_gpu, s_1psi_gpu, precondition_d, &
|
||||
npwx, npw, nbnd, psi_d, e_d, btype, &
|
||||
ethr, maxter, reorder, notconv, avg_iter )
|
||||
!----------------------------------------------------------------------------
|
||||
!
|
||||
! ... "poor man" iterative diagonalization of a complex hermitian matrix
|
||||
! ... through preconditioned conjugate gradient algorithm
|
||||
! ... Band-by-band algorithm with minimal use of memory
|
||||
! ... Calls hs_1psi and s_1psi to calculate H|psi> + S|psi> and S|psi>
|
||||
! ... Works for generalized eigenvalue problem (US pseudopotentials) as well
|
||||
!
|
||||
#if defined(__CUDA)
|
||||
USE cudafor
|
||||
USE cublas
|
||||
#endif
|
||||
USE util_param, ONLY : DP
|
||||
USE mp_bands_util, ONLY : intra_bgrp_comm, inter_bgrp_comm, gstart
|
||||
USE mp, ONLY : mp_sum
|
||||
#if defined(__VERBOSE)
|
||||
USE cg_param, ONLY : stdout
|
||||
#endif
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
! ... Mathematical constants
|
||||
!
|
||||
REAL(DP), PARAMETER :: pi = 3.14159265358979323846_DP
|
||||
!
|
||||
! ... I/O variables
|
||||
!
|
||||
INTEGER, INTENT(IN) :: npwx, npw, nbnd, maxter
|
||||
INTEGER, INTENT(IN) :: btype(nbnd)
|
||||
REAL(DP), INTENT(IN) :: precondition_d(npw), ethr
|
||||
COMPLEX(DP), INTENT(INOUT) :: psi_d(npwx,nbnd)
|
||||
REAL(DP), INTENT(INOUT) :: e_d(nbnd)
|
||||
INTEGER, INTENT(OUT) :: notconv
|
||||
REAL(DP), INTENT(OUT) :: avg_iter
|
||||
#if defined(__CUDA)
|
||||
attributes(DEVICE) :: precondition_d, psi_d, e_d
|
||||
#endif
|
||||
!
|
||||
! ... local variables
|
||||
!
|
||||
INTEGER :: i, j, l, m, m_start, m_end, iter, moved
|
||||
REAL(DP), ALLOCATABLE :: lagrange_d(:)
|
||||
REAL(DP), ALLOCATABLE :: lagrange(:), e(:)
|
||||
COMPLEX(DP), ALLOCATABLE :: hpsi_d(:), spsi_d(:), g_d(:), cg_d(:), &
|
||||
scg_d(:), ppsi_d(:), g0_d(:), psi_aux(:)
|
||||
COMPLEX(DP) :: psi1, hpsi1, spsi1, ppsi1, scg1, cg1, g1, g01
|
||||
REAL(DP) :: psi_norm, a0, b0, gg0, gamma, gg, gg1, &
|
||||
cg0, e0, es(2), aux
|
||||
REAL(DP) :: es1
|
||||
REAL(DP) :: theta, cost, sint, cos2t, sin2t
|
||||
LOGICAL :: reorder
|
||||
INTEGER :: npw2, npwx2
|
||||
REAL(DP) :: empty_ethr, ethr_m
|
||||
#if defined(__CUDA)
|
||||
attributes(DEVICE) :: lagrange_d, hpsi_d, spsi_d, g_d, cg_d, scg_d, ppsi_d, g0_d
|
||||
#endif
|
||||
!
|
||||
! ... external functions
|
||||
!
|
||||
REAL(DP), EXTERNAL :: ksDdot
|
||||
EXTERNAL hs_1psi_gpu, s_1psi_gpu
|
||||
! hs_1psi( npwx, npw, psi, hpsi, spsi )
|
||||
! s_1psi( npwx, npw, psi, spsi )
|
||||
!
|
||||
!
|
||||
CALL start_clock( 'rcgdiagg' )
|
||||
!
|
||||
IF ( gstart == -1 ) CALL errore( 'regter', 'gstart variable not initialized', 1 )
|
||||
!
|
||||
empty_ethr = MAX( ( ethr * 5.D0 ), 1.D-5 )
|
||||
!
|
||||
npw2 = 2 * npw
|
||||
npwx2 = 2 * npwx
|
||||
!
|
||||
ALLOCATE( spsi_d( npwx ) )
|
||||
ALLOCATE( scg_d( npwx ) )
|
||||
ALLOCATE( hpsi_d( npwx ) )
|
||||
ALLOCATE( g_d( npwx ) )
|
||||
ALLOCATE( cg_d( npwx ) )
|
||||
ALLOCATE( g0_d( npwx ) )
|
||||
ALLOCATE( ppsi_d( npwx ) )
|
||||
!
|
||||
ALLOCATE( lagrange_d( nbnd ) )
|
||||
ALLOCATE( lagrange ( nbnd ) )
|
||||
ALLOCATE( e ( nbnd ) )
|
||||
ALLOCATE( psi_aux ( nbnd ) )
|
||||
!
|
||||
! Sync eigenvalues that will remain on the Host
|
||||
e(1:nbnd) = e_d(1:nbnd)
|
||||
!print *, 'init ', e(1:nbnd)
|
||||
!
|
||||
avg_iter = 0.D0
|
||||
notconv = 0
|
||||
moved = 0
|
||||
!
|
||||
! ... every eigenfunction is calculated separately
|
||||
!
|
||||
DO m = 1, nbnd
|
||||
|
||||
IF ( btype(m) == 1 ) THEN
|
||||
!
|
||||
ethr_m = ethr
|
||||
!
|
||||
ELSE
|
||||
!
|
||||
ethr_m = empty_ethr
|
||||
!
|
||||
END IF
|
||||
!
|
||||
! ... calculate S|psi>
|
||||
!
|
||||
CALL s_1psi_gpu( npwx, npw, psi_d(1,m), spsi_d )
|
||||
!
|
||||
! ... orthogonalize starting eigenfunction to those already calculated
|
||||
!
|
||||
call divide(inter_bgrp_comm,m,m_start,m_end); !write(*,*) m,m_start,m_end
|
||||
lagrange = 0.d0
|
||||
if(m_start.le.m_end) &
|
||||
CALL cgcudaDGEMV( 'T', npw2, m_end-m_start+1, 2.D0, psi_d(1,m_start), npwx2, spsi_d, 1, 0.D0, lagrange_d(m_start), 1 )
|
||||
if(m_start.le.m_end) lagrange( m_start:m_end ) = lagrange_d( m_start:m_end )
|
||||
!print *, 'lagrange ', lagrange(1:m)
|
||||
|
||||
CALL mp_sum( lagrange( 1:m ), inter_bgrp_comm )
|
||||
IF ( gstart == 2 ) THEN
|
||||
psi_aux(1:m) = psi_d(1,1:m)
|
||||
spsi1 = spsi_d(1)
|
||||
lagrange(1:m) = lagrange(1:m) - psi_aux(1:m) * spsi1
|
||||
END IF
|
||||
!
|
||||
CALL mp_sum( lagrange( 1:m ), intra_bgrp_comm )
|
||||
!
|
||||
psi_norm = lagrange(m)
|
||||
lagrange_d(1:m) = lagrange(1:m)
|
||||
!
|
||||
DO j = 1, m - 1
|
||||
!
|
||||
!$cuf kernel do(1) <<<*,*>>>
|
||||
DO i = 1, npwx
|
||||
psi_d(i,m) = psi_d(i,m) - lagrange_d(j) * psi_d(i,j)
|
||||
END DO
|
||||
!
|
||||
!print *, 'psi_norm ', j, psi_norm
|
||||
psi_norm = psi_norm - lagrange(j)**2
|
||||
!
|
||||
END DO
|
||||
!
|
||||
psi_norm = SQRT( psi_norm )
|
||||
!print *, 'psi_norm 178', psi_norm
|
||||
!
|
||||
!$cuf kernel do(1) <<<*,*>>>
|
||||
DO i = 1, npwx
|
||||
psi_d(i,m) = psi_d(i,m) / psi_norm
|
||||
! ... set Im[ psi(G=0) ] - needed for numerical stability
|
||||
IF (i == 1) THEN
|
||||
IF ( gstart == 2 ) psi_d(1,m) = CMPLX( DBLE(psi_d(1,m)), 0.D0 ,kind=DP)
|
||||
END IF
|
||||
END DO
|
||||
!
|
||||
! ... calculate starting gradient (|hpsi> = H|psi>) ...
|
||||
!
|
||||
CALL hs_1psi_gpu( npwx, npw, psi_d(1,m), hpsi_d, spsi_d )
|
||||
!
|
||||
! ... and starting eigenvalue (e = <y|PHP|y> = <psi|H|psi>)
|
||||
!
|
||||
! ... NB: ddot(2*npw,a,1,b,1) = DBLE( zdotc(npw,a,1,b,1) )
|
||||
!
|
||||
e(m) = 2.D0 * ksDdot( npw2, psi_d(1,m), 1, hpsi_d, 1 )
|
||||
!print *, 'e(m)', e(m)
|
||||
IF ( gstart == 2 ) THEN
|
||||
psi1 = psi_d(1,m)
|
||||
hpsi1 = hpsi_d(1)
|
||||
e(m) = e(m) - psi1 * hpsi1
|
||||
END IF
|
||||
!
|
||||
CALL mp_sum( e(m), intra_bgrp_comm )
|
||||
!print *, 'before iterate', psi1, hpsi1, spsi1, e(1:nbnd)
|
||||
!
|
||||
! ... start iteration for this band
|
||||
!
|
||||
iterate: DO iter = 1, maxter
|
||||
!
|
||||
! ... calculate P (PHP)|y>
|
||||
! ... ( P = preconditioning matrix, assumed diagonal )
|
||||
!
|
||||
!$cuf kernel do(1) <<<*,*>>>
|
||||
DO i = 1, npw
|
||||
g_d(i) = hpsi_d(i) / precondition_d(i)
|
||||
ppsi_d(i) = spsi_d(i) / precondition_d(i)
|
||||
END DO
|
||||
!
|
||||
! ... ppsi is now S P(P^2)|y> = S P^2|psi>)
|
||||
!
|
||||
es(1) = 2.D0 * ksDdot( npw2, spsi_d(1), 1, g_d(1), 1 )
|
||||
es(2) = 2.D0 * ksDdot( npw2, spsi_d(1), 1, ppsi_d(1), 1 )
|
||||
!
|
||||
IF ( gstart == 2 ) THEN
|
||||
!
|
||||
g1 = g_d(1); ppsi1 = ppsi_d(1); spsi1 = spsi_d(1)
|
||||
!
|
||||
es(1) = es(1) - spsi1 * g1
|
||||
es(2) = es(2) - spsi1 * ppsi1
|
||||
!
|
||||
END IF
|
||||
!
|
||||
CALL mp_sum( es , intra_bgrp_comm )
|
||||
!
|
||||
es(1) = es(1) / es(2)
|
||||
!
|
||||
es1 = es(1)
|
||||
!$cuf kernel do
|
||||
DO i=1, npwx
|
||||
g_d(i) = g_d(i) - es1 * ppsi_d(i)
|
||||
END DO
|
||||
!
|
||||
! ... e1 = <y| S P^2 PHP|y> / <y| S S P^2|y> ensures that
|
||||
! ... <g| S P^2|y> = 0
|
||||
!
|
||||
! ... orthogonalize to lowest eigenfunctions (already calculated)
|
||||
!
|
||||
! ... scg is used as workspace
|
||||
!
|
||||
CALL s_1psi_gpu( npwx, npw, g_d(1), scg_d(1) )
|
||||
!
|
||||
lagrange(1:m-1) = 0.d0
|
||||
call divide(inter_bgrp_comm,m-1,m_start,m_end); !write(*,*) m-1,m_start,m_end
|
||||
if(m_start.le.m_end) &
|
||||
CALL cgcudaDGEMV( 'T', npw2, m_end-m_start+1, 2.D0, psi_d(1,m_start), npw2, scg_d, 1, 0.D0, lagrange_d(m_start), 1 )
|
||||
if(m_start.le.m_end) lagrange( m_start:m_end ) = lagrange_d( m_start:m_end )
|
||||
CALL mp_sum( lagrange( 1:m-1 ), inter_bgrp_comm )
|
||||
IF ( gstart == 2 ) THEN
|
||||
psi_aux(1:m-1) = psi_d(1,1:m-1)
|
||||
scg1 = scg_d(1)
|
||||
lagrange(1:m-1) = lagrange(1:m-1) - psi_aux(1:m-1) * scg1
|
||||
END IF
|
||||
!
|
||||
CALL mp_sum( lagrange( 1:m-1 ), intra_bgrp_comm )
|
||||
!
|
||||
DO j = 1, ( m - 1 )
|
||||
!
|
||||
aux = lagrange(j)
|
||||
!$cuf kernel do(1)
|
||||
DO i = 1, npwx
|
||||
g_d(i) = g_d(i) - aux * psi_d(i,j)
|
||||
scg_d(i) = scg_d(i) - aux * psi_d(i,j)
|
||||
END DO
|
||||
!
|
||||
END DO
|
||||
!
|
||||
IF ( iter /= 1 ) THEN
|
||||
!
|
||||
! ... gg1 is <g(n+1)|S|g(n)> (used in Polak-Ribiere formula)
|
||||
!
|
||||
gg1 = 2.D0 * ksDdot( npw2, g_d(1), 1, g0_d(1), 1 )
|
||||
IF ( gstart == 2 ) THEN
|
||||
g1 = g_d(1) ; g01 = g0_d(1)
|
||||
gg1 = gg1 - g1 * g01
|
||||
END IF
|
||||
!
|
||||
CALL mp_sum( gg1 , intra_bgrp_comm )
|
||||
!
|
||||
END IF
|
||||
!
|
||||
! ... gg is <g(n+1)|S|g(n+1)>
|
||||
!
|
||||
!$cuf kernel do
|
||||
do i=1, npwx
|
||||
g0_d(i) = scg_d(i)
|
||||
end do
|
||||
!
|
||||
!$cuf kernel do
|
||||
do i=1, npw
|
||||
g0_d(i) = g0_d(i) * precondition_d(i)
|
||||
end do
|
||||
!
|
||||
gg = 2.D0 * ksDdot( npw2, g_d(1), 1, g0_d(1), 1 )
|
||||
IF ( gstart == 2 ) THEN
|
||||
g1 = g_d(1) ; g01 = g0_d(1)
|
||||
gg = gg - g1*g01
|
||||
END IF
|
||||
!
|
||||
CALL mp_sum( gg , intra_bgrp_comm )
|
||||
!
|
||||
IF ( iter == 1 ) THEN
|
||||
!
|
||||
! ... starting iteration, the conjugate gradient |cg> = |g>
|
||||
!
|
||||
gg0 = gg
|
||||
!
|
||||
!$cuf kernel do
|
||||
DO i=1, npwx
|
||||
cg_d(i) = g_d(i)
|
||||
! ... |cg> contains now the conjugate gradient
|
||||
! ... set Im[ cg(G=0) ] - needed for numerical stability
|
||||
IF ( gstart == 2 .and. i == 1 ) cg_d(1) = CMPLX( DBLE(cg_d(1)), 0.D0 ,kind=DP)
|
||||
END DO
|
||||
!
|
||||
ELSE
|
||||
!
|
||||
! ... |cg(n+1)> = |g(n+1)> + gamma(n) * |cg(n)>
|
||||
!
|
||||
! ... Polak-Ribiere formula :
|
||||
!
|
||||
gamma = ( gg - gg1 ) / gg0
|
||||
gg0 = gg
|
||||
!
|
||||
!$cuf kernel do
|
||||
do i=1, npwx
|
||||
cg_d(i) = g_d(i) + cg_d(i) * gamma
|
||||
end do
|
||||
!
|
||||
! ... The following is needed because <y(n+1)| S P^2 |cg(n+1)>
|
||||
! ... is not 0. In fact :
|
||||
! ... <y(n+1)| S P^2 |cg(n)> = sin(theta)*<cg(n)|S|cg(n)>
|
||||
!
|
||||
psi_norm = gamma * cg0 * sint
|
||||
!
|
||||
!$cuf kernel do
|
||||
do i=1, npwx
|
||||
cg_d(i) = cg_d(i) - psi_norm * psi_d(i,m)
|
||||
! ... |cg> contains now the conjugate gradient
|
||||
! ... set Im[ cg(G=0) ] - needed for numerical stability
|
||||
IF ( gstart == 2 .and. i == 1 ) cg_d(1) = CMPLX( DBLE(cg_d(1)), 0.D0 ,kind=DP)
|
||||
end do
|
||||
!
|
||||
END IF
|
||||
!
|
||||
! ... |scg> is S|cg>
|
||||
!
|
||||
CALL hs_1psi_gpu( npwx, npw, cg_d(1), ppsi_d(1), scg_d(1) )
|
||||
!
|
||||
cg0 = 2.D0 * ksDdot( npw2, cg_d(1), 1, scg_d(1), 1 )
|
||||
IF ( gstart == 2 ) THEN
|
||||
cg1 = cg_d(1) ; scg1 = scg_d(1)
|
||||
cg0 = cg0 - cg1*scg1
|
||||
END IF
|
||||
!
|
||||
CALL mp_sum( cg0 , intra_bgrp_comm )
|
||||
!
|
||||
cg0 = SQRT( cg0 )
|
||||
!
|
||||
! ... |ppsi> contains now HP|cg>
|
||||
! ... minimize <y(t)|PHP|y(t)> , where :
|
||||
! ... |y(t)> = cos(t)|y> + sin(t)/cg0 |cg>
|
||||
! ... Note that <y|P^2S|y> = 1, <y|P^2S|cg> = 0 ,
|
||||
! ... <cg|P^2S|cg> = cg0^2
|
||||
! ... so that the result is correctly normalized :
|
||||
! ... <y(t)|P^2S|y(t)> = 1
|
||||
!
|
||||
a0 = 4.D0 * ksDdot( npw2, psi_d(1,m), 1, ppsi_d(1), 1 )
|
||||
IF ( gstart == 2 ) THEN
|
||||
psi1 = psi_d(1,m)
|
||||
ppsi1 = ppsi_d(1)
|
||||
a0 = a0 - 2.D0 * psi1 * ppsi1
|
||||
END IF
|
||||
!
|
||||
a0 = a0 / cg0
|
||||
!
|
||||
CALL mp_sum( a0 , intra_bgrp_comm )
|
||||
!
|
||||
b0 = 2.D0 * ksDdot( npw2, cg_d(1), 1, ppsi_d(1), 1 )
|
||||
IF ( gstart == 2 ) THEN
|
||||
cg1 = cg_d(1)
|
||||
ppsi1 = ppsi_d(1)
|
||||
b0 = b0 - cg1 * ppsi1
|
||||
END IF
|
||||
!
|
||||
b0 = b0 / cg0**2
|
||||
!
|
||||
CALL mp_sum( b0 , intra_bgrp_comm )
|
||||
!
|
||||
e0 = e(m)
|
||||
!
|
||||
theta = 0.5D0 * ATAN( a0 / ( e0 - b0 ) )
|
||||
!
|
||||
cost = COS( theta )
|
||||
sint = SIN( theta )
|
||||
!
|
||||
cos2t = cost*cost - sint*sint
|
||||
sin2t = 2.D0*cost*sint
|
||||
!
|
||||
es(1) = 0.5D0 * ( ( e0 - b0 ) * cos2t + a0 * sin2t + e0 + b0 )
|
||||
es(2) = 0.5D0 * ( - ( e0 - b0 ) * cos2t - a0 * sin2t + e0 + b0 )
|
||||
!
|
||||
! ... there are two possible solutions, choose the minimum
|
||||
!
|
||||
IF ( es(2) < es(1) ) THEN
|
||||
!
|
||||
theta = theta + 0.5D0 * pi
|
||||
!
|
||||
cost = COS( theta )
|
||||
sint = SIN( theta )
|
||||
!
|
||||
END IF
|
||||
!
|
||||
! ... new estimate of the eigenvalue
|
||||
!
|
||||
e(m) = MIN( es(1), es(2) )
|
||||
!
|
||||
! ... upgrade |psi>
|
||||
!
|
||||
!$cuf kernel do
|
||||
do i=1, npwx
|
||||
psi_d(i,m) = cost * psi_d(i,m) + sint / cg0 * cg_d(i)
|
||||
end do
|
||||
!
|
||||
! ... here one could test convergence on the energy
|
||||
!
|
||||
IF ( ABS( e(m) - e0 ) < ethr_m ) EXIT iterate
|
||||
!
|
||||
! ... upgrade H|psi> and S|psi>
|
||||
!
|
||||
!$cuf kernel do
|
||||
do i=1, npwx
|
||||
spsi_d(i) = cost * spsi_d(i) + sint / cg0 * scg_d(i)
|
||||
!
|
||||
hpsi_d(i) = cost * hpsi_d(i) + sint / cg0 * ppsi_d(i)
|
||||
end do
|
||||
!
|
||||
END DO iterate
|
||||
!
|
||||
#if defined(__VERBOSE)
|
||||
IF ( iter >= maxter ) THEN
|
||||
WRITE(stdout,'("e(",i4,") = ",f12.6," eV (not converged after ",i3,&
|
||||
& " iterations)")') m, e(m)*13.6058, iter
|
||||
ELSE
|
||||
WRITE(stdout,'("e(",i4,") = ",f12.6," eV (",i3," iterations)")') &
|
||||
m, e(m)*13.6058, iter
|
||||
END IF
|
||||
FLUSH (stdout)
|
||||
#endif
|
||||
IF ( iter >= maxter ) notconv = notconv + 1
|
||||
!
|
||||
avg_iter = avg_iter + iter + 1
|
||||
!
|
||||
! ... reorder eigenvalues if they are not in the right order
|
||||
! ... ( this CAN and WILL happen in not-so-special cases )
|
||||
!
|
||||
IF ( m > 1 .AND. reorder ) THEN
|
||||
!
|
||||
IF ( e(m) - e(m-1) < - 2.D0 * ethr_m ) THEN
|
||||
!
|
||||
! ... if the last calculated eigenvalue is not the largest...
|
||||
!
|
||||
DO i = m - 2, 1, - 1
|
||||
!
|
||||
IF ( e(m) - e(i) > 2.D0 * ethr_m ) EXIT
|
||||
!
|
||||
END DO
|
||||
!
|
||||
i = i + 1
|
||||
!
|
||||
moved = moved + 1
|
||||
!
|
||||
! ... last calculated eigenvalue should be in the
|
||||
! ... i-th position: reorder
|
||||
!
|
||||
e0 = e(m)
|
||||
!
|
||||
!$cuf kernel do
|
||||
do l=1, npwx
|
||||
ppsi_d(l) = psi_d(l,m)
|
||||
end do
|
||||
!
|
||||
DO j = m, i + 1, - 1
|
||||
!
|
||||
e(j) = e(j-1)
|
||||
!
|
||||
!$cuf kernel do
|
||||
do l=1, npwx
|
||||
psi_d(l,j) = psi_d(l,j-1)
|
||||
end do
|
||||
!
|
||||
END DO
|
||||
!
|
||||
e(i) = e0
|
||||
!
|
||||
!$cuf kernel do
|
||||
do l=1, npwx
|
||||
psi_d(l,i) = ppsi_d(l)
|
||||
end do
|
||||
!
|
||||
! ... this procedure should be good if only a few inversions occur,
|
||||
! ... extremely inefficient if eigenvectors are often in bad order
|
||||
! ... ( but this should not happen )
|
||||
!
|
||||
END IF
|
||||
!
|
||||
END IF
|
||||
!
|
||||
END DO
|
||||
!
|
||||
avg_iter = avg_iter / DBLE( nbnd )
|
||||
e_d(1:nbnd) = e(1:nbnd)
|
||||
!
|
||||
DEALLOCATE( lagrange )
|
||||
DEALLOCATE( lagrange_d )
|
||||
DEALLOCATE( e )
|
||||
DEALLOCATE( psi_aux )
|
||||
DEALLOCATE( ppsi_d )
|
||||
DEALLOCATE( g0_d )
|
||||
DEALLOCATE( cg_d )
|
||||
DEALLOCATE( g_d )
|
||||
DEALLOCATE( hpsi_d )
|
||||
DEALLOCATE( scg_d )
|
||||
DEALLOCATE( spsi_d )
|
||||
!
|
||||
CALL stop_clock( 'rcgdiagg' )
|
||||
!
|
||||
RETURN
|
||||
!
|
||||
END SUBROUTINE rcgdiagg_gpu
|
|
@ -11,6 +11,8 @@ rotate_HSpsi_k.o \
|
|||
rotate_wfc_gamma.o \
|
||||
rotate_wfc_k.o
|
||||
|
||||
-include make.gpu
|
||||
|
||||
all : libdense.a
|
||||
|
||||
|
||||
|
|
|
@ -0,0 +1,19 @@
|
|||
# Makefile for CG GPU
|
||||
|
||||
DENSE += \
|
||||
rotate_wfc_k_gpu.o \
|
||||
rotate_wfc_gamma_gpu.o \
|
||||
rotate_HSpsi_k_gpu.o \
|
||||
rotate_HSpsi_gamma_gpu.o
|
||||
|
||||
rotate_HSpsi_gamma_gpu.o : ../../UtilXlib/mp.o
|
||||
rotate_HSpsi_gamma_gpu.o : ../../UtilXlib/mp_bands_util.o
|
||||
rotate_HSpsi_gamma_gpu.o : ../../UtilXlib/util_param.o
|
||||
rotate_wfc_gamma_gpu.o : ../../LAXlib/la_module.o
|
||||
rotate_wfc_gamma_gpu.o : ../../UtilXlib/mp.o
|
||||
rotate_wfc_gamma_gpu.o : ../../UtilXlib/mp_bands_util.o
|
||||
rotate_wfc_gamma_gpu.o : ../../UtilXlib/util_param.o
|
||||
rotate_wfc_k_gpu.o : ../../LAXlib/la_module.o
|
||||
rotate_wfc_k_gpu.o : ../../UtilXlib/mp.o
|
||||
rotate_wfc_k_gpu.o : ../../UtilXlib/mp_bands_util.o
|
||||
rotate_wfc_k_gpu.o : ../../UtilXlib/util_param.o
|
|
@ -0,0 +1,222 @@
|
|||
!
|
||||
! Copyright (C) 2003-2007 Quantum ESPRESSO group
|
||||
! This file is distributed under the terms of the
|
||||
! GNU General Public License. See the file `License'
|
||||
! in the root directory of the present distribution,
|
||||
! or http://www.gnu.org/copyleft/gpl.txt .
|
||||
!
|
||||
!
|
||||
#define ZERO ( 0.D0, 0.D0 )
|
||||
!----------------------------------------------------------------------------
|
||||
SUBROUTINE rotate_HSpsi_gamma_gpu( npwx, npw, nstart, nbnd, psi_d, hpsi_d, overlap, spsi_d, e_d )
|
||||
!----------------------------------------------------------------------------
|
||||
!
|
||||
! ... Serial version of rotate_wfc for Gamma-only calculations
|
||||
! ... This version assumes real wavefunctions (k=0) with only
|
||||
! ... half plane waves stored: psi(-G)=psi*(G), except G=0
|
||||
!
|
||||
! GPU version by Ivan Carnimeo
|
||||
!
|
||||
#if defined (__CUDA)
|
||||
USE cudafor
|
||||
#endif
|
||||
USE util_param, ONLY : DP
|
||||
USE mp_bands_util, ONLY : intra_bgrp_comm, inter_bgrp_comm, root_bgrp_id, nbgrp, my_bgrp_id, &
|
||||
me_bgrp, root_bgrp
|
||||
USE mp_bands_util, ONLY : gstart ! index of the first nonzero G
|
||||
USE mp, ONLY : mp_sum, mp_barrier, mp_allgather, mp_type_create_column_section, mp_type_free
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
INCLUDE 'laxlib.fh'
|
||||
!
|
||||
! ... I/O variables
|
||||
!
|
||||
INTEGER, INTENT(IN) :: &
|
||||
npw, & ! dimension of the matrices (psi,Hpsi,Spsi) to be rotated
|
||||
npwx, & ! leading dimension of the wavefunction-related matrices
|
||||
nstart, & ! input number of states
|
||||
nbnd ! output number of states
|
||||
LOGICAL, INTENT(IN) :: overlap ! if .FALSE. : spsi is not needed (and not used)
|
||||
REAL(DP), INTENT(OUT) :: e_d(nbnd) ! eigenvalues of the reduced H matrix
|
||||
COMPLEX(DP), INTENT(INOUT) :: psi_d(npwx,nstart), hpsi_d(npwx,nstart) ! input and output psi, Hpsi,
|
||||
COMPLEX(DP), INTENT(INOUT), OPTIONAL :: spsi_d(npwx,nstart) ! ... and optionnally Spsi
|
||||
!
|
||||
! ... local variables
|
||||
!
|
||||
INTEGER :: kdim, kdmx
|
||||
INTEGER :: n_start, n_end, my_n, recv_counts(nbgrp), displs(nbgrp), column_type
|
||||
INTEGER :: ii, jj ! indexes for cuf kernel loops
|
||||
!
|
||||
! ... device variables
|
||||
!
|
||||
COMPLEX(DP), ALLOCATABLE :: aux_d(:,:)
|
||||
REAL(DP), ALLOCATABLE :: hh_d(:,:), ss_d(:,:), vv_d(:,:)
|
||||
REAL(DP), ALLOCATABLE :: en_d(:)
|
||||
#if defined (__CUDA)
|
||||
attributes (device) :: psi_d, hpsi_d, spsi_d, e_d
|
||||
attributes (device) :: aux_d, hh_d, ss_d, vv_d, en_d
|
||||
#endif
|
||||
!
|
||||
IF ( gstart == -1 ) CALL errore( 'rotHSw', 'gstart variable not initialized', 1 )
|
||||
IF ( overlap .AND..NOT.present(spsi_d) ) call errore( 'rotHSw','spsi array needed with overlap=.TRUE.',1)
|
||||
!
|
||||
call start_clock('rotHSw'); !write(*,*) 'start rotHSw' ; FLUSH(6)
|
||||
!
|
||||
! ... set Im[ psi(G=0) ] etc - needed for numerical stability
|
||||
!
|
||||
IF ( gstart == 2 ) then
|
||||
!$cuf kernel do(1)
|
||||
DO ii = 1, nstart
|
||||
psi_d (1,ii) = CMPLX( DBLE( psi_d (1,ii) ), 0.D0,kind=DP)
|
||||
hpsi_d(1,ii) = CMPLX( DBLE( hpsi_d(1,ii) ), 0.D0,kind=DP)
|
||||
END DO
|
||||
IF (overlap) THEN
|
||||
!$cuf kernel do(1)
|
||||
DO ii = 1, nstart
|
||||
spsi_d(1,ii) = CMPLX( DBLE( spsi_d(1,ii) ), 0.D0,kind=DP)
|
||||
END DO
|
||||
END IF
|
||||
END IF
|
||||
|
||||
kdim = 2 * npw
|
||||
kdmx = 2 * npwx
|
||||
!
|
||||
ALLOCATE( hh_d( nstart, nstart ) )
|
||||
ALLOCATE( ss_d( nstart, nstart ) )
|
||||
ALLOCATE( vv_d( nstart, nstart ) )
|
||||
ALLOCATE( en_d( nstart ) )
|
||||
!
|
||||
! ... Set up the Hamiltonian and Overlap matrix on the subspace :
|
||||
!
|
||||
! ... H_ij = <psi_i| H |psi_j> S_ij = <psi_i| S |psi_j>
|
||||
!
|
||||
call start_clock('rotHSw:hc'); !write(*,*) 'start rotHSw:hc' ; FLUSH(6)
|
||||
CALL mp_type_create_column_section(hh_d(1,1), 0, nstart, nstart, column_type)
|
||||
CALL divide_all(inter_bgrp_comm,nstart,n_start,n_end,recv_counts,displs)
|
||||
my_n = n_end - n_start + 1; !write (*,*) nstart,n_start,n_end
|
||||
if (n_start .le. n_end) &
|
||||
CALL gpu_DGEMM( 'T','N', nstart,my_n,kdim, 2.D0, psi_d,kdmx, hpsi_d(1,n_start),kdmx, 0.D0, hh_d(1,n_start),nstart )
|
||||
IF ( gstart == 2 ) call gpu_DGER( nstart, my_n, -1.D0, psi_d,kdmx, hpsi_d(1,n_start),kdmx, hh_d(1,n_start),nstart )
|
||||
call start_clock('rotHSw:hc:s1')
|
||||
CALL mp_sum( hh_d(:,n_start:n_end), intra_bgrp_comm ) ! this section only needs to be collected inside bgrp
|
||||
call stop_clock('rotHSw:hc:s1')
|
||||
! call start_clock('rotHSw:hc:b1'); CALL mp_barrier( inter_bgrp_comm ); call stop_clock('rotHSw:hc:b1')
|
||||
call start_clock('rotHSw:hc:s2')
|
||||
CALL mp_allgather(hh_d, column_type, recv_counts, displs, inter_bgrp_comm)
|
||||
call stop_clock('rotHSw:hc:s2')
|
||||
!
|
||||
IF ( overlap ) THEN
|
||||
!
|
||||
if (n_start .le. n_end) &
|
||||
CALL gpu_DGEMM('T','N', nstart,my_n,kdim, 2.D0, psi_d,kdmx, spsi_d(1,n_start),kdmx, 0.D0, ss_d(1,n_start),nstart)
|
||||
IF ( gstart == 2 ) CALL gpu_DGER(nstart, my_n, -1.D0, psi_d,kdmx, spsi_d(1,n_start),kdmx, ss_d(1,n_start),nstart)
|
||||
!
|
||||
ELSE
|
||||
!
|
||||
if (n_start .le. n_end) &
|
||||
CALL gpu_DGEMM('T','N', nstart,my_n,kdim, 2.D0, psi_d,kdmx, psi_d(1,n_start),kdmx, 0.D0, ss_d(1,n_start),nstart)
|
||||
IF ( gstart == 2 ) CALL gpu_DGER(nstart, my_n, -1.D0, psi_d,kdmx, psi_d(1,n_start),kdmx, ss_d(1,n_start),nstart)
|
||||
!
|
||||
END IF
|
||||
call start_clock('rotHSw:hc:s3')
|
||||
CALL mp_sum( ss_d(:,n_start:n_end), intra_bgrp_comm ) ! this section only needs to be collected inside bgrp
|
||||
call stop_clock('rotHSw:hc:s3')
|
||||
! call start_clock('rotHSw:hc:b2'); CALL mp_barrier( inter_bgrp_comm ); call stop_clock('rotHSw:hc:b2')
|
||||
call start_clock('rotHSw:hc:s4')
|
||||
CALL mp_allgather(ss_d, column_type, recv_counts, displs, inter_bgrp_comm)
|
||||
call stop_clock('rotHSw:hc:s4')
|
||||
|
||||
CALL mp_type_free( column_type )
|
||||
|
||||
call stop_clock('rotHSw:hc'); !write(*,*) 'stop rotHSw:hc' ; FLUSH(6)
|
||||
!
|
||||
! ... Diagonalize
|
||||
!
|
||||
call start_clock('rotHSw:diag'); !write(*,*) 'start rotHSw:diag' ; FLUSH(6)
|
||||
CALL diaghg( nstart, nbnd, hh_d, ss_d, nstart, en_d, vv_d, me_bgrp, root_bgrp, intra_bgrp_comm )
|
||||
!$cuf kernel do(1)
|
||||
DO ii = 1, nbnd
|
||||
e_d(ii) = en_d(ii)
|
||||
END DO
|
||||
call stop_clock('rotHSw:diag'); !write(*,*) 'stop rotHSw:diag' ; FLUSH(6)
|
||||
!
|
||||
! ... update the basis set
|
||||
!
|
||||
call start_clock('rotHSw:evc'); !write(*,*) 'start rotHSw:evc' ; FLUSH(6)
|
||||
|
||||
CALL mp_type_create_column_section(psi_d(1,1), 0, npwx, npwx, column_type)
|
||||
CALL divide_all(inter_bgrp_comm,nbnd,n_start,n_end,recv_counts,displs)
|
||||
|
||||
ALLOCATE( aux_d ( npwx, nbnd ) )
|
||||
|
||||
my_n = n_end - n_start + 1; !write (*,*) nstart,n_start,n_end
|
||||
if (n_start .le. n_end) &
|
||||
CALL gpu_DGEMM( 'N','N', kdim,my_n,nstart, 1.D0, psi_d,kdmx,vv_d(1,n_start),nstart, 0.D0, aux_d(1,n_start),kdmx )
|
||||
!$cuf kernel do(2)
|
||||
DO ii = 1, npwx
|
||||
DO jj = n_start, n_end
|
||||
psi_d(ii,jj) = aux_d(ii,jj)
|
||||
END DO
|
||||
END DO
|
||||
! call start_clock('rotHSw:ev:b3'); CALL mp_barrier( inter_bgrp_comm ); call stop_clock('rotHSw:ev:b3')
|
||||
call start_clock('rotHSw:ev:s5')
|
||||
CALL mp_allgather(psi_d(:,1:nbnd), column_type, recv_counts, displs, inter_bgrp_comm)
|
||||
call stop_clock('rotHSw:ev:s5')
|
||||
if (n_start .le. n_end) &
|
||||
CALL gpu_DGEMM( 'N','N', kdim,my_n,nstart, 1.D0,hpsi_d,kdmx,vv_d(1,n_start),nstart, 0.D0, aux_d(1,n_start),kdmx )
|
||||
!$cuf kernel do (2)
|
||||
DO ii = 1, npwx
|
||||
DO jj = n_start, n_end
|
||||
hpsi_d(ii,jj) = aux_d(ii,jj)
|
||||
END DO
|
||||
END DO
|
||||
! call start_clock('rotHSw:ev:b4'); CALL mp_barrier( inter_bgrp_comm ); call stop_clock('rotHSw:ev:b4')
|
||||
call start_clock('rotHSw:ev:s6')
|
||||
CALL mp_allgather(hpsi_d(:,1:nbnd), column_type, recv_counts, displs, inter_bgrp_comm)
|
||||
call stop_clock('rotHSw:ev:s6')
|
||||
|
||||
IF (overlap) THEN
|
||||
|
||||
if (n_start .le. n_end) &
|
||||
CALL gpu_DGEMM( 'N','N', kdim,my_n,nstart, 1.D0,spsi_d,kdmx,vv_d(1,n_start),nstart, 0.D0, aux_d(1,n_start),kdmx )
|
||||
!$cuf kernel do (2)
|
||||
DO ii = 1, npwx
|
||||
DO jj = n_start, n_end
|
||||
spsi_d(ii,jj) = aux_d(ii,jj)
|
||||
END DO
|
||||
END DO
|
||||
! call start_clock('rotHSw:ev:b5'); CALL mp_barrier( inter_bgrp_comm ); call stop_clock('rotHSw:ev:b5')
|
||||
call start_clock('rotHSw:ev:s7')
|
||||
CALL mp_allgather(spsi_d(:,1:nbnd), column_type, recv_counts, displs, inter_bgrp_comm)
|
||||
call stop_clock('rotHSw:ev:s7')
|
||||
|
||||
ELSE IF (present(spsi_d)) THEN
|
||||
|
||||
!$cuf kernel do (2)
|
||||
DO ii = 1, npwx
|
||||
DO jj = 1, nbnd
|
||||
spsi_d(ii,jj) = psi_d(ii,jj)
|
||||
END DO
|
||||
END DO
|
||||
|
||||
END IF
|
||||
|
||||
DEALLOCATE( aux_d )
|
||||
CALL mp_type_free( column_type )
|
||||
|
||||
call stop_clock('rotHSw:evc'); !write(*,*) 'stop rotHSw:evc' ; FLUSH(6)
|
||||
!
|
||||
DEALLOCATE( en_d )
|
||||
DEALLOCATE( vv_d )
|
||||
DEALLOCATE( ss_d )
|
||||
DEALLOCATE( hh_d )
|
||||
call stop_clock('rotHSw'); !write(*,*) 'stop rotHSw' ; FLUSH(6)
|
||||
!call print_clock('rotHSw')
|
||||
!call print_clock('rotHSw:hc')
|
||||
!call print_clock('rotHSw:diag')
|
||||
!call print_clock('rotHSw:evc')
|
||||
!
|
||||
RETURN
|
||||
!
|
||||
END SUBROUTINE rotate_HSpsi_gamma_gpu
|
|
@ -0,0 +1,286 @@
|
|||
!
|
||||
! Copyright (C) 2003-2007 Quantum ESPRESSO group
|
||||
! This file is distributed under the terms of the
|
||||
! GNU General Public License. See the file `License'
|
||||
! in the root directory of the present distribution,
|
||||
! or http://www.gnu.org/copyleft/gpl.txt .
|
||||
!
|
||||
!
|
||||
#define ZERO ( 0.D0, 0.D0 )
|
||||
!----------------------------------------------------------------------------
|
||||
SUBROUTINE rotate_HSpsi_k_gpu( npwx, npw, nstart, nbnd, npol, psi_d, hpsi_d, overlap, spsi_d, e_d )
|
||||
!----------------------------------------------------------------------------
|
||||
!
|
||||
! ... Serial version of rotate_wfc for colinear, k-point calculations
|
||||
!
|
||||
#if defined (__CUDA)
|
||||
USE cudafor
|
||||
#endif
|
||||
USE util_param, ONLY : DP
|
||||
USE mp_bands_util, ONLY : intra_bgrp_comm, inter_bgrp_comm, root_bgrp_id, nbgrp, my_bgrp_id, &
|
||||
me_bgrp, root_bgrp
|
||||
USE mp, ONLY : mp_sum, mp_barrier, mp_allgather, mp_type_create_column_section, mp_type_free
|
||||
USE device_memcpy_m, ONLY: dev_memcpy, dev_memset
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
INCLUDE 'laxlib.fh'
|
||||
!
|
||||
! ... I/O variables
|
||||
!
|
||||
INTEGER, INTENT(IN) :: &
|
||||
npw, & ! dimension of the matrices (psi,Hpsi,Spsi) to be rotated
|
||||
npwx, & ! leading dimension of the wavefunction-related matrices
|
||||
nstart, & ! input number of states
|
||||
nbnd, & ! output number of states
|
||||
npol ! number of spin polarizations
|
||||
COMPLEX(DP), INTENT(INOUT) :: psi_d(npwx*npol,nstart), hpsi_d(npwx*npol,nstart) ! input and output psi, Hpsi,
|
||||
COMPLEX(DP), INTENT(INOUT), OPTIONAL :: spsi_d(npwx*npol,nstart) ! ... and optionnally Spsi
|
||||
LOGICAL, INTENT(IN) :: overlap ! if .FALSE. : spsi is not needed (and not used)
|
||||
REAL(DP), INTENT(OUT) :: e_d(nbnd) ! eigenvalues of the reduced H matrix
|
||||
!
|
||||
! ... local variables
|
||||
!
|
||||
INTEGER :: kdim, kdmx
|
||||
INTEGER :: n_start, n_end, my_n, recv_counts(nbgrp), displs(nbgrp), column_type
|
||||
INTEGER :: ii, jj ! indexes for cuf kernel loops
|
||||
!
|
||||
! ... device variables
|
||||
!
|
||||
COMPLEX(DP), ALLOCATABLE :: aux_d(:,:)
|
||||
COMPLEX(DP), ALLOCATABLE :: hh_d(:,:), ss_d(:,:), vv_d(:,:)
|
||||
REAL(DP), ALLOCATABLE :: en_d(:)
|
||||
#if defined (__CUDA)
|
||||
attributes(device) :: aux_d, psi_d, hpsi_d, spsi_d
|
||||
attributes(device) :: hh_d, ss_d, vv_d, en_d, e_d
|
||||
#endif
|
||||
!
|
||||
IF ( overlap .AND..NOT.present(spsi_d) ) call errore( 'rotHSw','spsi_d array needed with overlap=.TRUE.',1)
|
||||
!
|
||||
call start_clock('rotHSw'); !write(*,*) 'start rotHSw' ; FLUSH(6)
|
||||
!
|
||||
if (npol == 2 .and. npw < npwx ) then ! pack wfcs so that pw's are contiguous
|
||||
call start_clock('rotHSw:move'); !write(*,*) 'start rotHSw:move' ; FLUSH(6)
|
||||
ALLOCATE ( aux_d ( npwx, nstart ) )
|
||||
!$cuf kernel do(2)
|
||||
DO ii = 1, npw
|
||||
DO jj = 1, nstart
|
||||
aux_d(ii,jj) = psi_d (npwx+ii,jj)
|
||||
END DO
|
||||
END DO
|
||||
!$cuf kernel do(2)
|
||||
DO ii = 1, npw
|
||||
DO jj = 1, nstart
|
||||
psi_d (npw+ii,jj) = aux_d(ii,jj)
|
||||
END DO
|
||||
END DO
|
||||
!$cuf kernel do(2)
|
||||
DO ii = 1, npw
|
||||
DO jj = 1, nstart
|
||||
aux_d(ii,jj) = hpsi_d(npwx+ii,jj)
|
||||
END DO
|
||||
END DO
|
||||
!$cuf kernel do(2)
|
||||
DO ii = 1, npw
|
||||
DO jj = 1, nstart
|
||||
hpsi_d(npw+ii,jj) = aux_d(ii,jj)
|
||||
END DO
|
||||
END DO
|
||||
!$cuf kernel do(2)
|
||||
DO ii = 1, npw
|
||||
DO jj = 1, nstart
|
||||
aux_d(ii,jj) = spsi_d(npwx+ii,jj)
|
||||
END DO
|
||||
END DO
|
||||
!$cuf kernel do(2)
|
||||
DO ii = 1, npw
|
||||
DO jj = 1, nstart
|
||||
spsi_d(npw+ii,jj) = aux_d(ii,jj)
|
||||
END DO
|
||||
END DO
|
||||
DEALLOCATE( aux_d )
|
||||
call stop_clock('rotHSw:move'); !write(*,*) 'stop rotHSw:move' ; FLUSH(6)
|
||||
end if
|
||||
|
||||
kdim = npol * npw
|
||||
kdmx = npol * npwx
|
||||
!
|
||||
ALLOCATE( hh_d( nstart, nstart ) )
|
||||
ALLOCATE( ss_d( nstart, nstart ) )
|
||||
ALLOCATE( vv_d( nstart, nstart ) )
|
||||
ALLOCATE( en_d( nstart ) )
|
||||
!
|
||||
! ... Set up the Hamiltonian and Overlap matrix on the subspace :
|
||||
!
|
||||
! ... H_ij = <psi_i| H |psi_j> S_ij = <psi_i| S |psi_j>
|
||||
!
|
||||
call start_clock('rotHSw:hc'); !write(*,*) 'start rotHSw:hc' ; FLUSH(6)
|
||||
|
||||
CALL mp_type_create_column_section(hh_d(1,1), 0, nstart, nstart, column_type)
|
||||
CALL divide_all(inter_bgrp_comm,nstart,n_start,n_end,recv_counts,displs)
|
||||
|
||||
my_n = n_end - n_start + 1; !write (*,*) nstart,n_start,n_end
|
||||
if (n_start .le. n_end) &
|
||||
CALL gpu_ZGEMM( 'C','N', nstart, my_n, kdim, (1.D0,0.D0), psi_d, kdmx, hpsi_d(1,n_start), kdmx, (0.D0,0.D0), &
|
||||
hh_d(1,n_start), nstart )
|
||||
call start_clock('rotHSw:hc:s1')
|
||||
CALL mp_sum( hh_d(:,n_start:n_end), intra_bgrp_comm ) ! this section only needs to be collected inside bgrp
|
||||
call stop_clock('rotHSw:hc:s1')
|
||||
!call start_clock('rotHSw:hc:b1'); CALL mp_barrier( inter_bgrp_comm ); call stop_clock('rotHSw:hc:b1')
|
||||
call start_clock('rotHSw:hc:s2')
|
||||
CALL mp_allgather(hh_d, column_type, recv_counts, displs, inter_bgrp_comm)
|
||||
call stop_clock('rotHSw:hc:s2')
|
||||
!
|
||||
IF ( overlap ) THEN
|
||||
!
|
||||
if (n_start .le. n_end) &
|
||||
CALL gpu_ZGEMM( 'C','N', nstart, my_n, kdim, (1.D0,0.D0), psi_d, kdmx, spsi_d(1,n_start), kdmx, &
|
||||
(0.D0,0.D0), ss_d(1,n_start), nstart )
|
||||
!
|
||||
ELSE
|
||||
!
|
||||
if (n_start .le. n_end) &
|
||||
CALL gpu_ZGEMM( 'C','N', nstart, my_n, kdim, (1.D0,0.D0), psi_d, kdmx, psi_d(1,n_start), kdmx, &
|
||||
(0.D0,0.D0), ss_d(1,n_start), nstart )
|
||||
!
|
||||
END IF
|
||||
call start_clock('rotHSw:hc:s3')
|
||||
CALL mp_sum( ss_d(:,n_start:n_end), intra_bgrp_comm ) ! this section only needs to be collected inside bgrp
|
||||
call stop_clock('rotHSw:hc:s3')
|
||||
!call start_clock('rotHSw:hc:b2'); CALL mp_barrier( inter_bgrp_comm ); call stop_clock('rotHSw:hc:b2')
|
||||
call start_clock('rotHSw:hc:s4')
|
||||
CALL mp_allgather(ss_d, column_type, recv_counts, displs, inter_bgrp_comm)
|
||||
call stop_clock('rotHSw:hc:s4')
|
||||
|
||||
CALL mp_type_free( column_type )
|
||||
|
||||
call stop_clock('rotHSw:hc'); !write(*,*) 'stop rotHSw:hc' ; FLUSH(6)
|
||||
!
|
||||
! ... Diagonalize
|
||||
!
|
||||
call start_clock('rotHSw:diag'); !write(*,*) 'start rotHSw:diag' ; FLUSH(6)
|
||||
CALL diaghg( nstart, nbnd, hh_d, ss_d, nstart, en_d, vv_d, me_bgrp, root_bgrp, intra_bgrp_comm )
|
||||
CALL dev_memcpy(e_d, en_d, [1,nbnd])
|
||||
call stop_clock('rotHSw:diag'); !write(*,*) 'stop rotHSw:diag' ; FLUSH(6)
|
||||
!
|
||||
! ... update the basis set
|
||||
!
|
||||
call start_clock('rotHSw:evc'); !write(*,*) 'start rotHSw:evc' ; FLUSH(6)
|
||||
|
||||
CALL mp_type_create_column_section(psi_d(1,1), 0, npwx, npwx, column_type)
|
||||
CALL divide_all(inter_bgrp_comm,nbnd,n_start,n_end,recv_counts,displs)
|
||||
|
||||
ALLOCATE( aux_d ( kdmx, nbnd ) )
|
||||
|
||||
my_n = n_end - n_start + 1; !write (*,*) nstart,n_start,n_end
|
||||
if (n_start .le. n_end) &
|
||||
CALL gpu_ZGEMM( 'N','N', kdim, my_n, nstart, (1.D0,0.D0), psi_d, kdmx, vv_d(1,n_start), nstart, &
|
||||
(0.D0,0.D0), aux_d(1,n_start), kdmx )
|
||||
|
||||
CALL dev_memcpy(psi_d, aux_d, [1, kdmx], 1, [n_start,n_end])
|
||||
!call start_clock('rotHSw:ev:b3'); CALL mp_barrier( inter_bgrp_comm ); call stop_clock('rotHSw:ev:b3')
|
||||
call start_clock('rotHSw:ev:s5')
|
||||
CALL mp_allgather(psi_d(:,1:nbnd), column_type, recv_counts, displs, inter_bgrp_comm)
|
||||
call stop_clock('rotHSw:ev:s5')
|
||||
|
||||
if (n_start .le. n_end) &
|
||||
CALL gpu_ZGEMM( 'N','N', kdim, my_n, nstart, (1.D0,0.D0), hpsi_d, kdmx, vv_d(1,n_start), nstart, &
|
||||
(0.D0,0.D0), aux_d(1,n_start), kdmx )
|
||||
CALL dev_memcpy(hpsi_d, aux_d, [1, kdmx], 1, [n_start,n_end]) !call start_clock('rotHSw:ev:b4'); CALL mp_barrier( inter_bgrp_comm ); call stop_clock('rotHSw:ev:b4')
|
||||
call start_clock('rotHSw:ev:s6')
|
||||
CALL mp_allgather(hpsi_d(:,1:nbnd), column_type, recv_counts, displs, inter_bgrp_comm)
|
||||
call stop_clock('rotHSw:ev:s6')
|
||||
|
||||
IF (overlap) THEN
|
||||
if (n_start .le. n_end) &
|
||||
CALL gpu_ZGEMM( 'N','N', kdim, my_n, nstart, (1.D0,0.D0), spsi_d, kdmx, vv_d(1,n_start), &
|
||||
nstart, (0.D0,0.D0), aux_d(1,n_start), kdmx )
|
||||
CALL dev_memcpy(spsi_d, aux_d, [1, kdmx], 1, [n_start,n_end]) !call start_clock('rotHSw:ev:b5'); CALL mp_barrier( inter_bgrp_comm ); call stop_clock('rotHSw:ev:b5')
|
||||
call start_clock('rotHSw:ev:s7')
|
||||
CALL mp_allgather(spsi_d(:,1:nbnd), column_type, recv_counts, displs, inter_bgrp_comm)
|
||||
call stop_clock('rotHSw:ev:s7')
|
||||
|
||||
ELSE IF (present(spsi_d)) THEN
|
||||
|
||||
CALL dev_memcpy(spsi_d, psi_d, [1, kdmx], 1, [n_start,n_end])
|
||||
END IF
|
||||
|
||||
DEALLOCATE( aux_d )
|
||||
CALL mp_type_free( column_type )
|
||||
call stop_clock('rotHSw:evc'); !write(*,*) 'stop rotHSw:evc' ; FLUSH(6)
|
||||
!
|
||||
DEALLOCATE( vv_d )
|
||||
DEALLOCATE( ss_d )
|
||||
DEALLOCATE( hh_d )
|
||||
DEALLOCATE( en_d )
|
||||
!
|
||||
call stop_clock('rotHSw'); !write(*,*) 'stop rotHSw' ; FLUSH(6)
|
||||
!call print_clock('rotHSw')
|
||||
!call print_clock('rotHSw:hc')
|
||||
!call print_clock('rotHSw:diag')
|
||||
!call print_clock('rotHSw:evc')
|
||||
!
|
||||
if (npol== 2 .and. npw < npwx ) then ! unpack wfcs to its original order
|
||||
call start_clock('rotHSw:move'); !write(*,*) 'start rotHSw:move' ; FLUSH(6)
|
||||
ALLOCATE ( aux_d ( npwx, nbnd ) )
|
||||
!$cuf kernel do(2)
|
||||
DO ii = 1, npw
|
||||
DO jj = 1, nbnd
|
||||
aux_d(ii,jj) = psi_d (npw+ii,jj)
|
||||
END DO
|
||||
END DO
|
||||
!$cuf kernel do(2)
|
||||
DO ii = 1, npw
|
||||
DO jj = 1, nbnd
|
||||
psi_d(npwx+ii,jj) = aux_d(ii,jj)
|
||||
END DO
|
||||
END DO
|
||||
!$cuf kernel do(2)
|
||||
DO ii = 1, npw
|
||||
DO jj = 1, nbnd
|
||||
aux_d(ii,jj) = hpsi_d(npw+ii,jj)
|
||||
END DO
|
||||
END DO
|
||||
!$cuf kernel do(2)
|
||||
DO ii = 1, npw
|
||||
DO jj = 1, nbnd
|
||||
hpsi_d(npwx+ii,jj) = aux_d(ii,jj)
|
||||
END DO
|
||||
END DO
|
||||
!$cuf kernel do(2)
|
||||
DO ii = 1, npw
|
||||
DO jj = 1, nbnd
|
||||
aux_d(ii,jj) = spsi_d(npw+ii,jj)
|
||||
END DO
|
||||
END DO
|
||||
!$cuf kernel do(2)
|
||||
DO ii = 1, npw
|
||||
DO jj = 1, nbnd
|
||||
spsi_d(npwx+ii,jj) = aux_d(ii,jj)
|
||||
END DO
|
||||
END DO
|
||||
!$cuf kernel do(2)
|
||||
DO ii = npw+1, npwx
|
||||
DO jj = 1, nbnd
|
||||
psi_d(ii,jj) = ZERO
|
||||
END DO
|
||||
END DO
|
||||
!$cuf kernel do(2)
|
||||
DO ii = npw+1, npwx
|
||||
DO jj = 1, nbnd
|
||||
hpsi_d(ii,jj) = ZERO
|
||||
END DO
|
||||
END DO
|
||||
!$cuf kernel do(2)
|
||||
DO ii = npw+1, npwx
|
||||
DO jj = 1, nbnd
|
||||
spsi_d(ii,jj) = ZERO
|
||||
END DO
|
||||
END DO
|
||||
DEALLOCATE( aux_d )
|
||||
call stop_clock('rotHSw:move'); !write(*,*) 'stop rotHSw:move' ; FLUSH(6)
|
||||
end if
|
||||
!
|
||||
RETURN
|
||||
!
|
||||
END SUBROUTINE rotate_HSpsi_k_gpu
|
|
@ -0,0 +1,198 @@
|
|||
!
|
||||
! Copyright (C) 2003-2007 Quantum ESPRESSO group
|
||||
! This file is distributed under the terms of the
|
||||
! GNU General Public License. See the file `License'
|
||||
! in the root directory of the present distribution,
|
||||
! or http://www.gnu.org/copyleft/gpl.txt .
|
||||
!
|
||||
!
|
||||
!----------------------------------------------------------------------------
|
||||
SUBROUTINE rotate_wfc_gamma_gpu( h_psi_gpu, s_psi_gpu, overlap, &
|
||||
npwx, npw, nstart, nbnd, psi_d, evc_d, e_d )
|
||||
!----------------------------------------------------------------------------
|
||||
!
|
||||
! ... Serial version of rotate_wfc for Gamma-only calculations
|
||||
! ... This version assumes real wavefunctions (k=0) with only
|
||||
! ... half plane waves stored: psi_d(-G)=psi_d*(G), except G=0
|
||||
!
|
||||
USE LAXlib, ONLY : diaghg
|
||||
#if defined(__CUDA)
|
||||
USE cudafor
|
||||
USE cublas
|
||||
#else
|
||||
#define cublasDGEMM dgemm
|
||||
#endif
|
||||
USE util_param, ONLY : DP
|
||||
USE mp_bands_util, ONLY : intra_bgrp_comm, inter_bgrp_comm, root_bgrp_id, &
|
||||
nbgrp, my_bgrp_id, me_bgrp, root_bgrp
|
||||
USE mp_bands_util, ONLY : gstart ! index of the first nonzero G
|
||||
USE mp, ONLY : mp_sum
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
!INCLUDE 'laxlib.fh'
|
||||
!
|
||||
! ... I/O variables
|
||||
!
|
||||
INTEGER :: npw, npwx, nstart, nbnd
|
||||
! dimension of the matrix to be diagonalized
|
||||
! leading dimension of matrix psi_d, as declared in the calling pgm unit
|
||||
! input number of states
|
||||
! output number of states
|
||||
LOGICAL :: overlap
|
||||
! if .FALSE. : S|psi_d> not needed
|
||||
COMPLEX(DP) :: psi_d(npwx,nstart), evc_d(npwx,nbnd)
|
||||
! input and output eigenvectors (may overlap)
|
||||
REAL(DP) :: e_d(nbnd)
|
||||
! eigenvalues
|
||||
#if defined(__CUDA)
|
||||
attributes(DEVICE) :: psi_d, evc_d, e_d
|
||||
#endif
|
||||
!
|
||||
! ... local variables
|
||||
!
|
||||
INTEGER :: npw2, npwx2
|
||||
COMPLEX(DP), ALLOCATABLE :: aux_d(:,:)
|
||||
REAL(DP), ALLOCATABLE :: hr_d(:,:), sr_d(:,:), vr_d(:,:)
|
||||
REAL(DP), ALLOCATABLE :: en_d(:)
|
||||
#if defined(__CUDA)
|
||||
attributes(DEVICE) :: aux_d, hr_d, sr_d, vr_d, en_d
|
||||
#endif
|
||||
INTEGER :: n_start, n_end, my_n, i, j
|
||||
!
|
||||
EXTERNAL h_psi_gpu, s_psi_gpu
|
||||
! h_psi(npwx,npw,nvec,psi_d,hpsi)
|
||||
! calculates H|psi_d>
|
||||
! s_psi(npwx,npw,nvec,spsi)
|
||||
! calculates S|psi_d> (if needed)
|
||||
! Vectors psi_d,hpsi,spsi are dimensioned (npwx,npol,nvec)
|
||||
|
||||
npw2 = 2 * npw
|
||||
npwx2 = 2 * npwx
|
||||
|
||||
IF ( gstart == -1 ) CALL errore( 'regter', 'gstart variable not initialized', 1 )
|
||||
|
||||
!
|
||||
ALLOCATE( aux_d( npwx, nstart ) )
|
||||
ALLOCATE( hr_d( nstart, nstart ) )
|
||||
ALLOCATE( sr_d( nstart, nstart ) )
|
||||
ALLOCATE( vr_d( nstart, nstart ) )
|
||||
ALLOCATE( en_d( nstart ) )
|
||||
call start_clock('rotwfcg'); !write(*,*) 'start rotwfcg' ; FLUSH(6)
|
||||
!
|
||||
! ... Set up the Hamiltonian and Overlap matrix on the subspace :
|
||||
!
|
||||
! ... H_ij = <psi_i| H |psi_j> S_ij = <psi_i| S |psi_j>
|
||||
!
|
||||
! ... set Im[ psi_d(G=0) ] - needed for numerical stability
|
||||
!
|
||||
IF ( gstart == 2 ) THEN
|
||||
!$cuf kernel do(1)
|
||||
DO i=1,nstart
|
||||
psi_d(1,i) = CMPLX( DBLE( psi_d(1,i) ), 0.D0,kind=DP)
|
||||
END DO
|
||||
END IF
|
||||
!
|
||||
call start_clock('rotwfcg:hpsi'); !write(*,*) 'start rotwfcg:hpsi' ; FLUSH(6)
|
||||
CALL h_psi_gpu( npwx, npw, nstart, psi_d, aux_d )
|
||||
call stop_clock('rotwfcg:hpsi'); !write(*,*) 'stop rotwfcg:hpsi' ; FLUSH(6)
|
||||
!
|
||||
call start_clock('rotwfcg:hc'); !write(*,*) 'start rotwfcg:hc' ; FLUSH(6)
|
||||
hr_d=0.D0
|
||||
CALL divide(inter_bgrp_comm,nstart,n_start,n_end)
|
||||
my_n = n_end - n_start + 1; !write (*,*) nstart,n_start,n_end
|
||||
if (n_start .le. n_end) &
|
||||
CALL cublasDGEMM( 'T','N', nstart, my_n, npw2, 2.D0, psi_d, &
|
||||
npwx2, aux_d(1,n_start), npwx2, 0.D0, hr_d(1,n_start), nstart )
|
||||
IF ( gstart == 2 ) call CGcudaDGER( nstart, my_n, -1.D0, psi_d, &
|
||||
npwx2, aux_d(1,n_start), npwx2, hr_d(1,n_start), nstart )
|
||||
CALL mp_sum( hr_d, inter_bgrp_comm )
|
||||
!
|
||||
CALL mp_sum( hr_d, intra_bgrp_comm )
|
||||
!
|
||||
sr_d=0.D0
|
||||
IF ( overlap ) THEN
|
||||
!
|
||||
CALL s_psi_gpu( npwx, npw, nstart, psi_d, aux_d )
|
||||
!
|
||||
if (n_start .le. n_end) &
|
||||
CALL cublasDGEMM( 'T','N', nstart, my_n, npw2, 2.D0, psi_d, &
|
||||
npwx2, aux_d(1,n_start), npwx2, 0.D0, sr_d(1,n_start), nstart )
|
||||
IF ( gstart == 2 ) CALL CGcudaDGER( nstart, my_n, -1.D0, psi_d, &
|
||||
npwx2, aux_d(1,n_start), npwx2, sr_d(1,n_start), nstart )
|
||||
!
|
||||
ELSE
|
||||
!
|
||||
if (n_start .le. n_end) &
|
||||
CALL cublasDGEMM( 'T','N', nstart, my_n, npw2, 2.D0, psi_d, &
|
||||
npwx2, psi_d(1,n_start), npwx2, 0.D0, sr_d(1,n_start), nstart )
|
||||
IF ( gstart == 2 ) CALL CGcudaDGER( nstart, my_n, -1.D0, psi_d, &
|
||||
npwx2, psi_d(1,n_start), npwx2, sr_d(1,n_start), nstart )
|
||||
!
|
||||
END IF
|
||||
CALL mp_sum( sr_d, inter_bgrp_comm )
|
||||
!
|
||||
CALL mp_sum( sr_d, intra_bgrp_comm )
|
||||
call stop_clock('rotwfcg:hc'); !write(*,*) 'stop rotwfcg:hc' ; FLUSH(6)
|
||||
!
|
||||
! ... Diagonalize
|
||||
!
|
||||
call start_clock('rotwfcg:diag'); !write(*,*) 'start rotwfcg:diag' ; FLUSH(6)
|
||||
CALL diaghg( nstart, nbnd, hr_d, sr_d, nstart, en_d, vr_d, me_bgrp, root_bgrp, intra_bgrp_comm )
|
||||
call stop_clock('rotwfcg:diag'); !write(*,*) 'stop rotwfcg:diag' ; FLUSH(6)
|
||||
call start_clock('rotwfcg:evc_d'); !write(*,*) 'start rotwfcg:evc_d' ; FLUSH(6)
|
||||
!
|
||||
!$cuf kernel do(1)
|
||||
DO i=1, nbnd
|
||||
e_d(i) = en_d(i)
|
||||
END DO
|
||||
!
|
||||
! ... update the basis set
|
||||
!
|
||||
aux_d=(0.D0,0.D0)
|
||||
if (n_start .le. n_end) &
|
||||
CALL cublasDGEMM( 'N','N', npw2, nbnd, my_n, 1.D0, psi_d(1,n_start), &
|
||||
npwx2, vr_d(n_start,1), nstart, 0.D0, aux_d, npwx2 )
|
||||
CALL mp_sum( aux_d, inter_bgrp_comm )
|
||||
!
|
||||
!$cuf kernel do(2)
|
||||
DO i=1, nbnd
|
||||
DO j=1, npwx
|
||||
evc_d(j,i) = aux_d(j,i)
|
||||
END DO
|
||||
END DO
|
||||
call stop_clock('rotwfcg:evc_d'); !write(*,*) 'stop rotwfcg:evc_d' ; FLUSH(6)
|
||||
!
|
||||
DEALLOCATE( en_d )
|
||||
DEALLOCATE( vr_d )
|
||||
DEALLOCATE( sr_d )
|
||||
DEALLOCATE( hr_d )
|
||||
DEALLOCATE( aux_d )
|
||||
call stop_clock('rotwfcg'); !write(*,*) 'stop rotwfcg' ; FLUSH(6)
|
||||
!call print_clock('rotwfcg')
|
||||
!call print_clock('rotwfcg:hpsi')
|
||||
!call print_clock('rotwfcg:hc')
|
||||
!call print_clock('rotwfcg:diag')
|
||||
!call print_clock('rotwfcg:evc_d')
|
||||
!
|
||||
RETURN
|
||||
!
|
||||
END SUBROUTINE rotate_wfc_gamma_gpu
|
||||
|
||||
! In principle this can go away .......
|
||||
SUBROUTINE CGcudaDGER ( M, N, ALPHA, X, INCX, Y, INCY, A, LDA )
|
||||
#if defined(__CUDA)
|
||||
use cudafor
|
||||
use cublas
|
||||
#endif
|
||||
! .. Scalar Arguments ..
|
||||
DOUBLE PRECISION :: ALPHA
|
||||
INTEGER :: INCX, INCY, LDA, M, N
|
||||
! .. Array Arguments ..
|
||||
DOUBLE PRECISION :: A( LDA, * ), X( * ), Y( * )
|
||||
#if defined(__CUDA)
|
||||
attributes(device) :: A, X, Y
|
||||
#endif
|
||||
CALL DGER ( M, N, ALPHA, X, INCX, Y, INCY, A, LDA )
|
||||
|
||||
END SUBROUTINE CGcudaDGER
|
|
@ -0,0 +1,170 @@
|
|||
!
|
||||
! Copyright (C) 2001-2007 Quantum ESPRESSO group
|
||||
! This file is distributed under the terms of the
|
||||
! GNU General Public License. See the file `License'
|
||||
! in the root directory of the present distribution,
|
||||
! or http://www.gnu.org/copyleft/gpl.txt .
|
||||
!
|
||||
!
|
||||
!----------------------------------------------------------------------------
|
||||
SUBROUTINE rotate_wfc_k_gpu( h_psi_gpu, s_psi_gpu, overlap, &
|
||||
npwx, npw, nstart, nbnd, npol, psi_d, evc_d, e_d )
|
||||
!----------------------------------------------------------------------------
|
||||
!
|
||||
! ... Serial version of rotate_wfc for colinear, k-point calculations
|
||||
!
|
||||
USE LAXlib
|
||||
#if defined(__CUDA)
|
||||
USE cudafor
|
||||
USE cublas
|
||||
#endif
|
||||
USE util_param, ONLY : DP
|
||||
USE mp_bands_util, ONLY : intra_bgrp_comm, inter_bgrp_comm, root_bgrp_id,&
|
||||
nbgrp, my_bgrp_id, me_bgrp, root_bgrp
|
||||
USE mp, ONLY : mp_sum
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
!INCLUDE 'laxlib.fh'
|
||||
!
|
||||
! ... I/O variables
|
||||
!
|
||||
INTEGER, INTENT(IN) :: npw, npwx, nstart, nbnd, npol
|
||||
! dimension of the matrix to be diagonalized
|
||||
! leading dimension of matrix psi, as declared in the calling pgm unit
|
||||
! input number of states
|
||||
! output number of states
|
||||
! number of spin polarizations
|
||||
LOGICAL :: overlap
|
||||
! if .FALSE. : S|psi> not needed
|
||||
COMPLEX(DP) :: psi_d(npwx*npol,nstart), evc_d(npwx*npol,nbnd)
|
||||
! input and output eigenvectors (may overlap)
|
||||
REAL(DP) :: e_d(nbnd)
|
||||
! eigenvalues
|
||||
#if defined(__CUDA)
|
||||
attributes(DEVICE) :: psi_d, evc_d, e_d
|
||||
#endif
|
||||
!
|
||||
! ... local variables
|
||||
!
|
||||
INTEGER :: kdim, kdmx
|
||||
COMPLEX(DP), ALLOCATABLE :: aux_d(:,:)
|
||||
COMPLEX(DP), ALLOCATABLE :: hc_d(:,:), sc_d(:,:), vc_d(:,:)
|
||||
REAL(DP), ALLOCATABLE :: en_d(:)
|
||||
#if defined(__CUDA)
|
||||
attributes(DEVICE) :: aux_d, hc_d, sc_d, vc_d, en_d
|
||||
#endif
|
||||
INTEGER :: n_start, n_end, my_n, i, j
|
||||
!
|
||||
EXTERNAL h_psi_gpu, s_psi_gpu
|
||||
! h_psi(npwx,npw,nvec,psi,hpsi)
|
||||
! calculates H|psi>
|
||||
! s_psi(npwx,npw,nvec,spsi)
|
||||
! calculates S|psi> (if needed)
|
||||
! Vectors psi,hpsi,spsi are dimensioned (npwx,npol,nvec)
|
||||
|
||||
IF ( npol == 1 ) THEN
|
||||
!
|
||||
kdim = npw
|
||||
kdmx = npwx
|
||||
!
|
||||
ELSE
|
||||
!
|
||||
kdim = npwx*npol
|
||||
kdmx = npwx*npol
|
||||
!
|
||||
END IF
|
||||
!
|
||||
ALLOCATE( aux_d(kdmx, nstart ) )
|
||||
ALLOCATE( hc_d( nstart, nstart) )
|
||||
ALLOCATE( sc_d( nstart, nstart) )
|
||||
ALLOCATE( vc_d( nstart, nstart) )
|
||||
ALLOCATE( en_d( nstart ) )
|
||||
call start_clock('rotwfck'); !write(*,*) 'start rotwfck';FLUSH(6)
|
||||
!
|
||||
! ... Set up the Hamiltonian and Overlap matrix on the subspace :
|
||||
!
|
||||
! ... H_ij = <psi_i| H |psi_j> S_ij = <psi_i| S |psi_j>
|
||||
!
|
||||
call start_clock('rotwfck:hpsi'); !write(*,*) 'start rotwfck:hpsi';FLUSH(6)
|
||||
CALL h_psi_gpu( npwx, npw, nstart, psi_d, aux_d )
|
||||
call stop_clock('rotwfck:hpsi') ; !write(*,*) 'stop rotwfck:hpsi';FLUSH(6)
|
||||
!
|
||||
call start_clock('rotwfck:hc'); !write(*,*) 'start rotwfck:hc';FLUSH(6)
|
||||
hc_d=(0.D0,0.D0)
|
||||
CALL divide(inter_bgrp_comm,nstart,n_start,n_end)
|
||||
my_n = n_end - n_start + 1; !write (*,*) nstart,n_start,n_end
|
||||
if (n_start .le. n_end) &
|
||||
call ZGEMM( 'C','N', nstart, my_n, kdim, (1.D0,0.D0), psi_d, &
|
||||
kdmx, aux_d(1,n_start), kdmx, (0.D0,0.D0), hc_d(1,n_start), nstart )
|
||||
CALL mp_sum( hc_d, inter_bgrp_comm )
|
||||
!
|
||||
CALL mp_sum( hc_d, intra_bgrp_comm )
|
||||
!
|
||||
sc_d=(0.D0,0.D0)
|
||||
IF ( overlap ) THEN
|
||||
!
|
||||
CALL s_psi_gpu( npwx, npw, nstart, psi_d, aux_d )
|
||||
if (n_start .le. n_end) &
|
||||
CALL ZGEMM( 'C','N', nstart, my_n, kdim, (1.D0,0.D0), psi_d, &
|
||||
kdmx, aux_d(1,n_start), kdmx, (0.D0,0.D0), sc_d(1,n_start), nstart )
|
||||
!
|
||||
ELSE
|
||||
!
|
||||
if (n_start .le. n_end) &
|
||||
CALL ZGEMM( 'C','N', nstart, my_n, kdim, (1.D0,0.D0), psi_d, &
|
||||
kdmx, psi_d(1,n_start), kdmx, (0.D0,0.D0), sc_d(1,n_start), nstart )
|
||||
!
|
||||
END IF
|
||||
CALL mp_sum( sc_d, inter_bgrp_comm )
|
||||
!
|
||||
CALL mp_sum( sc_d, intra_bgrp_comm )
|
||||
call stop_clock('rotwfck:hc'); !write(*,*) 'stop rotwfck:hc';FLUSH(6)
|
||||
!
|
||||
! ... Diagonalize
|
||||
!
|
||||
call start_clock('rotwfck:diag'); !write(*,*) 'start rotwfck:diag';FLUSH(6)
|
||||
!! cdiaghg on GPU. See interface from LAXlib module
|
||||
CALL diaghg( nstart, nbnd, hc_d, sc_d, nstart, en_d, vc_d, me_bgrp, root_bgrp, intra_bgrp_comm )
|
||||
call stop_clock('rotwfck:diag'); !write(*,*) 'stop rotwfck:diag';FLUSH(6)
|
||||
call start_clock('rotwfck:evc'); !write(*,*) 'start rotwfck:evc';FLUSH(6)
|
||||
!
|
||||
!$cuf kernel do(1) <<<*,*>>>
|
||||
DO i=1,nbnd
|
||||
e_d(i) = en_d(i)
|
||||
END DO
|
||||
!
|
||||
! ... update the basis set
|
||||
!
|
||||
aux_d=(0.D0,0.D0)
|
||||
if (n_start .le. n_end) &
|
||||
CALL ZGEMM( 'N','N', kdim, nbnd, my_n, (1.D0,0.D0), psi_d(1,n_start), &
|
||||
kdmx, vc_d(n_start,1), nstart, (0.D0,0.D0), aux_d, kdmx )
|
||||
CALL mp_sum( aux_d, inter_bgrp_comm )
|
||||
!
|
||||
!
|
||||
!$cuf kernel do(1) <<<*,*>>>
|
||||
DO i=1, nbnd
|
||||
DO j=1, kdmx
|
||||
evc_d(j,i) = aux_d(j,i)
|
||||
END DO
|
||||
END DO
|
||||
!
|
||||
call stop_clock('rotwfck:evc') ! ; write(*,*) 'stop rotwfck;evc';FLUSH(6)
|
||||
!
|
||||
DEALLOCATE( en_d )
|
||||
DEALLOCATE( vc_d )
|
||||
DEALLOCATE( sc_d )
|
||||
DEALLOCATE( hc_d )
|
||||
DEALLOCATE( aux_d )
|
||||
call stop_clock('rotwfck'); !write(*,*) 'stop rotwfck';FLUSH(6)
|
||||
!call print_clock('rotwfck')
|
||||
!call print_clock('rotwfck:hpsi')
|
||||
!call print_clock('rotwfck:hc')
|
||||
!call print_clock('rotwfck:diag')
|
||||
!call print_clock('rotwfck:evc')
|
||||
!
|
||||
RETURN
|
||||
!
|
||||
END SUBROUTINE rotate_wfc_k_gpu
|
||||
!
|
|
@ -9,6 +9,8 @@ DAVID = \
|
|||
cegterg.o \
|
||||
regterg.o
|
||||
|
||||
-include make.gpu
|
||||
|
||||
all : libdavid.a
|
||||
|
||||
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,13 @@
|
|||
# Makefile for DAVID GPU
|
||||
|
||||
DAVID += \
|
||||
cegterg_gpu.o \
|
||||
regterg_gpu.o
|
||||
|
||||
ifdef PGI_POWER_WORKAROUND
|
||||
# Circumvents bug in the Power implementation of PGI compilers
|
||||
F90FLAGS:= -O1 $(filter-out -fast,$(F90FLAGS))
|
||||
endif
|
||||
|
||||
cegterg_gpu.o : ../../UtilXlib/util_param.o
|
||||
regterg_gpu.o : ../../UtilXlib/util_param.o
|
|
@ -22,7 +22,6 @@ SUBROUTINE regterg( h_psi, s_psi, uspp, g_psi, &
|
|||
! ... where H is an hermitean operator, e is a real scalar,
|
||||
! ... S is an uspp matrix, evc is a complex vector
|
||||
! ... (real wavefunctions with only half plane waves stored)
|
||||
|
||||
USE util_param, ONLY : DP, stdout
|
||||
USE mp_bands_util, ONLY : intra_bgrp_comm, inter_bgrp_comm, root_bgrp_id, &
|
||||
nbgrp, my_bgrp_id, me_bgrp, root_bgrp
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -32,7 +32,10 @@ DENSE/rotate_HSpsi_gamma.o \
|
|||
DENSE/rotate_HSpsi_k.o \
|
||||
DENSE/rotate_wfc_gamma.o \
|
||||
DENSE/rotate_wfc_k.o
|
||||
# add here other objects, e.g. NewSolver/*.o
|
||||
|
||||
-include make.gpu
|
||||
|
||||
# add here other objects, e.g. ParO/*.o NewSolver/*.o
|
||||
|
||||
libks_solvers.a: $(ALLOBJS)
|
||||
$(AR) $(ARFLAGS) $@ $?
|
||||
|
|
|
@ -6,8 +6,11 @@ include ../../make.inc
|
|||
MODFLAGS= $(MOD_FLAG) ../../ELPA/src $(MOD_FLAG) ../../LAXlib $(MOD_FLAG) ../../UtilXlib $(MOD_FLAG).
|
||||
|
||||
PPCG = \
|
||||
generic_cublas.o \
|
||||
ppcg_gamma.o \
|
||||
ppcg_k.o
|
||||
ppcg_gamma_gpu.o \
|
||||
ppcg_k.o \
|
||||
ppcg_k_gpu.o \
|
||||
|
||||
all : libppcg.a
|
||||
|
||||
|
|
|
@ -0,0 +1,221 @@
|
|||
subroutine gpu_DGEMM (transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc)
|
||||
#if defined(__CUDA)
|
||||
USE cublas
|
||||
#endif
|
||||
implicit none
|
||||
character*1 transa, transb
|
||||
integer :: m, n, k, lda, ldb, ldc
|
||||
DOUBLE PRECISION :: alpha, beta
|
||||
DOUBLE PRECISION, dimension(lda, *) :: A
|
||||
DOUBLE PRECISION, dimension(ldb, *) :: B
|
||||
DOUBLE PRECISION, dimension(ldc, *) :: C
|
||||
#if defined(__CUDA)
|
||||
attributes(device) :: A, B, C
|
||||
call cublasDGEMM(transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc)
|
||||
#endif
|
||||
return
|
||||
end subroutine gpu_DGEMM
|
||||
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
||||
subroutine gpu_ZGEMM (transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc)
|
||||
#if defined(__CUDA)
|
||||
USE cublas
|
||||
#endif
|
||||
implicit none
|
||||
character*1 transa, transb
|
||||
integer :: m, n, k, lda, ldb, ldc
|
||||
DOUBLE COMPLEX :: alpha, beta
|
||||
DOUBLE COMPLEX, dimension(lda, *) :: A
|
||||
DOUBLE COMPLEX, dimension(ldb, *) :: B
|
||||
DOUBLE COMPLEX, dimension(ldc, *) :: C
|
||||
#if defined(__CUDA)
|
||||
attributes(device) :: A, B, C
|
||||
call cublasZGEMM(transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc)
|
||||
#endif
|
||||
return
|
||||
end subroutine gpu_ZGEMM
|
||||
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
||||
subroutine gpu_DGER (m, n, alpha, x, incx, y, incy, a, lda)
|
||||
#if defined(__CUDA)
|
||||
USE cublas
|
||||
#endif
|
||||
implicit none
|
||||
integer :: m, n, lda, incx, incy
|
||||
DOUBLE PRECISION :: alpha
|
||||
DOUBLE PRECISION, dimension(lda, *) :: A
|
||||
DOUBLE PRECISION, dimension(*) :: x, y
|
||||
#if defined(__CUDA)
|
||||
attributes(device) :: A, x, y
|
||||
call cublasDGER(m, n, alpha, x, incx, y, incy, a, lda)
|
||||
#endif
|
||||
return
|
||||
end subroutine gpu_DGER
|
||||
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
||||
function gpu_DDOT (n, dx, incx, dy, incy)
|
||||
#if defined(__CUDA)
|
||||
USE cublas
|
||||
#endif
|
||||
implicit none
|
||||
DOUBLE PRECISION :: gpu_DDOT
|
||||
integer :: n, incx, incy
|
||||
DOUBLE PRECISION, dimension(*) :: dx, dy
|
||||
#if defined(__CUDA)
|
||||
attributes(device) :: dx, dy
|
||||
gpu_DDOT=cublasDDOT(n, dx, incx, dy, incy)
|
||||
#endif
|
||||
return
|
||||
end function gpu_DDOT
|
||||
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
||||
subroutine gpu_DTRSM(side, uplo, transa, diag, m, n, alpha, a, lda, b, ldb)
|
||||
#if defined(__CUDA)
|
||||
USE cublas
|
||||
#endif
|
||||
implicit none
|
||||
character*1 :: side, uplo, transa, diag
|
||||
integer :: m, n, lda, ldb
|
||||
DOUBLE PRECISION :: alpha
|
||||
DOUBLE PRECISION, dimension(lda, *) :: a
|
||||
DOUBLE PRECISION, dimension(ldb, *) :: b
|
||||
#if defined(__CUDA)
|
||||
attributes(device) :: a, b
|
||||
call cublasDTRSM(side, uplo, transa, diag, m, n, alpha, a, lda, b, ldb)
|
||||
#endif
|
||||
return
|
||||
end subroutine gpu_DTRSM
|
||||
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
||||
SUBROUTINE gpu_threaded_memset(array, val, length)
|
||||
!
|
||||
#if defined(__CUDA)
|
||||
USE cudafor
|
||||
#endif
|
||||
USE util_param, ONLY : DP
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
INTEGER, INTENT(IN) :: length
|
||||
REAL(DP), INTENT(OUT) :: array(length)
|
||||
#if defined(__CUDA)
|
||||
attributes(device) :: array
|
||||
#endif
|
||||
REAL(DP), INTENT(IN) :: val
|
||||
!
|
||||
INTEGER :: i
|
||||
!
|
||||
IF (length<=0) RETURN
|
||||
!
|
||||
!$cuf kernel do(1)
|
||||
DO i=1, length
|
||||
array(i) = val
|
||||
ENDDO
|
||||
!
|
||||
END SUBROUTINE gpu_threaded_memset
|
||||
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
||||
SUBROUTINE gpu_threaded_assign(array_out, array_in, kdimx, nact, use_idx, idx, bgrp_root_only)
|
||||
! assign (copy) a complex array in a threaded way
|
||||
! array_out( 1:kdimx, 1:nact ) = array_in( 1:kdimx, 1:nact ) or
|
||||
! array_out( 1:kdimx, 1:nact ) = array_in( 1:kdimx, idx(1:nact) )
|
||||
! if the index array idx is given
|
||||
! if bgrp_root_only is present and .true. the assignement is made only by the
|
||||
! MPI root process of the bgrp and array_out is zeroed otherwise
|
||||
#if defined(__CUDA)
|
||||
USE cudafor
|
||||
#endif
|
||||
USE util_param, ONLY : DP
|
||||
USE mp_bands_util, ONLY : root_bgrp_id, nbgrp, my_bgrp_id
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
INTEGER, INTENT(IN) :: kdimx, nact
|
||||
COMPLEX(DP), INTENT(OUT) :: array_out( kdimx, nact )
|
||||
COMPLEX(DP), INTENT(IN) :: array_in ( kdimx, * )
|
||||
INTEGER, INTENT(IN) :: idx( * )
|
||||
#if defined(__CUDA)
|
||||
attributes(device) :: array_out, array_in, idx
|
||||
#endif
|
||||
LOGICAL, INTENT(IN) :: bgrp_root_only
|
||||
LOGICAL, INTENT(IN) :: use_idx
|
||||
!
|
||||
INTEGER, PARAMETER :: blocksz = 256
|
||||
INTEGER :: nblock
|
||||
|
||||
INTEGER :: i, j
|
||||
!
|
||||
IF (kdimx <=0 .OR. nact<= 0) RETURN
|
||||
!
|
||||
IF (bgrp_root_only .AND. ( my_bgrp_id /= root_bgrp_id ) ) THEN
|
||||
call threaded_memset( array_out, 0.d0, 2*kdimx*nact )
|
||||
RETURN
|
||||
END IF
|
||||
|
||||
nblock = (kdimx - 1)/blocksz + 1
|
||||
|
||||
IF (use_idx ) THEN
|
||||
!$cuf kernel do(2)
|
||||
DO i=1, nact
|
||||
DO j=1,nblock
|
||||
array_out(1+(j-1)*blocksz:MIN(j*blocksz,kdimx), i ) = array_in(1+(j-1)*blocksz:MIN(j*blocksz,kdimx), idx( i ) )
|
||||
ENDDO
|
||||
ENDDO
|
||||
ELSE
|
||||
!$cuf kernel do(2)
|
||||
DO i=1, nact
|
||||
DO j=1,nblock
|
||||
array_out(1+(j-1)*blocksz:MIN(j*blocksz,kdimx), i ) = array_in(1+(j-1)*blocksz:MIN(j*blocksz,kdimx), i )
|
||||
ENDDO
|
||||
ENDDO
|
||||
END IF
|
||||
!
|
||||
END SUBROUTINE gpu_threaded_assign
|
||||
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
||||
SUBROUTINE gpu_threaded_backassign(array_out, idx, array_in, kdimx, nact, use_a2, a2_in )
|
||||
! assign (copy) a complex array in a threaded way
|
||||
! array_out( 1:kdimx, idx(1:nact) ) = array_in( 1:kdimx, 1:nact ) or
|
||||
! array_out( 1:kdimx, idx(1:nact) ) = array_in( 1:kdimx, 1:nact ) + a2_in( 1:kdimx, idx(1:nact) (
|
||||
! if a2_in is present
|
||||
! the index array idx is mandatory otherwise one could use previous routine)
|
||||
#if defined(__CUDA)
|
||||
USE cudafor
|
||||
#endif
|
||||
USE util_param, ONLY : DP
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
INTEGER, INTENT(IN) :: kdimx, nact
|
||||
COMPLEX(DP), INTENT(INOUT) :: array_out( kdimx, * ) ! we don't want to mess with un referenced columns
|
||||
COMPLEX(DP), INTENT(IN) :: array_in ( kdimx, nact )
|
||||
COMPLEX(DP), INTENT(IN) :: a2_in ( kdimx, * )
|
||||
INTEGER, INTENT(IN) :: idx( * )
|
||||
#if defined(__CUDA)
|
||||
attributes(device) :: array_out, array_in, a2_in, idx
|
||||
#endif
|
||||
LOGICAL, INTENT(IN) :: use_a2
|
||||
!
|
||||
INTEGER, PARAMETER :: blocksz = 256
|
||||
INTEGER :: nblock
|
||||
|
||||
INTEGER :: i, j
|
||||
!
|
||||
IF (kdimx <=0 .OR. nact<= 0) RETURN
|
||||
!
|
||||
|
||||
nblock = (kdimx - 1)/blocksz + 1
|
||||
|
||||
IF ( use_a2) THEN
|
||||
!$cuf kernel do(2)
|
||||
DO i=1, nact
|
||||
DO j=1,nblock
|
||||
array_out(1+(j-1)*blocksz:MIN(j*blocksz,kdimx), idx( i ) ) = &
|
||||
array_in(1+(j-1)*blocksz:MIN(j*blocksz,kdimx), i ) + &
|
||||
a2_in(1+(j-1)*blocksz:MIN(j*blocksz,kdimx), idx( i ) )
|
||||
ENDDO
|
||||
ENDDO
|
||||
ELSE
|
||||
!$cuf kernel do(2)
|
||||
DO i=1, nact
|
||||
DO j=1,nblock
|
||||
array_out(1+(j-1)*blocksz:MIN(j*blocksz,kdimx), idx( i ) ) = array_in(1+(j-1)*blocksz:MIN(j*blocksz,kdimx), i )
|
||||
ENDDO
|
||||
ENDDO
|
||||
END IF
|
||||
!
|
||||
END SUBROUTINE gpu_threaded_backassign
|
||||
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
|
@ -1,6 +1,14 @@
|
|||
generic_cublas.o : ../../UtilXlib/mp_bands_util.o
|
||||
generic_cublas.o : ../../UtilXlib/util_param.o
|
||||
ppcg_gamma.o : ../../UtilXlib/mp.o
|
||||
ppcg_gamma.o : ../../UtilXlib/mp_bands_util.o
|
||||
ppcg_gamma.o : ../../UtilXlib/util_param.o
|
||||
ppcg_gamma_gpu.o : ../../UtilXlib/mp.o
|
||||
ppcg_gamma_gpu.o : ../../UtilXlib/mp_bands_util.o
|
||||
ppcg_gamma_gpu.o : ../../UtilXlib/util_param.o
|
||||
ppcg_k.o : ../../UtilXlib/mp.o
|
||||
ppcg_k.o : ../../UtilXlib/mp_bands_util.o
|
||||
ppcg_k.o : ../../UtilXlib/util_param.o
|
||||
ppcg_k_gpu.o : ../../UtilXlib/mp.o
|
||||
ppcg_k_gpu.o : ../../UtilXlib/mp_bands_util.o
|
||||
ppcg_k_gpu.o : ../../UtilXlib/util_param.o
|
||||
|
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -13,6 +13,8 @@ pcg_k.o \
|
|||
bpcg_gamma.o \
|
||||
bpcg_k.o
|
||||
|
||||
-include make.gpu
|
||||
|
||||
all : libparo.a
|
||||
|
||||
libparo.a: $(PARO)
|
||||
|
|
|
@ -0,0 +1,404 @@
|
|||
! Copyright (C) 2015-2016 Aihui Zhou's group
|
||||
!
|
||||
! This file is distributed under the terms of the
|
||||
! GNU General Public License. See the file `License'
|
||||
! in the root directory of the present distribution,
|
||||
! or http://www.gnu.org/copyleft/gpl.txt .
|
||||
!
|
||||
!-------------------------------------------------------------------------------
|
||||
!
|
||||
! We propose some parallel orbital updating based plane wave basis methods
|
||||
! for electronic structure calculations, which aims to the solution of the corresponding eigenvalue
|
||||
! problems. Compared to the traditional plane wave methods, our methods have the feature of two level
|
||||
! parallelization, which make them have great advantage in large-scale parallelization.
|
||||
!
|
||||
! The approach following Algorithm is the parallel orbital updating algorithm:
|
||||
! 1. Choose initial $E_{\mathrm{cut}}^{(0)}$ and then obtain $V_{N_G^{0}}$, use the SCF method to solve
|
||||
! the Kohn-Sham equation in $V_{G_0}$ and get the initial $(\lambda_i^{0},u_i^{0}), i=1, \cdots, N$
|
||||
! and let $n=0$.
|
||||
! 2. For $i=1,2,\ldots,N$, find $e_i^{n+1/2}\in V_{G_n}$ satisfying
|
||||
! $$a(\rho_{in}^{n}; e_i^{n+1/2}, v) = -[(a(\rho_{in}^{n}; u_i^{n}, v) - \lambda_i^{n} (u_i^{n}, v))] $$
|
||||
! in parallel , where $\rho_{in}^{n}$ is the input charge density obtained by the orbits obtained in the
|
||||
! $n$-th iteration or the former iterations.
|
||||
! 3. Find $\{\lambda_i^{n+1},u_i^{n+1}\} \in \mathbf{R}\times \tilde{V}_N$ satisfying
|
||||
! $$a(\tilde{\rho}; u_i^{n+1}, v) = ( \lambda_i^{n+1}u_i^{n+1}, v) \quad \forall v \in \tilde{V}_N$$
|
||||
! where $\tilde{V}_N = \mathrm{span}\{e_1^{n+1/2},\ldots,e_N^{n+1/2},u_1^{n},\ldots,u_N^{n}\}$,
|
||||
! $\tilde{\rho}(x)$ is the input charge density obtained from the previous orbits.
|
||||
! 4. Convergence check: if not converged, set $n=n+1$, go to step 2; else, stop.
|
||||
!
|
||||
! You can see the detailed information through
|
||||
! X. Dai, X. Gong, A. Zhou, J. Zhu,
|
||||
! A parallel orbital-updating approach for electronic structure calculations, arXiv:1405.0260 (2014).
|
||||
! X. Dai, Z. Liu, X. Zhang, A. Zhou,
|
||||
! A Parallel Orbital-updating Based Optimization Method for Electronic Structure Calculations,
|
||||
! arXiv:1510.07230 (2015).
|
||||
! Yan Pan, Xiaoying Dai, Xingao Gong, Stefano de Gironcoli, Gian-Marco Rignanese, and Aihui Zhou,
|
||||
! A Parallel Orbital-updating Based Plane Wave Basis Method. J. Comp. Phys. 348, 482-492 (2017).
|
||||
!
|
||||
! The file is written mainly by Stefano de Gironcoli and Yan Pan.
|
||||
!
|
||||
! The following file is for solving step 2 of the parallel orbital updating algorithm.
|
||||
!
|
||||
! Ivan Carnimeo: GPU version
|
||||
!
|
||||
#define ZERO ( 0.D0, 0.D0 )
|
||||
#define ONE ( 1.D0, 0.D0 )
|
||||
!
|
||||
!----------------------------------------------------------------------------
|
||||
SUBROUTINE bpcg_gamma_gpu( hs_psi_gpu, g_1psi_gpu, psi0_d, spsi0_d, npw, npwx, nbnd, nvec, psi_d, hpsi_d, spsi_d, ethr, e_d, nhpsi )
|
||||
!----------------------------------------------------------------------------
|
||||
!
|
||||
! Block Preconditioned Conjugate Gradient solution of the linear system
|
||||
!
|
||||
! [ H - e S ]|\tilde\psi> = Pc [ e S - H ] |psi>
|
||||
!
|
||||
! the search targets the space orthogonal to the current best wfcs (psi0);
|
||||
! the solution is sought until the residual norm is a fixed fraction of the RHS norm
|
||||
! in this way the more accurate is the original problem the more accuratly the correction is computed
|
||||
!
|
||||
! in order to avoid un-necessary HSpsi evaluations this version assumes psi,hpsi and spsi are all
|
||||
! provided in input and return their estimate for further use
|
||||
!
|
||||
! Ivan Carnimeo: GPU version
|
||||
!
|
||||
#if defined (__CUDA)
|
||||
USE cudafor
|
||||
#endif
|
||||
USE util_param, ONLY : DP, stdout
|
||||
USE mp_bands_util, ONLY : intra_bgrp_comm, gstart
|
||||
USE mp, ONLY : mp_sum
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
! Following varibales are temporary
|
||||
COMPLEX(DP), INTENT(IN) :: psi0_d(npwx,nbnd) ! psi0 needed to compute the Pv projection
|
||||
COMPLEX(DP), INTENT(IN) :: spsi0_d(npwx,nbnd) ! Spsi0 needed to compute the Pv projection
|
||||
!
|
||||
INTEGER, INTENT(IN) :: npw, npwx, nbnd, nvec ! input dimensions
|
||||
REAL(DP), INTENT(IN) :: ethr ! threshold for convergence.
|
||||
REAL(DP), INTENT(INOUT) :: e_d(nvec) ! current estimate of the target eigenvalues
|
||||
COMPLEX(DP),INTENT(INOUT) :: psi_d(npwx,nvec),hpsi_d(npwx,nvec),spsi_d(npwx,nvec) !
|
||||
! input: the current estimate of the wfcs
|
||||
! output: the estimated correction vectors
|
||||
INTEGER, INTENT(INOUT) :: nhpsi ! (updated) number of Hpsi evaluations
|
||||
!
|
||||
! ... LOCAL variables
|
||||
!
|
||||
INTEGER, PARAMETER :: maxter = 5 ! maximum number of CG iterations
|
||||
!
|
||||
REAL(DP), ALLOCATABLE :: g0(:), g1(:), g2(:), gamma(:), ethr_cg(:), ff(:), ff0(:)
|
||||
INTEGER, ALLOCATABLE :: cg_iter(:)
|
||||
REAL(DP) :: beta, ee
|
||||
INTEGER :: npw2, npwx2, i, l, block_size, done, nactive, nnew, newdone
|
||||
!
|
||||
! ... DEVICE variables
|
||||
!
|
||||
EXTERNAL g_1psi_gpu, hs_psi_gpu
|
||||
REAL(DP), EXTERNAL :: gpu_DDOT
|
||||
COMPLEX(DP), ALLOCATABLE :: b_d(:,:), & ! RHS for testing
|
||||
p_d(:,:), hp_d(:,:), sp_d(:,:), z_d(:,:) ! additional working vetors
|
||||
REAL(DP), ALLOCATABLE :: spsi0vec_d (:,:) ! the product of spsi0 and a group of vectors
|
||||
COMPLEX(DP) :: tmp, tmp_d
|
||||
INTEGER :: ii, jj
|
||||
REAL(DP), ALLOCATABLE :: alpha(:)
|
||||
#if defined (__CUDA)
|
||||
attributes(device) :: psi_d, spsi_d, hpsi_d
|
||||
attributes(device) :: psi0_d, spsi0_d, spsi0vec_d
|
||||
attributes(device) :: e_d, b_d, z_d, p_d, hp_d, sp_d
|
||||
attributes(device) :: tmp_d
|
||||
#endif
|
||||
!
|
||||
!
|
||||
CALL start_clock( 'pcg' ); !write (6,*) ' enter pcg' , e(1:2) ; FLUSH(6)
|
||||
!
|
||||
npw2 = 2*npw
|
||||
npwx2 = 2*npwx
|
||||
block_size = min(nvec,64)
|
||||
!
|
||||
ALLOCATE( g0( block_size ), g1( block_size ), g2( block_size ), gamma( block_size ) )
|
||||
ALLOCATE( ethr_cg( block_size ), ff( block_size ), ff0( block_size ), cg_iter( block_size ) )
|
||||
ALLOCATE( z_d( npwx, block_size ), b_d( npwx, block_size ) )
|
||||
ALLOCATE( p_d(npwx,block_size), hp_d(npwx,block_size), sp_d(npwx,block_size) )
|
||||
ALLOCATE( spsi0vec_d(nbnd, block_size) )
|
||||
ALLOCATE( alpha( block_size ) )
|
||||
!
|
||||
!
|
||||
done = 0 ! the number of correction vectors already solved
|
||||
nactive = 0 ! the number of correction vectors currently being updated
|
||||
cg_iter = 0 ! how many iteration each active vector has completed (<= maxter)
|
||||
|
||||
MAIN_LOOP: & ! This is a continuous loop. It terminates only when nactive vanishes
|
||||
DO
|
||||
nnew = min(done+block_size,nvec)-(done+nactive) ! number of new corrections to be added to the seach
|
||||
if ( nnew > 0 ) then ! add nnew vectors to the active list
|
||||
!write(6,*) ' nnew =', nnew
|
||||
do l=nactive+1,nactive+nnew
|
||||
i=l+done
|
||||
!write(6,*) ' l =',l,' i =',i
|
||||
!write (6,*) ' enter pcg' , e(i) ; FLUSH(6)
|
||||
!$cuf kernel do(1)
|
||||
DO ii = 1, npwx
|
||||
b_d(ii,l) = e_d(i) * spsi_d(ii,i) - hpsi_d(ii,i) ! initial gradient and saved RHS for later
|
||||
END DO
|
||||
!$cuf kernel do(1)
|
||||
DO ii = 1, npwx
|
||||
z_d(ii,l) = b_d(ii,l)
|
||||
END DO
|
||||
call g_1psi_gpu(npwx,npw,z_d(:,l),e_d(i)) ! initial preconditioned gradient
|
||||
end do
|
||||
!- project on conduction bands
|
||||
CALL start_clock( 'pcg:ortho' )
|
||||
CALL gpu_DGEMM( 'T','N', nbnd,nnew,npw2, 2.D0, spsi0_d, npwx2, z_d(:,nactive+1), npwx2, 0.D0, spsi0vec_d, nbnd )
|
||||
IF ( gstart == 2 ) CALL gpu_DGER( nbnd, nnew, -1.D0, spsi0_d, npwx2, z_d(:,nactive+1), npwx2, spsi0vec_d, nbnd )
|
||||
CALL mp_sum( spsi0vec_d, intra_bgrp_comm )
|
||||
CALL gpu_DGEMM( 'N','N', npw2,nnew,nbnd,-1.D0, psi0_d, npwx2, spsi0vec_d, nbnd, 1.D0, z_d(:,nactive+1), npwx2 )
|
||||
CALL stop_clock( 'pcg:ortho' )
|
||||
!-
|
||||
do l=nactive+1,nactive+nnew; i=l+done
|
||||
g0(l) = 2.D0*gpu_DDOT(npw2,z_d(:,l),1,b_d(:,l),1)
|
||||
IF (gstart==2) g0(l)=g0(l) - gpu_DDOT(2,z_d(1,l),1,b_d(1,l),1)
|
||||
end do
|
||||
CALL mp_sum( g0(nactive+1:nactive+nnew), intra_bgrp_comm ) ! g0 = < initial z | initial gradient b >
|
||||
|
||||
do l=nactive+1,nactive+nnew; i=l+done
|
||||
!write(6,*) ' l =',l,' i =',i
|
||||
ff(l) = 0.d0 ; ff0(l) = ff(l)
|
||||
!write (6,*) 0, g0(l), ff(l)
|
||||
|
||||
! ethr_cg = ethr ! CG convergence threshold could be set from input but it is not ...
|
||||
ethr_cg(l) = 1.0D-2 ! it makes more sense to fix the convergence of the CG solution to a
|
||||
! fixed function of the RHS (see ethr_cg update later).
|
||||
ethr_cg(l) = max ( 0.01*ethr, ethr_cg(l) * g0(l) ) ! here we set the convergence of the correction
|
||||
!write(6,*) 'ethr_cg :', ethr_cg(l)
|
||||
|
||||
! zero the trial solution
|
||||
psi_d(:,i) = ZERO
|
||||
hpsi_d(:,i) = ZERO
|
||||
spsi_d(:,i) = ZERO
|
||||
|
||||
! initial search direction
|
||||
!$cuf kernel do(1)
|
||||
DO ii = 1, npwx
|
||||
p_d(ii,l) = z_d(ii,l)
|
||||
END DO
|
||||
cg_iter(l) = 0 ! this is a new correction vector, reset its interation count
|
||||
end do
|
||||
nactive = nactive + nnew
|
||||
end if
|
||||
!write(6,*) ' done =',done, ' nactive =',nactive
|
||||
|
||||
! iterate: ! DO cg_iter = 1, maxter ! THIS IS THE ENTRY POINT OF THE PCG LOOP
|
||||
|
||||
if ( nactive == 0 ) EXIT MAIN_LOOP ! this is the only MAIN_LOOP EXIT condition
|
||||
|
||||
cg_iter(1:nactive) = cg_iter(1:nactive) + 1 ! update interation counters
|
||||
|
||||
CALL start_clock( 'pcg:hs_1psi' )
|
||||
! do l = 1, nactive ! THIS COULD/SHOULD BE A GLOBAL CALL (ONLY WITHIN ONE BGRP THOUGH)
|
||||
! CALL hs_1psi( npwx, npw, p(:,l), hp(:,l), sp(:,l) ) ! apply H to a single wavefunction (no bgrp parallelization here!)
|
||||
! end do
|
||||
CALL hs_psi_gpu( npwx, npw, nactive, p_d, hp_d, sp_d ) ! apply H to a single wavefunction (no bgrp parallelization here!)
|
||||
CALL stop_clock( 'pcg:hs_1psi' )
|
||||
|
||||
do l = 1, nactive; i=l+done
|
||||
gamma(l) = 2.D0*gpu_DDOT(npw2,p_d(:,l),1,hp_d(:,l),1) - e_d(i) * 2.D0*gpu_DDOT(npw2,p_d(:,l),1,sp_d(:,l),1)
|
||||
IF (gstart==2) gamma(l) = gamma(l) - gpu_DDOT(2,p_d(1,l),1,hp_d(1,l),1) + e_d(i) * gpu_DDOT(2,p_d(1,l),1,sp_d(1,l),1)
|
||||
end do
|
||||
CALL mp_sum( gamma(1:nactive), intra_bgrp_comm ) ! gamma = < p | hp - e sp >
|
||||
|
||||
do l = 1, nactive; i=l+done
|
||||
!write(6,*) ' l =',l,' i =',i
|
||||
|
||||
alpha(l) = g0(l)/gamma(l)
|
||||
!write(6,*) 'g0, gamma, alpha :', g0(l), gamma(l), alpha_d(l)
|
||||
tmp = alpha(l)
|
||||
|
||||
!$cuf kernel do(1)
|
||||
DO ii = 1, npwx
|
||||
psi_d(ii,i) = psi_d(ii,i) + tmp * p_d(ii,l) ! updated solution
|
||||
hpsi_d(ii,i) = hpsi_d(ii,i) + tmp * hp_d(ii,l) ! updated solution
|
||||
spsi_d(ii,i) = spsi_d(ii,i) + tmp * sp_d(ii,l) ! updated solution
|
||||
END DO
|
||||
|
||||
g2(l) = 2.D0 * ( gpu_DDOT(npw2,z_d(:,l),1,b_d(:,l),1) + &
|
||||
e_d(i) * gpu_DDOT(npw2,z_d(:,l),1,spsi_d(:,i),1) - &
|
||||
gpu_DDOT(npw2,z_d(:,l),1,hpsi_d(:,i),1) )
|
||||
IF (gstart==2) g2(l) = g2(l) - gpu_DDOT(2,z_d(1,l),1,b_d(1,l),1) - &
|
||||
e_d(i)*gpu_DDOT(2,z_d(1,l),1,spsi_d(1,i),1) + &
|
||||
gpu_DDOT(2,z_d(1,l),1,hpsi_d(1,i),1)
|
||||
end do
|
||||
CALL mp_sum( g2(1:nactive), intra_bgrp_comm ) ! g2 = < old z | new gradient b + e spsi - hpsi >
|
||||
|
||||
do l = 1, nactive; i=l+done ! update the preconditioned gradient
|
||||
!$cuf kernel do(1)
|
||||
DO ii = 1, npwx
|
||||
z_d(ii,l) = b_d(ii,l) + e_d(i) * spsi_d(ii,i) - hpsi_d(ii,i)
|
||||
END DO
|
||||
call g_1psi_gpu(npwx,npw,z_d(:,l),e_d(i))
|
||||
end do
|
||||
!- project on conduction bands
|
||||
CALL start_clock( 'pcg:ortho' )
|
||||
CALL gpu_DGEMM( 'T','N', nbnd,nactive,npw2, 2.D0, spsi0_d, npwx2, z_d, npwx2, 0.D0, spsi0vec_d, nbnd )
|
||||
IF ( gstart == 2 ) CALL gpu_DGER( nbnd, nactive, -1.D0, spsi0_d, npwx2, z_d, npwx2, spsi0vec_d, nbnd )
|
||||
CALL mp_sum( spsi0vec_d, intra_bgrp_comm )
|
||||
CALL gpu_DGEMM( 'N','N', npw2,nactive,nbnd,-1.D0, psi0_d, npwx2, spsi0vec_d, nbnd, 1.D0, z_d, npwx2 )
|
||||
CALL stop_clock( 'pcg:ortho' )
|
||||
!-
|
||||
do l = 1, nactive; i=l+done
|
||||
g1(l) = 2.D0 * ( gpu_DDOT(npw2,z_d(:,l),1,b_d(:,l),1) + &
|
||||
e_d(i) * gpu_DDOT(npw2,z_d(:,l),1,spsi_d(:,i),1) - &
|
||||
gpu_DDOT(npw2,z_d(:,l),1,hpsi_d(:,i),1) )
|
||||
IF (gstart==2) THEN
|
||||
!$cuf kernel do(1)
|
||||
do ii = 1, 1
|
||||
tmp_d = b_d(1,l) + e_d(i)*spsi_d(1,i) - hpsi_d(1,i)
|
||||
end do
|
||||
g1(l) = g1(l) - gpu_DDOT(2,z_d(1,l),1,tmp_d,1)
|
||||
END IF
|
||||
end do
|
||||
CALL mp_sum( g1(1:nactive), intra_bgrp_comm ) ! g1 = < new z | new gradient b + e spsi - hpsi >
|
||||
|
||||
do l = 1, nactive; i = l + done ! evaluate the function ff
|
||||
ff(l) = - ( e_d(i)*gpu_DDOT(npw2,psi_d(:,i),1,spsi_d(:,i),1) - gpu_DDOT(npw2,psi_d(:,i),1,hpsi_d(:,i),1) ) &
|
||||
- 2.D0 * gpu_DDOT(npw2,psi_d(:,i),1,b_d(:,l),1)
|
||||
if (gstart==2) THEN
|
||||
!$cuf kernel do(1)
|
||||
do ii = 1, 1
|
||||
tmp_d = e_d(i)*spsi_d(1,i) - hpsi_d(1,i) + 2.D0 * b_d(1,l)
|
||||
end do
|
||||
ff(l) = ff(l) + 0.5D0 * gpu_DDOT(2,psi_d(1,i),1,tmp_d,1)
|
||||
END IF
|
||||
end do
|
||||
CALL mp_sum( ff(1:nactive), intra_bgrp_comm ) ! function minimum -0.5 < psi | e spsi - hpsi > - < psi | b >
|
||||
|
||||
newdone = 0 ! number of correction vectors that converge (or are done) at this iteration
|
||||
do l = 1, nactive; i = l + done
|
||||
!write (6,*) cg_iter(l), g1(l), ff(l), gamma(l)
|
||||
|
||||
IF ( ff(l) > ff0(l) .AND. ff0(l) < 0.d0 ) THEN
|
||||
tmp = alpha(l)
|
||||
!$cuf kernel do(1)
|
||||
DO ii = 1, npwx
|
||||
psi_d(ii,i) = psi_d(ii,i) - tmp * p_d(ii,l) ! fallback solution: if last iter failed to improve ff0
|
||||
hpsi_d(ii,i) = hpsi_d(ii,i) - tmp * hp_d(ii,l)! exit whitout updating and ...
|
||||
spsi_d(ii,i) = spsi_d(ii,i) - tmp * sp_d(ii,l)! hope next time it'll be better
|
||||
END DO
|
||||
END IF
|
||||
|
||||
!write(6,*) 'g0, g1, g2 :', g0(l), g1(l), g2(l)
|
||||
!write(6,*) 'ff0, ff : ', ff0(l), ff(l)
|
||||
IF ( ABS ( g1(l) ) < ethr_cg(l) .OR. ( ff(l) > ff0(l) ) .OR. cg_iter(l) == maxter) THEN ! EXIT iterate
|
||||
!write (6,*) ' exit pcg loop'
|
||||
!write(6,*) ' l =',l,' i =',i
|
||||
!if ( cg_iter(l) == maxter.and. ABS(g1(l)) > ethr_cg(l)) write (6,*) 'CG not converged maxter exceeded', cg_iter(l), g1(l), g0(l), ethr_cg(l)
|
||||
!IF ( ABS ( g1(l) ) < ethr_cg(l)) write (6,*) 'CG correction converged ', cg_iter(l), g1(l), ethr_cg(l)
|
||||
!IF ( ABS ( g1(l) ) > g0(l) ) write (6,*) 'CG not converged ', cg_iter(l), g1(l), g0(l), ethr_cg(l)
|
||||
|
||||
nhpsi = nhpsi + cg_iter(l) ! update nhpsi count
|
||||
IF (.NOT. (ABS(g1(l))< ethr_cg(l) .OR. (ff(l)>ff0(l)) ) .AND. cg_iter(l)==maxter) nhpsi = nhpsi + 1 ! because this would be the count
|
||||
|
||||
newdone = newdone + 1 ! one more solution found (or no more active anyway)
|
||||
!write(6,*) ' newdone = ', newdone
|
||||
|
||||
CALL start_clock( 'pcg:move' )
|
||||
!write(6,*) ' swapping converged psi/hpsi/spsi i = ',i, " with i' = ",done+newdone
|
||||
! swap the terminated vector with the first in the list of the active ones
|
||||
!$cuf kernel do(1)
|
||||
DO ii = 1, npwx
|
||||
p_d (ii,l) = psi_d (ii,done+newdone)
|
||||
psi_d (ii,done+newdone) = psi_d (ii,i)
|
||||
psi_d (ii,i) = p_d (ii,l)
|
||||
hp_d(ii,l) = hpsi_d(ii,done+newdone)
|
||||
hpsi_d(ii,done+newdone) = hpsi_d(ii,i)
|
||||
hpsi_d(ii,i) = hp_d(ii,l)
|
||||
sp_d(ii,l) = spsi_d(ii,done+newdone)
|
||||
spsi_d(ii,done+newdone) = spsi_d(ii,i)
|
||||
spsi_d(ii,i) = sp_d(ii,l)
|
||||
END DO
|
||||
|
||||
ee = e_d(done+newdone)
|
||||
!$cuf kernel do(1)
|
||||
DO ii = 1, 1
|
||||
e_d(done+newdone) = e_d(i)
|
||||
END DO
|
||||
e_d(i) = ee
|
||||
|
||||
!write(6,*) ' overwrite converged p/hp/etc l = ',l, ' with newdone = ',newdone
|
||||
! move information of the swapped active vector in the right place to keep going
|
||||
!$cuf kernel do(1)
|
||||
DO ii = 1, npwx
|
||||
p_d(ii,l) = p_d(ii,newdone)
|
||||
hp_d(ii,l) = p_d(ii,newdone)
|
||||
sp_d(ii,l) = sp_d(ii,newdone)
|
||||
b_d(ii,l) = b_d(ii,newdone)
|
||||
z_d(ii,l) = z_d(ii,newdone)
|
||||
END DO
|
||||
|
||||
ff0(l) = ff0(newdone) ; ff(l) = ff(newdone)
|
||||
alpha(l) = alpha(newdone)
|
||||
g0(l) = g0(newdone) ; g1(l) = g1(newdone) ; g2(l) = g2(newdone)
|
||||
cg_iter(l) = cg_iter(newdone) ; ethr_cg(l) = ethr_cg(newdone)
|
||||
CALL stop_clock( 'pcg:move' )
|
||||
|
||||
ELSE
|
||||
|
||||
!write(6,*) ' l =',l,' i =',i
|
||||
beta = (g1(l)-g2(l))/g0(l) ! Polak - Ribiere style update
|
||||
g0(l) = g1(l) ! < new z | new gradient > -> < old z | old gradient >
|
||||
!$cuf kernel do(1)
|
||||
DO ii = 1, npwx
|
||||
p_d(ii,l) = z_d(ii,l) + beta * p_d(ii,l) ! updated search direction
|
||||
END DO
|
||||
!write(6,*) 'beta :', beta
|
||||
|
||||
ff0(l) = ff(l) ! updated minimum value reached by the function
|
||||
|
||||
END IF
|
||||
end do
|
||||
|
||||
IF ( newdone > 0 ) THEN
|
||||
|
||||
done = done + newdone
|
||||
nactive = nactive - newdone
|
||||
|
||||
!write(6,*) ' there have been ', newdone, ' new converged solution'
|
||||
!write(6,*) ' done = ', done, ' nactive =', nactive
|
||||
|
||||
CALL start_clock( 'pcg:move' )
|
||||
do l=1, nactive
|
||||
|
||||
!write(6,*) ' l+newdone =',l+newdone,' -> l =',l
|
||||
!$cuf kernel do(1)
|
||||
DO ii = 1, npwx
|
||||
p_d (ii,l) = p_d (ii,l+newdone)
|
||||
hp_d(ii,l) = hp_d(ii,l+newdone)
|
||||
sp_d(ii,l) = sp_d(ii,l+newdone)
|
||||
b_d(ii,l) = b_d(ii,l+newdone)
|
||||
z_d(ii,l) = z_d(ii,l+newdone)
|
||||
END DO
|
||||
|
||||
ff0(l) = ff0(l+newdone) ; ff(l) = ff(l+newdone)
|
||||
g0(l) = g0(l+newdone) ; g1(l) = g1(l+newdone) ; g2(l) = g2(l+newdone)
|
||||
cg_iter(l) = cg_iter(l+newdone) ; ethr_cg(l) = ethr_cg(l+newdone)
|
||||
|
||||
end do
|
||||
CALL stop_clock( 'pcg:move' )
|
||||
|
||||
END IF
|
||||
|
||||
! END DO iterate Here is where the pcg loop would terminate
|
||||
|
||||
END DO MAIN_LOOP
|
||||
!write (6,*) ' exit pcg loop'
|
||||
|
||||
DEALLOCATE( b_d, z_d, p_d, hp_d, sp_d )
|
||||
DEALLOCATE( ethr_cg, ff, ff0, cg_iter )
|
||||
DEALLOCATE( g0, g1, g2, gamma )
|
||||
DEALLOCATE( spsi0vec_d )
|
||||
DEALLOCATE( alpha )
|
||||
!
|
||||
CALL stop_clock( 'pcg' )
|
||||
!
|
||||
RETURN
|
||||
!
|
||||
END SUBROUTINE bpcg_gamma_gpu
|
|
@ -0,0 +1,389 @@
|
|||
! Copyright (C) 2015-2016 Aihui Zhou's group
|
||||
!
|
||||
! This file is distributed under the terms of the
|
||||
! GNU General Public License. See the file `License'
|
||||
! in the root directory of the present distribution,
|
||||
! or http://www.gnu.org/copyleft/gpl.txt .
|
||||
!
|
||||
!-------------------------------------------------------------------------------
|
||||
!
|
||||
! We propose some parallel orbital updating based plane wave basis methods
|
||||
! for electronic structure calculations, which aims to the solution of the corresponding eigenvalue
|
||||
! problems. Compared to the traditional plane wave methods, our methods have the feature of two level
|
||||
! parallelization, which make them have great advantage in large-scale parallelization.
|
||||
!
|
||||
! The approach following Algorithm is the parallel orbital updating algorithm:
|
||||
! 1. Choose initial $E_{\mathrm{cut}}^{(0)}$ and then obtain $V_{N_G^{0}}$, use the SCF method to solve
|
||||
! the Kohn-Sham equation in $V_{G_0}$ and get the initial $(\lambda_i^{0},u_i^{0}), i=1, \cdots, N$
|
||||
! and let $n=0$.
|
||||
! 2. For $i=1,2,\ldots,N$, find $e_i^{n+1/2}\in V_{G_n}$ satisfying
|
||||
! $$a(\rho_{in}^{n}; e_i^{n+1/2}, v) = -[(a(\rho_{in}^{n}; u_i^{n}, v) - \lambda_i^{n} (u_i^{n}, v))] $$
|
||||
! in parallel , where $\rho_{in}^{n}$ is the input charge density obtained by the orbits obtained in the
|
||||
! $n$-th iteration or the former iterations.
|
||||
! 3. Find $\{\lambda_i^{n+1},u_i^{n+1}\} \in \mathbf{R}\times \tilde{V}_N$ satisfying
|
||||
! $$a(\tilde{\rho}; u_i^{n+1}, v) = ( \lambda_i^{n+1}u_i^{n+1}, v) \quad \forall v \in \tilde{V}_N$$
|
||||
! where $\tilde{V}_N = \mathrm{span}\{e_1^{n+1/2},\ldots,e_N^{n+1/2},u_1^{n},\ldots,u_N^{n}\}$,
|
||||
! $\tilde{\rho}(x)$ is the input charge density obtained from the previous orbits.
|
||||
! 4. Convergence check: if not converged, set $n=n+1$, go to step 2; else, stop.
|
||||
!
|
||||
! You can see the detailed information through
|
||||
! X. Dai, X. Gong, A. Zhou, J. Zhu,
|
||||
! A parallel orbital-updating approach for electronic structure calculations, arXiv:1405.0260 (2014).
|
||||
! X. Dai, Z. Liu, X. Zhang, A. Zhou,
|
||||
! A Parallel Orbital-updating Based Optimization Method for Electronic Structure Calculations,
|
||||
! arXiv:1510.07230 (2015).
|
||||
! Yan Pan, Xiaoying Dai, Xingao Gong, Stefano de Gironcoli, Gian-Marco Rignanese, and Aihui Zhou,
|
||||
! A Parallel Orbital-updating Based Plane Wave Basis Method. J. Comp. Phys. 348, 482-492 (2017).
|
||||
!
|
||||
! The file is written mainly by Stefano de Gironcoli and Yan Pan.
|
||||
! * GPU version Ivan Carnimeo
|
||||
!
|
||||
! The following file is for solving step 2 of the parallel orbital updating algorithm.
|
||||
!
|
||||
#define ZERO ( 0.D0, 0.D0 )
|
||||
#define ONE ( 1.D0, 0.D0 )
|
||||
!
|
||||
!----------------------------------------------------------------------------
|
||||
SUBROUTINE bpcg_k_gpu( hs_psi_gpu, g_1psi_gpu, psi0_d, spsi0_d, npw, npwx, nbnd, npol, nvec, &
|
||||
psi_d, hpsi_d, spsi_d, ethr, e_d, nhpsi )
|
||||
!----------------------------------------------------------------------------
|
||||
!
|
||||
! Block Preconditioned Conjugate Gradient solution of the linear system
|
||||
!
|
||||
! [ H - e S ]|\tilde\psi> = Pc [ e S - H ] |psi>
|
||||
!
|
||||
! the search targets the space orthogonal to the current best wfcs (psi0);
|
||||
! the solution is sought until the residual norm is a fixed fraction of the RHS norm
|
||||
! in this way the more accurate is the original problem the more accuratly the correction is computed
|
||||
!
|
||||
! in order to avoid un-necessary HSpsi evaluations this version assumes psi,hpsi and spsi are all
|
||||
! provided in input and return their estimate for further use
|
||||
!
|
||||
#if defined (__CUDA)
|
||||
USE cudafor
|
||||
#endif
|
||||
USE util_param, ONLY : DP, stdout
|
||||
USE mp_bands_util, ONLY : intra_bgrp_comm
|
||||
USE mp, ONLY : mp_sum
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
! Following varibales are temporary
|
||||
|
||||
COMPLEX(DP),INTENT(IN) :: psi0_d(npwx*npol,nbnd) ! psi0 needed to compute the Pv projection
|
||||
COMPLEX(DP),INTENT(IN) :: spsi0_d(npwx*npol,nbnd) ! Spsi0 needed to compute the Pv projection
|
||||
INTEGER, INTENT(IN) :: npw, npwx, nbnd, npol, nvec ! input dimensions
|
||||
REAL(DP), INTENT(IN) :: ethr ! threshold for convergence.
|
||||
REAL(DP), INTENT(INOUT) :: e_d(nvec) ! current estimate of the target eigenvalues
|
||||
COMPLEX(DP),INTENT(INOUT) :: psi_d(npwx*npol,nvec),hpsi_d(npwx*npol,nvec),spsi_d(npwx*npol,nvec) !
|
||||
! input: the current estimate of the wfcs
|
||||
! output: the estimated correction vectors
|
||||
INTEGER, INTENT(INOUT) :: nhpsi ! (updated) number of Hpsi evaluations
|
||||
!
|
||||
! ... LOCAL variables
|
||||
!
|
||||
INTEGER, PARAMETER :: maxter = 5 ! maximum number of CG iterations
|
||||
!
|
||||
COMPLEX(DP), ALLOCATABLE :: b_d(:,:), & ! RHS for testing
|
||||
p_d(:,:), hp_d(:,:), sp_d(:,:), z_d(:,:) ! additional working vetors
|
||||
|
||||
COMPLEX(DP), ALLOCATABLE :: spsi0vec_d (:,:) ! the product of spsi0 and a group of vectors
|
||||
|
||||
REAL(DP), ALLOCATABLE :: g0(:), g1(:), g2(:), gamma(:), ethr_cg(:), ff(:), ff0(:)
|
||||
REAL(DP), ALLOCATABLE :: alpha(:)
|
||||
INTEGER, ALLOCATABLE :: cg_iter(:)
|
||||
REAL(DP) :: beta, ee
|
||||
INTEGER :: kdim, kdmx, i, l, block_size, done, nactive, nnew, newdone
|
||||
!
|
||||
REAL(DP), EXTERNAL :: gpu_DDOT
|
||||
|
||||
EXTERNAL g_1psi_gpu, hs_psi_gpu
|
||||
! hs_1psi( npwx, npw, psi, hpsi, spsi )
|
||||
! hs_psi( npwx, npw, nvec, psi, hpsi, spsi )
|
||||
!
|
||||
!
|
||||
INTEGER :: ii, jj ! cuf kernel indeces
|
||||
REAL(DP) :: tmp
|
||||
#if defined (__CUDA)
|
||||
attributes(device) :: psi_d, hpsi_d, spsi_d, psi0_d, spsi0_d, spsi0vec_d
|
||||
attributes(device) :: e_d
|
||||
attributes(device) :: b_d, p_d, hp_d, sp_d, z_d
|
||||
#endif
|
||||
!
|
||||
CALL start_clock( 'pcg' ); !write (6,*) ' enter pcg' , e(1:2), 'npol = ', npol ; FLUSH(6)
|
||||
!
|
||||
kdim = npwx*(npol-1) + npw
|
||||
kdmx = npwx* npol
|
||||
block_size = min(nvec,64)
|
||||
!
|
||||
ALLOCATE( g0( block_size ), g1( block_size ), g2( block_size ), gamma( block_size ) )
|
||||
ALLOCATE( ethr_cg( block_size ), ff( block_size ), ff0( block_size ), cg_iter( block_size ) )
|
||||
ALLOCATE( z_d( kdmx, block_size ), b_d( kdmx, block_size ) )
|
||||
ALLOCATE( p_d(kdmx,block_size), hp_d(kdmx,block_size), sp_d(kdmx,block_size) )
|
||||
ALLOCATE( spsi0vec_d(nbnd, block_size) )
|
||||
ALLOCATE( alpha( block_size ) )
|
||||
!
|
||||
done = 0 ! the number of correction vectors already solved
|
||||
nactive = 0 ! the number of correction vectors currently being updated
|
||||
cg_iter = 0 ! how many iteration each active vector has completed (<= maxter)
|
||||
|
||||
MAIN_LOOP: & ! This is a continuous loop. It terminates only when nactive vanishes
|
||||
DO
|
||||
nnew = min(done+block_size,nvec)-(done+nactive) ! number of new corrections to be added to the seach
|
||||
if ( nnew > 0 ) then ! add nnew vectors to the active list
|
||||
!write(6,*) ' nnew =', nnew
|
||||
do l=nactive+1,nactive+nnew; i=l+done
|
||||
!write(6,*) ' l =',l,' i =',i
|
||||
!write (6,*) ' enter pcg' , e(i), 'npol = ', npol ; FLUSH(6)
|
||||
!$cuf kernel do(1)
|
||||
DO ii = 1, kdmx
|
||||
b_d(ii,l) = e_d(i) * spsi_d(ii,i) - hpsi_d(ii,i) ! initial gradient and saved RHS for later
|
||||
END DO
|
||||
!$cuf kernel do(1)
|
||||
DO ii = 1, kdmx
|
||||
z_d(ii,l) = b_d(ii,l)
|
||||
END DO
|
||||
call g_1psi_gpu(npwx,npw,z_d(:,l),e_d(i)) ! initial preconditioned gradient
|
||||
end do
|
||||
!- project on conduction bands
|
||||
CALL start_clock( 'pcg:ortho' )
|
||||
CALL gpu_ZGEMM( 'C', 'N', nbnd, nnew, kdim, ONE, spsi0_d, kdmx, z_d(:,nactive+1), kdmx, ZERO, &
|
||||
spsi0vec_d, nbnd )
|
||||
CALL mp_sum( spsi0vec_d, intra_bgrp_comm )
|
||||
CALL gpu_ZGEMM( 'N', 'N', kdim, nnew, nbnd, (-1.D0,0.D0), psi0_d, kdmx, spsi0vec_d, nbnd, ONE, &
|
||||
z_d(:,nactive+1), kdmx )
|
||||
CALL stop_clock( 'pcg:ortho' )
|
||||
!-
|
||||
do l=nactive+1,nactive+nnew; i=l+done
|
||||
g0(l) = gpu_DDOT( 2*kdim, z_d(:,l), 1, b_d(:,l), 1 )
|
||||
end do
|
||||
CALL mp_sum( g0(nactive+1:nactive+nnew), intra_bgrp_comm ) ! g0 = < initial z | initial gradient b >
|
||||
|
||||
do l=nactive+1,nactive+nnew; i=l+done
|
||||
!write(6,*) ' l =',l,' i =',i
|
||||
ff(l) = 0.d0 ; ff0(l) = ff(l)
|
||||
!write (6,*) 0, g0(l), ff(l)
|
||||
|
||||
! ethr_cg = ethr ! CG convergence threshold could be set from input but it is not ...
|
||||
ethr_cg(l) = 1.0D-2 ! it makes more sense to fix the convergence of the CG solution to a
|
||||
! fixed function of the RHS (see ethr_cg update later).
|
||||
ethr_cg(l) = max ( 0.01*ethr, ethr_cg(l) * g0(l) ) ! here we set the convergence of the correction
|
||||
!write(6,*) 'ethr_cg :', ethr_cg(l)
|
||||
|
||||
! zero the trial solution
|
||||
!$cuf kernel do(1)
|
||||
DO ii = 1, kdmx
|
||||
psi_d(ii,i) = ZERO
|
||||
hpsi_d(ii,i) = ZERO
|
||||
spsi_d(ii,i) = ZERO
|
||||
END DO
|
||||
! initial search direction
|
||||
!$cuf kernel do(1)
|
||||
DO ii = 1, kdmx
|
||||
p_d(ii,l) = z_d(ii,l)
|
||||
END DO
|
||||
cg_iter(l) = 0 ! this is a new correction vector, reset its interation count
|
||||
end do
|
||||
nactive = nactive + nnew
|
||||
end if
|
||||
!write(6,*) ' done =',done, ' nactive =',nactive
|
||||
|
||||
! iterate: ! DO cg_iter = 1, maxter ! THIS IS THE ENTRY POINT OF THE PCG LOOP
|
||||
|
||||
if ( nactive == 0 ) EXIT MAIN_LOOP ! this is the only MAIN_LOOP EXIT condition
|
||||
|
||||
cg_iter(1:nactive) = cg_iter(1:nactive) + 1 ! update interation counters
|
||||
|
||||
CALL start_clock( 'pcg:hs_1psi' )
|
||||
! do l = 1, nactive ! THIS COULD/SHOULD BE A GLOBAL CALL (ONLY WITHIN ONE BGRP THOUGH)
|
||||
! CALL hs_1psi( npwx, npw, p(:,l), hp(:,l), sp(:,l) ) ! apply H to a single wavefunction (no bgrp parallelization here!)
|
||||
! end do
|
||||
CALL hs_psi_gpu( npwx, npw, nactive, p_d, hp_d, sp_d ) ! apply H to a single wavefunction (no bgrp parallelization here!)
|
||||
CALL stop_clock( 'pcg:hs_1psi' )
|
||||
do l = 1, nactive; i=l+done
|
||||
gamma(l) = gpu_DDOT( 2*kdim, p_d(:,l), 1, hp_d(:,l), 1 )
|
||||
gamma(l) = gamma(l) - e_d(i) * gpu_DDOT( 2*kdim, p_d(:,l), 1, sp_d(:,l), 1 )
|
||||
end do
|
||||
CALL mp_sum( gamma(1:nactive), intra_bgrp_comm ) ! gamma = < p | hp - e sp >
|
||||
|
||||
do l = 1, nactive; i=l+done
|
||||
!write(6,*) ' l =',l,' i =',i
|
||||
|
||||
alpha(l) = g0(l)/gamma(l)
|
||||
!write(6,*) 'g0, gamma, alpha :', g0(l), gamma(l), alpha(l)
|
||||
tmp = alpha(l)
|
||||
!$cuf kernel do(1)
|
||||
DO ii = 1, kdmx
|
||||
psi_d(ii,i) = psi_d(ii,i) + tmp * p_d(ii,l) ! updated solution
|
||||
hpsi_d(ii,i) = hpsi_d(ii,i) + tmp * hp_d(ii,l) ! updated solution
|
||||
spsi_d(ii,i) = spsi_d(ii,i) + tmp * sp_d(ii,l) ! updated solution
|
||||
END DO
|
||||
|
||||
g2(l) = gpu_DDOT(2*kdim,z_d(:,l),1,b_d(:,l),1) &
|
||||
+ e_d(i) * gpu_DDOT(2*kdim,z_d(:,l),1,spsi_d(:,i),1) &
|
||||
- gpu_DDOT(2*kdim,z_d(:,l),1,hpsi_d(:,i),1)
|
||||
end do
|
||||
CALL mp_sum( g2(1:nactive), intra_bgrp_comm ) ! g2 = < old z | new gradient b + e spsi - hpsi >
|
||||
do l = 1, nactive; i=l+done ! update the preconditioned gradient
|
||||
!$cuf kernel do(1)
|
||||
DO ii = 1, kdmx
|
||||
z_d(ii,l) = b_d(ii,l) + e_d(i) * spsi_d(ii,i) - hpsi_d(ii,i)
|
||||
END DO
|
||||
call g_1psi_gpu(npwx,npw,z_d(:,l),e_d(i))
|
||||
end do
|
||||
!- project on conduction bands
|
||||
CALL start_clock( 'pcg:ortho' )
|
||||
CALL gpu_ZGEMM( 'C', 'N', nbnd, nactive, kdim, ONE, spsi0_d, kdmx, z_d, kdmx, ZERO, spsi0vec_d, nbnd )
|
||||
CALL mp_sum( spsi0vec_d, intra_bgrp_comm )
|
||||
CALL gpu_ZGEMM( 'N', 'N', kdim, nactive, nbnd, (-1.D0,0.D0), psi0_d, kdmx, spsi0vec_d, nbnd, ONE, z_d, kdmx )
|
||||
CALL stop_clock( 'pcg:ortho' )
|
||||
!-
|
||||
do l = 1, nactive; i=l+done
|
||||
g1(l) = gpu_DDOT(2*kdim,z_d(:,l),1,b_d(:,l),1) &
|
||||
+ e_d(i) * gpu_DDOT(2*kdim,z_d(:,l),1,spsi_d(:,i),1) &
|
||||
- gpu_DDOT(2*kdim,z_d(:,l),1,hpsi_d(:,i),1)
|
||||
end do
|
||||
CALL mp_sum( g1(1:nactive), intra_bgrp_comm ) ! g1 = < new z | new gradient b + e spsi - hpsi >
|
||||
do l = 1, nactive; i = l + done ! evaluate the function ff
|
||||
ff(l) = -0.5_DP * ( e_d(i)*gpu_DDOT(2*kdim,psi_d(:,i),1,spsi_d(:,i),1)&
|
||||
-gpu_DDOT(2*kdim,psi_d(:,i),1,hpsi_d(:,i),1) )
|
||||
ff(l) = ff(l) - gpu_DDOT(2*kdim,psi_d(:,i),1,b_d(:,l),1)
|
||||
end do
|
||||
CALL mp_sum( ff(1:nactive), intra_bgrp_comm ) ! function minimum -0.5 < psi | e spsi - hpsi > - < psi | b >
|
||||
|
||||
newdone = 0 ! number of correction vectors that converge (or are done) at this iteration
|
||||
do l = 1, nactive; i = l + done
|
||||
!write (6,*) cg_iter(l), g1(l), ff(l), gamma(l)
|
||||
|
||||
IF ( ff(l) > ff0(l) .AND. ff0(l) < 0.d0 ) THEN
|
||||
tmp = alpha(l)
|
||||
!$cuf kernel do(1)
|
||||
DO ii = 1, kdmx
|
||||
psi_d(ii,i) = psi_d(ii,i) - tmp * p_d(ii,l) ! fallback solution: if last iter failed to improve ff0
|
||||
hpsi_d(ii,i) = hpsi_d(ii,i) - tmp * hp_d(ii,l)! exit whitout updating and ...
|
||||
spsi_d(ii,i) = spsi_d(ii,i) - tmp * sp_d(ii,l)! hope next time it'll be better
|
||||
END DO
|
||||
END IF
|
||||
|
||||
!write(6,*) 'g0, g1, g2 :', g0(l), g1(l), g2(l)
|
||||
!write(6,*) 'ff0, ff : ', ff0(l), ff(l)
|
||||
IF ( ABS ( g1(l) ) < ethr_cg(l) .OR. ( ff(l) > ff0(l) ) .OR. cg_iter(l) == maxter) THEN ! EXIT iterate
|
||||
!write (6,*) ' exit pcg loop'
|
||||
!write(6,*) ' l =',l,' i =',i
|
||||
!if ( cg_iter(l) == maxter.and. ABS(g1(l)) > ethr_cg(l)) write (6,*) 'CG not converged maxter exceeded', cg_iter(l), g1(l), g0(l), ethr_cg(l)
|
||||
!IF ( ABS ( g1(l) ) < ethr_cg(l)) write (6,*) 'CG correction converged ', cg_iter(l), g1(l), ethr_cg(l)
|
||||
!IF ( ABS ( g1(l) ) > g0(l) ) write (6,*) 'CG not converged ', cg_iter(l), g1(l), g0(l), ethr_cg(l)
|
||||
|
||||
nhpsi = nhpsi + cg_iter(l) ! update nhpsi count
|
||||
IF (.NOT. (ABS(g1(l))< ethr_cg(l) .OR. (ff(l)>ff0(l)) ) .AND. cg_iter(l)==maxter) nhpsi = nhpsi + 1 ! because this would be the count
|
||||
|
||||
newdone = newdone + 1 ! one more solution found (or no more active anyway)
|
||||
!write(6,*) ' newdone = ', newdone
|
||||
|
||||
CALL start_clock( 'pcg:move' )
|
||||
!write(6,*) ' swapping converged psi/hpsi/spsi i = ',i, " with i' = ",done+newdone
|
||||
! swap the terminated vector with the first in the list of the active ones
|
||||
!$cuf kernel do(1)
|
||||
DO ii = 1, kdmx
|
||||
p_d (ii,l) = psi_d (ii,done+newdone)
|
||||
psi_d (ii,done+newdone) = psi_d (ii,i)
|
||||
psi_d (ii,i) = p_d (ii,l)
|
||||
hp_d(ii,l) = hpsi_d(ii,done+newdone)
|
||||
hpsi_d(ii,done+newdone) = hpsi_d(ii,i)
|
||||
hpsi_d(ii,i) = hp_d(ii,l)
|
||||
sp_d(ii,l) = spsi_d(ii,done+newdone)
|
||||
spsi_d(ii,done+newdone) = spsi_d(ii,i)
|
||||
spsi_d(ii,i) = sp_d(ii,l)
|
||||
END DO
|
||||
|
||||
ee = e_d(done+newdone)
|
||||
!$cuf kernel do(1)
|
||||
DO ii = 1, 1
|
||||
e_d(done+newdone) = e_d(i)
|
||||
END DO
|
||||
e_d(i) = ee
|
||||
|
||||
!write(6,*) ' overwrite converged p/hp/etc l = ',l, ' with newdone = ',newdone
|
||||
! move information of the swapped active vector in the right place to keep going
|
||||
!$cuf kernel do(1)
|
||||
DO ii = 1, kdmx
|
||||
p_d(ii,l) = p_d(ii,newdone)
|
||||
hp_d(ii,l) = p_d(ii,newdone)
|
||||
sp_d(ii,l) = sp_d(ii,newdone)
|
||||
b_d(ii,l) = b_d(ii,newdone)
|
||||
z_d(ii,l) = z_d(ii,newdone)
|
||||
END DO
|
||||
|
||||
alpha(l) = alpha(newdone)
|
||||
|
||||
ff0(l) = ff0(newdone)
|
||||
ff(l) = ff(newdone)
|
||||
g0(l) = g0(newdone)
|
||||
g1(l) = g1(newdone)
|
||||
g2(l) = g2(newdone)
|
||||
cg_iter(l) = cg_iter(newdone)
|
||||
ethr_cg(l) = ethr_cg(newdone)
|
||||
CALL stop_clock( 'pcg:move' )
|
||||
|
||||
ELSE
|
||||
|
||||
!write(6,*) ' l =',l,' i =',i
|
||||
beta = (g1(l)-g2(l))/g0(l) ! Polak - Ribiere style update
|
||||
g0(l) = g1(l) ! < new z | new gradient > -> < old z | old gradient >
|
||||
!$cuf kernel do(1)
|
||||
DO ii = 1, kdmx
|
||||
p_d(ii,l) = z_d(ii,l) + beta * p_d(ii,l) ! updated search direction
|
||||
END DO
|
||||
!write(6,*) 'beta :', beta
|
||||
|
||||
ff0(l) = ff(l) ! updated minimum value reached by the function
|
||||
|
||||
END IF
|
||||
end do
|
||||
|
||||
IF ( newdone > 0 ) THEN
|
||||
|
||||
done = done + newdone
|
||||
nactive = nactive - newdone
|
||||
|
||||
!write(6,*) ' there have been ', newdone, ' new converged solution'
|
||||
!write(6,*) ' done = ', done, ' nactive =', nactive
|
||||
|
||||
CALL start_clock( 'pcg:move' )
|
||||
do l=1, nactive
|
||||
|
||||
!write(6,*) ' l+newdone =',l+newdone,' -> l =',l
|
||||
!$cuf kernel do(1)
|
||||
DO ii = 1, kdmx
|
||||
p_d (ii,l) = p_d (ii,l+newdone)
|
||||
hp_d(ii,l) = hp_d(ii,l+newdone)
|
||||
sp_d(ii,l) = sp_d(ii,l+newdone)
|
||||
b_d(ii,l) = b_d(ii,l+newdone)
|
||||
z_d(ii,l) = z_d(ii,l+newdone)
|
||||
END DO
|
||||
|
||||
ff0(l) = ff0(l+newdone) ; ff(l) = ff(l+newdone)
|
||||
g0(l) = g0(l+newdone) ; g1(l) = g1(l+newdone) ; g2(l) = g2(l+newdone)
|
||||
cg_iter(l) = cg_iter(l+newdone) ; ethr_cg(l) = ethr_cg(l+newdone)
|
||||
|
||||
end do
|
||||
CALL stop_clock( 'pcg:move' )
|
||||
|
||||
END IF
|
||||
|
||||
! END DO iterate Here is where the pcg loop would terminate
|
||||
|
||||
END DO MAIN_LOOP
|
||||
!write (6,*) ' exit pcg loop'
|
||||
|
||||
DEALLOCATE( spsi0vec_d )
|
||||
DEALLOCATE( b_d, p_d, hp_d, sp_d, z_d )
|
||||
DEALLOCATE( ethr_cg, ff, ff0, cg_iter )
|
||||
DEALLOCATE( g0, g1, g2, gamma )
|
||||
DEALLOCATE( alpha )
|
||||
!
|
||||
CALL stop_clock( 'pcg' )
|
||||
!
|
||||
RETURN
|
||||
!
|
||||
END SUBROUTINE bpcg_k_gpu
|
|
@ -0,0 +1,13 @@
|
|||
PARO += \
|
||||
paro_gamma_new_gpu.o \
|
||||
paro_k_new_gpu.o \
|
||||
bpcg_gamma_gpu.o \
|
||||
bpcg_k_gpu.o
|
||||
|
||||
|
||||
bpcg_gamma_gpu.o : ../../UtilXlib/mp.o
|
||||
bpcg_gamma_gpu.o : ../../UtilXlib/mp_bands_util.o
|
||||
bpcg_gamma_gpu.o : ../../UtilXlib/util_param.o
|
||||
paro_gamma_new_gpu.o : ../../UtilXlib/mp.o
|
||||
paro_gamma_new_gpu.o : ../../UtilXlib/mp_bands_util.o
|
||||
paro_gamma_new_gpu.o : ../../UtilXlib/util_param.o
|
|
@ -0,0 +1,327 @@
|
|||
!
|
||||
! Copyright (C) 2015-2016 Aihui Zhou's group
|
||||
!
|
||||
! This file is distributed under the terms of the
|
||||
! GNU General Public License. See the file `License'
|
||||
! in the root directory of the present distribution,
|
||||
! or http://www.gnu.org/copyleft/gpl.txt .
|
||||
!
|
||||
!-------------------------------------------------------------------------------
|
||||
!
|
||||
! We propose some parallel orbital updating based plane wave basis methods
|
||||
! for electronic structure calculations, which aims to the solution of the corresponding eigenvalue
|
||||
! problems. Compared to the traditional plane wave methods, our methods have the feature of two level
|
||||
! parallelization, which make them have great advantage in large-scale parallelization.
|
||||
!
|
||||
! The approach following Algorithm is the parallel orbital updating algorithm:
|
||||
! 1. Choose initial $E_{\mathrm{cut}}^{(0)}$ and then obtain $V_{N_G^{0}}$, use the SCF method to solve
|
||||
! the Kohn-Sham equation in $V_{G_0}$ and get the initial $(\lambda_i^{0},u_i^{0}), i=1, \cdots, N$
|
||||
! and let $n=0$.
|
||||
! 2. For $i=1,2,\ldots,N$, find $e_i^{n+1/2}\in V_{G_n}$ satisfying
|
||||
! $$a(\rho_{in}^{n}; e_i^{n+1/2}, v) = -[(a(\rho_{in}^{n}; u_i^{n}, v) - \lambda_i^{n} (u_i^{n}, v))] $$
|
||||
! in parallel , where $\rho_{in}^{n}$ is the input charge density obtained by the orbits obtained in the
|
||||
! $n$-th iteration or the former iterations.
|
||||
! 3. Find $\{\lambda_i^{n+1},u_i^{n+1}\} \in \mathbf{R}\times \tilde{V}_N$ satisfying
|
||||
! $$a(\tilde{\rho}; u_i^{n+1}, v) = ( \lambda_i^{n+1}u_i^{n+1}, v) \quad \forall v \in \tilde{V}_N$$
|
||||
! where $\tilde{V}_N = \mathrm{span}\{e_1^{n+1/2},\ldots,e_N^{n+1/2},u_1^{n},\ldots,u_N^{n}\}$,
|
||||
! $\tilde{\rho}(x)$ is the input charge density obtained from the previous orbits.
|
||||
! 4. Convergence check: if not converged, set $n=n+1$, go to step 2; else, stop.
|
||||
!
|
||||
! You can see the detailed information through
|
||||
! X. Dai, X. Gong, A. Zhou, J. Zhu,
|
||||
! A parallel orbital-updating approach for electronic structure calculations, arXiv:1405.0260 (2014).
|
||||
! X. Dai, Z. Liu, X. Zhang, A. Zhou,
|
||||
! A Parallel Orbital-updating Based Optimization Method for Electronic Structure Calculations,
|
||||
! arXiv:1510.07230 (2015).
|
||||
! Yan Pan, Xiaoying Dai, Xingao Gong, Stefano de Gironcoli, Gian-Marco Rignanese, and Aihui Zhou,
|
||||
! A Parallel Orbital-updating Based Plane Wave Basis Method. J. Comp. Phys. 348, 482-492 (2017).
|
||||
!
|
||||
! The file is written mainly by Stefano de Gironcoli and Yan Pan.
|
||||
!
|
||||
! GPU version by Ivan Carnimeo
|
||||
!
|
||||
!-------------------------------------------------------------------------------
|
||||
SUBROUTINE paro_gamma_new_gpu( h_psi_gpu, s_psi_gpu, hs_psi_gpu, g_1psi_gpu, overlap, &
|
||||
npwx, npw, nbnd, evc_d, eig_d, btype, ethr, notconv, nhpsi )
|
||||
!-------------------------------------------------------------------------------
|
||||
!paro_flag = 1: modified parallel orbital-updating method
|
||||
!
|
||||
#if defined (__CUDA)
|
||||
USE cudafor
|
||||
#endif
|
||||
! global variables
|
||||
USE util_param, ONLY : DP, stdout
|
||||
USE mp_bands_util, ONLY : inter_bgrp_comm, nbgrp, my_bgrp_id
|
||||
USE mp, ONLY : mp_sum, mp_allgather, mp_barrier, &
|
||||
mp_type_create_column_section, mp_type_free
|
||||
|
||||
IMPLICIT NONE
|
||||
!
|
||||
INCLUDE 'laxlib.fh'
|
||||
|
||||
! I/O variables
|
||||
LOGICAL, INTENT(IN) :: overlap
|
||||
INTEGER, INTENT(IN) :: npw, npwx, nbnd
|
||||
COMPLEX(DP), INTENT(INOUT) :: evc_d(npwx,nbnd)
|
||||
REAL(DP), INTENT(IN) :: ethr
|
||||
REAL(DP), INTENT(INOUT) :: eig_d(nbnd)
|
||||
INTEGER, INTENT(IN) :: btype(nbnd)
|
||||
INTEGER, INTENT(OUT) :: notconv, nhpsi
|
||||
! INTEGER, INTENT(IN) :: paro_flag
|
||||
|
||||
! local variables (used in the call to cegterg )
|
||||
!------------------------------------------------------------------------
|
||||
EXTERNAL h_psi_gpu, s_psi_gpu, hs_psi_gpu, g_1psi_gpu
|
||||
! subroutine h_psi (npwx,npw,nvec,evc,hpsi) computes H*evc using band parallelization
|
||||
! subroutine s_psi (npwx,npw,nvec,evc,spsi) computes S*evc using band parallelization
|
||||
! subroutine hs_1psi(npwx,npw,evc,hpsi,spsi) computes H*evc and S*evc for a single band
|
||||
! subroutine g_1psi (npwx,npw,psi,eig) computes g*psi for a single band
|
||||
|
||||
!
|
||||
! ... local variables
|
||||
!
|
||||
INTEGER :: itry, paro_ntr, nconv, nextra, nactive, nbase, ntrust, ndiag, nvecx, nproc_ortho
|
||||
|
||||
LOGICAL, ALLOCATABLE :: conv(:)
|
||||
|
||||
REAL(DP), PARAMETER :: extra_factor = 0.5 ! workspace is at most this factor larger than nbnd
|
||||
INTEGER, PARAMETER :: min_extra = 4 ! but at least this lager
|
||||
|
||||
INTEGER :: ibnd, ibnd_start, ibnd_end, how_many, lbnd, kbnd, last_unconverged, &
|
||||
recv_counts(nbgrp), displs(nbgrp), column_type
|
||||
|
||||
INTEGER :: ii, jj, kk ! indexes for cuf kernel loops
|
||||
|
||||
!civn 2fix: these are needed only for __MPI = true (protate)
|
||||
COMPLEX(DP), ALLOCATABLE :: psi(:,:), hpsi(:,:), spsi(:,:)
|
||||
REAL(DP), ALLOCATABLE :: eig(:), ew(:)
|
||||
!
|
||||
!
|
||||
! ... device variables
|
||||
!
|
||||
COMPLEX(DP), ALLOCATABLE :: psi_d(:,:), hpsi_d(:,:), spsi_d(:,:)
|
||||
REAL(DP), ALLOCATABLE :: ew_d(:)
|
||||
LOGICAL, ALLOCATABLE :: conv_d(:)
|
||||
#if defined (__CUDA)
|
||||
attributes(device) :: psi_d, hpsi_d, spsi_d
|
||||
attributes(device) :: evc_d, eig_d, ew_d
|
||||
attributes(device) :: conv_d
|
||||
#endif
|
||||
!
|
||||
! ... init local variables
|
||||
!
|
||||
CALL laxlib_getval( nproc_ortho = nproc_ortho )
|
||||
paro_ntr = 20
|
||||
|
||||
nvecx = nbnd + max ( nint ( extra_factor * nbnd ), min_extra )
|
||||
!
|
||||
CALL start_clock( 'paro_gamma' ); !write (6,*) ' enter paro diag'
|
||||
|
||||
CALL mp_type_create_column_section(evc_d(1,1), 0, npwx, npwx, column_type)
|
||||
|
||||
ALLOCATE ( conv(nbnd) )
|
||||
ALLOCATE ( conv_d(nbnd) )
|
||||
ALLOCATE ( psi_d(npwx,nvecx), hpsi_d(npwx,nvecx), spsi_d(npwx,nvecx), ew_d(nvecx) )
|
||||
|
||||
CALL start_clock( 'paro:init' );
|
||||
conv(:) = .FALSE. ; nconv = COUNT ( conv(:) )
|
||||
!$cuf kernel do(1)
|
||||
do ii = 1, nbnd
|
||||
conv_d(ii) = .FALSE.
|
||||
end do
|
||||
|
||||
!$cuf kernel do(1)
|
||||
DO ii = 1, npwx
|
||||
psi_d(ii,1:nbnd) = evc_d(ii,1:nbnd) ! copy input evc into work vector
|
||||
END DO
|
||||
|
||||
call h_psi_gpu (npwx,npw,nbnd,psi_d,hpsi_d) ! computes H*psi
|
||||
call s_psi_gpu (npwx,npw,nbnd,psi_d,spsi_d) ! computes S*psi
|
||||
nhpsi = 0 ; IF (my_bgrp_id==0) nhpsi = nbnd
|
||||
CALL stop_clock( 'paro:init' );
|
||||
|
||||
#if defined(__MPI)
|
||||
IF ( nproc_ortho == 1 ) THEN
|
||||
#endif
|
||||
CALL rotate_HSpsi_gamma_gpu ( npwx, npw, nbnd, nbnd, psi_d, hpsi_d, overlap, spsi_d, eig_d )
|
||||
#if defined(__MPI)
|
||||
ELSE
|
||||
!civn 2fix
|
||||
ALLOCATE ( psi(npwx,nvecx), hpsi(npwx,nvecx), spsi(npwx,nvecx), eig(nbnd) )
|
||||
eig = eig_d
|
||||
psi = psi_d
|
||||
hpsi = hpsi_d
|
||||
spsi = spsi_d
|
||||
CALL protate_HSpsi_gamma( npwx, npw, nbnd, nbnd, psi, hpsi, overlap, spsi, eig )
|
||||
eig_d = eig
|
||||
psi_d = psi
|
||||
hpsi_d = hpsi
|
||||
spsi_d = spsi
|
||||
DEALLOCATE ( psi, hpsi, spsi, eig )
|
||||
ENDIF
|
||||
#endif
|
||||
|
||||
!write (6,'(10f10.4)') psi(1:5,1:3)
|
||||
|
||||
!write (6,*) eig(1:nbnd)
|
||||
|
||||
ParO_loop : &
|
||||
DO itry = 1,paro_ntr
|
||||
|
||||
!write (6,*) ' paro_itry =', itry, ethr
|
||||
|
||||
!----------------------------
|
||||
|
||||
nactive = nbnd - (nconv+1)/2 ! number of correction vectors to be computed (<nbnd)
|
||||
notconv = nbnd - nconv ! number of needed roots
|
||||
nextra = nactive - notconv ! number of extra vectors
|
||||
nbase = nconv + nactive ! number of orbitals the correction should be orthogonal to (<2*nbnd)
|
||||
ndiag = nbase + nactive ! dimension of the matrix to be diagonalized at this iteration (<2*nbnd)
|
||||
|
||||
!----------------------------
|
||||
|
||||
nactive = min ( (nvecx-nconv)/2, nvecx-nbnd) ! number of corrections there is space for
|
||||
notconv = nbnd - nconv ! number of needed roots
|
||||
nextra = max ( nactive - notconv, 0 ) ! number of extra vectors, if any
|
||||
nbase = max ( nconv + nactive , nbnd ) ! number of orbitals to be orthogonal to (<nvecx)
|
||||
ntrust = min ( nconv + nactive , nbnd ) ! number of orbitals that will be actually corrected
|
||||
ndiag = nbase + nactive ! dimension of the matrix to be diagonalized at this iteration (<nvecx)
|
||||
|
||||
!write (6,*) itry, notconv, conv
|
||||
!write (6,*) ' nvecx, nbnd, nconv, notconv, nextra, nactive, nbase, ntrust, ndiag =', nvecx, nbnd, nconv, notconv, nextra, nactive, nbase, ntrust, ndiag
|
||||
|
||||
CALL divide_all(inter_bgrp_comm,nactive,ibnd_start,ibnd_end,recv_counts,displs)
|
||||
how_many = ibnd_end - ibnd_start + 1
|
||||
!write (6,*) nactive, ibnd_start, ibnd_end, recv_counts, displs
|
||||
|
||||
CALL start_clock( 'paro:pack' );
|
||||
lbnd = 1; kbnd = 1
|
||||
DO ibnd = 1, ntrust ! pack unconverged roots in the available space
|
||||
IF (.NOT.conv(ibnd) ) THEN
|
||||
!$cuf kernel do(1)
|
||||
DO ii = 1, npwx
|
||||
psi_d (ii,nbase+kbnd) = psi_d(ii,ibnd)
|
||||
hpsi_d(ii,nbase+kbnd) = hpsi_d(ii,ibnd)
|
||||
spsi_d(ii,nbase+kbnd) = spsi_d(ii,ibnd)
|
||||
END DO
|
||||
!$cuf kernel do(1)
|
||||
DO ii = 1, 1
|
||||
ew_d(kbnd) = eig_d(ibnd)
|
||||
END DO
|
||||
last_unconverged = ibnd
|
||||
lbnd=lbnd+1 ; kbnd=kbnd+recv_counts(mod(lbnd-2,nbgrp)+1); if (kbnd>nactive) kbnd=kbnd+1-nactive
|
||||
END IF
|
||||
END DO
|
||||
DO ibnd = nbnd+1, nbase ! add extra vectors if it is the case
|
||||
!$cuf kernel do(1)
|
||||
DO ii = 1, npwx
|
||||
psi_d (ii,nbase+kbnd) = psi_d(ii,ibnd)
|
||||
hpsi_d(ii,nbase+kbnd) = hpsi_d(ii,ibnd)
|
||||
spsi_d(ii,nbase+kbnd) = spsi_d(ii,ibnd)
|
||||
END DO
|
||||
!$cuf kernel do(1)
|
||||
DO ii = 1, 1
|
||||
ew_d(kbnd) = eig_d(last_unconverged)
|
||||
END DO
|
||||
lbnd=lbnd+1 ; kbnd=kbnd+recv_counts(mod(lbnd-2,nbgrp)+1); if (kbnd>nactive) kbnd=kbnd+1-nactive
|
||||
END DO
|
||||
!$cuf kernel do(2)
|
||||
DO ii = 1, npwx
|
||||
DO jj = nbase+1, nbase+how_many
|
||||
kk = jj + ibnd_start - 1
|
||||
psi_d (ii,jj) = psi_d (ii,kk)
|
||||
hpsi_d(ii,jj) = hpsi_d(ii,kk)
|
||||
spsi_d(ii,jj) = spsi_d(ii,kk)
|
||||
END DO
|
||||
END DO
|
||||
!$cuf kernel do(1)
|
||||
DO ii = 1, how_many
|
||||
kk = ii + ibnd_start - 1
|
||||
ew_d(ii) = ew_d(kk)
|
||||
END DO
|
||||
|
||||
CALL stop_clock( 'paro:pack' );
|
||||
|
||||
! write (6,*) ' check nactive = ', lbnd, nactive
|
||||
if (lbnd .ne. nactive+1 ) stop ' nactive check FAILED '
|
||||
CALL bpcg_gamma_gpu(hs_psi_gpu, g_1psi_gpu, psi_d, spsi_d, npw, npwx, nbnd, how_many, &
|
||||
psi_d(:,nbase+1), hpsi_d(:,nbase+1), spsi_d(:,nbase+1), ethr, ew_d(1), nhpsi)
|
||||
CALL start_clock( 'paro:mp_bar' );
|
||||
CALL mp_barrier(inter_bgrp_comm)
|
||||
CALL stop_clock( 'paro:mp_bar' );
|
||||
CALL start_clock( 'paro:mp_sum' );
|
||||
!$cuf kernel do(2)
|
||||
DO ii = 1, npwx
|
||||
DO jj = nbase+1, nbase+how_many
|
||||
kk = jj + ibnd_start - 1
|
||||
psi_d(ii, kk) = psi_d(ii, jj)
|
||||
hpsi_d(ii, kk) = hpsi_d(ii, jj)
|
||||
spsi_d(ii, kk) = spsi_d(ii, jj)
|
||||
END DO
|
||||
END DO
|
||||
CALL mp_allgather(psi_d (:,nbase+1:ndiag), column_type, recv_counts, displs, inter_bgrp_comm)
|
||||
CALL mp_allgather(hpsi_d(:,nbase+1:ndiag), column_type, recv_counts, displs, inter_bgrp_comm)
|
||||
CALL mp_allgather(spsi_d(:,nbase+1:ndiag), column_type, recv_counts, displs, inter_bgrp_comm)
|
||||
CALL stop_clock( 'paro:mp_sum' );
|
||||
|
||||
#if defined(__MPI)
|
||||
IF ( nproc_ortho == 1 ) THEN
|
||||
#endif
|
||||
CALL rotate_HSpsi_gamma_gpu ( npwx, npw, ndiag, ndiag, psi_d, hpsi_d, overlap, spsi_d, ew_d )
|
||||
#if defined(__MPI)
|
||||
ELSE
|
||||
!civn 2fix
|
||||
ALLOCATE ( psi(npwx,nvecx), hpsi(npwx,nvecx), spsi(npwx,nvecx), ew(nvecx) )
|
||||
ew = ew_d
|
||||
psi = psi_d
|
||||
hpsi = hpsi_d
|
||||
spsi = spsi_d
|
||||
CALL protate_HSpsi_gamma( npwx, npw, ndiag, ndiag, psi, hpsi, overlap, spsi, ew )
|
||||
ew_d = ew
|
||||
psi_d = psi
|
||||
hpsi_d = hpsi
|
||||
spsi_d = spsi
|
||||
DEALLOCATE ( psi, hpsi, spsi, ew )
|
||||
ENDIF
|
||||
#endif
|
||||
|
||||
!write (6,*) ' ew : ', ew(1:nbnd)
|
||||
! only the first nbnd eigenvalues are relevant for convergence
|
||||
! but only those that have actually been corrected should be trusted
|
||||
conv(1:nbnd) = .FALSE.
|
||||
!$cuf kernel do(1)
|
||||
do ii = 1, nbnd
|
||||
conv_d(ii) = .FALSE.
|
||||
end do
|
||||
!$cuf kernel do(1)
|
||||
DO ii = 1, ntrust
|
||||
conv_d(ii) = ABS(ew_d(ii) - eig_d(ii)).LT.ethr
|
||||
END DO
|
||||
conv = conv_d
|
||||
nconv = COUNT(conv(1:ntrust)) ; notconv = nbnd - nconv
|
||||
!$cuf kernel do(1)
|
||||
DO ii = 1, nbnd
|
||||
eig_d(ii) = ew_d(ii)
|
||||
END DO
|
||||
IF ( nconv == nbnd ) EXIT ParO_loop
|
||||
|
||||
END DO ParO_loop
|
||||
|
||||
!$cuf kernel do(1)
|
||||
DO ii = 1, npwx
|
||||
DO jj = 1, nbnd
|
||||
evc_d(ii,jj) = psi_d(ii,jj)
|
||||
END DO
|
||||
END DO
|
||||
|
||||
CALL mp_sum(nhpsi,inter_bgrp_comm)
|
||||
|
||||
DEALLOCATE ( ew_d, conv )
|
||||
DEALLOCATE ( conv_d )
|
||||
DEALLOCATE ( psi_d, hpsi_d, spsi_d )
|
||||
|
||||
CALL mp_type_free( column_type )
|
||||
|
||||
CALL stop_clock( 'paro_gamma' ); !write (6,*) ' exit paro diag'
|
||||
|
||||
END SUBROUTINE paro_gamma_new_gpu
|
|
@ -0,0 +1,325 @@
|
|||
!
|
||||
! Copyright (C) 2015-2016 Aihui Zhou's group
|
||||
!
|
||||
! This file is distributed under the terms of the
|
||||
! GNU General Public License. See the file `License'
|
||||
! in the root directory of the present distribution,
|
||||
! or http://www.gnu.org/copyleft/gpl.txt .
|
||||
!
|
||||
!-------------------------------------------------------------------------------
|
||||
!
|
||||
! We propose some parallel orbital updating based plane wave basis methods
|
||||
! for electronic structure calculations, which aims to the solution of the corresponding eigenvalue
|
||||
! problems. Compared to the traditional plane wave methods, our methods have the feature of two level
|
||||
! parallelization, which make them have great advantage in large-scale parallelization.
|
||||
!
|
||||
! The approach following Algorithm is the parallel orbital updating algorithm:
|
||||
! 1. Choose initial $E_{\mathrm{cut}}^{(0)}$ and then obtain $V_{N_G^{0}}$, use the SCF method to solve
|
||||
! the Kohn-Sham equation in $V_{G_0}$ and get the initial $(\lambda_i^{0},u_i^{0}), i=1, \cdots, N$
|
||||
! and let $n=0$.
|
||||
! 2. For $i=1,2,\ldots,N$, find $e_i^{n+1/2}\in V_{G_n}$ satisfying
|
||||
! $$a(\rho_{in}^{n}; e_i^{n+1/2}, v) = -[(a(\rho_{in}^{n}; u_i^{n}, v) - \lambda_i^{n} (u_i^{n}, v))] $$
|
||||
! in parallel , where $\rho_{in}^{n}$ is the input charge density obtained by the orbits obtained in the
|
||||
! $n$-th iteration or the former iterations.
|
||||
! 3. Find $\{\lambda_i^{n+1},u_i^{n+1}\} \in \mathbf{R}\times \tilde{V}_N$ satisfying
|
||||
! $$a(\tilde{\rho}; u_i^{n+1}, v) = ( \lambda_i^{n+1}u_i^{n+1}, v) \quad \forall v \in \tilde{V}_N$$
|
||||
! where $\tilde{V}_N = \mathrm{span}\{e_1^{n+1/2},\ldots,e_N^{n+1/2},u_1^{n},\ldots,u_N^{n}\}$,
|
||||
! $\tilde{\rho}(x)$ is the input charge density obtained from the previous orbits.
|
||||
! 4. Convergence check: if not converged, set $n=n+1$, go to step 2; else, stop.
|
||||
!
|
||||
! You can see the detailed information through
|
||||
! X. Dai, X. Gong, A. Zhou, J. Zhu,
|
||||
! A parallel orbital-updating approach for electronic structure calculations, arXiv:1405.0260 (2014).
|
||||
! X. Dai, Z. Liu, X. Zhang, A. Zhou,
|
||||
! A Parallel Orbital-updating Based Optimization Method for Electronic Structure Calculations,
|
||||
! arXiv:1510.07230 (2015).
|
||||
! Yan Pan, Xiaoying Dai, Xingao Gong, Stefano de Gironcoli, Gian-Marco Rignanese, and Aihui Zhou,
|
||||
! A Parallel Orbital-updating Based Plane Wave Basis Method. J. Comp. Phys. 348, 482-492 (2017).
|
||||
!
|
||||
! The file is written mainly by Stefano de Gironcoli and Yan Pan.
|
||||
!
|
||||
!-------------------------------------------------------------------------------
|
||||
SUBROUTINE paro_k_new_gpu( h_psi_gpu, s_psi_gpu, hs_psi_gpu, g_1psi_gpu, overlap, &
|
||||
npwx, npw, nbnd, npol, evc_d, eig_d, btype, ethr, notconv, nhpsi )
|
||||
!-------------------------------------------------------------------------------
|
||||
!paro_flag = 1: modified parallel orbital-updating method
|
||||
|
||||
#if defined (__CUDA)
|
||||
USE cudafor
|
||||
#endif
|
||||
! global variables
|
||||
USE util_param, ONLY : DP, stdout
|
||||
USE mp_bands_util, ONLY : inter_bgrp_comm, nbgrp, my_bgrp_id
|
||||
USE mp, ONLY : mp_sum, mp_allgather, mp_barrier, &
|
||||
mp_type_create_column_section, mp_type_free
|
||||
|
||||
IMPLICIT NONE
|
||||
!
|
||||
INCLUDE 'laxlib.fh'
|
||||
|
||||
! I/O variables
|
||||
LOGICAL, INTENT(IN) :: overlap
|
||||
INTEGER, INTENT(IN) :: npw, npwx, nbnd, npol
|
||||
COMPLEX(DP), INTENT(INOUT) :: evc_d(npwx*npol,nbnd)
|
||||
REAL(DP), INTENT(IN) :: ethr
|
||||
REAL(DP), INTENT(INOUT) :: eig_d(nbnd)
|
||||
REAL(DP), ALLOCATABLE :: eig(:) ! copy for protate
|
||||
INTEGER, INTENT(IN) :: btype(nbnd)
|
||||
INTEGER, INTENT(OUT) :: notconv, nhpsi
|
||||
! INTEGER, INTENT(IN) :: paro_flag
|
||||
|
||||
! local variables (used in the call to cegterg )
|
||||
!------------------------------------------------------------------------
|
||||
EXTERNAL h_psi_gpu, s_psi_gpu, hs_psi_gpu, g_1psi_gpu
|
||||
! subroutine h_psi (npwx,npw,nvec,evc,hpsi) computes H*evc using band parallelization
|
||||
! subroutine s_psi (npwx,npw,nvec,evc,spsi) computes S*evc using band parallelization
|
||||
! subroutine hs_1psi(npwx,npw,evc,hpsi,spsi) computes H*evc and S*evc for a single band
|
||||
! subroutine g_1psi (npwx,npw,psi,eig) computes g*psi for a single band
|
||||
|
||||
!
|
||||
! ... local variables
|
||||
!
|
||||
INTEGER :: itry, paro_ntr, nconv, nextra, nactive, nbase, ntrust, ndiag, nvecx, nproc_ortho
|
||||
|
||||
LOGICAL, ALLOCATABLE :: conv(:)
|
||||
|
||||
REAL(DP), PARAMETER :: extra_factor = 0.5 ! workspace is at most this factor larger than nbnd
|
||||
INTEGER, PARAMETER :: min_extra = 4 ! but at least this lager
|
||||
|
||||
INTEGER :: ibnd, ibnd_start, ibnd_end, how_many, lbnd, kbnd, last_unconverged, &
|
||||
recv_counts(nbgrp), displs(nbgrp), column_type
|
||||
|
||||
INTEGER :: ii, jj, kk ! indexes for cuf kernel loops
|
||||
!
|
||||
!civn 2fix: these are needed only for __MPI = true (protate)
|
||||
REAL(DP), ALLOCATABLE :: ew(:)
|
||||
COMPLEX(DP), ALLOCATABLE :: psi(:,:), hpsi(:,:), spsi(:,:)
|
||||
!
|
||||
! .. device variables
|
||||
!
|
||||
REAL(DP), ALLOCATABLE :: ew_d(:)
|
||||
COMPLEX(DP), ALLOCATABLE :: psi_d(:,:), hpsi_d(:,:), spsi_d(:,:)
|
||||
LOGICAL, ALLOCATABLE :: conv_d(:)
|
||||
#if defined (__CUDA)
|
||||
attributes(device) :: evc_d, eig_d
|
||||
attributes(device) :: psi_d, hpsi_d, spsi_d, ew_d
|
||||
attributes(device) :: conv_d
|
||||
#endif
|
||||
!
|
||||
! ... init local variables
|
||||
!
|
||||
CALL laxlib_getval( nproc_ortho = nproc_ortho )
|
||||
paro_ntr = 20
|
||||
|
||||
nvecx = nbnd + max ( nint ( extra_factor * nbnd ), min_extra )
|
||||
!
|
||||
CALL start_clock( 'paro_k' ); !write (6,*) ' enter paro diag'
|
||||
|
||||
CALL mp_type_create_column_section(evc_d(1,1), 0, npwx*npol, npwx*npol, column_type)
|
||||
|
||||
ALLOCATE ( ew_d(nvecx), conv(nbnd) )
|
||||
ALLOCATE ( conv_d(nbnd) )
|
||||
ALLOCATE ( psi_d(npwx*npol,nvecx), hpsi_d(npwx*npol,nvecx), spsi_d(npwx*npol,nvecx) )
|
||||
|
||||
CALL start_clock( 'paro:init' );
|
||||
conv(:) = .FALSE. ; nconv = COUNT ( conv(:) )
|
||||
!$cuf kernel do(1)
|
||||
do ii = 1, nbnd
|
||||
conv_d(ii) = .FALSE.
|
||||
end do
|
||||
!$cuf kernel do(2)
|
||||
DO ii = 1, npwx*npol
|
||||
DO jj = 1, nbnd
|
||||
psi_d(ii,jj) = evc_d(ii,jj)
|
||||
END DO
|
||||
END DO
|
||||
|
||||
call h_psi_gpu (npwx,npw,nbnd,psi_d,hpsi_d) ! computes H*psi
|
||||
call s_psi_gpu (npwx,npw,nbnd,psi_d,spsi_d) ! computes S*psi
|
||||
nhpsi = 0 ; IF (my_bgrp_id==0) nhpsi = nbnd
|
||||
CALL stop_clock( 'paro:init' );
|
||||
|
||||
#if defined(__MPI)
|
||||
IF ( nproc_ortho == 1 ) THEN
|
||||
#endif
|
||||
CALL rotate_HSpsi_k_gpu ( npwx, npw, nbnd, nbnd, npol, psi_d, hpsi_d, overlap, spsi_d, eig_d )
|
||||
#if defined(__MPI)
|
||||
ELSE
|
||||
!civn 2fix
|
||||
ALLOCATE ( psi(npwx*npol,nvecx), hpsi(npwx*npol,nvecx), spsi(npwx*npol,nvecx), eig(nbnd) )
|
||||
psi = psi_d
|
||||
hpsi = hpsi_d
|
||||
spsi = spsi_d
|
||||
eig = eig_d
|
||||
CALL protate_HSpsi_k( npwx, npw, nbnd, nbnd, npol, psi, hpsi, overlap, spsi, eig )
|
||||
psi_d = psi
|
||||
hpsi_d = hpsi
|
||||
spsi_d = spsi
|
||||
eig_d = eig
|
||||
DEALLOCATE ( psi, hpsi, spsi, eig )
|
||||
ENDIF
|
||||
#endif
|
||||
!write (6,'(10f10.4)') psi(1:5,1:3)
|
||||
|
||||
!write (6,*) eig(1:nbnd)
|
||||
|
||||
ParO_loop : &
|
||||
DO itry = 1,paro_ntr
|
||||
|
||||
!write (6,*) ' paro_itry =', itry, ethr
|
||||
|
||||
!----------------------------
|
||||
|
||||
nactive = nbnd - (nconv+1)/2 ! number of correction vectors to be computed (<nbnd)
|
||||
notconv = nbnd - nconv ! number of needed roots
|
||||
nextra = nactive - notconv ! number of extra vectors
|
||||
nbase = nconv + nactive ! number of orbitals the correction should be orthogonal to (<2*nbnd)
|
||||
ndiag = nbase + nactive ! dimension of the matrix to be diagonalized at this iteration (<2*nbnd)
|
||||
|
||||
!----------------------------
|
||||
|
||||
nactive = min ( (nvecx-nconv)/2, nvecx-nbnd) ! number of corrections there is space for
|
||||
notconv = nbnd - nconv ! number of needed roots
|
||||
nextra = max ( nactive - notconv, 0 ) ! number of extra vectors, if any
|
||||
nbase = max ( nconv + nactive , nbnd ) ! number of orbitals to be orthogonal to (<nvecx)
|
||||
ntrust = min ( nconv + nactive , nbnd ) ! number of orbitals that will be actually corrected
|
||||
ndiag = nbase + nactive ! dimension of the matrix to be diagonalized at this iteration (<nvecx)
|
||||
|
||||
!write (6,*) itry, notconv, conv
|
||||
!write (6,*) ' nvecx, nbnd, nconv, notconv, nextra, nactive, nbase, ntrust, ndiag =', nvecx, nbnd, nconv, notconv, nextra, nactive, nbase, ntrust, ndiag
|
||||
|
||||
CALL divide_all(inter_bgrp_comm,nactive,ibnd_start,ibnd_end,recv_counts,displs)
|
||||
how_many = ibnd_end - ibnd_start + 1
|
||||
!write (6,*) nactive, ibnd_start, ibnd_end, recv_counts, displs
|
||||
|
||||
CALL start_clock( 'paro:pack' );
|
||||
lbnd = 1; kbnd = 1
|
||||
DO ibnd = 1, ntrust ! pack unconverged roots in the available space
|
||||
IF (.NOT.conv(ibnd) ) THEN
|
||||
!$cuf kernel do(1)
|
||||
DO ii = 1, npwx*npol
|
||||
psi_d (ii,nbase+kbnd) = psi_d(ii,ibnd)
|
||||
hpsi_d(ii,nbase+kbnd) = hpsi_d(ii,ibnd)
|
||||
spsi_d(ii,nbase+kbnd) = spsi_d(ii,ibnd)
|
||||
END DO
|
||||
!$cuf kernel do(1)
|
||||
DO ii = 1, 1
|
||||
ew_d(kbnd) = eig_d(ibnd)
|
||||
END DO
|
||||
last_unconverged = ibnd
|
||||
lbnd=lbnd+1 ; kbnd=kbnd+recv_counts(mod(lbnd-2,nbgrp)+1); if (kbnd>nactive) kbnd=kbnd+1-nactive
|
||||
END IF
|
||||
END DO
|
||||
DO ibnd = nbnd+1, nbase ! add extra vectors if it is the case
|
||||
!$cuf kernel do(1)
|
||||
DO ii = 1, npwx*npol
|
||||
psi_d (ii,nbase+kbnd) = psi_d(ii,ibnd)
|
||||
hpsi_d(ii,nbase+kbnd) = hpsi_d(ii,ibnd)
|
||||
spsi_d(ii,nbase+kbnd) = spsi_d(ii,ibnd)
|
||||
END DO
|
||||
!$cuf kernel do(1)
|
||||
DO ii = 1, 1
|
||||
ew_d(kbnd) = eig_d(last_unconverged)
|
||||
END DO
|
||||
lbnd=lbnd+1 ; kbnd=kbnd+recv_counts(mod(lbnd-2,nbgrp)+1); if (kbnd>nactive) kbnd=kbnd+1-nactive
|
||||
END DO
|
||||
!$cuf kernel do(2)
|
||||
DO jj = 1, how_many
|
||||
kk = jj + ibnd_start - 1
|
||||
DO ii = 1, npwx*npol
|
||||
psi_d (ii,nbase+jj) = psi_d (ii,nbase+kk)
|
||||
hpsi_d(ii,nbase+jj) = hpsi_d(ii,nbase+kk)
|
||||
spsi_d(ii,nbase+jj) = spsi_d(ii,nbase+kk)
|
||||
END DO
|
||||
END DO
|
||||
!$cuf kernel do(1)
|
||||
DO ii = 1, how_many
|
||||
ew_d(ii) = ew_d(ii+ibnd_start-1)
|
||||
END DO
|
||||
CALL stop_clock( 'paro:pack' );
|
||||
|
||||
!write (6,*) ' check nactive = ', lbnd, nactive, nconv
|
||||
if (lbnd .ne. nactive+1 ) stop ' nactive check FAILED '
|
||||
|
||||
CALL bpcg_k_gpu(hs_psi_gpu, g_1psi_gpu, psi_d, spsi_d, npw, npwx, nbnd, npol, how_many, &
|
||||
psi_d(:,nbase+1), hpsi_d(:,nbase+1), spsi_d(:,nbase+1), ethr, ew_d(1), nhpsi)
|
||||
!
|
||||
CALL start_clock( 'paro:mp_bar' );
|
||||
CALL mp_barrier(inter_bgrp_comm)
|
||||
CALL stop_clock( 'paro:mp_bar' );
|
||||
CALL start_clock( 'paro:mp_sum' );
|
||||
!$cuf kernel do(2)
|
||||
DO ii = 1, npwx*npol
|
||||
DO jj = nbase+1, nbase+how_many
|
||||
kk = jj + ibnd_start - 1
|
||||
psi_d (ii,kk) = psi_d (ii,jj)
|
||||
hpsi_d(ii,kk) = hpsi_d(ii,jj)
|
||||
spsi_d(ii,kk) = spsi_d(ii,jj)
|
||||
END DO
|
||||
END DO
|
||||
|
||||
CALL mp_allgather(psi_d (:,nbase+1:ndiag), column_type, recv_counts, displs, inter_bgrp_comm)
|
||||
CALL mp_allgather(hpsi_d(:,nbase+1:ndiag), column_type, recv_counts, displs, inter_bgrp_comm)
|
||||
CALL mp_allgather(spsi_d(:,nbase+1:ndiag), column_type, recv_counts, displs, inter_bgrp_comm)
|
||||
CALL stop_clock( 'paro:mp_sum' );
|
||||
|
||||
#if defined(__MPI)
|
||||
IF ( nproc_ortho == 1 ) THEN
|
||||
#endif
|
||||
CALL rotate_HSpsi_k_gpu ( npwx, npw, ndiag, ndiag, npol, psi_d, hpsi_d, overlap, spsi_d, ew_d )
|
||||
#if defined(__MPI)
|
||||
ELSE
|
||||
!civn 2fix
|
||||
ALLOCATE ( psi(npwx*npol,nvecx), hpsi(npwx*npol,nvecx), spsi(npwx*npol,nvecx), ew(nvecx) )
|
||||
psi = psi_d
|
||||
hpsi = hpsi_d
|
||||
spsi = spsi_d
|
||||
ew = ew_d
|
||||
CALL protate_HSpsi_k( npwx, npw, ndiag, ndiag, npol, psi, hpsi, overlap, spsi, ew )
|
||||
psi_d = psi
|
||||
hpsi_d = hpsi
|
||||
spsi_d = spsi
|
||||
ew_d = ew
|
||||
DEALLOCATE ( psi, hpsi, spsi, ew )
|
||||
ENDIF
|
||||
#endif
|
||||
!write (6,*) ' ew : ', ew(1:nbnd)
|
||||
! only the first nbnd eigenvalues are relevant for convergence
|
||||
! but only those that have actually been corrected should be trusted
|
||||
conv(1:nbnd) = .FALSE.
|
||||
!$cuf kernel do(1)
|
||||
do ii = 1, nbnd
|
||||
conv_d(ii) = .FALSE.
|
||||
end do
|
||||
!$cuf kernel do(1)
|
||||
DO ii = 1, ntrust
|
||||
conv_d(ii) = ABS(ew_d(ii) - eig_d(ii)).LT.ethr
|
||||
END DO
|
||||
conv = conv_d
|
||||
nconv = COUNT(conv(1:ntrust)) ; notconv = nbnd - nconv
|
||||
!$cuf kernel do(1)
|
||||
DO ii = 1, nbnd
|
||||
eig_d(ii) = ew_d(ii)
|
||||
END DO
|
||||
IF ( nconv == nbnd ) EXIT ParO_loop
|
||||
|
||||
END DO ParO_loop
|
||||
|
||||
!$cuf kernel do(2)
|
||||
DO ii = 1, npwx*npol
|
||||
DO jj = 1, nbnd
|
||||
evc_d(ii,jj) = psi_d(ii,jj)
|
||||
END DO
|
||||
END DO
|
||||
|
||||
CALL mp_sum(nhpsi,inter_bgrp_comm)
|
||||
|
||||
DEALLOCATE ( ew_d, conv )
|
||||
DEALLOCATE ( conv_d )
|
||||
DEALLOCATE ( psi_d, hpsi_d, spsi_d )
|
||||
|
||||
CALL mp_type_free( column_type )
|
||||
|
||||
CALL stop_clock( 'paro_k' ); !write (6,*) ' exit paro diag'
|
||||
|
||||
END SUBROUTINE paro_k_new_gpu
|
|
@ -0,0 +1,18 @@
|
|||
# Makefile for DAVIDSON and CG on GPU
|
||||
|
||||
ALLOBJS += \
|
||||
Davidson/cegterg_gpu.o \
|
||||
Davidson/regterg_gpu.o \
|
||||
DENSE/rotate_wfc_k_gpu.o \
|
||||
DENSE/rotate_wfc_gamma_gpu.o \
|
||||
CG/rcgdiagg_gpu.o \
|
||||
CG/ccgdiagg_gpu.o \
|
||||
PPCG/generic_cublas.o \
|
||||
PPCG/ppcg_gamma_gpu.o \
|
||||
PPCG/ppcg_k_gpu.o \
|
||||
ParO/bpcg_gamma_gpu.o \
|
||||
ParO/bpcg_k_gpu.o \
|
||||
ParO/paro_gamma_new_gpu.o \
|
||||
ParO/paro_k_new_gpu.o \
|
||||
DENSE/rotate_HSpsi_gamma_gpu.o \
|
||||
DENSE/rotate_HSpsi_k_gpu.o
|
|
@ -2,10 +2,13 @@
|
|||
|
||||
include ../make.inc
|
||||
|
||||
|
||||
EXTLIBS=$(CUDA_LIBS) $(SCALAPACK_LIBS) $(LAPACK_LIBS) $(BLAS_LIBS) $(MPI_LIBS)
|
||||
|
||||
|
||||
LAX = la_types.o \
|
||||
la_error.o \
|
||||
la_module.o \
|
||||
la_helper.o \
|
||||
cdiaghg.o \
|
||||
rdiaghg.o \
|
||||
|
|
|
@ -0,0 +1,442 @@
|
|||
MODULE LAXlib
|
||||
#ifdef __CUDA
|
||||
USE cudafor
|
||||
#endif
|
||||
IMPLICIT NONE
|
||||
!
|
||||
INTERFACE diaghg
|
||||
MODULE PROCEDURE cdiaghg_cpu_, rdiaghg_cpu_
|
||||
#ifdef __CUDA
|
||||
MODULE PROCEDURE cdiaghg_gpu_, rdiaghg_gpu_
|
||||
#endif
|
||||
END INTERFACE
|
||||
!
|
||||
INTERFACE pdiaghg
|
||||
MODULE PROCEDURE pcdiaghg_, prdiaghg_
|
||||
#ifdef __CUDA
|
||||
MODULE PROCEDURE pcdiaghg__gpu, prdiaghg__gpu
|
||||
#endif
|
||||
END INTERFACE
|
||||
!
|
||||
CONTAINS
|
||||
!
|
||||
!----------------------------------------------------------------------------
|
||||
SUBROUTINE cdiaghg_cpu_( n, m, h, s, ldh, e, v, me_bgrp, root_bgrp, intra_bgrp_comm, offload )
|
||||
!----------------------------------------------------------------------------
|
||||
!
|
||||
! ... calculates eigenvalues and eigenvectors of the generalized problem
|
||||
! ... Hv=eSv, with H hermitean matrix, S overlap matrix.
|
||||
! ... On output both matrix are unchanged
|
||||
!
|
||||
! ... LAPACK version - uses both ZHEGV and ZHEGVX
|
||||
!
|
||||
#if defined (__CUDA)
|
||||
USE cudafor
|
||||
#endif
|
||||
!
|
||||
IMPLICIT NONE
|
||||
include 'laxlib_kinds.fh'
|
||||
!
|
||||
INTEGER, INTENT(IN) :: n, m, ldh
|
||||
! dimension of the matrix to be diagonalized
|
||||
! number of eigenstates to be calculate
|
||||
! leading dimension of h, as declared in the calling pgm unit
|
||||
COMPLEX(DP), INTENT(INOUT) :: h(ldh,n), s(ldh,n)
|
||||
! actually intent(in) but compilers don't know and complain
|
||||
! matrix to be diagonalized
|
||||
! overlap matrix
|
||||
REAL(DP), INTENT(OUT) :: e(n)
|
||||
! eigenvalues
|
||||
COMPLEX(DP), INTENT(OUT) :: v(ldh,m)
|
||||
! eigenvectors (column-wise)
|
||||
INTEGER, INTENT(IN) :: me_bgrp, root_bgrp, intra_bgrp_comm
|
||||
!
|
||||
LOGICAL, OPTIONAL :: offload
|
||||
! optionally solve the eigenvalue problem on the GPU
|
||||
LOGICAL :: loffload
|
||||
!
|
||||
#if defined(__CUDA)
|
||||
COMPLEX(DP), ALLOCATABLE, DEVICE :: v_d(:,:), h_d(:,:), s_d(:,:)
|
||||
REAL(DP), ALLOCATABLE, DEVICE :: e_d(:)
|
||||
INTEGER :: info
|
||||
#endif
|
||||
!
|
||||
loffload = .false.
|
||||
!
|
||||
! the following ifdef ensures no offload if not compiling from GPU
|
||||
#if defined(__CUDA)
|
||||
IF (PRESENT(offload)) loffload = offload
|
||||
#endif
|
||||
!
|
||||
! ... always false when compiling without CUDA support
|
||||
!
|
||||
IF ( loffload ) THEN
|
||||
#if defined(__CUDA)
|
||||
!
|
||||
ALLOCATE(s_d, source=s); ALLOCATE(h_d, source=h)
|
||||
ALLOCATE(e_d(n), v_d(ldh,n))
|
||||
!
|
||||
CALL laxlib_cdiaghg_gpu(n, m, h_d, s_d, ldh, e_d, v_d, me_bgrp, root_bgrp, intra_bgrp_comm)
|
||||
!
|
||||
e = e_d
|
||||
v(1:ldh,1:m) = v_d(1:ldh,1:m)
|
||||
!
|
||||
DEALLOCATE(h_d, s_d, e_d, v_d)
|
||||
#endif
|
||||
ELSE
|
||||
CALL laxlib_cdiaghg(n, m, h, s, ldh, e, v, me_bgrp, root_bgrp, intra_bgrp_comm)
|
||||
END IF
|
||||
!
|
||||
RETURN
|
||||
!
|
||||
END SUBROUTINE cdiaghg_cpu_
|
||||
!
|
||||
#if defined(__CUDA)
|
||||
!----------------------------------------------------------------------------
|
||||
SUBROUTINE cdiaghg_gpu_( n, m, h_d, s_d, ldh, e_d, v_d, me_bgrp, root_bgrp, intra_bgrp_comm, onhost )
|
||||
!----------------------------------------------------------------------------
|
||||
!
|
||||
! ... calculates eigenvalues and eigenvectors of the generalized problem
|
||||
! ... Hv=eSv, with H hermitean matrix, S overlap matrix.
|
||||
! ... On output both matrix are unchanged
|
||||
!
|
||||
! ... LAPACK version - uses both ZHEGV and ZHEGVX
|
||||
!
|
||||
USE cudafor
|
||||
!
|
||||
IMPLICIT NONE
|
||||
include 'laxlib_kinds.fh'
|
||||
!
|
||||
INTEGER, INTENT(IN) :: n, m, ldh
|
||||
! dimension of the matrix to be diagonalized
|
||||
! number of eigenstates to be calculate
|
||||
! leading dimension of h, as declared in the calling pgm unit
|
||||
COMPLEX(DP), DEVICE, INTENT(INOUT) :: h_d(ldh,n), s_d(ldh,n)
|
||||
! actually intent(in) but compilers don't know and complain
|
||||
! matrix to be diagonalized
|
||||
! overlap matrix
|
||||
REAL(DP), DEVICE, INTENT(OUT) :: e_d(n)
|
||||
! eigenvalues
|
||||
COMPLEX(DP), DEVICE, INTENT(OUT) :: v_d(ldh,n)
|
||||
! eigenvectors (column-wise)
|
||||
INTEGER, INTENT(IN) :: me_bgrp, root_bgrp, intra_bgrp_comm
|
||||
! communicators
|
||||
LOGICAL, OPTIONAL :: onhost
|
||||
! optionally solve the eigenvalue problem on the CPU
|
||||
LOGICAL :: lonhost
|
||||
!
|
||||
COMPLEX(DP), ALLOCATABLE :: v(:,:), h(:,:), s(:,:)
|
||||
REAL(DP), ALLOCATABLE :: e(:)
|
||||
!
|
||||
INTEGER :: info
|
||||
!
|
||||
lonhost = .false.
|
||||
!
|
||||
IF (PRESENT(onhost)) lonhost = onhost
|
||||
!
|
||||
!
|
||||
IF ( lonhost ) THEN
|
||||
!
|
||||
ALLOCATE(s, source=s_d); ALLOCATE(h, source=h_d)
|
||||
ALLOCATE(e(n), v(ldh,m))
|
||||
!
|
||||
CALL laxlib_cdiaghg(n, m, h, s, ldh, e, v, me_bgrp, root_bgrp, intra_bgrp_comm)
|
||||
!
|
||||
e_d = e
|
||||
v_d(1:ldh,1:m) = v(1:ldh,1:m)
|
||||
!
|
||||
DEALLOCATE(h, s, e, v)
|
||||
ELSE
|
||||
CALL laxlib_cdiaghg_gpu(n, m, h_d, s_d, ldh, e_d, v_d, me_bgrp, root_bgrp, intra_bgrp_comm)
|
||||
END IF
|
||||
!
|
||||
RETURN
|
||||
!
|
||||
END SUBROUTINE cdiaghg_gpu_
|
||||
#endif
|
||||
!
|
||||
!----------------------------------------------------------------------------
|
||||
SUBROUTINE rdiaghg_cpu_( n, m, h, s, ldh, e, v, me_bgrp, root_bgrp, intra_bgrp_comm, offload )
|
||||
!----------------------------------------------------------------------------
|
||||
!
|
||||
! ... general interface for rdiaghg
|
||||
!
|
||||
#if defined(__CUDA)
|
||||
USE cudafor
|
||||
#endif
|
||||
!
|
||||
IMPLICIT NONE
|
||||
include 'laxlib_kinds.fh'
|
||||
!
|
||||
INTEGER, INTENT(IN) :: n, m, ldh
|
||||
! dimension of the matrix to be diagonalized
|
||||
! number of eigenstates to be calculate
|
||||
! leading dimension of h, as declared in the calling pgm unit
|
||||
REAL(DP), INTENT(INOUT) :: h(ldh,n), s(ldh,n)
|
||||
! actually intent(in) but compilers don't know and complain
|
||||
! matrix to be diagonalized
|
||||
! overlap matrix
|
||||
REAL(DP), INTENT(OUT) :: e(n)
|
||||
! eigenvalues
|
||||
REAL(DP), INTENT(OUT) :: v(ldh,m)
|
||||
! eigenvectors (column-wise)
|
||||
INTEGER, INTENT(IN) :: me_bgrp, root_bgrp, intra_bgrp_comm
|
||||
! communicators
|
||||
LOGICAL, OPTIONAL :: offload
|
||||
! optionally solve the eigenvalue problem on the GPU
|
||||
LOGICAL :: loffload
|
||||
!
|
||||
#if defined(__CUDA)
|
||||
REAL(DP), ALLOCATABLE, DEVICE :: v_d(:,:), h_d(:,:), s_d(:,:)
|
||||
REAL(DP), ALLOCATABLE, DEVICE :: e_d(:)
|
||||
INTEGER :: info
|
||||
#endif
|
||||
!
|
||||
loffload = .false.
|
||||
!
|
||||
! the following ifdef ensures no offload if not compiling from GPU
|
||||
#if defined(__CUDA)
|
||||
IF (PRESENT(offload)) loffload = offload
|
||||
#endif
|
||||
!
|
||||
! ... always false when compiling without CUDA support
|
||||
!
|
||||
IF ( loffload ) THEN
|
||||
#if defined(__CUDA)
|
||||
!
|
||||
ALLOCATE(s_d, source=s); ALLOCATE(h_d, source=h)
|
||||
ALLOCATE(e_d(n), v_d(ldh,n))
|
||||
!
|
||||
CALL laxlib_rdiaghg_gpu(n, m, h_d, s_d, ldh, e_d, v_d, me_bgrp, root_bgrp, intra_bgrp_comm)
|
||||
!
|
||||
e = e_d
|
||||
v(1:ldh,1:m) = v_d(1:ldh,1:m)
|
||||
!
|
||||
DEALLOCATE(h_d, s_d, e_d, v_d)
|
||||
#endif
|
||||
ELSE
|
||||
CALL laxlib_rdiaghg(n, m, h, s, ldh, e, v, me_bgrp, root_bgrp, intra_bgrp_comm)
|
||||
END IF
|
||||
!
|
||||
RETURN
|
||||
!
|
||||
END SUBROUTINE rdiaghg_cpu_
|
||||
!
|
||||
#if defined(__CUDA)
|
||||
!----------------------------------------------------------------------------
|
||||
SUBROUTINE rdiaghg_gpu_( n, m, h_d, s_d, ldh, e_d, v_d, me_bgrp, root_bgrp, intra_bgrp_comm, onhost )
|
||||
!----------------------------------------------------------------------------
|
||||
!
|
||||
! ... General interface to rdiaghg_gpu
|
||||
!
|
||||
USE cudafor
|
||||
!
|
||||
IMPLICIT NONE
|
||||
include 'laxlib_kinds.fh'
|
||||
!
|
||||
INTEGER, INTENT(IN) :: n, m, ldh
|
||||
! dimension of the matrix to be diagonalized
|
||||
! number of eigenstates to be calculate
|
||||
! leading dimension of h, as declared in the calling pgm unit
|
||||
REAL(DP), DEVICE, INTENT(INOUT) :: h_d(ldh,n), s_d(ldh,n)
|
||||
! actually intent(in) but compilers don't know and complain
|
||||
! matrix to be diagonalized
|
||||
! overlap matrix
|
||||
REAL(DP), DEVICE, INTENT(OUT) :: e_d(n)
|
||||
! eigenvalues
|
||||
REAL(DP), DEVICE, INTENT(OUT) :: v_d(ldh,n)
|
||||
! eigenvectors (column-wise)
|
||||
INTEGER, INTENT(IN) :: me_bgrp, root_bgrp, intra_bgrp_comm
|
||||
! communicators
|
||||
LOGICAL, OPTIONAL :: onhost
|
||||
! optionally solve the eigenvalue problem on the CPU
|
||||
LOGICAL :: lonhost
|
||||
!
|
||||
REAL(DP), ALLOCATABLE :: v(:,:), h(:,:), s(:,:)
|
||||
REAL(DP), ALLOCATABLE :: e(:)
|
||||
!
|
||||
INTEGER :: info
|
||||
!
|
||||
lonhost = .false.
|
||||
!
|
||||
IF (PRESENT(onhost)) lonhost = onhost
|
||||
!
|
||||
!
|
||||
IF ( lonhost ) THEN
|
||||
!
|
||||
ALLOCATE(s, source=s_d); ALLOCATE(h, source=h_d)
|
||||
ALLOCATE(e(n), v(ldh,m))
|
||||
!
|
||||
CALL laxlib_rdiaghg(n, m, h, s, ldh, e, v, me_bgrp, root_bgrp, intra_bgrp_comm)
|
||||
!
|
||||
e_d = e
|
||||
v_d(1:ldh,1:m) = v(1:ldh,1:m)
|
||||
!
|
||||
DEALLOCATE(h, s, e, v)
|
||||
ELSE
|
||||
CALL laxlib_rdiaghg_gpu(n, m, h_d, s_d, ldh, e_d, v_d, me_bgrp, root_bgrp, intra_bgrp_comm)
|
||||
END IF
|
||||
!
|
||||
RETURN
|
||||
!
|
||||
END SUBROUTINE rdiaghg_gpu_
|
||||
#endif
|
||||
!
|
||||
! === Parallel diagonalization interface subroutines
|
||||
!
|
||||
!
|
||||
!----------------------------------------------------------------------------
|
||||
SUBROUTINE prdiaghg_( n, h, s, ldh, e, v, idesc, offload )
|
||||
!----------------------------------------------------------------------------
|
||||
!
|
||||
! ... calculates eigenvalues and eigenvectors of the generalized problem
|
||||
! ... Hv=eSv, with H symmetric matrix, S overlap matrix.
|
||||
! ... On output both matrix are unchanged
|
||||
!
|
||||
! ... Parallel version with full data distribution
|
||||
!
|
||||
IMPLICIT NONE
|
||||
include 'laxlib_kinds.fh'
|
||||
include 'laxlib_param.fh'
|
||||
!
|
||||
INTEGER, INTENT(IN) :: n, ldh
|
||||
! dimension of the matrix to be diagonalized and number of eigenstates to be calculated
|
||||
! leading dimension of h, as declared in the calling pgm unit
|
||||
REAL(DP), INTENT(INOUT) :: h(ldh,ldh), s(ldh,ldh)
|
||||
! matrix to be diagonalized
|
||||
! overlap matrix
|
||||
!
|
||||
REAL(DP), INTENT(OUT) :: e(n)
|
||||
! eigenvalues
|
||||
REAL(DP), INTENT(OUT) :: v(ldh,ldh)
|
||||
! eigenvectors (column-wise)
|
||||
INTEGER, INTENT(IN) :: idesc(LAX_DESC_SIZE)
|
||||
!
|
||||
LOGICAL, OPTIONAL :: offload
|
||||
! place-holder, offloading on GPU not implemented yet
|
||||
LOGICAL :: loffload
|
||||
|
||||
CALL laxlib_prdiaghg( n, h, s, ldh, e, v, idesc)
|
||||
|
||||
END SUBROUTINE
|
||||
!----------------------------------------------------------------------------
|
||||
SUBROUTINE pcdiaghg_( n, h, s, ldh, e, v, idesc, offload )
|
||||
!----------------------------------------------------------------------------
|
||||
!
|
||||
! ... calculates eigenvalues and eigenvectors of the generalized problem
|
||||
! ... Hv=eSv, with H symmetric matrix, S overlap matrix.
|
||||
! ... On output both matrix are unchanged
|
||||
!
|
||||
! ... Parallel version with full data distribution
|
||||
!
|
||||
!
|
||||
IMPLICIT NONE
|
||||
include 'laxlib_kinds.fh'
|
||||
include 'laxlib_param.fh'
|
||||
!
|
||||
INTEGER, INTENT(IN) :: n, ldh
|
||||
! dimension of the matrix to be diagonalized and number of eigenstates to be calculated
|
||||
! leading dimension of h, as declared in the calling pgm unit
|
||||
COMPLEX(DP), INTENT(INOUT) :: h(ldh,ldh), s(ldh,ldh)
|
||||
! matrix to be diagonalized
|
||||
! overlap matrix
|
||||
!
|
||||
REAL(DP), INTENT(OUT) :: e(n)
|
||||
! eigenvalues
|
||||
COMPLEX(DP), INTENT(OUT) :: v(ldh,ldh)
|
||||
! eigenvectors (column-wise)
|
||||
INTEGER, INTENT(IN) :: idesc(LAX_DESC_SIZE)
|
||||
!
|
||||
LOGICAL, OPTIONAL :: offload
|
||||
! place-holder, offloading on GPU not implemented yet
|
||||
LOGICAL :: loffload
|
||||
|
||||
CALL laxlib_pcdiaghg( n, h, s, ldh, e, v, idesc)
|
||||
|
||||
END SUBROUTINE
|
||||
!
|
||||
#if defined(__CUDA)
|
||||
!----------------------------------------------------------------------------
|
||||
SUBROUTINE prdiaghg__gpu( n, h_d, s_d, ldh, e_d, v_d, idesc, onhost )
|
||||
!----------------------------------------------------------------------------
|
||||
!
|
||||
! ... calculates eigenvalues and eigenvectors of the generalized problem
|
||||
! ... Hv=eSv, with H symmetric matrix, S overlap matrix.
|
||||
! ... On output both matrix are unchanged
|
||||
!
|
||||
! ... Parallel version with full data distribution
|
||||
!
|
||||
IMPLICIT NONE
|
||||
include 'laxlib_kinds.fh'
|
||||
include 'laxlib_param.fh'
|
||||
!
|
||||
INTEGER, INTENT(IN) :: n, ldh
|
||||
! dimension of the matrix to be diagonalized and number of eigenstates to be calculated
|
||||
! leading dimension of h, as declared in the calling pgm unit
|
||||
REAL(DP), INTENT(INOUT), DEVICE :: h_d(ldh,ldh), s_d(ldh,ldh)
|
||||
! matrix to be diagonalized
|
||||
! overlap matrix
|
||||
!
|
||||
REAL(DP), INTENT(OUT), DEVICE :: e_d(n)
|
||||
! eigenvalues
|
||||
REAL(DP), INTENT(OUT), DEVICE :: v_d(ldh,ldh)
|
||||
! eigenvectors (column-wise)
|
||||
INTEGER, INTENT(IN) :: idesc(LAX_DESC_SIZE)
|
||||
!
|
||||
LOGICAL, OPTIONAL :: onhost
|
||||
! place-holder, prdiaghg on GPU not implemented yet
|
||||
LOGICAL :: lonhost
|
||||
!
|
||||
REAL(DP), ALLOCATABLE :: v(:,:), h(:,:), s(:,:)
|
||||
REAL(DP), ALLOCATABLE :: e(:)
|
||||
|
||||
ALLOCATE(h(ldh,ldh), s(ldh,ldh), e(n), v(ldh,ldh))
|
||||
h = h_d; s = s_d;
|
||||
CALL laxlib_prdiaghg( n, h, s, ldh, e, v, idesc)
|
||||
e_d = e; v_d = v
|
||||
DEALLOCATE(h,s,v,e)
|
||||
!
|
||||
END SUBROUTINE
|
||||
!----------------------------------------------------------------------------
|
||||
SUBROUTINE pcdiaghg__gpu( n, h_d, s_d, ldh, e_d, v_d, idesc, onhost )
|
||||
!----------------------------------------------------------------------------
|
||||
!
|
||||
! ... calculates eigenvalues and eigenvectors of the generalized problem
|
||||
! ... Hv=eSv, with H symmetric matrix, S overlap matrix.
|
||||
! ... On output both matrix are unchanged
|
||||
!
|
||||
! ... Parallel version with full data distribution
|
||||
!
|
||||
IMPLICIT NONE
|
||||
include 'laxlib_kinds.fh'
|
||||
include 'laxlib_param.fh'
|
||||
!
|
||||
INTEGER, INTENT(IN) :: n, ldh
|
||||
! dimension of the matrix to be diagonalized and number of eigenstates to be calculated
|
||||
! leading dimension of h, as declared in the calling pgm unit
|
||||
COMPLEX(DP), INTENT(INOUT), DEVICE :: h_d(ldh,ldh), s_d(ldh,ldh)
|
||||
! matrix to be diagonalized
|
||||
! overlap matrix
|
||||
!
|
||||
REAL(DP), INTENT(OUT), DEVICE :: e_d(n)
|
||||
! eigenvalues
|
||||
COMPLEX(DP), INTENT(OUT), DEVICE :: v_d(ldh,ldh)
|
||||
! eigenvectors (column-wise)
|
||||
INTEGER, INTENT(IN) :: idesc(LAX_DESC_SIZE)
|
||||
!
|
||||
LOGICAL, OPTIONAL :: onhost
|
||||
! place-holder, pcdiaghg on GPU not implemented yet
|
||||
LOGICAL :: lonhost
|
||||
!
|
||||
COMPLEX(DP), ALLOCATABLE :: v(:,:), h(:,:), s(:,:)
|
||||
REAL(DP), ALLOCATABLE :: e(:)
|
||||
|
||||
ALLOCATE(h(ldh,ldh), s(ldh,ldh), e(n), v(ldh,ldh))
|
||||
h = h_d; s = s_d;
|
||||
CALL laxlib_pcdiaghg( n, h, s, ldh, e, v, idesc)
|
||||
e_d = e; v_d = v
|
||||
DEALLOCATE(h,s,v,e)
|
||||
!
|
||||
END SUBROUTINE
|
||||
#endif
|
||||
END MODULE LAXlib
|
|
@ -395,4 +395,16 @@ SUBROUTINE redist_row2col_x( n, a, b, ldx, nx, idesc )
|
|||
REAL(DP) :: a(ldx,nx), b(ldx,nx)
|
||||
INTEGER, INTENT(IN) :: idesc(LAX_DESC_SIZE)
|
||||
END SUBROUTINE
|
||||
#if defined (__CUDA)
|
||||
SUBROUTINE redist_row2col_gpu_x( n, a, b, ldx, nx, idesc )
|
||||
IMPLICIT NONE
|
||||
include 'laxlib_kinds.fh'
|
||||
include 'laxlib_param.fh'
|
||||
INTEGER, INTENT(IN) :: n
|
||||
INTEGER, INTENT(IN) :: ldx, nx
|
||||
REAL(DP), DEVICE :: a(:,:)
|
||||
REAL(DP), DEVICE :: b(:,:)
|
||||
INTEGER, INTENT(IN) :: idesc(LAX_DESC_SIZE)
|
||||
END SUBROUTINE
|
||||
#endif
|
||||
END INTERFACE
|
||||
|
|
|
@ -7,6 +7,7 @@ la_error.o : la_param.o
|
|||
la_helper.o : dspev_drv.o
|
||||
la_helper.o : la_param.o
|
||||
la_helper.o : la_types.o
|
||||
la_module.o : la_param.o
|
||||
la_helper.o : mp_diag.o
|
||||
mp_diag.o : la_param.o
|
||||
ptoolkit.o : la_param.o
|
||||
|
|
|
@ -3719,13 +3719,10 @@ SUBROUTINE redist_row2col_gpu_x( n, a, b, ldx, nx, idesc )
|
|||
INTEGER, INTENT(IN) :: n
|
||||
INTEGER, INTENT(IN) :: ldx, nx
|
||||
REAL(DP), DEVICE :: a(:,:)
|
||||
REAL(DP) :: b(:,:)
|
||||
REAL(DP), DEVICE :: b(:,:)
|
||||
INTEGER, INTENT(IN) :: idesc(LAX_DESC_SIZE)
|
||||
!
|
||||
REAL(DP), ALLOCATABLE :: wrk(:,:)
|
||||
#if defined(__GPU_MPI)
|
||||
ATTRIBUTES(DEVICE) :: wrk
|
||||
#endif
|
||||
REAL(DP), ALLOCATABLE :: a_h(:,:), b_h(:,:)
|
||||
INTEGER :: ierr
|
||||
INTEGER :: np, rowid, colid
|
||||
INTEGER :: comm
|
||||
|
@ -3778,30 +3775,30 @@ SUBROUTINE redist_row2col_gpu_x( n, a, b, ldx, nx, idesc )
|
|||
CALL lax_error__( " redist_row2col_gpu ", " in MPI_BARRIER ", ABS( ierr ) )
|
||||
!
|
||||
#if defined(__GPU_MPI)
|
||||
ALLOCATE( wrk(SIZE(b,1),SIZE(b,2)), STAT=ierr )
|
||||
IF( ierr /= 0 ) &
|
||||
CALL lax_error__( " redist_row2col_gpu ", " allocating wrk ", ABS( ierr ) )
|
||||
!
|
||||
ierr = cudaDeviceSynchronize()
|
||||
CALL MPI_SENDRECV(a, ldx*nx, MPI_DOUBLE_PRECISION, idest, np+np+1, &
|
||||
wrk, ldx*nx, MPI_DOUBLE_PRECISION, isour, np+np+1, comm, istatus, ierr)
|
||||
IF( ierr /= 0 ) &
|
||||
CALL lax_error__( " redist_row2col_gpu ", " in MPI_SENDRECV ", ABS( ierr ) )
|
||||
|
||||
b = wrk
|
||||
!
|
||||
DEALLOCATE( wrk )
|
||||
#else
|
||||
ALLOCATE( wrk, SOURCE=a, STAT=ierr )
|
||||
IF( ierr /= 0 ) &
|
||||
CALL lax_error__( " redist_row2col_gpu ", " allocating wrk ", ABS( ierr ) )
|
||||
!
|
||||
CALL MPI_SENDRECV(wrk, ldx*nx, MPI_DOUBLE_PRECISION, idest, np+np+1, &
|
||||
b, ldx*nx, MPI_DOUBLE_PRECISION, isour, np+np+1, comm, istatus, ierr)
|
||||
IF( ierr /= 0 ) &
|
||||
CALL lax_error__( " redist_row2col_gpu ", " in MPI_SENDRECV ", ABS( ierr ) )
|
||||
#else
|
||||
ALLOCATE( a_h, SOURCE=a, STAT=ierr )
|
||||
IF( ierr /= 0 ) &
|
||||
CALL lax_error__( " redist_row2col_gpu ", " allocating a_h ", ABS( ierr ) )
|
||||
ALLOCATE( b_h, MOLD=b, STAT=ierr )
|
||||
IF( ierr /= 0 ) &
|
||||
CALL lax_error__( " redist_row2col_gpu ", " allocating b_h ", ABS( ierr ) )
|
||||
!
|
||||
DEALLOCATE( wrk )
|
||||
CALL MPI_SENDRECV(a_h, ldx*nx, MPI_DOUBLE_PRECISION, idest, np+np+1, &
|
||||
b_h, ldx*nx, MPI_DOUBLE_PRECISION, isour, np+np+1, comm, istatus, ierr)
|
||||
IF( ierr /= 0 ) &
|
||||
CALL lax_error__( " redist_row2col_gpu ", " in MPI_SENDRECV ", ABS( ierr ) )
|
||||
!
|
||||
b = b_h
|
||||
!
|
||||
DEALLOCATE( a_h, b_h )
|
||||
#endif
|
||||
!
|
||||
#else
|
||||
|
|
|
@ -0,0 +1 @@
|
|||
*.bin binary
|
|
@ -0,0 +1,38 @@
|
|||
# Makefile for LAXlib
|
||||
|
||||
include ../../make.inc
|
||||
|
||||
MODFLAGS= $(MOD_FLAG)../../UtilXlib $(MOD_FLAG)../../ELPA/src $(MOD_FLAG).. $(MOD_FLAG).
|
||||
|
||||
SRCS = test_diaghg_1.f90 \
|
||||
test_diaghg_2.f90 \
|
||||
test_diaghg_3.f90 \
|
||||
test_diaghg_4.f90 \
|
||||
test_diaghg_gpu_1.f90 \
|
||||
test_diaghg_gpu_2.f90 \
|
||||
test_diaghg_gpu_3.f90 \
|
||||
test_diaghg_gpu_4.f90
|
||||
|
||||
EXECS = $(SRCS:.f90=.x)
|
||||
|
||||
UTILS = tester.o mp_world.o utils.o test_io.o test_helpers.o
|
||||
|
||||
all: common $(EXECS)
|
||||
|
||||
common: $(UTILS)
|
||||
|
||||
%.x: %.o
|
||||
$(LD) $(LDFLAGS) $< $(UTILS) -o $@ ../libqela.a \
|
||||
../../UtilXlib/libutil.a $(LIBOBJS) $(QELIBS)
|
||||
|
||||
clean :
|
||||
- /bin/rm -f *.o *.a *.d *.i *~ *_tmp.f90 *.mod *.L *.x
|
||||
|
||||
# .PHONY forces execution of a rule irrespective of the presence of an
|
||||
# updated file with the same name of the rule. In this way, the script
|
||||
# that generates version.f90 always runs, updating the version if you
|
||||
# execute "svn update". The update_version script takes care of not
|
||||
# changing the file if the svn version did not change
|
||||
|
||||
.PHONY: all clean
|
||||
|
|
@ -0,0 +1,10 @@
|
|||
# LAXlib testing suite
|
||||
|
||||
In order to run the tests first run `./configure` in QE topdir and generate a
|
||||
valid `make.inc`.
|
||||
You may also download large eigenvalue problems (in this directory) with:
|
||||
|
||||
wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=1EAB3xkoD-i9p4nW6NJDED3WaEK8ZCcf4' -O SiGeK1.bin
|
||||
wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=13lFkDbv99V8fqiXER1N2IzoJ_EhuGtt9' -O SiGeK2.bin
|
||||
|
||||
Finally `make` and run all `.x` executable files.
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
@ -0,0 +1,89 @@
|
|||
!----------------------------------------------------------------------------
|
||||
MODULE mp_world
|
||||
!----------------------------------------------------------------------------
|
||||
!
|
||||
#if defined(__MPI)
|
||||
USE mpi
|
||||
#endif
|
||||
USE mp, ONLY : mp_barrier, mp_start, mp_end, mp_stop
|
||||
USE mp, ONLY : mp_count_nodes
|
||||
!
|
||||
IMPLICIT NONE
|
||||
SAVE
|
||||
!
|
||||
! ... World group - all QE routines using mp_world_start to start MPI
|
||||
! ... will work in the communicator passed as input to mp_world_start
|
||||
!
|
||||
INTEGER :: nnode = 1 ! number of nodes
|
||||
INTEGER :: nproc = 1 ! number of processors
|
||||
INTEGER :: mpime = 0 ! processor index (starts from 0 to nproc-1)
|
||||
INTEGER :: root = 0 ! index of the root processor
|
||||
INTEGER :: world_comm = 0 ! communicator
|
||||
!
|
||||
! ... library_mode =.true. if QE is called as a library by an external code
|
||||
! ... if true, MPI_Init() is not called when starting MPI,
|
||||
! ... MPI_Finalize() is not called when stopping MPI
|
||||
!
|
||||
!
|
||||
#if defined(__MPI)
|
||||
LOGICAL :: library_mode = .FALSE.
|
||||
#endif
|
||||
PRIVATE
|
||||
PUBLIC :: nnode, nproc, mpime, root, world_comm, mp_world_start, mp_world_end
|
||||
!
|
||||
CONTAINS
|
||||
!
|
||||
!-----------------------------------------------------------------------
|
||||
SUBROUTINE mp_world_start ( my_world_comm )
|
||||
!-----------------------------------------------------------------------
|
||||
!
|
||||
IMPLICIT NONE
|
||||
INTEGER, INTENT(IN) :: my_world_comm
|
||||
#if defined(__MPI)
|
||||
INTEGER :: ierr
|
||||
#endif
|
||||
#if defined(_OPENMP)
|
||||
INTEGER :: PROVIDED
|
||||
#endif
|
||||
!
|
||||
world_comm = my_world_comm
|
||||
!
|
||||
! ... check if mpi is already initialized (library mode) or not
|
||||
!
|
||||
#if defined(__MPI)
|
||||
CALL MPI_Initialized ( library_mode, ierr)
|
||||
IF (ierr/=0) CALL mp_stop( 8000 )
|
||||
IF (.NOT. library_mode ) THEN
|
||||
#if defined(_OPENMP)
|
||||
CALL MPI_Init_thread(MPI_THREAD_FUNNELED, PROVIDED, ierr)
|
||||
#else
|
||||
CALL MPI_Init(ierr)
|
||||
#endif
|
||||
IF (ierr/=0) CALL mp_stop( 8001 )
|
||||
END IF
|
||||
#endif
|
||||
!
|
||||
CALL mp_start( nproc, mpime, world_comm )
|
||||
!CALL mp_count_nodes ( nnode, world_comm )
|
||||
!
|
||||
RETURN
|
||||
!
|
||||
END SUBROUTINE mp_world_start
|
||||
!
|
||||
!-----------------------------------------------------------------------
|
||||
SUBROUTINE mp_world_end ( )
|
||||
!-----------------------------------------------------------------------
|
||||
#if defined(__MPI)
|
||||
INTEGER :: ierr
|
||||
#endif
|
||||
!
|
||||
CALL mp_barrier( world_comm )
|
||||
CALL mp_end ( world_comm )
|
||||
#if defined(__MPI)
|
||||
CALL mpi_finalize(ierr)
|
||||
IF (ierr/=0) CALL mp_stop( 8002 )
|
||||
#endif
|
||||
!
|
||||
END SUBROUTINE mp_world_end
|
||||
!
|
||||
END MODULE mp_world
|
|
@ -0,0 +1,122 @@
|
|||
program test_diaghg
|
||||
#if defined(__MPI)
|
||||
USE MPI
|
||||
#endif
|
||||
USE mp, ONLY : mp_bcast
|
||||
USE mp_world, ONLY : mp_world_start, mp_world_end, mpime, &
|
||||
root, nproc, world_comm
|
||||
USE mp_bands_util, ONLY : me_bgrp, root_bgrp, intra_bgrp_comm
|
||||
USE tester
|
||||
IMPLICIT NONE
|
||||
!
|
||||
TYPE(tester_t) :: test
|
||||
INTEGER :: world_group = 0
|
||||
!
|
||||
CALL test%init()
|
||||
|
||||
#if defined(__MPI)
|
||||
world_group = MPI_COMM_WORLD
|
||||
#endif
|
||||
CALL mp_world_start(world_group)
|
||||
!
|
||||
me_bgrp = mpime; root_bgrp=root; intra_bgrp_comm=world_comm
|
||||
!
|
||||
CALL real_1(test)
|
||||
!
|
||||
CALL complex_1(test)
|
||||
!
|
||||
CALL collect_results(test)
|
||||
!
|
||||
CALL mp_world_end()
|
||||
!
|
||||
IF (mpime .eq. 0) CALL test%print()
|
||||
!
|
||||
CONTAINS
|
||||
!
|
||||
SUBROUTINE real_1(test)
|
||||
USE LAXlib
|
||||
USE la_param, ONLY : DP
|
||||
implicit none
|
||||
!
|
||||
TYPE(tester_t) :: test
|
||||
! real variables
|
||||
real(DP) :: h(2,2)
|
||||
real(DP) :: s(2,2)
|
||||
real(DP) :: e(2)
|
||||
real(DP) :: v(2,2)
|
||||
|
||||
h = 0.d0
|
||||
h(1,1) = 1.d0
|
||||
h(2,2) = 1.d0
|
||||
s = 0.d0
|
||||
s(1,1) = 1.d0
|
||||
s(2,2) = 1.d0
|
||||
!
|
||||
v = 0.d0
|
||||
e = 0.d0
|
||||
CALL diaghg( 2, 2, h, s, 2, e, v, .false. )
|
||||
!
|
||||
CALL test%assert_close( e, [1.d0, 1.d0] )
|
||||
CALL test%assert_close( RESHAPE(v, [4]), [1.d0, 0.d0, 0.d0, 1.d0] )
|
||||
CALL test%assert_close( RESHAPE(h, [4]), [1.d0, 0.d0, 0.d0, 1.d0] )
|
||||
CALL test%assert_close( RESHAPE(s, [4]), [1.d0, 0.d0, 0.d0, 1.d0] )
|
||||
!
|
||||
v = 0.d0
|
||||
e = 0.d0
|
||||
CALL diaghg( 2, 2, h, s, 2, e, v, .true. )
|
||||
!
|
||||
CALL test%assert_close( e, [1.d0, 1.d0] )
|
||||
CALL test%assert_close( RESHAPE(v, [4]), [1.d0, 0.d0, 0.d0, 1.d0] )
|
||||
CALL test%assert_close( RESHAPE(h, [4]), [1.d0, 0.d0, 0.d0, 1.d0] )
|
||||
CALL test%assert_close( RESHAPE(s, [4]), [1.d0, 0.d0, 0.d0, 1.d0] )
|
||||
!
|
||||
END SUBROUTINE real_1
|
||||
!
|
||||
SUBROUTINE complex_1(test)
|
||||
USE LAXlib
|
||||
USE la_param, ONLY : DP
|
||||
implicit none
|
||||
!
|
||||
TYPE(tester_t) :: test
|
||||
! real variables
|
||||
complex(DP) :: h(2,2)
|
||||
complex(DP) :: h_save(2,2)
|
||||
complex(DP) :: s(2,2)
|
||||
complex(DP) :: s_save(2,2)
|
||||
real(DP) :: e(2)
|
||||
complex(DP) :: v(2,2)
|
||||
!
|
||||
h = 0.d0
|
||||
h(1,1) = (1.d0, 0.d0)
|
||||
h(1,2) = (0.d0, -2.d0)
|
||||
h(2,1) = ( 0.d0, 2.d0)
|
||||
h(2,2) = ( 5.d0, 0.d0)
|
||||
s = 0.d0
|
||||
s(1,1) = (1.d0, 0.d0)
|
||||
s(2,2) = (1.d0, 0.d0)
|
||||
!
|
||||
h_save = h
|
||||
s_save = s
|
||||
!
|
||||
v = (0.d0, 0.d0)
|
||||
e = 0.d0
|
||||
CALL diaghg( 2, 2, h, s, 2, e, v, .false. )
|
||||
! 0.1715728752538099, 5.82842712474619
|
||||
CALL test%assert_close( e, [0.1715728752538099d0, 5.82842712474619d0] )
|
||||
CALL test%assert_close( v(:,1), [( 0.d0, -0.9238795325112867d0), (-0.3826834323650898d0, 0.d0)] )
|
||||
CALL test%assert_close( v(:,2), [( 0.d0, -0.3826834323650898d0), ( 0.9238795325112867d0, 0.d0)] )
|
||||
CALL test%assert_close( RESHAPE(h, [4]), RESHAPE(h_save, [4]))
|
||||
CALL test%assert_close( RESHAPE(s, [4]), RESHAPE(s_save, [4]))
|
||||
!
|
||||
v = (0.d0, 0.d0)
|
||||
e = 0.d0
|
||||
CALL diaghg( 2, 2, h, s, 2, e, v, .true. )
|
||||
!
|
||||
CALL test%assert_close( e, [0.1715728752538099d0, 5.82842712474619d0] )
|
||||
CALL test%assert_close( v(:,1), [( 0.d0, -0.9238795325112867d0), (-0.3826834323650898d0, 0.d0)] )
|
||||
CALL test%assert_close( v(:,2), [( 0.d0, -0.3826834323650898d0), ( 0.9238795325112867d0, 0.d0)] )
|
||||
CALL test%assert_close( RESHAPE(h, [4]), RESHAPE(h_save, [4]))
|
||||
CALL test%assert_close( RESHAPE(s, [4]), RESHAPE(s_save, [4]))
|
||||
!
|
||||
END SUBROUTINE complex_1
|
||||
end program test_diaghg
|
|
@ -0,0 +1,110 @@
|
|||
program test_diaghg_2
|
||||
#if defined(__MPI)
|
||||
USE MPI
|
||||
#endif
|
||||
USE mp, ONLY : mp_bcast
|
||||
USE mp_world, ONLY : mp_world_start, mp_world_end, mpime, &
|
||||
root, nproc, world_comm
|
||||
USE mp_bands_util, ONLY : me_bgrp, root_bgrp, intra_bgrp_comm
|
||||
USE tester
|
||||
IMPLICIT NONE
|
||||
!
|
||||
TYPE(tester_t) :: test
|
||||
INTEGER :: world_group = 0
|
||||
!
|
||||
CALL test%init()
|
||||
|
||||
#if defined(__MPI)
|
||||
world_group = MPI_COMM_WORLD
|
||||
#endif
|
||||
CALL mp_world_start(world_group)
|
||||
!
|
||||
me_bgrp = mpime; root_bgrp=root; intra_bgrp_comm=world_comm
|
||||
!
|
||||
CALL complex_1(test)
|
||||
!
|
||||
CALL collect_results(test)
|
||||
!
|
||||
CALL mp_world_end()
|
||||
!
|
||||
IF (mpime .eq. 0) CALL test%print()
|
||||
!
|
||||
CONTAINS
|
||||
!
|
||||
SUBROUTINE complex_1(test)
|
||||
USE LAXlib
|
||||
USE la_param, ONLY : DP
|
||||
implicit none
|
||||
!
|
||||
TYPE(tester_t) :: test
|
||||
!
|
||||
integer, parameter :: m_size=1024
|
||||
complex(DP) :: h(m_size,m_size)
|
||||
complex(DP) :: h_save(m_size,m_size)
|
||||
complex(DP) :: s(m_size,m_size)
|
||||
complex(DP) :: s_save(m_size,m_size)
|
||||
real(DP) :: e(m_size)
|
||||
complex(DP) :: v(m_size,m_size)
|
||||
real(DP) :: e_save(m_size)
|
||||
complex(DP) :: v_save(m_size,m_size)
|
||||
!
|
||||
CALL hermitian(m_size, h)
|
||||
CALL hermitian(m_size, s)
|
||||
!
|
||||
h_save = h
|
||||
s_save = s
|
||||
!
|
||||
v = (0.d0, 0.d0)
|
||||
e = 0.d0
|
||||
CALL diaghg( m_size, m_size, h, s, m_size, e, v, .false. )
|
||||
!
|
||||
CALL test%assert_close( RESHAPE(h, [m_size*m_size]), RESHAPE(h_save, [m_size*m_size]))
|
||||
CALL test%assert_close( RESHAPE(s, [m_size*m_size]), RESHAPE(s_save, [m_size*m_size]))
|
||||
!
|
||||
e_save = e
|
||||
v_save = v
|
||||
!
|
||||
v = (0.d0, 0.d0)
|
||||
e = 0.d0
|
||||
CALL diaghg( m_size, m_size, h, s, m_size, e, v, .true. )
|
||||
!
|
||||
CALL test%assert_close( RESHAPE(h, [m_size*m_size]), RESHAPE(h_save, [m_size*m_size]))
|
||||
CALL test%assert_close( RESHAPE(s, [m_size*m_size]), RESHAPE(s_save, [m_size*m_size]))
|
||||
test%tolerance32=1.d-5
|
||||
test%tolerance64=1.d-14
|
||||
CALL test%assert_close( e, e_save)
|
||||
!
|
||||
END SUBROUTINE complex_1
|
||||
!
|
||||
SUBROUTINE hermitian(mSize, M)
|
||||
USE la_param, ONLY : DP
|
||||
IMPLICIT NONE
|
||||
integer, intent(in) :: msize
|
||||
complex(dp), intent(out) :: M(:,:)
|
||||
!
|
||||
real(dp), allocatable :: rnd(:)
|
||||
complex(dp), allocatable :: tmp(:,:)
|
||||
|
||||
INTEGER :: h, k, j
|
||||
!
|
||||
ALLOCATE(rnd(mSize*(mSize+1)))
|
||||
CALL RANDOM_NUMBER(rnd)
|
||||
rnd = 1.d0*rnd - 5.d-1
|
||||
!
|
||||
M = (0.d0, 0.d0)
|
||||
j = 1
|
||||
DO k=1,mSize
|
||||
DO h=1,mSize
|
||||
IF(h>k) THEN
|
||||
M(h,k) = CMPLX(rnd(j), rnd(j+1))
|
||||
M(k,h) = CONJG(M(h,k))
|
||||
j=j+2;
|
||||
ELSE IF(k == h) THEN
|
||||
M(k,h) = CMPLX(mSize, 0.d0, kind=DP)
|
||||
END IF
|
||||
END DO
|
||||
END DO
|
||||
!
|
||||
END SUBROUTINE hermitian
|
||||
|
||||
end program test_diaghg_2
|
|
@ -0,0 +1,174 @@
|
|||
program test_diaghg_3
|
||||
#if defined(__MPI)
|
||||
USE MPI
|
||||
#endif
|
||||
USE mp, ONLY : mp_bcast
|
||||
USE mp_world, ONLY : mp_world_start, mp_world_end, mpime, &
|
||||
root, nproc, world_comm
|
||||
USE mp_bands_util, ONLY : me_bgrp, root_bgrp, intra_bgrp_comm
|
||||
USE tester
|
||||
IMPLICIT NONE
|
||||
!
|
||||
TYPE(tester_t) :: test
|
||||
INTEGER :: world_group = 0
|
||||
!
|
||||
CALL test%init()
|
||||
test%tolerance64=1.d-8
|
||||
!
|
||||
#if defined(__MPI)
|
||||
world_group = MPI_COMM_WORLD
|
||||
#endif
|
||||
CALL mp_world_start(world_group)
|
||||
!
|
||||
me_bgrp = mpime; root_bgrp=root; intra_bgrp_comm=world_comm
|
||||
!
|
||||
CALL real_1(test)
|
||||
!
|
||||
CALL complex_1(test)
|
||||
!
|
||||
CALL collect_results(test)
|
||||
!
|
||||
CALL mp_world_end()
|
||||
!
|
||||
IF (mpime .eq. 0) CALL test%print()
|
||||
!
|
||||
CONTAINS
|
||||
!
|
||||
SUBROUTINE real_1(test)
|
||||
USE mp_world, ONLY : mpime
|
||||
USE LAXlib
|
||||
USE la_param, ONLY : DP
|
||||
USE test_io
|
||||
implicit none
|
||||
!
|
||||
TYPE(tester_t) :: test
|
||||
!
|
||||
integer :: ldh, n, m
|
||||
real(DP), allocatable :: h(:,:)
|
||||
real(DP), allocatable :: h_save(:,:)
|
||||
real(DP), allocatable :: s(:,:)
|
||||
real(DP), allocatable :: s_save(:,:)
|
||||
real(DP), allocatable :: e(:)
|
||||
real(DP), allocatable :: v(:,:)
|
||||
real(DP), allocatable :: e_save(:)
|
||||
real(DP), allocatable :: v_save(:,:)
|
||||
!
|
||||
character(len=20) :: inputs(2)
|
||||
integer :: i, j, info
|
||||
!
|
||||
inputs = ["ZnOG1.bin", "ZnOG2.bin"]
|
||||
!
|
||||
DO i=1, SIZE(inputs)
|
||||
!
|
||||
CALL read_problem(inputs(i), ldh, n, m, h, s, e, v, info)
|
||||
!
|
||||
IF (info /= 0) THEN
|
||||
IF (mpime == 0) print *, "Test with ", inputs(i), " skipped. Input not found."
|
||||
CYCLE
|
||||
END IF
|
||||
!
|
||||
ALLOCATE(h_save, SOURCE=h)
|
||||
ALLOCATE(s_save, SOURCE=s)
|
||||
ALLOCATE(e_save, SOURCE=e)
|
||||
ALLOCATE(v_save, SOURCE=v)
|
||||
!
|
||||
h_save = h
|
||||
s_save = s
|
||||
!
|
||||
v = (0.d0, 0.d0)
|
||||
e = 0.d0
|
||||
CALL diaghg( n, m, h, s, ldh, e, v, .false. )
|
||||
!
|
||||
test%tolerance64=1.d-6 ! check this
|
||||
DO j = 1, m
|
||||
CALL test%assert_close( v(1:n, j), v_save(1:n, j))
|
||||
END DO
|
||||
test%tolerance64=1.d-8 ! check this
|
||||
CALL test%assert_close( e(1:m), e_save(1:m) )
|
||||
!
|
||||
!
|
||||
h = h_save
|
||||
s = s_save
|
||||
v = (0.d0, 0.d0)
|
||||
e = 0.d0
|
||||
CALL diaghg( n, m, h, s, ldh, e, v, .true. )
|
||||
!
|
||||
DO j = 1, m
|
||||
!CALL test%assert_close( v(1:n, j), v_save(1:n, j))
|
||||
END DO
|
||||
CALL test%assert_close( e(1:m), e_save(1:m))
|
||||
DEALLOCATE(h,s,e,v,h_save,s_save,e_save,v_save)
|
||||
END DO
|
||||
!
|
||||
END SUBROUTINE real_1
|
||||
!
|
||||
SUBROUTINE complex_1(test)
|
||||
USE mp_world, ONLY : mpime
|
||||
USE LAXlib
|
||||
USE la_param, ONLY : DP
|
||||
USE test_io
|
||||
implicit none
|
||||
!
|
||||
TYPE(tester_t) :: test
|
||||
!
|
||||
integer :: ldh, n, m
|
||||
complex(DP), allocatable :: h(:,:)
|
||||
complex(DP), allocatable :: h_save(:,:)
|
||||
complex(DP), allocatable :: s(:,:)
|
||||
complex(DP), allocatable :: s_save(:,:)
|
||||
real(DP), allocatable :: e(:)
|
||||
complex(DP), allocatable :: v(:,:)
|
||||
real(DP), allocatable :: e_save(:)
|
||||
complex(DP), allocatable :: v_save(:,:)
|
||||
!
|
||||
character(len=20) :: inputs(4)
|
||||
integer :: i, j, info
|
||||
!
|
||||
inputs = ["ZnOK1.bin ", &
|
||||
"ZnOK2.bin ", &
|
||||
"SiGeK1.bin", &
|
||||
"SiGeK2.bin"]
|
||||
!
|
||||
DO i=1, SIZE(inputs)
|
||||
!
|
||||
CALL read_problem(inputs(i), ldh, n, m, h, s, e, v, info)
|
||||
!
|
||||
IF (info /= 0) THEN
|
||||
IF (mpime == 0) print *, "Test with ", inputs(i), " skipped. Input not found."
|
||||
CYCLE
|
||||
END IF
|
||||
!
|
||||
ALLOCATE(h_save, SOURCE=h)
|
||||
ALLOCATE(s_save, SOURCE=s)
|
||||
ALLOCATE(e_save, SOURCE=e)
|
||||
ALLOCATE(v_save, SOURCE=v)
|
||||
!
|
||||
h_save = h
|
||||
s_save = s
|
||||
!
|
||||
v = (0.d0, 0.d0)
|
||||
e = 0.d0
|
||||
CALL diaghg( n, m, h, s, ldh, e, v, .false. )
|
||||
!
|
||||
DO j = 1, m
|
||||
CALL test%assert_close( v(1:n, j), v_save(1:n, j))
|
||||
END DO
|
||||
CALL test%assert_close( e(1:m), e_save(1:m) )
|
||||
!
|
||||
!
|
||||
h = h_save
|
||||
s = s_save
|
||||
v = (0.d0, 0.d0)
|
||||
e = 0.d0
|
||||
CALL diaghg( n, m, h, s, ldh, e, v, .true. )
|
||||
!
|
||||
DO j = 1, m
|
||||
!CALL test%assert_close( v(1:n, j), v_save(1:n, j))
|
||||
END DO
|
||||
CALL test%assert_close( e(1:m), e_save(1:m))
|
||||
DEALLOCATE(h,s,e,v,h_save,s_save,e_save,v_save)
|
||||
END DO
|
||||
!
|
||||
END SUBROUTINE complex_1
|
||||
|
||||
end program test_diaghg_3
|
|
@ -0,0 +1,312 @@
|
|||
! This test uses the internal parallel diagonalization algorithm of LAXlib
|
||||
! to solve the problems stored in binary files:
|
||||
!
|
||||
! - ZnOG1.bin
|
||||
! - ZnOG2.bin
|
||||
! - ZnOK1.bin
|
||||
! - ZnOK2.bin
|
||||
! - SiGeK1.bin
|
||||
! - SiGeK2.bin
|
||||
!
|
||||
! If the scalacpak or ELPA driver is used, the test is skipped.
|
||||
!
|
||||
#if ! defined(__SCALAPACK)
|
||||
program test_diaghg_4
|
||||
#if defined(__MPI)
|
||||
USE MPI
|
||||
#endif
|
||||
USE mp, ONLY : mp_bcast
|
||||
USE mp_world, ONLY : mp_world_start, mp_world_end, mpime, &
|
||||
root, nproc, world_comm
|
||||
USE mp_bands_util, ONLY : me_bgrp, root_bgrp, intra_bgrp_comm
|
||||
USE tester
|
||||
IMPLICIT NONE
|
||||
!
|
||||
TYPE(tester_t) :: test
|
||||
INTEGER :: world_group = 0
|
||||
!
|
||||
CALL test%init()
|
||||
test%tolerance64=1.d-8
|
||||
!
|
||||
#if defined(__MPI)
|
||||
world_group = MPI_COMM_WORLD
|
||||
#endif
|
||||
CALL mp_world_start(world_group)
|
||||
!
|
||||
me_bgrp = mpime; root_bgrp=root; intra_bgrp_comm=world_comm
|
||||
!
|
||||
CALL parallel_real_1(test)
|
||||
!
|
||||
CALL parallel_complex_1(test)
|
||||
!
|
||||
CALL collect_results(test)
|
||||
!
|
||||
CALL mp_world_end()
|
||||
!
|
||||
IF (mpime .eq. 0) CALL test%print()
|
||||
!
|
||||
CONTAINS
|
||||
!
|
||||
SUBROUTINE parallel_real_1(test)
|
||||
USE mp_world, ONLY : mpime
|
||||
USE LAXlib
|
||||
USE descriptors, ONLY : la_descriptor, descla_init, descla_local_dims
|
||||
USE la_param, ONLY : DP
|
||||
USE test_io
|
||||
implicit none
|
||||
!
|
||||
TYPE(tester_t) :: test
|
||||
!
|
||||
TYPE(la_descriptor) :: desc
|
||||
integer :: ldh, n, m
|
||||
real(DP), allocatable :: h(:,:), hdst(:,:) !< full and distributed Hpsi
|
||||
real(DP), allocatable :: h_save(:,:) !< full Hpsi, used to check consistence across calls
|
||||
real(DP), allocatable :: s(:,:), sdst(:,:) !< full and distributed Spsi
|
||||
real(DP), allocatable :: s_save(:,:) !< full Spsi, used to check consistence across calls
|
||||
real(DP), allocatable :: e(:) !< full set of eigenvalues
|
||||
real(DP), allocatable :: v(:,:), vdst(:,:) !< full and distributed eigenvectors
|
||||
real(DP), allocatable :: e_save(:) !< full set of eigenvalues, used for checks
|
||||
real(DP), allocatable :: v_save(:,:) !< full set of eigenvectors, used for checks
|
||||
!
|
||||
character(len=20) :: inputs(2)
|
||||
integer :: l, i, j, ii, jj, info, nrdst
|
||||
logical :: la_proc
|
||||
!
|
||||
inputs = ["ZnOG1.bin", "ZnOG2.bin"]
|
||||
!
|
||||
DO l=1, SIZE(inputs)
|
||||
!
|
||||
CALL read_problem(inputs(l), ldh, n, m, h, s, e, v, info)
|
||||
!
|
||||
IF (info /= 0) THEN
|
||||
IF (mpime == 0) print *, "Test with ", inputs(l), " skipped. Input not found."
|
||||
CYCLE
|
||||
END IF
|
||||
!
|
||||
ALLOCATE(h_save, SOURCE=h)
|
||||
ALLOCATE(s_save, SOURCE=s)
|
||||
ALLOCATE(e_save, SOURCE=e)
|
||||
ALLOCATE(v_save, SOURCE=v)
|
||||
!
|
||||
h_save = h
|
||||
s_save = s
|
||||
!
|
||||
CALL init_parallel_diag(desc, n)
|
||||
!
|
||||
IF( desc%active_node > 0 ) la_proc = .TRUE.
|
||||
nrdst = desc%nrcx
|
||||
IF (.not. la_proc) nrdst = 1
|
||||
!
|
||||
v = (0.d0, 0.d0)
|
||||
e = 0.d0
|
||||
print *, nrdst, n, m
|
||||
ALLOCATE( hdst( nrdst , nrdst ), STAT=info )
|
||||
ALLOCATE( sdst( nrdst , nrdst ), STAT=info )
|
||||
ALLOCATE( vdst( nrdst , nrdst ), STAT=info )
|
||||
!
|
||||
IF (la_proc) THEN
|
||||
DO j = 1, desc%nc ! number of column in the local block of lambda
|
||||
DO i = 1, desc%nr ! number of row in the local block of lambda
|
||||
ii = i + desc%ir - 1 ! globla index of the first row in the local block of lambda
|
||||
jj = j + desc%ic - 1 ! global index of the first column in the local block of lambda
|
||||
hdst(i, j) = h(ii, jj)
|
||||
sdst(i, j) = s(ii, jj)
|
||||
END DO
|
||||
END DO
|
||||
END IF
|
||||
!
|
||||
CALL pdiaghg( n, hdst, sdst, nrdst, e, vdst, desc, .false. )
|
||||
!
|
||||
DO j = 1, m
|
||||
!CALL test%assert_close( v(1:n, j), v_save(1:n, j))
|
||||
END DO
|
||||
CALL test%assert_close( e(1:m), e_save(1:m) )
|
||||
!
|
||||
!
|
||||
v = (0.d0, 0.d0)
|
||||
e = 0.d0
|
||||
CALL pdiaghg( n, hdst, sdst, nrdst, e, vdst, desc, .true. )
|
||||
!
|
||||
DO j = 1, m
|
||||
!CALL test%assert_close( v(1:n, j), v_save(1:n, j))
|
||||
END DO
|
||||
CALL test%assert_close( e(1:m), e_save(1:m))
|
||||
!
|
||||
DEALLOCATE(h,s,e,v,h_save,s_save,e_save,v_save, hdst, sdst, vdst)
|
||||
END DO
|
||||
!
|
||||
END SUBROUTINE parallel_real_1
|
||||
!
|
||||
SUBROUTINE parallel_complex_1(test)
|
||||
USE mp_world, ONLY : mpime
|
||||
USE descriptors, ONLY : la_descriptor, descla_init, descla_local_dims
|
||||
USE LAXlib
|
||||
USE la_param, ONLY : DP
|
||||
USE test_io
|
||||
implicit none
|
||||
!
|
||||
TYPE(tester_t) :: test
|
||||
!
|
||||
integer :: ldh, n, m
|
||||
complex(DP), allocatable :: h(:,:), hdst(:,:) !< full and distributed Hpsi
|
||||
complex(DP), allocatable :: h_save(:,:) !< full Hpsi, used to check consistence across calls
|
||||
complex(DP), allocatable :: s(:,:), sdst(:,:) !< full and distributed Spsi
|
||||
complex(DP), allocatable :: s_save(:,:) !< full Spsi, used to check consistence across calls
|
||||
real(DP), allocatable :: e(:) !< full set of eigenvalues
|
||||
complex(DP), allocatable :: v(:,:), vdst(:,:) !< full and distributed eigenvectors
|
||||
real(DP), allocatable :: e_save(:) !< full set of eigenvalues, used for checks
|
||||
complex(DP), allocatable :: v_save(:,:) !< full set of eigenvectors, used for checks
|
||||
TYPE(la_descriptor) :: desc
|
||||
!
|
||||
character(len=20) :: inputs(4)
|
||||
integer :: l, i, j, ii, jj, info, nrdst
|
||||
logical :: la_proc
|
||||
!
|
||||
inputs = ["ZnOK1.bin ", &
|
||||
"ZnOK2.bin ", &
|
||||
"SiGeK1.bin", &
|
||||
"SiGeK2.bin"]
|
||||
!
|
||||
DO l=1, SIZE(inputs)
|
||||
!
|
||||
CALL read_problem(inputs(l), ldh, n, m, h, s, e, v, info)
|
||||
!
|
||||
IF (info /= 0) THEN
|
||||
IF (mpime == 0) print *, "Test with ", inputs(l), " skipped. Input not found."
|
||||
CYCLE
|
||||
END IF
|
||||
!
|
||||
ALLOCATE(h_save, SOURCE=h)
|
||||
ALLOCATE(s_save, SOURCE=s)
|
||||
ALLOCATE(e_save, SOURCE=e)
|
||||
ALLOCATE(v_save, SOURCE=v)
|
||||
!
|
||||
h_save = h
|
||||
s_save = s
|
||||
!
|
||||
CALL init_parallel_diag(desc, n)
|
||||
!
|
||||
IF( desc%active_node > 0 ) la_proc = .TRUE.
|
||||
nrdst = desc%nrcx
|
||||
IF (.not. la_proc) nrdst = 1
|
||||
!
|
||||
v = (0.d0, 0.d0)
|
||||
e = 0.d0
|
||||
print *, nrdst, n, m
|
||||
ALLOCATE( hdst( nrdst , nrdst ), STAT=info )
|
||||
ALLOCATE( sdst( nrdst , nrdst ), STAT=info )
|
||||
ALLOCATE( vdst( nrdst , nrdst ), STAT=info )
|
||||
!
|
||||
IF (la_proc) THEN
|
||||
DO j = 1, desc%nc ! number of column in the local block of lambda
|
||||
DO i = 1, desc%nr ! number of row in the local block of lambda
|
||||
ii = i + desc%ir - 1 ! globla index of the first row in the local block of lambda
|
||||
jj = j + desc%ic - 1 ! global index of the first column in the local block of lambda
|
||||
hdst(i, j) = h(ii, jj)
|
||||
sdst(i, j) = s(ii, jj)
|
||||
END DO
|
||||
END DO
|
||||
END IF
|
||||
!
|
||||
CALL pdiaghg( n, hdst, sdst, nrdst, e, vdst, desc, .false. )
|
||||
!
|
||||
DO j = 1, m
|
||||
!CALL test%assert_close( v(1:n, j), v_save(1:n, j))
|
||||
END DO
|
||||
CALL test%assert_close( e(1:m), e_save(1:m) )
|
||||
!
|
||||
!
|
||||
v = (0.d0, 0.d0)
|
||||
e = 0.d0
|
||||
CALL pdiaghg( n, hdst, sdst, nrdst, e, vdst, desc, .true. )
|
||||
!
|
||||
DO j = 1, m
|
||||
!CALL test%assert_close( v(1:n, j), v_save(1:n, j))
|
||||
END DO
|
||||
CALL test%assert_close( e(1:m), e_save(1:m))
|
||||
!
|
||||
DEALLOCATE(h,s,e,v,h_save,s_save,e_save,v_save, hdst, sdst, vdst)
|
||||
END DO
|
||||
!
|
||||
END SUBROUTINE parallel_complex_1
|
||||
!
|
||||
SUBROUTINE init_parallel_diag(desc, n)
|
||||
|
||||
USE mp_world, ONLY : mpime, nproc, world_comm
|
||||
USE mp_diag, ONLY : ortho_parent_comm
|
||||
USE descriptors, ONLY : la_descriptor, descla_init, descla_local_dims
|
||||
USE LAXlib
|
||||
USE la_param, ONLY : DP
|
||||
implicit none
|
||||
!
|
||||
TYPE(la_descriptor) :: desc
|
||||
INTEGER :: n ! global dimension of the matrix
|
||||
!
|
||||
INTEGER :: ierr = 0
|
||||
INTEGER :: color, key
|
||||
!
|
||||
INTEGER :: np_ortho(2) = 1 ! size of the processor grid used in ortho
|
||||
INTEGER :: me_ortho(2) = 0 ! coordinates of the processors
|
||||
INTEGER :: me_ortho1 = 0 ! task id for the ortho group
|
||||
INTEGER :: nproc_ortho = 1 ! size of the ortho group:
|
||||
INTEGER :: ortho_comm = 0 ! communicator for the ortho group
|
||||
INTEGER :: ortho_row_comm = 0 ! communicator for the ortho row group
|
||||
INTEGER :: ortho_col_comm = 0 ! communicator for the ortho col group
|
||||
INTEGER :: ortho_comm_id = 0 ! id of the ortho_comm
|
||||
!
|
||||
ortho_parent_comm = world_comm
|
||||
!
|
||||
#if defined __MPI
|
||||
!
|
||||
CALL grid2d_dims( 'S', nproc, np_ortho(1), np_ortho(2) )
|
||||
!
|
||||
nproc_ortho = np_ortho(1) * np_ortho(2)
|
||||
!
|
||||
! here we choose the first "nproc_ortho" processors
|
||||
!
|
||||
color = 0
|
||||
IF( mpime < nproc_ortho ) color = 1
|
||||
!
|
||||
key = mpime
|
||||
!
|
||||
! initialize the communicator for the new group by splitting the input
|
||||
! communicator
|
||||
!
|
||||
CALL mpi_comm_split( MPI_COMM_WORLD , color, key, ortho_comm, ierr )
|
||||
!
|
||||
! Computes coordinates of the processors, in row maior order
|
||||
!
|
||||
CALL mpi_comm_rank( ortho_comm, me_ortho1, ierr)
|
||||
!
|
||||
IF( mpime == 0 .AND. me_ortho1 /= 0 ) &
|
||||
CALL lax_error__( " init_ortho_group ", " wrong root task in ortho group ", ierr )
|
||||
!
|
||||
if( color == 1 ) then
|
||||
! this task belong to the ortho_group compute its coordinates
|
||||
ortho_comm_id = 1
|
||||
CALL GRID2D_COORDS( 'R', me_ortho1, np_ortho(1), np_ortho(2), me_ortho(1), me_ortho(2) )
|
||||
CALL GRID2D_RANK( 'R', np_ortho(1), np_ortho(2), me_ortho(1), me_ortho(2), ierr )
|
||||
IF( ierr /= me_ortho1 ) &
|
||||
CALL lax_error__( " init_ortho_group ", " wrong task coordinates in ortho group ", ierr )
|
||||
IF( me_ortho1 /= mpime ) &
|
||||
CALL lax_error__( " init_ortho_group ", " wrong rank assignment in ortho group ", ierr )
|
||||
CALL mpi_comm_split( ortho_comm , me_ortho(2), me_ortho(1), ortho_col_comm, ierr )
|
||||
CALL mpi_comm_split( ortho_comm , me_ortho(1), me_ortho(2), ortho_row_comm, ierr )
|
||||
else
|
||||
! this task does NOT belong to the ortho_group set dummy values
|
||||
ortho_comm_id = 0
|
||||
me_ortho(1) = me_ortho1
|
||||
me_ortho(2) = me_ortho1
|
||||
endif
|
||||
#else
|
||||
ortho_comm_id = 1
|
||||
#endif
|
||||
CALL descla_init( desc, n, n, np_ortho, me_ortho, ortho_comm, -1, ortho_comm_id )
|
||||
|
||||
END SUBROUTINE init_parallel_diag
|
||||
|
||||
end program test_diaghg_4
|
||||
#else
|
||||
program test_diaghg_4
|
||||
end program test_diaghg_4
|
||||
#endif
|
|
@ -0,0 +1,175 @@
|
|||
#if defined(__CUDA)
|
||||
program test_diaghg_gpu
|
||||
#if defined(__MPI)
|
||||
USE MPI
|
||||
#endif
|
||||
USE mp, ONLY : mp_bcast
|
||||
USE mp_world, ONLY : mp_world_start, mp_world_end, mpime, &
|
||||
root, nproc, world_comm
|
||||
USE mp_bands_util, ONLY : me_bgrp, root_bgrp, intra_bgrp_comm
|
||||
USE tester
|
||||
IMPLICIT NONE
|
||||
!
|
||||
TYPE(tester_t) :: test
|
||||
INTEGER :: world_group = 0
|
||||
!
|
||||
CALL test%init()
|
||||
|
||||
#if defined(__MPI)
|
||||
world_group = MPI_COMM_WORLD
|
||||
#endif
|
||||
CALL mp_world_start(world_group)
|
||||
!
|
||||
me_bgrp = mpime; root_bgrp=root; intra_bgrp_comm=world_comm
|
||||
!
|
||||
CALL real_1(test)
|
||||
!
|
||||
CALL complex_1(test)
|
||||
!
|
||||
CALL collect_results(test)
|
||||
!
|
||||
CALL mp_world_end()
|
||||
!
|
||||
IF (mpime .eq. 0) CALL test%print()
|
||||
!
|
||||
CONTAINS
|
||||
!
|
||||
SUBROUTINE real_1(test)
|
||||
USE LAXlib
|
||||
USE la_param, ONLY : DP
|
||||
USE cudafor
|
||||
implicit none
|
||||
!
|
||||
TYPE(tester_t) :: test
|
||||
! variables on device
|
||||
real(DP), device :: h_d(2,2)
|
||||
real(DP), device :: s_d(2,2)
|
||||
real(DP), device :: e_d(2)
|
||||
real(DP), device :: v_d(2,2)
|
||||
! variables on host
|
||||
real(DP) :: h_h(2,2)
|
||||
real(DP) :: s_h(2,2)
|
||||
real(DP) :: e_h(2)
|
||||
real(DP) :: v_h(2,2)
|
||||
|
||||
h_h = 0.d0
|
||||
h_h(1,1) = 1.d0
|
||||
h_h(2,2) = 1.d0
|
||||
h_d = h_h
|
||||
|
||||
s_h = 0.d0
|
||||
s_h(1,1) = 1.d0
|
||||
s_h(2,2) = 1.d0
|
||||
s_d = s_h
|
||||
!
|
||||
v_d = 0.d0
|
||||
e_d = 0.d0
|
||||
CALL diaghg( 2, 2, h_d, s_d, 2, e_d, v_d, .false. )
|
||||
v_h = v_d
|
||||
e_h = e_d
|
||||
h_h = h_d
|
||||
s_h = s_d
|
||||
!
|
||||
CALL test%assert_close( e_h, [1.d0, 1.d0] )
|
||||
CALL test%assert_close( RESHAPE(v_h, [4]), [1.d0, 0.d0, 0.d0, 1.d0] )
|
||||
CALL test%assert_close( RESHAPE(h_h, [4]), [1.d0, 0.d0, 0.d0, 1.d0] )
|
||||
CALL test%assert_close( RESHAPE(s_h, [4]), [1.d0, 0.d0, 0.d0, 1.d0] )
|
||||
!
|
||||
v_d = 0.d0
|
||||
e_d = 0.d0
|
||||
CALL diaghg( 2, 2, h_d, s_d, 2, e_d, v_d, .true. )
|
||||
v_h = v_d
|
||||
e_h = e_d
|
||||
h_h = h_d
|
||||
s_h = s_d
|
||||
!
|
||||
CALL test%assert_close( e_h, [1.d0, 1.d0] )
|
||||
CALL test%assert_close( RESHAPE(v_h, [4]), [1.d0, 0.d0, 0.d0, 1.d0] )
|
||||
CALL test%assert_close( RESHAPE(h_h, [4]), [1.d0, 0.d0, 0.d0, 1.d0] )
|
||||
CALL test%assert_close( RESHAPE(s_h, [4]), [1.d0, 0.d0, 0.d0, 1.d0] )
|
||||
!
|
||||
END SUBROUTINE real_1
|
||||
!
|
||||
SUBROUTINE complex_1(test)
|
||||
USE LAXlib
|
||||
USE la_param, ONLY : DP
|
||||
USE cudafor
|
||||
implicit none
|
||||
!
|
||||
TYPE(tester_t) :: test
|
||||
! variables on device
|
||||
complex(DP), device :: h_d(2,2)
|
||||
complex(DP), device :: s_d(2,2)
|
||||
complex(DP), device :: v_d(2,2)
|
||||
real(DP), device :: e_d(2)
|
||||
! variables on host
|
||||
complex(DP) :: h_h(2,2)
|
||||
complex(DP) :: s_h(2,2)
|
||||
complex(DP) :: v_h(2,2)
|
||||
real(DP) :: e_h(2)
|
||||
|
||||
|
||||
complex(DP) :: s_save(2,2)
|
||||
complex(DP) :: h_save(2,2)
|
||||
|
||||
|
||||
!
|
||||
h_h = 0.d0
|
||||
h_h(1,1) = (1.d0, 0.d0)
|
||||
h_h(1,2) = (0.d0, -2.d0)
|
||||
h_h(2,1) = ( 0.d0, 2.d0)
|
||||
h_h(2,2) = ( 5.d0, 0.d0)
|
||||
s_h = 0.d0
|
||||
s_h(1,1) = (1.d0, 0.d0)
|
||||
s_h(2,2) = (1.d0, 0.d0)
|
||||
!
|
||||
! save for later comparison
|
||||
h_save = h_h
|
||||
s_save = s_h
|
||||
!
|
||||
! Update device
|
||||
h_d = h_h
|
||||
s_d = s_h
|
||||
!
|
||||
v_h = (0.d0, 0.d0)
|
||||
e_h = 0.d0
|
||||
v_d = v_h; e_d = e_h
|
||||
!
|
||||
CALL diaghg( 2, 2, h_d, s_d, 2, e_d, v_d, .false. )
|
||||
v_h = v_d
|
||||
e_h = e_d
|
||||
h_h = h_d
|
||||
s_h = s_d
|
||||
!
|
||||
! 0.1715728752538099, 5.82842712474619
|
||||
CALL test%assert_close( e_h, [0.1715728752538099d0, 5.82842712474619d0] )
|
||||
CALL test%assert_close( v_h(:,1), [( 0.d0, -0.9238795325112867d0), (-0.3826834323650898d0, 0.d0)] )
|
||||
CALL test%assert_close( v_h(:,2), [( 0.d0, -0.3826834323650898d0), ( 0.9238795325112867d0, 0.d0)] )
|
||||
CALL test%assert_close( RESHAPE(h_h, [4]), RESHAPE(h_save, [4]))
|
||||
CALL test%assert_close( RESHAPE(s_h, [4]), RESHAPE(s_save, [4]))
|
||||
!
|
||||
v_h = (0.d0, 0.d0)
|
||||
e_h = 0.d0
|
||||
!
|
||||
! Update device
|
||||
h_d = h_h
|
||||
s_d = s_h
|
||||
v_d = v_h; e_d = e_h
|
||||
CALL diaghg( 2, 2, h_d, s_d, 2, e_d, v_d, .true. )
|
||||
v_h = v_d
|
||||
e_h = e_d
|
||||
h_h = h_d
|
||||
s_h = s_d
|
||||
!
|
||||
CALL test%assert_close( e_h, [0.1715728752538099d0, 5.82842712474619d0] )
|
||||
CALL test%assert_close( v_h(:,1), [( 0.d0, -0.9238795325112867d0), (-0.3826834323650898d0, 0.d0)] )
|
||||
CALL test%assert_close( v_h(:,2), [( 0.d0, -0.3826834323650898d0), ( 0.9238795325112867d0, 0.d0)] )
|
||||
CALL test%assert_close( RESHAPE(h_h, [4]), RESHAPE(h_save, [4]))
|
||||
CALL test%assert_close( RESHAPE(s_h, [4]), RESHAPE(s_save, [4]))
|
||||
!
|
||||
END SUBROUTINE complex_1
|
||||
end program test_diaghg_gpu
|
||||
#else
|
||||
program test_diaghg_gpu
|
||||
end program test_diaghg_gpu
|
||||
#endif
|
|
@ -0,0 +1,151 @@
|
|||
#if defined(__CUDA)
|
||||
program test_diaghg_gpu_2
|
||||
#if defined(__MPI)
|
||||
USE MPI
|
||||
#endif
|
||||
USE mp, ONLY : mp_bcast
|
||||
USE mp_world, ONLY : mp_world_start, mp_world_end, mpime, &
|
||||
root, nproc, world_comm
|
||||
USE mp_bands_util, ONLY : me_bgrp, root_bgrp, intra_bgrp_comm
|
||||
USE tester
|
||||
IMPLICIT NONE
|
||||
!
|
||||
TYPE(tester_t) :: test
|
||||
INTEGER :: world_group = 0
|
||||
!
|
||||
CALL test%init()
|
||||
|
||||
#if defined(__MPI)
|
||||
world_group = MPI_COMM_WORLD
|
||||
#endif
|
||||
CALL mp_world_start(world_group)
|
||||
!
|
||||
me_bgrp = mpime; root_bgrp=root; intra_bgrp_comm=world_comm
|
||||
!
|
||||
CALL complex_1(test)
|
||||
!
|
||||
CALL collect_results(test)
|
||||
!
|
||||
CALL mp_world_end()
|
||||
!
|
||||
IF (mpime .eq. 0) CALL test%print()
|
||||
!
|
||||
CONTAINS
|
||||
!
|
||||
SUBROUTINE complex_1(test)
|
||||
USE LAXlib
|
||||
USE la_param, ONLY : DP
|
||||
USE cudafor
|
||||
implicit none
|
||||
!
|
||||
TYPE(tester_t) :: test
|
||||
!
|
||||
integer, parameter :: m_size=1024
|
||||
integer :: i
|
||||
complex(DP) :: h(m_size,m_size)
|
||||
complex(DP), DEVICE :: h_d(m_size,m_size)
|
||||
complex(DP) :: h_save(m_size,m_size)
|
||||
complex(DP) :: s(m_size,m_size)
|
||||
complex(DP), DEVICE :: s_d(m_size,m_size)
|
||||
complex(DP) :: s_save(m_size,m_size)
|
||||
real(DP) :: e(m_size), e_(m_size)
|
||||
real(DP), DEVICE :: e_d(m_size)
|
||||
complex(DP) :: v(m_size,m_size)
|
||||
complex(DP), DEVICE :: v_d(m_size,m_size)
|
||||
real(DP) :: e_save(m_size)
|
||||
complex(DP) :: v_save(m_size,m_size)
|
||||
!
|
||||
CALL hermitian(m_size, h)
|
||||
CALL hermitian(m_size, s)
|
||||
!
|
||||
h_save = h
|
||||
s_save = s
|
||||
h_d = h ! copy H and S to device
|
||||
s_d = s ! <----------|
|
||||
!
|
||||
v = (0.d0, 0.d0)
|
||||
e = 0.d0
|
||||
v_d = (0.d0, 0.d0)
|
||||
e_d = 0.d0
|
||||
!
|
||||
! 1. Compare same algorithm starting from data on device ...
|
||||
CALL diaghg( m_size, m_size-1, h_d, s_d, m_size, e_d, v_d )
|
||||
e_save = e_d
|
||||
v_save = v_d
|
||||
! 2. ... and on the host, this will trigger the same subroutine used above
|
||||
CALL diaghg( m_size, m_size-1, h, s, m_size, e, v, .true. )
|
||||
!
|
||||
CALL test%assert_close( RESHAPE(h, [m_size*m_size]), RESHAPE(h_save, [m_size*m_size]))
|
||||
CALL test%assert_close( RESHAPE(s, [m_size*m_size]), RESHAPE(s_save, [m_size*m_size]))
|
||||
test%tolerance32=1.d-5
|
||||
test%tolerance64=1.d-10
|
||||
DO i=1, m_size-1
|
||||
CALL test%assert_close( v(1:m_size,i), v_save(1:m_size,i))
|
||||
CALL test%assert_close( e(i), e_save(i) )
|
||||
END DO
|
||||
!
|
||||
! reset data
|
||||
h = h_save
|
||||
s = s_save
|
||||
v = (0.d0, 0.d0)
|
||||
e = 0.d0
|
||||
!
|
||||
! 3. repeat the same task but with CPU subroutine now.
|
||||
! note that it uses a different algorithm and produces slightly
|
||||
! different eigenvectors.
|
||||
!
|
||||
CALL diaghg( m_size, m_size-1, h, s, m_size, e, v )
|
||||
h = h_save; s = s_save
|
||||
!
|
||||
! Solve-again, with the same algorithm used in the GPU version.
|
||||
! This is needed to compare eigenvectors.
|
||||
CALL solve_with_zhegvd(m_size, h, s, m_size, e_)
|
||||
!
|
||||
test%tolerance32=1.d-5
|
||||
test%tolerance64=1.d-10
|
||||
DO i=1, m_size-1
|
||||
! compare eigenvectors obtained in 1. with LAPACK zhegvd
|
||||
CALL test%assert_close( v_save(1:m_size,i), h(1:m_size,i) )
|
||||
! compare eigenvalues obtained with zhegvd, 1. and 3.
|
||||
CALL test%assert_close( e(i), e_save(i) )
|
||||
CALL test%assert_close( e_(i), e_save(i) )
|
||||
END DO
|
||||
!
|
||||
END SUBROUTINE complex_1
|
||||
!
|
||||
SUBROUTINE hermitian(mSize, M)
|
||||
USE la_param, ONLY : DP
|
||||
IMPLICIT NONE
|
||||
integer, intent(in) :: msize
|
||||
complex(dp), intent(out) :: M(:,:)
|
||||
!
|
||||
real(dp), allocatable :: rnd(:)
|
||||
complex(dp), allocatable :: tmp(:,:)
|
||||
|
||||
INTEGER :: h, k, j
|
||||
!
|
||||
ALLOCATE(rnd(mSize*(mSize+1)))
|
||||
CALL RANDOM_NUMBER(rnd)
|
||||
rnd = 1.d0*rnd - 5.d-1
|
||||
!
|
||||
M = (0.d0, 0.d0)
|
||||
j = 1
|
||||
DO k=1,mSize
|
||||
DO h=1,mSize
|
||||
IF(h>k) THEN
|
||||
M(h,k) = CMPLX(rnd(j), rnd(j+1))
|
||||
M(k,h) = CONJG(M(h,k))
|
||||
j=j+2;
|
||||
ELSE IF(k == h) THEN
|
||||
M(k,h) = CMPLX(mSize, 0.d0, kind=DP)
|
||||
END IF
|
||||
END DO
|
||||
END DO
|
||||
!
|
||||
END SUBROUTINE hermitian
|
||||
!
|
||||
end program test_diaghg_gpu_2
|
||||
#else
|
||||
program test_diaghg_gpu_2
|
||||
end program test_diaghg_gpu_2
|
||||
#endif
|
|
@ -0,0 +1,283 @@
|
|||
#ifdef __CUDA
|
||||
program test_diaghg_gpu_3
|
||||
#if defined(__MPI)
|
||||
USE MPI
|
||||
#endif
|
||||
USE mp, ONLY : mp_bcast
|
||||
USE mp_world, ONLY : mp_world_start, mp_world_end, mpime, &
|
||||
root, nproc, world_comm
|
||||
USE mp_bands_util, ONLY : me_bgrp, root_bgrp, intra_bgrp_comm
|
||||
USE tester
|
||||
IMPLICIT NONE
|
||||
!
|
||||
TYPE(tester_t) :: test
|
||||
INTEGER :: world_group = 0
|
||||
!
|
||||
CALL test%init()
|
||||
test%tolerance64=1.d-8
|
||||
!
|
||||
#if defined(__MPI)
|
||||
world_group = MPI_COMM_WORLD
|
||||
#endif
|
||||
CALL mp_world_start(world_group)
|
||||
!
|
||||
me_bgrp = mpime; root_bgrp=root; intra_bgrp_comm=world_comm
|
||||
!
|
||||
CALL real_1(test)
|
||||
!
|
||||
CALL complex_1(test)
|
||||
!
|
||||
CALL collect_results(test)
|
||||
!
|
||||
CALL mp_world_end()
|
||||
!
|
||||
IF (mpime .eq. 0) CALL test%print()
|
||||
!
|
||||
CONTAINS
|
||||
!
|
||||
SUBROUTINE real_1(test)
|
||||
USE mp_world, ONLY : mpime
|
||||
USE LAXlib
|
||||
USE la_param, ONLY : DP
|
||||
USE test_io
|
||||
implicit none
|
||||
!
|
||||
TYPE(tester_t) :: test
|
||||
!
|
||||
integer :: ldh, n, m
|
||||
real(DP), allocatable :: h(:,:)
|
||||
real(DP), allocatable :: h_save(:,:)
|
||||
real(DP), allocatable :: s(:,:)
|
||||
real(DP), allocatable :: s_save(:,:)
|
||||
real(DP), allocatable :: e(:)
|
||||
real(DP), allocatable :: v(:,:)
|
||||
real(DP), allocatable :: e_save(:)
|
||||
real(DP), allocatable :: v_save(:,:)
|
||||
real(DP), allocatable :: e_dc(:) ! stores results from divide and conquer
|
||||
real(DP), allocatable :: v_dc(:,:) !
|
||||
! device copies
|
||||
real(DP), allocatable, device :: h_d(:,:)
|
||||
real(DP), allocatable, device :: s_d(:,:)
|
||||
real(DP), allocatable, device :: e_d(:)
|
||||
real(DP), allocatable, device :: v_d(:,:)
|
||||
!
|
||||
!
|
||||
character(len=20) :: inputs(2)
|
||||
integer :: i, j, info
|
||||
!
|
||||
inputs = ["ZnOG1.bin", "ZnOG2.bin"]
|
||||
!
|
||||
DO i=1, SIZE(inputs)
|
||||
!
|
||||
CALL read_problem(inputs(i), ldh, n, m, h, s, e, v, info)
|
||||
!
|
||||
IF (info /= 0) THEN
|
||||
IF (mpime == 0) print *, "Test with ", inputs(i), " skipped. Input not found."
|
||||
CYCLE
|
||||
END IF
|
||||
!
|
||||
ALLOCATE(h_save, SOURCE=h)
|
||||
ALLOCATE(s_save, SOURCE=s)
|
||||
ALLOCATE(e_save, SOURCE=e)
|
||||
ALLOCATE(v_save, SOURCE=v)
|
||||
!
|
||||
h_save = h
|
||||
s_save = s
|
||||
!
|
||||
v = (0.d0, 0.d0)
|
||||
e = 0.d0
|
||||
CALL diaghg( n, m, h, s, ldh, e, v, .false. )
|
||||
!
|
||||
test%tolerance64=1.d-6 ! <- check this
|
||||
DO j = 1, m
|
||||
CALL test%assert_close( v(1:n, j), v_save(1:n, j))
|
||||
END DO
|
||||
test%tolerance64=1.d-8 ! <- check this
|
||||
CALL test%assert_close( e(1:m), e_save(1:m) )
|
||||
!
|
||||
!
|
||||
v = (0.d0, 0.d0)
|
||||
e = 0.d0
|
||||
CALL diaghg( n, m, h, s, ldh, e, v, .true. )
|
||||
!
|
||||
ALLOCATE(v_dc, SOURCE=h_save)
|
||||
ALLOCATE(e_dc(n))
|
||||
s = s_save
|
||||
CALL solve_with_dsygvd(n, v_dc, s, ldh, e_dc)
|
||||
s = s_save
|
||||
!
|
||||
DO j = 1, m
|
||||
CALL test%assert_close( v(1:n, j), v_dc(1:n, j))
|
||||
END DO
|
||||
CALL test%assert_close( e(1:m), e_save(1:m))
|
||||
!
|
||||
! GPU data & subroutines
|
||||
!
|
||||
v = (0.d0, 0.d0)
|
||||
e = 0.d0
|
||||
s = s_save
|
||||
h = h_save
|
||||
ALLOCATE(e_d, SOURCE=e); ALLOCATE(v_d, SOURCE=v)
|
||||
ALLOCATE(h_d, SOURCE=h); ALLOCATE(s_d, SOURCE=s)
|
||||
!
|
||||
CALL diaghg( n, m, h_d, s_d, ldh, e_d, v_d, .false. )
|
||||
!
|
||||
v(1:n, 1:m) = v_d(1:n, 1:m)
|
||||
e(1:m) = e_d(1:m)
|
||||
DO j = 1, m
|
||||
CALL test%assert_close( v(1:n, j), v_dc(1:n, j))
|
||||
END DO
|
||||
CALL test%assert_close( e(1:m), e_save(1:m) )
|
||||
!
|
||||
!
|
||||
v_d = (0.d0, 0.d0)
|
||||
e_d = 0.d0
|
||||
s_d = s_save
|
||||
h_d = h_save
|
||||
!
|
||||
! Start from data on the GPU and diagonalize on the CPU
|
||||
CALL diaghg( n, m, h_d, s_d, ldh, e_d, v_d, .true. )
|
||||
!
|
||||
v(1:n, 1:m) = v_d(1:n, 1:m)
|
||||
e(1:m) = e_d(1:m)
|
||||
!
|
||||
test%tolerance64=1.d-6 ! <- check this
|
||||
DO j = 1, m
|
||||
CALL test%assert_close( v(1:n, j), v_save(1:n, j))
|
||||
END DO
|
||||
test%tolerance64=1.d-8 ! <- check this
|
||||
!
|
||||
CALL test%assert_close( e(1:m), e_save(1:m))
|
||||
!
|
||||
DEALLOCATE(h_d, s_d, e_d, v_d)
|
||||
DEALLOCATE(h,s,e,v,h_save,s_save,e_save,v_save, v_dc, e_dc)
|
||||
END DO
|
||||
!
|
||||
END SUBROUTINE real_1
|
||||
!
|
||||
SUBROUTINE complex_1(test)
|
||||
USE mp_world, ONLY : mpime
|
||||
USE LAXlib
|
||||
USE la_param, ONLY : DP
|
||||
USE test_io
|
||||
implicit none
|
||||
!
|
||||
TYPE(tester_t) :: test
|
||||
!
|
||||
integer :: ldh, n, m
|
||||
complex(DP), allocatable :: h(:,:)
|
||||
complex(DP), allocatable :: h_save(:,:)
|
||||
complex(DP), allocatable :: s(:,:)
|
||||
complex(DP), allocatable :: s_save(:,:)
|
||||
real(DP), allocatable :: e(:)
|
||||
complex(DP), allocatable :: v(:,:)
|
||||
real(DP), allocatable :: e_save(:)
|
||||
complex(DP), allocatable :: v_save(:,:)
|
||||
real(DP), allocatable :: e_dc(:) ! stores results from divide and conquer
|
||||
complex(DP), allocatable :: v_dc(:,:) !
|
||||
|
||||
! device copies
|
||||
complex(DP), allocatable, device :: h_d(:,:)
|
||||
complex(DP), allocatable, device :: s_d(:,:)
|
||||
real(DP), allocatable, device :: e_d(:)
|
||||
complex(DP), allocatable, device :: v_d(:,:)
|
||||
!
|
||||
!
|
||||
character(len=20) :: inputs(4)
|
||||
integer :: i, j, info
|
||||
!
|
||||
inputs = ["ZnOK1.bin ", &
|
||||
"ZnOK2.bin ", &
|
||||
"SiGeK1.bin", &
|
||||
"SiGeK2.bin"]
|
||||
!
|
||||
DO i=1, SIZE(inputs)
|
||||
!
|
||||
CALL read_problem(inputs(i), ldh, n, m, h, s, e, v, info)
|
||||
!
|
||||
IF (info /= 0) THEN
|
||||
IF (mpime == 0) print *, "Test with ", inputs(i), " skipped. Input not found."
|
||||
CYCLE
|
||||
END IF
|
||||
!
|
||||
ALLOCATE(h_save, SOURCE=h)
|
||||
ALLOCATE(s_save, SOURCE=s)
|
||||
ALLOCATE(e_save, SOURCE=e)
|
||||
ALLOCATE(v_save, SOURCE=v)
|
||||
ALLOCATE(e_dc(n))
|
||||
ALLOCATE(v_dc, SOURCE=h)
|
||||
!
|
||||
h_save = h
|
||||
s_save = s
|
||||
!
|
||||
! == Check CPU interface without and with offloading ==
|
||||
!
|
||||
v = (0.d0, 0.d0)
|
||||
e = 0.d0
|
||||
CALL diaghg( n, m, h, s, ldh, e, v, .false. )
|
||||
!
|
||||
DO j = 1, m
|
||||
CALL test%assert_close( v(1:n, j), v_save(1:n, j) )
|
||||
END DO
|
||||
CALL test%assert_close( e(1:m), e_save(1:m) )
|
||||
!
|
||||
!
|
||||
h = h_save; s = s_save;
|
||||
v = (0.d0, 0.d0)
|
||||
e = 0.d0
|
||||
CALL diaghg( n, m, h, s, ldh, e, v, .true. )
|
||||
!
|
||||
! N.B.: GPU eigensolver uses a different algorithm: zhegvd
|
||||
s = s_save; e_dc = 0.d0
|
||||
CALL solve_with_zhegvd(n, v_dc, s, ldh, e_dc)
|
||||
!
|
||||
DO j = 1, m
|
||||
CALL test%assert_close( v(1:n, j), v_dc(1:n, j))
|
||||
END DO
|
||||
CALL test%assert_close( e(1:m), e_save(1:m))
|
||||
!
|
||||
! GPU data & subroutines
|
||||
!
|
||||
v = (0.d0, 0.d0)
|
||||
e = 0.d0
|
||||
s = s_save
|
||||
h = h_save
|
||||
ALLOCATE(e_d, SOURCE=e); ALLOCATE(v_d, SOURCE=v)
|
||||
ALLOCATE(h_d, SOURCE=h); ALLOCATE(s_d, SOURCE=s)
|
||||
!
|
||||
CALL diaghg( n, m, h_d, s_d, ldh, e_d, v_d, .false. )
|
||||
!
|
||||
v(1:n, 1:m) = v_d(1:n, 1:m)
|
||||
e(1:m) = e_d(1:m)
|
||||
DO j = 1, m
|
||||
!CALL test%assert_close( v(1:n, j), v_save(1:n, j))
|
||||
END DO
|
||||
CALL test%assert_close( e(1:m), e_save(1:m) )
|
||||
!
|
||||
!
|
||||
v_d = (0.d0, 0.d0)
|
||||
e_d = 0.d0
|
||||
s_d = s_save
|
||||
h_d = h_save
|
||||
CALL diaghg( n, m, h_d, s_d, ldh, e_d, v_d, .true. )
|
||||
!
|
||||
v(1:n, 1:m) = v_d(1:n, 1:m)
|
||||
e(1:m) = e_d(1:m)
|
||||
!
|
||||
DO j = 1, m
|
||||
!CALL test%assert_close( v(1:n, j), v_save(1:n, j))
|
||||
END DO
|
||||
CALL test%assert_close( e(1:m), e_save(1:m))
|
||||
!
|
||||
DEALLOCATE(h_d, s_d, e_d, v_d)
|
||||
DEALLOCATE(h,s,e,v,h_save,s_save,e_save,v_save)
|
||||
DEALLOCATE(e_dc, v_dc)
|
||||
END DO
|
||||
!
|
||||
END SUBROUTINE complex_1
|
||||
!
|
||||
end program test_diaghg_gpu_3
|
||||
#else
|
||||
program test_diaghg_gpu_3
|
||||
end program test_diaghg_gpu_3
|
||||
#endif
|
|
@ -0,0 +1,345 @@
|
|||
! This test uses the internal parallel diagonalization algorithm of LAXlib
|
||||
! (GPU interface) to solve the problems stored in binary files:
|
||||
!
|
||||
! - ZnOG1.bin
|
||||
! - ZnOG2.bin
|
||||
! - ZnOK1.bin
|
||||
! - ZnOK2.bin
|
||||
! - SiGeK1.bin
|
||||
! - SiGeK2.bin
|
||||
!
|
||||
! If the scalacpak or ELPA driver is used, the test is skipped.
|
||||
!
|
||||
#if ( ! defined(__SCALAPACK) ) && defined(__CUDA)
|
||||
program test_diaghg_gpu_4
|
||||
#if defined(__MPI)
|
||||
USE MPI
|
||||
#endif
|
||||
USE mp, ONLY : mp_bcast
|
||||
USE mp_world, ONLY : mp_world_start, mp_world_end, mpime, &
|
||||
root, nproc, world_comm
|
||||
USE mp_bands_util, ONLY : me_bgrp, root_bgrp, intra_bgrp_comm
|
||||
USE tester
|
||||
IMPLICIT NONE
|
||||
!
|
||||
TYPE(tester_t) :: test
|
||||
INTEGER :: world_group = 0
|
||||
!
|
||||
CALL test%init()
|
||||
test%tolerance64=1.d-8
|
||||
!
|
||||
#if defined(__MPI)
|
||||
world_group = MPI_COMM_WORLD
|
||||
#endif
|
||||
CALL mp_world_start(world_group)
|
||||
!
|
||||
me_bgrp = mpime; root_bgrp=root; intra_bgrp_comm=world_comm
|
||||
!
|
||||
CALL parallel_real_1(test)
|
||||
!
|
||||
CALL parallel_complex_1(test)
|
||||
!
|
||||
CALL collect_results(test)
|
||||
!
|
||||
CALL mp_world_end()
|
||||
!
|
||||
IF (mpime .eq. 0) CALL test%print()
|
||||
!
|
||||
CONTAINS
|
||||
!
|
||||
SUBROUTINE parallel_real_1(test)
|
||||
USE cudafor
|
||||
USE mp_world, ONLY : mpime
|
||||
USE LAXlib
|
||||
USE descriptors, ONLY : la_descriptor, descla_init, descla_local_dims
|
||||
USE la_param, ONLY : DP
|
||||
USE test_io
|
||||
implicit none
|
||||
!
|
||||
TYPE(tester_t) :: test
|
||||
!
|
||||
TYPE(la_descriptor) :: desc
|
||||
integer :: ldh, n, m
|
||||
real(DP), allocatable :: h(:,:), hdst(:,:) !< full and distributed Hpsi
|
||||
real(DP), allocatable, device :: hdst_d(:,:) !< distributed Hpsi on device
|
||||
real(DP), allocatable :: h_save(:,:) !< full Hpsi, used to check consistence across calls
|
||||
real(DP), allocatable :: s(:,:), sdst(:,:) !< full and distributed Spsi
|
||||
real(DP), allocatable, device :: sdst_d(:,:) !< distributed Spsi on device
|
||||
real(DP), allocatable :: s_save(:,:) !< full Spsi, used to check consistence across calls
|
||||
real(DP), allocatable :: e(:) !< full set of eigenvalues
|
||||
real(DP), allocatable, device :: e_d(:) !< full set of eigenvalues
|
||||
real(DP), allocatable :: v(:,:), vdst(:,:) !< full and distributed eigenvectors
|
||||
real(DP), allocatable, device :: vdst_d(:,:) !< full and distributed eigenvectors
|
||||
real(DP), allocatable :: e_save(:) !< full set of eigenvalues, used for checks
|
||||
real(DP), allocatable :: v_save(:,:) !< full set of eigenvectors, used for checks
|
||||
!
|
||||
character(len=20) :: inputs(2)
|
||||
integer :: l, i, j, ii, jj, info, nrdst
|
||||
logical :: la_proc
|
||||
!
|
||||
inputs = ["ZnOG1.bin", "ZnOG2.bin"]
|
||||
!
|
||||
DO l=1, SIZE(inputs)
|
||||
!
|
||||
! Read the problem from specified input file.
|
||||
! this will also allocate(h, s, e, v)
|
||||
CALL read_problem(inputs(l), ldh, n, m, h, s, e, v, info)
|
||||
!
|
||||
IF (info /= 0) THEN
|
||||
IF (mpime == 0) print *, "Test with ", inputs(l), " skipped. Input not found."
|
||||
CYCLE
|
||||
END IF
|
||||
!
|
||||
ALLOCATE(h_save, SOURCE=h)
|
||||
ALLOCATE(s_save, SOURCE=s)
|
||||
ALLOCATE(e_save, SOURCE=e)
|
||||
ALLOCATE(v_save, SOURCE=v)
|
||||
!
|
||||
h_save = h
|
||||
s_save = s
|
||||
!
|
||||
CALL init_parallel_diag(desc, n)
|
||||
!
|
||||
IF( desc%active_node > 0 ) la_proc = .TRUE. ! selects processors involved in diagonalization
|
||||
nrdst = desc%nrcx ! stores square distributed matrix sizes
|
||||
IF (.not. la_proc) nrdst = 1 ! dummy value to avoid 0 allocations
|
||||
!
|
||||
v = (0.d0, 0.d0)
|
||||
e = 0.d0
|
||||
print *, nrdst, n, m
|
||||
ALLOCATE( hdst( nrdst , nrdst ), STAT=info )
|
||||
ALLOCATE( sdst( nrdst , nrdst ), STAT=info )
|
||||
ALLOCATE( vdst( nrdst , nrdst ), STAT=info )
|
||||
!
|
||||
IF (la_proc) THEN
|
||||
DO j = 1, desc%nc ! number of column in the local block of lambda
|
||||
DO i = 1, desc%nr ! number of row in the local block of lambda
|
||||
ii = i + desc%ir - 1 ! globla index of the first row in the local block of lambda
|
||||
jj = j + desc%ic - 1 ! global index of the first column in the local block of lambda
|
||||
hdst(i, j) = h(ii, jj)
|
||||
sdst(i, j) = s(ii, jj)
|
||||
END DO
|
||||
END DO
|
||||
END IF
|
||||
!
|
||||
ALLOCATE(hdst_d, SOURCE=hdst)
|
||||
ALLOCATE(sdst_d, SOURCE=sdst)
|
||||
ALLOCATE(vdst_d( nrdst , nrdst ), e_d(n))
|
||||
e(1:n) = 0.d0
|
||||
e_d(1:n) = 0.d0
|
||||
!
|
||||
CALL pdiaghg( n, hdst_d, sdst_d, nrdst, e_d, vdst_d, desc, .false. )
|
||||
!
|
||||
e(1:n) = e_d
|
||||
!
|
||||
DO j = 1, m
|
||||
!CALL test%assert_close( v(1:n, j), v_save(1:n, j))
|
||||
END DO
|
||||
CALL test%assert_close( e(1:m), e_save(1:m) )
|
||||
!
|
||||
!
|
||||
e(1:n) = 0.d0
|
||||
e_d(1:n) = 0.d0
|
||||
!
|
||||
CALL pdiaghg( n, hdst_d, sdst_d, nrdst, e_d, vdst_d, desc, .true. )
|
||||
!
|
||||
e(1:n) = e_d
|
||||
!
|
||||
DO j = 1, m
|
||||
!CALL test%assert_close( v(1:n, j), v_save(1:n, j))
|
||||
END DO
|
||||
CALL test%assert_close( e(1:m), e_save(1:m))
|
||||
!
|
||||
DEALLOCATE(h,s,e,v,h_save,s_save,e_save,v_save, hdst, sdst, vdst)
|
||||
DEALLOCATE(e_d, hdst_d, sdst_d, vdst_d)
|
||||
END DO
|
||||
!
|
||||
END SUBROUTINE parallel_real_1
|
||||
!
|
||||
SUBROUTINE parallel_complex_1(test)
|
||||
USE cudafor
|
||||
USE mp_world, ONLY : mpime
|
||||
USE descriptors, ONLY : la_descriptor, descla_init, descla_local_dims
|
||||
USE LAXlib
|
||||
USE la_param, ONLY : DP
|
||||
USE test_io
|
||||
implicit none
|
||||
!
|
||||
TYPE(tester_t) :: test
|
||||
!
|
||||
integer :: ldh, n, m
|
||||
complex(DP), allocatable :: h(:,:), hdst(:,:) !< full and distributed Hpsi
|
||||
complex(DP), allocatable, device :: hdst_d(:,:) !< distributed Hpsi on device
|
||||
complex(DP), allocatable :: h_save(:,:) !< full Hpsi, used to check consistence across calls
|
||||
complex(DP), allocatable :: s(:,:), sdst(:,:) !< full and distributed Spsi
|
||||
complex(DP), allocatable, device :: sdst_d(:,:) !< distributed Spsi on device
|
||||
complex(DP), allocatable :: s_save(:,:) !< full Spsi, used to check consistence across calls
|
||||
real(DP), allocatable :: e(:) !< full set of eigenvalues
|
||||
real(DP), allocatable, device :: e_d(:) !< full set of eigenvalues
|
||||
complex(DP), allocatable :: v(:,:), vdst(:,:) !< full and distributed eigenvectors
|
||||
complex(DP), allocatable, device :: vdst_d(:,:) !< full and distributed eigenvectors
|
||||
real(DP), allocatable :: e_save(:) !< full set of eigenvalues, used for checks
|
||||
complex(DP), allocatable :: v_save(:,:) !< full set of eigenvectors, used for checks
|
||||
TYPE(la_descriptor) :: desc
|
||||
!
|
||||
character(len=20) :: inputs(4)
|
||||
integer :: l, i, j, ii, jj, info, nrdst
|
||||
logical :: la_proc
|
||||
!
|
||||
inputs = ["ZnOK1.bin ", &
|
||||
"ZnOK2.bin ", &
|
||||
"SiGeK1.bin", &
|
||||
"SiGeK2.bin"]
|
||||
!
|
||||
DO l=1, SIZE(inputs)
|
||||
!
|
||||
! Read the problem from specified input file.
|
||||
! this will also allocate(h, s, e, v)
|
||||
CALL read_problem(inputs(l), ldh, n, m, h, s, e, v, info)
|
||||
!
|
||||
IF (info /= 0) THEN
|
||||
IF (mpime == 0) print *, "Test with ", inputs(l), " skipped. Input not found."
|
||||
CYCLE
|
||||
END IF
|
||||
!
|
||||
ALLOCATE(h_save, SOURCE=h)
|
||||
ALLOCATE(s_save, SOURCE=s)
|
||||
ALLOCATE(e_save, SOURCE=e)
|
||||
ALLOCATE(v_save, SOURCE=v)
|
||||
!
|
||||
h_save = h
|
||||
s_save = s
|
||||
!
|
||||
CALL init_parallel_diag(desc, n)
|
||||
!
|
||||
IF( desc%active_node > 0 ) la_proc = .TRUE.
|
||||
nrdst = desc%nrcx
|
||||
IF (.not. la_proc) nrdst = 1
|
||||
!
|
||||
v = (0.d0, 0.d0)
|
||||
e = 0.d0
|
||||
!
|
||||
ALLOCATE( hdst( nrdst , nrdst ), STAT=info )
|
||||
ALLOCATE( sdst( nrdst , nrdst ), STAT=info )
|
||||
ALLOCATE( vdst( nrdst , nrdst ), STAT=info )
|
||||
!
|
||||
IF (la_proc) THEN
|
||||
DO j = 1, desc%nc ! number of column in the local block of lambda
|
||||
DO i = 1, desc%nr ! number of row in the local block of lambda
|
||||
ii = i + desc%ir - 1 ! globla index of the first row in the local block of lambda
|
||||
jj = j + desc%ic - 1 ! global index of the first column in the local block of lambda
|
||||
hdst(i, j) = h(ii, jj)
|
||||
sdst(i, j) = s(ii, jj)
|
||||
END DO
|
||||
END DO
|
||||
END IF
|
||||
!
|
||||
ALLOCATE(hdst_d, SOURCE=hdst)
|
||||
ALLOCATE(sdst_d, SOURCE=sdst)
|
||||
ALLOCATE(vdst_d( nrdst , nrdst ), e_d(n))
|
||||
!
|
||||
e_d(1:n) = 0.d0
|
||||
CALL pdiaghg( n, hdst_d, sdst_d, nrdst, e_d, vdst_d, desc, .false. )
|
||||
e = e_d
|
||||
!
|
||||
DO j = 1, m
|
||||
!CALL test%assert_close( v(1:n, j), v_save(1:n, j))
|
||||
END DO
|
||||
CALL test%assert_close( e(1:m), e_save(1:m) )
|
||||
!
|
||||
!
|
||||
e_d(1:n) = 0.d0
|
||||
CALL pdiaghg( n, hdst_d, sdst_d, nrdst, e_d, vdst_d, desc, .true. )
|
||||
!
|
||||
e = e_d
|
||||
DO j = 1, m
|
||||
!CALL test%assert_close( v(1:n, j), v_save(1:n, j))
|
||||
END DO
|
||||
CALL test%assert_close( e(1:m), e_save(1:m))
|
||||
!
|
||||
DEALLOCATE(h,s,e,v,h_save,s_save,e_save,v_save, hdst, sdst, vdst)
|
||||
DEALLOCATE(e_d, hdst_d, sdst_d, vdst_d)
|
||||
END DO
|
||||
!
|
||||
END SUBROUTINE parallel_complex_1
|
||||
!
|
||||
SUBROUTINE init_parallel_diag(desc, n)
|
||||
|
||||
USE mp_world, ONLY : mpime, nproc, world_comm
|
||||
USE mp_diag, ONLY : ortho_parent_comm
|
||||
USE descriptors, ONLY : la_descriptor, descla_init, descla_local_dims
|
||||
USE LAXlib
|
||||
USE la_param, ONLY : DP
|
||||
implicit none
|
||||
!
|
||||
TYPE(la_descriptor) :: desc
|
||||
INTEGER :: n ! global dimension of the matrix
|
||||
!
|
||||
INTEGER :: ierr = 0
|
||||
INTEGER :: color, key
|
||||
!
|
||||
INTEGER :: np_ortho(2) = 1 ! size of the processor grid used in ortho
|
||||
INTEGER :: me_ortho(2) = 0 ! coordinates of the processors
|
||||
INTEGER :: me_ortho1 = 0 ! task id for the ortho group
|
||||
INTEGER :: nproc_ortho = 1 ! size of the ortho group:
|
||||
INTEGER :: ortho_comm = 0 ! communicator for the ortho group
|
||||
INTEGER :: ortho_row_comm = 0 ! communicator for the ortho row group
|
||||
INTEGER :: ortho_col_comm = 0 ! communicator for the ortho col group
|
||||
INTEGER :: ortho_comm_id = 0 ! id of the ortho_comm
|
||||
!
|
||||
ortho_parent_comm = world_comm
|
||||
!
|
||||
#if defined __MPI
|
||||
!
|
||||
CALL grid2d_dims( 'S', nproc, np_ortho(1), np_ortho(2) )
|
||||
!
|
||||
nproc_ortho = np_ortho(1) * np_ortho(2)
|
||||
!
|
||||
! here we choose the first "nproc_ortho" processors
|
||||
!
|
||||
color = 0
|
||||
IF( mpime < nproc_ortho ) color = 1
|
||||
!
|
||||
key = mpime
|
||||
!
|
||||
! initialize the communicator for the new group by splitting the input
|
||||
! communicator
|
||||
!
|
||||
CALL mpi_comm_split( MPI_COMM_WORLD , color, key, ortho_comm, ierr )
|
||||
!
|
||||
! Computes coordinates of the processors, in row maior order
|
||||
!
|
||||
CALL mpi_comm_rank( ortho_comm, me_ortho1, ierr)
|
||||
!
|
||||
IF( mpime == 0 .AND. me_ortho1 /= 0 ) &
|
||||
CALL lax_error__( " init_ortho_group ", " wrong root task in ortho group ", ierr )
|
||||
!
|
||||
if( color == 1 ) then
|
||||
! this task belong to the ortho_group compute its coordinates
|
||||
ortho_comm_id = 1
|
||||
CALL GRID2D_COORDS( 'R', me_ortho1, np_ortho(1), np_ortho(2), me_ortho(1), me_ortho(2) )
|
||||
CALL GRID2D_RANK( 'R', np_ortho(1), np_ortho(2), me_ortho(1), me_ortho(2), ierr )
|
||||
IF( ierr /= me_ortho1 ) &
|
||||
CALL lax_error__( " init_ortho_group ", " wrong task coordinates in ortho group ", ierr )
|
||||
IF( me_ortho1 /= mpime ) &
|
||||
CALL lax_error__( " init_ortho_group ", " wrong rank assignment in ortho group ", ierr )
|
||||
CALL mpi_comm_split( ortho_comm , me_ortho(2), me_ortho(1), ortho_col_comm, ierr )
|
||||
CALL mpi_comm_split( ortho_comm , me_ortho(1), me_ortho(2), ortho_row_comm, ierr )
|
||||
else
|
||||
! this task does NOT belong to the ortho_group set dummy values
|
||||
ortho_comm_id = 0
|
||||
me_ortho(1) = me_ortho1
|
||||
me_ortho(2) = me_ortho1
|
||||
endif
|
||||
#else
|
||||
ortho_comm_id = 1
|
||||
#endif
|
||||
CALL descla_init( desc, n, n, np_ortho, me_ortho, ortho_comm, -1, ortho_comm_id )
|
||||
|
||||
END SUBROUTINE init_parallel_diag
|
||||
|
||||
end program test_diaghg_gpu_4
|
||||
#else
|
||||
program test_diaghg_gpu_4
|
||||
end program test_diaghg_gpu_4
|
||||
#endif
|
|
@ -0,0 +1,76 @@
|
|||
!
|
||||
SUBROUTINE solve_with_zhegvd(n, v, s, ldh, e)
|
||||
USE la_param, ONLY : DP
|
||||
IMPLICIT NONE
|
||||
!
|
||||
complex(DP) :: v(ldh,n)
|
||||
complex(DP) :: s(ldh,n)
|
||||
real(DP) :: e(n)
|
||||
INTEGER :: n
|
||||
!
|
||||
INTEGER :: lwork, lrwork, liwork, info, ldh
|
||||
!
|
||||
REAL(DP) :: abstol
|
||||
INTEGER, ALLOCATABLE :: iwork(:), ifail(:)
|
||||
REAL(DP), ALLOCATABLE :: rwork(:)
|
||||
COMPLEX(DP), ALLOCATABLE :: work(:)
|
||||
! various work space
|
||||
|
||||
!
|
||||
ALLOCATE(work(1), rwork(1), iwork(1))
|
||||
CALL ZHEGVD( 1, 'V', 'U', n, v, ldh, &
|
||||
s, ldh, e, work, -1, rwork, -1, iwork, -1, info )
|
||||
|
||||
IF (info /= 0) print *, "Workspace not computed!"
|
||||
|
||||
lwork = work(1)
|
||||
lrwork = rwork(1)
|
||||
liwork = iwork(1)
|
||||
|
||||
DEALLOCATE(work, rwork, iwork)
|
||||
ALLOCATE(work(lwork), rwork(lrwork), iwork(liwork))
|
||||
|
||||
CALL ZHEGVD( 1, 'V', 'U', n, v, ldh, &
|
||||
s, ldh, e, work, lwork, rwork, lrwork, iwork, liwork, info )
|
||||
|
||||
DEALLOCATE(work, rwork, iwork)
|
||||
!
|
||||
END SUBROUTINE solve_with_zhegvd
|
||||
!
|
||||
SUBROUTINE solve_with_dsygvd(n, v, s, ldh, e)
|
||||
USE la_param, ONLY : DP
|
||||
IMPLICIT NONE
|
||||
!
|
||||
REAL(DP) :: v(ldh,n)
|
||||
REAL(DP) :: s(ldh,n)
|
||||
real(DP) :: e(n)
|
||||
INTEGER :: n
|
||||
!
|
||||
INTEGER :: lwork, liwork, info, ldh
|
||||
!
|
||||
REAL(DP) :: abstol
|
||||
INTEGER, ALLOCATABLE :: iwork(:), ifail(:)
|
||||
REAL(DP), ALLOCATABLE :: work(:)
|
||||
! various work space
|
||||
|
||||
!
|
||||
ALLOCATE(work(1), iwork(1))
|
||||
CALL dsygvd( 1, 'V', 'U', n, v, ldh, &
|
||||
s, ldh, e, work, -1, iwork, -1, info )
|
||||
|
||||
IF (info /= 0) print *, "Workspace not computed!"
|
||||
|
||||
lwork = work(1)
|
||||
liwork = iwork(1)
|
||||
|
||||
DEALLOCATE(work, iwork)
|
||||
ALLOCATE(work(lwork), iwork(liwork))
|
||||
!
|
||||
CALL dsygvd( 1, 'V', 'U', n, v, ldh, &
|
||||
s, ldh, e, work, lwork, iwork, liwork, info )
|
||||
!
|
||||
DEALLOCATE(work, iwork)
|
||||
!
|
||||
END SUBROUTINE solve_with_dsygvd
|
||||
!
|
||||
! TODO: add check for eigenvalue probelm
|
|
@ -0,0 +1,104 @@
|
|||
MODULE test_io
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
INTERFACE read_problem
|
||||
MODULE PROCEDURE read_cmplx_problem, read_real_problem
|
||||
END INTERFACE
|
||||
!
|
||||
CONTAINS
|
||||
!
|
||||
SUBROUTINE read_cmplx_problem(fname, ldh, n, m, h, s, e, v, info)
|
||||
USE la_param, ONLY : DP
|
||||
IMPLICIT NONE
|
||||
character(len=*), intent(in) :: fname
|
||||
integer, intent(out) :: ldh, n, m
|
||||
complex(dp), allocatable, intent(inout) :: h(:,:)
|
||||
complex(dp), allocatable, intent(inout) :: s(:,:)
|
||||
real(dp), allocatable, intent(inout) :: e(:)
|
||||
complex(dp), allocatable, intent(inout) :: v(:,:)
|
||||
integer, intent(out) :: info
|
||||
!
|
||||
real(dp) :: aux1, aux2
|
||||
integer :: i, j, t
|
||||
logical :: exist
|
||||
!
|
||||
character(len=20):: fname_
|
||||
!
|
||||
info = 0
|
||||
fname_ = trim(fname)
|
||||
!
|
||||
print *, "reading ", fname_
|
||||
inquire(file=fname_, exist=exist)
|
||||
!
|
||||
if (.not. exist ) then
|
||||
info=1
|
||||
return
|
||||
end if
|
||||
!
|
||||
open (unit=15, file=fname_, status='old', form='unformatted', action='read')
|
||||
!
|
||||
t = 0
|
||||
read (15) n
|
||||
read (15) m
|
||||
read (15) ldh
|
||||
!
|
||||
t = t + 3
|
||||
!
|
||||
ALLOCATE(h(ldh, n), s(ldh, n), e(n), v(ldh, m))
|
||||
!
|
||||
READ(15) h
|
||||
READ(15) s
|
||||
READ(15) e
|
||||
READ(15) v
|
||||
!
|
||||
close(15)
|
||||
END SUBROUTINE read_cmplx_problem
|
||||
!
|
||||
SUBROUTINE read_real_problem(fname, ldh, n, m, h, s, e, v, info)
|
||||
USE la_param, ONLY : DP
|
||||
IMPLICIT NONE
|
||||
character(len=*), intent(in) :: fname
|
||||
integer, intent(out) :: ldh, n, m
|
||||
real(dp), allocatable, intent(inout) :: h(:,:)
|
||||
real(dp), allocatable, intent(inout) :: s(:,:)
|
||||
real(dp), allocatable, intent(inout) :: e(:)
|
||||
real(dp), allocatable, intent(inout) :: v(:,:)
|
||||
integer, intent(out) :: info
|
||||
!
|
||||
real(dp) :: aux1, aux2
|
||||
integer :: i, j, t
|
||||
logical :: exist
|
||||
!
|
||||
character(len=20):: fname_
|
||||
!
|
||||
info = 0
|
||||
fname_ = trim(fname)
|
||||
!
|
||||
print *, "reading ", fname_
|
||||
inquire(file=fname_, exist=exist)
|
||||
!
|
||||
if (.not. exist ) then
|
||||
info=1
|
||||
return
|
||||
end if
|
||||
!
|
||||
open (unit=15, file=fname_, status='old', form='unformatted', action='read')
|
||||
!
|
||||
t = 0
|
||||
read (15) n
|
||||
read (15) m
|
||||
read (15) ldh
|
||||
!
|
||||
t = t + 3
|
||||
!
|
||||
ALLOCATE(h(ldh, n), s(ldh, n), e(n), v(ldh, m))
|
||||
!
|
||||
READ(15) h
|
||||
READ(15) s
|
||||
READ(15) e
|
||||
READ(15) v
|
||||
!
|
||||
close(15)
|
||||
END SUBROUTINE read_real_problem
|
||||
END MODULE test_io
|
|
@ -0,0 +1,862 @@
|
|||
! This file is part of fortran_tester
|
||||
! Copyright 2015 Pierre de Buyl and Peter Colberg
|
||||
! 2016 Pierre de Buyl and Stefano Szaghi
|
||||
! 2018 Pierre de Buyl and Pietro Bonfa
|
||||
! License: BSD
|
||||
|
||||
!> Routines to test Fortran programs
|
||||
!!
|
||||
!! fortran_tester is a pure-Fortran module. It provides a datatype to hold test results and
|
||||
!! routines to test for equality, closeness, and positivity of variables. The routines are
|
||||
!! overloaded and the resulting interface consists of a small number of names.
|
||||
|
||||
module tester
|
||||
use, intrinsic :: iso_fortran_env, only : int8, int16, int32, int64, real32, real64
|
||||
|
||||
implicit none
|
||||
private
|
||||
public :: tester_t
|
||||
|
||||
!> The main **tester** class.
|
||||
type :: tester_t
|
||||
integer(int32) :: n_errors=0_int32 !< Number of errors.
|
||||
integer(int32) :: n_tests=0_int32 !< Number of tests.
|
||||
real(real32) :: tolerance32=2._real32*epsilon(1._real32) !< Real tolerance, 32 bits.
|
||||
real(real64) :: tolerance64=2._real64*epsilon(1._real64) !< Real tolerance, 64 bits.
|
||||
contains
|
||||
procedure :: init !< Initialize the tester.
|
||||
procedure :: print !< Print tests results.
|
||||
generic, public :: assert_equal => &
|
||||
assert_equal_i8, &
|
||||
assert_equal_i16, &
|
||||
assert_equal_i32, &
|
||||
assert_equal_i64, &
|
||||
assert_equal_r32, &
|
||||
assert_equal_r64, &
|
||||
assert_equal_c32, &
|
||||
assert_equal_c64, &
|
||||
assert_equal_l, &
|
||||
assert_equal_i8_1, &
|
||||
assert_equal_i16_1, &
|
||||
assert_equal_i32_1, &
|
||||
assert_equal_i64_1, &
|
||||
assert_equal_r32_1, &
|
||||
assert_equal_r64_1, &
|
||||
assert_equal_c32_1, &
|
||||
assert_equal_c64_1, &
|
||||
assert_equal_l_1 !< Check if two values (integer, real, complex or logical) are equal.
|
||||
procedure, private :: assert_equal_i8 !< Check if two integers (8 bits) are equal.
|
||||
procedure, private :: assert_equal_i16 !< Check if two integers (16 bits) are equal.
|
||||
procedure, private :: assert_equal_i32 !< Check if two integers (32 bits) are equal.
|
||||
procedure, private :: assert_equal_i64 !< Check if two integers (64 bits) are equal.
|
||||
procedure, private :: assert_equal_r32 !< Check if two reals (32 bits) are equal.
|
||||
procedure, private :: assert_equal_r64 !< Check if two reals (64 bits) are equal.
|
||||
procedure, private :: assert_equal_c32 !< Check if two complex numbers (32 bits) are equal.
|
||||
procedure, private :: assert_equal_c64 !< Check if two complex numbers (64 bits) are equal.
|
||||
procedure, private :: assert_equal_l !< Check if two logicals are equal.
|
||||
procedure, private :: assert_equal_i8_1 !< Check if two integer (8 bits) arrays (rank 1) are equal.
|
||||
procedure, private :: assert_equal_i16_1 !< Check if two integer (16 bits) arrays (rank 1) are equal.
|
||||
procedure, private :: assert_equal_i32_1 !< Check if two integer (32 bits) arrays (rank 1) are equal.
|
||||
procedure, private :: assert_equal_i64_1 !< Check if two integer (64 bits) arrays (rank 1) are equal.
|
||||
procedure, private :: assert_equal_r32_1 !< Check if two real (32 bits) arrays (rank 1) are equal.
|
||||
procedure, private :: assert_equal_r64_1 !< Check if two real (64 bits) arrays (rank 1) are equal.
|
||||
procedure, private :: assert_equal_c32_1 !< Check if two complex (32 bits) arrays (rank 1) are equal.
|
||||
procedure, private :: assert_equal_c64_1 !< Check if two complex (64 bits) arrays (rank 1) are equal.
|
||||
procedure, private :: assert_equal_l_1 !< Check if two logical arrays (rank 1) are equal.
|
||||
generic, public :: assert_positive => &
|
||||
assert_positive_i8, &
|
||||
assert_positive_i16, &
|
||||
assert_positive_i32, &
|
||||
assert_positive_i64, &
|
||||
assert_positive_r32, &
|
||||
assert_positive_r64, &
|
||||
assert_positive_i8_1, &
|
||||
assert_positive_i16_1, &
|
||||
assert_positive_i32_1, &
|
||||
assert_positive_i64_1, &
|
||||
assert_positive_r32_1, &
|
||||
assert_positive_r64_1 !< Check if a number (integer or real) is positive.
|
||||
procedure, private :: assert_positive_i8 !< Check if a integer (8 bits) is positive.
|
||||
procedure, private :: assert_positive_i16 !< Check if a integer (16 bits) is positive.
|
||||
procedure, private :: assert_positive_i32 !< Check if a integer (32 bits) is positive.
|
||||
procedure, private :: assert_positive_i64 !< Check if a integer (64 bits) is positive.
|
||||
procedure, private :: assert_positive_r32 !< Check if a real (32 bits) is positive.
|
||||
procedure, private :: assert_positive_r64 !< Check if a real (64 bits) is positive.
|
||||
procedure, private :: assert_positive_i8_1 !< Check if a integer (8 bits) array (rank 1) is positive.
|
||||
procedure, private :: assert_positive_i16_1 !< Check if a integer (16 bits) array (rank 1) is positive.
|
||||
procedure, private :: assert_positive_i32_1 !< Check if a integer (32 bits) array (rank 1) is positive.
|
||||
procedure, private :: assert_positive_i64_1 !< Check if a integer (64 bits) array (rank 1) is positive.
|
||||
procedure, private :: assert_positive_r32_1 !< Check if a real (32 bits) array (rank 1) is positive.
|
||||
procedure, private :: assert_positive_r64_1 !< Check if a real (64 bits) array (rank 1) is positive.
|
||||
generic, public :: assert_close => &
|
||||
assert_close_r32, &
|
||||
assert_close_r64, &
|
||||
assert_close_c32, &
|
||||
assert_close_c64, &
|
||||
assert_close_r32_1, &
|
||||
assert_close_r64_1, &
|
||||
assert_close_c32_1, &
|
||||
assert_close_c64_1 !< Check if two values (real or complex) are close with respect a tolerance.
|
||||
procedure, private :: assert_close_r32 !< Check if two reals (32 bits) are close with respect a tolerance.
|
||||
procedure, private :: assert_close_r64 !< Check if two reals (64 bits) are close with respect a tolerance.
|
||||
procedure, private :: assert_close_c32 !< Check if two complex numbers (32 bits) are close with respect a tolerance.
|
||||
procedure, private :: assert_close_c64 !< Check if two complex numbers (64 bits) are close with respect a tolerance.
|
||||
procedure, private :: assert_close_r32_1 !< Check if two real (32 bits) arrays (rank 1) are close with respect a tolerance.
|
||||
procedure, private :: assert_close_r64_1 !< Check if two real (64 bits) arrays (rank 1) are close with respect a tolerance.
|
||||
procedure, private :: assert_close_c32_1 !< Check if two complex (32 bits) arrays (rank 1) are close with respect a tolerance.
|
||||
procedure, private :: assert_close_c64_1 !< Check if two complex (64 bits) arrays (rank 1) are close with respect a tolerance.
|
||||
end type tester_t
|
||||
|
||||
contains
|
||||
|
||||
!> Initialize the tester.
|
||||
subroutine init(this, tolerance32, tolerance64)
|
||||
class(tester_t), intent(out) :: this !< The tester.
|
||||
real(real32), intent(in), optional :: tolerance32 !< Real tolerance, 32 bits.
|
||||
real(real64), intent(in), optional :: tolerance64 !< Real tolerance, 64 bits.
|
||||
|
||||
this% n_errors = 0
|
||||
this% n_tests = 0
|
||||
|
||||
if (present(tolerance64)) then
|
||||
this% tolerance64 = tolerance64
|
||||
else
|
||||
this% tolerance64 = 2._real64*epsilon(1._real64)
|
||||
end if
|
||||
|
||||
if (present(tolerance32)) then
|
||||
this% tolerance32 = tolerance32
|
||||
else
|
||||
this% tolerance32 = 2._real32*epsilon(1._real32)
|
||||
end if
|
||||
|
||||
end subroutine init
|
||||
|
||||
!> Print tests results.
|
||||
subroutine print(this, errorstop)
|
||||
class(tester_t), intent(in) :: this !< The tester.
|
||||
logical, intent(in), optional :: errorstop !< Flag to activate error stop if one test fails.
|
||||
|
||||
logical :: do_errorstop
|
||||
if (present(errorstop)) then
|
||||
do_errorstop = errorstop
|
||||
else
|
||||
do_errorstop = .true.
|
||||
end if
|
||||
|
||||
write(*,*) 'fortran_tester:', this% n_errors, ' error(s) for', this% n_tests, 'test(s)'
|
||||
|
||||
if (this% n_errors == 0) then
|
||||
write(*,*) 'fortran_tester: all tests succeeded'
|
||||
else
|
||||
write(*,*) 'fortran_tester: tests failed'
|
||||
if (do_errorstop) then
|
||||
stop 1
|
||||
end if
|
||||
end if
|
||||
|
||||
end subroutine print
|
||||
|
||||
!> Check if two integers (8 bits) are equal.
|
||||
subroutine assert_equal_i8(this, i1, i2, fail)
|
||||
class(tester_t), intent(inout) :: this !< The tester.
|
||||
integer(int8), intent(in) :: i1 !< Value to compare.
|
||||
integer(int8), intent(in) :: i2 !< Value to compare.
|
||||
logical, intent(in), optional :: fail !< Fail flag.
|
||||
|
||||
this% n_tests = this% n_tests + 1
|
||||
if (i1 .ne. i2) then
|
||||
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
|
||||
this% n_errors = this% n_errors + 1
|
||||
end if
|
||||
end if
|
||||
|
||||
end subroutine assert_equal_i8
|
||||
|
||||
!> Check if two integers (16 bits) are equal.
|
||||
subroutine assert_equal_i16(this, i1, i2, fail)
|
||||
class(tester_t), intent(inout) :: this !< The tester.
|
||||
integer(int16), intent(in) :: i1 !< Value to compare.
|
||||
integer(int16), intent(in) :: i2 !< Value to compare.
|
||||
logical, intent(in), optional :: fail !< Fail flag.
|
||||
|
||||
this% n_tests = this% n_tests + 1
|
||||
if (i1 .ne. i2) then
|
||||
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
|
||||
this% n_errors = this% n_errors + 1
|
||||
end if
|
||||
end if
|
||||
|
||||
end subroutine assert_equal_i16
|
||||
|
||||
!> Check if two integers (32 bits) are equal.
|
||||
subroutine assert_equal_i32(this, i1, i2, fail)
|
||||
class(tester_t), intent(inout) :: this !< The tester.
|
||||
integer(int32), intent(in) :: i1 !< Value to compare.
|
||||
integer(int32), intent(in) :: i2 !< Value to compare.
|
||||
logical, intent(in), optional :: fail !< Fail flag.
|
||||
|
||||
this% n_tests = this% n_tests + 1
|
||||
if (i1 .ne. i2) then
|
||||
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
|
||||
this% n_errors = this% n_errors + 1
|
||||
end if
|
||||
end if
|
||||
|
||||
end subroutine assert_equal_i32
|
||||
|
||||
!> Check if two integers (64 bits) are equal.
|
||||
subroutine assert_equal_i64(this, i1, i2, fail)
|
||||
class(tester_t), intent(inout) :: this !< The tester.
|
||||
integer(int64), intent(in) :: i1 !< Value to compare.
|
||||
integer(int64), intent(in) :: i2 !< Value to compare.
|
||||
logical, intent(in), optional :: fail !< Fail flag.
|
||||
|
||||
this% n_tests = this% n_tests + 1
|
||||
if (i1 .ne. i2) then
|
||||
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
|
||||
this% n_errors = this% n_errors + 1
|
||||
end if
|
||||
end if
|
||||
|
||||
end subroutine assert_equal_i64
|
||||
|
||||
!> Check if two reals (32 bits) are equal.
|
||||
subroutine assert_equal_r32(this, r1, r2, fail)
|
||||
class(tester_t), intent(inout) :: this !< The tester.
|
||||
real(real32), intent(in) :: r1 !< Value to compare.
|
||||
real(real32), intent(in) :: r2 !< Value to compare.
|
||||
logical, intent(in), optional :: fail !< Fail flag.
|
||||
|
||||
this% n_tests = this% n_tests + 1
|
||||
if (r1 .ne. r2) then
|
||||
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
|
||||
this% n_errors = this% n_errors + 1
|
||||
end if
|
||||
end if
|
||||
|
||||
end subroutine assert_equal_r32
|
||||
|
||||
!> Check if two reals (64 bits) are equal.
|
||||
subroutine assert_equal_r64(this, r1, r2, fail)
|
||||
class(tester_t), intent(inout) :: this !< The tester.
|
||||
real(real64), intent(in) :: r1 !< Value to compare.
|
||||
real(real64), intent(in) :: r2 !< Value to compare.
|
||||
logical, intent(in), optional :: fail !< Fail flag.
|
||||
|
||||
this% n_tests = this% n_tests + 1
|
||||
if (r1 .ne. r2) then
|
||||
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
|
||||
this% n_errors = this% n_errors + 1
|
||||
end if
|
||||
end if
|
||||
|
||||
end subroutine assert_equal_r64
|
||||
|
||||
!> Check if two complex numbers (32 bits) are equal.
|
||||
subroutine assert_equal_c32(this, c1, c2, fail)
|
||||
class(tester_t), intent(inout) :: this !< The tester.
|
||||
complex(real32), intent(in) :: c1 !< Value to compare.
|
||||
complex(real32), intent(in) :: c2 !< Value to compare.
|
||||
logical, intent(in), optional :: fail !< Fail flag.
|
||||
|
||||
this% n_tests = this% n_tests + 1
|
||||
if (c1 .ne. c2) then
|
||||
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
|
||||
this% n_errors = this% n_errors + 1
|
||||
end if
|
||||
end if
|
||||
|
||||
end subroutine assert_equal_c32
|
||||
|
||||
!> Check if two complex numbers (64 bits) are equal.
|
||||
subroutine assert_equal_c64(this, c1, c2, fail)
|
||||
class(tester_t), intent(inout) :: this !< The tester.
|
||||
complex(real64), intent(in) :: c1 !< Value to compare.
|
||||
complex(real64), intent(in) :: c2 !< Value to compare.
|
||||
logical, intent(in), optional :: fail !< Fail flag.
|
||||
|
||||
this% n_tests = this% n_tests + 1
|
||||
if (c1 .ne. c2) then
|
||||
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
|
||||
this% n_errors = this% n_errors + 1
|
||||
end if
|
||||
end if
|
||||
|
||||
end subroutine assert_equal_c64
|
||||
|
||||
!> Check if two logicals are equal.
|
||||
subroutine assert_equal_l(this, l1, l2, fail)
|
||||
class(tester_t), intent(inout) :: this !< The tester.
|
||||
logical, intent(in) :: l1 !< Value to compare.
|
||||
logical, intent(in) :: l2 !< Value to compare.
|
||||
logical, intent(in), optional :: fail !< Fail flag.
|
||||
|
||||
this% n_tests = this% n_tests + 1
|
||||
if (l1 .neqv. l2) then
|
||||
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
|
||||
this% n_errors = this% n_errors + 1
|
||||
end if
|
||||
end if
|
||||
|
||||
end subroutine assert_equal_l
|
||||
|
||||
!> Check if two integer (8 bits) arrays (rank 1) are equal.
|
||||
subroutine assert_equal_i8_1(this, i1, i2, fail)
|
||||
class(tester_t), intent(inout) :: this !< The tester.
|
||||
integer(int8), dimension(:), intent(in) :: i1 !< Value to compare.
|
||||
integer(int8), dimension(:), intent(in) :: i2 !< Value to compare.
|
||||
logical, intent(in), optional :: fail !< Fail flag.
|
||||
|
||||
this% n_tests = this% n_tests + 1
|
||||
|
||||
if ( size(i1) .ne. size(i2) ) then
|
||||
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
|
||||
this% n_errors = this% n_errors + 1
|
||||
end if
|
||||
else
|
||||
if ( maxval(abs(i1-i2)) > 0 ) then
|
||||
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
|
||||
this% n_errors = this% n_errors + 1
|
||||
end if
|
||||
end if
|
||||
end if
|
||||
|
||||
end subroutine assert_equal_i8_1
|
||||
|
||||
!> Check if two integer (16 bits) arrays (rank 1) are equal.
|
||||
subroutine assert_equal_i16_1(this, i1, i2, fail)
|
||||
class(tester_t), intent(inout) :: this !< The tester.
|
||||
integer(int16), dimension(:), intent(in) :: i1 !< Value to compare.
|
||||
integer(int16), dimension(:), intent(in) :: i2 !< Value to compare.
|
||||
logical, intent(in), optional :: fail !< Fail flag.
|
||||
|
||||
this% n_tests = this% n_tests + 1
|
||||
|
||||
if ( size(i1) .ne. size(i2) ) then
|
||||
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
|
||||
this% n_errors = this% n_errors + 1
|
||||
end if
|
||||
else
|
||||
if ( maxval(abs(i1-i2)) > 0 ) then
|
||||
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
|
||||
this% n_errors = this% n_errors + 1
|
||||
end if
|
||||
end if
|
||||
end if
|
||||
|
||||
end subroutine assert_equal_i16_1
|
||||
|
||||
!> Check if two integer (32 bits) arrays (rank 1) are equal.
|
||||
subroutine assert_equal_i32_1(this, i1, i2, fail)
|
||||
class(tester_t), intent(inout) :: this !< The tester.
|
||||
integer(int32), dimension(:), intent(in) :: i1 !< Value to compare.
|
||||
integer(int32), dimension(:), intent(in) :: i2 !< Value to compare.
|
||||
logical, intent(in), optional :: fail !< Fail flag.
|
||||
|
||||
this% n_tests = this% n_tests + 1
|
||||
|
||||
if ( size(i1) .ne. size(i2) ) then
|
||||
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
|
||||
this% n_errors = this% n_errors + 1
|
||||
end if
|
||||
else
|
||||
if ( maxval(abs(i1-i2)) > 0 ) then
|
||||
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
|
||||
this% n_errors = this% n_errors + 1
|
||||
end if
|
||||
end if
|
||||
end if
|
||||
|
||||
end subroutine assert_equal_i32_1
|
||||
|
||||
!> Check if two integer (64 bits) arrays (rank 1) are equal.
|
||||
subroutine assert_equal_i64_1(this, i1, i2, fail)
|
||||
class(tester_t), intent(inout) :: this !< The tester.
|
||||
integer(int64), dimension(:), intent(in) :: i1 !< Value to compare.
|
||||
integer(int64), dimension(:), intent(in) :: i2 !< Value to compare.
|
||||
logical, intent(in), optional :: fail !< Fail flag.
|
||||
|
||||
this% n_tests = this% n_tests + 1
|
||||
|
||||
if ( size(i1) .ne. size(i2) ) then
|
||||
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
|
||||
this% n_errors = this% n_errors + 1
|
||||
end if
|
||||
else
|
||||
if ( maxval(abs(i1-i2)) > 0 ) then
|
||||
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
|
||||
this% n_errors = this% n_errors + 1
|
||||
end if
|
||||
end if
|
||||
end if
|
||||
|
||||
end subroutine assert_equal_i64_1
|
||||
|
||||
!> Check if two real (32 bits) arrays (rank 1) are equal.
|
||||
subroutine assert_equal_r32_1(this, r1, r2, fail)
|
||||
class(tester_t), intent(inout) :: this !< The tester.
|
||||
real(real32), dimension(:), intent(in) :: r1 !< Value to compare.
|
||||
real(real32), dimension(:), intent(in) :: r2 !< Value to compare.
|
||||
logical, intent(in), optional :: fail !< Fail flag.
|
||||
|
||||
this% n_tests = this% n_tests + 1
|
||||
|
||||
if ( size(r1) .ne. size(r2) ) then
|
||||
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
|
||||
this% n_errors = this% n_errors + 1
|
||||
end if
|
||||
else
|
||||
if ( maxval(abs(r1-r2)) > 0 ) then
|
||||
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
|
||||
this% n_errors = this% n_errors + 1
|
||||
end if
|
||||
end if
|
||||
end if
|
||||
|
||||
end subroutine assert_equal_r32_1
|
||||
|
||||
!> Check if two real (64 bits) arrays (rank 1) are equal.
|
||||
subroutine assert_equal_r64_1(this, r1, r2, fail)
|
||||
class(tester_t), intent(inout) :: this !< The tester.
|
||||
real(real64), dimension(:), intent(in) :: r1 !< Value to compare.
|
||||
real(real64), dimension(:), intent(in) :: r2 !< Value to compare.
|
||||
logical, intent(in), optional :: fail !< Fail flag.
|
||||
|
||||
this% n_tests = this% n_tests + 1
|
||||
|
||||
if ( size(r1) .ne. size(r2) ) then
|
||||
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
|
||||
this% n_errors = this% n_errors + 1
|
||||
end if
|
||||
else
|
||||
if ( maxval(abs(r1-r2)) > 0 ) then
|
||||
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
|
||||
this% n_errors = this% n_errors + 1
|
||||
end if
|
||||
end if
|
||||
end if
|
||||
|
||||
end subroutine assert_equal_r64_1
|
||||
|
||||
!> Check if two complex (32 bits) arrays (rank 1) are equal.
|
||||
subroutine assert_equal_c32_1(this, c1, c2, fail)
|
||||
class(tester_t), intent(inout) :: this !< The tester.
|
||||
complex(real32), dimension(:), intent(in) :: c1 !< Value to compare.
|
||||
complex(real32), dimension(:), intent(in) :: c2 !< Value to compare.
|
||||
logical, intent(in), optional :: fail !< Fail flag.
|
||||
|
||||
this% n_tests = this% n_tests + 1
|
||||
|
||||
if ( size(c1) .ne. size(c2) ) then
|
||||
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
|
||||
this% n_errors = this% n_errors + 1
|
||||
end if
|
||||
else
|
||||
if ( maxval(abs(c1-c2)) > 0 ) then
|
||||
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
|
||||
this% n_errors = this% n_errors + 1
|
||||
end if
|
||||
end if
|
||||
end if
|
||||
|
||||
end subroutine assert_equal_c32_1
|
||||
|
||||
!> Check if two complex (64 bits) arrays (rank 1) are equal.
|
||||
subroutine assert_equal_c64_1(this, c1, c2, fail)
|
||||
class(tester_t), intent(inout) :: this !< The tester.
|
||||
complex(real64), dimension(:), intent(in) :: c1 !< Value to compare.
|
||||
complex(real64), dimension(:), intent(in) :: c2 !< Value to compare.
|
||||
logical, intent(in), optional :: fail !< Fail flag.
|
||||
|
||||
this% n_tests = this% n_tests + 1
|
||||
|
||||
if ( size(c1) .ne. size(c2) ) then
|
||||
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
|
||||
this% n_errors = this% n_errors + 1
|
||||
end if
|
||||
else
|
||||
if ( maxval(abs(c1-c2)) > 0 ) then
|
||||
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
|
||||
this% n_errors = this% n_errors + 1
|
||||
end if
|
||||
end if
|
||||
end if
|
||||
|
||||
end subroutine assert_equal_c64_1
|
||||
|
||||
!> Check if two logical arrays (rank 1) are equal.
|
||||
subroutine assert_equal_l_1(this, l1, l2, fail)
|
||||
class(tester_t), intent(inout) :: this !< The tester.
|
||||
logical, intent(in), dimension(:) :: l1 !< Value to compare.
|
||||
logical, intent(in), dimension(:) :: l2 !< Value to compare.
|
||||
logical, intent(in), optional :: fail !< Fail flag.
|
||||
|
||||
integer :: k
|
||||
|
||||
this% n_tests = this% n_tests + 1
|
||||
|
||||
if ( size(l1) .ne. size(l2) ) then
|
||||
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
|
||||
this% n_errors = this% n_errors + 1
|
||||
end if
|
||||
else
|
||||
do k = 1, size(l1)
|
||||
if (l1(k) .neqv. l2(k)) then
|
||||
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
|
||||
this% n_errors = this% n_errors + 1
|
||||
end if
|
||||
exit
|
||||
end if
|
||||
end do
|
||||
end if
|
||||
|
||||
end subroutine assert_equal_l_1
|
||||
|
||||
!> Check if a integer (32 bits) is positive.
|
||||
subroutine assert_positive_i8(this, i, fail)
|
||||
class(tester_t), intent(inout) :: this !< The tester.
|
||||
integer(int8), intent(in) :: i !< Value to check.
|
||||
logical, intent(in), optional :: fail !< Fail flag.
|
||||
|
||||
this% n_tests = this% n_tests + 1
|
||||
if (i < 0) then
|
||||
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
|
||||
this% n_errors = this% n_errors + 1
|
||||
end if
|
||||
end if
|
||||
|
||||
end subroutine assert_positive_i8
|
||||
|
||||
!> Check if a integer (16 bits) is positive.
|
||||
subroutine assert_positive_i16(this, i, fail)
|
||||
class(tester_t), intent(inout) :: this !< The tester.
|
||||
integer(int16), intent(in) :: i !< Value to check.
|
||||
logical, intent(in), optional :: fail !< Fail flag.
|
||||
|
||||
this% n_tests = this% n_tests + 1
|
||||
if (i < 0) then
|
||||
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
|
||||
this% n_errors = this% n_errors + 1
|
||||
end if
|
||||
end if
|
||||
|
||||
end subroutine assert_positive_i16
|
||||
|
||||
!> Check if a integer (32 bits) is positive.
|
||||
subroutine assert_positive_i32(this, i, fail)
|
||||
class(tester_t), intent(inout) :: this !< The tester.
|
||||
integer(int32), intent(in) :: i !< Value to check.
|
||||
logical, intent(in), optional :: fail !< Fail flag.
|
||||
|
||||
this% n_tests = this% n_tests + 1
|
||||
if (i < 0) then
|
||||
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
|
||||
this% n_errors = this% n_errors + 1
|
||||
end if
|
||||
end if
|
||||
|
||||
end subroutine assert_positive_i32
|
||||
|
||||
!> Check if a integer (32 bits) is positive.
|
||||
subroutine assert_positive_i64(this, i, fail)
|
||||
class(tester_t), intent(inout) :: this !< The tester.
|
||||
integer(int64), intent(in) :: i !< Value to check.
|
||||
logical, intent(in), optional :: fail !< Fail flag.
|
||||
|
||||
this% n_tests = this% n_tests + 1
|
||||
if (i < 0) then
|
||||
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
|
||||
this% n_errors = this% n_errors + 1
|
||||
end if
|
||||
end if
|
||||
|
||||
end subroutine assert_positive_i64
|
||||
|
||||
!> Check if a real (32 bits) is positive.
|
||||
subroutine assert_positive_r32(this, r, fail)
|
||||
class(tester_t), intent(inout) :: this !< The tester.
|
||||
real(real32), intent(in) :: r !< Value to check.
|
||||
logical, intent(in), optional :: fail !< Fail flag.
|
||||
|
||||
this% n_tests = this% n_tests + 1
|
||||
if (r < 0) then
|
||||
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
|
||||
this% n_errors = this% n_errors + 1
|
||||
end if
|
||||
end if
|
||||
|
||||
end subroutine assert_positive_r32
|
||||
|
||||
!> Check if a real (64 bits) is positive.
|
||||
subroutine assert_positive_r64(this, r, fail)
|
||||
class(tester_t), intent(inout) :: this !< The tester.
|
||||
real(real64), intent(in) :: r !< Value to check.
|
||||
logical, intent(in), optional :: fail !< Fail flag.
|
||||
|
||||
this% n_tests = this% n_tests + 1
|
||||
if (r < 0) then
|
||||
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
|
||||
this% n_errors = this% n_errors + 1
|
||||
end if
|
||||
end if
|
||||
|
||||
end subroutine assert_positive_r64
|
||||
|
||||
!> Check if a integer (8 bits) array (rank 1) is positive.
|
||||
subroutine assert_positive_i8_1(this, i, fail)
|
||||
class(tester_t), intent(inout) :: this !< The tester.
|
||||
integer(int8), dimension(:), intent(in) :: i !< Value to check.
|
||||
logical, intent(in), optional :: fail !< Fail flag.
|
||||
|
||||
this% n_tests = this% n_tests + 1
|
||||
|
||||
if ( minval(i) < 0 ) then
|
||||
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
|
||||
this% n_errors = this% n_errors + 1
|
||||
end if
|
||||
end if
|
||||
|
||||
end subroutine assert_positive_i8_1
|
||||
|
||||
!> Check if a integer (16 bits) array (rank 1) is positive.
|
||||
subroutine assert_positive_i16_1(this, i, fail)
|
||||
class(tester_t), intent(inout) :: this !< The tester.
|
||||
integer(int16), dimension(:), intent(in) :: i !< Value to check.
|
||||
logical, intent(in), optional :: fail !< Fail flag.
|
||||
|
||||
this% n_tests = this% n_tests + 1
|
||||
|
||||
if ( minval(i) < 0 ) then
|
||||
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
|
||||
this% n_errors = this% n_errors + 1
|
||||
end if
|
||||
end if
|
||||
|
||||
end subroutine assert_positive_i16_1
|
||||
|
||||
!> Check if a integer (32 bits) array (rank 1) is positive.
|
||||
subroutine assert_positive_i32_1(this, i, fail)
|
||||
class(tester_t), intent(inout) :: this !< The tester.
|
||||
integer(int32), dimension(:), intent(in) :: i !< Value to check.
|
||||
logical, intent(in), optional :: fail !< Fail flag.
|
||||
|
||||
this% n_tests = this% n_tests + 1
|
||||
|
||||
if ( minval(i) < 0 ) then
|
||||
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
|
||||
this% n_errors = this% n_errors + 1
|
||||
end if
|
||||
end if
|
||||
|
||||
end subroutine assert_positive_i32_1
|
||||
|
||||
!> Check if a integer (64 bits) array (rank 1) is positive.
|
||||
subroutine assert_positive_i64_1(this, i, fail)
|
||||
class(tester_t), intent(inout) :: this !< The tester.
|
||||
integer(int64), dimension(:), intent(in) :: i !< Value to check.
|
||||
logical, intent(in), optional :: fail !< Fail flag.
|
||||
|
||||
this% n_tests = this% n_tests + 1
|
||||
|
||||
if ( minval(i) < 0 ) then
|
||||
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
|
||||
this% n_errors = this% n_errors + 1
|
||||
end if
|
||||
end if
|
||||
|
||||
end subroutine assert_positive_i64_1
|
||||
|
||||
!> Check if a real (32 bits) array (rank 1) is positive.
|
||||
subroutine assert_positive_r32_1(this, r, fail)
|
||||
class(tester_t), intent(inout) :: this !< The tester.
|
||||
real(real32), dimension(:), intent(in) :: r !< Value to check.
|
||||
logical, intent(in), optional :: fail !< Fail flag.
|
||||
|
||||
this% n_tests = this% n_tests + 1
|
||||
|
||||
if ( minval(r) < 0 ) then
|
||||
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
|
||||
this% n_errors = this% n_errors + 1
|
||||
end if
|
||||
end if
|
||||
|
||||
end subroutine assert_positive_r32_1
|
||||
|
||||
!> Check if a real (64 bits) array (rank 1) is positive.
|
||||
subroutine assert_positive_r64_1(this, r, fail)
|
||||
class(tester_t), intent(inout) :: this !< The tester.
|
||||
real(real64), dimension(:), intent(in) :: r !< Value to check.
|
||||
logical, intent(in), optional :: fail !< Fail flag.
|
||||
|
||||
this% n_tests = this% n_tests + 1
|
||||
|
||||
if ( minval(r) < 0 ) then
|
||||
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
|
||||
this% n_errors = this% n_errors + 1
|
||||
end if
|
||||
end if
|
||||
|
||||
end subroutine assert_positive_r64_1
|
||||
|
||||
!> Check if two reals (32 bits) are close with respect a tolerance.
|
||||
subroutine assert_close_r32(this, r1, r2, fail)
|
||||
class(tester_t), intent(inout) :: this !< The tester.
|
||||
real(real32), intent(in) :: r1 !< Value to compare.
|
||||
real(real32), intent(in) :: r2 !< Value to compare.
|
||||
logical, intent(in), optional :: fail !< Fail flag.
|
||||
|
||||
this% n_tests = this% n_tests + 1
|
||||
|
||||
if ( abs(r1-r2) > this% tolerance32 ) then
|
||||
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
|
||||
this% n_errors = this% n_errors + 1
|
||||
end if
|
||||
end if
|
||||
|
||||
end subroutine assert_close_r32
|
||||
|
||||
!> Check if two reals (64 bits) are close with respect a tolerance.
|
||||
subroutine assert_close_r64(this, r1, r2, fail)
|
||||
class(tester_t), intent(inout) :: this !< The tester.
|
||||
real(real64), intent(in) :: r1 !< Value to compare.
|
||||
real(real64), intent(in) :: r2 !< Value to compare.
|
||||
logical, intent(in), optional :: fail !< Fail flag.
|
||||
|
||||
this% n_tests = this% n_tests + 1
|
||||
|
||||
if ( abs(r1-r2) > this% tolerance64 ) then
|
||||
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
|
||||
this% n_errors = this% n_errors + 1
|
||||
end if
|
||||
end if
|
||||
|
||||
end subroutine assert_close_r64
|
||||
|
||||
!> Check if two real (32 bits) arrays (rank 1) are close with respect a tolerance.
|
||||
subroutine assert_close_r32_1(this, r1, r2, fail)
|
||||
class(tester_t), intent(inout) :: this !< The tester.
|
||||
real(real32), intent(in), dimension(:) :: r1 !< Value to compare.
|
||||
real(real32), intent(in), dimension(:) :: r2 !< Value to compare.
|
||||
logical, intent(in), optional :: fail !< Fail flag.
|
||||
|
||||
this% n_tests = this% n_tests + 1
|
||||
|
||||
if ( size(r1) .ne. size(r2) ) then
|
||||
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
|
||||
this% n_errors = this% n_errors + 1
|
||||
end if
|
||||
else
|
||||
if ( maxval(abs(r1-r2)) > this% tolerance64 ) then
|
||||
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
|
||||
this% n_errors = this% n_errors + 1
|
||||
end if
|
||||
end if
|
||||
end if
|
||||
|
||||
end subroutine assert_close_r32_1
|
||||
|
||||
!> Check if two real (64 bits) arrays (rank 1) are close with respect a tolerance.
|
||||
subroutine assert_close_r64_1(this, r1, r2, fail)
|
||||
class(tester_t), intent(inout) :: this !< The tester.
|
||||
real(real64), intent(in), dimension(:) :: r1 !< Value to compare.
|
||||
real(real64), intent(in), dimension(:) :: r2 !< Value to compare.
|
||||
logical, intent(in), optional :: fail !< Fail flag.
|
||||
|
||||
this% n_tests = this% n_tests + 1
|
||||
|
||||
if ( size(r1) .ne. size(r2) ) then
|
||||
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
|
||||
this% n_errors = this% n_errors + 1
|
||||
end if
|
||||
else
|
||||
if ( maxval(abs(r1-r2)) > this% tolerance64 ) then
|
||||
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
|
||||
this% n_errors = this% n_errors + 1
|
||||
end if
|
||||
end if
|
||||
end if
|
||||
|
||||
end subroutine assert_close_r64_1
|
||||
|
||||
!> Check if two complex numbers (32 bits) are close with respect a tolerance.
|
||||
subroutine assert_close_c32(this, c1, c2, fail)
|
||||
class(tester_t), intent(inout) :: this !< The tester.
|
||||
complex(real32), intent(in) :: c1 !< Value to compare.
|
||||
complex(real32), intent(in) :: c2 !< Value to compare.
|
||||
logical, intent(in), optional :: fail !< Fail flag.
|
||||
|
||||
this% n_tests = this% n_tests + 1
|
||||
|
||||
if ( abs(c1-c2) > this% tolerance32 ) then
|
||||
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
|
||||
this% n_errors = this% n_errors + 1
|
||||
end if
|
||||
end if
|
||||
|
||||
end subroutine assert_close_c32
|
||||
|
||||
!> Check if two complex numbers (64 bits) are close with respect a tolerance.
|
||||
subroutine assert_close_c64(this, r1, c2, fail)
|
||||
class(tester_t), intent(inout) :: this !< The tester.
|
||||
complex(real64), intent(in) :: r1 !< Value to compare.
|
||||
complex(real64), intent(in) :: c2 !< Value to compare.
|
||||
logical, intent(in), optional :: fail !< Fail flag.
|
||||
|
||||
this% n_tests = this% n_tests + 1
|
||||
|
||||
if ( abs(r1-c2) > this% tolerance64 ) then
|
||||
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
|
||||
this% n_errors = this% n_errors + 1
|
||||
end if
|
||||
end if
|
||||
|
||||
end subroutine assert_close_c64
|
||||
|
||||
!> Check if two complex (32 bits) arrays (rank 1) are close with respect a tolerance.
|
||||
subroutine assert_close_c32_1(this, c1, c2, fail)
|
||||
class(tester_t), intent(inout) :: this !< The tester.
|
||||
complex(real32), intent(in), dimension(:) :: c1 !< Value to compare.
|
||||
complex(real32), intent(in), dimension(:) :: c2 !< Value to compare.
|
||||
logical, intent(in), optional :: fail !< Fail flag.
|
||||
|
||||
this% n_tests = this% n_tests + 1
|
||||
|
||||
if ( size(c1) .ne. size(c2) ) then
|
||||
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
|
||||
this% n_errors = this% n_errors + 1
|
||||
end if
|
||||
else
|
||||
if ( maxval(abs(c1-c2)) > this% tolerance32 ) then
|
||||
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
|
||||
this% n_errors = this% n_errors + 1
|
||||
end if
|
||||
end if
|
||||
end if
|
||||
|
||||
end subroutine assert_close_c32_1
|
||||
|
||||
!> Check if two real (64 bits) arrays (rank 1) are close with respect a tolerance.
|
||||
subroutine assert_close_c64_1(this, c1, c2, fail)
|
||||
class(tester_t), intent(inout) :: this !< The tester.
|
||||
complex(real64), intent(in), dimension(:) :: c1 !< Value to compare.
|
||||
complex(real64), intent(in), dimension(:) :: c2 !< Value to compare.
|
||||
logical, intent(in), optional :: fail !< Fail flag.
|
||||
|
||||
this% n_tests = this% n_tests + 1
|
||||
|
||||
if ( size(c1) .ne. size(c2) ) then
|
||||
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
|
||||
this% n_errors = this% n_errors + 1
|
||||
end if
|
||||
else
|
||||
if ( maxval(abs(c1-c2)) > this% tolerance64 ) then
|
||||
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
|
||||
this% n_errors = this% n_errors + 1
|
||||
end if
|
||||
end if
|
||||
end if
|
||||
|
||||
end subroutine assert_close_c64_1
|
||||
|
||||
end module tester
|
|
@ -0,0 +1,53 @@
|
|||
SUBROUTINE collect_results(test)
|
||||
#if defined(__MPI)
|
||||
USE mpi
|
||||
#endif
|
||||
USE tester
|
||||
IMPLICIT NONE
|
||||
!
|
||||
TYPE(tester_t) :: test
|
||||
INTEGER :: itottests, itoterr, ierr, me
|
||||
!
|
||||
#if defined(__MPI)
|
||||
!
|
||||
CALL MPI_REDUCE(test%n_errors, itoterr, 1, MPI_INTEGER, MPI_SUM, &
|
||||
0, MPI_COMM_WORLD, ierr)
|
||||
! Fail in case MPI fails...
|
||||
IF (ierr /= 0) CALL test%assert_equal(0, ierr)
|
||||
!
|
||||
CALL MPI_REDUCE(test%n_tests, itottests, 1, MPI_INTEGER, MPI_SUM, &
|
||||
0, MPI_COMM_WORLD, ierr)
|
||||
! Fail in case MPI fails...
|
||||
IF (ierr /= 0) CALL test%assert_equal(0, ierr)
|
||||
!
|
||||
test%n_tests = itottests
|
||||
test%n_errors = itoterr
|
||||
!
|
||||
IF (ierr /= 0) CALL test%assert_equal(0, ierr)
|
||||
!
|
||||
CALL MPI_Comm_rank(MPI_COMM_WORLD, me, ierr);
|
||||
!
|
||||
IF (ierr /= 0) CALL test%assert_equal(0, ierr)
|
||||
!
|
||||
#endif
|
||||
END SUBROUTINE collect_results
|
||||
|
||||
|
||||
SUBROUTINE no_test
|
||||
#if defined(__MPI)
|
||||
USE mpi
|
||||
#endif
|
||||
USE tester
|
||||
IMPLICIT NONE
|
||||
!TYPE(tester_t) :: test
|
||||
INTEGER :: ierr
|
||||
!
|
||||
#if defined(__MPI)
|
||||
CALL MPI_Init(ierr)
|
||||
#endif
|
||||
!CALL test%init()
|
||||
!CALL print_results(test)
|
||||
#if defined(__MPI)
|
||||
CALL mpi_finalize(ierr)
|
||||
#endif
|
||||
END SUBROUTINE no_test
|
|
@ -141,6 +141,8 @@ w0gauss.o \
|
|||
w1gauss.o \
|
||||
deviatoric.o
|
||||
|
||||
-include make.gpu
|
||||
|
||||
TLDEPS= libfox libutil libla libfft librxc
|
||||
|
||||
all : libqemod.a
|
||||
|
|
|
@ -0,0 +1,387 @@
|
|||
!
|
||||
! Copyright (C) 2002-2011 Quantum ESPRESSO group
|
||||
! This file is distributed under the terms of the
|
||||
! GNU General Public License. See the file `License'
|
||||
! in the root directory of the present distribution,
|
||||
! or http://www.gnu.org/copyleft/gpl.txt .
|
||||
!
|
||||
#define DIMS1D(my_array) lbound(my_array,1):ubound(my_array,1)
|
||||
#define DIMS2D(my_array) lbound(my_array,1):ubound(my_array,1),lbound(my_array,2):ubound(my_array,2)
|
||||
#define DIMS3D(my_array) lbound(my_array,1):ubound(my_array,1),lbound(my_array,2):ubound(my_array,2),lbound(my_array,3):ubound(my_array,3)
|
||||
#define DIMS4D(my_array) lbound(my_array,1):ubound(my_array,1),lbound(my_array,2):ubound(my_array,2),lbound(my_array,3):ubound(my_array,3),lbound(my_array,4):ubound(my_array,4)
|
||||
!=----------------------------------------------------------------------------=!
|
||||
MODULE becmod_gpum
|
||||
!=----------------------------------------------------------------------------=!
|
||||
#if defined(__CUDA)
|
||||
USE cudafor
|
||||
#endif
|
||||
IMPLICIT NONE
|
||||
SAVE
|
||||
INTEGER, PARAMETER :: DP = selected_real_kind(14,200)
|
||||
INTEGER, PARAMETER :: sgl = selected_real_kind(6,30)
|
||||
INTEGER, PARAMETER :: i4b = selected_int_kind(9)
|
||||
INTEGER, PARAMETER :: i8b = selected_int_kind(18)
|
||||
INTEGER :: iverbosity = 0
|
||||
#if defined(__DEBUG)
|
||||
iverbosity = 1
|
||||
#endif
|
||||
!
|
||||
TYPE bec_type_d
|
||||
#if defined(__CUDA)
|
||||
REAL(DP), ALLOCATABLE, DEVICE :: r_d(:, :)
|
||||
#else
|
||||
REAL(DP), ALLOCATABLE :: r_d(:, :)
|
||||
#endif
|
||||
#if defined(__CUDA)
|
||||
COMPLEX(DP), ALLOCATABLE, DEVICE :: k_d(:, :)
|
||||
#else
|
||||
COMPLEX(DP), ALLOCATABLE :: k_d(:, :)
|
||||
#endif
|
||||
#if defined(__CUDA)
|
||||
COMPLEX(DP), ALLOCATABLE, DEVICE :: nc_d(:, :, :)
|
||||
#else
|
||||
COMPLEX(DP), ALLOCATABLE :: nc_d(:, :, :)
|
||||
#endif
|
||||
INTEGER :: comm
|
||||
INTEGER :: nbnd
|
||||
INTEGER :: nproc
|
||||
INTEGER :: mype
|
||||
INTEGER :: nbnd_loc
|
||||
INTEGER :: ibnd_begin
|
||||
END TYPE bec_type_d
|
||||
!
|
||||
TYPE (bec_type_d), TARGET :: becp_d ! <beta|psi>
|
||||
!
|
||||
|
||||
LOGICAL :: becp_r_ood = .false. ! used to flag out of date variables
|
||||
LOGICAL :: becp_d_r_d_ood = .false. ! used to flag out of date variables
|
||||
LOGICAL :: becp_k_ood = .false. ! used to flag out of date variables
|
||||
LOGICAL :: becp_d_k_d_ood = .false. ! used to flag out of date variables
|
||||
LOGICAL :: becp_nc_ood = .false. ! used to flag out of date variables
|
||||
LOGICAL :: becp_d_nc_d_ood = .false. ! used to flag out of date variables
|
||||
!
|
||||
CONTAINS
|
||||
!
|
||||
SUBROUTINE using_becp_r(intento, debug_info)
|
||||
!
|
||||
! intento is used to specify what the variable will be used for :
|
||||
! 0 -> in , the variable needs to be synchronized but won't be changed
|
||||
! 1 -> inout , the variable needs to be synchronized AND will be changed
|
||||
! 2 -> out , NO NEED to synchronize the variable, everything will be overwritten
|
||||
!
|
||||
USE becmod, ONLY : becp
|
||||
implicit none
|
||||
INTEGER, INTENT(IN) :: intento
|
||||
CHARACTER(len=*), INTENT(IN), OPTIONAL :: debug_info
|
||||
!
|
||||
#if defined(__CUDA) || defined(__CUDA_GNU)
|
||||
INTEGER :: intento_
|
||||
intento_ = intento
|
||||
!
|
||||
IF (PRESENT(debug_info) ) print *, debug_info
|
||||
!
|
||||
IF (becp_r_ood) THEN
|
||||
IF (.not. allocated(becp_d%r_d)) THEN
|
||||
CALL errore('using_r_d', 'PANIC: sync of becp%r from becp_d%r_d with unallocated array. Bye!!', 1)
|
||||
stop
|
||||
END IF
|
||||
IF (.not. allocated(becp%r)) THEN
|
||||
IF (intento_ /= 2) THEN
|
||||
print *, "WARNING: sync of becp%r with unallocated array and intento /= 2? Changed to 2!"
|
||||
intento_ = 2
|
||||
END IF
|
||||
END IF
|
||||
IF (intento_ < 2) THEN
|
||||
IF ( iverbosity > 0 ) print *, "Really copied becp%r D->H"
|
||||
becp%r = becp_d%r_d
|
||||
END IF
|
||||
! ALWAYS update auxiliary variables
|
||||
!IF ( becp%comm /= becp_d%comm ) &
|
||||
! print *, "WARNING: auxiliary variable becp%comm changed"
|
||||
becp%comm = becp_d%comm
|
||||
!IF ( becp%nbnd /= becp_d%nbnd ) &
|
||||
! print *, "WARNING: auxiliary variable becp%nbnd changed"
|
||||
becp%nbnd = becp_d%nbnd
|
||||
!IF ( becp%nproc /= becp_d%nproc ) &
|
||||
! print *, "WARNING: auxiliary variable becp%nproc changed"
|
||||
becp%nproc = becp_d%nproc
|
||||
!IF ( becp%mype /= becp_d%mype ) &
|
||||
! print *, "WARNING: auxiliary variable becp%mype changed"
|
||||
becp%mype = becp_d%mype
|
||||
!IF ( becp%nbnd_loc /= becp_d%nbnd_loc ) &
|
||||
! print *, "WARNING: auxiliary variable becp%nbnd_loc changed"
|
||||
becp%nbnd_loc = becp_d%nbnd_loc
|
||||
!IF ( becp%ibnd_begin /= becp_d%ibnd_begin ) &
|
||||
! print *, "WARNING: auxiliary variable becp%ibnd_begin changed"
|
||||
becp%ibnd_begin = becp_d%ibnd_begin
|
||||
!
|
||||
becp_r_ood = .false.
|
||||
ENDIF
|
||||
IF (intento_ > 0) becp_d_r_d_ood = .true.
|
||||
#endif
|
||||
END SUBROUTINE using_becp_r
|
||||
!
|
||||
SUBROUTINE using_becp_r_d(intento, debug_info)
|
||||
!
|
||||
USE becmod, ONLY : becp
|
||||
implicit none
|
||||
INTEGER, INTENT(IN) :: intento
|
||||
CHARACTER(len=*), INTENT(IN), OPTIONAL :: debug_info
|
||||
!
|
||||
#if defined(__CUDA) || defined(__CUDA_GNU)
|
||||
!
|
||||
IF (PRESENT(debug_info) ) print *, debug_info
|
||||
!
|
||||
IF (.not. allocated(becp%r)) THEN
|
||||
IF (intento /= 2) print *, "WARNING: sync of becp%r_d with unallocated array and intento /= 2?"
|
||||
IF (allocated(becp_d%r_d)) DEALLOCATE(becp_d%r_d)
|
||||
becp_d_r_d_ood = .false.
|
||||
RETURN
|
||||
END IF
|
||||
! here we know that r is allocated, check if size is 0
|
||||
IF ( SIZE(becp%r) == 0 ) THEN
|
||||
print *, "Refusing to allocate 0 dimensional array becp_d%r_d. If used, code will crash."
|
||||
RETURN
|
||||
END IF
|
||||
!
|
||||
IF (becp_d_r_d_ood) THEN
|
||||
IF ( allocated(becp_d%r_d) .and. (SIZE(becp_d%r_d)/=SIZE(becp%r))) deallocate(becp_d%r_d)
|
||||
IF (.not. allocated(becp_d%r_d)) ALLOCATE(becp_d%r_d(DIMS2D(becp%r))) ! MOLD does not work on all compilers
|
||||
IF (intento < 2) THEN
|
||||
IF ( iverbosity > 0 ) print *, "Really copied becp%r H->D"
|
||||
becp_d%r_d = becp%r
|
||||
END IF
|
||||
! ALWAYS update auxiliary variables
|
||||
becp_d%comm = becp%comm
|
||||
becp_d%nbnd = becp%nbnd
|
||||
becp_d%nproc = becp%nproc
|
||||
becp_d%mype = becp%mype
|
||||
becp_d%nbnd_loc = becp%nbnd_loc
|
||||
becp_d%ibnd_begin = becp%ibnd_begin
|
||||
!
|
||||
becp_d_r_d_ood = .false.
|
||||
ENDIF
|
||||
IF (intento > 0) becp_r_ood = .true.
|
||||
#else
|
||||
CALL errore('using_becp_d%r_d', 'Trying to use device data without device compilated code!', 1)
|
||||
#endif
|
||||
END SUBROUTINE using_becp_r_d
|
||||
!
|
||||
SUBROUTINE using_becp_k(intento, debug_info)
|
||||
!
|
||||
! intento is used to specify what the variable will be used for :
|
||||
! 0 -> in , the variable needs to be synchronized but won't be changed
|
||||
! 1 -> inout , the variable needs to be synchronized AND will be changed
|
||||
! 2 -> out , NO NEED to synchronize the variable, everything will be overwritten
|
||||
!
|
||||
USE becmod, ONLY : becp
|
||||
implicit none
|
||||
INTEGER, INTENT(IN) :: intento
|
||||
CHARACTER(len=*), INTENT(IN), OPTIONAL :: debug_info
|
||||
!
|
||||
#if defined(__CUDA) || defined(__CUDA_GNU)
|
||||
INTEGER :: intento_
|
||||
intento_ = intento
|
||||
!
|
||||
IF (PRESENT(debug_info) ) print *, debug_info
|
||||
!
|
||||
IF (becp_k_ood) THEN
|
||||
IF (.not. allocated(becp_d%k_d)) THEN
|
||||
CALL errore('using_k_d', 'PANIC: sync of becp%k from becp_d%k_d with unallocated array. Bye!!', 1)
|
||||
stop
|
||||
END IF
|
||||
IF (.not. allocated(becp%k)) THEN
|
||||
IF (intento_ /= 2) THEN
|
||||
print *, "WARNING: sync of becp%k with unallocated array and intento /= 2? Changed to 2!"
|
||||
intento_ = 2
|
||||
END IF
|
||||
END IF
|
||||
IF (intento_ < 2) THEN
|
||||
IF ( iverbosity > 0 ) print *, "Really copied becp%k D->H"
|
||||
becp%k = becp_d%k_d
|
||||
END IF
|
||||
! ALWAYS update auxiliary variables
|
||||
!IF ( becp%comm /= becp_d%comm ) &
|
||||
! print *, "WARNING: auxiliary variable becp%comm changed"
|
||||
becp%comm = becp_d%comm
|
||||
!IF ( becp%nbnd /= becp_d%nbnd ) &
|
||||
! print *, "WARNING: auxiliary variable becp%nbnd changed"
|
||||
becp%nbnd = becp_d%nbnd
|
||||
!IF ( becp%nproc /= becp_d%nproc ) &
|
||||
! print *, "WARNING: auxiliary variable becp%nproc changed"
|
||||
becp%nproc = becp_d%nproc
|
||||
!IF ( becp%mype /= becp_d%mype ) &
|
||||
! print *, "WARNING: auxiliary variable becp%mype changed"
|
||||
becp%mype = becp_d%mype
|
||||
!IF ( becp%nbnd_loc /= becp_d%nbnd_loc ) &
|
||||
! print *, "WARNING: auxiliary variable becp%nbnd_loc changed"
|
||||
becp%nbnd_loc = becp_d%nbnd_loc
|
||||
!IF ( becp%ibnd_begin /= becp_d%ibnd_begin ) &
|
||||
! print *, "WARNING: auxiliary variable becp%ibnd_begin changed"
|
||||
becp%ibnd_begin = becp_d%ibnd_begin
|
||||
!
|
||||
becp_k_ood = .false.
|
||||
ENDIF
|
||||
IF (intento_ > 0) becp_d_k_d_ood = .true.
|
||||
#endif
|
||||
END SUBROUTINE using_becp_k
|
||||
!
|
||||
SUBROUTINE using_becp_k_d(intento, debug_info)
|
||||
!
|
||||
USE becmod, ONLY : becp
|
||||
implicit none
|
||||
INTEGER, INTENT(IN) :: intento
|
||||
CHARACTER(len=*), INTENT(IN), OPTIONAL :: debug_info
|
||||
!
|
||||
#if defined(__CUDA) || defined(__CUDA_GNU)
|
||||
!
|
||||
IF (PRESENT(debug_info) ) print *, debug_info
|
||||
!
|
||||
IF (.not. allocated(becp%k)) THEN
|
||||
IF (intento /= 2) print *, "WARNING: sync of becp%k_d with unallocated array and intento /= 2?"
|
||||
IF (allocated(becp_d%k_d)) DEALLOCATE(becp_d%k_d)
|
||||
becp_d_k_d_ood = .false.
|
||||
RETURN
|
||||
END IF
|
||||
! here we know that k is allocated, check if size is 0
|
||||
IF ( SIZE(becp%k) == 0 ) THEN
|
||||
print *, "Refusing to allocate 0 dimensional array becp_d%k_d. If used, code will crash."
|
||||
RETURN
|
||||
END IF
|
||||
!
|
||||
IF (becp_d_k_d_ood) THEN
|
||||
IF ( allocated(becp_d%k_d) .and. (SIZE(becp_d%k_d)/=SIZE(becp%k))) deallocate(becp_d%k_d)
|
||||
IF (.not. allocated(becp_d%k_d)) ALLOCATE(becp_d%k_d(DIMS2D(becp%k))) ! MOLD does not work on all compilers
|
||||
IF (intento < 2) THEN
|
||||
IF ( iverbosity > 0 ) print *, "Really copied becp%k H->D"
|
||||
becp_d%k_d = becp%k
|
||||
END IF
|
||||
! ALWAYS update auxiliary variables
|
||||
becp_d%comm = becp%comm
|
||||
becp_d%nbnd = becp%nbnd
|
||||
becp_d%nproc = becp%nproc
|
||||
becp_d%mype = becp%mype
|
||||
becp_d%nbnd_loc = becp%nbnd_loc
|
||||
becp_d%ibnd_begin = becp%ibnd_begin
|
||||
!
|
||||
becp_d_k_d_ood = .false.
|
||||
ENDIF
|
||||
IF (intento > 0) becp_k_ood = .true.
|
||||
#else
|
||||
CALL errore('using_becp_d%k_d', 'Trying to use device data without device compilated code!', 1)
|
||||
#endif
|
||||
END SUBROUTINE using_becp_k_d
|
||||
!
|
||||
SUBROUTINE using_becp_nc(intento, debug_info)
|
||||
!
|
||||
! intento is used to specify what the variable will be used for :
|
||||
! 0 -> in , the variable needs to be synchronized but won't be changed
|
||||
! 1 -> inout , the variable needs to be synchronized AND will be changed
|
||||
! 2 -> out , NO NEED to synchronize the variable, everything will be overwritten
|
||||
!
|
||||
USE becmod, ONLY : becp
|
||||
implicit none
|
||||
INTEGER, INTENT(IN) :: intento
|
||||
CHARACTER(len=*), INTENT(IN), OPTIONAL :: debug_info
|
||||
!
|
||||
#if defined(__CUDA) || defined(__CUDA_GNU)
|
||||
INTEGER :: intento_
|
||||
intento_ = intento
|
||||
!
|
||||
IF (PRESENT(debug_info) ) print *, debug_info
|
||||
!
|
||||
IF (becp_nc_ood) THEN
|
||||
IF (.not. allocated(becp_d%nc_d)) THEN
|
||||
CALL errore('using_nc_d', 'PANIC: sync of becp%nc from becp_d%nc_d with unallocated array. Bye!!', 1)
|
||||
stop
|
||||
END IF
|
||||
IF (.not. allocated(becp%nc)) THEN
|
||||
IF (intento_ /= 2) THEN
|
||||
print *, "WARNING: sync of becp%nc with unallocated array and intento /= 2? Changed to 2!"
|
||||
intento_ = 2
|
||||
END IF
|
||||
END IF
|
||||
IF (intento_ < 2) THEN
|
||||
IF ( iverbosity > 0 ) print *, "Really copied becp%nc D->H"
|
||||
becp%nc = becp_d%nc_d
|
||||
END IF
|
||||
! ALWAYS update auxiliary variables
|
||||
!IF ( becp%comm /= becp_d%comm ) &
|
||||
! print *, "WARNING: auxiliary variable becp%comm changed"
|
||||
becp%comm = becp_d%comm
|
||||
!IF ( becp%nbnd /= becp_d%nbnd ) &
|
||||
! print *, "WARNING: auxiliary variable becp%nbnd changed"
|
||||
becp%nbnd = becp_d%nbnd
|
||||
!IF ( becp%nproc /= becp_d%nproc ) &
|
||||
! print *, "WARNING: auxiliary variable becp%nproc changed"
|
||||
becp%nproc = becp_d%nproc
|
||||
!IF ( becp%mype /= becp_d%mype ) &
|
||||
! print *, "WARNING: auxiliary variable becp%mype changed"
|
||||
becp%mype = becp_d%mype
|
||||
!IF ( becp%nbnd_loc /= becp_d%nbnd_loc ) &
|
||||
! print *, "WARNING: auxiliary variable becp%nbnd_loc changed"
|
||||
becp%nbnd_loc = becp_d%nbnd_loc
|
||||
!IF ( becp%ibnd_begin /= becp_d%ibnd_begin ) &
|
||||
! print *, "WARNING: auxiliary variable becp%ibnd_begin changed"
|
||||
becp%ibnd_begin = becp_d%ibnd_begin
|
||||
!
|
||||
becp_nc_ood = .false.
|
||||
ENDIF
|
||||
IF (intento_ > 0) becp_d_nc_d_ood = .true.
|
||||
#endif
|
||||
END SUBROUTINE using_becp_nc
|
||||
!
|
||||
SUBROUTINE using_becp_nc_d(intento, debug_info)
|
||||
!
|
||||
USE becmod, ONLY : becp
|
||||
implicit none
|
||||
INTEGER, INTENT(IN) :: intento
|
||||
CHARACTER(len=*), INTENT(IN), OPTIONAL :: debug_info
|
||||
!
|
||||
#if defined(__CUDA) || defined(__CUDA_GNU)
|
||||
!
|
||||
IF (PRESENT(debug_info) ) print *, debug_info
|
||||
!
|
||||
IF (.not. allocated(becp%nc)) THEN
|
||||
IF (intento /= 2) print *, "WARNING: sync of becp%nc_d with unallocated array and intento /= 2?"
|
||||
IF (allocated(becp_d%nc_d)) DEALLOCATE(becp_d%nc_d)
|
||||
becp_d_nc_d_ood = .false.
|
||||
RETURN
|
||||
END IF
|
||||
! here we know that nc is allocated, check if size is 0
|
||||
IF ( SIZE(becp%nc) == 0 ) THEN
|
||||
print *, "Refusing to allocate 0 dimensional array becp_d%nc_d. If used, code will crash."
|
||||
RETURN
|
||||
END IF
|
||||
!
|
||||
IF (becp_d_nc_d_ood) THEN
|
||||
IF ( allocated(becp_d%nc_d) .and. (SIZE(becp_d%nc_d)/=SIZE(becp%nc))) deallocate(becp_d%nc_d)
|
||||
IF (.not. allocated(becp_d%nc_d)) ALLOCATE(becp_d%nc_d(DIMS3D(becp%nc))) ! MOLD does not work on all compilers
|
||||
IF (intento < 2) THEN
|
||||
IF ( iverbosity > 0 ) print *, "Really copied becp%nc H->D"
|
||||
becp_d%nc_d = becp%nc
|
||||
END IF
|
||||
! ALWAYS update auxiliary variables
|
||||
becp_d%comm = becp%comm
|
||||
becp_d%nbnd = becp%nbnd
|
||||
becp_d%nproc = becp%nproc
|
||||
becp_d%mype = becp%mype
|
||||
becp_d%nbnd_loc = becp%nbnd_loc
|
||||
becp_d%ibnd_begin = becp%ibnd_begin
|
||||
!
|
||||
becp_d_nc_d_ood = .false.
|
||||
ENDIF
|
||||
IF (intento > 0) becp_nc_ood = .true.
|
||||
#else
|
||||
CALL errore('using_becp_d%nc_d', 'Trying to use device data without device compilated code!', 1)
|
||||
#endif
|
||||
END SUBROUTINE using_becp_nc_d
|
||||
!
|
||||
SUBROUTINE deallocate_becmod_gpu
|
||||
IF( ALLOCATED( becp_d%r_d ) ) DEALLOCATE( becp_d%r_d )
|
||||
IF( ALLOCATED( becp_d%k_d ) ) DEALLOCATE( becp_d%k_d )
|
||||
IF( ALLOCATED( becp_d%nc_d ) ) DEALLOCATE( becp_d%nc_d )
|
||||
END SUBROUTINE deallocate_becmod_gpu
|
||||
!=----------------------------------------------------------------------------=!
|
||||
END MODULE becmod_gpum
|
||||
!=----------------------------------------------------------------------------=!
|
|
@ -0,0 +1,609 @@
|
|||
!
|
||||
! Copyright (C) 2001-2007 PWSCF group
|
||||
! This file is distributed under the terms of the
|
||||
! GNU General Public License. See the file `License'
|
||||
! in the root directory of the present distribution,
|
||||
! or http://www.gnu.org/copyleft/gpl.txt .
|
||||
!
|
||||
!----------------------------------------------------------------------------
|
||||
!
|
||||
MODULE becmod_subs_gpum
|
||||
|
||||
! NOTA BENE : THE SUBROUTINES IN THIS FILE ARE ONLY PARTIALLY TESTED!
|
||||
|
||||
!
|
||||
! ... *bec* contain <beta|psi> - used in h_psi, s_psi, many other places
|
||||
! ... calbec( npw, beta, psi, betapsi [, nbnd ] ) is an interface calculating
|
||||
! ... betapsi(i,j) = <beta(i)|psi(j)> (the sum is over npw components)
|
||||
! ... or betapsi(i,s,j)= <beta(i)|psi(s,j)> (s=polarization index)
|
||||
!
|
||||
USE kinds, ONLY : DP
|
||||
USE control_flags, ONLY : gamma_only, smallmem
|
||||
USE gvect, ONLY : gstart
|
||||
USE noncollin_module, ONLY : noncolin, npol
|
||||
USE becmod_gpum, ONLY : bec_type_d
|
||||
!
|
||||
SAVE
|
||||
!
|
||||
PRIVATE
|
||||
!
|
||||
INTERFACE calbec_gpu
|
||||
!
|
||||
MODULE PROCEDURE calbec_k_gpu, calbec_gamma_gpu, calbec_gamma_nocomm_gpu, calbec_nc_gpu, calbec_bec_type_gpu
|
||||
!
|
||||
END INTERFACE
|
||||
|
||||
INTERFACE becscal_gpu
|
||||
!
|
||||
MODULE PROCEDURE becscal_nck_gpu, becscal_gamma_gpu
|
||||
!
|
||||
END INTERFACE
|
||||
!
|
||||
PUBLIC :: allocate_bec_type_gpu, deallocate_bec_type_gpu, calbec_gpu, &
|
||||
beccopy_gpu, becscal_gpu, is_allocated_bec_type_gpu, &
|
||||
synchronize_bec_type_gpu, &
|
||||
using_becp_auto, using_becp_d_auto
|
||||
!
|
||||
CONTAINS
|
||||
!-----------------------------------------------------------------------
|
||||
SUBROUTINE calbec_bec_type_gpu ( npw, beta_d, psi_d, betapsi_d, nbnd )
|
||||
!-----------------------------------------------------------------------
|
||||
!_
|
||||
USE mp_bands, ONLY: intra_bgrp_comm
|
||||
USE mp, ONLY: mp_get_comm_null
|
||||
!
|
||||
IMPLICIT NONE
|
||||
COMPLEX (DP), INTENT (in) :: beta_d(:,:), psi_d(:,:)
|
||||
TYPE (bec_type_d), TARGET, INTENT (inout) :: betapsi_d ! NB: must be INOUT otherwise
|
||||
! the allocatd array is lost
|
||||
INTEGER, INTENT (in) :: npw
|
||||
INTEGER, OPTIONAL :: nbnd
|
||||
#if defined(__CUDA)
|
||||
attributes(DEVICE) :: beta_d, psi_d
|
||||
#endif
|
||||
!
|
||||
INTEGER :: local_nbnd
|
||||
INTEGER, EXTERNAL :: ldim_block, gind_block
|
||||
INTEGER :: m_loc, m_begin, ip
|
||||
REAL(DP), ALLOCATABLE :: dtmp_d(:,:) ! replace this with buffers !
|
||||
INTEGER :: i, j, nkb
|
||||
REAL(DP), POINTER :: betapsi_d_r_d(:,:)
|
||||
#if defined(__CUDA)
|
||||
attributes(DEVICE) :: dtmp_d, betapsi_d_r_d
|
||||
#endif
|
||||
!
|
||||
IF ( present (nbnd) ) THEN
|
||||
local_nbnd = nbnd
|
||||
ELSE
|
||||
local_nbnd = size ( psi_d, 2)
|
||||
ENDIF
|
||||
|
||||
IF ( gamma_only ) THEN
|
||||
!
|
||||
IF( betapsi_d%comm == mp_get_comm_null() ) THEN
|
||||
!
|
||||
CALL calbec_gamma_gpu ( npw, beta_d, psi_d, betapsi_d%r_d, local_nbnd, intra_bgrp_comm )
|
||||
!
|
||||
ELSE
|
||||
!
|
||||
ALLOCATE( dtmp_d( SIZE( betapsi_d%r_d, 1 ), SIZE( betapsi_d%r_d, 2 ) ) )
|
||||
!
|
||||
DO ip = 0, betapsi_d%nproc - 1
|
||||
m_loc = ldim_block( betapsi_d%nbnd , betapsi_d%nproc, ip )
|
||||
m_begin = gind_block( 1, betapsi_d%nbnd, betapsi_d%nproc, ip )
|
||||
IF( ( m_begin + m_loc - 1 ) > local_nbnd ) m_loc = local_nbnd - m_begin + 1
|
||||
IF( m_loc > 0 ) THEN
|
||||
CALL calbec_gamma_gpu ( npw, beta_d, psi_d(:,m_begin:m_begin+m_loc-1), dtmp_d, m_loc, betapsi_d%comm )
|
||||
IF( ip == betapsi_d%mype ) THEN
|
||||
nkb = SIZE( betapsi_d%r_d, 1 )
|
||||
betapsi_d_r_d => betapsi_d%r_d
|
||||
!$cuf kernel do(2) <<<*,*>>>
|
||||
DO j=1,m_loc
|
||||
DO i=1, nkb
|
||||
betapsi_d_r_d(i,j) = dtmp_d(i,j)
|
||||
END DO
|
||||
END DO
|
||||
END IF
|
||||
END IF
|
||||
END DO
|
||||
|
||||
DEALLOCATE( dtmp_d )
|
||||
!
|
||||
END IF
|
||||
!
|
||||
ELSEIF ( noncolin) THEN
|
||||
!
|
||||
CALL calbec_nc_gpu ( npw, beta_d, psi_d, betapsi_d%nc_d, local_nbnd )
|
||||
!
|
||||
ELSE
|
||||
!
|
||||
CALL calbec_k_gpu ( npw, beta_d, psi_d, betapsi_d%k_d, local_nbnd )
|
||||
!
|
||||
ENDIF
|
||||
!
|
||||
RETURN
|
||||
!
|
||||
END SUBROUTINE calbec_bec_type_gpu
|
||||
!-----------------------------------------------------------------------
|
||||
SUBROUTINE calbec_gamma_nocomm_gpu ( npw, beta_d, psi_d, betapsi_d, nbnd )
|
||||
!-----------------------------------------------------------------------
|
||||
USE mp_bands, ONLY: intra_bgrp_comm
|
||||
IMPLICIT NONE
|
||||
COMPLEX (DP), INTENT (in) :: beta_d(:,:), psi_d(:,:)
|
||||
REAL (DP), INTENT (out) :: betapsi_d(:,:)
|
||||
INTEGER, INTENT (in) :: npw
|
||||
INTEGER, OPTIONAL :: nbnd
|
||||
INTEGER :: m
|
||||
#if defined(__CUDA)
|
||||
attributes(DEVICE) :: beta_d, psi_d, betapsi_d
|
||||
#endif
|
||||
IF ( present (nbnd) ) THEN
|
||||
m = nbnd
|
||||
ELSE
|
||||
m = size ( psi_d, 2)
|
||||
ENDIF
|
||||
CALL calbec_gamma_gpu ( npw, beta_d, psi_d, betapsi_d, m, intra_bgrp_comm )
|
||||
RETURN
|
||||
!
|
||||
END SUBROUTINE calbec_gamma_nocomm_gpu
|
||||
!-----------------------------------------------------------------------
|
||||
SUBROUTINE calbec_gamma_gpu ( npw, beta_d, psi_d, betapsi_d, nbnd, comm )
|
||||
!-----------------------------------------------------------------------
|
||||
!
|
||||
! ... matrix times matrix with summation index (k=1,npw) running on
|
||||
! ... half of the G-vectors or PWs - assuming k=0 is the G=0 component:
|
||||
! ... betapsi(i,j) = 2Re(\sum_k beta^*(i,k)psi(k,j)) + beta^*(i,0)psi(0,j)
|
||||
!
|
||||
USE mp, ONLY : mp_sum, mp_size
|
||||
#if defined(__CUDA)
|
||||
USE cudafor
|
||||
USE cublas
|
||||
#endif
|
||||
IMPLICIT NONE
|
||||
COMPLEX (DP), INTENT (in) :: beta_d(:,:), psi_d(:,:)
|
||||
REAL (DP), INTENT (out) :: betapsi_d(:,:)
|
||||
INTEGER, INTENT (in) :: npw
|
||||
INTEGER, INTENT (in) :: nbnd
|
||||
INTEGER, INTENT (in) :: comm
|
||||
!
|
||||
#if defined(__CUDA)
|
||||
attributes(DEVICE) :: beta_d, psi_d, betapsi_d
|
||||
#endif
|
||||
INTEGER :: nkb, npwx, m
|
||||
INTEGER :: i,j
|
||||
!
|
||||
m = nbnd
|
||||
!
|
||||
nkb = size (beta_d, 2)
|
||||
IF ( nkb == 0 ) RETURN
|
||||
!
|
||||
CALL start_clock( 'calbec' )
|
||||
IF ( npw == 0 ) betapsi_d(:,:)=0.0_DP
|
||||
npwx= size (beta_d, 1)
|
||||
IF ( npwx /= size (psi_d, 1) ) CALL errore ('calbec', 'size mismatch', 1)
|
||||
IF ( npwx < npw ) CALL errore ('calbec', 'size mismatch', 2)
|
||||
#if defined(DEBUG)
|
||||
WRITE (*,*) 'calbec gamma'
|
||||
WRITE (*,*) nkb, size (betapsi_d,1) , m , size (betapsi_d, 2)
|
||||
#endif
|
||||
IF ( nkb /= size (betapsi_d,1) .or. m > size (betapsi_d, 2) ) &
|
||||
CALL errore ('calbec', 'size mismatch', 3)
|
||||
!
|
||||
IF ( m == 1 ) THEN
|
||||
!
|
||||
CALL cudaDGEMV( 'C', 2*npw, nkb, 2.0_DP, beta_d, 2*npwx, psi_d, 1, 0.0_DP, &
|
||||
betapsi_d, 1 )
|
||||
IF ( gstart == 2 ) THEN
|
||||
!betapsi_d(:,1) = betapsi_d(:,1) - beta_d(1,:)*psi_d(1,1)
|
||||
!$cuf kernel do(1) <<<*,*>>>
|
||||
DO i=1, nkb
|
||||
betapsi_d(i,1) = betapsi_d(i,1) - DBLE(beta_d(1,i)*psi_d(1,1))
|
||||
END DO
|
||||
END IF
|
||||
!
|
||||
ELSE
|
||||
!
|
||||
CALL DGEMM( 'C', 'N', nkb, m, 2*npw, 2.0_DP, beta_d, 2*npwx, psi_d, &
|
||||
2*npwx, 0.0_DP, betapsi_d, nkb )
|
||||
IF ( gstart == 2 ) &
|
||||
CALL cudaDGER( nkb, m, -1.0_DP, beta_d, 2*npwx, psi_d, 2*npwx, betapsi_d, nkb )
|
||||
!
|
||||
ENDIF
|
||||
!
|
||||
IF (mp_size(comm) > 1) CALL mp_sum( betapsi_d( :, 1:m ), comm )
|
||||
!
|
||||
CALL stop_clock( 'calbec' )
|
||||
!
|
||||
RETURN
|
||||
!
|
||||
END SUBROUTINE calbec_gamma_gpu
|
||||
!
|
||||
!-----------------------------------------------------------------------
|
||||
SUBROUTINE calbec_k_gpu ( npw, beta_d, psi_d, betapsi_d, nbnd )
|
||||
!-----------------------------------------------------------------------
|
||||
!
|
||||
! ... matrix times matrix with summation index (k=1,npw) running on
|
||||
! ... G-vectors or PWs : betapsi(i,j) = \sum_k beta^*(i,k) psi(k,j)
|
||||
!
|
||||
USE mp_bands, ONLY : intra_bgrp_comm
|
||||
USE mp, ONLY : mp_sum, mp_size
|
||||
#if defined(__CUDA)
|
||||
USE cudafor
|
||||
USE cublas
|
||||
#endif
|
||||
IMPLICIT NONE
|
||||
COMPLEX (DP), INTENT (in) :: beta_d(:,:), psi_d(:,:)
|
||||
COMPLEX (DP), INTENT (out) :: betapsi_d(:,:)
|
||||
INTEGER, INTENT (in) :: npw
|
||||
INTEGER, OPTIONAL :: nbnd
|
||||
!
|
||||
INTEGER :: nkb, npwx, m
|
||||
!
|
||||
#if defined(__CUDA)
|
||||
attributes(device) :: beta_d, psi_d, betapsi_d
|
||||
#endif
|
||||
nkb = size (beta_d, 2)
|
||||
IF ( nkb == 0 ) RETURN
|
||||
!
|
||||
CALL start_clock( 'calbec' )
|
||||
IF ( npw == 0 ) betapsi_d(:,:)=(0.0_DP,0.0_DP)
|
||||
npwx= size (beta_d, 1)
|
||||
IF ( npwx /= size (psi_d, 1) ) CALL errore ('calbec', 'size mismatch', 1)
|
||||
IF ( npwx < npw ) CALL errore ('calbec', 'size mismatch', 2)
|
||||
IF ( present (nbnd) ) THEN
|
||||
m = nbnd
|
||||
ELSE
|
||||
m = size ( psi_d, 2)
|
||||
ENDIF
|
||||
#if defined(DEBUG)
|
||||
WRITE (*,*) 'calbec k'
|
||||
WRITE (*,*) nkb, size (betapsi_d,1) , m , size (betapsi_d, 2)
|
||||
#endif
|
||||
IF ( nkb /= size (betapsi_d,1) .or. m > size (betapsi_d, 2) ) &
|
||||
CALL errore ('calbec', 'size mismatch', 3)
|
||||
!
|
||||
IF ( m == 1 ) THEN
|
||||
!
|
||||
CALL ZGEMV( 'C', npw, nkb, (1.0_DP,0.0_DP), beta_d, npwx, psi_d, 1, &
|
||||
(0.0_DP, 0.0_DP), betapsi_d, 1 )
|
||||
!
|
||||
ELSE
|
||||
!
|
||||
CALL ZGEMM( 'C', 'N', nkb, m, npw, (1.0_DP,0.0_DP), &
|
||||
beta_d, npwx, psi_d, npwx, (0.0_DP,0.0_DP), betapsi_d, nkb )
|
||||
!
|
||||
ENDIF
|
||||
!
|
||||
IF (mp_size(intra_bgrp_comm) > 1) CALL mp_sum( betapsi_d( :, 1:m ), intra_bgrp_comm )
|
||||
!
|
||||
CALL stop_clock( 'calbec' )
|
||||
!
|
||||
RETURN
|
||||
!
|
||||
END SUBROUTINE calbec_k_gpu
|
||||
!
|
||||
!-----------------------------------------------------------------------
|
||||
SUBROUTINE calbec_nc_gpu ( npw, beta_d, psi_d, betapsi_d, nbnd )
|
||||
!-----------------------------------------------------------------------
|
||||
!
|
||||
! ... matrix times matrix with summation index (k below) running on
|
||||
! ... G-vectors or PWs corresponding to two different polarizations:
|
||||
! ... betapsi(i,1,j) = \sum_k=1,npw beta^*(i,k) psi(k,j)
|
||||
! ... betapsi(i,2,j) = \sum_k=1,npw beta^*(i,k) psi(k+npwx,j)
|
||||
!
|
||||
USE mp_bands, ONLY : intra_bgrp_comm
|
||||
USE mp, ONLY : mp_sum, mp_size
|
||||
#if defined(__CUDA)
|
||||
USE cudafor
|
||||
USE cublas
|
||||
#endif
|
||||
IMPLICIT NONE
|
||||
COMPLEX (DP), INTENT (in) :: beta_d(:,:), psi_d(:,:)
|
||||
COMPLEX (DP), INTENT (out) :: betapsi_d(:,:,:)
|
||||
INTEGER, INTENT (in) :: npw
|
||||
INTEGER, OPTIONAL :: nbnd
|
||||
!
|
||||
INTEGER :: nkb, npwx, npol, m
|
||||
!
|
||||
#if defined(__CUDA)
|
||||
attributes(device) :: beta_d, psi_d, betapsi_d
|
||||
#endif
|
||||
nkb = size (beta_d, 2)
|
||||
IF ( nkb == 0 ) RETURN
|
||||
!
|
||||
CALL start_clock ('calbec')
|
||||
IF ( npw == 0 ) betapsi_d(:,:,:)=(0.0_DP,0.0_DP)
|
||||
npwx= size (beta_d, 1)
|
||||
IF ( 2*npwx /= size (psi_d, 1) ) CALL errore ('calbec', 'size mismatch', 1)
|
||||
IF ( npwx < npw ) CALL errore ('calbec', 'size mismatch', 2)
|
||||
IF ( present (nbnd) ) THEN
|
||||
m = nbnd
|
||||
ELSE
|
||||
m = size ( psi_d, 2)
|
||||
ENDIF
|
||||
npol= size (betapsi_d, 2)
|
||||
#if defined(DEBUG)
|
||||
WRITE (*,*) 'calbec nc'
|
||||
WRITE (*,*) nkb, size (betapsi_d,1) , m , size (betapsi_d, 3)
|
||||
#endif
|
||||
IF ( nkb /= size (betapsi_d,1) .or. m > size (betapsi_d, 3) ) &
|
||||
CALL errore ('calbec', 'size mismatch', 3)
|
||||
!
|
||||
CALL ZGEMM ('C', 'N', nkb, m*npol, npw, (1.0_DP, 0.0_DP), beta_d, &
|
||||
npwx, psi_d, npwx, (0.0_DP, 0.0_DP), betapsi_d, nkb)
|
||||
!
|
||||
IF (mp_size(intra_bgrp_comm) > 1) CALL mp_sum( betapsi_d( :, :, 1:m ), intra_bgrp_comm )
|
||||
!
|
||||
CALL stop_clock( 'calbec' )
|
||||
!
|
||||
RETURN
|
||||
!
|
||||
END SUBROUTINE calbec_nc_gpu
|
||||
!
|
||||
!
|
||||
!-----------------------------------------------------------------------
|
||||
FUNCTION is_allocated_bec_type_gpu (bec_d) RESULT (isalloc)
|
||||
!-----------------------------------------------------------------------
|
||||
IMPLICIT NONE
|
||||
TYPE (bec_type_d) :: bec_d
|
||||
LOGICAL :: isalloc
|
||||
isalloc = (allocated(bec_d%r_d) .or. allocated(bec_d%nc_d) .or. allocated(bec_d%k_d))
|
||||
RETURN
|
||||
!
|
||||
!-----------------------------------------------------------------------
|
||||
END FUNCTION is_allocated_bec_type_gpu
|
||||
!-----------------------------------------------------------------------
|
||||
!
|
||||
!-----------------------------------------------------------------------
|
||||
SUBROUTINE allocate_bec_type_gpu ( nkb, nbnd, bec_d, comm )
|
||||
!-----------------------------------------------------------------------
|
||||
USE mp, ONLY: mp_size, mp_rank, mp_get_comm_null
|
||||
USE device_memcpy_m, ONLY : dev_memset
|
||||
IMPLICIT NONE
|
||||
TYPE (bec_type_d) :: bec_d
|
||||
INTEGER, INTENT (in) :: nkb, nbnd
|
||||
INTEGER, INTENT (in), OPTIONAL :: comm
|
||||
INTEGER :: ierr, nbnd_siz
|
||||
INTEGER, EXTERNAL :: ldim_block, gind_block
|
||||
!
|
||||
nbnd_siz = nbnd
|
||||
bec_d%comm = mp_get_comm_null()
|
||||
bec_d%nbnd = nbnd
|
||||
bec_d%mype = 0
|
||||
bec_d%nproc = 1
|
||||
bec_d%nbnd_loc = nbnd
|
||||
bec_d%ibnd_begin = 1
|
||||
!
|
||||
IF( PRESENT( comm ) .AND. gamma_only .AND. smallmem ) THEN
|
||||
bec_d%comm = comm
|
||||
bec_d%nproc = mp_size( comm )
|
||||
IF( bec_d%nproc > 1 ) THEN
|
||||
nbnd_siz = nbnd / bec_d%nproc
|
||||
IF( MOD( nbnd, bec_d%nproc ) /= 0 ) nbnd_siz = nbnd_siz + 1
|
||||
bec_d%mype = mp_rank( bec_d%comm )
|
||||
bec_d%nbnd_loc = ldim_block( bec_d%nbnd , bec_d%nproc, bec_d%mype )
|
||||
bec_d%ibnd_begin = gind_block( 1, bec_d%nbnd, bec_d%nproc, bec_d%mype )
|
||||
END IF
|
||||
END IF
|
||||
!
|
||||
IF ( gamma_only ) THEN
|
||||
!
|
||||
ALLOCATE( bec_d%r_d( nkb, nbnd_siz ), STAT=ierr )
|
||||
IF( ierr /= 0 ) &
|
||||
CALL errore( ' allocate_bec_type ', ' cannot allocate bec_d%r ', ABS(ierr) )
|
||||
!
|
||||
CALL dev_memset(bec_d%r_d, 0.0D0, (/1,nkb/), 1, (/1, nbnd_siz/), 1)
|
||||
!
|
||||
ELSEIF ( noncolin) THEN
|
||||
!
|
||||
ALLOCATE( bec_d%nc_d( nkb, npol, nbnd_siz ), STAT=ierr )
|
||||
IF( ierr /= 0 ) &
|
||||
CALL errore( ' allocate_bec_type ', ' cannot allocate bec_d%nc ', ABS(ierr) )
|
||||
!
|
||||
CALL dev_memset(bec_d%nc_d, (0.0D0,0.0D0), (/1, nkb/), 1, (/1, npol/), 1, (/1, nbnd_siz/), 1)
|
||||
!
|
||||
ELSE
|
||||
!
|
||||
ALLOCATE( bec_d%k_d( nkb, nbnd_siz ), STAT=ierr )
|
||||
IF( ierr /= 0 ) &
|
||||
CALL errore( ' allocate_bec_type ', ' cannot allocate bec_d%k ', ABS(ierr) )
|
||||
!
|
||||
CALL dev_memset(bec_d%k_d, (0.0D0,0.0D0), (/1, nkb/), 1, (/1, npol/), 1)
|
||||
!
|
||||
ENDIF
|
||||
!
|
||||
RETURN
|
||||
!
|
||||
END SUBROUTINE allocate_bec_type_gpu
|
||||
!-----------------------------------------------------------------------
|
||||
!
|
||||
!-----------------------------------------------------------------------
|
||||
SUBROUTINE deallocate_bec_type_gpu (bec_d)
|
||||
!-----------------------------------------------------------------------
|
||||
!
|
||||
USE mp, ONLY: mp_get_comm_null
|
||||
IMPLICIT NONE
|
||||
TYPE (bec_type_d) :: bec_d
|
||||
!
|
||||
bec_d%comm = mp_get_comm_null()
|
||||
bec_d%nbnd = 0
|
||||
!
|
||||
IF (allocated(bec_d%r_d)) DEALLOCATE(bec_d%r_d)
|
||||
IF (allocated(bec_d%nc_d)) DEALLOCATE(bec_d%nc_d)
|
||||
IF (allocated(bec_d%k_d)) DEALLOCATE(bec_d%k_d)
|
||||
!
|
||||
RETURN
|
||||
!
|
||||
END SUBROUTINE deallocate_bec_type_gpu
|
||||
!-----------------------------------------------------------------------
|
||||
!
|
||||
!-----------------------------------------------------------------------
|
||||
SUBROUTINE synchronize_bec_type_gpu (bec_d, bec, what)
|
||||
!-----------------------------------------------------------------------
|
||||
!
|
||||
! ... Updates a device or host version of a bec_type variable.
|
||||
! ... Direction 'h' updates host version, direction 'd' the device
|
||||
! ... version.
|
||||
!
|
||||
USE mp, ONLY: mp_get_comm_null
|
||||
USE becmod, ONLY : bec_type
|
||||
IMPLICIT NONE
|
||||
TYPE (bec_type_d) :: bec_d
|
||||
TYPE (bec_type) :: bec
|
||||
CHARACTER, INTENT(IN) :: what
|
||||
!
|
||||
IF ( gamma_only ) THEN
|
||||
!
|
||||
IF (.not. (allocated(bec_d%r_d) .and. allocated(bec%r))) &
|
||||
CALL errore('becmod_gpu', 'Unallocated array',1)
|
||||
SELECT CASE(what)
|
||||
CASE('d')
|
||||
bec_d%r_d = bec%r
|
||||
CASE('h')
|
||||
bec%r = bec_d%r_d
|
||||
CASE DEFAULT
|
||||
CALL errore('becmod_gpu', 'Invalid command',2)
|
||||
END SELECT
|
||||
!
|
||||
ELSEIF ( noncolin) THEN
|
||||
!
|
||||
IF (.not. (allocated(bec_d%nc_d) .and. allocated(bec%nc))) &
|
||||
CALL errore('becmod_gpu', 'Unallocated array',3)
|
||||
SELECT CASE(what)
|
||||
CASE('d')
|
||||
bec_d%nc_d = bec%nc
|
||||
CASE('h')
|
||||
bec%nc = bec_d%nc_d
|
||||
CASE DEFAULT
|
||||
CALL errore('becmod_gpu', 'Invalid command',4)
|
||||
END SELECT
|
||||
!
|
||||
ELSE
|
||||
!
|
||||
IF (.not. (allocated(bec_d%k_d) .and. allocated(bec%k))) &
|
||||
CALL errore('becmod_gpu', 'Unallocated array',5)
|
||||
SELECT CASE(what)
|
||||
CASE('d')
|
||||
bec_d%k_d = bec%k
|
||||
CASE('h')
|
||||
bec%k = bec_d%k_d
|
||||
CASE DEFAULT
|
||||
CALL errore('becmod_gpu', 'Invalid command',6)
|
||||
END SELECT
|
||||
!
|
||||
ENDIF
|
||||
!
|
||||
RETURN
|
||||
!
|
||||
END SUBROUTINE synchronize_bec_type_gpu
|
||||
!-----------------------------------------------------------------------
|
||||
!
|
||||
!-----------------------------------------------------------------------
|
||||
SUBROUTINE beccopy_gpu(bec, bec1, nkb, nbnd)
|
||||
#if defined(__CUDA)
|
||||
USE cudafor
|
||||
USE cublas
|
||||
#endif
|
||||
IMPLICIT NONE
|
||||
TYPE(bec_type_d), INTENT(in) :: bec
|
||||
TYPE(bec_type_d) :: bec1
|
||||
INTEGER, INTENT(in) :: nkb, nbnd
|
||||
|
||||
IF (gamma_only) THEN
|
||||
CALL dcopy(nkb*nbnd, bec%r_d, 1, bec1%r_d, 1)
|
||||
ELSEIF (noncolin) THEN
|
||||
CALL zcopy(nkb*npol*nbnd, bec%nc_d, 1, bec1%nc_d, 1)
|
||||
ELSE
|
||||
CALL zcopy(nkb*nbnd, bec%k_d, 1, bec1%k_d, 1)
|
||||
ENDIF
|
||||
|
||||
RETURN
|
||||
END SUBROUTINE beccopy_gpu
|
||||
|
||||
SUBROUTINE becscal_nck_gpu(alpha, bec_d, nkb, nbnd)
|
||||
#if defined(__CUDA)
|
||||
USE cudafor
|
||||
USE cublas
|
||||
#endif
|
||||
IMPLICIT NONE
|
||||
TYPE(bec_type_d), INTENT(INOUT) :: bec_d
|
||||
COMPLEX(DP), INTENT(IN) :: alpha
|
||||
INTEGER, INTENT(IN) :: nkb, nbnd
|
||||
IF (gamma_only) THEN
|
||||
CALL errore('becscal_nck','called in the wrong case',1)
|
||||
ELSEIF (noncolin) THEN
|
||||
CALL zscal(nkb*npol*nbnd, alpha, bec_d%nc_d, 1)
|
||||
ELSE
|
||||
CALL zscal(nkb*nbnd, alpha, bec_d%k_d, 1)
|
||||
ENDIF
|
||||
|
||||
RETURN
|
||||
END SUBROUTINE becscal_nck_gpu
|
||||
|
||||
SUBROUTINE becscal_gamma_gpu(alpha, bec_d, nkb, nbnd)
|
||||
#if defined(__CUDA)
|
||||
USE cudafor
|
||||
USE cublas
|
||||
#endif
|
||||
IMPLICIT NONE
|
||||
TYPE(bec_type_d), INTENT(INOUT) :: bec_d
|
||||
REAL(DP), INTENT(IN) :: alpha
|
||||
INTEGER, INTENT(IN) :: nkb, nbnd
|
||||
|
||||
IF (gamma_only) THEN
|
||||
CALL dscal(nkb*nbnd, alpha, bec_d%r_d, 1)
|
||||
ELSE
|
||||
CALL errore('becscal_gamma','called in the wrong case',1)
|
||||
ENDIF
|
||||
|
||||
RETURN
|
||||
END SUBROUTINE becscal_gamma_gpu
|
||||
!
|
||||
SUBROUTINE using_becp_auto(intento)
|
||||
USE becmod_gpum, ONLY : using_becp_r
|
||||
USE becmod_gpum, ONLY : using_becp_k
|
||||
USE becmod_gpum, ONLY : using_becp_nc
|
||||
IMPLICIT NONE
|
||||
INTEGER, INTENT(IN) :: intento
|
||||
!
|
||||
!
|
||||
IF ( gamma_only ) THEN
|
||||
!
|
||||
CALL using_becp_r(intento)
|
||||
!
|
||||
ELSEIF ( noncolin) THEN
|
||||
!
|
||||
CALL using_becp_nc(intento)
|
||||
!
|
||||
ELSE
|
||||
!
|
||||
CALL using_becp_k(intento)
|
||||
!
|
||||
ENDIF
|
||||
END SUBROUTINE using_becp_auto
|
||||
!
|
||||
SUBROUTINE using_becp_d_auto(intento)
|
||||
USE becmod_gpum, ONLY : using_becp_r_d
|
||||
USE becmod_gpum, ONLY : using_becp_k_d
|
||||
USE becmod_gpum, ONLY : using_becp_nc_d
|
||||
IMPLICIT NONE
|
||||
INTEGER, INTENT(IN) :: intento
|
||||
!
|
||||
!
|
||||
IF ( gamma_only ) THEN
|
||||
!
|
||||
CALL using_becp_r_d(intento)
|
||||
!
|
||||
ELSEIF ( noncolin) THEN
|
||||
!
|
||||
CALL using_becp_nc_d(intento)
|
||||
!
|
||||
ELSE
|
||||
!
|
||||
CALL using_becp_k_d(intento)
|
||||
!
|
||||
ENDIF
|
||||
END SUBROUTINE using_becp_d_auto
|
||||
|
||||
|
||||
END MODULE becmod_subs_gpum
|
|
@ -38,6 +38,7 @@ MODULE command_line_options
|
|||
INTEGER :: nargs = 0
|
||||
! ... QE arguments read from command line
|
||||
INTEGER :: nimage_= 1, npool_= 1, ndiag_ = 0, nband_= 1, ntg_= 1, nyfft_ = 1, nmany_ = 1
|
||||
LOGICAL :: pencil_decomposition_ = .false.
|
||||
! ... Indicate if using library init
|
||||
LOGICAL :: library_init = .FALSE.
|
||||
! ... input file name read from command line
|
||||
|
@ -125,6 +126,14 @@ CONTAINS
|
|||
READ ( arg, *, ERR = 15, END = 15) ntg_ ! read the argument as ntg_
|
||||
nyfft_ = ntg_ ! set nyfft_ equal to ntg_
|
||||
narg = narg + 1
|
||||
CASE ( '-pd', 'use_pd', '-pencil_decomposition', '-use_pencil_decomposition' )
|
||||
IF (read_string) THEN
|
||||
CALL my_getarg ( input_command_line, narg, arg )
|
||||
ELSE
|
||||
CALL get_command_argument ( narg, arg )
|
||||
ENDIF
|
||||
READ ( arg, *, ERR = 15, END = 15) pencil_decomposition_
|
||||
narg = narg + 1
|
||||
CASE ( '-nb', '-nband', '-nbgrp', '-nband_group')
|
||||
IF (read_string) THEN
|
||||
CALL my_getarg ( input_command_line, narg, arg )
|
||||
|
@ -170,6 +179,7 @@ CONTAINS
|
|||
CALL mp_bcast( nyfft_ , root, world_comm )
|
||||
CALL mp_bcast( nband_ , root, world_comm )
|
||||
CALL mp_bcast( ndiag_ , root, world_comm )
|
||||
CALL mp_bcast( pencil_decomposition_ , root, world_comm )
|
||||
|
||||
END SUBROUTINE get_command_line
|
||||
!
|
||||
|
@ -216,17 +226,18 @@ CONTAINS
|
|||
|
||||
END SUBROUTINE my_getarg
|
||||
|
||||
SUBROUTINE set_command_line ( nimage, npool, ntg, nmany, nyfft, nband, ndiag)
|
||||
SUBROUTINE set_command_line ( nimage, npool, ntg, nmany, nyfft, nband, ndiag, pencil_decomposition)
|
||||
! directly set command line options without going through the command line
|
||||
IMPLICIT NONE
|
||||
|
||||
INTEGER, INTENT(IN), OPTIONAL :: nimage, npool, ntg, nmany, nyfft, nband, ndiag
|
||||
INTEGER, INTENT(IN), OPTIONAL :: nimage, npool, ntg, nmany, nyfft, nband, ndiag, pencil_decomposition
|
||||
!
|
||||
IF ( PRESENT(nimage) ) nimage_ = nimage
|
||||
IF ( PRESENT(npool) ) npool_ = npool
|
||||
IF ( PRESENT(nyfft) ) nyfft_ = nyfft
|
||||
IF ( PRESENT(nband) ) nband_ = nband
|
||||
IF ( PRESENT(ndiag) ) ndiag_ = ndiag
|
||||
IF ( PRESENT(pencil_decomposition) ) pencil_decomposition_ = pencil_decomposition
|
||||
IF ( PRESENT(ntg) .and. PRESENT(nmany) ) THEN
|
||||
! ERROR!!!!
|
||||
ELSEIF ( PRESENT(ntg) ) THEN
|
||||
|
|
|
@ -250,6 +250,12 @@ MODULE control_flags
|
|||
LOGICAL, PUBLIC :: &
|
||||
do_makov_payne = .FALSE. ! if .TRUE. makov-payne correction for isolated
|
||||
! system is used
|
||||
LOGICAL, PUBLIC :: &
|
||||
use_gpu = .FALSE. ! if .TRUE. selects the accelerated version of the subroutines
|
||||
! when available
|
||||
INTEGER, PUBLIC :: &
|
||||
many_fft = 16 ! the size of FFT batches in vloc_psi and
|
||||
! sumband. Only use in accelerated subroutines.
|
||||
!
|
||||
INTEGER :: ortho_max = 0 ! maximum number of iterations in routine ortho
|
||||
REAL(DP) :: ortho_eps = 0.0_DP ! threshold for convergence in routine ortho
|
||||
|
@ -276,6 +282,7 @@ MODULE control_flags
|
|||
|
||||
LOGICAL, PUBLIC :: treinit_gvecs = .FALSE.
|
||||
|
||||
LOGICAL, PUBLIC :: diagonalize_on_host = .FALSE.
|
||||
!
|
||||
! ... end of module-scope declarations
|
||||
!
|
||||
|
|
|
@ -0,0 +1,39 @@
|
|||
!----------------------------------------------
|
||||
! ... this file contains a number of subroutines optionally interfaced
|
||||
! ... to cublas
|
||||
!----------------------------------------------
|
||||
|
||||
SUBROUTINE cudaDGEMV(TRANS,M,N,ALPHA,A,LDA,X,INCX,BETA,Y,INCY)
|
||||
#if defined(__CUDA)
|
||||
use cudafor
|
||||
use cublas
|
||||
#endif
|
||||
implicit none
|
||||
DOUBLE PRECISION :: ALPHA,BETA
|
||||
INTEGER :: INCX,INCY,LDA,M,N
|
||||
CHARACTER :: TRANS
|
||||
DOUBLE PRECISION :: A(LDA,*),X(*),Y(*)
|
||||
#if defined(__CUDA)
|
||||
attributes(device) :: A, X, Y
|
||||
#endif
|
||||
!
|
||||
call DGEMV(TRANS,M,N,ALPHA,A,LDA,X,INCX,BETA,Y,INCY)
|
||||
!
|
||||
END SUBROUTINE cudaDGEMV
|
||||
|
||||
SUBROUTINE cudaDGER ( M, N, ALPHA, X, INCX, Y, INCY, A, LDA )
|
||||
#if defined(__CUDA)
|
||||
use cudafor
|
||||
use cublas
|
||||
#endif
|
||||
! .. Scalar Arguments ..
|
||||
DOUBLE PRECISION :: ALPHA
|
||||
INTEGER :: INCX, INCY, LDA, M, N
|
||||
! .. Array Arguments ..
|
||||
DOUBLE PRECISION :: A( LDA, * ), X( * ), Y( * )
|
||||
#if defined(__CUDA)
|
||||
attributes(device) :: A, X, Y
|
||||
#endif
|
||||
CALL DGER ( M, N, ALPHA, X, INCX, Y, INCY, A, LDA )
|
||||
|
||||
END SUBROUTINE cudaDGER
|
|
@ -0,0 +1,111 @@
|
|||
!
|
||||
! Copyright (C) 2001 PWSCF group
|
||||
! This file is distributed under the terms of the
|
||||
! GNU General Public License. See the file `License'
|
||||
! in the root directory of the present distribution,
|
||||
! or http://www.gnu.org/copyleft/gpl.txt .
|
||||
!
|
||||
!-----------------------------------------------------------------------
|
||||
SUBROUTINE dylmr2_gpu( nylm, ngy, g_d, gg_d, dylm_d, ipol )
|
||||
!-----------------------------------------------------------------------
|
||||
!! Compute \partial Y_lm(G) \over \partial (G)_ipol
|
||||
!! using simple numerical derivation (SdG).
|
||||
!! The spherical harmonics are calculated in ylmr2.
|
||||
!
|
||||
USE kinds, ONLY : DP
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
INTEGER, INTENT(IN) :: nylm
|
||||
! input: number of spherical harmonics
|
||||
INTEGER, INTENT(IN) :: ngy
|
||||
! input: the number of g vectors to compute
|
||||
INTEGER, INTENT(IN) :: ipol
|
||||
! input: desired polarization
|
||||
REAL(DP), INTENT(IN) :: g_d(3,ngy)
|
||||
!! input: the coordinates of g vectors
|
||||
REAL(DP), INTENT(IN) :: gg_d(ngy)
|
||||
!! input: the moduli of g vectors
|
||||
REAL(DP), INTENT(OUT) :: dylm_d(ngy,nylm)
|
||||
!! output: the spherical harmonics derivatives
|
||||
!
|
||||
! ... local variables
|
||||
!
|
||||
INTEGER :: ig, lm, i
|
||||
! counter on g vectors
|
||||
! counter on l,m component
|
||||
!
|
||||
INTEGER :: apol, bpol
|
||||
!
|
||||
REAL(DP), PARAMETER :: delta = 1.E-6_DP
|
||||
REAL(DP), ALLOCATABLE :: dg_d(:), dgi_d(:), gx_d(:,:)
|
||||
REAL(DP), ALLOCATABLE :: ggx_d(:), ylmaux_d(:,:)
|
||||
! dg is the finite increment for numerical derivation:
|
||||
! dg = delta |G| = delta * sqrt(gg)
|
||||
! dgi= 1 /(delta * sqrt(gg))
|
||||
! gx = g +/- dg
|
||||
! ggx = gx^2
|
||||
!
|
||||
#if defined(__CUDA)
|
||||
attributes(DEVICE) :: g_d, gg_d, dylm_d, gx_d, ggx_d, dg_d, &
|
||||
dgi_d, ylmaux_d
|
||||
#endif
|
||||
!
|
||||
|
||||
!
|
||||
ALLOCATE( gx_d(3,ngy), ggx_d(ngy), dg_d(ngy) )
|
||||
ALLOCATE( dgi_d(ngy), ylmaux_d(ngy,nylm) )
|
||||
|
||||
!$cuf kernel do (1) <<<*,*>>>
|
||||
DO ig = 1, ngy
|
||||
dg_d(ig) = delta * SQRT(gg_d(ig) )
|
||||
IF (gg_d(ig) > 1.E-9_DP) THEN
|
||||
dgi_d(ig) = 1._DP / dg_d(ig)
|
||||
ELSE
|
||||
dgi_d(ig) = 0._DP
|
||||
ENDIF
|
||||
ENDDO
|
||||
!
|
||||
IF ( ipol==1 ) THEN
|
||||
apol = 2 ; bpol = 3
|
||||
ELSEIF ( ipol==2 ) THEN
|
||||
apol = 1 ; bpol = 3
|
||||
ELSEIF ( ipol==3 ) THEN
|
||||
apol = 1 ; bpol = 2
|
||||
ENDIF
|
||||
!
|
||||
!$cuf kernel do (1) <<<*,*>>>
|
||||
DO ig = 1, ngy
|
||||
gx_d(apol,ig) = g_d(apol,ig)
|
||||
gx_d(bpol,ig) = g_d(bpol,ig)
|
||||
gx_d(ipol,ig) = g_d(ipol,ig) + dg_d(ig)
|
||||
ggx_d(ig) = gx_d(1,ig) * gx_d(1,ig) + &
|
||||
gx_d(2,ig) * gx_d(2,ig) + &
|
||||
gx_d(3,ig) * gx_d(3,ig)
|
||||
ENDDO
|
||||
!
|
||||
CALL ylmr2_gpu( nylm, ngy, gx_d, ggx_d, dylm_d )
|
||||
!
|
||||
!$cuf kernel do (1) <<<*,*>>>
|
||||
DO ig = 1, ngy
|
||||
gx_d(ipol,ig) = g_d(ipol,ig) - dg_d(ig)
|
||||
ggx_d(ig) = gx_d(1,ig) * gx_d(1,ig) + &
|
||||
gx_d(2,ig) * gx_d(2,ig) + &
|
||||
gx_d(3,ig) * gx_d(3,ig)
|
||||
ENDDO
|
||||
!
|
||||
CALL ylmr2_gpu( nylm, ngy, gx_d, ggx_d, ylmaux_d )
|
||||
!
|
||||
!$cuf kernel do (2) <<<*,*>>>
|
||||
DO lm = 1, nylm
|
||||
DO ig = 1, ngy
|
||||
dylm_d(ig,lm) = (dylm_d(ig,lm)-ylmaux_d(ig,lm)) * 0.5_DP * dgi_d(ig)
|
||||
ENDDO
|
||||
ENDDO
|
||||
!
|
||||
DEALLOCATE( gx_d, ggx_d, dg_d, dgi_d, ylmaux_d )
|
||||
!
|
||||
RETURN
|
||||
!
|
||||
END SUBROUTINE dylmr2_gpu
|
||||
|
|
@ -10,6 +10,9 @@
|
|||
!------------------------------------------------------------------------------!
|
||||
|
||||
USE kinds, ONLY: DP
|
||||
#if defined (__CUDA)
|
||||
USE cudafor
|
||||
#endif
|
||||
!
|
||||
IMPLICIT NONE
|
||||
SAVE
|
||||
|
@ -43,9 +46,13 @@
|
|||
REAL(DP) :: qbac = 0.0_DP ! background neutralizing charge
|
||||
INTEGER, ALLOCATABLE :: ispin(:) ! spin of each state
|
||||
|
||||
REAL(DP), ALLOCATABLE :: f_bgrp(:) ! occupation numbers ( at gamma )
|
||||
REAL(DP), ALLOCATABLE :: f_bgrp(:) ! occupation numbers ( at gamma )
|
||||
REAL(DP), ALLOCATABLE :: f_d(:) ! occupation numbers ( at gamma )
|
||||
INTEGER, ALLOCATABLE :: ispin_bgrp(:) ! spin of each state
|
||||
INTEGER, ALLOCATABLE :: ibgrp_g2l(:) ! local index of the i-th global band index
|
||||
#if defined (__CUDA)
|
||||
ATTRIBUTES( DEVICE ) :: f_d
|
||||
#endif
|
||||
!
|
||||
!------------------------------------------------------------------------------!
|
||||
CONTAINS
|
||||
|
@ -397,6 +404,7 @@
|
|||
IF( ALLOCATED( f ) ) DEALLOCATE( f )
|
||||
IF( ALLOCATED( ispin ) ) DEALLOCATE( ispin )
|
||||
IF( ALLOCATED( f_bgrp ) ) DEALLOCATE( f_bgrp )
|
||||
IF( ALLOCATED( f_d ) ) DEALLOCATE( f_d )
|
||||
IF( ALLOCATED( ispin_bgrp ) ) DEALLOCATE( ispin_bgrp )
|
||||
IF( ALLOCATED( ibgrp_g2l ) ) DEALLOCATE( ibgrp_g2l )
|
||||
telectrons_base_initval = .FALSE.
|
||||
|
@ -455,6 +463,10 @@
|
|||
END DO
|
||||
END DO
|
||||
|
||||
#if defined (__CUDA)
|
||||
ALLOCATE( f_d, SOURCE = f_bgrp )
|
||||
#endif
|
||||
|
||||
RETURN
|
||||
|
||||
END SUBROUTINE distribute_bands
|
||||
|
|
|
@ -174,6 +174,8 @@ CONTAINS
|
|||
& "395502 (2009);", &
|
||||
&/9X,"""P. Giannozzi et al., J. Phys.:Condens. Matter 29 ",&
|
||||
& "465901 (2017);", &
|
||||
&/9X,"""P. Giannozzi et al., J. Chem. Phys. 152 ",&
|
||||
& "154105 (2020);", &
|
||||
&/9X," URL http://www.quantum-espresso.org"", ", &
|
||||
&/5X,"in publications or presentations arising from this work. More details at",&
|
||||
&/5x,"http://www.quantum-espresso.org/quote")' )
|
||||
|
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue