Merge branch 'merge_qegpu' into HEAD

This commit is contained in:
Pietro Delugas 2021-01-22 17:20:50 +01:00
commit db0da8b0d9
277 changed files with 43022 additions and 2109 deletions

View File

@ -1,104 +1,19 @@
# ====================
# UtilXlib UnitTesting
# ====================
build:cudampiomp:
tags: [galileo]
only:
- /utilxlib/i
script:
- module load profile/advanced pgi/17.10 cuda/8.0.61
- ./configure --enable-openmp
- cd UtilXlib/tests
- bash compile_and_run_tests.sh -smcn
build:intelmpiomp:
tags: [galileo]
only:
- /utilxlib/i
script:
- module load profile/advanced intel intelmpi
- ./configure --enable-openmp
- cd UtilXlib/tests
- bash compile_and_run_tests.sh -sm
build:cudampi:
tags: [galileo]
only:
- /utilxlib/i
script:
- module load profile/advanced pgi/17.10 cuda/8.0.61
- ./configure
- cd UtilXlib/tests
- bash compile_and_run_tests.sh -smcn
build:intelmpi:
tags: [galileo]
only:
- /utilxlib/i
script:
- module load profile/advanced intel intelmpi
- ./configure
- cd UtilXlib/tests
- bash compile_and_run_tests.sh -sm
build:pw:
image: ubuntu:latest
tags: [docker]
image: espressofoundation/ubuntu:latest
script:
- apt-get update
- apt-get install --yes build-essential gfortran wget python
- apt-get install --yes libopenmpi-dev openmpi-bin
- apt-get install --yes libblas-dev liblapack-dev fftw3 fftw3-dev pkg-config
- ./configure
- make pw
# - export OMP_NUM_THREADS=1
# - cd test-suite
# - mkdir /tmp/save
# - make run-tests-pw-serial
# - rm -r /tmp/save
build:cp:
image: ubuntu:latest
script:
- apt-get update
- apt-get install --yes build-essential gfortran wget python
- apt-get install --yes libopenmpi-dev openmpi-bin
- apt-get install --yes libblas-dev liblapack-dev fftw3 fftw3-dev pkg-config
- ./configure
- make cp
# - export OMP_NUM_THREADS=1
# - cd test-suite
# - mkdir /tmp/save
# - make run-tests-cp-serial
# - rm -r /tmp/save
#### BUILDS ON GALILEO ####
build:intel:
tags: [galileo]
intel parallel:
tags: [intel]
script:
- module purge
- module load intel/pe-xe-2018--binary intelmpi/2018--binary mkl/2018--binary
- module list
- module load intel/pe-xe-2018--binary intelmpi/2018--binary mkl/2018--binary python
- ./configure --enable-openmp
- make pw cp
- export OMP_NUM_THREADS=2
- cd test-suite
- mkdir -p /tmp/save
- make run-tests-pw-parallel
- make run-tests-cp-parallel
- rm -r /tmp/save
pgi cuda mpi:
tags: [k80]
script:
- module purge
- module load profile/global pgi/17.10 mkl/2018--binary cuda/8.0.61
- ./configure --enable-openmp --with-cuda-runtime=8.0 --with-cuda-cc=35 --with-cuda=$CUDA_HOME --with-scalapack=no
- make -j pw
- make -j pw cp
- if [ -z ${SLURM_CPUS_PER_TASK} ]; then export OMP_NUM_THREADS=1; else export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK; fi
- echo "Using $SLURM_NTASKS procs and $OMP_NUM_THREADS threads"
- cd test-suite
@ -107,3 +22,163 @@ pgi cuda mpi:
- make run-tests-pw-parallel
- sed -i 's/export PARA_POSTFIX=" "/export PARA_POSTFIX=" -pd=.true."/g' run-pw.sh
- make clean && make run-tests-pw-parallel
pgi parallel k80:
tags: [k80]
script:
- module purge
- module load profile/global hpc-sdk/20.9--binary mkl/2019--binary python
- ./configure --enable-openmp --with-cuda-runtime=10.1 --with-cuda-cc=35 --with-cuda=yes --enable-cuda-env-check=no --with-scalapack=no
- make -j pw cp
- if [ -z ${SLURM_CPUS_PER_TASK} ]; then export OMP_NUM_THREADS=1; else export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK; fi
- echo "Using $SLURM_NTASKS procs and $OMP_NUM_THREADS threads"
- cd test-suite
- sed -i "s/TESTCODE_NPROCS=4/TESTCODE_NPROCS=$SLURM_NTASKS/" ENVIRONMENT
- make clean
- make run-tests-pw-parallel
- sed -i 's/export PARA_POSTFIX=" "/export PARA_POSTFIX=" -pd=.true."/g' run-pw.sh
- make clean && make run-tests-pw-parallel
- cd .. && cp PW/src/pw.x ./pwgpu-mpi-cuda8-cc35-${CI_COMMIT_SHA:0:8}.x
pgi parallel v100:
tags: [galileo,v100]
script:
- module purge
- module load profile/global hpc-sdk/20.9--binary mkl/2019--binary cuda/10.1 python
- ./configure --enable-openmp --with-cuda-runtime=10.1 --with-cuda-cc=70 --with-cuda=$CUDA_HOME --with-scalapack=no
- make -j pw cp
- if [ -z ${SLURM_CPUS_PER_TASK} ]; then export OMP_NUM_THREADS=1; else export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK; fi
- echo "Using $SLURM_NTASKS procs and $OMP_NUM_THREADS threads"
- cd test-suite
- sed -i "s/TESTCODE_NPROCS=4/TESTCODE_NPROCS=$SLURM_NTASKS/" ENVIRONMENT
- make clean
- make run-tests-pw-parallel
- sed -i 's/export PARA_POSTFIX=" "/export PARA_POSTFIX=" -pd=.true."/g' run-pw.sh
- make clean && make run-tests-pw-parallel
pgi serial k80:
tags: [k80]
script:
- module purge
- module load profile/global hpc-sdk/20.9--binary mkl/2019--binary python
- ./configure --disable-parallel --enable-openmp --with-cuda-runtime=10.1 --with-cuda-cc=35 --with-cuda=yes --enable-cuda-env-check=no
- make -j pw cp
- if [ -z ${SLURM_CPUS_PER_TASK} ]; then export OMP_NUM_THREADS=1; else export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK; fi
- echo "Using $OMP_NUM_THREADS threads"
- cd test-suite
- make clean
- make run-tests-pw-serial
- make run-tests-cp-serial
pgi serial v100:
tags: [galileo,v100]
script:
- module purge
- module load profile/global hpc-sdk/20.9--binary mkl/2019--binary python
- ./configure --disable-parallel --enable-openmp --with-cuda-runtime=10.1 --with-cuda-cc=70 --with-cuda=yes --enable-cuda-env-check=no
- make -j pw cp
- if [ -z ${SLURM_CPUS_PER_TASK} ]; then export OMP_NUM_THREADS=1; else export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK; fi
- echo "Using $OMP_NUM_THREADS threads"
- cd test-suite
- make clean
- make run-tests-pw-serial
- make run-tests-cp-serial
pgi207 power v100:
tags: [marconi100]
script:
- module purge
- module load profile/global hpc-sdk/2020--binary cuda/10.1 spectrum_mpi/10.3.1--binary python/3.8.2
- pwd
- cd ..
- ./configure CC=pgcc F77=pgf90 FC=pgf90 F90=pgf90 MPIF90=mpipgifort --enable-openmp --with-cuda=$CUDA_ROOT --with-cuda-runtime=10.1 --with-cuda-cc=70
- make -j pw cp
- salloc --nodes=1 --ntasks-per-node=4 --ntasks-per-socket=2 --cpus-per-task=32 --gres=gpu:4 --mem=230000MB --time 01:00:00 -A $CINECA_QE_ACCOUNT -p m100_usr_prod
- if [ -z ${SLURM_CPUS_PER_TASK} ]; then export OMP_NUM_THREADS=1; else export OMP_NUM_THREADS=8; fi
- echo "Using $OMP_NUM_THREADS threads"
- cd test-suite
- make clean
- make run-tests-pw-parallel
- make run-tests-cp-parallel
- exit
#pgi cuda9.2 p100 nollvm:
# tags: [p100]
# script:
# - module purge
# - module load profile/global pgi/19.10--binary cuda/10.0
# - ./configure --enable-openmp --with-cuda-runtime=10.0 --with-cuda-cc=60 --with-cuda=$CUDA_HOME --with-scalapack=no
# - sed -i 's/traditional/traditional -Uvector/' make.inc # works around a PGI compiler problem in parsing comments.
# - sed -i 's/cuda10.0/cuda10.0,nollvm/' make.inc # this is needed for some reason yet to be clarified.
# - make -j pw
# - if [ -z ${SLURM_CPUS_PER_TASK} ]; then export OMP_NUM_THREADS=1; else export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK; fi
# - echo "Using $OMP_NUM_THREADS threads"
# - cd test-suite
# - make clean
# - make run-tests-pw-parallel
#
#pgi cuda9.2 p100:
# tags: [p100]
# script:
# - module purge
# - module load profile/global pgi/19.10--binary cuda/10.0
# - ./configure --enable-openmp --with-cuda-runtime=10.0 --with-cuda-cc=60 --with-cuda=$CUDA_HOME --with-scalapack=no
# - sed -i 's/traditional/traditional -Uvector/' make.inc # works around a PGI compiler problem in parsing comments.
# - make -j pw
# - if [ -z ${SLURM_CPUS_PER_TASK} ]; then export OMP_NUM_THREADS=1; else export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK; fi
# - echo "Using $OMP_NUM_THREADS threads"
# - cd test-suite
# - make clean
# - make run-tests-pw-parallel
#
## UtilXlib UnitTesting
#build:cudampiomp:
# tags: [galileo]
# script:
# - module load profile/advanced pgi/17.10 cuda/8.0.61
# - ./configure --enable-openmp
# - cd UtilXlib/tests
# - bash compile_and_run_tests.sh -smcn
#
#build:intelmpiomp:
# tags: [galileo]
# script:
# - module load profile/advanced intel intelmpi
# - ./configure --enable-openmp
# - cd UtilXlib/tests
# - bash compile_and_run_tests.sh -sm
#
#build:cudampi:
# tags: [galileo]
# script:
# - module load profile/advanced pgi/17.10 cuda/8.0.61
# - ./configure
# - cd UtilXlib/tests
# - bash compile_and_run_tests.sh -smcn
#
#build:intelmpi:
# tags: [galileo]
# script:
# - module load profile/advanced intel intelmpi
# - ./configure
# - cd UtilXlib/tests
# - bash compile_and_run_tests.sh -sm
#
##### BUILDS ON GALILEO ####
#
#build:laxlib-unittest:
# tags: [galileo]
# script:
# - module load profile/advanced pgi/17.10 cuda/8.0.61
# - ./configure && make pw
# - cd LAXlib/tests
# - make clean && make
# - for file in ./*.x; do mpirun -np 4 ./$file; done
# - cd ../../
# - ./configure --disable-parallel && make clean && make pw
# - cd LAXlib/tests
# - make clean && make
# - for file in ./*.x; do ./$file; done

View File

@ -22,12 +22,12 @@ build:cp:
# - make run-tests-cp-serial
# - rm -r /tmp/save
build:ph:
tags: [docker]
image: espressofoundation/ubuntu:latest
script:
- ./configure
- make ph
#build:ph:
# tags: [docker]
# image: espressofoundation/ubuntu:latest
# script:
# - ./configure
# - make ph
# - export OMP_NUM_THREADS=1
# - cd test-suite
# - mkdir /tmp/save

View File

@ -591,6 +591,12 @@ add_custom_target(gwl
qe_gww_util_grap_exe
qe_gww_util_abcoefftoeps_exe
qe_gww_util_memorypw4gww_exe
qe_gww_bse_bse_main_exe
qe_gww_gww_exe
qe_gww_gww_fit_exe
qe_gww_head_exe
qe_gww_simple_bse_exe
qe_gww_simple_ip_exe
COMMENT
"GW with Lanczos chains")

View File

@ -175,12 +175,16 @@
!orthonormalize c0
call calbec(1,nsp,eigr,c0,bec)
call calbec(nbsp, betae,c0,bec)
CALL gram_bgrp( betae, bec, nkb, c0, ngw )
!calculates phi for pcdaga
#if defined (__CUDA)
CALL errore(' runcg_uspp ', ' GPU version not yet implemented', 1 )
#else
CALL calphi_bgrp( c0, SIZE(c0,1), bec, nkb, betae, phi, nbsp )
#endif
!calculates the factors for S and K inversion in US case
if(nkbus>0) then
@ -205,13 +209,13 @@
ENERGY_CHECK: if(.not. ene_ok ) then
call calbec(1,nsp,eigr,c0,bec)
call calbec(nbsp, betae,c0,bec)
if(.not.tens) then
call rhoofr(nfi,c0(:,:),irb,eigrb,bec,dbec,rhovan,rhor,drhor,rhog,drhog,rhos,enl,denl,ekin,dekin6)
else
if(newscheme.or.firstiter) then
call inner_loop_cold( nfi, tfirst, tlast, eigr, irb, eigrb, &
call inner_loop_cold( nfi, tfirst, tlast, eigr, irb, eigrb, &
rhor, rhog, rhos, rhoc, ei1, ei2, ei3, sfac,c0,bec,dbec,firstiter,vpot)
firstiter=.false.
endif
@ -238,7 +242,7 @@
!
! put core charge (if present) in rhoc(r)
!
if (nlcc_any) call set_cc(irb,eigrb,rhoc)
if (nlcc_any) call set_cc(rhoc)
!
!---ensemble-DFT
@ -301,7 +305,7 @@
!update d
call newd(vpot,irb,eigrb,rhovan,fion)
call newd(vpot,rhovan,fion,.true.)
call prefor(eigr,betae)!ATTENTION
@ -341,13 +345,13 @@
hpsi0=hpsi
gi = hpsi
call calbec(1,nsp,eigr,hpsi,becm)
call calbec(nbsp, betae,hpsi,becm)
call xminus1(hpsi,betae,dumm,becm,s_minus1,.false.)
! call sminus1(hpsi,becm,betae)
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!check whether the following two lines are really needed
call calbec(1,nsp,eigr,hpsi,becm)
call calbec(nbsp, betae,hpsi,becm)
call pc2(c0,bec,hpsi,becm)
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
@ -357,13 +361,13 @@
else
call xminus1_state(gi,betae,ema0bg,becm,k_minus1,.true.,ave_ene)
endif
call calbec(1,nsp,eigr,gi,becm)
call calbec(nbsp, betae,gi,becm)
call pc2(c0,bec,gi,becm)
if(tens) call calcmt( nrlx, f, z0t, fmat0 )
call calbec(1,nsp,eigr,hpsi,bec0)
call calbec(nbsp, betae,hpsi,bec0)
! calculates gamma
gamma=0.d0
@ -496,7 +500,7 @@
!project hi on conduction sub-space
call calbec(1,nsp,eigr,hi,bec0)
call calbec(nbsp, betae,hi,bec0)
call pc2(c0,bec,hi,bec0)
@ -565,7 +569,7 @@
!orthonormalize
call calbec(1,nsp,eigr,cm,becm)
call calbec(nbsp, betae,cm,becm)
CALL gram_bgrp( betae, becm, nkb, cm, ngw )
!calculate energy
@ -573,7 +577,7 @@
call rhoofr(nfi,cm(:,:),irb,eigrb,becm,dbec,rhovan,rhor,drhor,rhog,drhog,rhos,enl,denl,ekin,dekin6)
else
if(newscheme) then
call inner_loop_cold( nfi, tfirst, tlast, eigr, irb, eigrb, &
call inner_loop_cold( nfi, tfirst, tlast, eigr, irb, eigrb, &
rhor, rhog, rhos, rhoc, ei1, ei2, ei3, sfac,cm,becm,dbec,.false., vpot )
endif
@ -588,7 +592,7 @@
!
! put core charge (if present) in rhoc(r)
!
if (nlcc_any) call set_cc(irb,eigrb,rhoc)
if (nlcc_any) call set_cc(rhoc)
!
vpot = rhor
!
@ -628,7 +632,7 @@
cm(1,:)=0.5d0*(cm(1,:)+CONJG(cm(1,:)))
endif
call calbec(1,nsp,eigr,cm,becm)
call calbec(nbsp, betae,cm,becm)
CALL gram_bgrp( betae, becm, nkb, cm, ngw )
!test on energy: check the energy has really diminished
@ -638,7 +642,7 @@
call rhoofr(nfi,cm(:,:),irb,eigrb,becm,dbec,rhovan,rhor,drhor,rhog,drhog,rhos,enl,denl,ekin,dekin6)
else
if(newscheme) then
call inner_loop_cold( nfi, tfirst, tlast, eigr, irb, eigrb, &
call inner_loop_cold( nfi, tfirst, tlast, eigr, irb, eigrb, &
rhor, rhog, rhos, rhoc, ei1, ei2, ei3, sfac,cm,becm,dbec,.false., vpot )
endif
! calculation of the rotated quantities
@ -652,7 +656,7 @@
!
! put core charge (if present) in rhoc(r)
!
if (nlcc_any) call set_cc(irb,eigrb,rhoc)
if (nlcc_any) call set_cc(rhoc)
!
vpot = rhor
!
@ -699,7 +703,7 @@
endif
c0=c0+spasso*passov*hi
restartcg=.true.
call calbec(1,nsp,eigr,c0,bec)
call calbec(nbsp, betae,c0,bec)
CALL gram_bgrp( betae, bec, nkb, c0, ngw )
ene_ok=.false.
!if ene1 << energy < ene0; go to ene1
@ -709,7 +713,7 @@
endif
c0=c0+spasso*passov*hi
restartcg=.true.!ATTENTION
call calbec(1,nsp,eigr,c0,bec)
call calbec(nbsp, betae,c0,bec)
CALL gram_bgrp( betae, bec, nkb, c0, ngw )
!if ene > ene0, ene1 do a steepest descent step
ene_ok=.false.
@ -725,14 +729,14 @@
cm=c0+spasso*passov*hi
! change the search direction
spasso=spasso*(-1.d0)
call calbec(1,nsp,eigr,cm,becm)
call calbec(nbsp, betae,cm,becm)
CALL gram_bgrp( betae, bec, nkb, cm, ngw )
call calbec(1,nsp,eigr,cm,becm)
call calbec(nbsp, betae,cm,becm)
if(.not.tens) then
call rhoofr(nfi,cm(:,:),irb,eigrb,becm,dbec,rhovan,rhor,drhor,rhog,drhog,rhos,enl,denl,ekin,dekin6)
else
if(newscheme) then
call inner_loop_cold( nfi, tfirst, tlast, eigr, irb, eigrb, &
call inner_loop_cold( nfi, tfirst, tlast, eigr, irb, eigrb, &
rhor, rhog, rhos, rhoc, ei1, ei2, ei3, sfac,cm,becm,dbec,.false., vpot )
endif
! calculation of the rotated quantities
@ -746,7 +750,7 @@
!
! put core charge (if present) in rhoc(r)
!
if (nlcc_any) call set_cc(irb,eigrb,rhoc)
if (nlcc_any) call set_cc(rhoc)
!
vpot = rhor
!
@ -776,10 +780,14 @@
if(tens.and.newscheme) enever=enever-entropy
if(.not. ene_ok) call calbec (1,nsp,eigr,c0,bec)
if(.not. ene_ok) call calbec (nbsp, betae,c0,bec)
#if defined (__CUDA)
CALL errore(' runcg_uspp ', ' GPU version not yet implemented', 1 )
#else
!calculates phi for pc_daga
CALL calphi_bgrp( c0, SIZE(c0,1), bec, nkb, betae, phi, nbsp )
#endif
!=======================================================================
!
@ -788,7 +796,7 @@
!
!=======================================================================
if(tens.and. .not.newscheme) then
call inner_loop_cold( nfi, tfirst, tlast, eigr, irb, eigrb, &
call inner_loop_cold( nfi, tfirst, tlast, eigr, irb, eigrb, &
rhor, rhog, rhos, rhoc, ei1, ei2, ei3, sfac,c0,bec,dbec,firstiter, vpot )
!the following sets up the new energy
enever=etot
@ -809,7 +817,7 @@
!calculates atomic forces and lambda
if(tpre) then !if pressure is needed, the following is written because of caldbec
call calbec(1,nsp,eigr,c0,bec)
call calbec(nbsp, betae,c0,bec)
if(.not.tens) then
call caldbec_bgrp( eigr, c0, dbec, idesc )
call rhoofr(nfi,c0(:,:),irb,eigrb,bec,dbec,rhovan,rhor,drhor,rhog,drhog,rhos,enl,denl,ekin,dekin6)
@ -827,7 +835,7 @@
!
! put core charge (if present) in rhoc(r)
!
if (nlcc_any) call set_cc(irb,eigrb,rhoc)
if (nlcc_any) call set_cc(rhoc)
!
!---ensemble-DFT
@ -844,12 +852,16 @@
call calcmt( nrlx, f, z0t, fmat0 )
call newd(vpot,irb,eigrb,rhovan,fion)
call newd(vpot,rhovan,fion,.true.)
#if defined (__CUDA)
CALL errore(' runcg_uspp ', ' GPU version not yet implemented', 1 )
#else
if (.not.tens) then
if (tfor .or. tprnfor) call nlfq_bgrp( c0, eigr, bec, becdr, fion ) ! call nlfq(c0,eigr,bec,becdr,fion)
if (tfor .or. tprnfor) call nlfq_bgrp( c0, betae, bec, becdr, fion )
else
if (tfor .or. tprnfor) call nlfq_bgrp( c0diag, eigr, becdiag, becdrdiag, fion ) ! call nlfq(c0diag,eigr,becdiag,becdrdiag,fion)
if (tfor .or. tprnfor) call nlfq_bgrp( c0diag, betae, becdiag, becdrdiag, fion )
endif
#endif
call prefor(eigr,betae)
do i=1,nbsp,2
@ -955,9 +967,9 @@
enddo
enddo
call calbec (1,nsp,eigr,c0,bec)
call calbec (nbsp,betae,c0,bec)
CALL gram_bgrp( betae, bec, nkb, c0, ngw )
call calbec(1,nsp,eigr,c0,bec)
call calbec(nbsp, betae,c0,bec)
@ -998,7 +1010,7 @@
CALL mp_sum( lambda_repl, intra_bgrp_comm )
CALL distribute_lambda( lambda_repl, lambda( :, :, 1 ), idesc( :, 1 ) )
cm(:,:)=c0(:,:)
call calbec (1,nsp,eigr,cm,becm)
call calbec (nbsp, betae,cm,becm)
endif
DEALLOCATE( lambda_repl )
@ -1031,7 +1043,7 @@
!
DEALLOCATE( lambda_dist )
!
call nlsm2_bgrp( ngw, nkb, eigr, c0, becdr, nbspx, nbsp )
call nlsm2_bgrp( ngw, nkb, betae, c0, becdr, nbspx, nbsp )
!
endif
!

View File

@ -6,9 +6,16 @@
! or http://www.gnu.org/copyleft/gpl.txt .
!
#if defined(__CUDA)
#define DEVICEATTR ,DEVICE
#else
#define DEVICEATTR
#endif
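!
! Aside (an illustrative sketch, not part of this file): the DEVICEATTR
! macro above lets one declaration serve both builds, expanding to
! ",DEVICE" under CUDA Fortran and to nothing otherwise. The subroutine
! name "zero_last_band" is hypothetical.
!
SUBROUTINE zero_last_band( c_d, n )
   USE kinds, ONLY: DP
   IMPLICIT NONE
   INTEGER, INTENT(IN) :: n
   COMPLEX(DP) DEVICEATTR :: c_d( :, : )   ! device array iff __CUDA is defined
   c_d( :, n ) = ( 0.d0, 0.d0 )            ! runs on the GPU or the host accordingly
END SUBROUTINE zero_last_band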
!-----------------------------------------------------------------------
SUBROUTINE rhoofr_cp &
( nfi, c_bgrp, irb, eigrb, bec_bgrp, dbec, rhovan, rhor, drhor, rhog, drhog, rhos, enl, denl, ekin, dekin, tstress, ndwwf )
( nfi, c_bgrp, c_d, bec_bgrp, dbec, rhovan, rhor, drhor, rhog, drhog, rhos, &
enl, denl, ekin, dekin, tstress, ndwwf )
!-----------------------------------------------------------------------
!
! this routine computes:
@ -48,7 +55,7 @@
USE uspp, ONLY: nkb
USE uspp_param, ONLY: nh, nhm
USE cell_base, ONLY: omega
USE electrons_base, ONLY: nspin, nbsp_bgrp, ispin_bgrp, f_bgrp
USE electrons_base, ONLY: nspin, nbsp_bgrp, ispin_bgrp, f_bgrp, f_d
USE constants, ONLY: pi, fpi
USE mp, ONLY: mp_sum
USE io_global, ONLY: stdout, ionode
@ -60,7 +67,7 @@
USE fft_interfaces, ONLY: fwfft, invfft
USE fft_base, ONLY: dffts, dfftp
USE cp_interfaces, ONLY: checkrho, ennl, calrhovan, dennl
USE cp_main_variables, ONLY: iprint_stdout, idesc
USE cp_main_variables, ONLY: iprint_stdout, idesc, irb, eigrb
USE wannier_base, ONLY: iwf
USE exx_module, ONLY: rhopr
USE input_parameters, ONLY: tcpbo ! BS
@ -68,6 +75,8 @@
USE io_files, ONLY: restart_dir
USE fft_rho
USE fft_helper_subroutines, ONLY: c2psi_gamma
USE mp, ONLY: mp_barrier
USE mp_world, ONLY: mpime, world_comm
!
IMPLICIT NONE
INTEGER nfi
@ -79,11 +88,10 @@
REAL(DP) rhos(:,:)
REAL(DP) enl, ekin
REAL(DP) denl(3,3), dekin(6)
COMPLEX(DP) eigrb( :, : )
COMPLEX(DP) rhog( :, : )
COMPLEX(DP) drhog( :, :, :, : )
COMPLEX(DP) c_bgrp( :, : )
INTEGER irb( :, : )
COMPLEX(DP) DEVICEATTR :: c_d( :, : )
LOGICAL, OPTIONAL, INTENT(IN) :: tstress
INTEGER, OPTIONAL, INTENT(IN) :: ndwwf
@ -119,7 +127,11 @@
!
! calculation of kinetic energy ekin
!
#if defined (__CUDA)
ekin = enkin( c_d, f_d, nbsp_bgrp )
#else
ekin = enkin( c_bgrp, f_bgrp, nbsp_bgrp )
#endif
!
IF( nbgrp > 1 ) &
CALL mp_sum( ekin, inter_bgrp_comm )
@ -217,6 +229,9 @@
CALL errore( ' rhoofr ', ' c second dimension too small ', SIZE( c_bgrp, 2 ) )
!
c_bgrp( :, nbsp_bgrp + 1 ) = ( 0.d0, 0.d0 )
#if defined (__CUDA)
c_d( :, nbsp_bgrp + 1 ) = ( 0.d0, 0.d0 )
#endif
!
ENDIF
!
@ -236,7 +251,11 @@
!
ELSE
!
#if defined (__CUDA)
CALL loop_over_states_gpu()
#else
CALL loop_over_states()
#endif
!
END IF
!
@ -260,7 +279,7 @@
DEALLOCATE( drhovan )
END IF
!
CALL rhov( irb, eigrb, rhovan, rhog, rhor )
CALL rhov( rhovan, rhog, rhor )
ENDIF COMPUTE_CHARGE
!
@ -359,6 +378,10 @@
#endif
REAL(DP), ALLOCATABLE :: tmp_rhos(:,:)
IF( fftx_ntgrp(dffts) > 1 ) THEN
CALL errore('rhoofr','Task group not supported',1)
END IF
ALLOCATE( psis( dffts%nnr_tg ) )
!
CALL tg_get_group_nr3( dffts, tg_nr3 )
@ -369,17 +392,9 @@
do i = 1, nbsp_bgrp, 2 * fftx_ntgrp(dffts)
#if defined(__MPI)
!
CALL c2psi_gamma_tg(dffts, psis, c_bgrp, i, nbsp_bgrp )
CALL invfft ('tgWave', psis, dffts )
#else
CALL c2psi_gamma( dffts, psis, c_bgrp(:,i), c_bgrp(:,i+1) )
CALL invfft('Wave', psis, dffts )
#endif
!
! Now the first proc of the group holds the first two bands
! of the 2*nogrp bands that we are processing at the same time,
@ -446,6 +461,103 @@
RETURN
END SUBROUTINE loop_over_states
#if defined (__CUDA)
SUBROUTINE loop_over_states_gpu
!
USE parallel_include
USE fft_helper_subroutines
USE control_flags, ONLY : many_fft
USE cudafor
!
! MAIN LOOP OVER THE EIGENSTATES
! - This loop is also parallelized within the task-groups framework
! - Each group works on a number of eigenstates in parallel
!
IMPLICIT NONE
!
INTEGER :: from, i, ig, eig_index, eig_offset, ii, tg_nr3, ioff
!
REAL(DP), DEVICE, ALLOCATABLE :: rhos_d(:,:)
COMPLEX(DP), DEVICE, ALLOCATABLE :: psis(:)
COMPLEX(DP), DEVICE, ALLOCATABLE :: ptmp(:,:)
INTEGER, DEVICE, POINTER :: nl_d(:), nlm_d(:)
ALLOCATE( psis( dffts%nnr * many_fft ) ) ! dffts%nnr * many_fft
ALLOCATE( rhos_d ( SIZE(rhos,1), SIZE(rhos,2) ) )
!
rhos_d = 0.0_DP
nl_d => dffts%nl_d
nlm_d => dffts%nlm_d
do i = 1, nbsp_bgrp, 2 * many_fft
psis = 0.0d0
ioff = 0
DO ii = i, i + 2 * many_fft - 1, 2
IF( ii < nbsp_bgrp ) THEN
!$cuf kernel do(1)
do ig = 1, dffts%ngw
psis( nlm_d( ig ) + ioff) = CONJG( c_d( ig, ii ) ) + ci * conjg( c_d( ig, ii+1 ))
psis( nl_d( ig ) + ioff) = c_d( ig, ii ) + ci * c_d( ig, ii+1 )
end do
ELSE IF( ii == nbsp_bgrp ) THEN
!$cuf kernel do(1)
do ig = 1, dffts%ngw
psis( nlm_d( ig ) + ioff) = CONJG( c_d( ig, ii ) )
psis( nl_d( ig ) + ioff) = c_d( ig, ii )
end do
END IF
! CALL c2psi_gamma( dffts, psis, c_bgrp(:,ii), c_bgrp(:,ii+1) )
ioff = ioff + dffts%nnr
END DO
CALL invfft('Wave', psis, dffts, many_fft )
ioff = 0
DO ii = i, i + 2 * many_fft - 1, 2
IF( ii < nbsp_bgrp ) THEN
iss1=ispin_bgrp( ii )
sa1 =f_bgrp( ii )/omega
iss2=ispin_bgrp( ii + 1 )
sa2 =f_bgrp( ii + 1 )/omega
!$cuf kernel do(1)
do ir = 1, dffts%nnr
rhos_d(ir,iss1) = rhos_d(ir,iss1) + sa1*( real(psis(ir + ioff)))**2
rhos_d(ir,iss2) = rhos_d(ir,iss2) + sa2*(aimag(psis(ir + ioff)))**2
end do
ELSE IF( ii == nbsp_bgrp ) THEN
iss1=ispin_bgrp( ii )
sa1 =f_bgrp( ii )/omega
iss2=iss1
sa2=0.0d0
!$cuf kernel do(1)
do ir = 1, dffts%nnr
rhos_d(ir,iss1) = rhos_d(ir,iss1) + sa1*( real(psis(ir + ioff)))**2
END DO
END IF
ioff = ioff + dffts%nnr
END DO
!
END DO
rhos = rhos_d
IF( nbgrp > 1 ) THEN
CALL mp_sum( rhos, inter_bgrp_comm )
END IF
DEALLOCATE( rhos_d )
DEALLOCATE( psis )
IF( ALLOCATED( ptmp ) ) DEALLOCATE( ptmp )   ! ptmp is never allocated in this routine
RETURN
END SUBROUTINE loop_over_states_gpu
#endif
!-----------------------------------------------------------------------
END SUBROUTINE rhoofr_cp
!-----------------------------------------------------------------------
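!
! Aside (a stand-alone sketch with a hypothetical program name): the
! Gamma-point packing used in loop_over_states_gpu above. Two real-space
! bands travel in one complex array, psi = psi_1 + i*psi_2, so a single
! complex FFT serves two bands; band 1 is recovered from REAL(psi) and
! band 2 from AIMAG(psi), exactly as accumulated into rhos_d.
!
PROGRAM gamma_pack_demo
   IMPLICIT NONE
   INTEGER, PARAMETER :: dp = KIND(1.0d0), nr = 8
   REAL(dp)    :: psi1(nr), psi2(nr), rho(nr)
   COMPLEX(dp) :: packed(nr)
   INTEGER :: ir
   CALL RANDOM_NUMBER(psi1)
   CALL RANDOM_NUMBER(psi2)
   packed = CMPLX(psi1, psi2, KIND=dp)   ! two real bands in one complex array
   DO ir = 1, nr                         ! per-band |psi|^2, as for rhos_d
      rho(ir) = REAL(packed(ir))**2 + AIMAG(packed(ir))**2
   END DO
   PRINT *, MAXVAL(ABS(rho - (psi1**2 + psi2**2)))   ! ~0: the packing is lossless
END PROGRAM gamma_pack_demo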
@ -525,50 +637,32 @@ SUBROUTINE drhov(irb,eigrb,rhovan,drhovan,rhog,rhor,drhog,drhor)
REAL(DP), INTENT(OUT) :: drhor(dfftp%nnr,nspin,3,3)
COMPLEX(DP), INTENT(OUT) :: drhog(dfftp%ngm,nspin,3,3)
! local
INTEGER i, j, isup, isdw, iv, jv, ig, ijv, is, iss, ia, ir, ijs
INTEGER i, j, isup, isdw, iv, jv, ig, ijv, is, iss, ia, ir, ijs, itid
REAL(DP) :: asumt, dsumt
COMPLEX(DP) fp, fm, ci
#if defined(__INTEL_COMPILER)
#if __INTEL_COMPILER >= 1300
!dir$ attributes align: 4096 :: v, dqgbt,qv
#endif
#endif
COMPLEX(DP), ALLOCATABLE :: v(:)
COMPLEX(DP), ALLOCATABLE:: dqgbt(:,:)
COMPLEX(DP), ALLOCATABLE :: qv(:)
COMPLEX(DP), ALLOCATABLE :: fg1(:), fg2(:)
!
INTEGER :: itid, mytid, ntids
#if defined(_OPENMP)
INTEGER :: omp_get_thread_num, omp_get_num_threads
EXTERNAL :: omp_get_thread_num, omp_get_num_threads
#endif
!
!$omp parallel default(none), private(i,j,iss,ir,ig,mytid,ntids,itid), shared(nspin,dfftp,drhor,drhog,rhor,rhog,ainv)
#if defined(_OPENMP)
mytid = omp_get_thread_num() ! take the thread ID
ntids = omp_get_num_threads() ! take the number of threads
#else
mytid = 0
ntids = 1
#endif
itid = 0
!$omp parallel do collapse(3) default(none), private(i,j,iss,ir,ig), shared(nspin,dfftp,drhor,drhog,rhor,rhog,ainv)
DO j=1,3
DO i=1,3
DO iss=1,nspin
IF( MOD( itid, ntids ) == mytid ) THEN
DO ir=1,dfftp%nnr
drhor(ir,iss,i,j)=-rhor(ir,iss)*ainv(j,i)
END DO
DO ig=1,dfftp%ngm
drhog(ig,iss,i,j)=-rhog(ig,iss)*ainv(j,i)
END DO
END IF
itid = itid + 1
DO ir=1,dfftp%nnr
drhor(ir,iss,i,j)=-rhor(ir,iss)*ainv(j,i)
END DO
DO ig=1,dfftp%ngm
drhog(ig,iss,i,j)=-rhog(ig,iss)*ainv(j,i)
END DO
END DO
END DO
END DO
!$omp end parallel
!$omp end parallel do
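!
! Aside (a stand-alone sketch with a hypothetical program name): the
! collapse(3) clause introduced above fuses the three nested loops into a
! single iteration space that OpenMP distributes over threads, replacing
! the manual mytid/itid round-robin of the previous version.
!
PROGRAM collapse_demo
   USE omp_lib
   IMPLICIT NONE
   INTEGER :: i, j, iss, cnt
   cnt = 0
   !$omp parallel do collapse(3) reduction(+:cnt)
   DO j = 1, 3
      DO i = 1, 3
         DO iss = 1, 2
            cnt = cnt + 1   ! each of the 18 fused iterations runs exactly once
         END DO
      END DO
   END DO
   !$omp end parallel do
   PRINT *, 'iterations =', cnt   ! 18, whatever the thread count
END PROGRAM collapse_demo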
IF ( nkbus <= 0 ) THEN
GO TO 1000
@ -589,8 +683,8 @@ SUBROUTINE drhov(irb,eigrb,rhovan,drhovan,rhog,rhor,drhog,drhor)
!$omp parallel default(none) &
!$omp shared(nat, ityp, ngb, nh, eigrb, dfftb, irb, v, &
!$omp ci, i, j, dqgb, qgb, nhm, rhovan, drhovan, upf ) &
!$omp private(mytid, ntids, is, ia, iv, jv, ijv, ig, iss, &
!$omp i, j, dqgb, qgb, nhm, rhovan, drhovan, upf ) &
!$omp private( is, ia, iv, jv, ijv, ig, iss, &
!$omp qv, fg1, fg2, itid, dqgbt, dsumt, asumt )
ALLOCATE( qv( dfftb%nnr ) )
@ -598,12 +692,7 @@ SUBROUTINE drhov(irb,eigrb,rhovan,drhovan,rhog,rhor,drhog,drhor)
ALLOCATE( fg1( ngb ) )
ALLOCATE( fg2( ngb ) )
#if defined(_OPENMP)
mytid = omp_get_thread_num() ! take the thread ID
ntids = omp_get_num_threads() ! take the number of threads
itid = 0
#endif
iss=1
DO ia=1,nat
@ -617,11 +706,11 @@ SUBROUTINE drhov(irb,eigrb,rhovan,drhovan,rhog,rhor,drhog,drhor)
#endif
#if defined(_OPENMP)
IF ( mytid /= itid ) THEN
itid = MOD( itid + 1, ntids )
IF ( omp_get_thread_num() /= itid ) THEN
itid = MOD( itid + 1, omp_get_num_threads() )
CYCLE
ELSE
itid = MOD( itid + 1, ntids )
itid = MOD( itid + 1, omp_get_num_threads() )
END IF
#endif
@ -761,7 +850,7 @@ END SUBROUTINE drhov
!
!-----------------------------------------------------------------------
SUBROUTINE rhov(irb,eigrb,rhovan,rhog,rhor)
SUBROUTINE rhov(rhovan,rhog,rhor)
!-----------------------------------------------------------------------
! Add Vanderbilt contribution to rho(r) and rho(g)
!
@ -772,7 +861,7 @@ SUBROUTINE rhov(irb,eigrb,rhovan,rhog,rhor)
USE kinds, ONLY: dp
USE ions_base, ONLY: nat, na, nsp, ityp
USE io_global, ONLY: stdout
USE mp_global, ONLY: intra_bgrp_comm
USE mp_global, ONLY: intra_bgrp_comm, inter_bgrp_comm
USE mp, ONLY: mp_sum
USE uspp_param, ONLY: nh, nhm, upf
USE uspp, ONLY: deeq, nkbus
@ -786,19 +875,21 @@ SUBROUTINE rhov(irb,eigrb,rhovan,rhog,rhor)
USE fft_interfaces, ONLY: fwfft, invfft
USE fft_base, ONLY: dfftb, dfftp, dfftb
USE fft_helper_subroutines, ONLY: fftx_add_threed2oned_gamma
USE cp_main_variables, ONLY: irb, eigrb, iabox, nabox
!
IMPLICIT NONE
!
REAL(DP), INTENT(IN) :: rhovan(nhm*(nhm+1)/2,nat,nspin)
INTEGER, INTENT(in) :: irb(3,nat)
COMPLEX(DP), INTENT(in):: eigrb(ngb,nat)
!
REAL(DP), INTENT(inout):: rhor(dfftp%nnr,nspin)
COMPLEX(DP), INTENT(inout):: rhog(dfftp%ngm,nspin)
!
INTEGER :: isup, isdw, nfft, ifft, iv, jv, ig, ijv, is, iss, isa, ia, ir, i, j
INTEGER, PARAMETER :: isup = 1
INTEGER, PARAMETER :: isdw = 2
INTEGER :: iv, jv, ig, ijv, is, iss, ia, ir, i, j, iia
REAL(DP) :: sumrho
COMPLEX(DP) :: ci, fp, fm, ca
COMPLEX(DP) :: fp, fm, ca
COMPLEX(DP), PARAMETER :: ci=(0.d0,1.d0)
#if defined(__INTEL_COMPILER)
#if __INTEL_COMPILER >= 1300
!dir$ attributes align: 4096 :: qgbt, v, qv
@ -809,8 +900,8 @@ SUBROUTINE rhov(irb,eigrb,rhovan,rhog,rhor)
COMPLEX(DP), ALLOCATABLE :: qv(:)
COMPLEX(DP), ALLOCATABLE :: fg1(:), fg2(:)
INTEGER :: mytid, ntids
#if defined(_OPENMP)
INTEGER :: itid, mytid, ntids
INTEGER :: omp_get_thread_num, omp_get_num_threads
EXTERNAL :: omp_get_thread_num, omp_get_num_threads
#endif
@ -822,7 +913,6 @@ SUBROUTINE rhov(irb,eigrb,rhovan,rhog,rhor)
END IF
CALL start_clock( 'rhov' )
ci=(0.d0,1.d0)
!
!
ALLOCATE( v( dfftp%nnr ) )
@ -830,13 +920,8 @@ SUBROUTINE rhov(irb,eigrb,rhovan,rhog,rhor)
! private variables need to be initialized, otherwise
! they have an undetermined value outside the parallel region
!
#if defined(_OPENMP)
mytid = 0
ntids = 1
itid = 0
#endif
iss = 1
isa = 1
!
IF(nspin.EQ.1) THEN
!
@ -845,12 +930,8 @@ SUBROUTINE rhov(irb,eigrb,rhovan,rhog,rhor)
!$omp parallel default(none) &
!$omp shared(na, ngb, nh, rhovan, qgb, eigrb, dfftb, iverbosity, omegab, irb, v, &
!$omp stdout, ci, rhor, dfftp, upf, nsp, ityp, nat ) &
!$omp private(mytid, ntids, is, ia, nfft, ifft, iv, jv, ijv, sumrho, qgbt, ig, iss, isa, ca, &
!$omp qv, fg1, fg2, itid, ir )
iss=1
isa=1
!$omp stdout, rhor, dfftp, upf, nsp, ityp, nat, nspin, iabox, nabox, inter_bgrp_comm ) &
!$omp private(mytid, ntids, is, ia, iia, iv, jv, ijv, sumrho, qgbt, ig, ca, qv, ir )
!$omp workshare
v (:) = (0.d0, 0.d0)
@ -859,149 +940,96 @@ SUBROUTINE rhov(irb,eigrb,rhovan,rhog,rhor)
#if defined(_OPENMP)
mytid = omp_get_thread_num() ! take the thread ID
ntids = omp_get_num_threads() ! take the number of threads
itid = 0
#endif
ALLOCATE( qgbt( ngb, 2 ) )
ALLOCATE( qgbt( ngb, nspin ) )
ALLOCATE( qv( dfftb%nnr ) )
ALLOCATE( fg1( ngb ) )
ALLOCATE( fg2( ngb ) )
DO ia = 1, nat
is = ityp(ia)
IF( upf(is)%tvanp ) THEN
nfft = 1
#if defined(__MPI)
IF ( ( dfftb%np3( ia ) <= 0 ) .OR. ( dfftb%np2( ia ) <= 0 ) ) THEN
CYCLE
END IF
#endif
#if defined(_OPENMP)
IF ( mytid /= itid ) THEN
itid = MOD( itid + 1, ntids )
CYCLE
ELSE
itid = MOD( itid + 1, ntids )
END IF
#endif
DO ifft=1,nfft
qgbt(:,ifft) = (0.d0, 0.d0)
DO iv= 1,nh(is)
DO jv=iv,nh(is)
ijv = (jv-1)*jv/2 + iv
sumrho=rhovan(ijv,ia+ifft-1,iss)
IF(iv.NE.jv) sumrho=2.d0*sumrho
DO ig=1,ngb
qgbt(ig,ifft)=qgbt(ig,ifft) + sumrho*qgb(ig,ijv,is)
END DO
DO iia = 1, nabox
IF( MOD( iia - 1, ntids ) == mytid ) THEN
ia = iabox(iia)
is = ityp(ia)
qgbt(:,1) = (0.d0, 0.d0)
DO iv= 1,nh(is)
DO jv=iv,nh(is)
ijv = (jv-1)*jv/2 + iv
sumrho=rhovan(ijv,ia,1)
IF(iv.NE.jv) sumrho=2.d0*sumrho
DO ig=1,ngb
qgbt(ig,1)=qgbt(ig,1) + sumrho*qgb(ig,ijv,is)
END DO
END DO
END DO
!
! add structure factor
!
IF(nfft.EQ.2)THEN
fg1 = eigrb(1:ngb,ia )*qgbt(1:ngb,1)
fg2 = eigrb(1:ngb,ia+1 )*qgbt(1:ngb,2)
CALL fft_oned2box( qv, fg1, fg2 )
ELSE
fg1 = eigrb(1:ngb,ia )*qgbt(1:ngb,1)
CALL fft_oned2box( qv, fg1 )
ENDIF
qgbt(1:ngb,1) = eigrb(1:ngb,ia)*qgbt(1:ngb,1)
CALL fft_oned2box( qv, qgbt(:,1) )
CALL invfft( qv, dfftb, ia )
!
! qv = US augmentation charge in real space on box grid
! for atomic species is, real(qv)=atom ia, imag(qv)=atom ia+1
IF( iverbosity > 1 ) THEN
ca = SUM(qv)
WRITE( stdout,'(a,f12.8)') ' rhov: 1-atom g-sp = ', &
& omegab*DBLE(qgbt(1,1))
WRITE( stdout,'(a,f12.8)') ' rhov: 1-atom r-sp = ', &
& omegab*DBLE(ca)/(dfftb%nr1*dfftb%nr2*dfftb%nr3)
WRITE( stdout,'(a,f12.8)') ' rhov: 1-atom g-sp = ', &
& omegab*DBLE(qgbt(1,2))
WRITE( stdout,'(a,f12.8)') ' rhov: 1-atom r-sp = ', &
& omegab*AIMAG(ca)/(dfftb%nr1*dfftb%nr2*dfftb%nr3)
ENDIF
!
! add qv(r) to v(r), in real space on the dense grid
!
CALL box2grid(irb(:,ia),1,qv,v)
IF (nfft.EQ.2) CALL box2grid(irb(:,ia+1),2,qv,v)
!
END IF
END DO
DEALLOCATE( fg1 )
DEALLOCATE( fg2 )
DEALLOCATE(qv)
DEALLOCATE(qgbt)
!
! rhor(r) = total (smooth + US) charge density in real space
!
!$omp barrier
!$omp master
!
CALL mp_sum( v, inter_bgrp_comm )
!
!$omp end master
!$omp barrier
!$omp do
DO ir=1,dfftp%nnr
rhor(ir,1)=rhor(ir,1)+DBLE(v(ir))
END DO
!$omp end do
!$omp end parallel
iss = 1
DO ir=1,dfftp%nnr
rhor(ir,iss)=rhor(ir,iss)+DBLE(v(ir))
END DO
!
IF( iverbosity > 1 ) THEN
ca = SUM(v)
CALL mp_sum( ca, intra_bgrp_comm )
WRITE( stdout,'(a,2f12.8)') &
& ' rhov: int n_v(r) dr = ',omega*ca/(dfftp%nr1*dfftp%nr2*dfftp%nr3)
ENDIF
!
CALL fwfft('Rho',v, dfftp )
!
IF( iverbosity > 1 ) THEN
WRITE( stdout,*) ' rhov: smooth ',omega*rhog(1,iss)
WRITE( stdout,*) ' rhov: vander ',omega*v(1)
WRITE( stdout,*) ' rhov: all ',omega*(rhog(1,iss)+v(1))
ENDIF
!
! rhog(g) = total (smooth + US) charge density in G-space
!
CALL fftx_add_threed2oned_gamma( dfftp, v, rhog(:,iss) )
CALL fftx_add_threed2oned_gamma( dfftp, v, rhog(:,1) )
IF( iverbosity > 1 ) WRITE( stdout,'(a,2f12.8)') &
& ' rhov: n_v(g=0) = ',omega*DBLE(rhog(1,iss))
!
ELSE
!
! nspin=2: two fft at a time, one for spin up and one for spin down
!
isup=1
isdw=2
!$omp parallel default(none) &
!$omp shared(na, ngb, nh, rhovan, qgb, eigrb, dfftb, iverbosity, omegab, irb, v, &
!$omp stdout, rhor, dfftp, upf, nsp, ityp, nat, nspin, iabox, nabox, inter_bgrp_comm ) &
!$omp private(mytid, ntids, is, ia, iia, iv, jv, ijv, sumrho, qgbt, ig, ca, qv, fg1, fg2, ir )
!$omp workshare
v (:) = (0.d0, 0.d0)
!$omp end workshare
#if defined(_OPENMP)
mytid = omp_get_thread_num() ! take the thread ID
ntids = omp_get_num_threads() ! take the number of threads
#endif
ALLOCATE( qgbt( ngb, 2 ) )
ALLOCATE( qv( dfftb%nnr ) )
ALLOCATE( fg1( ngb ) )
ALLOCATE( fg2( ngb ) )
isa=1
DO ia=1,nat
is = ityp(ia)
#if defined(__MPI)
IF ( ( dfftb%np3( ia ) <= 0 ) .OR. ( dfftb%np2( ia ) <= 0 ) ) CYCLE
#endif
IF( upf(is)%tvanp ) THEN
DO iia = 1, nabox
IF( MOD( iia - 1, ntids ) == mytid ) THEN
ia = iabox(iia)
is = ityp(ia)
DO iss=1,2
qgbt(:,iss) = (0.d0, 0.d0)
DO iv=1,nh(is)
@ -1015,42 +1043,40 @@ SUBROUTINE rhov(irb,eigrb,rhovan,rhog,rhor)
END DO
END DO
END DO
!
! add structure factor
!
!
! add structure factor
!
fg1 = eigrb(1:ngb,ia)*qgbt(1:ngb,1)
fg2 = eigrb(1:ngb,ia)*qgbt(1:ngb,2)
CALL fft_oned2box( qv, fg1, fg2 )
!
!
CALL invfft( qv,dfftb,ia)
!
! qv is now the US augmentation charge for atomic species is
! and atom ia: real(qv)=spin up, imag(qv)=spin down
!
IF( iverbosity > 1 ) THEN
ca = SUM(qv)
WRITE( stdout,'(a,f12.8)') ' rhov: up g-space = ', &
& omegab*DBLE(qgbt(1,1))
WRITE( stdout,'(a,f12.8)') ' rhov: up r-sp = ', &
& omegab*DBLE(ca)/(dfftb%nr1*dfftb%nr2*dfftb%nr3)
WRITE( stdout,'(a,f12.8)') ' rhov: dw g-space = ', &
& omegab*DBLE(qgbt(1,2))
WRITE( stdout,'(a,f12.8)') ' rhov: dw r-sp = ', &
& omegab*AIMAG(ca)/(dfftb%nr1*dfftb%nr2*dfftb%nr3)
ENDIF
!
! add qv(r) to v(r), in real space on the dense grid
!
!
! qv is now the US augmentation charge for atomic species is
! and atom ia: real(qv)=spin up, imag(qv)=spin down
!
! add qv(r) to v(r), in real space on the dense grid
!
CALL box2grid(irb(:,ia),qv,v)
END IF
END IF
END DO
!
DEALLOCATE(qgbt)
DEALLOCATE( qv )
DEALLOCATE( fg1 )
DEALLOCATE( fg2 )
!$omp barrier
!$omp master
CALL mp_sum( v, inter_bgrp_comm )
!$omp end master
!$omp barrier
!$omp do
DO ir=1,dfftp%nnr
rhor(ir,isup)=rhor(ir,isup)+DBLE(v(ir))
rhor(ir,isdw)=rhor(ir,isdw)+AIMAG(v(ir))
END DO
!$omp end do
!$omp end parallel
!
IF( iverbosity > 1 ) THEN
ca = SUM(v)
@ -1078,10 +1104,6 @@ SUBROUTINE rhov(irb,eigrb,rhovan,rhog,rhor)
& ' rhov: n_v(g=0) up = ',omega*DBLE (rhog(1,isup)), &
& ' rhov: n_v(g=0) down = ',omega*DBLE(rhog(1,isdw))
END IF
DEALLOCATE(qgbt)
DEALLOCATE( qv )
DEALLOCATE( fg1 )
DEALLOCATE( fg2 )
!
ENDIF
@ -1092,4 +1114,62 @@ SUBROUTINE rhov(irb,eigrb,rhovan,rhog,rhor)
1000 CONTINUE
!
RETURN
CONTAINS
SUBROUTINE print_rhov()
IF( nspin == 1 ) THEN
IF( iverbosity > 1 ) THEN
ca = SUM(qv)
WRITE( stdout,'(a,f12.8)') ' rhov: 1-atom g-sp = ', &
& omegab*DBLE(qgbt(1,1))
WRITE( stdout,'(a,f12.8)') ' rhov: 1-atom r-sp = ', &
& omegab*DBLE(ca)/(dfftb%nr1*dfftb%nr2*dfftb%nr3)
ENDIF
ELSE
IF( iverbosity > 1 ) THEN
ca = SUM(qv)
WRITE( stdout,'(a,f12.8)') ' rhov: up g-space = ', &
& omegab*DBLE(qgbt(1,1))
WRITE( stdout,'(a,f12.8)') ' rhov: up r-sp = ', &
& omegab*DBLE(ca)/(dfftb%nr1*dfftb%nr2*dfftb%nr3)
WRITE( stdout,'(a,f12.8)') ' rhov: dw g-space = ', &
& omegab*DBLE(qgbt(1,2))
WRITE( stdout,'(a,f12.8)') ' rhov: dw r-sp = ', &
& omegab*AIMAG(ca)/(dfftb%nr1*dfftb%nr2*dfftb%nr3)
ENDIF
ENDIF
END SUBROUTINE
END SUBROUTINE rhov
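!
! Aside (a stand-alone sketch with a hypothetical program name): the
! thread round-robin used in rhov above. Each OpenMP thread takes every
! ntids-th entry of the precomputed atom list iabox(1:nabox), so the
! expensive per-atom box FFTs are statically interleaved across threads.
!
PROGRAM roundrobin_demo
   USE omp_lib
   IMPLICIT NONE
   INTEGER, PARAMETER :: nabox = 10
   INTEGER :: iabox(nabox), iia, mytid, ntids
   iabox = [( iia, iia = 1, nabox )]   ! stand-in for the list built by initbox
   !$omp parallel private(iia, mytid, ntids)
   mytid = omp_get_thread_num()
   ntids = omp_get_num_threads()
   DO iia = 1, nabox
      IF ( MOD( iia - 1, ntids ) == mytid ) THEN
         PRINT *, 'thread', mytid, 'handles atom', iabox(iia)
      END IF
   END DO
   !$omp end parallel
END PROGRAM roundrobin_demo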
SUBROUTINE rhoofr_host &
( nfi, c_bgrp, irb, eigrb, bec_bgrp, dbec, rhovan, rhor, drhor, rhog, drhog, rhos, &
enl, denl, ekin, dekin, tstress, ndwwf )
USE kinds, ONLY: DP
#if defined (__CUDA)
USE cudafor
#endif
USE cp_interfaces
IMPLICIT NONE
INTEGER nfi
COMPLEX(DP) c_bgrp( :, : )
INTEGER irb( :, : )
COMPLEX(DP) eigrb( :, : )
REAL(DP) bec_bgrp(:,:)
REAL(DP) dbec(:,:,:,:)
REAL(DP) rhovan(:, :, : )
REAL(DP) rhor(:,:)
REAL(DP) drhor(:,:,:,:)
COMPLEX(DP) rhog( :, : )
COMPLEX(DP) drhog( :, :, :, : )
REAL(DP) rhos(:,:)
REAL(DP) enl, ekin
REAL(DP) denl(3,3), dekin(6)
LOGICAL, OPTIONAL, INTENT(IN) :: tstress
INTEGER, OPTIONAL, INTENT(IN) :: ndwwf
COMPLEX(DP), ALLOCATABLE DEVICEATTR :: c(:,:)
ALLOCATE( c, SOURCE=c_bgrp )
CALL rhoofr(nfi, c_bgrp, c, bec_bgrp, dbec, rhovan, rhor, &
drhor, rhog, drhog, rhos, enl, denl, ekin, dekin, tstress, ndwwf )
DEALLOCATE( c )
END SUBROUTINE rhoofr_host
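!
! Aside: rhoofr_host leans on sourced allocation, ALLOCATE( c, SOURCE=c_bgrp ),
! which under CUDA Fortran sizes the device array and performs the
! host-to-device copy in one statement; in a host-only build the same line
! is a plain host copy. A minimal sketch (the name "mirror_demo" is
! hypothetical):
!
SUBROUTINE mirror_demo( a_host )
   USE kinds, ONLY: DP
   IMPLICIT NONE
   REAL(DP) :: a_host( :, : )
   REAL(DP), ALLOCATABLE DEVICEATTR :: a_d( :, : )
   ALLOCATE( a_d, SOURCE = a_host )   ! allocate + copy (host-to-device iff __CUDA)
   ! ... hand a_d to device code here ...
   DEALLOCATE( a_d )
END SUBROUTINE mirror_demo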

View File

@ -7,6 +7,12 @@
!
! written by Carlo Cavazzoni
#if defined(__CUDA)
#define DEVICEATTR ,DEVICE
#else
#define DEVICEATTR
#endif
!=----------------------------------------------------------------------------=!
MODULE cp_interfaces
!=----------------------------------------------------------------------------=!
@ -44,7 +50,6 @@
PUBLIC :: packgam
PUBLIC :: ortho
PUBLIC :: ortho_gamma
PUBLIC :: nlfh
PUBLIC :: nlfl_bgrp
@ -96,7 +101,6 @@
PUBLIC :: dotcsc
PUBLIC :: nlsm1
PUBLIC :: nlsm2_bgrp
PUBLIC :: calbec_bgrp
PUBLIC :: ennl
PUBLIC :: calrhovan
PUBLIC :: calbec
@ -104,8 +108,8 @@
PUBLIC :: dennl
PUBLIC :: nlfq_bgrp
PUBLIC :: collect_bec
PUBLIC :: beta_eigr
PUBLIC :: nlsm1us
PUBLIC :: dbeta_eigr
! ------------------------------------ !
@ -126,6 +130,22 @@
INTEGER, INTENT(IN) :: n, nspin
REAL(DP), OPTIONAL :: v1( ldv, * )
END SUBROUTINE dforce_x
#if defined (__CUDA)
SUBROUTINE dforce_gpu_x( i, bec, vkb, c, df, da, v, ldv, ispin, f, n, nspin )
USE kinds, ONLY: DP
IMPLICIT NONE
INTEGER, INTENT(IN) :: i
REAL(DP) :: bec(:,:)
COMPLEX(DP), DEVICE :: vkb(:,:)
COMPLEX(DP), DEVICE :: c(:,:)
COMPLEX(DP) :: df(:), da(:)
INTEGER, INTENT(IN) :: ldv
REAL(DP), DEVICE :: v( :, : )
INTEGER :: ispin( : )
REAL(DP) :: f( : )
INTEGER, INTENT(IN) :: n, nspin
END SUBROUTINE dforce_gpu_x
#endif
END INTERFACE
@ -225,6 +245,26 @@
INTERFACE rhoofr
SUBROUTINE rhoofr_cp &
( nfi, c_bgrp, c_d, bec, dbec, rhovan, rhor, drhor, rhog, drhog, rhos, enl, denl, ekin, dekin, tstress, ndwwf )
USE kinds, ONLY: DP
IMPLICIT NONE
INTEGER nfi
COMPLEX(DP) :: c_bgrp( :, : )
COMPLEX(DP) DEVICEATTR :: c_d( :, : )
REAL(DP) bec(:,:)
REAL(DP) dbec(:,:,:,:)
REAL(DP) rhovan(:, :, : )
REAL(DP) rhor(:,:)
REAL(DP) drhor(:,:,:,:)
COMPLEX(DP) rhog( :, : )
COMPLEX(DP) drhog( :, :, :, : )
REAL(DP) rhos(:,:)
REAL(DP) enl, ekin
REAL(DP) denl(3,3), dekin(6)
LOGICAL, OPTIONAL, INTENT(IN) :: tstress
INTEGER, OPTIONAL, INTENT(IN) :: ndwwf
END SUBROUTINE rhoofr_cp
SUBROUTINE rhoofr_host &
( nfi, c_bgrp, irb, eigrb, bec, dbec, rhovan, rhor, drhor, rhog, drhog, rhos, enl, denl, ekin, dekin, tstress, ndwwf )
USE kinds, ONLY: DP
IMPLICIT NONE
@ -244,7 +284,7 @@
REAL(DP) denl(3,3), dekin(6)
LOGICAL, OPTIONAL, INTENT(IN) :: tstress
INTEGER, OPTIONAL, INTENT(IN) :: ndwwf
END SUBROUTINE rhoofr_cp
END SUBROUTINE rhoofr_host
END INTERFACE
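!
! Aside (a minimal sketch with hypothetical names): rhoofr is a generic
! name whose two specifics differ in their argument lists, so the compiler
! selects the device-aware entry (rhoofr_cp, with c_d) or the host wrapper
! (rhoofr_host) from the call site alone. The same pattern in isolation:
!
MODULE overload_demo
   USE kinds, ONLY: DP
   IMPLICIT NONE
   INTERFACE rho_demo
      MODULE PROCEDURE rho_demo_host, rho_demo_dev
   END INTERFACE
CONTAINS
   SUBROUTINE rho_demo_host( c )
      COMPLEX(DP) :: c( :, : )                ! resolved by rho_demo(c)
   END SUBROUTINE rho_demo_host
   SUBROUTINE rho_demo_dev( c, c_d )
      COMPLEX(DP) :: c( :, : )
      COMPLEX(DP) DEVICEATTR :: c_d( :, : )   ! resolved by rho_demo(c, c_d)
   END SUBROUTINE rho_demo_dev
END MODULE overload_demo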
INTERFACE checkrho
@ -309,7 +349,7 @@
INTERFACE runcp_uspp
SUBROUTINE runcp_uspp_x &
( nfi, fccc, ccc, ema0bg, dt2bye, rhos, bec_bgrp, c0_bgrp, cm_bgrp, fromscra, restart )
( nfi, fccc, ccc, ema0bg, dt2bye, rhos, bec_bgrp, c0_bgrp, c0_d, cm_bgrp, cm_d, fromscra, restart )
USE kinds, ONLY: DP
IMPLICIT NONE
integer, intent(in) :: nfi
@ -318,6 +358,7 @@
real(DP) :: rhos(:,:)
real(DP) :: bec_bgrp(:,:)
complex(DP) :: c0_bgrp(:,:), cm_bgrp(:,:)
complex(DP) DEVICEATTR :: c0_d(:,:), cm_d(:,:)
logical, optional, intent(in) :: fromscra
logical, optional, intent(in) :: restart
END SUBROUTINE
@ -377,36 +418,19 @@
INTERFACE ortho
SUBROUTINE ortho_x &
( eigr, cp_bgrp, phi_bgrp, x0, idesc, diff, iter, ccc, bephi, becp_bgrp )
( betae, cp_bgrp, phi_bgrp, x0, idesc, diff, iter, ccc, bephi, becp_bgrp )
USE kinds, ONLY: DP
IMPLICIT NONE
INTEGER, INTENT(IN) :: idesc( :, : )
COMPLEX(DP) :: eigr( :, : )
COMPLEX(DP) :: betae( :, : )
COMPLEX(DP) :: cp_bgrp( :, : ), phi_bgrp( :, : )
REAL(DP) :: x0( :, :, : ), diff, ccc
INTEGER :: iter
REAL(DP) :: bephi(:,:)
REAL(DP) :: becp_bgrp(:,:)
END SUBROUTINE
END INTERFACE
INTERFACE ortho_gamma
SUBROUTINE ortho_gamma_x &
( iopt, cp, ngwx, phi, becp_dist, qbecp, nkbx, bephi, qbephi, &
x0, nx0, idesc, diff, iter, n, nss, istart )
USE kinds, ONLY: DP
IMPLICIT NONE
INTEGER, INTENT(IN) :: iopt
INTEGER, INTENT(IN) :: ngwx, nkbx, nx0
INTEGER, INTENT(IN) :: n, nss, istart
COMPLEX(DP) :: phi( ngwx, n ), cp( ngwx, n )
REAL(DP) :: bephi( :, : )
REAL(DP) :: becp_dist(:,:)
REAL(DP) :: qbephi( :, : ), qbecp( :, : )
REAL(DP) :: x0( nx0, nx0 )
INTEGER, INTENT(IN) :: idesc( : )
INTEGER, INTENT(OUT) :: iter
REAL(DP), INTENT(OUT) :: diff
#if defined (__CUDA)
ATTRIBUTES( DEVICE ) :: becp_bgrp, bephi, cp_bgrp, phi_bgrp, betae
#endif
END SUBROUTINE
END INTERFACE
@ -748,15 +772,14 @@
INTERFACE move_electrons
SUBROUTINE move_electrons_x( &
nfi, tfirst, tlast, b1, b2, b3, fion, c0_bgrp, cm_bgrp, phi_bgrp, enthal, enb, &
nfi, tprint, tfirst, tlast, b1, b2, b3, fion, enthal, enb, &
& enbi, fccc, ccc, dt2bye, stress,l_cprestart )
USE kinds, ONLY: DP
IMPLICIT NONE
INTEGER, INTENT(IN) :: nfi
LOGICAL, INTENT(IN) :: tfirst, tlast
LOGICAL, INTENT(IN) :: tprint, tfirst, tlast
REAL(DP), INTENT(IN) :: b1(3), b2(3), b3(3)
REAL(DP) :: fion(:,:)
COMPLEX(DP) :: c0_bgrp(:,:), cm_bgrp(:,:), phi_bgrp(:,:)
REAL(DP), INTENT(IN) :: dt2bye
REAL(DP) :: fccc, ccc
REAL(DP) :: enb, enbi
@ -820,6 +843,13 @@
IMPLICIT NONE
COMPLEX(DP) :: c_bgrp(:,:)
END SUBROUTINE c_bgrp_expand_x
#if defined (__CUDA)
SUBROUTINE c_bgrp_expand_gpu_x( c_bgrp )
USE kinds, ONLY: DP
IMPLICIT NONE
COMPLEX(DP), DEVICE :: c_bgrp(:,:)
END SUBROUTINE c_bgrp_expand_gpu_x
#endif
END INTERFACE
INTERFACE c_bgrp_pack
SUBROUTINE c_bgrp_pack_x( c_bgrp )
@ -827,6 +857,13 @@
IMPLICIT NONE
COMPLEX(DP) :: c_bgrp(:,:)
END SUBROUTINE c_bgrp_pack_x
#if defined (__CUDA)
SUBROUTINE c_bgrp_pack_gpu_x( c_bgrp )
USE kinds, ONLY: DP
IMPLICIT NONE
COMPLEX(DP), DEVICE :: c_bgrp(:,:)
END SUBROUTINE c_bgrp_pack_gpu_x
#endif
END INTERFACE
INTERFACE vofrho
@ -856,6 +893,16 @@
REAL(DP), INTENT(IN) :: f( : )
REAL(DP) :: enkin_x
END FUNCTION enkin_x
#if defined (__CUDA)
FUNCTION enkin_gpu_x( c, f, n )
USE kinds, ONLY: dp
IMPLICIT NONE
INTEGER, INTENT(IN) :: n
COMPLEX(DP), DEVICE, INTENT(IN) :: c( :, : )
REAL(DP), DEVICE, INTENT(IN) :: f( : )
REAL(DP) :: enkin_gpu_x
END FUNCTION enkin_gpu_x
#endif
END INTERFACE
INTERFACE newinit
@ -889,44 +936,45 @@
END INTERFACE
INTERFACE dotcsc
SUBROUTINE dotcsc_x( eigr, cp, ngw, n )
SUBROUTINE dotcsc_x( betae, cp, ngw, n )
USE kinds, ONLY: dp
IMPLICIT NONE
INTEGER, INTENT(IN) :: ngw, n
COMPLEX(DP), INTENT(IN) :: eigr(:,:), cp(:,:)
COMPLEX(DP), INTENT(IN) :: cp(:,:)
COMPLEX(DP), INTENT(INOUT) :: betae(:,:)
END SUBROUTINE dotcsc_x
END INTERFACE
INTERFACE nlsm1
SUBROUTINE nlsm1_x ( n, nspmn, nspmx, eigr, c, becp, pptype_ )
SUBROUTINE nlsm1_x ( n, betae, c, becp, pptype_ )
USE kinds, ONLY : DP
IMPLICIT NONE
INTEGER, INTENT(IN) :: n, nspmn, nspmx
COMPLEX(DP), INTENT(IN) :: eigr( :, : ), c( :, : )
INTEGER, INTENT(IN) :: n
COMPLEX(DP), INTENT(IN) :: c( :, : )
COMPLEX(DP), INTENT(INOUT) :: betae( :, : )
REAL(DP), INTENT(OUT) :: becp( :, : )
INTEGER, INTENT(IN), OPTIONAL :: pptype_
END SUBROUTINE nlsm1_x
END INTERFACE
INTERFACE nlsm2_bgrp
SUBROUTINE nlsm2_bgrp_x( ngw, nkb, eigr, c_bgrp, becdr_bgrp, nbspx_bgrp, nbsp_bgrp )
SUBROUTINE nlsm2_bgrp_x( ngw, nkb, betae, c_bgrp, becdr_bgrp, nbspx_bgrp, nbsp_bgrp )
USE kinds, ONLY : DP
IMPLICIT NONE
INTEGER, INTENT(IN) :: ngw, nkb, nbspx_bgrp, nbsp_bgrp
COMPLEX(DP), INTENT(IN) :: eigr( :, : ), c_bgrp( :, : )
COMPLEX(DP), INTENT(IN) :: betae( :, : ), c_bgrp( :, : )
REAL(DP), INTENT(OUT) :: becdr_bgrp( :, :, : )
END SUBROUTINE nlsm2_bgrp_x
END INTERFACE
INTERFACE calbec_bgrp
SUBROUTINE calbec_bgrp_x ( nspmn, nspmx, eigr, c_bgrp, bec_bgrp, pptype_ )
#if defined (__CUDA)
SUBROUTINE nlsm2_bgrp_gpu_x( ngw, nkb, betae, c_bgrp, becdr_bgrp, nbspx_bgrp, nbsp_bgrp )
USE kinds, ONLY : DP
IMPLICIT NONE
INTEGER, INTENT(IN) :: nspmn, nspmx
COMPLEX(DP), INTENT(IN) :: eigr( :, : ), c_bgrp( :, : )
REAL(DP), INTENT(OUT) :: bec_bgrp( :, : )
INTEGER, INTENT(IN), OPTIONAL :: pptype_
END SUBROUTINE calbec_bgrp_x
INTEGER, INTENT(IN) :: ngw, nkb, nbspx_bgrp, nbsp_bgrp
COMPLEX(DP), INTENT(IN), DEVICE :: betae( :, : )
COMPLEX(DP), INTENT(IN), DEVICE :: c_bgrp( :, : )
REAL(DP), INTENT(OUT) :: becdr_bgrp( :, :, : )
END SUBROUTINE nlsm2_bgrp_gpu_x
#endif
END INTERFACE
INTERFACE ennl
@ -950,12 +998,13 @@
END INTERFACE
INTERFACE calbec
SUBROUTINE calbec_x( nspmn, nspmx, eigr, c, bec, pptype_ )
SUBROUTINE calbec_x( n, betae, c, bec, pptype_ )
USE kinds, ONLY: DP
IMPLICIT NONE
INTEGER, INTENT(IN) :: nspmn, nspmx
REAL(DP), INTENT(OUT) :: bec( :, : )
COMPLEX(DP), INTENT(IN) :: c( :, : ), eigr( :, : )
INTEGER, INTENT(IN) :: n
REAL(DP), INTENT(OUT) :: bec( :, : )
COMPLEX(DP), INTENT(IN) :: c( :, : )
COMPLEX(DP), INTENT(INOUT) :: betae( :, : )
INTEGER, INTENT(IN), OPTIONAL :: pptype_
END SUBROUTINE calbec_x
END INTERFACE
@ -983,10 +1032,11 @@
END INTERFACE
INTERFACE nlfq_bgrp
SUBROUTINE nlfq_bgrp_x( c_bgrp, eigr, bec_bgrp, becdr_bgrp, fion )
SUBROUTINE nlfq_bgrp_x( c_bgrp, betae, bec_bgrp, becdr_bgrp, fion )
USE kinds, ONLY: DP
IMPLICIT NONE
COMPLEX(DP), INTENT(IN) :: c_bgrp( :, : ), eigr( :, : )
COMPLEX(DP), INTENT(IN) DEVICEATTR :: c_bgrp( :, : )
COMPLEX(DP), INTENT(IN) DEVICEATTR :: betae( :, : )
REAL(DP), INTENT(IN) :: bec_bgrp( :, : )
REAL(DP), INTENT(OUT) :: becdr_bgrp( :, :, : )
REAL(DP), INTENT(OUT) :: fion( :, : )
@ -1003,28 +1053,25 @@
END SUBROUTINE collect_bec_x
END INTERFACE
INTERFACE beta_eigr
SUBROUTINE beta_eigr_x ( beigr, nspmn, nspmx, eigr, pptype_ )
USE kinds, ONLY : DP
IMPLICIT NONE
INTEGER, INTENT(IN) :: nspmn, nspmx
COMPLEX(DP), INTENT(IN) :: eigr( :, : )
COMPLEX(DP), INTENT(OUT) :: beigr( :, : )
INTEGER, INTENT(IN), OPTIONAL :: pptype_
END SUBROUTINE beta_eigr_x
END INTERFACE
INTERFACE nlsm1us
SUBROUTINE nlsm1us_x ( n, beigr, c, becp )
SUBROUTINE nlsm1us_x ( n, betae, c, becp )
USE kinds, ONLY : DP
IMPLICIT NONE
INTEGER, INTENT(IN) :: n
COMPLEX(DP), INTENT(IN) :: beigr( :, : )
COMPLEX(DP), INTENT(IN) :: c( :, : )
REAL(DP), INTENT(OUT) :: becp( :, : )
COMPLEX(DP) DEVICEATTR , INTENT(INOUT) :: betae( :, : )
COMPLEX(DP) DEVICEATTR , INTENT(IN) :: c( :, : )
REAL(DP) DEVICEATTR , INTENT(OUT) :: becp( :, : )
END SUBROUTINE nlsm1us_x
END INTERFACE
INTERFACE dbeta_eigr
SUBROUTINE dbeta_eigr_x( dbeigr, eigr )
USE kinds, ONLY : DP
IMPLICIT NONE
COMPLEX(DP), INTENT(IN) :: eigr( :, : )
COMPLEX(DP), INTENT(OUT) :: dbeigr( :, :, :, :)
END SUBROUTINE dbeta_eigr_x
END INTERFACE
!=----------------------------------------------------------------------------=!

View File

@ -1092,6 +1092,7 @@ subroutine nlinit
use uspp, ONLY : aainit, beta, qq_nt, dvan, nhtol, nhtolm, indv,&
dbeta
use uspp_param, ONLY : upf, lmaxq, nbetam, lmaxkb, nhm, nh, ish
use uspp_gpum, ONLY : using_qq_nt, using_qq_nt_d, qq_nt_d
use atom, ONLY : rgrid
use qgb_mod, ONLY : qgb, dqgb
use smallbox_gvec, ONLY : ngb
@ -1503,7 +1504,7 @@ end subroutine dylmr2_
!-----------------------------------------------------------------------
SUBROUTINE dotcsc_x( eigr, cp, ngw, n )
SUBROUTINE dotcsc_x( betae, cp, ngw, n )
!-----------------------------------------------------------------------
!
USE kinds, ONLY: DP
@ -1514,13 +1515,14 @@ end subroutine dylmr2_
USE uspp_param, ONLY: nh, ish, upf
USE mp, ONLY: mp_sum
USE mp_global, ONLY: intra_bgrp_comm, nbgrp, inter_bgrp_comm
USE cp_interfaces, ONLY: nlsm1
USE cp_interfaces, ONLY: calbec
USE electrons_base, ONLY: ispin, ispin_bgrp, nbspx_bgrp, ibgrp_g2l, iupdwn, nupdwn, nbspx
!
IMPLICIT NONE
!
INTEGER, INTENT(IN) :: ngw, n
COMPLEX(DP), INTENT(IN) :: eigr(:,:), cp(:,:)
COMPLEX(DP), INTENT(IN) :: cp(:,:)
COMPLEX(DP), INTENT(INOUT) :: betae(:,:)
! local variables
REAL(DP) rsum, csc(n) ! automatic array
COMPLEX(DP) temp(ngw) ! automatic array
@ -1536,7 +1538,7 @@ end subroutine dylmr2_
! < beta | phi > is real. only the i lowest:
!
CALL nlsm1( nbspx_bgrp, 1, nsp, eigr, cp, becp, 2 )
CALL calbec( nbspx_bgrp, betae, cp, becp, 2 )
nnn = MIN( 12, n )
@ -1640,30 +1642,73 @@ end subroutine dylmr2_
! local
INTEGER :: ig, i
REAL(DP) :: sk(n) ! automatic array
REAL(DP) :: sk, rsum
!
sk = 0.0d0
!$omp parallel do reduction(+:sk) default(none) &
!$omp shared(c,g2kin,gstart,ngw,n,f) private(i,ig,rsum)
DO i=1,n
sk(i)=0.0d0
rsum = 0.0d0
DO ig=gstart,ngw
sk(i)=sk(i)+DBLE(CONJG(c(ig,i))*c(ig,i))*g2kin(ig)
rsum = rsum + DBLE(CONJG(c(ig,i))*c(ig,i)) * g2kin(ig)
END DO
sk = sk + f(i) * rsum
END DO
!$omp end parallel do
CALL mp_sum( sk(1:n), intra_bgrp_comm )
enkin_x=0.0d0
DO i=1,n
enkin_x=enkin_x+f(i)*sk(i)
END DO
CALL mp_sum( sk, intra_bgrp_comm )
! ... reciprocal-space vectors are in units of alat/(2 pi) so a
! ... multiplicative factor (2 pi/alat)**2 is required
enkin_x = enkin_x * tpiba2
enkin_x = tpiba2 * sk
!
RETURN
END FUNCTION enkin_x
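!
! For reference, the quantity reduced into the scalar sk above is (a
! sketch, assuming g2kin(ig) carries the kinetic factor in units of
! (2*pi/alat)**2, which the final tpiba2 rescaling converts):
!
!   E_kin = tpiba2 * \sum_n f_n \sum_{ig >= gstart} g2kin(ig) * |c(ig,n)|**2
!
! i.e. exactly the double loop now expressed as an OpenMP reduction.
!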
#if defined (__CUDA)
!-----------------------------------------------------------------------
FUNCTION enkin_gpu_x( c, f, n )
!-----------------------------------------------------------------------
!
USE kinds, ONLY: DP
USE constants, ONLY: pi, fpi
USE gvecw, ONLY: ngw
USE gvect, ONLY: gstart
USE gvecw, ONLY: g2kin_d
USE mp, ONLY: mp_sum
USE mp_global, ONLY: intra_bgrp_comm
USE cell_base, ONLY: tpiba2
USE cudafor
IMPLICIT NONE
REAL(DP) :: enkin_gpu_x
INTEGER, INTENT(IN) :: n
COMPLEX(DP), DEVICE, INTENT(IN) :: c( :, : )
REAL(DP), DEVICE, INTENT(IN) :: f( : )
!
! local
INTEGER :: ig, i
REAL(DP) :: sk
!
sk=0.0d0
!$cuf kernel do(2) <<<*,*>>>
DO i=1,n
DO ig=gstart,ngw
sk = sk + f(i) * DBLE(CONJG(c(ig,i))*c(ig,i)) * g2kin_d(ig)
END DO
END DO
CALL mp_sum( sk, intra_bgrp_comm )
enkin_gpu_x = tpiba2 * sk
!
RETURN
END FUNCTION enkin_gpu_x
#endif
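!
! Aside (a stand-alone sketch with a hypothetical program name; assumes
! the NVIDIA CUDA Fortran compiler): scalar accumulations inside a
! !$cuf kernel loop, as in enkin_gpu_x above, are compiled into
! device-side reductions automatically.
!
PROGRAM cuf_reduction_demo
   USE cudafor
   IMPLICIT NONE
   INTEGER, PARAMETER :: n = 1024
   REAL(8), DEVICE :: x_d(n)
   REAL(8) :: s
   INTEGER :: i
   x_d = 1.0d0
   s = 0.0d0
   !$cuf kernel do(1) <<<*,*>>>
   DO i = 1, n
      s = s + x_d(i)   ! turned into a device reduction on the host scalar s
   END DO
   PRINT *, s          ! 1024.0
END PROGRAM cuf_reduction_demo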
!-------------------------------------------------------------------------
SUBROUTINE nlfl_bgrp_x( bec_bgrp, becdr_bgrp, lambda, idesc, fion )
!-----------------------------------------------------------------------
@ -1932,25 +1977,34 @@ end subroutine dylmr2_
USE gvecw, ONLY : ngw
USE uspp, ONLY : beta, nhtol, indv_ijkb0
USE uspp_param, ONLY : nh, upf
USE gvect, ONLY : gstart
!
IMPLICIT NONE
COMPLEX(DP), INTENT(IN) :: eigr( :, : )
COMPLEX(DP), INTENT(OUT) :: betae( :, : )
!
INTEGER :: is, iv, ia, inl, ig, isa
COMPLEX(DP), PARAMETER, DIMENSION(4) :: cfact = & ! (l == 0), (l == 1), (l == 2), (l == 3)
[( 1.0_dp , 0.0_dp ), ( 0.0_dp , -1.0_dp ), ( -1.0_dp , 0.0_dp ), ( 0.0_dp , 1.0_dp )]
COMPLEX(DP) :: ci
!
CALL start_clock( 'prefor' )
!$omp parallel do default(shared) private(ia,is,iv,ci,inl,ig)
DO ia=1,nat
is=ityp(ia)
DO iv=1,nh(is)
ci=(0.0d0,-1.0d0)**nhtol(iv,is)
ci=cfact( nhtol(iv,is) + 1 )
inl = indv_ijkb0(ia) + iv
DO ig=1,ngw
betae(ig,inl)=ci*beta(ig,iv,is)*eigr(ig,ia)
END DO
!beigr(1,inl)=betae(1,inl)
!DO ig=gstart,ngw
! beigr(ig,inl)=2.0d0 * betae(ig,inl)
!END DO
END DO
END DO
!$omp end parallel do
CALL stop_clock( 'prefor' )
!
RETURN
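!
! Worked check of the cfact table above: (-i)**l cycles with period 4,
!
!   (-i)**0 = 1,  (-i)**1 = -i,  (-i)**2 = -1,  (-i)**3 = i,
!
! so cfact(nhtol(iv,is)+1) reproduces ci = (0.0d0,-1.0d0)**nhtol(iv,is)
! without evaluating a complex power inside the loop.
!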

View File

@ -24,6 +24,7 @@ SUBROUTINE cprmain( tau_out, fion_out, etot_out )
USE core, ONLY : rhoc
USE uspp_param, ONLY : nhm, nh, ish
USE uspp, ONLY : nkb, vkb, becsum, deeq, okvan, nlcc_any
USE uspp_gpum, ONLY : vkb_d
USE energies, ONLY : eht, epseu, exc, etot, eself, enl, &
ekin, atot, entropy, egrand, enthal, &
ekincm, print_energies
@ -80,7 +81,7 @@ SUBROUTINE cprmain( tau_out, fion_out, etot_out )
electrons_nosevel, electrons_noseupd
USE pres_ai_mod, ONLY : P_ext, P_in, P_fin, pvar, volclu, &
surfclu, Surf_t, abivol, abisur
USE wavefunctions, ONLY : c0_bgrp, cm_bgrp, phi_bgrp
USE wavefunctions, ONLY : c0_bgrp, cm_bgrp, cm_d, phi, c0_d
USE wannier_module, ONLY : allocate_wannier
USE cp_interfaces, ONLY : printout_new, move_electrons, newinit
USE cell_nose, ONLY : xnhh0, xnhhm, xnhhp, vnhh, temph, &
@ -96,14 +97,14 @@ SUBROUTINE cprmain( tau_out, fion_out, etot_out )
ema0bg, sfac, eigr, iprint_stdout, &
irb, taub, eigrb, rhog, rhos, &
rhor, bephi, becp_bgrp, nfi, idesc, &
drhor, drhog, bec_bgrp, dbec
drhor, drhog, bec_bgrp, dbec, bec_d, iabox, nabox
USE autopilot, ONLY : event_step, event_index, &
max_event_step, restart_p
USE cell_base, ONLY : s_to_r, r_to_s
USE wannier_subroutines, ONLY : wannier_startup, wf_closing_options, &
ef_enthalpy
USE cp_interfaces, ONLY : writefile, eigs, strucf, phfacs
USE cp_interfaces, ONLY : ortho, elec_fakekine, calbec_bgrp, calbec, caldbec_bgrp
USE cp_interfaces, ONLY : ortho, elec_fakekine, calbec, caldbec_bgrp
USE constraints_module, ONLY : check_constraint, remove_constr_force
USE cp_autopilot, ONLY : pilot
USE ions_nose, ONLY : ions_nose_allocate, ions_nose_shiftvar
@ -117,6 +118,7 @@ SUBROUTINE cprmain( tau_out, fion_out, etot_out )
USE london_module, ONLY : energy_london, force_london, stres_london
USE input_parameters, ONLY : tcpbo
USE xc_lib, ONLY : xclib_dft_is, start_exx, exx_is_active
USE device_memcpy_m, ONLY : dev_memcpy
!
IMPLICIT NONE
!
@ -266,7 +268,7 @@ SUBROUTINE cprmain( tau_out, fion_out, etot_out )
!
IF ( (okvan .or. nlcc_any ) .AND. (tfor .OR. thdyn .OR. tfirst) ) THEN
!
CALL initbox( tau0, alat, at, ainv, taub, irb )
CALL initbox( tau0, alat, at, ainv, taub, irb, iabox, nabox )
!
CALL phbox( taub, iverbosity, eigrb )
!
@ -288,13 +290,15 @@ SUBROUTINE cprmain( tau_out, fion_out, etot_out )
!
END IF
!
! ... why this call ??? from Paolo Umari
IF( force_pairing ) THEN
c0_bgrp(:,iupdwn(2):nbsp) = c0_bgrp(:,1:nupdwn(2))
cm_bgrp(:,iupdwn(2):nbsp) = cm_bgrp(:,1:nupdwn(2))
!phi(:,iupdwn(2):nbsp) = phi(:,1:nupdwn(2))
CALL dev_memcpy(phi(:,iupdwn(2):), phi, [1, ubound(phi)], 1, [1, nbsp])
lambda(:,:, 2) = lambda(:,:, 1)
ENDIF
!
IF ( tefield .or. tefield2 ) THEN
!
CALL calbec( 1, nsp, eigr, c0_bgrp, bec_bgrp ) ! ATTENTION
!
END IF
CALL dev_memcpy( c0_d, c0_bgrp )
!
! Autopilot (Dynamic Rules) Implementation
!
@ -322,13 +326,6 @@ SUBROUTINE cprmain( tau_out, fion_out, etot_out )
!
!=======================================================================
!
IF( force_pairing ) THEN
c0_bgrp(:,iupdwn(2):nbsp) = c0_bgrp(:,1:nupdwn(2))
cm_bgrp(:,iupdwn(2):nbsp) = cm_bgrp(:,1:nupdwn(2))
phi_bgrp(:,iupdwn(2):nbsp) = phi_bgrp(:,1:nupdwn(2))
lambda(:,:, 2) = lambda(:,:, 1)
ENDIF
!
! ... fake electronic kinetic energy
!
IF ( .NOT. tcg ) THEN
@ -339,9 +336,8 @@ SUBROUTINE cprmain( tau_out, fion_out, etot_out )
!
END IF
!
CALL move_electrons( nfi, tfirst, tlast, bg(:,1), bg(:,2), bg(:,3), &
fion, c0_bgrp, cm_bgrp, phi_bgrp, &
enthal, enb, enbi, fccc, ccc, dt2bye, stress, .false. )
CALL move_electrons( nfi, tprint, tfirst, tlast, bg(:,1), bg(:,2), bg(:,3), &
fion, enthal, enb, enbi, fccc, ccc, dt2bye, stress, .false. )
!
IF (lda_plus_u) fion = fion + forceh
!
@ -537,6 +533,7 @@ SUBROUTINE cprmain( tau_out, fion_out, etot_out )
! ... prefor calculates vkb
!
CALL prefor( eigr, vkb )
CALL dev_memcpy( vkb_d, vkb )
!
END IF
!
@ -553,13 +550,17 @@ SUBROUTINE cprmain( tau_out, fion_out, etot_out )
!
IF ( tortho ) THEN
!
CALL ortho( eigr, cm_bgrp, phi_bgrp, lambda, idesc, bigr, iter, ccc, bephi, becp_bgrp )
#if defined (__CUDA)
CALL ortho( vkb_d, cm_d, phi, lambda, idesc, bigr, iter, ccc, bephi, becp_bgrp )
#else
CALL ortho( vkb, cm_bgrp, phi, lambda, idesc, bigr, iter, ccc, bephi, becp_bgrp )
#endif
!
ELSE
!
CALL gram_bgrp( vkb, bec_bgrp, nkb, cm_bgrp, ngw )
!
IF ( iverbosity > 2 ) CALL dotcsc( eigr, cm_bgrp, ngw, nbsp_bgrp )
IF ( iverbosity > 2 ) CALL dotcsc( vkb, cm_bgrp, ngw, nbsp_bgrp )
!
END IF
!
@ -568,23 +569,31 @@ SUBROUTINE cprmain( tau_out, fion_out, etot_out )
IF ( iverbosity > 1 ) CALL laxlib_print_matrix( lambda, idesc, nbsp, 9, nudx, 1.D0, ionode, stdout )
!
IF ( tortho ) THEN
CALL updatc( ccc, lambda, phi_bgrp, bephi, becp_bgrp, bec_bgrp, cm_bgrp, idesc )
#if defined (__CUDA)
CALL updatc( ccc, lambda, phi, bephi, becp_bgrp, bec_d, cm_d, idesc )
CALL dev_memcpy( bec_bgrp, bec_d )
CALL dev_memcpy( cm_bgrp, cm_d )
#else
CALL updatc( ccc, lambda, phi, bephi, becp_bgrp, bec_bgrp, cm_bgrp, idesc )
#endif
END IF
!
IF( force_pairing ) THEN
c0_bgrp(:,iupdwn(2):nbsp) = c0_bgrp(:,1:nupdwn(2))
cm_bgrp(:,iupdwn(2):nbsp) = cm_bgrp(:,1:nupdwn(2))
phi_bgrp(:,iupdwn(2):nbsp) = phi_bgrp(:,1:nupdwn(2))
c0_bgrp(:,iupdwn(2):nbsp) = c0_bgrp(:,1:nupdwn(2))
cm_bgrp(:,iupdwn(2):nbsp) = cm_bgrp(:,1:nupdwn(2))
!phi(:,iupdwn(2):nbsp) = phi(:,1:nupdwn(2))
CALL dev_memcpy(phi(:,iupdwn(2):), phi, [1, ubound(phi)], 1, [1, nbsp])
lambda(:,:, 2) = lambda(:,:, 1)
ENDIF
!
CALL calbec_bgrp( 1, nsp, eigr, cm_bgrp, bec_bgrp, 1 )
! the following compute only on NC pseudo components
CALL calbec( nbsp_bgrp, vkb, cm_bgrp, bec_bgrp, 1 )
!
IF ( tpre ) THEN
CALL caldbec_bgrp( eigr, cm_bgrp, dbec, idesc )
END IF
!
IF ( iverbosity > 1 ) CALL dotcsc( eigr, cm_bgrp, ngw, nbsp_bgrp )
IF ( iverbosity > 1 ) CALL dotcsc( vkb, cm_bgrp, ngw, nbsp_bgrp )
!
END IF
!
@ -801,7 +810,7 @@ SUBROUTINE cprmain( tau_out, fion_out, etot_out )
! ... restart with CP
!
IF ( okvan .or. nlcc_any ) THEN
CALL initbox( tau0, alat, at, ainv, taub, irb )
CALL initbox( tau0, alat, at, ainv, taub, irb, iabox, nabox )
CALL phbox( taub, iverbosity, eigrb )
END IF
CALL r_to_s( tau0, taus, nat, ainv )
@ -816,9 +825,8 @@ SUBROUTINE cprmain( tau_out, fion_out, etot_out )
!
lambdam = lambda
!
CALL move_electrons( nfi, tfirst, tlast, bg(:,1), bg(:,2), bg(:,3),&
fion, c0_bgrp, cm_bgrp, phi_bgrp, enthal, enb,&
enbi, fccc, ccc, dt2bye, stress,.true. )
CALL move_electrons( nfi, tprint, tfirst, tlast, bg(:,1), bg(:,2), bg(:,3),&
fion, enthal, enb, enbi, fccc, ccc, dt2bye, stress,.true. )
!
END IF
!
@ -1094,6 +1102,7 @@ SUBROUTINE terminate_run()
CALL print_clock( 'nlfq' )
CALL print_clock( 'nlsm1' )
CALL print_clock( 'nlsm2' )
CALL print_clock( 'nlsm1us' )
CALL print_clock( 'fft' )
CALL print_clock( 'ffts' )
CALL print_clock( 'fftw' )
@ -1114,6 +1123,7 @@ SUBROUTINE terminate_run()
CALL print_clock( 'new_ns' )
CALL print_clock( 'strucf' )
CALL print_clock( 'calbec' )
CALL print_clock( 'exch_corr' )
!==============================================================
IF (ts_vdw) THEN
WRITE( stdout, '(/5x,"Called by tsvdw:")' )
View File
@ -15,11 +15,13 @@ SUBROUTINE deallocate_modules_var()
!
USE core, ONLY : deallocate_core
USE uspp, ONLY : deallocate_uspp
USE uspp_gpum, ONLY : deallocate_uspp_gpu
USE electrons_base, ONLY : deallocate_elct
USE efield_module, ONLY : deallocate_efield
USE ensemble_dft, ONLY : deallocate_ensemble_dft
USE cg_module, ONLY : deallocate_cg
USE gvect, ONLY : deallocate_gvect
USE gvect_gpum, ONLY : deallocate_gvect_gpu
USE gvecw, ONLY : deallocate_gvecw
USE smallbox_gvec, ONLY : deallocate_smallbox_gvec
USE local_pseudo, ONLY : deallocate_local_pseudo
@ -59,8 +61,10 @@ SUBROUTINE deallocate_modules_var()
CALL deallocate_cg( )
CALL deallocate_core()
CALL deallocate_uspp()
CALL deallocate_uspp_gpu()
CALL deallocate_gvect(.TRUE.) ! the value .TRUE. is hard-coded in init.f90:195;
! here it prevents a double free of the gg variable.
CALL deallocate_gvect_gpu()
CALL deallocate_gvecw()
CALL deallocate_smallbox_gvec( )
CALL deallocate_local_pseudo()
View File
@ -63,6 +63,8 @@
complex(DP), ALLOCATABLE :: self_rhog( :,: )
LOGICAL :: ttsic
real(DP) :: detmp(3,3)
CALL start_clock( 'exch_corr' )
!
! filling of gradr with the gradient of rho using fft's
!
@ -246,6 +248,8 @@
IF( ALLOCATED( gradr ) ) DEALLOCATE( gradr )
CALL stop_clock( 'exch_corr' )
5555 format(1x,f12.5,1x,f12.5,1x,f12.5/ &
& 1x,f12.5,1x,f12.5,1x,f12.5/ &
& 1x,f12.5,1x,f12.5,1x,f12.5//)
@ -511,29 +515,29 @@ subroutine exch_corr_cp(nnr,nspin,grhor,rhor,etxc)
! h contains D(rho*Exc)/D(|grad rho|) * (grad rho) / |grad rho|
!
!$omp parallel default(none), shared(nnr,grhor,h), private(ipol,k)
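! collapse(2) fuses the ipol and k loops into a single iteration
! space, so all 3*nnr iterations are shared among the threads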
!$omp do collapse(2)
do ipol = 1, 3
!$omp do
do k = 1, nnr
grhor (ipol, k, 1) = h (k, 1, 1) * grhor (ipol, k, 1)
enddo
!$omp end do
end do
!$omp end do
!$omp end parallel
!
!
else
!
!$omp parallel default(none), shared(nnr,grhor,h), private(ipol,k,grup,grdw)
!$omp do collapse(2)
do ipol = 1, 3
!$omp do
do k = 1, nnr
grup = grhor (ipol, k, 1)
grdw = grhor (ipol, k, 2)
grhor (ipol, k, 1) = h (k, 1, 1) * grup + h (k, 1, 2) * grdw
grhor (ipol, k, 2) = h (k, 2, 2) * grdw + h (k, 2, 1) * grup
enddo
!$omp end do
enddo
!$omp end do
!$omp end parallel
!
end if
View File
@ -80,21 +80,14 @@
!=======================================================================
nogrp_ = fftx_ntgrp(dffts)
IF( nogrp_ > 1 ) THEN
CALL errore('dforce','Task group not supported',1)
END IF
ALLOCATE( psi( dffts%nnr_tg ) )
!
#if defined(__MPI)
CALL c2psi_gamma_tg( dffts, psi, c, i, n )
CALL invfft('tgWave', psi, dffts)
#else
CALL c2psi_gamma( dffts, psi, c(:,i), c(:,i+1) )
!
CALL invfft( 'Wave', psi, dffts )
#endif
!
! the following avoids a potential out-of-bounds error
!
@ -213,11 +206,7 @@
!
END IF
!
#if defined(__MPI)
CALL fwfft( 'tgWave', psi, dffts )
#else
CALL fwfft( 'Wave', psi, dffts )
#endif
!
! note : the factor 0.5 appears
! in the kinetic energy because it is defined as 0.5*g**2
@ -343,3 +332,231 @@
!
RETURN
END SUBROUTINE dforce_x
#if defined (__CUDA)
!-------------------------------------------------------------------------
SUBROUTINE dforce_gpu_x ( i, bec, vkb, c, df, da, v, ldv, ispin, f, n, nspin )
!-----------------------------------------------------------------------
!computes: the generalized force df=cmplx(dfr,dfi,kind=DP) acting on the i-th
!          electron state at the Gamma point of the Brillouin zone
!          represented by the vector c=cmplx(cr,ci,kind=DP)
!
! d_n(g) = f_n { 0.5 g^2 c_n(g) + [vc_n](g) +
! sum_i,ij d^q_i,ij (-i)**l beta_i,i(g)
! e^-ig.r_i < beta_i,j | c_n >}
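!      here f_n is the occupation of state n and v the total local
!      potential; at the Gamma point two real-valued bands are packed
!      into one complex FFT array, psi = c_n + i*c_n+1, exploiting
!      c*(g) = c(-g) (see the packing loops below)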
!
USE parallel_include
USE kinds, ONLY: dp
USE control_flags, ONLY: iprint
USE uspp, ONLY: nhsa=>nkb, dvan, deeq, indv_ijkb0
USE uspp_param, ONLY: nhm, nh
USE constants, ONLY: pi, fpi
USE ions_base, ONLY: nsp, na, nat, ityp
USE gvecw, ONLY: ngw, g2kin_d
USE cell_base, ONLY: tpiba2
USE ensemble_dft, ONLY: tens
USE xc_lib, ONLY: xclib_dft_is, exx_is_active
USE fft_base, ONLY: dffts
USE fft_interfaces, ONLY: fwfft, invfft
USE mp_global, ONLY: me_bgrp
USE control_flags, ONLY: many_fft
USE fft_helper_subroutines
USE device_memcpy_m, ONLY : dev_memcpy
USE cudafor
!
IMPLICIT NONE
!
INTEGER, INTENT(IN) :: i
REAL(DP) :: bec(:,:)
COMPLEX(DP), DEVICE :: vkb(:,:)
COMPLEX(DP), DEVICE :: c(:,:)
COMPLEX(DP) :: df(:), da(:)
INTEGER, INTENT(IN) :: ldv
REAL(DP), DEVICE :: v( :, : )
INTEGER :: ispin( : )
REAL(DP) :: f( : )
INTEGER, INTENT(IN) :: n, nspin
!
! local variables
!
INTEGER :: iv, jv, ia, is, iss1, iss2, ir, ig, inl, jnl
INTEGER :: igno, igrp, ierr, ii
INTEGER :: idx, ioff
REAL(DP) :: fi, fip, dd, dv
COMPLEX(DP) :: fp, fm
complex(DP), parameter :: ci=(0.0d0,1.0d0)
REAL(DP), ALLOCATABLE :: af( :, : ), aa( :, : )
REAL(DP), ALLOCATABLE, DEVICE :: af_d( :, : ), aa_d( :, : )
COMPLEX(DP), ALLOCATABLE, DEVICE :: psi(:)
COMPLEX(DP), ALLOCATABLE, DEVICE :: df_d(:)
COMPLEX(DP), ALLOCATABLE, DEVICE :: da_d(:)
INTEGER, DEVICE, POINTER :: nl_d(:), nlm_d(:)
!
CALL start_clock( 'dforce' )
!
IF(xclib_dft_is('hybrid').AND.exx_is_active()) THEN
CALL errore(' dforce ', ' dft_is_hybrid and exx_is_active NOT implemented ', 1 )
END IF
ALLOCATE( psi( dffts%nnr * many_fft ) )
ALLOCATE( df_d( SIZE( df ) ) )
ALLOCATE( da_d( SIZE( da ) ) )
!
psi = 0.0d0
nl_d => dffts%nl_d
nlm_d => dffts%nlm_d
ioff = 0
DO ii = i, i + 2 * many_fft - 1, 2
IF( ii < n ) THEN
!$cuf kernel do(1)
do ig = 1, dffts%ngw
psi( nlm_d( ig ) + ioff) = CONJG( c( ig, ii ) ) + ci * conjg( c( ig, ii+1))
psi( nl_d( ig ) + ioff) = c( ig, ii ) + ci * c( ig, ii+1)
end do
ELSE IF( ii == n ) THEN
!$cuf kernel do(1)
do ig = 1, dffts%ngw
psi( nlm_d( ig ) + ioff) = CONJG( c( ig, ii ) )
psi( nl_d( ig ) + ioff) = c( ig, ii )
end do
END IF
ioff = ioff + dffts%nnr
END DO
!
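! batched back-transform: many_fft band pairs are brought to real
! space in a single FFT call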
CALL invfft( 'Wave', psi, dffts, many_fft )
!
ioff = 0
DO ii = i, i + 2 * many_fft - 1, 2
IF( ii < n ) THEN
iss1=ispin( ii )
iss2=ispin( ii + 1 )
!$cuf kernel do(1)
DO ir=1,dffts%nnr
psi(ir+ioff)=CMPLX( v(ir,iss1)* DBLE(psi(ir+ioff)), &
v(ir,iss2)*AIMAG(psi(ir+ioff)) ,kind=DP)
END DO
ELSE IF( ii == n ) THEN
iss1=ispin( ii )
iss2=iss1
!$cuf kernel do(1)
DO ir=1,dffts%nnr
psi(ir+ioff)=CMPLX( v(ir,iss1)* DBLE(psi(ir+ioff)), &
v(ir,iss2)*AIMAG(psi(ir+ioff)) ,kind=DP)
END DO
END IF
ioff = ioff + dffts%nnr
END DO
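! the local potential has been applied in real space; transform the
! products back to reciprocal space, again in a single batched call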
CALL fwfft( 'Wave', psi, dffts, many_fft )
igno = 0
ioff = 0
DO idx = 1, 2 * many_fft, 2
IF( idx + i - 1 <= n ) THEN
if (tens) then
fi = -0.5d0
fip = -0.5d0
else
fi = -0.5d0*f(i+idx-1)
fip = -0.5d0*f(i+idx)
endif
CALL fftx_psi2c_gamma_gpu( dffts, psi( 1+ioff : ioff+dffts%nnr ), df_d(1+igno:igno+ngw), da_d(1+igno:igno+ngw))
!$cuf kernel do(1)
DO ig=1,ngw
df_d(ig+igno)= fi*(tpiba2*g2kin_d(ig)* c(ig,idx+i-1)+df_d(ig+igno))
da_d(ig+igno)=fip*(tpiba2*g2kin_d(ig)* c(ig,idx+i )+da_d(ig+igno))
END DO
END IF
igno = igno + ngw
ioff = ioff + dffts%nnr
ENDDO
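! df_d/da_d now hold the kinetic plus local-potential terms scaled by
! -f_n/2; the nonlocal (beta projector) term is added below if nhsa > 0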
!
IF( nhsa > 0 ) THEN
!
! aa_i,i,n = sum_j d_i,ij <beta_i,j|c_n>
!
ALLOCATE( af( nhsa, many_fft ), aa( nhsa, many_fft ) )
ALLOCATE( af_d( nhsa, many_fft ), aa_d( nhsa, many_fft ) )
!
!$omp parallel do default(none), &
!$omp shared(many_fft,i,n,tens,f,nat,ityp,nh,dvan,indv_ijkb0,deeq,af,aa,bec,ispin), &
!$omp private(idx,igrp,fi,fip,ia,is,iv,jv,inl,jnl,dv,dd,iss1,iss2)
DO idx = 1, 2*many_fft , 2
igrp = idx/2+1
af(:,igrp) = 0.0d0
aa(:,igrp) = 0.0d0
IF( idx + i - 1 <= n ) THEN
IF( i + idx - 1 /= n ) THEN
fi = f(i+idx-1)
fip= f(i+idx)
iss1=ispin( i+idx-1 )
iss2=ispin( i+idx )
ELSE
fi = f(i+idx-1)
iss1=ispin( i+idx-1 )
END IF
IF (tens) THEN
fi = 1.0d0
fip= 1.0d0
ENDIF
!
DO ia = 1, nat
is = ityp(ia)
DO iv = 1, nh(is)
DO jv = 1, nh(is)
dv = dvan(iv,jv,is)
inl = indv_ijkb0(ia) + iv
jnl = indv_ijkb0(ia) + jv
IF( i + idx - 1 /= n ) THEN
dd = deeq(iv,jv,ia,iss1) + dv
af(inl,igrp) = af(inl,igrp) - fi * dd * bec(jnl,i+idx-1)
dd = deeq(iv,jv,ia,iss2) + dv
aa(inl,igrp) = aa(inl,igrp) - fip * dd * bec(jnl,i+idx)
ELSE
dd = deeq(iv,jv,ia,iss1) + dv
af(inl,igrp) = af(inl,igrp) - fi * dd * bec(jnl,i+idx-1)
END IF
END DO
END DO
END DO
END IF
END DO
!$omp end parallel do
CALL dev_memcpy( af_d, af )
CALL dev_memcpy( aa_d, aa )
IF( ngw > 0 ) THEN
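! the nonlocal term is added with a real GEMM: the complex arrays are
! viewed as real with leading dimension 2*ngw, which is exact here
! because af_d/aa_d are real coefficients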
CALL MYDGEMM ( 'N', 'N', 2*ngw, many_fft , nhsa, 1.0d0, vkb, 2*ngw, af_d, nhsa, 1.0d0, df_d, 2*ngw)
CALL MYDGEMM ( 'N', 'N', 2*ngw, many_fft , nhsa, 1.0d0, vkb, 2*ngw, aa_d, nhsa, 1.0d0, da_d, 2*ngw)
END IF
!
DEALLOCATE( aa, af )
DEALLOCATE( aa_d, af_d )
!
ENDIF
CALL dev_memcpy( df, df_d )
CALL dev_memcpy( da, da_d )
!
DEALLOCATE( df_d )
DEALLOCATE( da_d )
DEALLOCATE( psi )
NULLIFY(nl_d)
NULLIFY(nlm_d)
!
CALL stop_clock( 'dforce' )
!
RETURN
END SUBROUTINE dforce_gpu_x
#endif
View File
@ -30,6 +30,7 @@ SUBROUTINE from_scratch( )
USE energies, ONLY : dft_energy_type, debug_energies
USE dener, ONLY : denl, denl6, dekin6, detot
USE uspp, ONLY : vkb, becsum, deeq, nkb, okvan, nlcc_any
USE uspp_gpum, ONLY : vkb_d
USE io_global, ONLY : stdout, ionode
USE core, ONLY : rhoc
USE gvecw, ONLY : ngw
@ -43,21 +44,24 @@ SUBROUTINE from_scratch( )
USE cp_interfaces, ONLY : runcp_uspp, runcp_uspp_force_pairing, &
strucf, phfacs, nlfh, vofrho, nlfl_bgrp, prefor
USE cp_interfaces, ONLY : rhoofr, ortho, wave_rand_init, elec_fakekine
USE cp_interfaces, ONLY : compute_stress, dotcsc, calbec_bgrp, caldbec_bgrp
USE cp_interfaces, ONLY : compute_stress, dotcsc, calbec, caldbec_bgrp
USE cp_interfaces, ONLY : nlfq_bgrp
USE printout_base, ONLY : printout_pos
USE orthogonalize_base, ONLY : updatc, calphi_bgrp
USE wave_base, ONLY : wave_steepest
USE wavefunctions, ONLY : c0_bgrp, cm_bgrp, phi_bgrp
USE wavefunctions, ONLY : c0_bgrp, cm_bgrp, c0_d, phi, cm_d
USE fft_base, ONLY : dfftp, dffts
USE time_step, ONLY : delt
USE cp_main_variables, ONLY : idesc, bephi, becp_bgrp, nfi, &
sfac, eigr, taub, irb, eigrb, bec_bgrp, &
USE cp_main_variables, ONLY : idesc, bephi, becp_bgrp, nfi, iabox, nabox, &
sfac, eigr, taub, irb, eigrb, bec_bgrp, bec_d, &
lambda, lambdam, lambdap, ema0bg, rhog, rhor, rhos, &
vpot, ht0, edft, becdr_bgrp, dbec, drhor, drhog
USE mp_global, ONLY : inter_bgrp_comm, nbgrp, me_bgrp
USE mp, ONLY : mp_sum
USE mp_world, ONLY : mpime, world_comm
USE mp, ONLY : mp_sum, mp_barrier
USE matrix_inversion
USE device_memcpy_m, ONLY : dev_memcpy
!
IMPLICIT NONE
!
@ -115,7 +119,7 @@ SUBROUTINE from_scratch( )
CALL strucf( sfac, eigts1, eigts2, eigts3, mill, dffts%ngm )
!
IF ( okvan .OR. nlcc_any ) THEN
CALL initbox ( tau0, alat, at, ainv, taub, irb )
CALL initbox ( tau0, alat, at, ainv, taub, irb, iabox, nabox )
CALL phbox( taub, iverbosity, eigrb )
END IF
!
@ -133,6 +137,7 @@ SUBROUTINE from_scratch( )
! ... prefor calculates vkb (used by gram)
!
CALL prefor( eigr, vkb )
CALL dev_memcpy( vkb_d, vkb )
!
nspin_wfc = nspin
IF( force_pairing ) nspin_wfc = 1
@ -141,7 +146,9 @@ SUBROUTINE from_scratch( )
IF( force_pairing ) cm_bgrp(:,iupdwn(2):iupdwn(2)+nupdwn(2)-1) = cm_bgrp(:,1:nupdwn(2))
!
if( iverbosity > 1 ) CALL dotcsc( eigr, cm_bgrp, ngw, nbsp )
if( iverbosity > 1 ) CALL dotcsc( vkb, cm_bgrp, ngw, nbsp )
!
CALL dev_memcpy( cm_d, cm_bgrp )
!
! ... initialize bands
!
@ -188,11 +195,11 @@ SUBROUTINE from_scratch( )
!
IF( .NOT. tcg ) THEN
!
CALL calbec_bgrp ( 1, nsp, eigr, cm_bgrp, bec_bgrp )
CALL calbec ( nbsp_bgrp, vkb, cm_bgrp, bec_bgrp, 0 )
!
if ( tstress ) CALL caldbec_bgrp( eigr, cm_bgrp, dbec, idesc )
!
CALL rhoofr( nfi, cm_bgrp, irb, eigrb, bec_bgrp, dbec, becsum, rhor, drhor, rhog, drhog, rhos, enl, denl, ekin, dekin6 )
CALL rhoofr( nfi, cm_bgrp, cm_d, bec_bgrp, dbec, becsum, rhor, drhor, rhog, drhog, rhos, enl, denl, ekin, dekin6 )
!
edft%enl = enl
edft%ekin = ekin
@ -201,7 +208,7 @@ SUBROUTINE from_scratch( )
!
! put core charge (if present) in rhoc(r)
!
if ( nlcc_any ) CALL set_cc( irb, eigrb, rhoc )
if ( nlcc_any ) CALL set_cc( rhoc )
!
IF( .NOT. tcg ) THEN
@ -229,7 +236,7 @@ SUBROUTINE from_scratch( )
if( iverbosity > 1 ) &
CALL printout_pos( stdout, fion, nat, ityp, head = ' fion ' )
CALL newd( vpot, irb, eigrb, becsum, fion )
CALL newd( vpot, becsum, fion, .true. )
!
IF( force_pairing ) THEN
!
@ -240,26 +247,42 @@ SUBROUTINE from_scratch( )
!
ELSE
!
CALL runcp_uspp( nfi, fccc, ccc, ema0bg, dt2bye, rhos, bec_bgrp, cm_bgrp, c0_bgrp, fromscra = .TRUE. )
CALL runcp_uspp( nfi, fccc, ccc, ema0bg, dt2bye, rhos, bec_bgrp, cm_bgrp, cm_d, c0_bgrp, c0_d, fromscra = .TRUE. )
!
ENDIF
!
CALL dev_memcpy( c0_d, c0_bgrp ) ! c0 contains the updated wave functions
!
! nlfq needs deeq bec
!
IF( ttforce ) THEN
CALL nlfq_bgrp( cm_bgrp, eigr, bec_bgrp, becdr_bgrp, fion )
#if defined (__CUDA)
CALL nlfq_bgrp( cm_d, vkb_d, bec_bgrp, becdr_bgrp, fion )
#else
CALL nlfq_bgrp( cm_bgrp, vkb, bec_bgrp, becdr_bgrp, fion )
#endif
END IF
!
! calphi calculates phi
! the electron mass rises with g**2
!
CALL calphi_bgrp( cm_bgrp, ngw, bec_bgrp, nkb, vkb, phi_bgrp, nbspx_bgrp, ema0bg )
#if defined (__CUDA)
CALL calphi_bgrp( cm_d, ngw, bec_bgrp, nkb, vkb_d, phi, nbspx_bgrp, ema0bg )
#else
CALL calphi_bgrp( cm_bgrp, ngw, bec_bgrp, nkb, vkb, phi, nbspx_bgrp, ema0bg )
#endif
!
IF( force_pairing ) THEN
! phi( :, iupdwn(2):(iupdwn(2)+nupdwn(2)-1) ) = phi( :, 1:nupdwn(2))
CALL dev_memcpy(phi(:,iupdwn(2):), phi(:,1:), [1, ngw], 1 , [1, nupdwn(2)], 1)
END IF
!
IF( force_pairing ) &
& phi_bgrp( :, iupdwn(2):(iupdwn(2)+nupdwn(2)-1) ) = phi_bgrp( :, 1:nupdwn(2))
if( tortho ) then
CALL ortho( eigr, c0_bgrp, phi_bgrp, lambda, idesc, bigr, iter, ccc, bephi, becp_bgrp )
#if defined (__CUDA)
CALL ortho( vkb_d, c0_d, phi, lambda, idesc, bigr, iter, ccc, bephi, becp_bgrp )
#else
CALL ortho( vkb, c0_bgrp, phi, lambda, idesc, bigr, iter, ccc, bephi, becp_bgrp )
#endif
else
CALL gram_bgrp( vkb, bec_bgrp, nkb, c0_bgrp, ngw )
endif
@ -275,23 +298,30 @@ SUBROUTINE from_scratch( )
if ( tstress ) CALL nlfh( stress, bec_bgrp, dbec, lambda, idesc )
!
IF ( tortho ) THEN
CALL updatc( ccc, lambda, phi_bgrp, bephi, becp_bgrp, bec_bgrp, c0_bgrp, idesc )
#if defined (__CUDA)
CALL updatc( ccc, lambda, phi, bephi, becp_bgrp, bec_d, c0_d, idesc )
CALL dev_memcpy( c0_bgrp, c0_d )
CALL dev_memcpy( bec_bgrp, bec_d )
#else
CALL updatc( ccc, lambda, phi, bephi, becp_bgrp, bec_bgrp, c0_bgrp, idesc )
#endif
END IF
!
IF( force_pairing ) THEN
!
c0_bgrp ( :, iupdwn(2):(iupdwn(2)+nupdwn(2)-1) ) = c0_bgrp( :, 1:nupdwn(2))
phi_bgrp( :, iupdwn(2):(iupdwn(2)+nupdwn(2)-1) ) = phi_bgrp( :, 1:nupdwn(2))
CALL dev_memcpy(phi(:,iupdwn(2):), phi(:,1:), [1, ngw], 1 , [1, nupdwn(2)], 1)
!phi( :, iupdwn(2):(iupdwn(2)+nupdwn(2)-1) ) = phi( :, 1:nupdwn(2))
lambda(:,:,2) = lambda(:,:,1)
!
ENDIF
!
!
CALL calbec_bgrp ( 1, nsp, eigr, c0_bgrp, bec_bgrp, 1 )
! the following computes only the norm-conserving pseudopotential components
CALL calbec ( nbsp_bgrp, vkb, c0_bgrp, bec_bgrp, 1 )
!
if ( tstress ) CALL caldbec_bgrp( eigr, cm_bgrp, dbec, idesc )
if ( iverbosity > 1 ) CALL dotcsc( eigr, c0_bgrp, ngw, nbsp_bgrp )
if ( iverbosity > 1 ) CALL dotcsc( vkb, c0_bgrp, ngw, nbsp_bgrp )
!
xnhp0 = 0.0d0
xnhpm = 0.0d0
@ -325,3 +355,10 @@ SUBROUTINE from_scratch( )
RETURN
!
END SUBROUTINE from_scratch
subroutine hangup
  USE mp_world, ONLY : world_comm
  USE mp,       ONLY : mp_barrier
  IMPLICIT NONE
  call mp_barrier(world_comm)
  CALL stop_cp_run()
end subroutine hangup
View File
@ -16,33 +16,47 @@ SUBROUTINE gram_bgrp( betae, bec_bgrp, nkbx, cp_bgrp, ngwx )
USE gvecw, ONLY : ngw
USE electrons_base, ONLY : nbspx_bgrp, ibgrp_g2l, nupdwn, iupdwn, nbspx, iupdwn_bgrp, nspin
USE kinds, ONLY : DP
USE mp_global, ONLY : inter_bgrp_comm
USE mp, ONLY : mp_sum
USE gvect, ONLY : gstart
USE mp_global, ONLY : intra_bgrp_comm, inter_bgrp_comm, me_bgrp, nproc_bgrp
USE mp_world, ONLY : mpime
!
IMPLICIT NONE
!
INTEGER, INTENT(IN) :: nkbx, ngwx
REAL(DP) :: bec_bgrp( nkbx, nbspx_bgrp )
COMPLEX(DP) :: cp_bgrp( ngwx, nbspx_bgrp ), betae( ngwx, nkbx )
COMPLEX(DP) :: cp_bgrp( ngwx, nbspx_bgrp )
COMPLEX(DP), INTENT(IN) :: betae( ngwx, nkbx )
!
REAL(DP) :: anorm
REAL(DP), ALLOCATABLE :: csc( : )
COMPLEX(DP), ALLOCATABLE :: ctmp( : )
REAL(DP), ALLOCATABLE :: temp(:)
COMPLEX(DP), ALLOCATABLE :: cp_tmp(:)
REAL(DP), ALLOCATABLE :: bec_tmp(:)
REAL(DP), ALLOCATABLE :: csc2( : )
INTEGER :: i,k,j, ig, ibgrp_k, ibgrp_i, nbgrp_im1, iss
REAL(DP), PARAMETER :: one = 1.d0
REAL(DP), PARAMETER :: mone = -1.d0
REAL(DP) :: g0
!
CALL start_clock( 'gram' )
g0 = 0.0d0
IF (gstart == 2) g0 = 1.0d0
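! g0 = 1 only on the process holding the g = 0 component (gstart == 2);
! it is used below to undo the double counting of that term in the
! Gamma-point dot products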
ALLOCATE( csc( nbspx ) )
ALLOCATE( ctmp( ngwx ) )
ALLOCATE( cp_tmp( ngwx ) )
ALLOCATE( bec_tmp( nkbx ) )
ALLOCATE( csc2( SIZE( csc ) ) )
!
DO iss = 1, nspin
DO i = iupdwn(iss), iupdwn(iss) + nupdwn(iss) - 1
!
ibgrp_i = ibgrp_g2l( i )
!
CALL gracsc_bgrp( bec_bgrp, betae, cp_bgrp, i, csc, iss, nbgrp_im1 )
CALL gracsc_bgrp( i, csc, iss, nbgrp_im1 )
!
! calculate orthogonalized cp(i) : |cp(i)>=|cp(i)>-\sum_k<i csc(k)|cp(k)>
!
@ -60,14 +74,19 @@ SUBROUTINE gram_bgrp( betae, bec_bgrp, nkbx, cp_bgrp, ngwx )
IF( ibgrp_i > 0 ) THEN
cp_bgrp( :, ibgrp_i ) = ctmp
anorm = cscnorm( bec_bgrp, cp_bgrp, ibgrp_i, nbspx_bgrp )
CALL dscal( 2*ngw, 1.0d0/anorm, cp_bgrp(1,ibgrp_i), 1 )
CALL dscal( nkbx, 1.0d0/anorm, bec_bgrp(1,ibgrp_i), 1 )
cp_bgrp(:,ibgrp_i) = cp_bgrp(:,ibgrp_i) / anorm
bec_bgrp(:,ibgrp_i) = bec_bgrp(:,ibgrp_i) / anorm
!CALL dscal( 2*ngw, 1.0d0/anorm, cp_bgrp(1,ibgrp_i), 1 )
!CALL dscal( nkbx, 1.0d0/anorm, bec_bgrp(1,ibgrp_i), 1 )
END IF
END DO
END DO
!
DEALLOCATE( ctmp )
DEALLOCATE( csc )
DEALLOCATE( csc2 )
DEALLOCATE( bec_tmp )
DEALLOCATE( cp_tmp )
CALL stop_clock( 'gram' )
!
@ -84,7 +103,6 @@ CONTAINS
!
USE ions_base, ONLY: nat, ityp
USE gvecw, ONLY: ngw
USE gvect, ONLY: gstart
USE uspp_param, ONLY: nh, upf
USE uspp, ONLY: qq_nt, indv_ijkb0
USE mp, ONLY: mp_sum
@ -97,47 +115,39 @@ CONTAINS
REAL(DP), INTENT(IN) :: bec( :, : )
COMPLEX(DP), INTENT(IN) :: cp( :, : )
!
REAL(DP) :: cscnorm
REAL(DP) :: cscnorm, ddot
!
INTEGER ig, is, iv, jv, ia, inl, jnl
INTEGER :: is, iv, jv, ia, indv
REAL(DP) rsum
REAL(DP), ALLOCATABLE:: temp(:)
!
ALLOCATE(temp(ngw))
rsum = 2.d0 * ddot(2*ngw,cp(1,i),1,cp(1,i),1) - g0 * REAL(CONJG(cp(1,i))*cp(1,i), DP)
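! Gamma-point norm: <cp|cp> = 2*sum_g |cp(g)|^2 with the g = 0 term
! counted once, hence the g0 correction to the ddot result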
!
DO ig=1,ngw
temp(ig)=DBLE(CONJG(cp(ig,i))*cp(ig,i))
DO ia=1,nat
IF ( MOD( ia, nproc_bgrp ) == me_bgrp ) THEN
is = ityp(ia)
IF( upf(is)%tvanp ) THEN
indv = indv_ijkb0(ia)
DO iv=1,nh(is)
DO jv=1,nh(is)
IF(ABS(qq_nt(iv,jv,is)).GT.1.e-5) THEN
rsum = rsum + qq_nt(iv,jv,is)*bec(indv+iv,i)*bec(indv+jv,i)
ENDIF
END DO
END DO
END IF
END IF
END DO
rsum=2.d0*SUM(temp)
IF (gstart == 2) rsum=rsum-temp(1)
CALL mp_sum( rsum, intra_bgrp_comm )
!
DO ia=1,nat
is = ityp(ia)
IF( upf(is)%tvanp ) THEN
DO iv=1,nh(is)
inl=indv_ijkb0(ia) + iv
DO jv=1,nh(is)
jnl=indv_ijkb0(ia) + jv
IF(ABS(qq_nt(iv,jv,is)).GT.1.e-5) THEN
rsum = rsum + qq_nt(iv,jv,is)*bec(inl,i)*bec(jnl,i)
ENDIF
END DO
END DO
END IF
END DO
!
cscnorm=SQRT(rsum)
DEALLOCATE(temp)
!
RETURN
END FUNCTION cscnorm
!
!
!-------------------------------------------------------------------------
SUBROUTINE gracsc_bgrp( bec_bgrp, betae, cp_bgrp, i, csc, iss, nk )
SUBROUTINE gracsc_bgrp( i, csc, iss, nk )
!-----------------------------------------------------------------------
! requires in input the updated bec(k) for k<i
! on output: bec(i) is recalculated
@ -148,7 +158,6 @@ CONTAINS
USE electrons_base, ONLY: ispin, ispin_bgrp, nbspx_bgrp, ibgrp_g2l, iupdwn, nupdwn, nbspx
USE gvecw, ONLY: ngw
USE mp, ONLY: mp_sum
USE mp_global, ONLY: intra_bgrp_comm, inter_bgrp_comm, me_bgrp, nproc_bgrp
USE kinds, ONLY: DP
USE gvect, ONLY: gstart
!
@ -156,97 +165,67 @@ CONTAINS
!
INTEGER, INTENT(IN) :: i, iss
INTEGER, INTENT(OUT) :: nk
COMPLEX(DP) :: betae( :, : )
REAL(DP) :: bec_bgrp( :, : )
COMPLEX(DP) :: cp_bgrp( :, : )
REAL(DP) :: csc( : )
INTEGER :: k, kmax,ig, is, iv, jv, ia, inl, jnl, ibgrp_k, ibgrp_i
REAL(DP) :: rsum
REAL(DP), ALLOCATABLE :: temp(:)
COMPLEX(DP), ALLOCATABLE :: cp_tmp(:)
REAL(DP), ALLOCATABLE :: bec_tmp(:)
REAL(DP), ALLOCATABLE :: csc2( : )
#if defined(_OPENMP)
INTEGER :: mytid, ntids, omp_get_thread_num, omp_get_num_threads
#endif
INTEGER :: k, kmax_bgrp, kmax,ig, is, iv, jv, ia, inl, jnl, ibgrp_k, ibgrp_i
REAL(DP) :: rsum, ddot
INTEGER :: omp_get_thread_num, omp_get_num_threads
!
! calculate csc(k)=<cp(i)|cp(k)>, k<i
!
kmax = i - 1
!
ALLOCATE( cp_tmp( ngwx ) )
ALLOCATE( bec_tmp( nkbx ) )
ALLOCATE( csc2( SIZE( csc ) ) )
cp_tmp = 0.0d0
csc = 0.0d0
ibgrp_i = ibgrp_g2l( i )
IF( ibgrp_i > 0 ) cp_tmp = cp_bgrp( :, ibgrp_i )
IF( ibgrp_i > 0 ) THEN
cp_tmp = cp_bgrp( :, ibgrp_i )
ELSE
cp_tmp = 0.0d0
END IF
CALL mp_sum( cp_tmp, inter_bgrp_comm )
!$omp parallel default(none), &
!$omp shared(iupdwn,kmax,ispin,ibgrp_g2l,ngw,cp_bgrp,cp_tmp,csc,betae,bec_bgrp,i,iss,gstart), &
!$omp shared(upf,nat,ityp,indv_ijkb0,nh), &
!$omp private( temp, k, ig, inl, ibgrp_k, ibgrp_i, is, ia )
ALLOCATE( temp( ngw ) )
!$omp do
kmax_bgrp = 0
nk = 0
DO k = iupdwn( iss ), kmax
IF ( ispin(i) .EQ. ispin(k) ) THEN
ibgrp_k = ibgrp_g2l( k )
IF( ibgrp_k > 0 ) THEN
DO ig = 1, ngw
temp(ig) = DBLE( cp_bgrp(ig,ibgrp_k) * CONJG(cp_tmp(ig)) )
END DO
csc(k) = 2.0d0 * SUM(temp)
IF (gstart == 2) csc(k) = csc(k) - temp(1)
END IF
ENDIF
IF( ibgrp_g2l( k ) > 0 ) THEN
kmax_bgrp = ibgrp_g2l( k )
nk = nk + 1
END IF
END DO
!$omp end do
!
!
! calculate bec(i)=<cp(i)|beta>
!
ibgrp_i = ibgrp_g2l( i )
!
kmax_bgrp = kmax_bgrp - iupdwn_bgrp(iss) + 1
IF( kmax_bgrp > 0 .AND. ngw > 0 ) &
CALL dgemv( 'T', 2*ngw, kmax_bgrp, 1.0d0, cp_bgrp(1,iupdwn_bgrp(iss)), 2*ngwx, cp_tmp, 1, 0.0d0, csc2, 1 )
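! all overlaps <cp(k)|cp(i)>, k < i, of the locally stored bands in a
! single matrix-vector product; the Gamma-point weights (2 for g > 0,
! 1 for g = 0) are applied in the loop below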
nk = 0
DO k = iupdwn( iss ), kmax
ibgrp_k = ibgrp_g2l( k )
IF( ibgrp_k > 0 ) THEN
nk = nk + 1
csc(k) = 2.0d0 * csc2(nk) - g0 * DBLE( cp_bgrp(1,ibgrp_k) * CONJG(cp_tmp(1)) )
END IF
END DO
IF( ibgrp_i > 0 ) THEN
!$omp do
DO ia = 1, nat
is = ityp(ia)
IF( upf(is)%tvanp ) THEN
DO iv=1,nh(is)
inl=indv_ijkb0(ia)+iv
DO ig=1,ngw
temp(ig) = DBLE( cp_bgrp(ig,ibgrp_i) * CONJG(betae(ig,inl)) )
END DO
bec_bgrp(inl,ibgrp_i)=2.d0*SUM(temp)
IF (gstart == 2) bec_bgrp(inl,ibgrp_i)= bec_bgrp(inl,ibgrp_i)-temp(1)
bec_tmp(inl) = 2.d0 * DDOT( 2*ngw, cp_bgrp(1,ibgrp_i), 1, betae(1,inl), 1) &
- g0 * DBLE(cp_bgrp(1,ibgrp_i) * CONJG(betae(1,inl)))
END DO
ELSE
inl= indv_ijkb0(ia)
bec_tmp( inl + 1: inl + nh(is) ) = 0.0d0
END IF
END DO
!$omp end do
CALL mp_sum( bec_tmp, intra_bgrp_comm ) ! parallel sum over G vectors within a band group
bec_bgrp( : , ibgrp_i ) = bec_tmp( : )
ELSE
bec_tmp = 0.0d0
END IF
DEALLOCATE( temp )
!$omp end parallel
CALL mp_sum( csc, intra_bgrp_comm )
CALL mp_sum( csc, inter_bgrp_comm )
IF( ibgrp_i > 0 ) THEN
DO ia = 1, nat
is = ityp(ia)
IF( upf(is)%tvanp ) THEN
inl=indv_ijkb0(ia)
CALL mp_sum( bec_bgrp( inl + 1: inl + nh(is), ibgrp_i ), intra_bgrp_comm )
END IF
END DO
END IF
bec_tmp = 0.0d0
IF( ibgrp_i > 0 ) bec_tmp = bec_bgrp(:,ibgrp_i )
CALL mp_sum( bec_tmp, inter_bgrp_comm )
!
@ -254,62 +233,46 @@ CONTAINS
!
csc2 = 0.0d0
!$omp parallel default(none), &
!$omp shared(iupdwn,iss,kmax,nproc_bgrp,me_bgrp,ispin,i,ibgrp_g2l,nh), &
!$omp parallel if( (kmax - iupdwn( iss )) > omp_get_num_threads() ) default(none), &
!$omp shared(iupdwn,iss,kmax,nproc_bgrp,me_bgrp,nbspx,i,ibgrp_g2l,nh), &
!$omp shared(indv_ijkb0,qq_nt,na,bec_tmp,bec_bgrp,csc2,nat,ityp,upf), &
!$omp private( k, is, iv, jv, ia, inl, jnl, rsum, ibgrp_k, ntids, mytid )
#if defined(_OPENMP)
mytid = omp_get_thread_num() ! take the thread ID
ntids = omp_get_num_threads() ! take the number of threads
#endif
DO k=iupdwn(iss), kmax
IF ( MOD( k, nproc_bgrp ) /= me_bgrp ) CYCLE
#if defined(_OPENMP)
! distribute bands round robin to threads
IF( MOD( k / nproc_bgrp, ntids ) /= mytid ) CYCLE
#endif
IF (ispin(i).EQ.ispin(k)) THEN
rsum=0.d0
ibgrp_k = ibgrp_g2l( k )
IF( ibgrp_k > 0 ) THEN
DO ia = 1, nat
!$omp private( k, is, iv, jv, ia, inl, jnl, rsum, ibgrp_k )
!$omp do
DO k = iupdwn( iss ), kmax
rsum=0.d0
ibgrp_k = ibgrp_g2l( k )
IF( ibgrp_k > 0 ) THEN
DO ia = 1, nat
IF ( MOD( ia-1, nproc_bgrp ) == me_bgrp ) THEN
is=ityp(ia)
IF( upf(is)%tvanp ) THEN
inl = indv_ijkb0(ia)
DO iv=1,nh(is)
inl=indv_ijkb0(ia)+iv
DO jv=1,nh(is)
jnl=indv_ijkb0(ia)+jv
IF(ABS(qq_nt(iv,jv,is)).GT.1.e-5) THEN
rsum = rsum + qq_nt(iv,jv,is)*bec_tmp(inl)*bec_bgrp(jnl,ibgrp_k)
rsum = rsum + qq_nt(iv,jv,is)*bec_tmp(inl+iv)*bec_bgrp(inl+jv,ibgrp_k)
ENDIF
END DO
END DO
END IF
END DO
END IF
csc2(k)=csc2(k)+rsum
END IF
END DO
ENDIF
csc2(k)=csc2(k)+rsum
END DO
!$omp end do
!$omp end parallel
!
! orthogonalized cp(i) : |cp(i)>=|cp(i)>-\sum_k<i csc(k)|cp(k)>
!
! corresponding bec: bec(i)=<cp(i)|beta>-\sum_k csc(k)<cp(k)|beta>
!
CALL mp_sum( csc, intra_bgrp_comm )
CALL mp_sum( csc2, intra_bgrp_comm )
CALL mp_sum( csc, inter_bgrp_comm )
CALL mp_sum( csc2, inter_bgrp_comm )
csc = csc + csc2
bec_tmp = 0.0d0
DO k = iupdwn(iss), kmax
ibgrp_k = ibgrp_g2l( k )
IF( ibgrp_k > 0 ) THEN
DO inl=1,nkbx
bec_tmp(inl)=bec_tmp(inl)-csc(k)*bec_bgrp(inl,ibgrp_k)
END DO
END IF
END DO
nk = 0
DO k = iupdwn(iss), kmax
ibgrp_k = ibgrp_g2l( k )
@ -318,12 +281,15 @@ CONTAINS
csc( nk ) = csc( k )
END IF
END DO
IF( nk > 0 .AND. ngw > 0 ) THEN
CALL dgemv( 'N', nkbx, nk, -1.0d0, bec_bgrp(1,iupdwn_bgrp(iss)), nkbx, csc, 1, 0.0d0, bec_tmp, 1 )
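! bec(i) <- bec(i) - sum_k csc(k)*bec(k), again as a single
! matrix-vector product over the bands stored in this band group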
ELSE
bec_tmp = 0.0d0
END IF
CALL mp_sum( bec_tmp, inter_bgrp_comm )
IF( ibgrp_i > 0 ) bec_bgrp(:,ibgrp_i ) = bec_bgrp(:,ibgrp_i ) + bec_tmp
DEALLOCATE( csc2 )
DEALLOCATE( bec_tmp )
DEALLOCATE( cp_tmp )
!
RETURN
END SUBROUTINE gracsc_bgrp
View File
@ -31,6 +31,7 @@
USE gvect, ONLY : mill_g, eigts1,eigts2,eigts3, g, gg, &
ecutrho, gcutm, gvect_init, mill, &
ig_l2g, gstart, ngm, ngm_g, gshells
USE gvect_gpum, ONLY : using_g, using_g_d
use gvecs, only : gcutms, gvecs_init, ngms
use gvecw, only : gkcut, gvecw_init, g2kin_init
USE smallbox_subs, ONLY : ggenb
@ -195,6 +196,11 @@
!
END IF
CALL using_g(2)
#if defined (__CUDA)
CALL using_g_d(0)
#endif
CALL gshells (.TRUE.)
!
! ... allocate and generate (modified) kinetic energy
@ -401,6 +407,7 @@
cell_base_reinit
USE gvecw, ONLY : g2kin_init
USE gvect, ONLY : g, gg, ngm, mill
USE gvect_gpum, ONLY : using_g, using_g_d
USE fft_base, ONLY : dfftp, dfftb
USE small_box, ONLY : small_box_set
USE smallbox_subs, ONLY : gcalb
@ -425,10 +432,15 @@
!
! re-calculate G-vectors and kinetic energy
!
CALL using_g(2)
!
do ig = 1, dfftp%ngm
g(:,ig)= mill(1,ig)*bg(:,1) + mill(2,ig)*bg(:,2) + mill(3,ig)*bg(:,3)
gg(ig)=g(1,ig)**2 + g(2,ig)**2 + g(3,ig)**2
enddo
#if defined (__CUDA)
CALL using_g_d(0)
#endif
!
call g2kin_init ( gg, tpiba2 )
!
View File
@ -27,8 +27,9 @@ SUBROUTINE init_run()
USE fft_base, ONLY : dfftp, dffts
USE electrons_base, ONLY : nspin, nbsp, nbspx, nupdwn, f
USE uspp, ONLY : nkb, vkb, deeq, becsum,nkbus
USE uspp_gpum, ONLY : vkb_d
USE core, ONLY : rhoc
USE wavefunctions, ONLY : c0_bgrp, cm_bgrp, phi_bgrp
USE wavefunctions, ONLY : c0_bgrp, cm_bgrp, allocate_cp_wavefunctions
USE ensemble_dft, ONLY : tens, z0t
USE cg_module, ONLY : tcg
USE electrons_base, ONLY : nudx
@ -80,6 +81,9 @@ SUBROUTINE init_run()
USE cell_base, ONLY : ref_tpiba2, init_tpiba2
USE tsvdw_module, ONLY : tsvdw_initialize
USE exx_module, ONLY : exx_initialize
#if defined (__CUDA)
USE cudafor
#endif
!
IMPLICIT NONE
!
@ -171,10 +175,7 @@ SUBROUTINE init_run()
!
! initialize wave functions descriptors and allocate wf
!
IF(lwfpbe0nscf) ALLOCATE(cv0( ngw, vnbsp ) ) ! Lingzhu Kong
ALLOCATE( c0_bgrp( ngw, nbspx ) )
ALLOCATE( cm_bgrp( ngw, nbspx ) )
ALLOCATE( phi_bgrp( ngw, nbspx ) )
CALL allocate_cp_wavefunctions( ngw, nbspx, vnbsp, lwfpbe0nscf )
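! allocate_cp_wavefunctions is expected to allocate and zero c0_bgrp,
! cm_bgrp, phi (plus cv0 when lwfpbe0nscf is .TRUE.) together with the
! corresponding device copies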
!
IF ( iverbosity > 1 ) THEN
!
@ -203,6 +204,9 @@ SUBROUTINE init_run()
ALLOCATE( deeq( nhm, nhm, nat, nspin ) )
!
ALLOCATE( vkb( ngw, nkb ) )
#if defined(__CUDA)
ALLOCATE( vkb_d( ngw, nkb ) )
#endif
!
IF ( xclib_dft_is('meta') .AND. tens ) &
CALL errore( ' init_run ', 'ensemble_dft not implemented for metaGGA', 1 )
@ -264,11 +268,6 @@ SUBROUTINE init_run()
!
hnew = h
!
IF(lwfpbe0nscf) cv0=( 0.D0, 0.D0 ) ! Lingzhu Kong
cm_bgrp = ( 0.D0, 0.D0 )
c0_bgrp = ( 0.D0, 0.D0 )
phi_bgrp = ( 0.D0, 0.D0 )
!
IF ( tens ) then
CALL id_matrix_init( idesc, nspin )
CALL h_matrix_init( idesc, nspin )
View File
@ -7,7 +7,7 @@
!
!
!====================================================================
SUBROUTINE inner_loop_cold( nfi, tfirst, tlast, eigr, irb, eigrb, &
SUBROUTINE inner_loop_cold( nfi, tfirst, tlast, eigr, irb, eigrb, &
rhor, rhog, rhos, rhoc, ei1, ei2, ei3, &
sfac, c0, bec, dbec, firstiter, vpot )
!====================================================================
@ -109,7 +109,7 @@
! calculates the overlaps bec between the wavefunctions c0
! and the beta functions
CALL calbec( 1, nsp, eigr, c0, bec )
CALL calbec( n, betae, c0, bec )
! rotates the wavefunctions c0 and the overlaps bec
! (the occupation matrix f_ij becomes diagonal f_i)
@ -119,7 +119,7 @@
! calculates the electronic charge density
CALL rhoofr( nfi, c0diag, irb, eigrb, becdiag, dbec, rhovan, &
rhor, drhor, rhog, drhog, rhos, enl, denl, ekin, dekin6 )
IF(nlcc_any) CALL set_cc( irb, eigrb, rhoc )
IF(nlcc_any) CALL set_cc( rhoc )
! calculates the SCF potential, the total energy
! and the ionic forces
@ -141,7 +141,7 @@
! the augmentation charges and the
! corresponding contribution to the ionic force
CALL newd( vpot, irb, eigrb, rhovan, fion2 )
CALL newd( vpot, rhovan, fion2, .true. )
! operates the Hamiltonian on the wavefunction c0
h0c0( :, : )= 0.D0
@ -260,7 +260,7 @@
! calculates the electronic charge density
CALL rhoofr( nfi, c0diag, irb, eigrb, becdiag, dbec, rhovan, &
rhor, drhor, rhog, drhog, rhos, enl, denl, ekin, dekin6 )
IF(nlcc_any) CALL set_cc( irb, eigrb, rhoc )
IF(nlcc_any) CALL set_cc( rhoc )
! calculates the SCF potential, the total energy
! and the ionic forces
@ -409,7 +409,7 @@
! calculates the electronic charge density
CALL rhoofr( nfi, c0diag, irb, eigrb, becdiag, dbec, rhovan, &
rhor, drhor, rhog, drhog, rhos, enl, denl, ekin, dekin6 )
IF(nlcc_any) CALL set_cc( irb, eigrb, rhoc )
IF(nlcc_any) CALL set_cc( rhoc )
! calculates the SCF potential, the total energy
! and the ionic forces
View File
@ -119,10 +119,11 @@
USE step_penalty, ONLY: penalty_e, penalty_f
USE mp_pools, ONLY: intra_pool_comm, me_pool, nproc_pool
USE mp_bands, only: nbgrp
USE cp_interfaces, only: nlsm1, nlsm2_bgrp
USE cp_interfaces, only: calbec, nlsm2_bgrp
!
implicit none
complex(DP), intent(in) :: c(ngw,nx), eigr(ngw,nat), betae(ngw,nkb)
complex(DP), intent(in) :: c(ngw,nx), eigr(ngw,nat)
complex(DP), intent(inout) :: betae(ngw,nkb)
complex(DP), intent(out) :: hpsi(ngw,nx)
real(DP), INTENT(OUT) :: forceh(3,nat)
!
@ -243,10 +244,10 @@
allocate(dns(ldmx,ldmx,nspin,nat))
allocate (spsi(ngw,n))
!
call nlsm1 ( n, 1, nsp, eigr, c, bp )
call calbec ( n, betae, c, bp )
call s_wfc ( n, bp, betae, c, spsi )
call nlsm2_bgrp( ngw, nkb, eigr, c, dbp, nx, n )
call nlsm2_bgrp( ngw, nkb, eigr, wfcU, wdb, nwfcU, nwfcU )
call nlsm2_bgrp( ngw, nkb, betae, c, dbp, nx, n )
call nlsm2_bgrp( ngw, nkb, betae, wfcU, wdb, nwfcU, nwfcU )
!
! poor-man parallelization over bands
! - if nproc_pool=1 : nb_s=1, nb_e=n, mykey=0
@ -620,12 +621,13 @@
USE gvect, ONLY: gstart
USE ions_base, ONLY: nsp, nat
USE uspp, ONLY: nkb
USE cp_interfaces, only: nlsm1
USE cp_interfaces, only: calbec
!
IMPLICIT NONE
INTEGER, INTENT(IN) :: nx, n, nwfcU, offset(nat), &
Hubbard_l(nsp)
COMPLEX(DP), INTENT(IN) :: c( ngw, nx ), eigr(ngw,nat), betae(ngw,nkb)
COMPLEX(DP), INTENT(IN) :: c( ngw, nx ), eigr(ngw,nat)
COMPLEX(DP), INTENT(INOUT) :: betae(ngw,nkb)
!
COMPLEX(DP), INTENT(OUT):: wfcU(ngw, nwfcU), &
& swfc(ngw, nwfcU)
@ -641,7 +643,7 @@
!
! calculate bec = <beta|wfc>
!
CALL nlsm1( nwfcU, 1, nsp, eigr, wfcU, becwfc )
CALL calbec( nwfcU, betae, wfcU, becwfc )
!
! calculate swfc = S|wfc>
!
View File
@ -42,6 +42,8 @@ MODULE cp_main_variables
REAL(DP), ALLOCATABLE :: taub(:,:)
COMPLEX(DP), ALLOCATABLE :: eigrb(:,:)
INTEGER, ALLOCATABLE :: irb(:,:)
INTEGER, ALLOCATABLE :: iabox(:)
INTEGER :: nabox
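! iabox(1:nabox): atoms whose small-box (augmentation charge) FFT is
! assigned to this process; filled by initbox together with irb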
!
! ... nonlocal projectors:
! ... bec = scalar product of projectors and wave functions
@ -53,8 +55,12 @@ MODULE cp_main_variables
REAL(DP), ALLOCATABLE :: bephi(:,:) ! distributed (orhto group)
REAL(DP), ALLOCATABLE :: becp_bgrp(:,:) ! distributed becp (band group)
REAL(DP), ALLOCATABLE :: bec_bgrp(:,:) ! distributed bec (band group)
REAL(DP), ALLOCATABLE :: bec_d(:,:) ! device (GPU) copy of bec (band group)
REAL(DP), ALLOCATABLE :: becdr_bgrp(:,:,:) ! distributed becdr (band group)
REAL(DP), ALLOCATABLE :: dbec(:,:,:,:) ! derivative of bec distributed(ortho group)
#if defined (__CUDA)
ATTRIBUTES( DEVICE ) :: becp_bgrp, bephi, bec_d
#endif
!
! ... mass preconditioning
!
@ -148,6 +154,10 @@ MODULE cp_main_variables
ALLOCATE( irb( 3, nat ), STAT=ierr )
IF( ierr /= 0 ) &
CALL errore( ' allocate_mainvar ', ' unable to allocate irb ', ierr )
ALLOCATE( iabox( nat ), STAT=ierr )
IF( ierr /= 0 ) &
CALL errore( ' allocate_mainvar ', ' unable to allocate iabox ', ierr )
nabox = 0
!
IF ( xclib_dft_is('meta') ) THEN
!
@ -249,6 +259,11 @@ MODULE cp_main_variables
ALLOCATE( bec_bgrp( nhsa, nbspx_bgrp ), STAT=ierr )
IF( ierr /= 0 ) &
CALL errore( ' allocate_mainvar ', ' unable to allocate bec_bgrp ', ierr )
#if defined (__CUDA)
ALLOCATE( bec_d( nhsa, nbspx_bgrp ), STAT=ierr )
IF( ierr /= 0 ) &
CALL errore( ' allocate_mainvar ', ' unable to allocate bec_d ', ierr )
#endif
ALLOCATE( bephi( nhsa, nspin*nrcx ), STAT=ierr )
IF( ierr /= 0 ) &
CALL errore( ' allocate_mainvar ', ' unable to allocate becphi ', ierr )
@ -281,12 +296,14 @@ MODULE cp_main_variables
IF( ALLOCATED( sfac ) ) DEALLOCATE( sfac )
IF( ALLOCATED( eigrb ) ) DEALLOCATE( eigrb )
IF( ALLOCATED( irb ) ) DEALLOCATE( irb )
IF( ALLOCATED( iabox ) ) DEALLOCATE( iabox )
IF( ALLOCATED( rhor ) ) DEALLOCATE( rhor )
IF( ALLOCATED( rhos ) ) DEALLOCATE( rhos )
IF( ALLOCATED( rhog ) ) DEALLOCATE( rhog )
IF( ALLOCATED( drhog ) ) DEALLOCATE( drhog )
IF( ALLOCATED( drhor ) ) DEALLOCATE( drhor )
IF( ALLOCATED( bec_bgrp ) ) DEALLOCATE( bec_bgrp )
IF( ALLOCATED( bec_d ) ) DEALLOCATE( bec_d )
IF( ALLOCATED( becdr_bgrp ) ) DEALLOCATE( becdr_bgrp )
IF( ALLOCATED( bephi ) ) DEALLOCATE( bephi )
IF( ALLOCATED( becp_bgrp ) ) DEALLOCATE( becp_bgrp )
View File
@ -7,8 +7,8 @@
!
!
!----------------------------------------------------------------------------
SUBROUTINE move_electrons_x( nfi, tfirst, tlast, b1, b2, b3, fion, c0_bgrp, &
cm_bgrp, phi_bgrp, enthal, enb, enbi, fccc, ccc, dt2bye, stress, l_cprestart )
SUBROUTINE move_electrons_x( nfi, tprint, tfirst, tlast, b1, b2, b3, fion, &
enthal, enb, enbi, fccc, ccc, dt2bye, stress, l_cprestart )
!----------------------------------------------------------------------------
!
! ... this routine updates the electronic degrees of freedom
@ -17,10 +17,11 @@ SUBROUTINE move_electrons_x( nfi, tfirst, tlast, b1, b2, b3, fion, c0_bgrp, &
USE control_flags, ONLY : lwf, tfor, tprnfor, thdyn
USE cg_module, ONLY : tcg
USE cp_main_variables, ONLY : eigr, irb, eigrb, rhog, rhos, rhor, drhor, &
drhog, sfac, ema0bg, bec_bgrp, becdr_bgrp, &
drhog, sfac, ema0bg, bec_bgrp, becdr_bgrp, &
taub, lambda, lambdam, lambdap, vpot, dbec, idesc
USE cell_base, ONLY : omega, ibrav, h, press
USE uspp, ONLY : becsum, vkb, nkb, nlcc_any
USE uspp_gpum, ONLY : vkb_d
USE energies, ONLY : ekin, enl, entropy, etot
USE electrons_base, ONLY : nbsp, nspin, f, nudx, nupdwn, nbspx_bgrp, nbsp_bgrp
USE core, ONLY : rhoc
@ -43,16 +44,16 @@ SUBROUTINE move_electrons_x( nfi, tfirst, tlast, b1, b2, b3, fion, c0_bgrp, &
USE electrons_module, ONLY : distribute_c, collect_c, distribute_b
USE gvect, ONLY : eigts1, eigts2, eigts3
USE control_flags, ONLY : lwfpbe0nscf ! exx_wf related
USE wavefunctions, ONLY : cv0 ! Lingzhu Kong
USE wavefunctions, ONLY : cv0, c0_bgrp, cm_bgrp, phi, c0_d, cm_d
USE xc_lib, ONLY : xclib_dft_is, exx_is_active
USE device_memcpy_m, ONLY : dev_memcpy
!
IMPLICIT NONE
!
INTEGER, INTENT(IN) :: nfi
LOGICAL, INTENT(IN) :: tfirst, tlast
LOGICAL, INTENT(IN) :: tprint, tfirst, tlast
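! tprint flags a printout step: force and stress contributions that are
! needed only for output are skipped when it is .FALSE.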
REAL(DP), INTENT(IN) :: b1(3), b2(3), b3(3)
REAL(DP) :: fion(:,:)
COMPLEX(DP) :: c0_bgrp(:,:), cm_bgrp(:,:), phi_bgrp(:,:)
REAL(DP), INTENT(IN) :: dt2bye
REAL(DP) :: fccc, ccc
REAL(DP) :: enb, enbi
@ -67,10 +68,14 @@ SUBROUTINE move_electrons_x( nfi, tfirst, tlast, b1, b2, b3, fion, c0_bgrp, &
CALL start_clock('move_electrons')
electron_dynamic: IF ( tcg ) THEN
!
#if defined (__CUDA)
CALL errore(' move_electrons ', ' GPU version of runcg not yet implemented ', 1 )
#else
CALL runcg_uspp( nfi, tfirst, tlast, eigr, bec_bgrp, irb, eigrb, &
rhor, rhog, rhos, rhoc, eigts1, eigts2, eigts3, sfac, &
fion, ema0bg, becdr_bgrp, lambdap, lambda, SIZE(lambda,1), vpot, c0_bgrp, &
cm_bgrp, phi_bgrp, dbec, l_cprestart )
cm_bgrp, phi, dbec, l_cprestart )
#endif
!
CALL compute_stress( stress, detot, h, omega )
!
@ -80,7 +85,7 @@ SUBROUTINE move_electrons_x( nfi, tfirst, tlast, b1, b2, b3, fion, c0_bgrp, &
CALL get_wannier_center( tfirst, cm_bgrp, bec_bgrp, eigr, &
eigrb, taub, irb, ibrav, b1, b2, b3 )
!
CALL rhoofr( nfi, c0_bgrp, irb, eigrb, bec_bgrp, dbec, becsum, rhor, &
CALL rhoofr( nfi, c0_bgrp, c0_d, bec_bgrp, dbec, becsum, rhor, &
drhor, rhog, drhog, rhos, enl, denl, ekin, dekin6 )
!
!=================================================================
@ -105,7 +110,7 @@ SUBROUTINE move_electrons_x( nfi, tfirst, tlast, b1, b2, b3, fion, c0_bgrp, &
!=================================================================
! ... put core charge (if present) in rhoc(r)
!
IF ( nlcc_any ) CALL set_cc( irb, eigrb, rhoc )
IF ( nlcc_any ) CALL set_cc( rhoc )
!
IF ( lwf ) THEN
!
@ -154,9 +159,10 @@ SUBROUTINE move_electrons_x( nfi, tfirst, tlast, b1, b2, b3, fion, c0_bgrp, &
!
!=======================================================================
!
CALL newd( vpot, irb, eigrb, becsum, fion )
CALL newd( vpot, becsum, fion, tprint )
!
CALL prefor( eigr, vkb )
CALL dev_memcpy( vkb_d, vkb )
!
IF( force_pairing ) THEN
!
@ -165,23 +171,29 @@ SUBROUTINE move_electrons_x( nfi, tfirst, tlast, b1, b2, b3, fion, c0_bgrp, &
!
ELSE
!
CALL runcp_uspp( nfi, fccc, ccc, ema0bg, dt2bye, rhos, bec_bgrp, c0_bgrp, cm_bgrp )
CALL runcp_uspp( nfi, fccc, ccc, ema0bg, dt2bye, rhos, bec_bgrp, c0_bgrp, c0_d, cm_bgrp, cm_d )
!
ENDIF
!
CALL dev_memcpy( cm_d, cm_bgrp ) ! cm contains the updated wavefunctions
!
!----------------------------------------------------------------------
! contribution to fion due to lambda
!----------------------------------------------------------------------
!
! ... nlfq needs deeq bec
!
IF ( tfor .OR. tprnfor ) THEN
CALL nlfq_bgrp( c0_bgrp, eigr, bec_bgrp, becdr_bgrp, fion )
IF ( tfor .OR. ( tprnfor .AND. tprint ) ) THEN
#if defined (__CUDA)
CALL nlfq_bgrp( c0_d, vkb_d, bec_bgrp, becdr_bgrp, fion )
#else
CALL nlfq_bgrp( c0_bgrp, vkb, bec_bgrp, becdr_bgrp, fion )
#endif
END IF
!
IF ( (tfor.or.tprnfor) .AND. tefield ) &
IF ( (tfor.or.(tprnfor.AND.tprint)) .AND. tefield ) &
CALL bforceion( fion, .TRUE. , ipolp, qmat, bec_bgrp, becdr_bgrp, gqq, evalue )
IF ( (tfor.or.tprnfor) .AND. tefield2 ) &
IF ( (tfor.or.(tprnfor.AND.tprint)) .AND. tefield2 ) &
CALL bforceion( fion, .TRUE. , ipolp2, qmat2, bec_bgrp, becdr_bgrp, gqq2, evalue2 )
!
IF( force_pairing ) THEN
@ -199,13 +211,17 @@ SUBROUTINE move_electrons_x( nfi, tfirst, tlast, b1, b2, b3, fion, c0_bgrp, &
! ... calphi calculates phi
! ... the electron mass rises with g**2
!
CALL calphi_bgrp( c0_bgrp, ngw, bec_bgrp, nkb, vkb, phi_bgrp, nbspx_bgrp, ema0bg )
#if defined (__CUDA)
CALL calphi_bgrp( c0_d, ngw, bec_bgrp, nkb, vkb_d, phi, nbspx_bgrp, ema0bg )
#else
CALL calphi_bgrp( c0_bgrp, ngw, bec_bgrp, nkb, vkb, phi, nbspx_bgrp, ema0bg )
#endif
!
! ... begin try and error loop (only one step!)
!
! ... nlfl and nlfh need: lambda (guessed) becdr
!
IF ( tfor .OR. tprnfor ) THEN
IF ( tfor .OR. (tprnfor .AND. tprint) ) THEN
CALL nlfl_bgrp( bec_bgrp, becdr_bgrp, lambda, idesc, fion )
END IF
!
View File
@ -6,7 +6,7 @@
! or http://www.gnu.org/copyleft/gpl.txt .
!
!-------------------------------------------------------------------------
SUBROUTINE newd(vr,irb,eigrb,rhovan,fion)
SUBROUTINE newd( vr, rhovan, fion, tprint )
!-----------------------------------------------------------------------
!
! this routine calculates array deeq:
@ -32,28 +32,28 @@
my_bgrp_id, nbgrp
USE fft_interfaces, ONLY: invfft
USE fft_base, ONLY: dfftb, dfftp
USE cp_main_variables,ONLY: irb, eigrb, iabox, nabox
!
IMPLICIT NONE
! input
INTEGER irb(3,nat)
REAL(DP) rhovan(nhm*(nhm+1)/2,nat,nspin)
COMPLEX(DP) eigrb(ngb,nat)
REAL(DP), INTENT(IN) :: rhovan(nhm*(nhm+1)/2,nat,nspin)
REAL(DP) vr(dfftp%nnr,nspin)
LOGICAL, INTENT(IN) :: tprint
! output
REAL(DP) fion(3,nat)
! local
INTEGER isup,isdw,iss, iv,ijv,jv, ik, nfft, ia, is, ig
REAL(DP) fac, fac1, fac2, res
COMPLEX(DP) ci, facg1, facg2
INTEGER :: isup,isdw,iss, iv,ijv,jv, ik, nfft, ia, iia, is, ig
REAL(DP) :: fac, fac1, fac2, res
COMPLEX(DP), PARAMETER :: ci = (0.d0,1.d0)
COMPLEX(DP) :: facg1, facg2
COMPLEX(DP), ALLOCATABLE :: qv(:), fg1(:), fg2(:)
REAL(DP), ALLOCATABLE :: fvan(:,:)
INTEGER :: na_bgrp, ia_bgrp
INTEGER :: mytid, ntids
#if defined(_OPENMP)
INTEGER :: itid, mytid, ntids, omp_get_thread_num, omp_get_num_threads
INTEGER :: omp_get_thread_num, omp_get_num_threads
EXTERNAL :: omp_get_thread_num, omp_get_num_threads
#endif
!
IF ( dfftb%nr1==0 .OR. dfftb%nr2==0 .OR. dfftb%nr3==0 ) THEN
RETURN
@ -62,263 +62,151 @@
CALL start_clock( 'newd' )
ALLOCATE( fvan( 3, nat ) )
ci=(0.d0,1.d0)
fac=omegab/DBLE(dfftb%nr1*dfftb%nr2*dfftb%nr3)
deeq (:,:,:,:) = 0.d0
fvan (:,:) = 0.d0
fac=omegab/DBLE(dfftb%nr1*dfftb%nr2*dfftb%nr3)
!$omp parallel default(none) &
!$omp shared(ngb, nh, qgb, eigrb, dfftb, irb, vr, ci, deeq, &
!$omp fac, nspin, my_bgrp_id, nbgrp, ityp, upf, nat ) &
!$omp private(mytid, ntids, is, ia, nfft, iv, jv, ijv, ig, qv, fg1, fg2, itid, res, iss )
!$omp shared(ngb, nh, qgb, eigrb, dfftb, irb, vr, deeq, tfor, thdyn, tprnfor, tprint, nabox, &
!$omp fac, nspin, my_bgrp_id, nbgrp, ityp, upf, nat, fvan, tpibab, gxb, rhovan, iabox ) &
!$omp private(mytid, ntids, is, ia, iia, nfft, iv, jv, ijv, ig, qv, fg1, fg2, res, &
!$omp iss, isup, isdw, fac2, facg1, fac1 )
#if defined(_OPENMP)
mytid = omp_get_thread_num() ! take the thread ID
ntids = omp_get_num_threads() ! take the number of threads
itid = 0
#else
mytid = 0
ntids = 1
#endif
!$omp workshare
deeq (:,:,:,:) = 0.d0
fvan (:,:) = 0.d0
!$omp end workshare
ALLOCATE( qv( dfftb%nnr ) )
ALLOCATE( fg1( ngb ) )
ALLOCATE( fg2( ngb ) )
!
! calculation of deeq_i,lm = \int V_eff(r) q_i,lm(r) dr
!
DO ia = 1, nat
is = ityp(ia)
IF( .NOT. upf(is)%tvanp ) &
CYCLE
#if defined(__MPI)
IF( ( dfftb%np3( ia ) <= 0 ) .OR. ( dfftb%np2( ia ) <= 0 ) .OR. ( my_bgrp_id /= MOD( ia, nbgrp ) ) ) THEN
CYCLE
!
! calculation of deeq_i,lm = \int V_eff(r) q_i,lm(r) dr
!
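! loop only over the augmentation boxes assigned to this process
! (iabox); boxes are distributed round robin over the OpenMP threads
! via MOD( iia - 1, ntids )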
DO iia = 1, nabox
IF( MOD( iia - 1, ntids ) == mytid ) THEN
ia = iabox( iia )
is = ityp(ia)
nfft = 1
DO iv=1,nh(is)
DO jv=iv,nh(is)
ijv = (jv-1)*jv/2 + iv
fg1 = eigrb(1:ngb,ia )*qgb(1:ngb,ijv,is)
CALL fft_oned2box( qv, fg1 )
CALL invfft( qv, dfftb, ia )
DO iss=1,nspin
res = boxdotgrid(irb(:,ia),1,qv,vr(:,iss))
deeq(iv,jv,ia,iss) = fac * res
IF (iv.NE.jv) &
& deeq(jv,iv,ia,iss)=deeq(iv,jv,ia,iss)
END DO
END DO
END DO
END IF
#endif
nfft = 1
#if defined(_OPENMP)
IF ( mytid /= itid ) THEN
itid = MOD( itid + 1, ntids )
CYCLE
ELSE
itid = MOD( itid + 1, ntids )
END IF
#endif
DO iv=1,nh(is)
DO jv=iv,nh(is)
ijv = (jv-1)*jv/2 + iv
IF (nfft.EQ.2) THEN
fg1 = eigrb(1:ngb,ia )*qgb(1:ngb,ijv,is)
fg2 = eigrb(1:ngb,ia+1)*qgb(1:ngb,ijv,is)
CALL fft_oned2box( qv, fg1, fg2 )
ELSE
fg1 = eigrb(1:ngb,ia )*qgb(1:ngb,ijv,is)
CALL fft_oned2box( qv, fg1 )
END IF
!
CALL invfft( qv, dfftb, ia )
!
DO iss=1,nspin
res = boxdotgrid(irb(:,ia),1,qv,vr(:,iss))
deeq(iv,jv,ia,iss) = fac * res
IF (iv.NE.jv) &
& deeq(jv,iv,ia,iss)=deeq(iv,jv,ia,iss)
IF (nfft.EQ.2) THEN
res = boxdotgrid(irb(:,ia+1),2,qv,vr(:,iss))
deeq(iv,jv,ia+1,iss) = fac * res
IF (iv.NE.jv) &
& deeq(jv,iv,ia+1,iss)=deeq(iv,jv,ia+1,iss)
END IF
END DO
END DO
END DO
END DO
IF ( tfor .OR. thdyn .OR. (tprnfor.AND.tprint) ) THEN
!
! calculation of fion_i = \int V_eff(r) \sum_lm rho_lm (dq_i,lm(r)/dR_i) dr
!
IF( nspin == 1 ) THEN
!
! case nspin=1: two ffts at the same time, on two atoms (if possible)
!
iss=1
nfft=1
DO iia = 1, nabox
IF( MOD( iia - 1, ntids ) == mytid ) THEN
ia = iabox( iia )
is = ityp(ia)
DO ik=1,3
qv(:) = (0.d0, 0.d0)
DO iv=1,nh(is)
DO jv=iv,nh(is)
ijv = (jv-1)*jv/2 + iv
IF(iv.NE.jv) THEN
fac1=2.d0*fac*tpibab*rhovan(ijv,ia,iss)
ELSE
fac1= fac*tpibab*rhovan(ijv,ia,iss)
ENDIF
DO ig=1,ngb
facg1 = CMPLX(0.d0,-gxb(ik,ig),kind=DP) * qgb(ig,ijv,is)*fac1
fg1(ig) = eigrb(ig,ia )*facg1
END DO
CALL fft_add_oned2box( qv, fg1 )
END DO
END DO
CALL invfft( qv, dfftb, ia)
res = boxdotgrid(irb(:,ia),1,qv,vr(:,iss))
fvan(ik,ia) = res
END DO
END IF
END DO
ELSE
!
! case nspin=2: up and down spin fft's combined into a single fft
!
isup=1
isdw=2
DO iia = 1, nabox
IF( MOD( iia - 1, ntids ) == mytid ) THEN
ia = iabox( iia )
is = ityp(ia)
DO ik=1,3
qv(:) = (0.d0, 0.d0)
DO iv=1,nh(is)
DO jv=iv,nh(is)
ijv = (jv-1)*jv/2 + iv
IF(iv.NE.jv) THEN
fac1=2.d0*fac*tpibab*rhovan(ijv,ia,isup)
fac2=2.d0*fac*tpibab*rhovan(ijv,ia,isdw)
ELSE
fac1= fac*tpibab*rhovan(ijv,ia,isup)
fac2= fac*tpibab*rhovan(ijv,ia,isdw)
END IF
DO ig=1,ngb
fg1(ig) = fac1 * CMPLX(0.d0,-gxb(ik,ig),kind=DP) * &
& qgb(ig,ijv,is) * eigrb(ig,ia)
fg2(ig) = fac2 * CMPLX(0.d0,-gxb(ik,ig),kind=DP) * &
& qgb(ig,ijv,is) * eigrb(ig,ia)
END DO
CALL fft_add_oned2box( qv, fg1, fg2 )
END DO
END DO
CALL invfft( qv, dfftb, ia)
fvan(ik,ia) = &
& boxdotgrid(irb(:,ia),isup,qv,vr(:,isup)) + &
& boxdotgrid(irb(:,ia),isdw,qv,vr(:,isdw))
END DO
END IF
END DO
END IF
END IF
DEALLOCATE( qv )
DEALLOCATE( fg1 )
DEALLOCATE( fg2 )
!$omp end parallel
CALL mp_sum( deeq, intra_bgrp_comm )
CALL mp_sum( deeq, inter_bgrp_comm )
IF (.NOT.( tfor .OR. thdyn .OR. tprnfor ) ) go to 10
!
! calculation of fion_i = \int V_eff(r) \sum_lm rho_lm (dq_i,lm(r)/dR_i) dr
!
IF( nspin == 1 ) THEN
! =================================================================
! case nspin=1: two ffts at the same time, on two atoms (if possible)
! -----------------------------------------------------------------
!$omp parallel default(none) &
!$omp shared(nat, ngb, nh, qgb, eigrb, dfftb, irb, vr, ci, deeq, &
!$omp fac, nspin, rhovan, tpibab, gxb, fvan, my_bgrp_id, nbgrp, ityp, upf ) &
!$omp private(mytid, ntids, is, ia, ik, nfft, iv, jv, ijv, ig, qv, itid, res, iss, &
!$omp fac1, fac2, facg1, facg2, fg1, fg2 )
ALLOCATE( qv( dfftb%nnr ) )
ALLOCATE( fg1( ngb ) )
ALLOCATE( fg2( ngb ) )
iss=1
#if defined(_OPENMP)
mytid = omp_get_thread_num() ! take the thread ID
ntids = omp_get_num_threads() ! take the number of threads
itid = 0
#endif
DO ia = 1, nat
is = ityp(ia)
IF( .NOT. upf(is)%tvanp ) &
CYCLE
#if defined(__MPI)
IF ( ( dfftb%np3( ia ) <= 0 ) .OR. ( dfftb%np2( ia ) <= 0 ) .OR. ( my_bgrp_id /= MOD( ia, nbgrp ) ) ) THEN
CYCLE
END IF
#endif
nfft=1
#if defined(_OPENMP)
IF ( mytid /= itid ) THEN
itid = MOD( itid + 1, ntids )
CYCLE
ELSE
itid = MOD( itid + 1, ntids )
END IF
#endif
DO ik=1,3
qv(:) = (0.d0, 0.d0)
DO iv=1,nh(is)
DO jv=iv,nh(is)
ijv = (jv-1)*jv/2 + iv
IF(iv.NE.jv) THEN
fac1=2.d0*fac*tpibab*rhovan(ijv,ia,iss)
IF (nfft.EQ.2) fac2=2.d0*fac*tpibab* &
& rhovan(ijv,ia+1,iss)
ELSE
fac1= fac*tpibab*rhovan(ijv,ia,iss)
IF (nfft.EQ.2) fac2= fac*tpibab* &
& rhovan(ijv,ia+1,iss)
ENDIF
IF (nfft.EQ.2) THEN
DO ig=1,ngb
facg1 = CMPLX(0.d0,-gxb(ik,ig),kind=DP) * qgb(ig,ijv,is) * fac1
facg2 = CMPLX(0.d0,-gxb(ik,ig),kind=DP) * qgb(ig,ijv,is) * fac2
fg1(ig) = eigrb(ig,ia )*facg1
fg2(ig) = eigrb(ig,ia+1)*facg2
END DO
CALL fft_add_oned2box( qv, fg1, fg2 )
ELSE
DO ig=1,ngb
facg1 = CMPLX(0.d0,-gxb(ik,ig),kind=DP) * qgb(ig,ijv,is)*fac1
fg1(ig) = eigrb(ig,ia )*facg1
END DO
CALL fft_add_oned2box( qv, fg1 )
END IF
END DO
END DO
!
CALL invfft( qv, dfftb, ia)
!
res = boxdotgrid(irb(:,ia),1,qv,vr(:,iss))
fvan(ik,ia) = res
!
IF (nfft.EQ.2) THEN
res = boxdotgrid(irb(:,ia+1),2,qv,vr(:,iss))
fvan(ik,ia+1) = res
END IF
END DO
END DO
DEALLOCATE( qv )
DEALLOCATE( fg1 )
DEALLOCATE( fg2 )
!$omp end parallel
ELSE
! =================================================================
! case nspin=2: up and down spin fft's combined into a single fft
! -----------------------------------------------------------------
ALLOCATE( qv( dfftb%nnr ) )
ALLOCATE( fg1( ngb ) )
ALLOCATE( fg2( ngb ) )
isup=1
isdw=2
DO ia = 1, nat
is = ityp(ia)
IF( .NOT. upf(is)%tvanp ) &
CYCLE
#if defined(__MPI)
IF ( ( dfftb%np3( ia ) <= 0 ) .OR. ( dfftb%np2( ia ) <= 0 ) .OR. ( my_bgrp_id /= MOD( ia, nbgrp ) ) ) THEN
CYCLE
END IF
#endif
DO ik=1,3
qv(:) = (0.d0, 0.d0)
!
DO iv=1,nh(is)
DO jv=iv,nh(is)
ijv = (jv-1)*jv/2 + iv
IF(iv.NE.jv) THEN
fac1=2.d0*fac*tpibab*rhovan(ijv,ia,isup)
fac2=2.d0*fac*tpibab*rhovan(ijv,ia,isdw)
ELSE
fac1= fac*tpibab*rhovan(ijv,ia,isup)
fac2= fac*tpibab*rhovan(ijv,ia,isdw)
END IF
DO ig=1,ngb
fg1(ig) = fac1 * CMPLX(0.d0,-gxb(ik,ig),kind=DP) * &
& qgb(ig,ijv,is) * eigrb(ig,ia)
fg2(ig) = fac2 * CMPLX(0.d0,-gxb(ik,ig),kind=DP) * &
& qgb(ig,ijv,is) * eigrb(ig,ia)
END DO
CALL fft_add_oned2box( qv, fg1, fg2 )
END DO
END DO
!
CALL invfft( qv, dfftb, ia)
!
fvan(ik,ia) = &
& boxdotgrid(irb(:,ia),isup,qv,vr(:,isup)) + &
& boxdotgrid(irb(:,ia),isdw,qv,vr(:,isdw))
END DO
END DO
DEALLOCATE( qv )
DEALLOCATE( fg1 )
DEALLOCATE( fg2 )
IF ( tfor .OR. thdyn .OR. (tprnfor.AND.tprint) ) THEN
CALL mp_sum( fvan, intra_bgrp_comm )
CALL mp_sum( fvan, inter_bgrp_comm )
fion(:,:) = fion(:,:) - fvan(:,:)
END IF
CALL mp_sum( fvan, intra_bgrp_comm )
CALL mp_sum( fvan, inter_bgrp_comm )
fion(:,:) = fion(:,:) - fvan(:,:)
10 CONTINUE
CALL mp_sum( deeq, intra_bgrp_comm )
CALL mp_sum( deeq, inter_bgrp_comm )
!
CALL stop_clock( 'newd' )
!
View File
@ -5,108 +5,15 @@
! in the root directory of the present distribution,
! or http://www.gnu.org/copyleft/gpl.txt .
!
!
!-----------------------------------------------------------------------
SUBROUTINE beta_eigr_x ( beigr, nspmn, nspmx, eigr, pptype_ )
!-----------------------------------------------------------------------
! computes: the array becp
! beigr(ig,iv)=
! = [(-i)**l beta(g,iv,is) e^(-ig.r_ia)]^*
!
! routine makes use of c*(g)=c(-g) (g> see routine ggen)
! input : beta(ig,l,is), eigr, c
! output: becp as parameter
!
USE kinds, ONLY : DP
USE ions_base, only : nat, nsp, ityp
USE gvecw, only : ngw
USE uspp, only : nkb, nhtol, beta, indv_ijkb0
USE uspp_param, only : nh, upf, nhm
!
USE gvect, ONLY : gstart
!
implicit none
integer, intent(in) :: nspmn, nspmx
complex(DP), intent(in) :: eigr( :, : )
complex(DP), intent(out) :: beigr( :, : )
INTEGER, INTENT(IN), OPTIONAL :: pptype_
! pptype_: pseudo type to process: 0 = all, 1 = norm-cons, 2 = ultra-soft
!
integer :: ig, is, iv, ia, l, inl
complex(DP) :: cfact
integer :: pptype
!
call start_clock( 'beta_eigr' )
IF( PRESENT( pptype_ ) ) THEN
pptype = pptype_
ELSE
pptype = 0
END IF
!allocate( wrk2( ngw, nkb ) )
beigr = 0.0d0
!$omp parallel default(none), &
!$omp shared(nat,ngw,nh,nhtol,beigr,beta,eigr,ityp,pptype,nspmn,nspmx,upf,gstart,indv_ijkb0), &
!$omp private(is,ia,iv,inl,l,cfact,ig)
!$omp do
DO ia = 1, nat
is = ityp(ia)
!
IF( pptype == 2 .AND. .NOT. upf(is)%tvanp ) CYCLE
IF( pptype == 1 .AND. upf(is)%tvanp ) CYCLE
IF( is >= nspmn .AND. is <= nspmx ) THEN
!
inl = indv_ijkb0(ia)
do iv = 1, nh( is )
!
l = nhtol( iv, is )
!
if (l == 0) then
cfact = cmplx( 1.0_dp , 0.0_dp )
else if (l == 1) then
cfact = - cmplx( 0.0_dp , 1.0_dp )
else if (l == 2) then
cfact = - cmplx( 0.0_dp , 1.0_dp )
cfact = cfact * cfact
else if (l == 3) then
cfact = - cmplx( 0.0_dp , 1.0_dp )
cfact = cfact * cfact * cfact
endif
!
! q = 0 component (with weight 1.0)
!
if (gstart == 2) then
beigr( 1, iv + inl ) = cfact * beta(1,iv,is) * eigr(1,ia)
end if
!
! q > 0 components (with weight 2.0)
!
do ig = gstart, ngw
beigr( ig, iv + inl ) = 2.0d0 * cfact * beta(ig,iv,is) * eigr(ig,ia)
end do
!
end do
!
END IF
END DO
!$omp end do
!$omp end parallel
call stop_clock( 'beta_eigr' )
RETURN
END SUBROUTINE beta_eigr_x
!-----------------------------------------------------------------------
#if defined(__CUDA)
#define DEVICEATTR ,DEVICE
#else
#define DEVICEATTR
#endif
!
!
!-----------------------------------------------------------------------
subroutine nlsm1us_x ( n, beigr, c, becp )
subroutine nlsm1us_x ( n, betae, c, becp )
!-----------------------------------------------------------------------
! computes: the array becp
@ -124,17 +31,36 @@
USE mp_global, ONLY : nproc_bgrp, intra_bgrp_comm
USE gvecw, only : ngw
USE uspp, only : nkb
USE gvect, ONLY : gstart
#if defined(__CUDA)
USE cudafor
USE cublas
#endif
!
implicit none
integer, intent(in) :: n
complex(DP), intent(in) :: beigr( :, : ), c( :, : )
real(DP), intent(out) :: becp( :, : )
complex(DP) DEVICEATTR , intent(in) :: c( :, : )
complex(DP) DEVICEATTR , intent(inout) :: betae( :, : )
real(DP) DEVICEATTR , intent(out) :: becp( :, : )
INTEGER :: i
!
call start_clock( 'nlsm1us' )
IF( ngw > 0 .AND. nkb > 0 ) THEN
CALL dgemm( 'T', 'N', nkb, n, 2*ngw, 1.0d0, beigr, 2*ngw, c, 2*ngw, 0.0d0, becp, nkb )
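! in-place Gamma-point weighting: the g = 0 row must enter the sum with
! weight 1 while the GEMM uses alpha = 2, so betae(1,:) is pre-scaled
! by 0.5 and restored afterwards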
IF( gstart > 1 ) THEN
!$cuf kernel do(1) <<<*,*>>>
DO i = 1, nkb
betae( 1, i ) = 0.5d0 * betae( 1, i )
END DO
END IF
CALL MYDGEMM( 'T', 'N', nkb, n, 2*ngw, 2.0d0, betae, 2*ngw, c, 2*ngw, 0.0d0, becp, nkb )
IF( gstart > 1 ) THEN
!$cuf kernel do(1) <<<*,*>>>
DO i = 1, nkb
betae( 1, i ) = 2.0d0 * betae( 1, i )
END DO
END IF
END IF
IF( nproc_bgrp > 1 ) THEN
@ -149,7 +75,7 @@
!
!
!-----------------------------------------------------------------------
subroutine nlsm1_x ( n, nspmn, nspmx, eigr, c, becp, pptype_ )
subroutine nlsm1_x ( n, betae, c, becp, pptype_ )
!-----------------------------------------------------------------------
! computes: the array becp
@ -170,21 +96,20 @@
USE uspp, only : nkb, nhtol, beta, indv_ijkb0
USE uspp_param, only : nh, upf, nhm
USE gvect, ONLY : gstart
USE cp_interfaces, only : beta_eigr
!
implicit none
integer, intent(in) :: n, nspmn, nspmx
complex(DP), intent(in) :: eigr( :, : ), c( :, : )
real(DP), intent(out) :: becp( :, : )
integer, intent(in) :: n
complex(DP), intent(in) :: c( :, : )
complex(DP), intent(inout) :: betae( :, : )
real(DP), intent(out) :: becp( :, : )
INTEGER, INTENT(IN), OPTIONAL :: pptype_
! pptype_: pseudo type to process: 0 = all, 1 = norm-cons, 2 = ultra-soft
!
integer :: ig, is, iv, ia, l, inl
real(DP), allocatable :: becps( :, : )
complex(DP), allocatable :: wrk2( :, : )
complex(DP) :: cfact
integer :: ig, is, iv, ia, l, inl
integer :: pptype
real(DP), allocatable :: becps( :, : )
LOGICAL :: nothing_to_do
!
call start_clock( 'nlsm1' )
@ -194,21 +119,40 @@
pptype = 0
END IF
allocate( wrk2( ngw, nkb ) )
allocate( becps( SIZE(becp,1), SIZE(becp,2) ) )
CALL beta_eigr ( wrk2, nspmn, nspmx, eigr, pptype_ )
IF( ngw > 0 .AND. nkb > 0 ) THEN
CALL dgemm( 'T', 'N', nkb, n, 2*ngw, 1.0d0, wrk2, 2*ngw, c, 2*ngw, 0.0d0, becps, nkb )
IF( pptype == 1 ) THEN
nothing_to_do = .TRUE.
do is = 1, nsp
IF( .NOT. upf(is)%tvanp ) THEN
nothing_to_do = .FALSE.
END IF
END DO
IF( nothing_to_do ) GO TO 100
END IF
DEALLOCATE( wrk2 )
IF( pptype == 0 ) THEN
IF( ngw > 0 .AND. nkb > 0 ) THEN
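! same in-place Gamma-point weighting as in nlsm1us_x above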
IF( gstart > 1 ) betae( 1, : ) = 0.5d0 * betae( 1, : )
CALL dgemm( 'T', 'N', nkb, n, 2*ngw, 2.0d0, betae, 2*ngw, c, 2*ngw, 0.0d0, becp, SIZE(becp,1) )
IF( gstart > 1 ) betae( 1, : ) = 2.0d0 * betae( 1, : )
END IF
IF( nproc_bgrp > 1 ) THEN
CALL mp_sum( becp, intra_bgrp_comm )
END IF
GO TO 100
END IF
allocate( becps( SIZE(becp,1), SIZE(becp,2) ) )
IF( ngw > 0 .AND. nkb > 0 ) THEN
IF( gstart > 1 ) betae( 1, : ) = 0.5d0 * betae( 1, : )
CALL dgemm( 'T', 'N', nkb, n, 2*ngw, 2.0d0, betae, 2*ngw, c, 2*ngw, 0.0d0, becps, nkb )
IF( gstart > 1 ) betae( 1, : ) = 2.0d0 * betae( 1, : )
END IF
IF( nproc_bgrp > 1 ) THEN
CALL mp_sum( becps, intra_bgrp_comm )
END IF
do is = nspmn, nspmx
do is = 1, nsp
IF( pptype == 2 .AND. .NOT. upf(is)%tvanp ) CYCLE
IF( pptype == 1 .AND. upf(is)%tvanp ) CYCLE
DO ia = 1, nat
@ -218,21 +162,21 @@
becp(inl+iv,:) = becps( inl+iv, : )
end do
END IF
end do
end do
end do
!
DEALLOCATE( becps )
100 CONTINUE
call stop_clock( 'nlsm1' )
return
end subroutine nlsm1_x
!-----------------------------------------------------------------------
!-------------------------------------------------------------------------
subroutine nlsm2_bgrp_x( ngw, nkb, eigr, c_bgrp, becdr_bgrp, nbspx_bgrp, nbsp_bgrp )
subroutine nlsm2_bgrp_x( ngw, nkb, betae, c_bgrp, becdr_bgrp, nbspx_bgrp, nbsp_bgrp )
!-----------------------------------------------------------------------
! computes: the array becdr
@ -245,9 +189,6 @@
!
USE kinds, ONLY : DP
use ions_base, only : nsp, ityp, nat
use uspp, only : nhtol, beta, indv_ijkb0
use uspp_param, only : nh, upf
use cell_base, only : tpiba
use mp, only : mp_sum
use mp_global, only : nproc_bgrp, intra_bgrp_comm
@ -256,69 +197,40 @@
implicit none
integer, intent(in) :: ngw, nkb, nbspx_bgrp, nbsp_bgrp
complex(DP), intent(in) :: eigr(:,:), c_bgrp(:,:)
complex(DP), intent(in) :: betae(:,:), c_bgrp(:,:)
real(DP), intent(out) :: becdr_bgrp(:,:,:)
!
complex(DP), allocatable :: wrk2(:,:)
!
integer :: ig, is, iv, ia, k, l, inl
complex(DP) :: cfact
integer :: ig, iv, k, info
complex(DP) :: cfact1, cfact2
!
call start_clock( 'nlsm2' )
allocate( wrk2( ngw, nkb ) )
cfact2 = - cmplx( 0.0_dp , 1.0_dp ) * tpiba * 2.0d0
cfact1 = - cmplx( 0.0_dp , 1.0_dp ) * tpiba
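! cfact1/cfact2 carry the -i*tpiba derivative factor with Gamma-point
! weights: 1 for the g = 0 row, 2 for the g > 0 rows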
allocate( wrk2, MOLD = betae, STAT = info )
IF( info /= 0 ) &
CALL errore( ' nlsm2 ', ' allocating wrk2', ABS( info ) )
!
DO k = 1, 3
!
do iv=1,nkb
wrk2(1,iv) = cfact1 * g(k,1) * betae(1,iv)
end do
!$omp parallel do default(shared) private(iv,ig) collapse(2)
do iv=1,nkb
do ig=gstart,ngw
wrk2(ig,iv) = cfact2 * g(k,ig) * betae(ig,iv)
end do
end do
!$omp end parallel do
IF( ngw > 0 .AND. nkb > 0 ) THEN
CALL dgemm( 'T', 'N', nkb, nbsp_bgrp, 2*ngw, 1.0d0, wrk2(1,1), 2*ngw, &
c_bgrp, 2*ngw, 0.0d0, becdr_bgrp( 1, 1, k ), nkb )
END IF
end do
deallocate( wrk2 )
IF( nproc_bgrp > 1 ) THEN
@ -331,7 +243,86 @@
end subroutine nlsm2_bgrp_x
!-----------------------------------------------------------------------
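!
! A note on the prefactors used above: betae is expected to carry the
! (-i)**l phase and the structure factor e^(iG.r_ia), so the derivative with
! respect to the atomic position only brings down an extra factor -i*G*tpiba.
! The G=0 term enters with weight 1 and the G>0 terms with weight 2, again
! from c(-G) = conjg(c(G)); equivalently (sketch):
!
!    cfact1 = -CMPLX( 0.0_dp, 1.0_dp, KIND=dp ) * tpiba   ! G = 0, weight 1
!    cfact2 = 2.0_dp * cfact1                             ! G > 0, weight 2
!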
#if defined (__CUDA)
!-------------------------------------------------------------------------
subroutine nlsm2_bgrp_gpu_x( ngw, nkb, betae, c_bgrp, becdr_bgrp, nbspx_bgrp, nbsp_bgrp )
!-----------------------------------------------------------------------
! computes: the array becdr
! becdr(ia,n,iv,is,k)
! =2.0 sum_g> g_k beta(g,iv,is) re[ (i)**(l+1) e^(ig.r_ia) c(g,n)]
!
! routine makes use of c*(g)=c(-g) (g> see routine ggen)
! input : eigr, c
! output: becdr
!
USE kinds, ONLY : DP
use cell_base, only : tpiba
use mp, only : mp_sum
use mp_global, only : nproc_bgrp, intra_bgrp_comm
use gvect, only : gstart
use gvect_gpum, only : g_d
USE device_memcpy_m, ONLY : dev_memcpy
USE cudafor
USE cublas
!
implicit none
integer, intent(in) :: ngw, nkb, nbspx_bgrp, nbsp_bgrp
complex(DP), intent(in), DEVICE :: c_bgrp(:,:)
complex(DP), intent(in), DEVICE :: betae(:,:)
real(DP), intent(out) :: becdr_bgrp(:,:,:)
!
complex(DP), allocatable, DEVICE :: wrk2(:,:)
real(DP), allocatable, DEVICE :: becdr_d(:,:)
!
integer :: ig, iv, k, info
complex(DP) :: cfact1, cfact2
!
call start_clock( 'nlsm2' )
ALLOCATE( wrk2, MOLD=betae, STAT = info )
IF( info /= 0 ) &
CALL errore( ' nlsm2 ', ' allocating wrk2', ABS( info ) )
ALLOCATE( becdr_d( SIZE( becdr_bgrp, 1 ), SIZE( becdr_bgrp, 2 ) ), STAT=info )
IF( info /= 0 ) &
CALL errore( ' nlsm2 ', ' allocating becdr_d ', ABS( info ) )
cfact2 = - cmplx( 0.0_dp , 1.0_dp ) * tpiba * 2.0d0
cfact1 = - cmplx( 0.0_dp , 1.0_dp ) * tpiba
DO k = 1, 3
!$cuf kernel do(1) <<<*,*>>>
do iv=1,nkb
wrk2(1,iv) = cfact1 * g_d(k,1) * betae(1,iv)
end do
!$cuf kernel do(2) <<<*,*>>>
do iv=1,nkb
do ig=gstart,ngw
wrk2(ig,iv) = cfact2 * g_d(k,ig) * betae(ig,iv)
end do
end do
IF( ngw > 0 .AND. nkb > 0 ) THEN
CALL MYDGEMM( 'T', 'N', nkb, nbsp_bgrp, 2*ngw, 1.0d0, wrk2(1,1), 2*ngw, &
c_bgrp, 2*ngw, 0.0d0, becdr_d, nkb )
CALL dev_memcpy( becdr_bgrp(:,:,k), becdr_d )
END IF
end do
DEALLOCATE( becdr_d )
deallocate( wrk2 )
IF( nproc_bgrp > 1 ) THEN
CALL mp_sum( becdr_bgrp, intra_bgrp_comm )
END IF
call stop_clock( 'nlsm2' )
!
return
end subroutine nlsm2_bgrp_gpu_x
!-----------------------------------------------------------------------
#endif
!-----------------------------------------------------------------------
SUBROUTINE ennl_x( ennl_val, rhovan, bec_bgrp )
@ -356,35 +347,40 @@
! local
!
real(DP) :: sumt, sums(2), ennl_t
integer :: is, iv, jv, ijv, inl, jnl, ia, iss, i, indv
INTEGER :: omp_get_num_threads
!
ennl_t = 0.d0
!
!$omp parallel num_threads(min(4,omp_get_num_threads())) default(none) &
!$omp shared(nat,ityp,indv_ijkb0,nh,nbsp_bgrp,ispin_bgrp,f_bgrp,bec_bgrp,rhovan,dvan,nspin,ennl_t) &
!$omp private(ia,is,indv,iv,inl,jv,ijv,jnl,sums,iss,i,sumt)
!$omp do reduction(+:ennl_t)
do ia = 1, nat
is = ityp(ia)
indv = indv_ijkb0(ia)
do iv = 1, nh(is)
inl = indv + iv
do jv = iv, nh(is)
ijv = (jv-1)*jv/2 + iv
jnl = indv + jv
sums = 0.d0
do i = 1, nbsp_bgrp
iss = ispin_bgrp(i)
sums(iss) = sums(iss) + f_bgrp(i) * bec_bgrp(inl,i) * bec_bgrp(jnl,i)
end do
sumt = 0.d0
do iss = 1, nspin
rhovan( ijv, ia, iss ) = sums( iss )
sumt = sumt + sums( iss )
end do
if( iv .ne. jv ) sumt = 2.d0 * sumt
ennl_t = ennl_t + sumt * dvan( jv, iv, is)
end do
end do
end do
!$omp end do
!$omp end parallel
!
ennl_val = ennl_t
!
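!
! The pair (iv,jv) with jv >= iv is stored in packed upper-triangular order,
! ijv = (jv-1)*jv/2 + iv, the same layout used for rhovan.  A minimal check
! of the indexing (sketch):
!
!    ijv = 0
!    do jv = 1, nh(is)
!       do iv = 1, jv
!          ijv = ijv + 1        ! here ijv == (jv-1)*jv/2 + iv
!       end do
!    end do
!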
@ -435,10 +431,8 @@
end subroutine calrhovan_x
!-----------------------------------------------------------------------
!-----------------------------------------------------------------------
subroutine calbec_x ( n, betae, c, bec, pptype_ )
!-----------------------------------------------------------------------
! this routine calculates array bec
@ -450,59 +444,79 @@
!
USE kinds, ONLY : DP
use electrons_base, only : nbsp
use cp_interfaces, only : nlsm1
!
implicit none
!
INTEGER, INTENT(IN) :: n
real(DP), intent(out) :: bec( :, : )
complex(DP), intent(in) :: c( :, : )
complex(DP), intent(inout) :: betae( :, : )
INTEGER, INTENT(IN), OPTIONAL :: pptype_
! local variables
!
call start_clock( 'calbec' )
!
call nlsm1( n, betae, c, bec, pptype_ )
call stop_clock( 'calbec' )
!
return
end subroutine calbec_x
!-----------------------------------------------------------------------
!-----------------------------------------------------------------------
SUBROUTINE dbeta_eigr_x( dbeigr, eigr )
!-----------------------------------------------------------------------
!
USE kinds, ONLY : DP
use ions_base, only : nat, ityp
use uspp, only : nhtol, nkb, dbeta, indv_ijkb0
use uspp_param, only : nh, nhm
use gvect, only : gstart
use gvecw, only : ngw
!
implicit none
!
include 'laxlib.fh'
!
complex(DP), intent(out) :: dbeigr( :, :, :, : )
complex(DP), intent(in) :: eigr(:,:)
!
integer :: ig, is, iv, ia, l, inl, i, j
complex(DP) :: cfact(4)
!
!if (l == 0) then
cfact(1) = cmplx( 1.0_dp , 0.0_dp )
!else if (l == 1) then
cfact(2) = - cmplx( 0.0_dp , 1.0_dp )
!else if (l == 2) then
cfact(3) = - cmplx( 0.0_dp , 1.0_dp )
cfact(3) = cfact(3) * cfact(3)
!else if (l == 3) then
cfact(4) = - cmplx( 0.0_dp , 1.0_dp )
cfact(4) = cfact(4) * cfact(4) * cfact(4)
!endif
do j=1,3
do i=1,3
do ia = 1, nat
is = ityp(ia)
inl = indv_ijkb0(ia)
do iv=1,nh(is)
l=nhtol(iv,is)
! q = 0 component (with weight 1.0)
dbeigr(1,iv+inl,i,j)= cfact(l+1)*dbeta(1,iv,is,i,j)*eigr(1,ia)
! q > 0 components (with weight 2.0)
do ig = gstart, ngw
dbeigr(ig,iv+inl,i,j) = 2.0d0*cfact(l+1)*dbeta(ig,iv,is,i,j)*eigr(ig,ia)
end do
end do
end do
end do
end do
!
return
end subroutine dbeta_eigr_x
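!
! The cfact table above is just (-i)**l for l = 0..3; an equivalent one-line
! initialization (sketch, using Fortran complex exponentiation):
!
!    do l = 0, 3
!       cfact(l+1) = ( -CMPLX( 0.0_dp, 1.0_dp, KIND=dp ) )**l
!    end do
!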
!-----------------------------------------------------------------------
!-----------------------------------------------------------------------
@ -526,8 +540,9 @@ SUBROUTINE caldbec_bgrp_x( eigr, c_bgrp, dbec, idesc )
use uspp_param, only : nh, nhm
use gvect, only : gstart
use gvecw, only : ngw
use electrons_base, only : nspin, iupdwn, nupdwn, nbspx_bgrp, iupdwn_bgrp, nupdwn_bgrp, &
ibgrp_g2l, i2gupdwn_bgrp, nbspx, nbsp_bgrp
use cp_interfaces, only : dbeta_eigr
!
implicit none
!
@ -538,7 +553,7 @@ SUBROUTINE caldbec_bgrp_x( eigr, c_bgrp, dbec, idesc )
real(DP), intent(out) :: dbec( :, :, :, : )
integer, intent(in) :: idesc( :, : )
!
complex(DP), allocatable :: wrk2(:,:,:,:)
real(DP), allocatable :: dwrk_bgrp(:,:)
!
integer :: ig, is, iv, ia, l, inl, i, j, ii, iw, iss, nr, ir, istart, nss
@ -549,47 +564,23 @@ SUBROUTINE caldbec_bgrp_x( eigr, c_bgrp, dbec, idesc )
!
dbec = 0.0d0
!
allocate( wrk2( ngw, nkb, 3, 3 ) )
allocate( dwrk_bgrp( nkb, nbspx_bgrp ) )
!
CALL dbeta_eigr( wrk2, eigr )
!
do j=1,3
do i=1,3
IF( ngw > 0 .AND. nkb > 0 ) THEN
CALL dgemm( 'T', 'N', nkb, nbsp_bgrp, 2*ngw, 1.0d0, wrk2(1,1,i,j), 2*ngw, &
c_bgrp, 2*ngw, 0.0d0, dwrk_bgrp(1,1), nkb )
END IF
if( nproc_bgrp > 1 ) then
call mp_sum( dwrk_bgrp, intra_bgrp_comm )
end if
do ia = 1, nat
is = ityp(ia)
inl = indv_ijkb0(ia)
do iss=1,nspin
IF( idesc( LAX_DESC_ACTIVE_NODE, iss ) > 0 ) THEN
nr = idesc( LAX_DESC_NR, iss )
@ -600,19 +591,18 @@ SUBROUTINE caldbec_bgrp_x( eigr, c_bgrp, dbec, idesc )
ibgrp_i = ibgrp_g2l( ii + ir - 1 + istart - 1 )
IF( ibgrp_i > 0 ) THEN
do iw = 1, nh(is)
dbec( inl + iw, ii + (iss-1)*nrcx, i, j ) = dwrk_bgrp( inl + iw, ibgrp_i )
end do
END IF
end do
END IF
end do
end do
end do
end do
deallocate( wrk2 )
deallocate( dwrk_bgrp )
if( nbgrp > 1 ) then
CALL mp_sum( dbec, inter_bgrp_comm )
end if
@ -659,6 +649,11 @@ subroutine dennl_x( bec_bgrp, dbec, drhovan, denl, idesc )
denl=0.d0
drhovan=0.0d0
!$omp parallel default(none) &
!$omp shared(nat,ityp,indv_ijkb0,nh,nbsp_bgrp,ispin_bgrp,f_bgrp,bec_bgrp,drhovan,dvan,nspin,denl) &
!$omp shared(idesc,iupdwn,nupdwn,ibgrp_g2l,nrcx,dbec) &
!$omp private(ia,is,iv,inl,jv,ijv,jnl,dsums,iss,i,dsum,ii,ir,k,j,nr,istart,nss,ibgrp)
!$omp do reduction(+:denl)
do ia=1,nat
is = ityp(ia)
do iv=1,nh(is)
@ -701,6 +696,8 @@ subroutine dennl_x( bec_bgrp, dbec, drhovan, denl, idesc )
end do
end do
end do
!$omp end do
!$omp end parallel
CALL mp_sum( denl, intra_bgrp_comm )
CALL mp_sum( drhovan, intra_bgrp_comm )
@ -720,7 +717,7 @@ end subroutine dennl_x
!-----------------------------------------------------------------------
subroutine nlfq_bgrp_x( c_bgrp, betae, bec_bgrp, becdr_bgrp, fion )
!-----------------------------------------------------------------------
!
! contribution to fion due to nonlocal part
@ -739,12 +736,16 @@ subroutine nlfq_bgrp_x( c_bgrp, eigr, bec_bgrp, becdr_bgrp, fion )
!
implicit none
!
COMPLEX(DP), INTENT(IN) :: c_bgrp( :, : )
COMPLEX(DP), INTENT(IN) :: betae( :, : )
#if defined (__CUDA)
ATTRIBUTES( DEVICE ) :: c_bgrp, betae
#endif
REAL(DP), INTENT(IN) :: bec_bgrp( :, : )
REAL(DP), INTENT(OUT) :: becdr_bgrp( :, :, : )
REAL(DP), INTENT(OUT) :: fion( :, : )
!
integer :: k, is, ia, inl, jnl, iv, jv, i
real(DP) :: temp
real(DP) :: sum_tmpdr
!
@ -758,7 +759,7 @@ subroutine nlfq_bgrp_x( c_bgrp, eigr, bec_bgrp, becdr_bgrp, fion )
!
! nlsm2 fills becdr
!
call nlsm2_bgrp( ngw, nkb, betae, c_bgrp, becdr_bgrp, nbspx_bgrp, nbsp_bgrp )
!
allocate ( fion_loc( 3, nat ) )
!
@ -767,17 +768,17 @@ subroutine nlfq_bgrp_x( c_bgrp, eigr, bec_bgrp, becdr_bgrp, fion )
!$omp parallel default(none), &
!$omp shared(becdr_bgrp,bec_bgrp,fion_loc,f_bgrp,deeq,dvan,nbsp_bgrp,indv_ijkb0,nh, &
!$omp nat,nhm,nbspx_bgrp,ispin_bgrp,nproc_bgrp,me_bgrp,ityp), &
!$omp private(tmpbec,tmpdr,is,ia,iv,jv,k,inl,jnl,temp,i,mytid,ntids,sum_tmpdr)
#if defined(_OPENMP)
mytid = omp_get_thread_num() ! take the thread ID
ntids = omp_get_num_threads() ! take the number of threads
#endif
allocate ( tmpbec( nbspx_bgrp, nhm ), tmpdr( nbspx_bgrp, nhm ) )
DO k = 1, 3
DO ia = 1, nat
is = ityp(ia)
! better if we distribute to MPI tasks too!
@ -790,29 +791,27 @@ subroutine nlfq_bgrp_x( c_bgrp, eigr, bec_bgrp, becdr_bgrp, fion )
IF( MOD( ( ia + (k-1)*nat ) / nproc_bgrp, ntids ) /= mytid ) CYCLE
#endif
tmpbec = 0.d0
tmpdr = 0.d0
do jv=1,nh(is)
jnl = indv_ijkb0(ia) + jv
do iv=1,nh(is)
do i = 1, nbsp_bgrp
temp = dvan(iv,jv,is) + deeq(jv,iv,ia,ispin_bgrp( i ) )
tmpbec(i,iv) = tmpbec(i,iv) + temp * bec_bgrp(jnl,i)
end do
end do
end do
do iv = 1, nh(is)
inl = indv_ijkb0(ia) + iv
do i = 1, nbsp_bgrp
tmpdr(i,iv) = f_bgrp( i ) * becdr_bgrp( inl, i, k )
end do
end do
sum_tmpdr = 0.0d0
do iv = 1, nh(is)
do i = 1, nbsp_bgrp
sum_tmpdr = sum_tmpdr + tmpdr(i,iv)*tmpbec(i,iv)
end do
end do
@ -823,7 +822,6 @@ subroutine nlfq_bgrp_x( c_bgrp, eigr, bec_bgrp, becdr_bgrp, fion )
deallocate ( tmpbec, tmpdr )
!$omp end parallel
!
CALL mp_sum( fion_loc, intra_bgrp_comm )
IF( nbgrp > 1 ) THEN

View File

@ -305,7 +305,7 @@
!
!-----------------------------------------------------------------------
subroutine set_cc( rhoc )
!-----------------------------------------------------------------------
!
! Calculate core charge contribution in real space, rhoc(r)
@ -320,91 +320,76 @@
use core, only: rhocb
use fft_interfaces, only: invfft
use fft_base, only: dfftb, dfftp
USE cp_main_variables, ONLY: irb, eigrb
USE mp_global, ONLY: nproc_bgrp, me_bgrp, inter_bgrp_comm, my_bgrp_id, nbgrp
USE mp, ONLY: mp_sum
implicit none
! output
real(dp), intent(out) :: rhoc(dfftp%nnr)
! local
integer :: ig, is, ia, isa, iia
INTEGER :: nabox, iabox( nat )
complex(dp), PARAMETER :: ci = (0.d0,1.d0)
complex(dp), allocatable :: wrk1(:)
      complex(dp), allocatable :: qv(:), fg1(:)
INTEGER :: mytid, ntids
#if defined(_OPENMP)
INTEGER :: omp_get_thread_num, omp_get_num_threads
EXTERNAL :: omp_get_thread_num, omp_get_num_threads
#endif
!
call start_clock( 'set_cc' )
allocate( wrk1 ( dfftp%nnr ) )
nabox = 0
DO ia = 1, nat
IF( .NOT. upf(ityp(ia))%nlcc ) CYCLE
IF( ( dfftb%np3( ia ) <= 0 ) .OR. ( dfftb%np2 ( ia ) <= 0 ) .OR. ( my_bgrp_id /= MOD( ia, nbgrp ) ) ) CYCLE
nabox = nabox + 1
iabox( nabox ) = ia
END DO
!
!$omp parallel default(none) &
!$omp shared( nsp, na, ngb, eigrb, dfftb, irb, rhocb, &
!$omp nat, wrk1, ityp, nabox, iabox ) &
!$omp private( mytid, ntids, is, ia, iia, ig, qv, fg1 )
allocate( qv ( dfftb%nnr ) )
allocate( fg1 ( ngb ) )
!
!$omp workshare
wrk1 = (0.d0, 0.d0)
!$omp end workshare
!
#if defined(_OPENMP)
mytid = omp_get_thread_num() ! take the thread ID
ntids = omp_get_num_threads() ! take the number of threads
#else
mytid = 0
ntids = 1
#endif
do iia = 1, nabox
IF( MOD( iia - 1, ntids ) == mytid ) THEN
ia = iabox(iia)
is = ityp(ia)
fg1 = eigrb(1:ngb,ia )*rhocb(1:ngb,is)
CALL fft_oned2box( qv, fg1 )
call invfft( qv, dfftb, ia )
call box2grid(irb(:,ia),1,qv,wrk1)
END IF
end do
!
deallocate( fg1 )
deallocate( qv )
!$omp end parallel
CALL mp_sum( wrk1, inter_bgrp_comm )
call dcopy( dfftp%nnr, wrk1, 2, rhoc, 1 )
deallocate( wrk1 )
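!
! The iabox/nabox list built above caches, once per call, the atoms this
! band group actually owns: NLCC atoms whose small box intersects the local
! grid, assigned round-robin by MOD(ia,nbgrp); threads then stride over the
! list.  An equivalent form of the thread test (sketch):
!
!    do iia = 1 + mytid, nabox, ntids   ! same set as MOD(iia-1,ntids)==mytid
!       ia = iabox(iia)
!       ! ... one small-box FFT per owned atom ...
!    end do
!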

View File

@ -6,17 +6,208 @@
! or http://www.gnu.org/copyleft/gpl.txt .
!
#if defined(__CUDA)
#define DGEMMDRV cublasDgemm
#define DEVICEATTR ,DEVICE
#else
#define DGEMMDRV dgemm
#define DEVICEATTR
#endif
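! With these macros a single call site drives either the host BLAS or cuBLAS,
! and DEVICEATTR adds the DEVICE attribute only on GPU builds; e.g. (sketch,
! with hypothetical matrices a, b, c):
!
!    REAL(DP), ALLOCATABLE DEVICEATTR :: a(:,:), b(:,:), c(:,:)
!    CALL DGEMMDRV( 'N', 'N', m, n, k, 1.0d0, a, m, b, k, 0.0d0, c, m )
!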
MODULE ortho_module
!
#if defined(__CUDA)
USE cudafor
#endif
USE kinds, ONLY: DP
IMPLICIT NONE
SAVE
REAL(DP), ALLOCATABLE DEVICEATTR :: s(:,:), sig(:,:), tau(:,:), stmp(:,:)
REAL(DP), ALLOCATABLE DEVICEATTR :: wrk(:,:), rhoa(:,:), rhos(:,:), rhod(:)
REAL(DP), ALLOCATABLE DEVICEATTR :: xloc(:,:)
CONTAINS
SUBROUTINE allocate_local_ortho_memory( nss, nx0 )
INTEGER, INTENT(IN) :: nss, nx0
INTEGER :: info
IF( ALLOCATED( rhos ) ) THEN
IF( nx0 == SIZE( rhos, 1 ) ) THEN
RETURN
ELSE
DEALLOCATE( rhos, rhoa, s, sig, tau, rhod )
IF(ALLOCATED(wrk)) DEALLOCATE(wrk)
IF(ALLOCATED(stmp)) DEALLOCATE(stmp)
END IF
END IF
ALLOCATE( rhos( nx0, nx0 ), STAT = info )
IF( info /= 0 ) &
CALL errore( ' ortho_gamma ', ' allocating rhos ', ABS( info ) )
rhos = 0
ALLOCATE( rhoa( nx0, nx0 ), STAT = info ) ! antisymmetric part of rho
IF( info /= 0 ) &
CALL errore( ' ortho_gamma ', ' allocating rhoa ', ABS( info ) )
rhoa = 0
ALLOCATE( s( nx0, nx0 ), STAT = info )
IF( info /= 0 ) &
CALL errore( ' ortho_gamma ', ' allocating s ', ABS( info ) )
s = 0
ALLOCATE( sig( nx0, nx0 ), STAT = info )
IF( info /= 0 ) &
CALL errore( ' ortho_gamma ', ' allocating sig ', ABS( info ) )
sig = 0
ALLOCATE( tau( nx0, nx0 ), STAT = info )
IF( info /= 0 ) &
CALL errore( ' ortho_gamma ', ' allocating tau ', ABS( info ) )
tau = 0
ALLOCATE( rhod( nss ), STAT = info )
IF( info /= 0 ) &
        CALL errore( ' ortho_gamma ', ' allocating rhod ', ABS( info ) )
rhod = 0
#if defined(__CUDA)
ALLOCATE( wrk( nss, nss ), STAT = info )
IF( info /= 0 ) CALL errore( ' ortho_gamma ', ' allocating wrk ', 1 )
ALLOCATE( stmp( nss, nss ), STAT = info )
IF( info /= 0 ) CALL errore( ' ortho_gamma ', ' allocating stmp ', 1 )
#endif
END SUBROUTINE
SUBROUTINE deallocate_local_ortho_memory()
IF(ALLOCATED(s)) DEALLOCATE(s)
IF(ALLOCATED(sig)) DEALLOCATE(sig)
IF(ALLOCATED(tau)) DEALLOCATE(tau)
IF(ALLOCATED(stmp)) DEALLOCATE(stmp)
IF(ALLOCATED(wrk)) DEALLOCATE(wrk)
IF(ALLOCATED(rhoa)) DEALLOCATE(rhoa)
IF(ALLOCATED(rhos)) DEALLOCATE(rhos)
IF(ALLOCATED(rhod)) DEALLOCATE(rhod)
IF(ALLOCATED(xloc)) DEALLOCATE(xloc)
END SUBROUTINE deallocate_local_ortho_memory
SUBROUTINE x0_to_xloc( x0, nx0, ccc_, idesc )
REAL(DP), INTENT(IN) :: x0(:,:)
REAL(DP), INTENT(IN) :: ccc_
INTEGER, INTENT(IN) :: nx0
INTEGER, INTENT(IN) :: idesc(:)
include 'laxlib.fh'
INTEGER :: i, j, info
REAL(DP) DEVICEATTR :: ccc
IF( ALLOCATED(xloc) ) THEN
IF( SIZE(x0,1) /= SIZE(xloc,1) .OR. SIZE(x0,2) /= SIZE(xloc,2) ) THEN
DEALLOCATE(xloc)
END IF
END IF
IF( .NOT. ALLOCATED(xloc) ) THEN
ALLOCATE( xloc, MOLD=x0, STAT = info )
IF( info /= 0 ) &
CALL errore( ' x0_to_xloc ', ' allocating xloc ', ABS( info ) )
END IF
IF( idesc(LAX_DESC_ACTIVE_NODE) < 0 ) THEN
RETURN
ENDIF
xloc = x0
ccc = ccc_
!$cuf kernel do(2) <<<*,*>>>
DO j = 1, SIZE(xloc,2)
DO i = 1, SIZE(xloc,1)
xloc(i,j) = xloc(i,j) * ccc
END DO
END DO
END SUBROUTINE x0_to_xloc
SUBROUTINE xloc_to_x0( x0, nx0, ccc_, idesc )
REAL(DP), INTENT(OUT) :: x0(:,:)
INTEGER, INTENT(IN) :: nx0
REAL(DP), INTENT(IN) :: ccc_
INTEGER, INTENT(IN) :: idesc(:)
include 'laxlib.fh'
INTEGER :: i, j
REAL(DP) DEVICEATTR :: byccc
IF( idesc(LAX_DESC_ACTIVE_NODE) < 0 ) THEN
RETURN
ENDIF
IF( .NOT. ALLOCATED(xloc) ) THEN
CALL errore( ' xloc_to_x0 ', ' xloc not allocated ', 1 )
END IF
byccc = 1.0d0 / ccc_
!$cuf kernel do(2) <<<*,*>>>
DO j = 1, SIZE(xloc,2)
DO i = 1, SIZE(xloc,1)
xloc(i,j) = xloc(i,j) * byccc
END DO
END DO
x0 = xloc
END SUBROUTINE xloc_to_x0
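! x0_to_xloc / xloc_to_x0 are a scale/unscale pair around the iteration:
! xloc = ccc * x0 on entry, x0 = xloc / ccc on exit, so with the same ccc a
! round trip is the identity (usage sketch):
!
!    CALL x0_to_xloc( x0, nx0, ccc, idesc )
!    ! ... the ortho iteration updates the module array xloc in place ...
!    CALL xloc_to_x0( x0, nx0, ccc, idesc )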
SUBROUTINE distribute_matrix( a, b, ir, nr, ic, nc, comm )
USE mp, ONLY: mp_bcast
REAL(DP) DEVICEATTR :: a(:,:), b(:,:)
INTEGER, INTENT(IN) :: ir, nr, ic, nc, comm
INTEGER :: i, j, info
CALL mp_bcast( a, 0, comm )
!$cuf kernel do(2) <<<*,*>>>
DO j = 1, nc
DO i = 1, nr
b( i, j ) = a( i + ir - 1, j + ic - 1 )
END DO
END DO
RETURN
END SUBROUTINE
SUBROUTINE collect_matrix( a, b, ir, nr, ic, nc, comm )
USE mp, ONLY: mp_sum
REAL(DP) DEVICEATTR :: a(:,:), b(:,:)
INTEGER, INTENT(IN) :: ir, nr, ic, nc, comm
INTEGER :: i, j, info
a = 0.0d0
!$cuf kernel do(2) <<<*,*>>>
DO j = 1, nc
DO i = 1, nr
a( ir + i - 1, ic + j - 1 ) = b( i, j )
END DO
END DO
CALL mp_sum( a, comm )
RETURN
END SUBROUTINE
SUBROUTINE consistency_check( a, idesc )
REAL(DP) DEVICEATTR, INTENT(IN) :: a(:,:)
INTEGER, INTENT(IN) :: idesc(:)
INTEGER :: i, j
include 'laxlib.fh'
!
! on some machines (IBM RS/6000 for instance) the following test allows
! to distinguish between Numbers and Sodium Nitride (NaN, Not a Number).
! If a matrix of Not-Numbers is passed to rs, the most likely outcome is
! that the program goes on forever doing nothing and writing nothing.
!
#if ! defined(__CUDA)
IF( idesc(LAX_DESC_ACTIVE_NODE) > 0 ) THEN
DO j = 1, idesc(LAX_DESC_NC)
DO i = 1, idesc(LAX_DESC_NR)
IF (a(i,j) /= a(i,j)) &
CALL errore(' ortho ',' ortho went bananas ',1)
END DO
END DO
END IF
#endif
RETURN
END SUBROUTINE
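! The comparison a(i,j) /= a(i,j) is the standard IEEE NaN probe: NaN is the
! only value that compares unequal to itself.  With an F2003 compiler the
! same check could be written through the intrinsic module (sketch):
!
!    USE, INTRINSIC :: ieee_arithmetic, ONLY: ieee_is_nan
!    IF( ieee_is_nan( a(i,j) ) ) CALL errore( ' ortho ', ' NaN in matrix ', 1 )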
!=----------------------------------------------------------------------------=!
SUBROUTINE ortho_gamma( cp, ngwx, phi, becp_dist, qbecp, nkbx, bephi, qbephi, &
nx0, idesc, diff, iter, n, nss, istart )
!=----------------------------------------------------------------------------=!
!
USE kinds, ONLY: DP
USE orthogonalize_base, ONLY: rhoset, sigset, tauset, ortho_iterate, &
use_parallel_diag
USE control_flags, ONLY: diagonalize_on_host
USE mp_global, ONLY: nproc_bgrp, me_bgrp, intra_bgrp_comm, my_bgrp_id, inter_bgrp_comm, nbgrp
USE mp, ONLY: mp_sum, mp_bcast
USE mp_world, ONLY: mpime
USE device_memcpy_m
IMPLICIT NONE
@ -24,23 +215,21 @@
! ... Arguments
INTEGER, INTENT(IN) :: ngwx, nkbx, nx0
INTEGER, INTENT(IN) :: n, nss, istart
COMPLEX(DP) DEVICEATTR :: phi( :, : ), cp( :, : )
REAL(DP) DEVICEATTR :: bephi( :, : )
REAL(DP) DEVICEATTR :: becp_dist( :, : )
REAL(DP) DEVICEATTR :: qbephi( :, : ), qbecp( :, : )
INTEGER, INTENT(IN) :: idesc(:)
INTEGER, INTENT(OUT) :: iter
REAL(DP), INTENT(OUT) :: diff
! ... Locals
INTEGER :: i, j, info, nr, nc, ir, ic
INTEGER, SAVE :: icnt = 1
REAL(DP), ALLOCATABLE :: rhos_h(:,:), s_h(:,:), rhod_h(:)
!
! ... Subroutine body
!
@ -66,98 +255,13 @@
!
END IF
!
CALL allocate_local_ortho_memory(nss, nx0)
!
! rho = <s'c0|s|cp>
!
CALL start_clock( 'rhoset' )
!
CALL rhoset( cp, ngwx, phi, bephi, nkbx, qbecp, n, nss, istart, rhos, rhoa, nx0, idesc )
CALL stop_clock( 'rhoset' )
!
! sig = 1-<cp|s|cp>
!
@ -171,6 +275,74 @@
CALL tauset( phi, ngwx, bephi, nkbx, qbephi, n, nss, istart, tau, nx0, idesc )
CALL stop_clock( 'tauset' )
!
CALL consistency_check(rhos,idesc)
CALL start_clock( 'rsg' )
!
! ... Diagonalize symmetric part of rho (rhos)
! ... "s" is the matrix of eigenvectors, "rhod" is the array of eigenvalues
!
IF( use_parallel_diag ) THEN
!
#if defined(__CUDA)
IF( idesc(LAX_DESC_NR) == idesc(LAX_DESC_NC) .AND. idesc(LAX_DESC_NR) == idesc(LAX_DESC_N) ) THEN
IF( idesc(LAX_DESC_ACTIVE_NODE) > 0 ) THEN
CALL laxlib_diagonalize( nss, rhos, rhod, s, info )
END IF
ELSE IF( idesc(LAX_DESC_ACTIVE_NODE) > 0 ) THEN
IF( diagonalize_on_host ) THEN ! tune here
ALLOCATE( rhos_h, SOURCE = rhos )
ALLOCATE( rhod_h, MOLD = rhod )
ALLOCATE( s_h, MOLD = s )
CALL laxlib_diagonalize( nss, rhos_h, rhod_h, s_h, idesc )
CALL dev_memcpy( s, s_h )
CALL dev_memcpy( rhod, rhod_h )
DEALLOCATE( rhos_h, rhod_h, s_h )
ELSE
CALL collect_matrix( wrk, rhos, ir, nr, ic, nc, idesc(LAX_DESC_COMM) )
IF( idesc(LAX_DESC_IC) == 1 .AND. idesc(LAX_DESC_IR) == 1 ) THEN
CALL laxlib_diagonalize( nss, wrk, rhod, stmp, info )
END IF
CALL distribute_matrix( stmp, s, ir, nr, ic, nc, idesc(LAX_DESC_COMM) )
CALL mp_bcast( rhod, 0, idesc(LAX_DESC_COMM) )
END IF
END IF
#else
CALL laxlib_diagonalize( nss, rhos, rhod, s, idesc )
#endif
!
ELSE
!
IF( idesc(LAX_DESC_ACTIVE_NODE) > 0 ) THEN
!
IF( idesc(LAX_DESC_NR) == idesc(LAX_DESC_NC) .AND. idesc(LAX_DESC_NR) == idesc(LAX_DESC_N) ) THEN
!
              ! rhos and s matrices are replicated, no need to collect them
#if defined(__CUDA)
CALL laxlib_diagonalize( nss, rhos, rhod, s, info )
#else
s = rhos
CALL laxlib_diagonalize( nss, s, rhod )
#endif
ELSE
!
CALL collect_matrix( wrk, rhos, ir, nr, ic, nc, idesc(LAX_DESC_COMM) )
!
#if defined(__CUDA)
CALL laxlib_diagonalize( nss, wrk, rhod, s, info )
#else
CALL laxlib_diagonalize( nss, wrk, rhod )
#endif
!
CALL distribute_matrix( wrk, s, ir, nr, ic, nc, idesc(LAX_DESC_COMM) )
!
END IF
!
END IF
!
END IF
!
CALL stop_clock( 'rsg' )
CALL start_clock( 'ortho_iter' )
!
IF( my_bgrp_id == 0 ) THEN
@ -180,15 +352,7 @@
     ! group are enough. Moreover, replicating the computation across groups could lead
! to small numerical differences and weird numerical effects.
!
CALL ortho_iterate( iter, diff, s, nx0, rhod, xloc, nx0, sig, rhoa, rhos, tau, nss, idesc)
!
END IF
!
@ -197,74 +361,137 @@
! All groups must have the same lambda matrix, in order to avoid weird
! numerical side effects.
!
CALL mp_bcast( xloc, 0, inter_bgrp_comm )
CALL mp_bcast( iter, 0, inter_bgrp_comm )
CALL mp_bcast( diff, 0, inter_bgrp_comm )
!
END IF
!
CALL consistency_check( xloc,idesc )
CALL stop_clock( 'ortho_iter' )
!
!
RETURN
END SUBROUTINE ortho_gamma
SUBROUTINE compute_qs_times_betas( bephi, bec_row, qbephi, qbecp, idesc )
USE uspp, ONLY: nkb, qq_nt, indv_ijkb0, nkbus
USE uspp_gpum, ONLY: qq_nt_d, using_qq_nt
USE uspp_param, ONLY: nh, upf
USE electrons_base, ONLY: nspin, nbsp_bgrp, iupdwn_bgrp, nupdwn_bgrp, nbsp, nupdwn, iupdwn
USE ions_base, ONLY: na, nat, nsp, ityp
#if defined (__CUDA)
USE cublas
#endif
!
IMPLICIT NONE
!
include 'laxlib.fh'
!
REAL(DP), INTENT(OUT) DEVICEATTR :: qbephi(:,:,:), qbecp(:,:,:)
REAL(DP), INTENT(IN) DEVICEATTR :: bephi(:,:)
REAL(DP), INTENT(IN) DEVICEATTR :: bec_row(:,:)
INTEGER, INTENT(IN) :: idesc(:,:)
REAL(DP), ALLOCATABLE DEVICEATTR :: bec_col(:,:), bephi_col(:,:)
INTEGER :: nkbx, info, nrcx, iss, is, jv, iv, ia
INTEGER :: i, j, inl, jnl, indv, nc, nhs
nkbx = nkb
nrcx = idesc( LAX_DESC_NRCX, 1 )
IF( nspin > 1 ) nrcx = MAX( nrcx, idesc( LAX_DESC_NRCX, 2 ) )
qbephi = 0.d0
qbecp = 0.d0
CALL using_qq_nt(0)
!
IF( nkbus > 0 ) THEN
!
ALLOCATE( bec_col ( nkbx, nrcx*nspin ), STAT = info )
IF( info /= 0 ) &
CALL errore( ' compute_qs_times_betas ', ' allocating bec_col ', ABS( info ) )
ALLOCATE( bephi_col ( nkbx, nrcx*nspin ), STAT = info )
IF( info /= 0 ) &
CALL errore( ' compute_qs_times_betas ', ' allocating bephi_col ', ABS( info ) )
!
CALL redist_row2col( nupdwn(1), bephi, bephi_col, nkbx, nrcx, idesc(:,1) )
CALL redist_row2col( nupdwn(1), bec_row, bec_col, nkbx, nrcx, idesc(:,1) )
IF( nspin == 2 ) THEN
CALL redist_row2col( nupdwn(2), bephi(:,nrcx+1:), bephi_col(:,nrcx+1:), nkbx, nrcx, idesc(:,2) )
CALL redist_row2col( nupdwn(2), bec_row(:,nrcx+1:), bec_col(:,nrcx+1:), nkbx, nrcx, idesc(:,2) )
END IF
!
DO iss = 1, nspin
IF( idesc( LAX_DESC_ACTIVE_NODE, iss ) > 0 ) THEN
nc = idesc( LAX_DESC_NC, iss )
DO ia = 1, nat
is = ityp(ia)
IF( upf(is)%tvanp ) THEN
indv = indv_ijkb0(ia)
nhs = nh(is)
#if defined (__CUDA)
CALL DGEMMDRV('N', 'N', nhs, nc, nhs, 1.0d0, qq_nt_d(1,1,is), SIZE(qq_nt_d,1), &
bephi_col(indv+1,(iss-1)*nrcx+1), SIZE(bephi_col,1), 0.0d0, qbephi(indv+1,1,iss), SIZE(qbephi,1))
CALL DGEMMDRV('N', 'N', nhs, nc, nhs, 1.0d0, qq_nt_d(1,1,is), SIZE(qq_nt_d,1), &
bec_col(indv+1,(iss-1)*nrcx+1), SIZE(bec_col,1), 0.0d0, qbecp(indv+1,1,iss), SIZE(qbecp,1))
#else
CALL DGEMMDRV('N', 'N', nhs, nc, nhs, 1.0d0, qq_nt(1,1,is), SIZE(qq_nt,1), &
bephi_col(indv+1,(iss-1)*nrcx+1), SIZE(bephi_col,1), 0.0d0, qbephi(indv+1,1,iss), SIZE(qbephi,1))
CALL DGEMMDRV('N', 'N', nhs, nc, nhs, 1.0d0, qq_nt(1,1,is), SIZE(qq_nt,1), &
bec_col(indv+1,(iss-1)*nrcx+1), SIZE(bec_col,1), 0.0d0, qbecp(indv+1,1,iss), SIZE(qbecp,1))
#endif
!!$cuf kernel do (2)
! DO iv=1,nhs
! DO i = 1, nc
! DO jv = 1, nhs
! qbephi(indv+iv,i,iss) = qbephi(indv+iv,i,iss) + &
! qq_nt_d(iv,jv,is) * bephi_col(indv+jv,i+(iss-1)*nrcx)
! qbecp(indv+iv,i,iss) = qbecp(indv+iv,i,iss) + &
! qq_nt_d(iv,jv,is) * bec_col(indv+jv,i+(iss-1)*nrcx)
! END DO
! END DO
! END DO
END IF
END DO
ENDIF
END DO
DEALLOCATE( bec_col )
DEALLOCATE( bephi_col )
END IF
END SUBROUTINE compute_qs_times_betas
SUBROUTINE keep_only_us(wrk)
USE uspp, ONLY: indv_ijkb0
USE uspp_param, ONLY: nh, upf
USE ions_base, ONLY: na, nat, nsp, ityp
#if defined (__CUDA)
USE cublas
#endif
IMPLICIT NONE
COMPLEX(DP) DEVICEATTR, INTENT(OUT) :: wrk(:,:)
INTEGER :: ia, is, inl, nhs, iv
DO ia = 1, nat
is = ityp(ia)
inl = indv_ijkb0(ia)
nhs = nh(is)
IF( .NOT. upf(is)%tvanp ) THEN
!$cuf kernel do (1)
DO iv = 1, nhs
wrk(:,iv+inl) = 0.0d0
END DO
END IF
END DO
END SUBROUTINE
END MODULE ortho_module
!=----------------------------------------------------------------------------=!
SUBROUTINE ortho_x( betae, cp_bgrp, phi_bgrp, x0, idesc, diff, iter, ccc, bephi, becp_bgrp )
!=----------------------------------------------------------------------------=!
!
! input = cp (non-orthonormal), beta
@ -279,72 +506,63 @@
! for vanderbilt pseudo pot - kl & ap
!
USE kinds, ONLY: DP
USE uspp, ONLY: nkb, nkbus
USE ions_base, ONLY: nsp
USE electrons_base, ONLY: f, nbsp_bgrp, iupdwn_bgrp, nupdwn_bgrp, nbsp, nspin, nupdwn, iupdwn
USE gvecw, ONLY: ngw
USE control_flags, ONLY: iprint, iverbosity, ortho_max
USE control_flags, ONLY: force_pairing
USE io_global, ONLY: stdout, ionode
USE cp_interfaces, ONLY: c_bgrp_expand, c_bgrp_pack, nlsm1, collect_bec, nlsm1us
      USE mp_global, ONLY: nproc_bgrp, me_bgrp, intra_bgrp_comm, inter_bgrp_comm ! DEBUG
USE mp_world, ONLY: mpime
USE orthogonalize_base, ONLY: bec_bgrp2ortho
USE mp, ONLY : mp_sum
USE ortho_module
USE device_memcpy_m
!
IMPLICIT NONE
!
include 'laxlib.fh'
!
INTEGER, INTENT(IN) :: idesc(:,:)
COMPLEX(DP) DEVICEATTR :: betae(:,:)
COMPLEX(DP) DEVICEATTR :: cp_bgrp(:,:), phi_bgrp(:,:)
REAL(DP) :: x0(:,:,:), diff, ccc
INTEGER :: iter
REAL(DP) DEVICEATTR :: bephi(:,:)
REAL(DP) DEVICEATTR :: becp_bgrp(:,:)
!
REAL(DP), ALLOCATABLE DEVICEATTR :: bec_row(:,:)
REAL(DP), ALLOCATABLE DEVICEATTR :: qbephi(:,:,:), qbecp(:,:,:)
COMPLEX(DP), ALLOCATABLE DEVICEATTR :: wrk2(:,:)
INTEGER :: nkbx, info, iss, nspin_sub, nx0, ngwx, nrcx
!
CALL start_clock( 'ortho' )
!
nkbx = nkb
ngwx = SIZE( cp_bgrp, 1 )
!
nx0 = SIZE( x0, 1 )
nrcx = MAXVAL( idesc( LAX_DESC_NRCX, : ) )
!
! calculation of becp and bephi
!
ALLOCATE( bec_row( nkbx, nrcx*nspin ), STAT = info )
IF( info /= 0 ) &
CALL errore( ' ortho ', ' allocating bec_row ', ABS( info ) )
IF( nkbus > 0 ) THEN
!
becp_bgrp = 0.0d0
!
ALLOCATE( wrk2, MOLD = betae )
CALL dev_memcpy( wrk2, betae )
CALL keep_only_us( wrk2 )
CALL nlsm1us ( nbsp_bgrp, wrk2, phi_bgrp, becp_bgrp )
CALL bec_bgrp2ortho( becp_bgrp, bephi, nrcx, idesc )
!
becp_bgrp = 0.0d0
!
CALL nlsm1us ( nbsp_bgrp, wrk2, cp_bgrp, becp_bgrp )
CALL bec_bgrp2ortho( becp_bgrp, bec_row, nrcx, idesc )
DEALLOCATE( wrk2 )
!
END IF
!
@ -353,95 +571,27 @@
ALLOCATE( qbephi( nkbx, nx0, nspin ), STAT = info )
IF( info /= 0 ) &
CALL errore( ' ortho ', ' allocating qbephi ', ABS( info ) )
!
!
ALLOCATE( qbecp ( nkbx, nx0, nspin ), STAT = info )
IF( info /= 0 ) &
CALL errore( ' ortho ', ' allocating qbecp ', ABS( info ) )
!
CALL compute_qs_times_betas( bephi, bec_row, qbephi, qbecp, idesc )
!
! Expand cp and phi to contain all electronic band
!
CALL c_bgrp_expand( cp_bgrp )
CALL c_bgrp_expand( phi_bgrp )
!
!
nspin_sub = nspin
if( force_pairing ) nspin_sub = 1
!
DO iss = 1, nspin_sub
CALL x0_to_xloc( x0(:,:,iss), nx0, ccc, idesc(:,iss) )
CALL ortho_gamma( cp_bgrp, ngwx, phi_bgrp, bec_row(:,(iss-1)*nrcx+1:iss*nrcx), qbecp(:,:,iss), nkbx, &
bephi(:,((iss-1)*nrcx+1):iss*nrcx), &
qbephi(:,:,iss), nx0, idesc(:,iss), diff, iter, nbsp, nupdwn(iss), iupdwn(iss) )
IF( iter > ortho_max ) THEN
WRITE( stdout, 100 ) diff, iter
@ -452,16 +602,17 @@
WRITE( stdout, 100 ) diff, iter
ENDIF
!
CALL xloc_to_x0( x0(:,:,iss), nx0, ccc, idesc(:,iss) )
!
END DO
IF( force_pairing ) THEN
CALL dev_memcpy(cp_bgrp(:,iupdwn(2):), cp_bgrp(:,1:), [1, ngw], 1 , [1, nupdwn(2)], 1)
END IF
!
DEALLOCATE( qbecp )
DEALLOCATE( qbephi )
DEALLOCATE( bec_row )
!
! pack cp so that it contains only the bands in the band subgroup
!

File diff suppressed because it is too large

View File

@ -351,7 +351,11 @@
ALLOCATE( ftmp( 3, SIZE( fion, 2 ) ) )
ftmp = 0.0d0
!$omp parallel do reduction(+:ftmp) default(none) &
!$omp shared( gstart, dffts, sfac, rhops, screen_coul, rhoeg, nsp, gg, tpiba2, mill, g, &
!$omp nat, ityp, vps, ei1, ei2, ei3, tscreen ) &
!$omp private(ig, rp, is, rhet, rhog, fpibg, ig1, ig2, ig3, gxc, gyc, gzc, ia, cnvg, cvn, tx, &
!$omp ty, tz, teigr )
DO ig = gstart, dffts%ngm
RP = (0.D0,0.D0)
@ -389,6 +393,8 @@
END DO
!
!$omp end parallel do
!
fion = fion + DBLE(ftmp) * 2.D0 * omega * tpiba
DEALLOCATE( ftmp )
@ -444,6 +450,8 @@
INTEGER, DIMENSION(6), PARAMETER :: ALPHA = (/ 1,2,3,2,3,3 /)
INTEGER, DIMENSION(6), PARAMETER :: BETA = (/ 1,1,1,2,2,3 /)
INTEGER :: omp_get_num_threads
! ... SUBROUTINE BODY
ALLOCATE( rc( nsp, nsp ) )
@ -474,9 +482,13 @@
IA_S = gind_block( 1, nat, nproc_bgrp, me_bgrp )
IA_E = IA_S + NA_LOC - 1
!$omp parallel do reduction(+:esr,desr) num_threads(min(max(1,na_loc),omp_get_num_threads())) default(none) &
!$omp private(ia,ib,k,j,zv2_kj,rckj_m1,fact_pre,xlm,ylm,zlm,tzero,xlm0,ylm0,zlm0,ix,iy,iz,sxlm,tshift, &
!$omp rxlm,erre2,rlm,arg,esrtzero,addesr,addpre,repand,i,fxx ) &
!$omp shared(ia_s,ia_e,nat,ityp,zv2,rc,taus,iesr,hmat,fionloc,tstress,na_loc)
DO ia = ia_s, ia_e
k = ityp(ia)
DO ib = ia, nat
j = ityp(ib)
zv2_kj = zv2(k,j)
@ -536,15 +548,14 @@
END DO ! IX
END DO
END DO
!$omp end parallel do
!
! each processor add its own contribution to the array FION
!
! FION = FION+FIONLOC
!
CALL daxpy( 3*nat, 1.0d0, fionloc, 1, fion, 1 )
CALL mp_sum(esr, intra_bgrp_comm)

View File

@ -615,6 +615,7 @@
USE atom, ONLY: rgrid
USE uspp, ONLY: indv
use uspp, only: qq_nt, beta
use uspp_gpum, ONLY: using_qq_nt, using_qq_nt_d, qq_nt_d
USE betax, only: refg, qradx, mmx, dqradx
use smallbox_gvec, only: ngb
use control_flags, only: iprint, iverbosity
@ -624,6 +625,9 @@
use smallbox_gvec, only: gb, gxb
use small_box, only: omegab, tpibab
USE cp_interfaces, ONLY: fill_qrl
#if defined (__CUDA)
USE cudafor
#endif
!
IMPLICIT NONE
!
@ -796,6 +800,22 @@
end do
end do
CALL using_qq_nt(2)
#if defined (__CUDA)
CALL using_qq_nt_d(0)
!$cuf kernel do (3)
DO is = 1, SIZE(qq_nt_d,3)
DO jv=1,SIZE(qq_nt_d,2)
DO iv=1,SIZE(qq_nt_d,1)
IF( ABS( qq_nt_d(iv,jv,is) ) <= 1.D-5 ) THEN
qq_nt_d(iv,jv,is) = 0.0d0
END IF
END DO
END DO
END DO
#endif
!
!
if (tpre) then
! ---------------------------------------------------------------
@ -1039,6 +1059,7 @@
use gvecw, only: ngw
use cell_base, only: ainv
use uspp, only: qq_nt, nhtolm, beta
use uspp_gpum, ONLY: using_qq_nt, using_qq_nt_d, qq_nt_d
use constants, only: pi, fpi
use ions_base, only: nsp
use uspp_param, only: upf, lmaxq, lmaxkb, nbetam, nh
@ -1046,6 +1067,9 @@
use smallbox_gvec, only: gb, gxb, ngb
use small_box, only: omegab, tpibab
USE betax, ONLY: qradx, dqradx, refg, mmx
#if defined (__CUDA)
USE cudafor
#endif
!
implicit none
@ -1131,6 +1155,21 @@
end do
end do
CALL using_qq_nt(2)
#if defined (__CUDA)
CALL using_qq_nt_d(0)
!$cuf kernel do (3)
DO is = 1, SIZE(qq_nt_d,3)
DO jv=1,SIZE(qq_nt_d,2)
DO iv=1,SIZE(qq_nt_d,1)
IF( ABS( qq_nt_d(iv,jv,is) ) <= 1.D-5 ) THEN
qq_nt_d(iv,jv,is) = 0.0d0
END IF
END DO
END DO
END DO
#endif
!
if (tpre) then
! ---------------------------------------------------------------

View File

@ -28,17 +28,19 @@ SUBROUTINE from_restart( )
USE gvect, ONLY : mill, eigts1, eigts2, eigts3
USE printout_base, ONLY : printout_pos
USE gvecw, ONLY : ngw
USE cp_interfaces, ONLY : phfacs, strucf, prefor, calbec, caldbec_bgrp
USE energies, ONLY : eself, dft_energy_type
USE wave_base, ONLY : rande_base
USE efield_module, ONLY : efield_berry_setup, tefield, &
efield_berry_setup2, tefield2
USE uspp, ONLY : okvan, vkb, nkb, nlcc_any
USE uspp_gpum, ONLY : vkb_d
USE cp_main_variables, ONLY : ht0, htm, lambdap, lambda, lambdam, eigr, &
sfac, taub, irb, eigrb, edft, bec_bgrp, dbec, idesc, iabox, nabox
USE time_step, ONLY : delt
USE fft_base, ONLY : dfftp, dffts
USE matrix_inversion
USE device_memcpy_m, ONLY : dev_memcpy
!
IMPLICIT NONE
@ -153,7 +155,7 @@ SUBROUTINE from_restart( )
! ... to starting cell (from ndr or again standard input)
!
IF ( okvan .or. nlcc_any ) THEN
CALL initbox( tau0, alat, at, ainv, taub, irb, iabox, nabox )
CALL phbox( taub, iverbosity, eigrb )
END IF
!
@ -162,6 +164,7 @@ SUBROUTINE from_restart( )
CALL strucf( sfac, eigts1, eigts2, eigts3, mill, dffts%ngm )
!
CALL prefor( eigr, vkb )
CALL dev_memcpy( vkb_d, vkb )
!
CALL formf( .TRUE. , eself )
!
@ -187,7 +190,7 @@ SUBROUTINE from_restart( )
!
END IF
!
CALL calbec( nbsp_bgrp, vkb, c0_bgrp, bec_bgrp, 0 )
!
IF ( tpre ) CALL caldbec_bgrp( eigr, c0_bgrp, dbec, idesc )
!

View File

@ -10,9 +10,15 @@
!=----------------------------------------------------------------------------------=!
#if defined(__CUDA)
#define DEVICEATTR ,DEVICE
#else
#define DEVICEATTR
#endif
SUBROUTINE runcp_uspp_x &
( nfi, fccc, ccc, ema0bg, dt2bye, rhos, bec_bgrp, c0_bgrp, c0_d, cm_bgrp, cm_d, fromscra, restart )
!
! This subroutine performs a Car-Parrinello or Steepest-Descent step
! on the electronic variables, computing forces on electrons
@ -31,7 +37,7 @@
USE mp, ONLY : mp_sum
USE fft_base, ONLY : dffts
use wave_base, only : wave_steepest, wave_verlet
use control_flags, only : lwf, tsde, many_fft
use uspp, only : deeq, vkb
use gvect, only : gstart
use electrons_base, only : nbsp_bgrp, ispin_bgrp, f_bgrp, nspin, nupdwn_bgrp, iupdwn_bgrp
@ -41,6 +47,10 @@
USE cp_interfaces, ONLY : dforce
USE ldaU_cp, ONLY : lda_plus_u, vupsi
USE fft_helper_subroutines
#if defined (__CUDA)
USE uspp_gpum, ONLY : vkb_d
USE cudafor
#endif
!
IMPLICIT NONE
!
@ -50,6 +60,7 @@
REAL(DP) :: rhos(:,:)
REAL(DP) :: bec_bgrp(:,:)
COMPLEX(DP) :: c0_bgrp(:,:), cm_bgrp(:,:)
COMPLEX(DP) DEVICEATTR :: c0_d(:,:), cm_d(:,:)
LOGICAL, OPTIONAL, INTENT(IN) :: fromscra
LOGICAL, OPTIONAL, INTENT(IN) :: restart
!
@ -64,11 +75,22 @@
real(DP), allocatable :: emaver(:)
complex(DP), allocatable :: c2(:), c3(:), c2tmp(:), c3tmp(:)
REAL(DP), ALLOCATABLE :: tg_rhos(:,:), ftmp(:)
#if defined (__CUDA)
REAL(DP), ALLOCATABLE, DEVICE :: rhos_d(:,:)
#endif
INTEGER, ALLOCATABLE :: itmp(:)
integer :: i, nsiz, incr, idx, idx_in, ierr
integer :: iwfc, nwfc, is, ii, tg_rhos_siz, c2_siz
integer :: iflag
logical :: ttsde
INTEGER :: omp_get_num_threads
#if defined (__CUDA)
IF( dffts%has_task_groups ) THEN
CALL errore(' runcp_uspp ', ' task groups not implemented on GPU ',1)
END IF
ALLOCATE( rhos_d, SOURCE = rhos )
#endif
iflag = 0
!
@ -85,6 +107,9 @@
ELSE
tg_rhos_siz = 1
c2_siz = ngw
#if defined (__CUDA)
c2_siz = c2_siz * many_fft
#endif
END IF
!
@ -117,12 +142,13 @@
emadt2, emaver, verl1, verl2 )
ELSE
allocate( c2( c2_siz ), c3( c2_siz ) )
c2 = 0D0
c3 = 0D0
IF( dffts%has_task_groups ) THEN
ALLOCATE( tg_rhos( tg_rhos_siz, nspin ) )
!
! The potential in rhos is distributed across all processors
! We need to redistribute it so that it is completely contained in the
@ -136,7 +162,11 @@
ELSE
#if defined (__CUDA)
incr = 2 * many_fft
#else
incr = 2
#endif
END IF
@ -230,8 +260,14 @@
ELSE
#if defined (__CUDA)
CALL dforce( i, bec_bgrp, vkb_d, c0_d, c2, c3, rhos_d, &
SIZE(rhos_d,1), ispin_bgrp, f_bgrp, nbsp_bgrp, nspin )
#else
CALL dforce( i, bec_bgrp, vkb, c0_bgrp, c2, c3, rhos, &
SIZE(rhos,1), ispin_bgrp, f_bgrp, nbsp_bgrp, nspin )
#endif
IF ( lda_plus_u ) THEN
c2(:) = c2(:) - vupsi(:,i)
c3(:) = c3(:) - vupsi(:,i+1)
@ -256,8 +292,10 @@
ENDDO
END IF
!$omp parallel num_threads(min(incr,omp_get_num_threads())) default(shared) private(idx_in, idx)
!$omp do
DO idx = 1, incr, 2
idx_in = idx/2+1
IF( i + idx - 1 <= nbsp_bgrp ) THEN
IF (tsde) THEN
CALL wave_steepest( cm_bgrp(:, i+idx-1 ), c0_bgrp(:, i+idx-1 ), emaver, c2(:), ngw, idx_in )
@ -271,21 +309,26 @@
cm_bgrp(1,i+idx ) = CMPLX(real(cm_bgrp(1,i+idx )),0.0d0,kind=dp)
END IF
END IF
END DO
!$omp end do
!$omp end parallel
END DO
DEALLOCATE( c2 )
DEALLOCATE( c3 )
IF( dffts%has_task_groups ) THEN
DEALLOCATE( tg_rhos )
END IF
END IF
DEALLOCATE( emadt2 )
DEALLOCATE( emaver )
#if defined (__CUDA)
DEALLOCATE( rhos_d )
#endif
!
END SUBROUTINE runcp_uspp_x

View File

@ -6,17 +6,18 @@
! or http://www.gnu.org/copyleft/gpl.txt .
!
!-----------------------------------------------------------------------
SUBROUTINE initbox ( tau0, alat, at, ainv, taub, irb, iabox, nabox )
!-----------------------------------------------------------------------
!
! sets the indexes irb and positions taub for the small boxes
! around atoms
!
USE kinds, ONLY: DP
USE ions_base, ONLY: nat, ityp
USE uspp_param, ONLY: upf
USE control_flags, ONLY: iverbosity
USE io_global, ONLY: stdout
USE mp_global, ONLY: nproc_bgrp, me_bgrp, intra_bgrp_comm, my_bgrp_id, nbgrp
USE fft_base, ONLY: dfftb, dfftp, fft_type_descriptor
USE fft_smallbox_type, ONLY: fft_box_set
@ -26,9 +27,10 @@
! output
INTEGER, INTENT(out) :: irb(3,nat)
REAL(DP), INTENT(out) :: taub(3,nat)
INTEGER, INTENT(out) :: iabox(nat), nabox
! local
REAL(DP) :: x(3), xmod
INTEGER :: nr(3), nrb(3), xint, ia, i, is
!
IF ( dfftb%nr1 < 1) CALL errore ('initbox', 'incorrect value for box grid dimensions', 1)
IF ( dfftb%nr2 < 1) CALL errore ('initbox', 'incorrect value for box grid dimensions', 2)
@ -96,6 +98,19 @@
CALL fft_box_set( dfftb, nat, irb, dfftp )
! build the local list of atom
nabox = 0
DO ia = 1, nat
IF( .NOT. upf(ityp(ia))%tvanp ) CYCLE
#if defined(__MPI)
IF( ( dfftb%np3( ia ) <= 0 ) .OR. ( dfftb%np2( ia ) <= 0 ) .OR. ( my_bgrp_id /= MOD( ia, nbgrp ) ) ) THEN
CYCLE
END IF
#endif
nabox = nabox + 1
iabox( nabox ) = ia
END DO
IF( iverbosity > 1 ) THEN
DO ia=1,nat
WRITE( stdout,2000) ia, (irb(i,ia),i=1,3)

View File

@ -79,7 +79,7 @@ SUBROUTINE vofrho_x( nfi, rhor, drhor, rhog, drhog, rhos, rhoc, tfirst, &
!
INTEGER iss, isup, isdw, ig, ir, i, j, k, ij, is, ia, inlc
REAL(DP) :: vtxc, vave, ebac, wz, eh, ehpre, enlc
COMPLEX(DP) fp, fm, drhop, zpseu, zh
COMPLEX(DP), ALLOCATABLE :: rhotmp(:), vtemp(:)
COMPLEX(DP), ALLOCATABLE :: drhot(:,:)
REAL(DP), ALLOCATABLE :: gagb(:,:), rhosave(:,:), newrhosave(:,:), rhocsave(:)
@ -104,6 +104,7 @@ SUBROUTINE vofrho_x( nfi, rhor, drhor, rhog, drhog, rhos, rhoc, tfirst, &
! ... dalbe(:) = delta( alpha(:), beta(:) )
REAL(DP), DIMENSION(6), PARAMETER :: dalbe = &
(/ 1.0_DP, 0.0_DP, 0.0_DP, 1.0_DP, 0.0_DP, 1.0_DP /)
COMPLEX(DP), PARAMETER :: ci = ( 0.0d0, 1.0d0 )
CALL start_clock( 'vofrho' )
@ -130,8 +131,6 @@ SUBROUTINE vofrho_x( nfi, rhor, drhor, rhog, drhog, rhos, rhoc, tfirst, &
!
END IF
!
! wz = factor for g.neq.0 because of c*(g)=c(-g)
!
wz = 2.0d0
@ -442,12 +441,12 @@ SUBROUTINE vofrho_x( nfi, rhor, drhor, rhog, drhog, rhos, rhoc, tfirst, &
END IF
IF( nspin == 1 ) THEN
CALL zaxpy(dfftp%ngm, (1.0d0,0.0d0) , vtemp, 1, rhog(1,1), 1)
ELSE
isup=1
isdw=2
CALL zaxpy(dfftp%ngm, (1.0d0,0.0d0) , vtemp, 1, rhog(1,isup), 1)
CALL zaxpy(dfftp%ngm, (1.0d0,0.0d0) , vtemp, 1, rhog(1,isdw), 1)
IF( ttsic ) THEN
rhog( 1:dfftp%ngm, isup ) = rhog( 1:dfftp%ngm, isup ) - self_vloc(1:dfftp%ngm)
rhog( 1:dfftp%ngm, isdw ) = rhog( 1:dfftp%ngm, isdw ) - self_vloc(1:dfftp%ngm)

View File

@ -464,6 +464,13 @@
!
! ... assign random values to wave functions
!
! 2.519 = 4^(2/3), equivalent to keep only (ngw_g/4) values
fac = 2.519d0
IF( ngw_g/4 < nbsp ) fac = 1.0d0
IF( ngw_g < nbsp ) THEN
        CALL errore(' wave_rand_init ', ' too few plane waves, linearly dependent electronic states! ', 1)
END IF
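  ! (The number of G-vectors below a kinetic-energy cutoff E scales as E**1.5,
  !  so lowering the cutoff by 4**(2/3) = 2.519 keeps N / (4**(2/3))**1.5 = N/4
  !  of them; fac = 1 falls back to the full sphere when ngw_g/4 < nbsp.)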
DO ib = 1, nbsp
IF( local ) THEN
@ -487,7 +494,7 @@
IF( ibgrp > 0 ) THEN
DO ig = 1, ngw
IF( local ) THEN
IF( gg(ig) < ggx / fac ) THEN
rranf1 = rnd( 1, mill(1,ig) ) * rnd( 2, mill(2,ig) ) * rnd( 3, mill(3,ig) )
rranf2 = 0.0d0
cm_bgrp( ig, ibgrp ) = ampre * CMPLX( rranf1, rranf2 ,kind=DP) / ( 1.0d0 + gg(ig) )
@ -599,3 +606,65 @@
RETURN
END SUBROUTINE c_bgrp_pack_x
#if defined (__CUDA)
SUBROUTINE c_bgrp_expand_gpu_x( c_bgrp )
USE kinds, ONLY: DP
USE mp, ONLY: mp_sum
USE electrons_base, ONLY: nspin, i2gupdwn_bgrp, nupdwn, iupdwn_bgrp, iupdwn, nupdwn_bgrp
USE mp_global, ONLY: nbgrp, inter_bgrp_comm
USE cudafor
IMPLICIT NONE
COMPLEX(DP), DEVICE :: c_bgrp(:,:)
INTEGER :: iss, n1, n2, m1, m2, i
IF( nbgrp < 2 ) &
RETURN
DO iss = nspin, 1, -1
n1 = iupdwn_bgrp(iss)
n2 = n1 + nupdwn_bgrp(iss) - 1
m1 = iupdwn(iss)+i2gupdwn_bgrp(iss) - 1
m2 = m1 + nupdwn_bgrp(iss) - 1
!$cuf kernel do(1) <<<*,*>>>
DO i = m2, m1, -1
c_bgrp(:,i) = c_bgrp(:,i-m1+n1)
END DO
END DO
DO iss = 1, nspin
m1 = iupdwn(iss)+i2gupdwn_bgrp(iss) - 1
m2 = m1 + nupdwn_bgrp(iss) - 1
!$cuf kernel do(1) <<<*,*>>>
DO i = iupdwn(iss), m1-1
c_bgrp(:,i) = 0.0d0
END DO
!$cuf kernel do(1) <<<*,*>>>
DO i = m2+1, iupdwn(iss) + nupdwn(iss) - 1
c_bgrp(:,i) = 0.0d0
END DO
END DO
CALL mp_sum( c_bgrp, inter_bgrp_comm )
RETURN
END SUBROUTINE c_bgrp_expand_gpu_x
SUBROUTINE c_bgrp_pack_gpu_x( c_bgrp )
USE kinds, ONLY: DP
USE electrons_base, ONLY: nspin, i2gupdwn_bgrp, nupdwn, iupdwn_bgrp, iupdwn, nupdwn_bgrp
USE mp_global, ONLY: nbgrp
USE cudafor
IMPLICIT NONE
COMPLEX(DP), DEVICE :: c_bgrp(:,:)
INTEGER :: iss, n1, n2, m1, m2, i
IF( nbgrp < 2 ) &
RETURN
DO iss = 1, nspin
n1 = iupdwn_bgrp(iss)
n2 = n1 + nupdwn_bgrp(iss) - 1
m1 = iupdwn(iss)+i2gupdwn_bgrp(iss) - 1
m2 = m1 + nupdwn_bgrp(iss) - 1
!$cuf kernel do(1) <<<*,*>>>
DO i = n1, n2
c_bgrp(:,i) = c_bgrp(:,i-n1+m1)
END DO
END DO
RETURN
END SUBROUTINE c_bgrp_pack_gpu_x
#endif
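! Note the copy directions above: c_bgrp_expand shifts bands towards higher
! indices, c_bgrp_pack towards lower ones.  In the serial equivalent the
! backward order is what keeps an overlapping copy safe, the usual memmove
! rule (sketch):
!
!    DO i = m2, m1, -1                  ! destination index >= source index
!       c_bgrp(:,i) = c_bgrp(:,i-m1+n1)
!    END DO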

View File

@ -616,6 +616,9 @@ CONTAINS
WRITE( stdout,*) ' Number of x-y planes for each processors: '
WRITE( stdout, fmt = '( 5(" |",I4,",",I4) )' ) ( ( desc%nr2p(j), &
desc%nr3p(i), i = 1, desc%nproc3 ), j=1,desc%nproc2 )
IF ( .not. desc%use_pencil_decomposition ) WRITE( stdout,*) ' Using Slab Decomposition'
IF ( desc%use_pencil_decomposition ) WRITE( stdout,*) ' Using Pencil Decomposition'
1000 FORMAT(3X, &
'Global Dimensions Local Dimensions Processor Grid',/,3X, &
'.X. .Y. .Z. .X. .Y. .Z. .X. .Y. .Z.',/, &

FFTXlib/tests/Makefile (new file)
View File

@ -0,0 +1,30 @@
# Makefile for FFTXlib testing
include ../../make.inc
MODFLAGS= $(MOD_FLAG).. $(MOD_FLAG).
SRCS = test_fft_scalar_gpu.f90 \
test_fft_scatter_mod_gpu.f90 \
test_fwinv_gpu.f90
EXECS = $(SRCS:.f90=.x)
all: common $(EXECS)
common: tester.o utils.o
%.x: %.o
$(LD) $(LDFLAGS) $< utils.o tester.o -o $@ ../libqefft.a $(QELIBS)
clean :
- /bin/rm -f *.o *.a *.d *.i *~ *_tmp.f90 *.mod *.L *.x
# .PHONY forces execution of a rule irrespective of the presence of an
# updated file with the same name as the rule.
.PHONY: all clean
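# Usage sketch (assumes the top-level ./configure has generated make.inc):
#   cd FFTXlib/tests && make                     # builds one .x per test
#   mpirun -np 4 ./test_fft_scatter_mod_gpu.x    # MPI tests run under mpirun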

FFTXlib/tests/test_fft_scalar_gpu.f90 Normal file

@@ -0,0 +1,273 @@
#if defined(__CUDA)
program test_fft_scalar_gpu
USE tester
IMPLICIT NONE
!
TYPE(tester_t) :: test
!
CALL test%init()
!
test%tolerance64 = 1.d-13
!
CALL save_random_seed("test_fft_scalar_gpu", 0)
!
CALL test_cft_2xy_gpu(test)
!
CALL test_cft_1z_gpu(test)
!
CALL test_cfft3d_gpu(test)
!
CALL test_cfft3ds_gpu(test)
!
CALL test%print()
!
CONTAINS
!
SUBROUTINE fill_random(c, c_d, n)
USE cudafor
USE fft_param, ONLY : DP
implicit none
complex(DP), device :: c_d(:)
complex(DP) :: c(:)
integer, intent(in) :: n
!
real(DP), ALLOCATABLE :: rnd_aux(:)
!
ALLOCATE (rnd_aux(2*n))
CALL RANDOM_NUMBER(rnd_aux)
c = CMPLX(rnd_aux(1:n), rnd_aux(n+1:2*n), KIND=DP)
c_d = c
DEALLOCATE(rnd_aux)
END SUBROUTINE fill_random
!
SUBROUTINE test_cft_1z_gpu(test)
USE fft_scalar, ONLY : cft_1z, cft_2xy, cfft3d, cfft3ds
USE fft_scalar, ONLY : cft_1z_gpu, cft_2xy_gpu, cfft3d_gpu, cfft3ds_gpu
USE fft_param, ONLY : DP
USE cudafor
implicit none
!
TYPE(tester_t) :: test
!
! stream
integer(kind = cuda_stream_kind) :: stream = 0
! size
integer, parameter :: nsl=100, nz=56, ldz=56
!
! array for random values
real(DP) :: rnd_aux(2 * nsl * ldz)
!
! variables on device
complex(DP), device :: c_d(nsl*ldz)
complex(DP), device :: cout_d(nsl*ldz)
! variables on host
complex(DP) :: c(nsl*ldz)
complex(DP) :: cout(nsl*ldz)
!
CALL fill_random(c, c_d, nsl*ldz)
!
! Check forward direction
CALL cft_1z(c, nsl, nz, ldz, 1, cout)
CALL cft_1z_gpu(c_d, nsl, nz, ldz, 1, cout_d, stream)
!
! Use c as auxiliary variable hosting GPU results
c = (0.d0, 0.d0)
c = cout_d
CALL test%assert_close( c, cout )
!
!
CALL fill_random(c, c_d, nsl*ldz)
!
! Check backward direction
CALL cft_1z(c, nsl, nz, ldz, -1, cout)
CALL cft_1z_gpu(c_d, nsl, nz, ldz, -1, cout_d, stream)
!
! Use c as auxiliary variable hosting GPU results
c = (0.d0, 0.d0)
c = cout_d
CALL test%assert_close( c, cout )
!
! == Same as above, for inplace call ==
!
CALL fill_random(c, c_d, nsl*ldz)
!
! Check forward direction
CALL cft_1z(c, nsl, nz, ldz, 1, cout)
CALL cft_1z_gpu(c_d, nsl, nz, ldz, 1, cout_d, stream, in_place=.true.)
!
! Use c as auxiliary variable hosting GPU results
c = (0.d0, 0.d0)
c = c_d
CALL test%assert_close( c, cout )
!
!
CALL fill_random(c, c_d, nsl*ldz)
!
! Check backward direction
CALL cft_1z(c, nsl, nz, ldz, -1, cout)
CALL cft_1z_gpu(c_d, nsl, nz, ldz, -1, cout_d, stream, in_place=.true.)
!
! Use c as auxiliary variable hosting GPU results
c = (0.d0, 0.d0)
c = c_d
CALL test%assert_close( c, cout )
!
END SUBROUTINE test_cft_1z_gpu
!
SUBROUTINE test_cft_2xy_gpu(test)
USE fft_scalar, ONLY : cft_2xy
USE fft_scalar, ONLY : cft_2xy_gpu
USE fft_param, ONLY : DP
USE cudafor
implicit none
!
TYPE(tester_t) :: test
!
! stream
integer(kind = cuda_stream_kind) :: stream = 0
! size
integer, parameter :: nx = 10, ny = 10, nzl = 5, ldx=10, ldy=10
!
! array for random values
real(DP) :: rnd_aux(2 * nzl * ldx * ldy)
! variables on device
complex(DP), device :: c_d(nzl * ldx * ldy)
complex(DP), device :: tmp_d(nzl * ldx * ldy)
! variables on host
complex(DP) :: c(nzl * ldx * ldy)
complex(DP) :: tmp(nzl * ldx * ldy)
!
CALL fill_random(c, c_d, nzl * ldx * ldy)
!
CALL cft_2xy_gpu(c_d, tmp_d, nzl, nx, ny, ldx, ldy, 1, stream)
CALL cft_2xy(c, nzl, nx, ny, ldx, ldy, 1)
!
! Use tmp as auxiliary variable hosting the GPU results
tmp = c_d
CALL test%assert_close( c, tmp )
!
CALL fill_random(c, c_d, nzl * ldx * ldy)
!
CALL cft_2xy_gpu(c_d, tmp_d, nzl, nx, ny, ldx, ldy, -1, stream)
CALL cft_2xy(c, nzl, nx, ny, ldx, ldy, -1)
!
! Use tmp as auxiliary variable hosting the GPU results
tmp = c_d
CALL test%assert_close( c, tmp )
!
END SUBROUTINE test_cft_2xy_gpu
!
SUBROUTINE test_cfft3d_gpu(test)
USE fft_scalar, ONLY : cfft3d
USE fft_scalar, ONLY : cfft3d_gpu
USE fft_param, ONLY : DP
USE cudafor
implicit none
!
TYPE(tester_t) :: test
!
! stream
integer(kind = cuda_stream_kind) :: stream = 0
! size
integer, parameter :: nx = 10, ny = 10, nz = 10
integer, parameter :: ldx= 10, ldy= 10, ldz= 10
#if ! defined(__DFTI)
integer, parameter :: howmany=1
#else
integer, parameter :: howmany=12
#endif
!
! array for random values
real(DP) :: rnd_aux(2 * howmany * ldx * ldy * ldz)
! variables on device
complex(DP), device :: c_d(howmany * ldx * ldy * ldz)
complex(DP), device :: tmp_d(howmany * ldx * ldy * ldz)
! variables on host
complex(DP) :: c(howmany * ldx * ldy * ldz)
complex(DP) :: tmp(howmany * ldx * ldy * ldz)
!
integer :: i, rs, re
!
#if ! defined(__DFTI)
print *, 'The current CPU scalar driver does not support howmany. Reverting to howmany 1'
#endif
CALL fill_random(c, c_d, howmany * ldx * ldy * ldz)
!
CALL cfft3d( c, nx, ny, nz, ldx, ldy, ldz, howmany, 1 )
CALL cfft3d_gpu( c_d, nx, ny, nz, ldx, ldy, ldz, howmany, 1, stream )
!
! Use tmp as auxiliary variable hosting the GPU results
tmp = c_d
DO i=0, howmany - 1
rs = i * ldx * ldy * ldz + 1
re = rs + nx * ny * nz - 1
CALL test%assert_close( c(rs:re), tmp(rs:re) )
END DO
!
CALL fill_random(c, c_d, howmany * ldx * ldy * ldz)
!
CALL cfft3d( c, nx, ny, nz, ldx, ldy, ldz, howmany, -1 )
CALL cfft3d_gpu( c_d, nx, ny, nz, ldx, ldy, ldz, howmany, -1, stream )
!
! Use tmp as auxiliary variable hosting the GPU results
tmp = c_d
DO i=0, howmany - 1
rs = i * ldx * ldy * ldz + 1
re = rs + nx * ny * nz - 1
CALL test%assert_close( c(rs:re), tmp(rs:re) )
END DO
!
END SUBROUTINE test_cfft3d_gpu
!
SUBROUTINE test_cfft3ds_gpu(test)
USE fft_scalar, ONLY : cfft3ds
USE fft_scalar, ONLY : cfft3ds_gpu
USE fft_param, ONLY : DP
USE cudafor
implicit none
!
TYPE(tester_t) :: test
!
! stream
integer(kind = cuda_stream_kind) :: stream = 0
! size
integer, parameter :: nx = 10, ny = 10, nz = 10
integer, parameter :: ldx= 10, ldy= 10, ldz= 10, howmany=1
!
! array for random values
real(DP) :: rnd_aux(2 * howmany * ldx * ldy * ldz)
! variables on device
complex(DP), device :: c_d(howmany * ldx * ldy * ldz)
complex(DP), device :: tmp_d(howmany * ldx * ldy * ldz)
! variables on host
complex(DP) :: c(howmany * ldx * ldy * ldz)
complex(DP) :: tmp(howmany * ldx * ldy * ldz)
integer :: do_fft_y(ldx), do_fft_z(ldx*ldy)
!
CALL fill_random(c, c_d, howmany * ldx * ldy * ldz)
do_fft_y = 1; do_fft_z = 1
!
CALL cfft3ds( c, nx, ny, nz, ldx, ldy, ldz, howmany, 1, do_fft_z, do_fft_y)
CALL cfft3ds_gpu( c_d, nx, ny, nz, ldx, ldy, ldz, howmany, 1, do_fft_z, do_fft_y, stream)
!
! Use tmp as auxiliary variable hosting the GPU results
tmp = c_d
CALL test%assert_close( c, tmp )
!
CALL fill_random(c, c_d, howmany * ldx * ldy * ldz)
!
CALL cfft3ds( c, nx, ny, nz, ldx, ldy, ldz, howmany, -1, do_fft_z, do_fft_y)
CALL cfft3ds_gpu( c_d, nx, ny, nz, ldx, ldy, ldz, howmany, -1, do_fft_z, do_fft_y, stream )
!
! Use tmp as auxiliary variable hosting the GPU results
tmp = c_d
CALL test%assert_close( c, tmp )
!
END SUBROUTINE test_cfft3ds_gpu
end program test_fft_scalar_gpu
#else
program test_fft_scalar_gpu
end program test_fft_scalar_gpu
#endif

FFTXlib/tests/test_fft_scatter_mod_gpu.f90 Normal file

@@ -0,0 +1,406 @@
#if defined(__CUDA)
program test_fft_scatter_mod_gpu
#if defined(__MPI)
USE mpi
#endif
USE tester
IMPLICIT NONE
! MPI type
type mpi_t
integer :: me, n, root, comm
end type mpi_t
TYPE(mpi_t) :: mp
!
TYPE(tester_t) :: test
!
INTEGER :: ierr, level, i
!
#if defined(__MPI)
#if defined(_OPENMP)
CALL MPI_Init_thread(MPI_THREAD_FUNNELED,level, ierr)
#else
CALL MPI_Init(ierr)
#endif
#endif
!
CALL mpi_data_init(mp%me, mp%n, mp%root, mp%comm)
!
CALL test%init()
!
test%tolerance64 = 1.d-14
!
CALL save_random_seed("test_fft_scatter_mod_gpu", mp%me)
!
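! Run each scatter test for every nyfft value (number of task groups)
! that divides the number of MPI ranks, in both gamma-only and k modes.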
DO i = 1, mp%n
IF (MOD(mp%n,i) == 0 ) THEN
! gamma case
CALL test_fft_scatter_xy_gpu_1(mp, test, .true., i)
! k case
CALL test_fft_scatter_xy_gpu_1(mp, test, .false., i)
!
! gamma case
CALL test_fft_scatter_yz_gpu_1(mp, test, .true., i)
! k case
CALL test_fft_scatter_yz_gpu_1(mp, test, .false., i)
END IF
END DO
CALL test_fft_scatter_many_yz_gpu_1(mp, test, .true., 1)
CALL test_fft_scatter_many_yz_gpu_1(mp, test, .false., 1)
!
CALL collect_results(test)
!
IF (mp%me == mp%root) CALL test%print()
!
#if defined(__MPI)
CALL MPI_Finalize(ierr)
#endif
CONTAINS
!
SUBROUTINE mpi_data_init(mpme, npes, mproot, comm)
implicit none
integer, intent(out) :: mpme, npes, mproot, comm
integer :: ierr
mpme=0; npes=1; mproot=0; comm=0
#if defined(__MPI)
CALL mpi_comm_rank(MPI_COMM_WORLD, mpme, ierr)
CALL mpi_comm_size(MPI_COMM_WORLD, npes, ierr)
comm = MPI_COMM_WORLD
#endif
END SUBROUTINE mpi_data_init
SUBROUTINE fft_desc_init(dfft, smap, flavor, gamma_only, parallel, comm, nyfft)
USE fft_types, ONLY : fft_type_descriptor, fft_type_init
USE stick_base, ONLY : sticks_map
USE fft_param, ONLY : DP
implicit none
TYPE(fft_type_descriptor) :: dfft
TYPE(sticks_map) :: smap
CHARACTER(LEN=*), INTENT(IN) :: flavor
LOGICAL :: gamma_only
LOGICAL :: parallel
INTEGER :: comm, nyfft
REAL(DP), PARAMETER :: pi=4.D0*DATAN(1.D0)
!
REAL(DP) :: at(3,3), bg(3,3)
!
at = RESHAPE((/1.d0, 0.d0, 0.d0, 0.d0, 1.d0, 0.d0, 0.d0, 0.d0, 1.d0/), shape(at))
bg = RESHAPE((/1.d0, 0.d0, 0.d0, 0.d0, 1.d0, 0.d0, 0.d0, 0.d0, 1.d0/), shape(bg))
bg = 2.d0*pi*bg
!
CALL fft_type_init(dfft, smap, flavor, gamma_only, parallel, comm, at, bg, 12.d0, 4.d0, nyfft=nyfft)
!
END SUBROUTINE fft_desc_init
SUBROUTINE fft_desc_finalize(dfft, smap)
USE fft_types, ONLY : fft_type_descriptor, fft_type_deallocate
USE stick_base, ONLY : sticks_map, sticks_map_deallocate
implicit none
TYPE(fft_type_descriptor) :: dfft
TYPE(sticks_map) :: smap
!
CALL fft_type_deallocate(dfft)
CALL sticks_map_deallocate( smap )
END SUBROUTINE fft_desc_finalize
!
SUBROUTINE fill_random(c, c_d, n)
USE cudafor
USE fft_param, ONLY : DP
implicit none
complex(DP), device :: c_d(:)
complex(DP) :: c(:)
integer, intent(in) :: n
!
real(DP), ALLOCATABLE :: rnd_aux(:)
!
ALLOCATE (rnd_aux(2*n))
CALL RANDOM_NUMBER(rnd_aux)
c = CMPLX(rnd_aux(1:n), rnd_aux(n+1:2*n), KIND=DP)
c_d = c
DEALLOCATE(rnd_aux)
END SUBROUTINE fill_random
!
SUBROUTINE test_fft_scatter_xy_gpu_1(mp, test, gamma_only, ny)
USE cudafor
USE fft_param, ONLY : DP
USE fft_types, ONLY : fft_type_descriptor
USE stick_base, ONLY : sticks_map
USE scatter_mod, ONLY : fft_scatter_xy
USE scatter_mod_gpu, ONLY : fft_scatter_xy_gpu
implicit none
TYPE(mpi_t) :: mp
TYPE(tester_t) :: test
!
TYPE(fft_type_descriptor) :: dfft
TYPE(sticks_map) :: smap
LOGICAL, INTENT(IN) :: gamma_only
INTEGER, INTENT(IN) :: ny
!
LOGICAL :: parallel
COMPLEX(DP), ALLOCATABLE :: scatter_in(:), scatter_out(:), aux(:)
COMPLEX(DP), ALLOCATABLE, DEVICE :: scatter_in_d(:), scatter_out_d(:)
integer(kind = cuda_stream_kind) :: stream = 0
integer :: fft_sign = 2
integer :: vsiz, nr1p_, compare_len, me2
!
parallel = mp%n .gt. 1
CALL fft_desc_init(dfft, smap, "wave", gamma_only, parallel, mp%comm, nyfft=ny)
me2 = dfft%mype2 + 1
vsiz = dfft%nnr
compare_len = dfft%nr1x * dfft%my_nr2p * dfft%my_nr3p
if (ny > 1) then
! When using task groups, wave FFTs are not distributed along Y
fft_sign = 3
vsiz = dfft%nnr_tg
compare_len = dfft%nr1x * dfft%nr2x * dfft%my_nr3p
end if
!
! Allocate variables
ALLOCATE(scatter_in(vsiz), scatter_out(vsiz), aux(vsiz))
ALLOCATE(scatter_in_d(vsiz), scatter_out_d(vsiz))
!
! Test 1
CALL fill_random(scatter_in, scatter_in_d, vsiz)
!
CALL fft_scatter_xy( dfft, scatter_in, scatter_out, vsiz, fft_sign )
CALL fft_scatter_xy_gpu( dfft, scatter_in_d, scatter_out_d, vsiz, fft_sign, stream )
aux(1:compare_len) = scatter_out_d(1:compare_len)
!
! Check
CALL test%assert_close( scatter_out(1:compare_len), aux(1:compare_len) )
!
! Test 2
CALL fill_random(scatter_in, scatter_in_d, vsiz)
!
CALL fft_scatter_xy( dfft, scatter_out, scatter_in, vsiz, -1*fft_sign )
CALL fft_scatter_xy_gpu( dfft, scatter_out_d, scatter_in_d, vsiz, -1*fft_sign, stream )
!
compare_len = dfft%nr2x * dfft%nr1w(me2) * dfft%my_nr3p
IF (ny > 1) compare_len = dfft%nr2x * dfft%nr1w_tg * dfft%my_nr3p
!
aux(1:compare_len) = scatter_out_d(1:compare_len)
! Check
CALL test%assert_close( scatter_out(1:compare_len), aux(1:compare_len) )
!
CALL fft_desc_finalize(dfft, smap)
DEALLOCATE(scatter_in, scatter_out, aux, scatter_in_d, scatter_out_d)
!
END SUBROUTINE test_fft_scatter_xy_gpu_1
!
SUBROUTINE test_fft_scatter_yz_gpu_1(mp, test, gamma_only, ny)
!
! This test checks the wave FFT scatter, with parallel = .true. if
! called with more than one MPI rank.
!
USE cudafor
USE fft_param, ONLY : DP
USE fft_types, ONLY : fft_type_descriptor
USE stick_base, ONLY : sticks_map
USE scatter_mod, ONLY : fft_scatter_yz
USE scatter_mod_gpu, ONLY : fft_scatter_yz_gpu
implicit none
TYPE(mpi_t) :: mp
TYPE(tester_t) :: test
!
TYPE(fft_type_descriptor) :: dfft
TYPE(sticks_map) :: smap
LOGICAL, INTENT(IN) :: gamma_only
INTEGER, INTENT(IN) :: ny
!
LOGICAL :: parallel
COMPLEX(DP), ALLOCATABLE :: scatter_in(:), scatter_out(:), aux(:)
COMPLEX(DP), ALLOCATABLE, DEVICE :: scatter_in_d(:), scatter_out_d(:)
integer(kind = cuda_stream_kind) :: stream = 0
integer :: fft_sign = 2
integer :: vsiz, compare_len, my_nr1p_
!
parallel = mp%n .gt. 1
CALL fft_desc_init(dfft, smap, "wave", gamma_only, parallel, mp%comm, nyfft=ny)
vsiz = dfft%nnr
my_nr1p_ = count(dfft%ir1w > 0)
if (ny > 1) then
fft_sign = 3
vsiz = dfft%nnr_tg
my_nr1p_ = count(dfft%ir1w_tg > 0)
end if
!
! Allocate variables
ALLOCATE(scatter_in(vsiz), scatter_out(vsiz), aux(vsiz))
ALLOCATE(scatter_in_d(vsiz), scatter_out_d(vsiz))
!
! Test 1
CALL fill_random(scatter_in, scatter_in_d, vsiz)
!
CALL fft_scatter_yz( dfft, scatter_in, scatter_out, vsiz, fft_sign )
CALL fft_scatter_yz_gpu( dfft, scatter_in_d, scatter_out_d, vsiz, fft_sign )
! Set the number of elements that should be strictly equivalent in the
! two implementations.
compare_len = dfft%my_nr3p*my_nr1p_*dfft%nr2x
aux(1:compare_len) = scatter_out_d(1:compare_len)
! Check
CALL test%assert_close( scatter_out(1:compare_len), aux(1:compare_len) )
!
! Test 2
CALL fill_random(scatter_in, scatter_in_d, vsiz)
!
CALL fft_scatter_yz( dfft, scatter_out, scatter_in, vsiz, -1*fft_sign )
CALL fft_scatter_yz_gpu( dfft, scatter_out_d, scatter_in_d, vsiz, -1*fft_sign )
!
compare_len = dfft%nsw(mp%me+1)*dfft%nr3x
aux(1:compare_len) = scatter_out_d(1:compare_len)
! Check
CALL test%assert_close( scatter_out(1:compare_len), aux(1:compare_len) )
!
CALL fft_desc_finalize(dfft, smap)
DEALLOCATE(scatter_in, scatter_out, aux, scatter_in_d, scatter_out_d)
!
END SUBROUTINE test_fft_scatter_yz_gpu_1
!
SUBROUTINE test_fft_scatter_many_yz_gpu_1(mp, test, gamma_only, ny)
!
! This test checks the wave FFT scatter, with parallel = .true. if
! called with more than one MPI rank.
!
USE cudafor
USE fft_param, ONLY : DP
USE fft_types, ONLY : fft_type_descriptor
USE stick_base, ONLY : sticks_map
USE scatter_mod, ONLY : fft_scatter_yz
USE scatter_mod_gpu, ONLY : fft_scatter_yz_gpu, fft_scatter_many_yz_gpu
implicit none
TYPE(mpi_t) :: mp
TYPE(tester_t) :: test
!
TYPE(fft_type_descriptor) :: dfft
TYPE(sticks_map) :: smap
LOGICAL, INTENT(IN) :: gamma_only
INTEGER, INTENT(IN) :: ny
INTEGER, PARAMETER :: howmany = 4
!
LOGICAL :: parallel
COMPLEX(DP), ALLOCATABLE :: scatter_in(:), scatter_in_cpy(:), scatter_out(:), aux(:)
COMPLEX(DP), ALLOCATABLE, DEVICE :: scatter_in_d(:), scatter_out_d(:), aux_d(:)
! convenient variables for slices
integer :: i, l, start_in, end_in, start_out, end_out, nstick_zx, n3, n3x, vsiz
integer :: start_sl, end_sl
!integer(kind = cuda_stream_kind) :: streams(5)
!
parallel = mp%n .gt. 1
IF (ny > 1) print *, 'scatter_many does not support task grouping'
CALL fft_desc_init(dfft, smap, "wave", gamma_only, parallel, mp%comm, nyfft=ny)
!
! Allocate variables
vsiz = dfft%nnr*howmany
ALLOCATE(scatter_in(vsiz), scatter_in_cpy(vsiz), scatter_out(vsiz), aux(vsiz))
ALLOCATE(scatter_in_d(vsiz), scatter_out_d(vsiz), aux_d(vsiz))
!
! How FFT allocates bunches in this case:
nstick_zx = MAXVAL(dfft%nsw)
n3 = dfft%nr3
n3x = dfft%nr3x
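! Note: the batched (many) variant lays slice i out starting at
! i*nstick_zx*n3x, while the one-at-a-time calls space slices by dfft%nnr.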
! Test 1
CALL fill_random(scatter_in, scatter_in_d, vsiz)
scatter_in_cpy = scatter_in
!
!print *, 'dfft%nnr, nr3, nr2, nr1 : ', dfft%nnr, dfft%nr3, dfft%nr2, dfft%nr1
! Test 1.1, compare scatter of slices
DO i=0,howmany-1
start_in = i*dfft%nnr + 1
end_in = (i+1)*dfft%nnr
start_out = start_in
!
end_out = (start_out-1) + dfft%my_nr3p*dfft%nr1w(dfft%mype2 +1)*dfft%nr2x
!
CALL fft_scatter_yz( dfft, scatter_in(start_in:end_in), scatter_out(start_out:end_out), dfft%nnr, 2 )
CALL fft_scatter_yz_gpu( dfft, scatter_in_d(start_in:end_in), scatter_out_d(start_out:end_out), dfft%nnr, 2 )
aux(start_out:end_out) = scatter_out_d(start_out:end_out)
! Check
CALL test%assert_close( scatter_out(start_out:end_out), aux(start_out:end_out) )
END DO
!
scatter_in = scatter_in_cpy
! Store data as expected in input
DO i=0,howmany-1
start_in = i*dfft%nnr + 1
end_in = (i+1)*dfft%nnr
start_out= i*nstick_zx*n3x+1
end_out = start_out + nstick_zx*n3 - 1
scatter_in_d( start_out : end_out ) = scatter_in(start_in:start_in+nstick_zx*n3-1)
END DO
CALL fft_scatter_many_yz_gpu ( dfft, scatter_in_d, scatter_out_d, vsiz, 2, howmany )
DO i=0,howmany-1
start_out = i*dfft%nnr + 1
end_out = (start_out-1) + dfft%my_nr3p*dfft%nr1w(dfft%mype2 +1)*dfft%nr2x
!
aux(start_out:end_out) = scatter_out_d(start_out:end_out)
!
CALL test%assert_close( scatter_out(start_out:end_out), aux(start_out:end_out) )
END DO
!
!
! Test 2
CALL fill_random(scatter_in, scatter_in_d, vsiz)
scatter_in_cpy = scatter_in
!
DO i=0,howmany-1
! Input data for fft_scatter_yz call
start_in = i*dfft%nnr + 1
end_in = (i+1)*dfft%nnr
!
! Where to store output data
start_out = start_in
end_out = end_in
CALL fft_scatter_yz( dfft, scatter_out(start_out:end_out), scatter_in(start_in:end_in), dfft%nnr, -2 )
CALL fft_scatter_yz_gpu( dfft, scatter_out_d(start_out:end_out), scatter_in_d(start_in:end_in), dfft%nnr, -2 )
!
aux(start_out:end_out) = scatter_out_d(start_out:end_out)
!
! Check only relevant part
end_out = start_out + dfft%nsw(mp%me+1)*n3 - 1
CALL test%assert_close( scatter_out(start_out:end_out), aux(start_out:end_out) )
END DO
!
!
! Now repeat the test, but doing all the FFTs in a single shot
scatter_in_d(1:vsiz) = scatter_in_cpy(1:vsiz)
!
CALL fft_scatter_many_yz_gpu ( dfft, scatter_out_d, scatter_in_d, vsiz, -2, howmany )
DO i=0,howmany-1
! Extract data from GPU. Data are spaced by nstick_zx*n3x
start_out = 1 + i*nstick_zx*n3x
end_out = (i+1)*nstick_zx*n3x
aux(start_out:end_out) = scatter_out_d(start_out:end_out)
!
start_in = i*dfft%nnr + 1
end_in = i*dfft%nnr + dfft%nsw(mp%me+1)*n3x
!
! Extract only the data needed to compare the two methods. Results from
! the previous calls to fft_scatter_yz are separated by nnr, while the
! batched results are separated by nstick_zx*n3x. Starting from start_out
! we compare dfft%nsw(mp%me+1)*n3x elements (don't forget the -1!).
start_sl = start_out
end_sl = (start_out - 1) + dfft%nsw(mp%me+1)*n3x
CALL test%assert_close( aux(start_sl:end_sl), scatter_out(start_in:end_in) )
END DO
!
CALL fft_desc_finalize(dfft, smap)
DEALLOCATE(scatter_in, scatter_in_cpy, scatter_out, aux, scatter_in_d, scatter_out_d)
!
END SUBROUTINE test_fft_scatter_many_yz_gpu_1
end program test_fft_scatter_mod_gpu
! dummy subroutines
subroutine stop_clock( label )
character(len=*) :: label
end subroutine stop_clock
subroutine start_clock( label )
character(len=*) :: label
end subroutine start_clock
!
#else
program test_fft_scatter_mod_gpu
end program test_fft_scatter_mod_gpu
#endif

FFTXlib/tests/test_fwinv_gpu.f90 Normal file

@@ -0,0 +1,424 @@
#if defined(__CUDA)
program test_fwinv_gpu
#if defined(__MPI)
USE mpi
#endif
USE tester
IMPLICIT NONE
! MPI type
type mpi_t
integer :: me, n, root, comm
end type mpi_t
TYPE(mpi_t) :: mp
!
TYPE(tester_t) :: test
!
INTEGER :: ierr, level, i
!
#if defined(__MPI)
#if defined(_OPENMP)
CALL MPI_Init_thread(MPI_THREAD_FUNNELED,level, ierr)
#else
CALL MPI_Init(ierr)
#endif
#endif
!
CALL mpi_data_init(mp%me, mp%n, mp%root, mp%comm)
!
CALL test%init()
!
test%tolerance64 = 1.d-12
!
CALL save_random_seed("test_fwinv_gpu", mp%me)
!
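! Run the forward/inverse FFT tests for every nyfft value that divides
! the number of MPI ranks, in both gamma-only and k modes.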
DO i = 1, mp%n
IF (MOD(mp%n,i) == 0 ) THEN
CALL test_fwfft_gpu_1(mp, test, .true., i)
CALL test_fwfft_gpu_1(mp, test, .false., i)
!
CALL test_invfft_gpu_1(mp, test, .true., i)
CALL test_invfft_gpu_1(mp, test, .false., i)
END IF
END DO
CALL test_fwfft_many_gpu_1(mp, test, .true., 1)
CALL test_fwfft_many_gpu_1(mp, test, .false., 1)
!
CALL test_invfft_many_gpu_1(mp, test, .true., 1)
CALL test_invfft_many_gpu_1(mp, test, .false., 1)
!
CALL collect_results(test)
!
IF (mp%me == mp%root) CALL test%print()
!
#if defined(__MPI)
CALL MPI_Finalize(ierr)
#endif
CONTAINS
!
SUBROUTINE mpi_data_init(mpme, npes, mproot, comm)
implicit none
integer, intent(out) :: mpme, npes, mproot, comm
integer :: ierr
mpme=0; npes=1; mproot=0; comm=0
#if defined(__MPI)
CALL mpi_comm_rank(MPI_COMM_WORLD, mpme, ierr)
CALL mpi_comm_size(MPI_COMM_WORLD, npes, ierr)
comm = MPI_COMM_WORLD
#endif
END SUBROUTINE mpi_data_init
!
SUBROUTINE calc_bg(at, bg)
USE fft_param, ONLY : DP
implicit none
REAL(DP), PARAMETER :: pi=4.D0*DATAN(1.D0)
REAL(DP), INTENT(IN) :: at(3,3)
REAL(DP), INTENT(OUT) :: bg(3,3)
REAL(DP) :: ucvol
!
ucvol=at(1,1)*(at(2,2)*at(3,3)-at(3,2)*at(2,3))+&
& at(2,1)*(at(3,2)*at(1,3)-at(1,2)*at(3,3))+&
& at(3,1)*(at(1,2)*at(2,3)-at(2,2)*at(1,3))
!calculate reciprocal-space lattice vectors
bg(1,1)=2.0*pi*(at(2,2)*at(3,3)-at(3,2)*at(2,3))/ucvol
bg(2,1)=2.0*pi*(at(3,2)*at(1,3)-at(1,2)*at(3,3))/ucvol
bg(3,1)=2.0*pi*(at(1,2)*at(2,3)-at(2,2)*at(1,3))/ucvol
bg(1,2)=2.0*pi*(at(2,3)*at(3,1)-at(3,3)*at(2,1))/ucvol
bg(2,2)=2.0*pi*(at(3,3)*at(1,1)-at(1,3)*at(3,1))/ucvol
bg(3,2)=2.0*pi*(at(1,3)*at(2,1)-at(2,3)*at(1,1))/ucvol
bg(1,3)=2.0*pi*(at(2,1)*at(3,2)-at(3,1)*at(2,2))/ucvol
bg(2,3)=2.0*pi*(at(3,1)*at(1,2)-at(1,1)*at(3,2))/ucvol
bg(3,3)=2.0*pi*(at(1,1)*at(2,2)-at(2,1)*at(1,2))/ucvol
END SUBROUTINE calc_bg
!
SUBROUTINE fft_desc_init(dfft, smap, flavor, gamma_only, parallel, comm, nyfft)
USE stick_base
USE fft_types, ONLY : fft_type_descriptor, fft_type_init
USE fft_param, ONLY : DP
implicit none
TYPE(fft_type_descriptor) :: dfft
TYPE(sticks_map) :: smap
CHARACTER(LEN=*), INTENT(IN) :: flavor
LOGICAL :: gamma_only
LOGICAL :: parallel
INTEGER :: comm, nyfft
!
REAL(DP) :: at(3,3), bg(3,3)
!
at = RESHAPE((/10.d0, 0.d0, 0.d0, 0.d0, 10.d0, 0.d0, 0.d0, 0.d0, 10.d0/), shape(at))
CALL calc_bg(at, bg)
!
CALL fft_type_init(dfft, smap, flavor, gamma_only, parallel, comm, at, bg, 12.d0, 6.d0, nyfft=nyfft)
!
END SUBROUTINE fft_desc_init
SUBROUTINE fft_desc_finalize(dfft, smap)
USE fft_types, ONLY : fft_type_descriptor, fft_type_deallocate
USE stick_base, ONLY : sticks_map, sticks_map_deallocate
implicit none
TYPE(fft_type_descriptor) :: dfft
TYPE(sticks_map) :: smap
!
CALL fft_type_deallocate(dfft)
CALL sticks_map_deallocate( smap )
END SUBROUTINE fft_desc_finalize
!
SUBROUTINE fill_random(c, c_d, n)
USE cudafor
USE fft_param, ONLY : DP
implicit none
complex(DP), device :: c_d(:)
complex(DP) :: c(:)
integer, intent(in) :: n
!
real(DP), ALLOCATABLE :: rnd_aux(:)
!
ALLOCATE (rnd_aux(2*n))
CALL RANDOM_NUMBER(rnd_aux)
c = CMPLX(rnd_aux(1:n), rnd_aux(n+1:2*n), KIND=DP)
c_d = c
DEALLOCATE(rnd_aux)
END SUBROUTINE fill_random
!
SUBROUTINE test_fwfft_gpu_1(mp, test, gamma_only, ny)
USE cudafor
USE fft_param, ONLY : DP
USE fft_types, ONLY : fft_type_descriptor
USE stick_base, ONLY : sticks_map
USE fft_interfaces, ONLY : fwfft
implicit none
TYPE(mpi_t) :: mp
TYPE(tester_t) :: test
!
TYPE(fft_type_descriptor) :: dfft
TYPE(sticks_map) :: smap
LOGICAL, INTENT(IN) :: gamma_only
INTEGER, INTENT(IN) :: ny
!
LOGICAL :: parallel
COMPLEX(DP), ALLOCATABLE :: data_in(:), aux(:)
COMPLEX(DP), ALLOCATABLE, DEVICE :: data_in_d(:)
INTEGER :: i
!
parallel = mp%n .gt. 1
CALL fft_desc_init(dfft, smap, 'wave', gamma_only, parallel, mp%comm, nyfft=ny)
dfft%rho_clock_label='bla' ; dfft%wave_clock_label='bla'
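! fwfft/invfft time themselves via start_clock/stop_clock with these
! labels; the dummy clock routines at the end of this file do nothing,
! so any non-empty placeholder label works.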
!
! Test 1
!
IF ( ny .gt. 1 ) THEN
! Allocate variables
ALLOCATE(data_in(dfft%nnr_tg), aux(dfft%nnr_tg))
ALLOCATE(data_in_d(dfft%nnr_tg))
CALL fill_random(data_in, data_in_d, dfft%nnr_tg)
!
CALL fwfft( 'tgWave' , data_in, dfft, 1 )
CALL fwfft( 'tgWave' , data_in_d, dfft, 1 )
ELSE
ALLOCATE(data_in(dfft%nnr), aux(dfft%nnr))
ALLOCATE(data_in_d(dfft%nnr))
CALL fill_random(data_in, data_in_d, dfft%nnr)
!
CALL fwfft( 'Wave' , data_in, dfft, 1 )
CALL fwfft( 'Wave' , data_in_d, dfft, 1 )
ENDIF
aux = data_in_d
! Check
CALL test%assert_close( data_in(1:dfft%ngw), aux(1:dfft%ngw) )
!
! Test 2
!
DEALLOCATE(data_in, data_in_d, aux)
ALLOCATE(data_in(dfft%nnr), aux(dfft%nnr))
ALLOCATE(data_in_d(dfft%nnr))
CALL fill_random(data_in, data_in_d, dfft%nnr)
!
CALL fwfft( 'Rho' , data_in, dfft, 1 )
CALL fwfft( 'Rho' , data_in_d, dfft, 1 )
aux = data_in_d
! Check
CALL test%assert_close( data_in(1:dfft%ngm), aux(1:dfft%ngm) )
!
CALL fft_desc_finalize(dfft, smap)
DEALLOCATE(data_in, data_in_d, aux)
!
END SUBROUTINE test_fwfft_gpu_1
!
SUBROUTINE test_invfft_gpu_1(mp, test, gamma_only, ny)
USE cudafor
USE fft_param, ONLY : DP
USE fft_types, ONLY : fft_type_descriptor
USE stick_base, ONLY : sticks_map
USE fft_interfaces, ONLY : invfft
implicit none
TYPE(mpi_t) :: mp
TYPE(tester_t) :: test
!
TYPE(fft_type_descriptor) :: dfft
TYPE(sticks_map) :: smap
LOGICAL, INTENT(IN) :: gamma_only
INTEGER, INTENT(IN) :: ny
!
LOGICAL :: parallel
COMPLEX(DP), ALLOCATABLE :: data_in(:), aux(:)
COMPLEX(DP), ALLOCATABLE, DEVICE :: data_in_d(:)
!
parallel = mp%n .gt. 1
CALL fft_desc_init(dfft, smap, 'wave', gamma_only, parallel, mp%comm, nyfft=ny)
dfft%rho_clock_label='bla' ; dfft%wave_clock_label='bla'
!
! Test 1
!
IF ( ny .gt. 1 ) THEN
! Allocate variables
ALLOCATE(data_in(dfft%nnr_tg), aux(dfft%nnr_tg))
ALLOCATE(data_in_d(dfft%nnr_tg))
CALL fill_random(data_in, data_in_d, dfft%nnr_tg)
!
CALL invfft( 'tgWave' , data_in, dfft, 1 )
CALL invfft( 'tgWave' , data_in_d, dfft, 1 )
ELSE
!
! Allocate variables
ALLOCATE(data_in(dfft%nnr), aux(dfft%nnr))
ALLOCATE(data_in_d(dfft%nnr))
CALL fill_random(data_in, data_in_d, dfft%nnr)
!
CALL invfft( 'Wave' , data_in, dfft, 1 )
CALL invfft( 'Wave' , data_in_d, dfft, 1 )
ENDIF
aux = data_in_d
! Check
CALL test%assert_close( data_in, aux )
!
! Test 2
!
DEALLOCATE(data_in, data_in_d, aux)
ALLOCATE(data_in(dfft%nnr), aux(dfft%nnr))
ALLOCATE(data_in_d(dfft%nnr))
CALL fill_random(data_in, data_in_d, dfft%nnr)
!
CALL invfft( 'Rho' , data_in, dfft, 1 )
CALL invfft( 'Rho' , data_in_d, dfft, 1 )
aux = data_in_d
! Check
CALL test%assert_close( data_in, aux )
!
CALL fft_desc_finalize(dfft, smap)
DEALLOCATE(data_in, data_in_d, aux)
!
END SUBROUTINE test_invfft_gpu_1
!
SUBROUTINE test_fwfft_many_gpu_1(mp, test, gamma_only, ny)
USE cudafor
USE fft_param, ONLY : DP
USE fft_types, ONLY : fft_type_descriptor
USE stick_base, ONLY : sticks_map
USE fft_interfaces, ONLY : fwfft
implicit none
TYPE(mpi_t) :: mp
TYPE(tester_t) :: test
!
TYPE(fft_type_descriptor) :: dfft
TYPE(sticks_map) :: smap
LOGICAL, INTENT(IN) :: gamma_only
INTEGER, INTENT(IN) :: ny
!
LOGICAL :: parallel
COMPLEX(DP), ALLOCATABLE :: data_in(:), aux(:)
COMPLEX(DP), ALLOCATABLE, DEVICE :: data_in_d(:)
integer, parameter :: howmany=4
INTEGER :: i, start
!
parallel = mp%n .gt. 1
CALL fft_desc_init(dfft, smap, 'wave', gamma_only, parallel, mp%comm, nyfft=ny)
dfft%rho_clock_label='bla' ; dfft%wave_clock_label='bla'
!
! Test 1
!
IF ( ny .gt. 1 ) THEN
! Not (yet?) possible
RETURN
ELSE
ALLOCATE(data_in(dfft%nnr*howmany), aux(dfft%nnr*howmany))
ALLOCATE(data_in_d(dfft%nnr*howmany))
CALL fill_random(data_in, data_in_d, dfft%nnr*howmany)
!
CALL fwfft( 'Wave' , data_in_d, dfft, howmany=howmany)
!
DO i=0,howmany-1
start = i*dfft%nnr
CALL fwfft( 'Wave' , data_in(1+start:), dfft, 1 )
aux(start+1:start+dfft%nnr) = data_in_d(start+1:start+dfft%nnr)
! Check
CALL test%assert_close( data_in(start+1:start+dfft%ngw), aux(start+1:start+dfft%ngw) )
END DO
!
ENDIF
!
! Test 2
!
DEALLOCATE(data_in, data_in_d, aux)
ALLOCATE(data_in(dfft%nnr*howmany), aux(dfft%nnr))
ALLOCATE(data_in_d(dfft%nnr*howmany))
!
CALL fill_random(data_in, data_in_d, dfft%nnr*howmany)
!
CALL fwfft( 'Rho' , data_in_d, dfft, howmany)
DO i=0,howmany-1
start = i*dfft%nnr
CALL fwfft( 'Rho' , data_in(1+start:), dfft, 1 )
aux(1:dfft%nnr) = data_in_d(start+1:start+dfft%nnr)
! Check
CALL test%assert_close( data_in(start+1:start+dfft%ngm), aux(1:dfft%ngm) )
END DO
!
CALL fft_desc_finalize(dfft, smap)
DEALLOCATE(data_in, data_in_d, aux)
!
END SUBROUTINE test_fwfft_many_gpu_1
!
SUBROUTINE test_invfft_many_gpu_1(mp, test, gamma_only, ny)
USE cudafor
USE fft_param, ONLY : DP
USE fft_types, ONLY : fft_type_descriptor
USE stick_base, ONLY : sticks_map
USE fft_interfaces, ONLY : invfft
implicit none
TYPE(mpi_t) :: mp
TYPE(tester_t) :: test
!
TYPE(fft_type_descriptor) :: dfft
TYPE(sticks_map) :: smap
LOGICAL, INTENT(IN) :: gamma_only
INTEGER, INTENT(IN) :: ny
!
LOGICAL :: parallel
COMPLEX(DP), ALLOCATABLE :: data_in(:), aux(:)
COMPLEX(DP), ALLOCATABLE, DEVICE :: data_in_d(:)
INTEGER(kind = cuda_stream_kind) :: strm = 0
integer, parameter :: howmany=4
integer :: start, i
!
parallel = mp%n .gt. 1
CALL fft_desc_init(dfft, smap, 'wave', gamma_only, parallel, mp%comm, nyfft=ny)
dfft%rho_clock_label='bla' ; dfft%wave_clock_label='bla'
!
! Test 1
!
IF ( ny .gt. 1 ) THEN
! Not (yet?) possible
RETURN
ELSE
!
! Allocate variables
ALLOCATE(data_in(howmany*dfft%nnr), aux(howmany*dfft%nnr))
ALLOCATE(data_in_d(howmany*dfft%nnr))
CALL fill_random(data_in, data_in_d, howmany*dfft%nnr)
!
!CALL invfft( 'Wave' , data_in, dfft, 1 )
CALL invfft( 'Wave' , data_in_d, dfft, howmany=howmany, stream=strm )
DO i=0,howmany-1
start = i*dfft%nnr
CALL invfft( 'Wave' , data_in(1+start:), dfft, 1 )
aux(start+1:start+dfft%nnr) = data_in_d(start+1:start+dfft%nnr)
! Check
CALL test%assert_close( data_in(start+1:start+dfft%nnr), aux(start+1:start+dfft%nnr) )
END DO
ENDIF
!
! Test 2
!
DEALLOCATE(data_in, data_in_d, aux)
ALLOCATE(data_in(dfft%nnr*howmany), aux(dfft%nnr))
ALLOCATE(data_in_d(dfft%nnr*howmany))
CALL fill_random(data_in, data_in_d, dfft%nnr*howmany)
!
CALL invfft( 'Rho' , data_in_d, dfft, howmany )
!
DO i=0,howmany-1
start = i*dfft%nnr
CALL invfft( 'Rho' , data_in(1+start:), dfft, 1 )
aux(1:dfft%nnr) = data_in_d(start+1:start+dfft%nnr)
! Check
CALL test%assert_close( data_in(start+1:start+dfft%nnr), aux(1:dfft%nnr) )
END DO
!
CALL fft_desc_finalize(dfft, smap)
DEALLOCATE(data_in, data_in_d, aux)
!
END SUBROUTINE test_invfft_many_gpu_1
end program test_fwinv_gpu
! dummy subroutines
subroutine stop_clock( label )
character(len=*) :: label
end subroutine stop_clock
subroutine start_clock( label )
character(len=*) :: label
end subroutine start_clock
!
#else
program test_fwinv_gpu
end program test_fwinv_gpu
#endif

FFTXlib/tests/tester.f90 Normal file

@@ -0,0 +1,862 @@
! This file is part of fortran_tester
! Copyright 2015 Pierre de Buyl and Peter Colberg
! 2016 Pierre de Buyl and Stefano Szaghi
! 2018 Pierre de Buyl and Pietro Bonfa
! License: BSD
!> Routines to test Fortran programs
!!
!! fortran_tester is a pure-Fortran module. It provides a datatype to hold test results and
!! routines to test for equality, closeness, and positivity of variables. The routines are
!! overloaded and the resulting interface consists of a small number of names.
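!!
!! A minimal usage sketch (kinds from iso_fortran_env):
!!
!!     use, intrinsic :: iso_fortran_env, only : int32, real64
!!     use tester
!!     type(tester_t) :: test
!!     call test%init()
!!     call test%assert_equal(1_int32, 1_int32)
!!     call test%assert_close(1.0_real64, 1.0_real64 + 1.0e-16_real64)
!!     call test%print()   ! reports the error count; stops if any test failed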
module tester
use, intrinsic :: iso_fortran_env, only : int8, int16, int32, int64, real32, real64
implicit none
private
public :: tester_t
!> The main **tester** class.
type :: tester_t
integer(int32) :: n_errors=0_int32 !< Number of errors.
integer(int32) :: n_tests=0_int32 !< Number of tests.
real(real32) :: tolerance32=2._real32*epsilon(1._real32) !< Real tolerance, 32 bits.
real(real64) :: tolerance64=2._real64*epsilon(1._real64) !< Real tolerance, 64 bits.
contains
procedure :: init !< Initialize the tester.
procedure :: print !< Print tests results.
generic, public :: assert_equal => &
assert_equal_i8, &
assert_equal_i16, &
assert_equal_i32, &
assert_equal_i64, &
assert_equal_r32, &
assert_equal_r64, &
assert_equal_c32, &
assert_equal_c64, &
assert_equal_l, &
assert_equal_i8_1, &
assert_equal_i16_1, &
assert_equal_i32_1, &
assert_equal_i64_1, &
assert_equal_r32_1, &
assert_equal_r64_1, &
assert_equal_c32_1, &
assert_equal_c64_1, &
assert_equal_l_1 !< Check if two values (integer, real, complex or logical) are equal.
procedure, private :: assert_equal_i8 !< Check if two integers (8 bits) are equal.
procedure, private :: assert_equal_i16 !< Check if two integers (16 bits) are equal.
procedure, private :: assert_equal_i32 !< Check if two integers (32 bits) are equal.
procedure, private :: assert_equal_i64 !< Check if two integers (64 bits) are equal.
procedure, private :: assert_equal_r32 !< Check if two reals (32 bits) are equal.
procedure, private :: assert_equal_r64 !< Check if two reals (64 bits) are equal.
procedure, private :: assert_equal_c32 !< Check if two complex numbers (32 bits) are equal.
procedure, private :: assert_equal_c64 !< Check if two complex numbers (64 bits) are equal.
procedure, private :: assert_equal_l !< Check if two logicals are equal.
procedure, private :: assert_equal_i8_1 !< Check if two integer (8 bits) arrays (rank 1) are equal.
procedure, private :: assert_equal_i16_1 !< Check if two integer (16 bits) arrays (rank 1) are equal.
procedure, private :: assert_equal_i32_1 !< Check if two integer (32 bits) arrays (rank 1) are equal.
procedure, private :: assert_equal_i64_1 !< Check if two integer (64 bits) arrays (rank 1) are equal.
procedure, private :: assert_equal_r32_1 !< Check if two real (32 bits) arrays (rank 1) are equal.
procedure, private :: assert_equal_r64_1 !< Check if two real (64 bits) arrays (rank 1) are equal.
procedure, private :: assert_equal_c32_1 !< Check if two complex (32 bits) arrays (rank 1) are equal.
procedure, private :: assert_equal_c64_1 !< Check if two complex (64 bits) arrays (rank 1) are equal.
procedure, private :: assert_equal_l_1 !< Check if two logical arrays (rank 1) are equal.
generic, public :: assert_positive => &
assert_positive_i8, &
assert_positive_i16, &
assert_positive_i32, &
assert_positive_i64, &
assert_positive_r32, &
assert_positive_r64, &
assert_positive_i8_1, &
assert_positive_i16_1, &
assert_positive_i32_1, &
assert_positive_i64_1, &
assert_positive_r32_1, &
assert_positive_r64_1 !< Check if a number (integer or real) is positive.
procedure, private :: assert_positive_i8 !< Check if an integer (8 bits) is positive.
procedure, private :: assert_positive_i16 !< Check if an integer (16 bits) is positive.
procedure, private :: assert_positive_i32 !< Check if an integer (32 bits) is positive.
procedure, private :: assert_positive_i64 !< Check if an integer (64 bits) is positive.
procedure, private :: assert_positive_r32 !< Check if a real (32 bits) is positive.
procedure, private :: assert_positive_r64 !< Check if a real (64 bits) is positive.
procedure, private :: assert_positive_i8_1 !< Check if an integer (8 bits) array (rank 1) is positive.
procedure, private :: assert_positive_i16_1 !< Check if an integer (16 bits) array (rank 1) is positive.
procedure, private :: assert_positive_i32_1 !< Check if an integer (32 bits) array (rank 1) is positive.
procedure, private :: assert_positive_i64_1 !< Check if an integer (64 bits) array (rank 1) is positive.
procedure, private :: assert_positive_r32_1 !< Check if a real (32 bits) array (rank 1) is positive.
procedure, private :: assert_positive_r64_1 !< Check if a real (64 bits) array (rank 1) is positive.
generic, public :: assert_close => &
assert_close_r32, &
assert_close_r64, &
assert_close_c32, &
assert_close_c64, &
assert_close_r32_1, &
assert_close_r64_1, &
assert_close_c32_1, &
assert_close_c64_1 !< Check if two values (real or complex) are close with respect to a tolerance.
procedure, private :: assert_close_r32 !< Check if two reals (32 bits) are close with respect to a tolerance.
procedure, private :: assert_close_r64 !< Check if two reals (64 bits) are close with respect to a tolerance.
procedure, private :: assert_close_c32 !< Check if two complex numbers (32 bits) are close with respect to a tolerance.
procedure, private :: assert_close_c64 !< Check if two complex numbers (64 bits) are close with respect to a tolerance.
procedure, private :: assert_close_r32_1 !< Check if two real (32 bits) arrays (rank 1) are close with respect to a tolerance.
procedure, private :: assert_close_r64_1 !< Check if two real (64 bits) arrays (rank 1) are close with respect to a tolerance.
procedure, private :: assert_close_c32_1 !< Check if two complex (32 bits) arrays (rank 1) are close with respect to a tolerance.
procedure, private :: assert_close_c64_1 !< Check if two complex (64 bits) arrays (rank 1) are close with respect to a tolerance.
end type tester_t
contains
!> Initialize the tester.
subroutine init(this, tolerance32, tolerance64)
class(tester_t), intent(out) :: this !< The tester.
real(real32), intent(in), optional :: tolerance32 !< Real tolerance, 32 bits.
real(real64), intent(in), optional :: tolerance64 !< Real tolerance, 64 bits.
this% n_errors = 0
this% n_tests = 0
if (present(tolerance64)) then
this% tolerance64 = tolerance64
else
this% tolerance64 = 2._real64*epsilon(1._real64)
end if
if (present(tolerance32)) then
this% tolerance32 = tolerance32
else
this% tolerance32 = 2._real32*epsilon(1._real32)
end if
end subroutine init
!> Print tests results.
subroutine print(this, errorstop)
class(tester_t), intent(in) :: this !< The tester.
logical, intent(in), optional :: errorstop !< Flag to activate error stop if one test fails.
logical :: do_errorstop
if (present(errorstop)) then
do_errorstop = errorstop
else
do_errorstop = .true.
end if
write(*,*) 'fortran_tester:', this% n_errors, ' error(s) for', this% n_tests, 'test(s)'
if (this% n_errors == 0) then
write(*,*) 'fortran_tester: all tests succeeded'
else
write(*,*) 'fortran_tester: tests failed'
if (do_errorstop) then
stop 1
end if
end if
end subroutine print
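!
! Note on the optional "fail" flag accepted by every assertion below:
! when fail=.true. a mismatch is expected and is not counted as an
! error; otherwise any mismatch increments n_errors. n_tests always grows.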
!> Check if two integers (8 bits) are equal.
subroutine assert_equal_i8(this, i1, i2, fail)
class(tester_t), intent(inout) :: this !< The tester.
integer(int8), intent(in) :: i1 !< Value to compare.
integer(int8), intent(in) :: i2 !< Value to compare.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if (i1 .ne. i2) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end subroutine assert_equal_i8
!> Check if two integers (16 bits) are equal.
subroutine assert_equal_i16(this, i1, i2, fail)
class(tester_t), intent(inout) :: this !< The tester.
integer(int16), intent(in) :: i1 !< Value to compare.
integer(int16), intent(in) :: i2 !< Value to compare.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if (i1 .ne. i2) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end subroutine assert_equal_i16
!> Check if two integers (32 bits) are equal.
subroutine assert_equal_i32(this, i1, i2, fail)
class(tester_t), intent(inout) :: this !< The tester.
integer(int32), intent(in) :: i1 !< Value to compare.
integer(int32), intent(in) :: i2 !< Value to compare.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if (i1 .ne. i2) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end subroutine assert_equal_i32
!> Check if two integers (64 bits) are equal.
subroutine assert_equal_i64(this, i1, i2, fail)
class(tester_t), intent(inout) :: this !< The tester.
integer(int64), intent(in) :: i1 !< Value to compare.
integer(int64), intent(in) :: i2 !< Value to compare.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if (i1 .ne. i2) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end subroutine assert_equal_i64
!> Check if two reals (32 bits) are equal.
subroutine assert_equal_r32(this, r1, r2, fail)
class(tester_t), intent(inout) :: this !< The tester.
real(real32), intent(in) :: r1 !< Value to compare.
real(real32), intent(in) :: r2 !< Value to compare.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if (r1 .ne. r2) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end subroutine assert_equal_r32
!> Check if two reals (64 bits) are equal.
subroutine assert_equal_r64(this, r1, r2, fail)
class(tester_t), intent(inout) :: this !< The tester.
real(real64), intent(in) :: r1 !< Value to compare.
real(real64), intent(in) :: r2 !< Value to compare.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if (r1 .ne. r2) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end subroutine assert_equal_r64
!> Check if two complex numbers (32 bits) are equal.
subroutine assert_equal_c32(this, c1, c2, fail)
class(tester_t), intent(inout) :: this !< The tester.
complex(real32), intent(in) :: c1 !< Value to compare.
complex(real32), intent(in) :: c2 !< Value to compare.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if (c1 .ne. c2) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end subroutine assert_equal_c32
!> Check if two complex numbers (64 bits) are equal.
subroutine assert_equal_c64(this, c1, c2, fail)
class(tester_t), intent(inout) :: this !< The tester.
complex(real64), intent(in) :: c1 !< Value to compare.
complex(real64), intent(in) :: c2 !< Value to compare.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if (c1 .ne. c2) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end subroutine assert_equal_c64
!> Check if two logicals are equal.
subroutine assert_equal_l(this, l1, l2, fail)
class(tester_t), intent(inout) :: this !< The tester.
logical, intent(in) :: l1 !< Value to compare.
logical, intent(in) :: l2 !< Value to compare.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if (l1 .neqv. l2) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end subroutine assert_equal_l
!> Check if two integer (8 bits) arrays (rank 1) are equal.
subroutine assert_equal_i8_1(this, i1, i2, fail)
class(tester_t), intent(inout) :: this !< The tester.
integer(int8), dimension(:), intent(in) :: i1 !< Value to compare.
integer(int8), dimension(:), intent(in) :: i2 !< Value to compare.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if ( size(i1) .ne. size(i2) ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
else
if ( maxval(abs(i1-i2)) > 0 ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end if
end subroutine assert_equal_i8_1
!> Check if two integer (16 bits) arrays (rank 1) are equal.
subroutine assert_equal_i16_1(this, i1, i2, fail)
class(tester_t), intent(inout) :: this !< The tester.
integer(int16), dimension(:), intent(in) :: i1 !< Value to compare.
integer(int16), dimension(:), intent(in) :: i2 !< Value to compare.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if ( size(i1) .ne. size(i2) ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
else
if ( maxval(abs(i1-i2)) > 0 ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end if
end subroutine assert_equal_i16_1
!> Check if two integer (32 bits) arrays (rank 1) are equal.
subroutine assert_equal_i32_1(this, i1, i2, fail)
class(tester_t), intent(inout) :: this !< The tester.
integer(int32), dimension(:), intent(in) :: i1 !< Value to compare.
integer(int32), dimension(:), intent(in) :: i2 !< Value to compare.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if ( size(i1) .ne. size(i2) ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
else
if ( maxval(abs(i1-i2)) > 0 ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end if
end subroutine assert_equal_i32_1
!> Check if two integer (64 bits) arrays (rank 1) are equal.
subroutine assert_equal_i64_1(this, i1, i2, fail)
class(tester_t), intent(inout) :: this !< The tester.
integer(int64), dimension(:), intent(in) :: i1 !< Value to compare.
integer(int64), dimension(:), intent(in) :: i2 !< Value to compare.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if ( size(i1) .ne. size(i2) ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
else
if ( maxval(abs(i1-i2)) > 0 ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end if
end subroutine assert_equal_i64_1
!> Check if two real (32 bits) arrays (rank 1) are equal.
subroutine assert_equal_r32_1(this, r1, r2, fail)
class(tester_t), intent(inout) :: this !< The tester.
real(real32), dimension(:), intent(in) :: r1 !< Value to compare.
real(real32), dimension(:), intent(in) :: r2 !< Value to compare.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if ( size(r1) .ne. size(r2) ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
else
if ( maxval(abs(r1-r2)) > 0 ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end if
end subroutine assert_equal_r32_1
!> Check if two real (64 bits) arrays (rank 1) are equal.
subroutine assert_equal_r64_1(this, r1, r2, fail)
class(tester_t), intent(inout) :: this !< The tester.
real(real64), dimension(:), intent(in) :: r1 !< Value to compare.
real(real64), dimension(:), intent(in) :: r2 !< Value to compare.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if ( size(r1) .ne. size(r2) ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
else
if ( maxval(abs(r1-r2)) > 0 ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end if
end subroutine assert_equal_r64_1
!> Check if two complex (32 bits) arrays (rank 1) are equal.
subroutine assert_equal_c32_1(this, c1, c2, fail)
class(tester_t), intent(inout) :: this !< The tester.
complex(real32), dimension(:), intent(in) :: c1 !< Value to compare.
complex(real32), dimension(:), intent(in) :: c2 !< Value to compare.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if ( size(c1) .ne. size(c2) ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
else
if ( maxval(abs(c1-c2)) > 0 ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end if
end subroutine assert_equal_c32_1
!> Check if two complex (64 bits) arrays (rank 1) are equal.
subroutine assert_equal_c64_1(this, c1, c2, fail)
class(tester_t), intent(inout) :: this !< The tester.
complex(real64), dimension(:), intent(in) :: c1 !< Value to compare.
complex(real64), dimension(:), intent(in) :: c2 !< Value to compare.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if ( size(c1) .ne. size(c2) ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
else
if ( maxval(abs(c1-c2)) > 0 ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end if
end subroutine assert_equal_c64_1
!> Check if two logical arrays (rank 1) are equal.
subroutine assert_equal_l_1(this, l1, l2, fail)
class(tester_t), intent(inout) :: this !< The tester.
logical, intent(in), dimension(:) :: l1 !< Value to compare.
logical, intent(in), dimension(:) :: l2 !< Value to compare.
logical, intent(in), optional :: fail !< Fail flag.
integer :: k
this% n_tests = this% n_tests + 1
if ( size(l1) .ne. size(l2) ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
else
do k = 1, size(l1)
if (l1(k) .neqv. l2(k)) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
exit
end if
end do
end if
end subroutine assert_equal_l_1
!> Check if an integer (8 bits) is positive.
subroutine assert_positive_i8(this, i, fail)
class(tester_t), intent(inout) :: this !< The tester.
integer(int8), intent(in) :: i !< Value to check.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if (i < 0) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end subroutine assert_positive_i8
!> Check if an integer (16 bits) is positive.
subroutine assert_positive_i16(this, i, fail)
class(tester_t), intent(inout) :: this !< The tester.
integer(int16), intent(in) :: i !< Value to check.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if (i < 0) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end subroutine assert_positive_i16
!> Check if an integer (32 bits) is positive.
subroutine assert_positive_i32(this, i, fail)
class(tester_t), intent(inout) :: this !< The tester.
integer(int32), intent(in) :: i !< Value to check.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if (i < 0) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end subroutine assert_positive_i32
!> Check if an integer (64 bits) is positive.
subroutine assert_positive_i64(this, i, fail)
class(tester_t), intent(inout) :: this !< The tester.
integer(int64), intent(in) :: i !< Value to check.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if (i < 0) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end subroutine assert_positive_i64
!> Check if a real (32 bits) is positive.
subroutine assert_positive_r32(this, r, fail)
class(tester_t), intent(inout) :: this !< The tester.
real(real32), intent(in) :: r !< Value to check.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if (r < 0) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end subroutine assert_positive_r32
!> Check if a real (64 bits) is positive.
subroutine assert_positive_r64(this, r, fail)
class(tester_t), intent(inout) :: this !< The tester.
real(real64), intent(in) :: r !< Value to check.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if (r < 0) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end subroutine assert_positive_r64
!> Check if an integer (8 bits) array (rank 1) is positive.
subroutine assert_positive_i8_1(this, i, fail)
class(tester_t), intent(inout) :: this !< The tester.
integer(int8), dimension(:), intent(in) :: i !< Value to check.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if ( minval(i) < 0 ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end subroutine assert_positive_i8_1
!> Check if an integer (16 bits) array (rank 1) is positive.
subroutine assert_positive_i16_1(this, i, fail)
class(tester_t), intent(inout) :: this !< The tester.
integer(int16), dimension(:), intent(in) :: i !< Value to check.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if ( minval(i) < 0 ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end subroutine assert_positive_i16_1
!> Check if an integer (32 bits) array (rank 1) is positive.
subroutine assert_positive_i32_1(this, i, fail)
class(tester_t), intent(inout) :: this !< The tester.
integer(int32), dimension(:), intent(in) :: i !< Value to check.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if ( minval(i) < 0 ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end subroutine assert_positive_i32_1
!> Check if an integer (64 bits) array (rank 1) is positive.
subroutine assert_positive_i64_1(this, i, fail)
class(tester_t), intent(inout) :: this !< The tester.
integer(int64), dimension(:), intent(in) :: i !< Value to check.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if ( minval(i) < 0 ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end subroutine assert_positive_i64_1
!> Check if a real (32 bits) array (rank 1) is positive.
subroutine assert_positive_r32_1(this, r, fail)
class(tester_t), intent(inout) :: this !< The tester.
real(real32), dimension(:), intent(in) :: r !< Value to check.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if ( minval(r) < 0 ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end subroutine assert_positive_r32_1
!> Check if a real (64 bits) array (rank 1) is positive.
subroutine assert_positive_r64_1(this, r, fail)
class(tester_t), intent(inout) :: this !< The tester.
real(real64), dimension(:), intent(in) :: r !< Value to check.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if ( minval(r) < 0 ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end subroutine assert_positive_r64_1
!> Check if two reals (32 bits) are close with respect to a tolerance.
subroutine assert_close_r32(this, r1, r2, fail)
class(tester_t), intent(inout) :: this !< The tester.
real(real32), intent(in) :: r1 !< Value to compare.
real(real32), intent(in) :: r2 !< Value to compare.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if ( abs(r1-r2) > this% tolerance32 ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end subroutine assert_close_r32
!> Check if two reals (64 bits) are close with respect to a tolerance.
subroutine assert_close_r64(this, r1, r2, fail)
class(tester_t), intent(inout) :: this !< The tester.
real(real64), intent(in) :: r1 !< Value to compare.
real(real64), intent(in) :: r2 !< Value to compare.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if ( abs(r1-r2) > this% tolerance64 ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end subroutine assert_close_r64
!> Check if two real (32 bits) arrays (rank 1) are close with respect to a tolerance.
subroutine assert_close_r32_1(this, r1, r2, fail)
class(tester_t), intent(inout) :: this !< The tester.
real(real32), intent(in), dimension(:) :: r1 !< Value to compare.
real(real32), intent(in), dimension(:) :: r2 !< Value to compare.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if ( size(r1) .ne. size(r2) ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
else
if ( maxval(abs(r1-r2)) > this% tolerance32 ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end if
end subroutine assert_close_r32_1
!> Check if two real (64 bits) arrays (rank 1) are close with respect to a tolerance.
subroutine assert_close_r64_1(this, r1, r2, fail)
class(tester_t), intent(inout) :: this !< The tester.
real(real64), intent(in), dimension(:) :: r1 !< Value to compare.
real(real64), intent(in), dimension(:) :: r2 !< Value to compare.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if ( size(r1) .ne. size(r2) ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
else
if ( maxval(abs(r1-r2)) > this% tolerance64 ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end if
end subroutine assert_close_r64_1
!> Check if two complex numbers (32 bits) are close with respect to a tolerance.
subroutine assert_close_c32(this, c1, c2, fail)
class(tester_t), intent(inout) :: this !< The tester.
complex(real32), intent(in) :: c1 !< Value to compare.
complex(real32), intent(in) :: c2 !< Value to compare.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if ( abs(c1-c2) > this% tolerance32 ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end subroutine assert_close_c32
!> Check if two complex numbers (64 bits) are close with respect to a tolerance.
subroutine assert_close_c64(this, c1, c2, fail)
class(tester_t), intent(inout) :: this !< The tester.
complex(real64), intent(in) :: c1 !< Value to compare.
complex(real64), intent(in) :: c2 !< Value to compare.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if ( abs(c1-c2) > this% tolerance64 ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end subroutine assert_close_c64
!> Check if two complex (32 bits) arrays (rank 1) are close with respect to a tolerance.
subroutine assert_close_c32_1(this, c1, c2, fail)
class(tester_t), intent(inout) :: this !< The tester.
complex(real32), intent(in), dimension(:) :: c1 !< Value to compare.
complex(real32), intent(in), dimension(:) :: c2 !< Value to compare.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if ( size(c1) .ne. size(c2) ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
else
if ( maxval(abs(c1-c2)) > this% tolerance32 ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end if
end subroutine assert_close_c32_1
!> Check if two complex (64 bits) arrays (rank 1) are close with respect to a tolerance.
subroutine assert_close_c64_1(this, c1, c2, fail)
class(tester_t), intent(inout) :: this !< The tester.
complex(real64), intent(in), dimension(:) :: c1 !< Value to compare.
complex(real64), intent(in), dimension(:) :: c2 !< Value to compare.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if ( size(c1) .ne. size(c2) ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
else
if ( maxval(abs(c1-c2)) > this% tolerance64 ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end if
end subroutine assert_close_c64_1
end module tester
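For orientation, a minimal driver for the assert API above could look as follows. This is a sketch: the `init` and `print` type-bound procedures and the generic `assert_*` names are assumed from the earlier part of this module, outside the hunk shown here.

! hypothetical driver exercising the tester module
program tester_demo
  use tester
  implicit none
  type(tester_t) :: test
  !
  call test% init()                          ! reset n_tests/n_errors, set default tolerances
  call test% assert_equal(0, 0)              ! exact integer comparison
  call test% assert_close(1.0d0, 1.0d0)      ! |r1-r2| is checked against tolerance64
  call test% assert_positive([1.0d0, 2.0d0]) ! rank-1 check: error counted if minval < 0
  call test% print()                         ! report n_errors out of n_tests
end program tester_demo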

94
FFTXlib/tests/utils.f90 Normal file
View File

@ -0,0 +1,94 @@
SUBROUTINE collect_results(test)
#if defined(__MPI)
USE mpi
#endif
USE tester
IMPLICIT NONE
!
TYPE(tester_t) :: test
INTEGER :: itottests, itoterr, ierr, me
!
#if defined(__MPI)
!
CALL MPI_REDUCE(test%n_errors, itoterr, 1, MPI_INTEGER, MPI_SUM, &
0, MPI_COMM_WORLD, ierr)
! Fail in case MPI fails...
IF (ierr /= 0) CALL test%assert_equal(0, ierr)
!
CALL MPI_REDUCE(test%n_tests, itottests, 1, MPI_INTEGER, MPI_SUM, &
0, MPI_COMM_WORLD, ierr)
! Fail in case MPI fails...
IF (ierr /= 0) CALL test%assert_equal(0, ierr)
!
test%n_tests = itottests
test%n_errors = itoterr
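! NB: MPI_REDUCE defines itottests/itoterr only on rank 0, so the
! totals stored above are meaningful on the root rank only, where
! the summary is printed.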
!
IF (ierr /= 0) CALL test%assert_equal(0, ierr)
!
CALL MPI_Comm_rank(MPI_COMM_WORLD, me, ierr)
!
IF (ierr /= 0) CALL test%assert_equal(0, ierr)
!
#endif
END SUBROUTINE collect_results
SUBROUTINE save_random_seed(test_name, mpime)
IMPLICIT NONE
CHARACTER(len=*), INTENT(IN) :: test_name
INTEGER, INTENT(IN) :: mpime
!
INTEGER, PARAMETER :: in_unit=20, out_unit=21
CHARACTER(len=80) :: fname
INTEGER :: n, istat
INTEGER, ALLOCATABLE :: seed(:)
!
CALL random_seed(size = n)
ALLOCATE(seed(n))
! First try if the OS provides a random number generator
OPEN(UNIT=in_unit, file="/dev/urandom", access="stream", &
form="unformatted", action="read", status="old", iostat=istat)
IF (istat == 0) THEN
READ(in_unit) seed
CLOSE(in_unit)
ELSE
! Fallback to stupid algorithm. Actually we do not really need
! high-quality random numbers
CALL random_seed(get=seed)
seed = seed + mpime
END IF
!
CALL random_seed(put=seed)
!
WRITE(fname, '("rnd_seed_",A,I4.4)') TRIM(test_name), mpime
fname = TRIM(fname)
!
OPEN (UNIT=out_unit,FILE=fname,ACTION="write",STATUS="replace")
!
WRITE (out_unit,*) n
WRITE (out_unit,*) seed
CLOSE (out_unit)
DEALLOCATE(seed)
!
END SUBROUTINE save_random_seed
SUBROUTINE no_test
#if defined(__MPI)
USE mpi
#endif
USE tester
IMPLICIT NONE
!TYPE(tester_t) :: test
INTEGER :: ierr
!
#if defined(__MPI)
CALL MPI_Init(ierr)
#endif
!CALL test%init()
!CALL print_results(test)
#if defined(__MPI)
CALL mpi_finalize(ierr)
#endif
END SUBROUTINE no_test
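Taken together, the helpers above wrap a test in the pattern sketched below. The program name, the asserts and the final `print` call are illustrative; only `save_random_seed`, `collect_results` and the MPI boilerplate come from this file.

! hypothetical skeleton of an FFTXlib test driver
PROGRAM test_example
#if defined(__MPI)
  USE mpi
#endif
  USE tester
  IMPLICIT NONE
  TYPE(tester_t) :: test
  INTEGER :: ierr, mpime
  !
  ierr = 0; mpime = 0
#if defined(__MPI)
  CALL MPI_Init(ierr)
  CALL MPI_Comm_rank(MPI_COMM_WORLD, mpime, ierr)
#endif
  CALL test%init()
  CALL save_random_seed("example", mpime)  ! reproducible per-rank seed
  ! ... exercise the library and call test%assert_* here ...
  CALL collect_results(test)               ! totals gathered on rank 0
#if defined(__MPI)
  CALL MPI_Finalize(ierr)
#endif
  IF (mpime == 0) CALL test%print()        ! assumed reporting routine
END PROGRAM test_example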

View File

@ -9,6 +9,8 @@ CG = \
ccgdiagg.o \
rcgdiagg.o
-include make.gpu
all : libcg.a

526
KS_Solvers/CG/ccgdiagg_gpu.f90 Normal file
View File

@ -0,0 +1,526 @@
!
! Copyright (C) 2001-2007 Quantum ESPRESSO group
! This file is distributed under the terms of the
! GNU General Public License. See the file `License'
! in the root directory of the present distribution,
! or http://www.gnu.org/copyleft/gpl.txt .
!
!
#define ZERO ( 0.D0, 0.D0 )
#define ONE ( 1.D0, 0.D0 )
FUNCTION KSDdot( n, A, incx, B, incy) result( res )
!
USE util_param, ONLY : DP
#if defined(__CUDA)
USE cudafor
USE cublas
#endif
!
IMPLICIT NONE
!
INTEGER, INTENT(IN) :: incx,incy,n
!
#if defined(__CUDA)
REAL(DP), DEVICE, INTENT(IN) :: A(n), B(n)
#else
REAL(DP), INTENT(IN) :: A(n), B(n)
REAL(DP), EXTERNAL :: ddot
#endif
!
REAL(DP) :: res
!
#if defined(__CUDA)
res = cublasDdot( n, A, incx, B, incy )
#else
res = ddot( n, A, incx, B, incy )
#endif
!
RETURN
!
END FUNCTION KSDdot
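! NB: the callers below exploit the equivalence, for complex arrays stored
! as interleaved (Re,Im) pairs,
! ddot(2*n, a, 1, b, 1) = REAL( zdotc(n, a, 1, b, 1) )
! which is why they pass kdim2 = 2*kdim and work with real scalars only.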
! define __VERBOSE to print a message after each eigenvalue is computed
!
!----------------------------------------------------------------------------
SUBROUTINE ccgdiagg_gpu( hs_1psi_gpu, s_1psi_gpu, precondition_d, &
npwx, npw, nbnd, npol, psi_d, e_d, btype, &
ethr, maxter, reorder, notconv, avg_iter )
!----------------------------------------------------------------------------
!
! ... "poor man" iterative diagonalization of a complex hermitian matrix
! ... through preconditioned conjugate gradient algorithm
! ... Band-by-band algorithm with minimal use of memory
! ... Calls hs_1psi and s_1psi to calculate H|psi> + S|psi> and S|psi>
! ... Works for generalized eigenvalue problem (US pseudopotentials) as well
!
#if defined(__CUDA)
USE cudafor
USE cublas
#endif
USE util_param, ONLY : DP
USE mp_bands_util, ONLY : intra_bgrp_comm, inter_bgrp_comm
USE mp, ONLY : mp_sum
#if defined(__VERBOSE)
USE util_param, ONLY : stdout
#endif
USE device_memcpy_m, ONLY : dev_memset, dev_memcpy
!
IMPLICIT NONE
!
! ... Mathematical constants
!
REAL(DP), PARAMETER :: pi = 3.14159265358979323846_DP
!
! ... I/O variables
!
INTEGER, INTENT(IN) :: npwx, npw, nbnd, npol, maxter
INTEGER, INTENT(IN) :: btype(nbnd)
REAL(DP), INTENT(IN) :: precondition_d(npwx*npol), ethr
COMPLEX(DP), INTENT(INOUT) :: psi_d(npwx*npol,nbnd)
REAL(DP), INTENT(INOUT) :: e_d(nbnd)
INTEGER, INTENT(OUT) :: notconv
REAL(DP), INTENT(OUT) :: avg_iter
#if defined(__CUDA)
attributes(DEVICE) :: precondition_d, psi_d, e_d
#endif
!
! ... local variables
!
INTEGER :: i, j, k, m, m_start, m_end, iter, moved
COMPLEX(DP), ALLOCATABLE :: hpsi_d(:), spsi_d(:), g_d(:), cg_d(:)
COMPLEX(DP), ALLOCATABLE :: scg_d(:), ppsi_d(:), g0_d(:), lagrange_d(:)
#if defined(__CUDA)
attributes(DEVICE) :: hpsi_d, spsi_d, g_d, cg_d, scg_d, ppsi_d, g0_d, lagrange_d
#endif
COMPLEX(DP), ALLOCATABLE :: lagrange(:)
REAL(DP) :: gamma, ddot_temp, es_1, es(2)
REAL(DP), ALLOCATABLE :: e(:)
REAL(DP) :: a0, b0, gg0, gg, gg1, cg0, e0, psi_norm, sint, cost
REAL(DP) :: theta, cos2t, sin2t
LOGICAL :: reorder
INTEGER :: kdim, kdmx, kdim2, ierr, istat
REAL(DP) :: empty_ethr, ethr_m
!
! ... external functions
!
REAL (DP), EXTERNAL :: ksDdot
EXTERNAL hs_1psi_gpu, s_1psi_gpu
! hs_1psi( npwx, npw, psi, hpsi, spsi )
! s_1psi( npwx, npw, psi, spsi )
!
CALL start_clock( 'ccgdiagg' )
!
empty_ethr = MAX( ( ethr * 5.D0 ), 1.D-5 )
!
IF ( npol == 1 ) THEN
!
kdim = npw
kdmx = npwx
!
ELSE
!
kdim = npwx * npol
kdmx = npwx * npol
!
END IF
!
kdim2 = 2 * kdim
!
ALLOCATE( hpsi_d(kdmx), STAT=ierr )
IF( ierr /= 0 ) &
CALL errore( ' ccgdiagg ',' cannot allocate hpsi_d ', ABS(ierr) )
ALLOCATE( spsi_d(kdmx), STAT=ierr )
IF( ierr /= 0 ) &
CALL errore( ' ccgdiagg ',' cannot allocate spsi_d ', ABS(ierr) )
ALLOCATE( g_d(kdmx), STAT=ierr )
IF( ierr /= 0 ) &
CALL errore( ' ccgdiagg ',' cannot allocate g_d ', ABS(ierr) )
ALLOCATE( cg_d(kdmx), STAT=ierr )
IF( ierr /= 0 ) &
CALL errore( ' ccgdiagg ',' cannot allocate cg_d ', ABS(ierr) )
ALLOCATE( scg_d(kdmx), STAT=ierr )
IF( ierr /= 0 ) &
CALL errore( ' ccgdiagg ',' cannot allocate scg_d ', ABS(ierr) )
ALLOCATE( ppsi_d(kdmx), STAT=ierr )
IF( ierr /= 0 ) &
CALL errore( ' ccgdiagg ',' cannot allocate ppsi_d ', ABS(ierr) )
ALLOCATE( g0_d(kdmx), STAT=ierr )
IF( ierr /= 0 ) &
CALL errore( ' ccgdiagg ',' cannot allocate g0_d ', ABS(ierr) )
!
ALLOCATE( lagrange_d(nbnd), STAT=ierr )
IF( ierr /= 0 ) &
CALL errore( ' ccgdiagg ',' cannot allocate lagrange_d ', ABS(ierr) )
!
ALLOCATE( lagrange(nbnd), STAT=ierr )
IF( ierr /= 0 ) &
CALL errore( ' ccgdiagg ',' cannot allocate lagrange ', ABS(ierr) )
ALLOCATE( e(nbnd) )
!
avg_iter = 0.D0
notconv = 0
moved = 0
!
! ... every eigenfunction is calculated separately
!
DO m = 1, nbnd
!
IF ( btype(m) == 1 ) THEN
!
ethr_m = ethr
!
ELSE
!
ethr_m = empty_ethr
!
END IF
!
CALL dev_memset( spsi_d , ZERO )
CALL dev_memset( scg_d , ZERO )
CALL dev_memset( hpsi_d , ZERO )
CALL dev_memset( g_d , ZERO )
CALL dev_memset( cg_d , ZERO )
CALL dev_memset( g0_d , ZERO )
CALL dev_memset( ppsi_d , ZERO )
CALL dev_memset( lagrange_d , ZERO )
!
! ... calculate S|psi>
!
CALL s_1psi_gpu( npwx, npw, psi_d(1,m), spsi_d )
!
! ... orthogonalize starting eigenfunction to those already calculated
!
call divide(inter_bgrp_comm,m,m_start,m_end); !write(*,*) m,m_start,m_end
lagrange = ZERO
if(m_start.le.m_end) &
CALL ZGEMV( 'C', kdim, m_end-m_start+1, ONE, psi_d(1,m_start), &
kdmx, spsi_d, 1, ZERO, lagrange_d(m_start), 1 )
if(m_start.le.m_end) lagrange(m_start:m_end) = lagrange_d(m_start:m_end)
CALL mp_sum( lagrange( 1:m ), inter_bgrp_comm )
!
CALL mp_sum( lagrange( 1:m ), intra_bgrp_comm )
!
psi_norm = DBLE( lagrange(m) )
lagrange_d(1:m) = lagrange(1:m)
!
DO j = 1, m - 1
!$cuf kernel do(1) <<<*,*>>>
DO i = 1, kdmx
!
psi_d(i,m) = psi_d(i,m) - lagrange_d(j) * psi_d(i,j)
!
END DO
!
psi_norm = psi_norm - ( DBLE( lagrange(j) )**2 + AIMAG( lagrange(j) )**2 )
!
END DO
!
psi_norm = SQRT( psi_norm )
!
!$cuf kernel do(1) <<<*,*>>>
DO i = 1, kdmx
psi_d(i,m) = psi_d(i,m) / psi_norm
END DO
!
! ... calculate starting gradient (|hpsi> = H|psi>) ...
!
CALL hs_1psi_gpu( npwx, npw, psi_d(1,m), hpsi_d, spsi_d )
!
! ... and starting eigenvalue (e = <y|PHP|y> = <psi|H|psi>)
!
! ... NB: ddot(2*npw,a,1,b,1) = REAL( zdotc(npw,a,1,b,1) )
!
e(m) = ksDdot( kdim2, psi_d(1,m), 1, hpsi_d, 1 )
!
CALL mp_sum( e(m), intra_bgrp_comm )
!
! ... start iteration for this band
!
iterate: DO iter = 1, maxter
!
! ... calculate P (PHP)|y>
! ... ( P = preconditioning matrix, assumed diagonal )
!
!$cuf kernel do(1) <<<*,*>>>
DO i = 1, kdmx
g_d(i) = hpsi_d(i) / precondition_d(i)
ppsi_d(i) = spsi_d(i) / precondition_d(i)
END DO
!
! ... ppsi is now S P^2|y> ( = S P^2|psi> )
!
es(1) = ksDdot( kdim2, spsi_d(1), 1, g_d(1), 1 )
es(2) = ksDdot( kdim2, spsi_d(1), 1, ppsi_d(1), 1 )
!
CALL mp_sum( es , intra_bgrp_comm )
!
es(1) = es(1) / es(2)
es_1 = es(1)
!
!$cuf kernel do(1) <<<*,*>>>
DO i = 1, kdmx
g_d(i) = g_d(i) - es_1 * ppsi_d(i)
END DO
!
! ... e1 = <y| S P^2 PHP|y> / <y| S S P^2|y> ensures that
! ... <g| S P^2|y> = 0
! ... orthogonalize to lowest eigenfunctions (already calculated)
!
! ... scg is used as workspace
!
CALL s_1psi_gpu( npwx, npw, g_d(1), scg_d(1) )
!
lagrange(1:m-1) = ZERO
call divide(inter_bgrp_comm,m-1,m_start,m_end); !write(*,*) m-1,m_start,m_end
if(m_start.le.m_end) &
CALL ZGEMV( 'C', kdim, m_end-m_start+1, ONE, psi_d(1,m_start), &
kdmx, scg_d, 1, ZERO, lagrange_d(m_start), 1 )
if(m_start.le.m_end) lagrange(m_start:m_end) = lagrange_d(m_start:m_end)
CALL mp_sum( lagrange( 1:m-1 ), inter_bgrp_comm )
!
CALL mp_sum( lagrange( 1:m-1 ), intra_bgrp_comm )
!
lagrange_d(1:m) = lagrange(1:m)
!
DO j = 1, ( m - 1 )
!
!$cuf kernel do(1) <<<*,*>>>
DO i = 1, kdmx
g_d(i) = g_d(i) - lagrange_d(j) * psi_d(i,j)
scg_d(i) = scg_d(i) - lagrange_d(j) * psi_d(i,j)
END DO
!
END DO
!
IF ( iter /= 1 ) THEN
!
! ... gg1 is <g(n+1)|S|g(n)> (used in Polak-Ribiere formula)
!
gg1 = ksDdot( kdim2, g_d(1), 1, g0_d(1), 1 )
!
CALL mp_sum( gg1, intra_bgrp_comm )
!
END IF
!
! ... gg is <g(n+1)|S|g(n+1)>
!
!$cuf kernel do(1) <<<*,*>>>
DO i = 1, kdmx
g0_d(i) = scg_d(i) * precondition_d(i)
END DO
!
gg = ksDdot( kdim2, g_d(1), 1, g0_d(1), 1 )
!
CALL mp_sum( gg, intra_bgrp_comm )
!
IF ( iter == 1 ) THEN
!
! ... starting iteration, the conjugate gradient |cg> = |g>
!
gg0 = gg
!
!$cuf kernel do(1) <<<*,*>>>
DO i = 1, kdmx
cg_d(i) = g_d(i)
END DO
!
ELSE
!
! ... |cg(n+1)> = |g(n+1)> + gamma(n) * |cg(n)>
!
! ... Polak-Ribiere formula :
!
gamma = ( gg - gg1 ) / gg0
gg0 = gg
!
!
!
! See comment below
!!DO i = 1, kdmx
!! cg_d(i) = g_d(i) + cg_d(i) * gamma
!!END DO
!
! ... The following is needed because <y(n+1)| S P^2 |cg(n+1)>
! ... is not 0. In fact :
! ... <y(n+1)| S P^2 |cg(n)> = sin(theta)*<cg(n)|S|cg(n)>
!
psi_norm = gamma * cg0 * sint
!
!$cuf kernel do(1) <<<*,*>>>
DO i = 1, kdmx
! NB: the correction below is folded into the same kernel: this breaks the step-by-step logic, but is done here for performance
cg_d(i) = (g_d(i) + cg_d(i) * gamma) - psi_norm * psi_d(i,m)
END DO
!
END IF
!
! ... |cg> contains now the conjugate gradient
!
! ... |scg> is S|cg>
!
CALL hs_1psi_gpu( npwx, npw, cg_d(1), ppsi_d(1), scg_d(1) )
!
cg0 = ksDdot( kdim2, cg_d(1), 1, scg_d(1), 1 )
!
CALL mp_sum( cg0 , intra_bgrp_comm )
!
cg0 = SQRT( cg0 )
!
! ... |ppsi> contains now HP|cg>
! ... minimize <y(t)|PHP|y(t)> , where :
! ... |y(t)> = cos(t)|y> + sin(t)/cg0 |cg>
! ... Note that <y|P^2S|y> = 1, <y|P^2S|cg> = 0 ,
! ... <cg|P^2S|cg> = cg0^2
! ... so that the result is correctly normalized :
! ... <y(t)|P^2S|y(t)> = 1
!
a0 = 2.D0 * ksDdot( kdim2, psi_d(1,m), 1, ppsi_d(1), 1 ) / cg0
!
CALL mp_sum( a0 , intra_bgrp_comm )
!
b0 = ksDdot( kdim2, cg_d(1), 1, ppsi_d(1), 1 ) / cg0**2
!
CALL mp_sum( b0 , intra_bgrp_comm )
!
e0 = e(m)
!
theta = 0.5D0 * ATAN( a0 / ( e0 - b0 ) )
!
cost = COS( theta )
sint = SIN( theta )
!
cos2t = cost*cost - sint*sint
sin2t = 2.D0*cost*sint
!
es(1) = 0.5D0 * ( ( e0 - b0 ) * cos2t + a0 * sin2t + e0 + b0 )
es(2) = 0.5D0 * ( - ( e0 - b0 ) * cos2t - a0 * sin2t + e0 + b0 )
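! ... (derivation: with |y(t)> = cos(t)|y> + sin(t)/cg0 |cg> one gets
! ... E(t) = e0*cos(t)**2 + a0*sin(t)*cos(t) + b0*sin(t)**2
! ...      = 0.5*( (e0-b0)*cos(2t) + a0*sin(2t) + e0 + b0 )
! ... and dE/dt = 0 gives tan(2t) = a0/(e0-b0), i.e. the theta above;
! ... es(1) and es(2) are the two stationary values)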
!
! ... there are two possible solutions, choose the minimum
!
IF ( es(2) < es(1) ) THEN
!
theta = theta + 0.5D0 * pi
!
cost = COS( theta )
sint = SIN( theta )
!
END IF
!
! ... new estimate of the eigenvalue
!
e(m) = MIN( es(1), es(2) )
! ... upgrade |psi>
!
!$cuf kernel do(1) <<<*,*>>>
DO i = 1, kdmx
psi_d(i,m) = cost * psi_d(i,m) + sint / cg0 * cg_d(i)
END DO
!
! ... here one could test convergence on the energy
!
IF ( ABS( e(m) - e0 ) < ethr_m ) EXIT iterate
!
! ... upgrade H|psi> and S|psi>
!
!$cuf kernel do(1) <<<*,*>>>
DO i = 1, kdmx
spsi_d(i) = cost * spsi_d(i) + sint / cg0 * scg_d(i)
!
hpsi_d(i) = cost * hpsi_d(i) + sint / cg0 * ppsi_d(i)
END DO
!
END DO iterate
!
#if defined(__VERBOSE)
IF ( iter >= maxter ) THEN
WRITE(stdout,'("e(",i4,") = ",f12.6," eV (not converged after ",i3,&
& " iterations)")') m, e(m)*13.6058, iter
ELSE
WRITE(stdout,'("e(",i4,") = ",f12.6," eV (",i3," iterations)")') &
m, e(m)*13.6058, iter
END IF
FLUSH (stdout)
#endif
IF ( iter >= maxter ) notconv = notconv + 1
!
avg_iter = avg_iter + iter + 1
! ... reorder eigenvalues if they are not in the right order
! ... ( this CAN and WILL happen in not-so-special cases )
!
!
IF ( m > 1 .AND. reorder ) THEN
!
IF ( e(m) - e(m-1) < - 2.D0 * ethr_m ) THEN
! ... if the last calculated eigenvalue is not the largest...
!
DO i = m - 2, 1, - 1
!
IF ( e(m) - e(i) > 2.D0 * ethr_m ) EXIT
!
END DO
!
i = i + 1
!
moved = moved + 1
!
! ... last calculated eigenvalue should be in the
! ... i-th position: reorder
!
e0 = e(m)
!
!$cuf kernel do(1) <<<*,*>>>
DO k = 1, kdmx
ppsi_d(k) = psi_d(k,m)
END DO
!
DO j = m, i + 1, - 1
!
e(j) = e(j-1)
!
!$cuf kernel do(1) <<<*,*>>>
DO k = 1, kdmx
psi_d(k,j) = psi_d(k,j-1)
END DO
!
END DO
!
e(i) = e0
!
!$cuf kernel do(1) <<<*,*>>>
DO k = 1, kdmx
psi_d(k,i) = ppsi_d(k)
END DO
!
! ... this procedure should be good if only a few inversions occur,
! ... extremely inefficient if eigenvectors are often in bad order
! ... ( but this should not happen )
!
END IF
!
END IF
!
END DO
!
avg_iter = avg_iter / DBLE( nbnd )
!
! store the eigenvalues, accumulated on the host, back into the device array e_d
CALL dev_memcpy(e_d, e)
!
DEALLOCATE( lagrange )
DEALLOCATE( e )
DEALLOCATE( lagrange_d )
DEALLOCATE( ppsi_d )
DEALLOCATE( g0_d )
DEALLOCATE( cg_d )
DEALLOCATE( g_d )
DEALLOCATE( hpsi_d )
DEALLOCATE( scg_d )
DEALLOCATE( spsi_d )
!
CALL stop_clock( 'ccgdiagg' )
!
RETURN
!
END SUBROUTINE ccgdiagg_gpu

6
KS_Solvers/CG/make.gpu Normal file
View File

@ -0,0 +1,6 @@
# Makefile for CG GPU
CG += \
rcgdiagg_gpu.o \
ccgdiagg_gpu.o

543
KS_Solvers/CG/rcgdiagg_gpu.f90 Normal file
View File

@ -0,0 +1,543 @@
!
! Copyright (C) 2002-2006 Quantum ESPRESSO group
! This file is distributed under the terms of the
! GNU General Public License. See the file `License'
! in the root directory of the present distribution,
! or http://www.gnu.org/copyleft/gpl.txt .
!
SUBROUTINE cgcudaDGEMV(TRANS,M,N,ALPHA,A,LDA,X,INCX,BETA,Y,INCY)
#if defined(__CUDA)
use cudafor
use cublas
#endif
IMPLICIT NONE
DOUBLE PRECISION :: ALPHA,BETA
INTEGER :: INCX,INCY,LDA,M,N
CHARACTER :: TRANS
DOUBLE PRECISION :: A(LDA,*),X(*),Y(*)
#if defined(__CUDA)
attributes(device) :: A, X, Y
#endif
!
call DGEMV(TRANS,M,N,ALPHA,A,LDA,X,INCX,BETA,Y,INCY)
!
END SUBROUTINE cgcudaDGEMV
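! NB: in the gamma-only code below scalar products run in real arithmetic
! over half of the G-sphere: the factor 2.D0 accounts for psi(-G) =
! conjg(psi(G)), and when this process holds G=0 (gstart == 2) the doubly
! counted G=0 contribution is subtracted explicitly after each product.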
! define __VERBOSE to print a message after each eigenvalue is computed
!----------------------------------------------------------------------------
SUBROUTINE rcgdiagg_gpu( hs_1psi_gpu, s_1psi_gpu, precondition_d, &
npwx, npw, nbnd, psi_d, e_d, btype, &
ethr, maxter, reorder, notconv, avg_iter )
!----------------------------------------------------------------------------
!
! ... "poor man" iterative diagonalization of a complex hermitian matrix
! ... through preconditioned conjugate gradient algorithm
! ... Band-by-band algorithm with minimal use of memory
! ... Calls hs_1psi and s_1psi to calculate H|psi> + S|psi> and S|psi>
! ... Works for generalized eigenvalue problem (US pseudopotentials) as well
!
#if defined(__CUDA)
USE cudafor
USE cublas
#endif
USE util_param, ONLY : DP
USE mp_bands_util, ONLY : intra_bgrp_comm, inter_bgrp_comm, gstart
USE mp, ONLY : mp_sum
#if defined(__VERBOSE)
USE util_param, ONLY : stdout
#endif
!
IMPLICIT NONE
!
! ... Mathematical constants
!
REAL(DP), PARAMETER :: pi = 3.14159265358979323846_DP
!
! ... I/O variables
!
INTEGER, INTENT(IN) :: npwx, npw, nbnd, maxter
INTEGER, INTENT(IN) :: btype(nbnd)
REAL(DP), INTENT(IN) :: precondition_d(npw), ethr
COMPLEX(DP), INTENT(INOUT) :: psi_d(npwx,nbnd)
REAL(DP), INTENT(INOUT) :: e_d(nbnd)
INTEGER, INTENT(OUT) :: notconv
REAL(DP), INTENT(OUT) :: avg_iter
#if defined(__CUDA)
attributes(DEVICE) :: precondition_d, psi_d, e_d
#endif
!
! ... local variables
!
INTEGER :: i, j, l, m, m_start, m_end, iter, moved
REAL(DP), ALLOCATABLE :: lagrange_d(:)
REAL(DP), ALLOCATABLE :: lagrange(:), e(:)
COMPLEX(DP), ALLOCATABLE :: hpsi_d(:), spsi_d(:), g_d(:), cg_d(:), &
scg_d(:), ppsi_d(:), g0_d(:), psi_aux(:)
COMPLEX(DP) :: psi1, hpsi1, spsi1, ppsi1, scg1, cg1, g1, g01
REAL(DP) :: psi_norm, a0, b0, gg0, gamma, gg, gg1, &
cg0, e0, es(2), aux
REAL(DP) :: es1
REAL(DP) :: theta, cost, sint, cos2t, sin2t
LOGICAL :: reorder
INTEGER :: npw2, npwx2
REAL(DP) :: empty_ethr, ethr_m
#if defined(__CUDA)
attributes(DEVICE) :: lagrange_d, hpsi_d, spsi_d, g_d, cg_d, scg_d, ppsi_d, g0_d
#endif
!
! ... external functions
!
REAL(DP), EXTERNAL :: ksDdot
EXTERNAL hs_1psi_gpu, s_1psi_gpu
! hs_1psi( npwx, npw, psi, hpsi, spsi )
! s_1psi( npwx, npw, psi, spsi )
!
!
CALL start_clock( 'rcgdiagg' )
!
IF ( gstart == -1 ) CALL errore( 'rcgdiagg', 'gstart variable not initialized', 1 )
!
empty_ethr = MAX( ( ethr * 5.D0 ), 1.D-5 )
!
npw2 = 2 * npw
npwx2 = 2 * npwx
!
ALLOCATE( spsi_d( npwx ) )
ALLOCATE( scg_d( npwx ) )
ALLOCATE( hpsi_d( npwx ) )
ALLOCATE( g_d( npwx ) )
ALLOCATE( cg_d( npwx ) )
ALLOCATE( g0_d( npwx ) )
ALLOCATE( ppsi_d( npwx ) )
!
ALLOCATE( lagrange_d( nbnd ) )
ALLOCATE( lagrange ( nbnd ) )
ALLOCATE( e ( nbnd ) )
ALLOCATE( psi_aux ( nbnd ) )
!
! Sync eigenvalues that will remain on the Host
e(1:nbnd) = e_d(1:nbnd)
!print *, 'init ', e(1:nbnd)
!
avg_iter = 0.D0
notconv = 0
moved = 0
!
! ... every eigenfunction is calculated separately
!
DO m = 1, nbnd
IF ( btype(m) == 1 ) THEN
!
ethr_m = ethr
!
ELSE
!
ethr_m = empty_ethr
!
END IF
!
! ... calculate S|psi>
!
CALL s_1psi_gpu( npwx, npw, psi_d(1,m), spsi_d )
!
! ... orthogonalize starting eigenfunction to those already calculated
!
call divide(inter_bgrp_comm,m,m_start,m_end); !write(*,*) m,m_start,m_end
lagrange = 0.d0
if(m_start.le.m_end) &
CALL cgcudaDGEMV( 'T', npw2, m_end-m_start+1, 2.D0, psi_d(1,m_start), npwx2, spsi_d, 1, 0.D0, lagrange_d(m_start), 1 )
if(m_start.le.m_end) lagrange( m_start:m_end ) = lagrange_d( m_start:m_end )
!print *, 'lagrange ', lagrange(1:m)
CALL mp_sum( lagrange( 1:m ), inter_bgrp_comm )
IF ( gstart == 2 ) THEN
psi_aux(1:m) = psi_d(1,1:m)
spsi1 = spsi_d(1)
lagrange(1:m) = lagrange(1:m) - psi_aux(1:m) * spsi1
END IF
!
CALL mp_sum( lagrange( 1:m ), intra_bgrp_comm )
!
psi_norm = lagrange(m)
lagrange_d(1:m) = lagrange(1:m)
!
DO j = 1, m - 1
!
!$cuf kernel do(1) <<<*,*>>>
DO i = 1, npwx
psi_d(i,m) = psi_d(i,m) - lagrange_d(j) * psi_d(i,j)
END DO
!
!print *, 'psi_norm ', j, psi_norm
psi_norm = psi_norm - lagrange(j)**2
!
END DO
!
psi_norm = SQRT( psi_norm )
!print *, 'psi_norm 178', psi_norm
!
!$cuf kernel do(1) <<<*,*>>>
DO i = 1, npwx
psi_d(i,m) = psi_d(i,m) / psi_norm
! ... set Im[ psi(G=0) ] - needed for numerical stability
IF (i == 1) THEN
IF ( gstart == 2 ) psi_d(1,m) = CMPLX( DBLE(psi_d(1,m)), 0.D0 ,kind=DP)
END IF
END DO
!
! ... calculate starting gradient (|hpsi> = H|psi>) ...
!
CALL hs_1psi_gpu( npwx, npw, psi_d(1,m), hpsi_d, spsi_d )
!
! ... and starting eigenvalue (e = <y|PHP|y> = <psi|H|psi>)
!
! ... NB: ddot(2*npw,a,1,b,1) = DBLE( zdotc(npw,a,1,b,1) )
!
e(m) = 2.D0 * ksDdot( npw2, psi_d(1,m), 1, hpsi_d, 1 )
!print *, 'e(m)', e(m)
IF ( gstart == 2 ) THEN
psi1 = psi_d(1,m)
hpsi1 = hpsi_d(1)
e(m) = e(m) - psi1 * hpsi1
END IF
!
CALL mp_sum( e(m), intra_bgrp_comm )
!print *, 'before iterate', psi1, hpsi1, spsi1, e(1:nbnd)
!
! ... start iteration for this band
!
iterate: DO iter = 1, maxter
!
! ... calculate P (PHP)|y>
! ... ( P = preconditioning matrix, assumed diagonal )
!
!$cuf kernel do(1) <<<*,*>>>
DO i = 1, npw
g_d(i) = hpsi_d(i) / precondition_d(i)
ppsi_d(i) = spsi_d(i) / precondition_d(i)
END DO
!
! ... ppsi is now S P^2|y> ( = S P^2|psi> )
!
es(1) = 2.D0 * ksDdot( npw2, spsi_d(1), 1, g_d(1), 1 )
es(2) = 2.D0 * ksDdot( npw2, spsi_d(1), 1, ppsi_d(1), 1 )
!
IF ( gstart == 2 ) THEN
!
g1 = g_d(1); ppsi1 = ppsi_d(1); spsi1 = spsi_d(1)
!
es(1) = es(1) - spsi1 * g1
es(2) = es(2) - spsi1 * ppsi1
!
END IF
!
CALL mp_sum( es , intra_bgrp_comm )
!
es(1) = es(1) / es(2)
!
es1 = es(1)
!$cuf kernel do
DO i=1, npwx
g_d(i) = g_d(i) - es1 * ppsi_d(i)
END DO
!
! ... e1 = <y| S P^2 PHP|y> / <y| S S P^2|y> ensures that
! ... <g| S P^2|y> = 0
!
! ... orthogonalize to lowest eigenfunctions (already calculated)
!
! ... scg is used as workspace
!
CALL s_1psi_gpu( npwx, npw, g_d(1), scg_d(1) )
!
lagrange(1:m-1) = 0.d0
call divide(inter_bgrp_comm,m-1,m_start,m_end); !write(*,*) m-1,m_start,m_end
if(m_start.le.m_end) &
CALL cgcudaDGEMV( 'T', npw2, m_end-m_start+1, 2.D0, psi_d(1,m_start), npwx2, scg_d, 1, 0.D0, lagrange_d(m_start), 1 )
if(m_start.le.m_end) lagrange( m_start:m_end ) = lagrange_d( m_start:m_end )
CALL mp_sum( lagrange( 1:m-1 ), inter_bgrp_comm )
IF ( gstart == 2 ) THEN
psi_aux(1:m-1) = psi_d(1,1:m-1)
scg1 = scg_d(1)
lagrange(1:m-1) = lagrange(1:m-1) - psi_aux(1:m-1) * scg1
END IF
!
CALL mp_sum( lagrange( 1:m-1 ), intra_bgrp_comm )
!
DO j = 1, ( m - 1 )
!
aux = lagrange(j)
!$cuf kernel do(1)
DO i = 1, npwx
g_d(i) = g_d(i) - aux * psi_d(i,j)
scg_d(i) = scg_d(i) - aux * psi_d(i,j)
END DO
!
END DO
!
IF ( iter /= 1 ) THEN
!
! ... gg1 is <g(n+1)|S|g(n)> (used in Polak-Ribiere formula)
!
gg1 = 2.D0 * ksDdot( npw2, g_d(1), 1, g0_d(1), 1 )
IF ( gstart == 2 ) THEN
g1 = g_d(1) ; g01 = g0_d(1)
gg1 = gg1 - g1 * g01
END IF
!
CALL mp_sum( gg1 , intra_bgrp_comm )
!
END IF
!
! ... gg is <g(n+1)|S|g(n+1)>
!
!$cuf kernel do
do i=1, npwx
g0_d(i) = scg_d(i)
end do
!
!$cuf kernel do
do i=1, npw
g0_d(i) = g0_d(i) * precondition_d(i)
end do
!
gg = 2.D0 * ksDdot( npw2, g_d(1), 1, g0_d(1), 1 )
IF ( gstart == 2 ) THEN
g1 = g_d(1) ; g01 = g0_d(1)
gg = gg - g1*g01
END IF
!
CALL mp_sum( gg , intra_bgrp_comm )
!
IF ( iter == 1 ) THEN
!
! ... starting iteration, the conjugate gradient |cg> = |g>
!
gg0 = gg
!
!$cuf kernel do
DO i=1, npwx
cg_d(i) = g_d(i)
! ... |cg> contains now the conjugate gradient
! ... set Im[ cg(G=0) ] - needed for numerical stability
IF ( gstart == 2 .and. i == 1 ) cg_d(1) = CMPLX( DBLE(cg_d(1)), 0.D0 ,kind=DP)
END DO
!
ELSE
!
! ... |cg(n+1)> = |g(n+1)> + gamma(n) * |cg(n)>
!
! ... Polak-Ribiere formula :
!
gamma = ( gg - gg1 ) / gg0
gg0 = gg
!
!$cuf kernel do
do i=1, npwx
cg_d(i) = g_d(i) + cg_d(i) * gamma
end do
!
! ... The following is needed because <y(n+1)| S P^2 |cg(n+1)>
! ... is not 0. In fact :
! ... <y(n+1)| S P^2 |cg(n)> = sin(theta)*<cg(n)|S|cg(n)>
!
psi_norm = gamma * cg0 * sint
!
!$cuf kernel do
do i=1, npwx
cg_d(i) = cg_d(i) - psi_norm * psi_d(i,m)
! ... |cg> contains now the conjugate gradient
! ... set Im[ cg(G=0) ] - needed for numerical stability
IF ( gstart == 2 .and. i == 1 ) cg_d(1) = CMPLX( DBLE(cg_d(1)), 0.D0 ,kind=DP)
end do
!
END IF
!
! ... |scg> is S|cg>
!
CALL hs_1psi_gpu( npwx, npw, cg_d(1), ppsi_d(1), scg_d(1) )
!
cg0 = 2.D0 * ksDdot( npw2, cg_d(1), 1, scg_d(1), 1 )
IF ( gstart == 2 ) THEN
cg1 = cg_d(1) ; scg1 = scg_d(1)
cg0 = cg0 - cg1*scg1
END IF
!
CALL mp_sum( cg0 , intra_bgrp_comm )
!
cg0 = SQRT( cg0 )
!
! ... |ppsi> contains now HP|cg>
! ... minimize <y(t)|PHP|y(t)> , where :
! ... |y(t)> = cos(t)|y> + sin(t)/cg0 |cg>
! ... Note that <y|P^2S|y> = 1, <y|P^2S|cg> = 0 ,
! ... <cg|P^2S|cg> = cg0^2
! ... so that the result is correctly normalized :
! ... <y(t)|P^2S|y(t)> = 1
!
a0 = 4.D0 * ksDdot( npw2, psi_d(1,m), 1, ppsi_d(1), 1 )
IF ( gstart == 2 ) THEN
psi1 = psi_d(1,m)
ppsi1 = ppsi_d(1)
a0 = a0 - 2.D0 * psi1 * ppsi1
END IF
!
a0 = a0 / cg0
!
CALL mp_sum( a0 , intra_bgrp_comm )
!
b0 = 2.D0 * ksDdot( npw2, cg_d(1), 1, ppsi_d(1), 1 )
IF ( gstart == 2 ) THEN
cg1 = cg_d(1)
ppsi1 = ppsi_d(1)
b0 = b0 - cg1 * ppsi1
END IF
!
b0 = b0 / cg0**2
!
CALL mp_sum( b0 , intra_bgrp_comm )
!
e0 = e(m)
!
theta = 0.5D0 * ATAN( a0 / ( e0 - b0 ) )
!
cost = COS( theta )
sint = SIN( theta )
!
cos2t = cost*cost - sint*sint
sin2t = 2.D0*cost*sint
!
es(1) = 0.5D0 * ( ( e0 - b0 ) * cos2t + a0 * sin2t + e0 + b0 )
es(2) = 0.5D0 * ( - ( e0 - b0 ) * cos2t - a0 * sin2t + e0 + b0 )
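! ... (same line minimization as in ccgdiagg_gpu: dE/dt = 0 gives
! ... tan(2*theta) = a0/(e0-b0), with es(1), es(2) the stationary values)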
!
! ... there are two possible solutions, choose the minimum
!
IF ( es(2) < es(1) ) THEN
!
theta = theta + 0.5D0 * pi
!
cost = COS( theta )
sint = SIN( theta )
!
END IF
!
! ... new estimate of the eigenvalue
!
e(m) = MIN( es(1), es(2) )
!
! ... upgrade |psi>
!
!$cuf kernel do
do i=1, npwx
psi_d(i,m) = cost * psi_d(i,m) + sint / cg0 * cg_d(i)
end do
!
! ... here one could test convergence on the energy
!
IF ( ABS( e(m) - e0 ) < ethr_m ) EXIT iterate
!
! ... upgrade H|psi> and S|psi>
!
!$cuf kernel do
do i=1, npwx
spsi_d(i) = cost * spsi_d(i) + sint / cg0 * scg_d(i)
!
hpsi_d(i) = cost * hpsi_d(i) + sint / cg0 * ppsi_d(i)
end do
!
END DO iterate
!
#if defined(__VERBOSE)
IF ( iter >= maxter ) THEN
WRITE(stdout,'("e(",i4,") = ",f12.6," eV (not converged after ",i3,&
& " iterations)")') m, e(m)*13.6058, iter
ELSE
WRITE(stdout,'("e(",i4,") = ",f12.6," eV (",i3," iterations)")') &
m, e(m)*13.6058, iter
END IF
FLUSH (stdout)
#endif
IF ( iter >= maxter ) notconv = notconv + 1
!
avg_iter = avg_iter + iter + 1
!
! ... reorder eigenvalues if they are not in the right order
! ... ( this CAN and WILL happen in not-so-special cases )
!
IF ( m > 1 .AND. reorder ) THEN
!
IF ( e(m) - e(m-1) < - 2.D0 * ethr_m ) THEN
!
! ... if the last calculated eigenvalue is not the largest...
!
DO i = m - 2, 1, - 1
!
IF ( e(m) - e(i) > 2.D0 * ethr_m ) EXIT
!
END DO
!
i = i + 1
!
moved = moved + 1
!
! ... last calculated eigenvalue should be in the
! ... i-th position: reorder
!
e0 = e(m)
!
!$cuf kernel do
do l=1, npwx
ppsi_d(l) = psi_d(l,m)
end do
!
DO j = m, i + 1, - 1
!
e(j) = e(j-1)
!
!$cuf kernel do
do l=1, npwx
psi_d(l,j) = psi_d(l,j-1)
end do
!
END DO
!
e(i) = e0
!
!$cuf kernel do
do l=1, npwx
psi_d(l,i) = ppsi_d(l)
end do
!
! ... this procedure should be good if only a few inversions occur,
! ... extremely inefficient if eigenvectors are often in bad order
! ... ( but this should not happen )
!
END IF
!
END IF
!
END DO
!
avg_iter = avg_iter / DBLE( nbnd )
e_d(1:nbnd) = e(1:nbnd)
!
DEALLOCATE( lagrange )
DEALLOCATE( lagrange_d )
DEALLOCATE( e )
DEALLOCATE( psi_aux )
DEALLOCATE( ppsi_d )
DEALLOCATE( g0_d )
DEALLOCATE( cg_d )
DEALLOCATE( g_d )
DEALLOCATE( hpsi_d )
DEALLOCATE( scg_d )
DEALLOCATE( spsi_d )
!
CALL stop_clock( 'rcgdiagg' )
!
RETURN
!
END SUBROUTINE rcgdiagg_gpu

View File

@ -11,6 +11,8 @@ rotate_HSpsi_k.o \
rotate_wfc_gamma.o \
rotate_wfc_k.o
-include make.gpu
all : libdense.a

19
KS_Solvers/DENSE/make.gpu Normal file
View File

@ -0,0 +1,19 @@
# Makefile for DENSE GPU
DENSE += \
rotate_wfc_k_gpu.o \
rotate_wfc_gamma_gpu.o \
rotate_HSpsi_k_gpu.o \
rotate_HSpsi_gamma_gpu.o
rotate_HSpsi_gamma_gpu.o : ../../UtilXlib/mp.o
rotate_HSpsi_gamma_gpu.o : ../../UtilXlib/mp_bands_util.o
rotate_HSpsi_gamma_gpu.o : ../../UtilXlib/util_param.o
rotate_wfc_gamma_gpu.o : ../../LAXlib/la_module.o
rotate_wfc_gamma_gpu.o : ../../UtilXlib/mp.o
rotate_wfc_gamma_gpu.o : ../../UtilXlib/mp_bands_util.o
rotate_wfc_gamma_gpu.o : ../../UtilXlib/util_param.o
rotate_wfc_k_gpu.o : ../../LAXlib/la_module.o
rotate_wfc_k_gpu.o : ../../UtilXlib/mp.o
rotate_wfc_k_gpu.o : ../../UtilXlib/mp_bands_util.o
rotate_wfc_k_gpu.o : ../../UtilXlib/util_param.o

222
KS_Solvers/DENSE/rotate_HSpsi_gamma_gpu.f90 Normal file
View File

@ -0,0 +1,222 @@
!
! Copyright (C) 2003-2007 Quantum ESPRESSO group
! This file is distributed under the terms of the
! GNU General Public License. See the file `License'
! in the root directory of the present distribution,
! or http://www.gnu.org/copyleft/gpl.txt .
!
!
#define ZERO ( 0.D0, 0.D0 )
!----------------------------------------------------------------------------
SUBROUTINE rotate_HSpsi_gamma_gpu( npwx, npw, nstart, nbnd, psi_d, hpsi_d, overlap, spsi_d, e_d )
!----------------------------------------------------------------------------
!
! ... Serial version of rotate_wfc for Gamma-only calculations
! ... This version assumes real wavefunctions (k=0) with only
! ... half plane waves stored: psi(-G)=psi*(G), except G=0
!
! GPU version by Ivan Carnimeo
!
#if defined (__CUDA)
USE cudafor
#endif
USE util_param, ONLY : DP
USE mp_bands_util, ONLY : intra_bgrp_comm, inter_bgrp_comm, root_bgrp_id, nbgrp, my_bgrp_id, &
me_bgrp, root_bgrp
USE mp_bands_util, ONLY : gstart ! index of the first nonzero G
USE mp, ONLY : mp_sum, mp_barrier, mp_allgather, mp_type_create_column_section, mp_type_free
!
IMPLICIT NONE
!
INCLUDE 'laxlib.fh'
!
! ... I/O variables
!
INTEGER, INTENT(IN) :: &
npw, & ! dimension of the matrices (psi,Hpsi,Spsi) to be rotated
npwx, & ! leading dimension of the wavefunction-related matrices
nstart, & ! input number of states
nbnd ! output number of states
LOGICAL, INTENT(IN) :: overlap ! if .FALSE. : spsi is not needed (and not used)
REAL(DP), INTENT(OUT) :: e_d(nbnd) ! eigenvalues of the reduced H matrix
COMPLEX(DP), INTENT(INOUT) :: psi_d(npwx,nstart), hpsi_d(npwx,nstart) ! input and output psi, Hpsi,
COMPLEX(DP), INTENT(INOUT), OPTIONAL :: spsi_d(npwx,nstart) ! ... and optionally Spsi
!
! ... local variables
!
INTEGER :: kdim, kdmx
INTEGER :: n_start, n_end, my_n, recv_counts(nbgrp), displs(nbgrp), column_type
INTEGER :: ii, jj ! indexes for cuf kernel loops
!
! ... device variables
!
COMPLEX(DP), ALLOCATABLE :: aux_d(:,:)
REAL(DP), ALLOCATABLE :: hh_d(:,:), ss_d(:,:), vv_d(:,:)
REAL(DP), ALLOCATABLE :: en_d(:)
#if defined (__CUDA)
attributes (device) :: psi_d, hpsi_d, spsi_d, e_d
attributes (device) :: aux_d, hh_d, ss_d, vv_d, en_d
#endif
!
IF ( gstart == -1 ) CALL errore( 'rotHSw', 'gstart variable not initialized', 1 )
IF ( overlap .AND..NOT.present(spsi_d) ) call errore( 'rotHSw','spsi array needed with overlap=.TRUE.',1)
!
call start_clock('rotHSw'); !write(*,*) 'start rotHSw' ; FLUSH(6)
!
! ... set Im[ psi(G=0) ] etc - needed for numerical stability
!
IF ( gstart == 2 ) then
!$cuf kernel do(1)
DO ii = 1, nstart
psi_d (1,ii) = CMPLX( DBLE( psi_d (1,ii) ), 0.D0,kind=DP)
hpsi_d(1,ii) = CMPLX( DBLE( hpsi_d(1,ii) ), 0.D0,kind=DP)
END DO
IF (overlap) THEN
!$cuf kernel do(1)
DO ii = 1, nstart
spsi_d(1,ii) = CMPLX( DBLE( spsi_d(1,ii) ), 0.D0,kind=DP)
END DO
END IF
END IF
kdim = 2 * npw
kdmx = 2 * npwx
!
ALLOCATE( hh_d( nstart, nstart ) )
ALLOCATE( ss_d( nstart, nstart ) )
ALLOCATE( vv_d( nstart, nstart ) )
ALLOCATE( en_d( nstart ) )
!
! ... Set up the Hamiltonian and Overlap matrix on the subspace :
!
! ... H_ij = <psi_i| H |psi_j> S_ij = <psi_i| S |psi_j>
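! ... (Rayleigh-Ritz sketch: build the nstart x nstart subspace matrices
! ... hh = psi^T H psi and ss = psi^T S psi, solve the generalized
! ... eigenproblem hh v = e ss v with diaghg, then rotate
! ... psi/hpsi/spsi <- psi/hpsi/spsi * v to keep the nbnd lowest states;
! ... column blocks are distributed over band groups and recollected
! ... with mp_allgather)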
!
call start_clock('rotHSw:hc'); !write(*,*) 'start rotHSw:hc' ; FLUSH(6)
CALL mp_type_create_column_section(hh_d(1,1), 0, nstart, nstart, column_type)
CALL divide_all(inter_bgrp_comm,nstart,n_start,n_end,recv_counts,displs)
my_n = n_end - n_start + 1; !write (*,*) nstart,n_start,n_end
if (n_start .le. n_end) &
CALL gpu_DGEMM( 'T','N', nstart,my_n,kdim, 2.D0, psi_d,kdmx, hpsi_d(1,n_start),kdmx, 0.D0, hh_d(1,n_start),nstart )
IF ( gstart == 2 ) call gpu_DGER( nstart, my_n, -1.D0, psi_d,kdmx, hpsi_d(1,n_start),kdmx, hh_d(1,n_start),nstart )
call start_clock('rotHSw:hc:s1')
CALL mp_sum( hh_d(:,n_start:n_end), intra_bgrp_comm ) ! this section only needs to be collected inside bgrp
call stop_clock('rotHSw:hc:s1')
! call start_clock('rotHSw:hc:b1'); CALL mp_barrier( inter_bgrp_comm ); call stop_clock('rotHSw:hc:b1')
call start_clock('rotHSw:hc:s2')
CALL mp_allgather(hh_d, column_type, recv_counts, displs, inter_bgrp_comm)
call stop_clock('rotHSw:hc:s2')
!
IF ( overlap ) THEN
!
if (n_start .le. n_end) &
CALL gpu_DGEMM('T','N', nstart,my_n,kdim, 2.D0, psi_d,kdmx, spsi_d(1,n_start),kdmx, 0.D0, ss_d(1,n_start),nstart)
IF ( gstart == 2 ) CALL gpu_DGER(nstart, my_n, -1.D0, psi_d,kdmx, spsi_d(1,n_start),kdmx, ss_d(1,n_start),nstart)
!
ELSE
!
if (n_start .le. n_end) &
CALL gpu_DGEMM('T','N', nstart,my_n,kdim, 2.D0, psi_d,kdmx, psi_d(1,n_start),kdmx, 0.D0, ss_d(1,n_start),nstart)
IF ( gstart == 2 ) CALL gpu_DGER(nstart, my_n, -1.D0, psi_d,kdmx, psi_d(1,n_start),kdmx, ss_d(1,n_start),nstart)
!
END IF
call start_clock('rotHSw:hc:s3')
CALL mp_sum( ss_d(:,n_start:n_end), intra_bgrp_comm ) ! this section only needs to be collected inside bgrp
call stop_clock('rotHSw:hc:s3')
! call start_clock('rotHSw:hc:b2'); CALL mp_barrier( inter_bgrp_comm ); call stop_clock('rotHSw:hc:b2')
call start_clock('rotHSw:hc:s4')
CALL mp_allgather(ss_d, column_type, recv_counts, displs, inter_bgrp_comm)
call stop_clock('rotHSw:hc:s4')
CALL mp_type_free( column_type )
call stop_clock('rotHSw:hc'); !write(*,*) 'stop rotHSw:hc' ; FLUSH(6)
!
! ... Diagonalize
!
call start_clock('rotHSw:diag'); !write(*,*) 'start rotHSw:diag' ; FLUSH(6)
CALL diaghg( nstart, nbnd, hh_d, ss_d, nstart, en_d, vv_d, me_bgrp, root_bgrp, intra_bgrp_comm )
!$cuf kernel do(1)
DO ii = 1, nbnd
e_d(ii) = en_d(ii)
END DO
call stop_clock('rotHSw:diag'); !write(*,*) 'stop rotHSw:diag' ; FLUSH(6)
!
! ... update the basis set
!
call start_clock('rotHSw:evc'); !write(*,*) 'start rotHSw:evc' ; FLUSH(6)
CALL mp_type_create_column_section(psi_d(1,1), 0, npwx, npwx, column_type)
CALL divide_all(inter_bgrp_comm,nbnd,n_start,n_end,recv_counts,displs)
ALLOCATE( aux_d ( npwx, nbnd ) )
my_n = n_end - n_start + 1; !write (*,*) nstart,n_start,n_end
if (n_start .le. n_end) &
CALL gpu_DGEMM( 'N','N', kdim,my_n,nstart, 1.D0, psi_d,kdmx,vv_d(1,n_start),nstart, 0.D0, aux_d(1,n_start),kdmx )
!$cuf kernel do(2)
DO ii = 1, npwx
DO jj = n_start, n_end
psi_d(ii,jj) = aux_d(ii,jj)
END DO
END DO
! call start_clock('rotHSw:ev:b3'); CALL mp_barrier( inter_bgrp_comm ); call stop_clock('rotHSw:ev:b3')
call start_clock('rotHSw:ev:s5')
CALL mp_allgather(psi_d(:,1:nbnd), column_type, recv_counts, displs, inter_bgrp_comm)
call stop_clock('rotHSw:ev:s5')
if (n_start .le. n_end) &
CALL gpu_DGEMM( 'N','N', kdim,my_n,nstart, 1.D0,hpsi_d,kdmx,vv_d(1,n_start),nstart, 0.D0, aux_d(1,n_start),kdmx )
!$cuf kernel do (2)
DO ii = 1, npwx
DO jj = n_start, n_end
hpsi_d(ii,jj) = aux_d(ii,jj)
END DO
END DO
! call start_clock('rotHSw:ev:b4'); CALL mp_barrier( inter_bgrp_comm ); call stop_clock('rotHSw:ev:b4')
call start_clock('rotHSw:ev:s6')
CALL mp_allgather(hpsi_d(:,1:nbnd), column_type, recv_counts, displs, inter_bgrp_comm)
call stop_clock('rotHSw:ev:s6')
IF (overlap) THEN
if (n_start .le. n_end) &
CALL gpu_DGEMM( 'N','N', kdim,my_n,nstart, 1.D0,spsi_d,kdmx,vv_d(1,n_start),nstart, 0.D0, aux_d(1,n_start),kdmx )
!$cuf kernel do (2)
DO ii = 1, npwx
DO jj = n_start, n_end
spsi_d(ii,jj) = aux_d(ii,jj)
END DO
END DO
! call start_clock('rotHSw:ev:b5'); CALL mp_barrier( inter_bgrp_comm ); call stop_clock('rotHSw:ev:b5')
call start_clock('rotHSw:ev:s7')
CALL mp_allgather(spsi_d(:,1:nbnd), column_type, recv_counts, displs, inter_bgrp_comm)
call stop_clock('rotHSw:ev:s7')
ELSE IF (present(spsi_d)) THEN
!$cuf kernel do (2)
DO ii = 1, npwx
DO jj = 1, nbnd
spsi_d(ii,jj) = psi_d(ii,jj)
END DO
END DO
END IF
DEALLOCATE( aux_d )
CALL mp_type_free( column_type )
call stop_clock('rotHSw:evc'); !write(*,*) 'stop rotHSw:evc' ; FLUSH(6)
!
DEALLOCATE( en_d )
DEALLOCATE( vv_d )
DEALLOCATE( ss_d )
DEALLOCATE( hh_d )
call stop_clock('rotHSw'); !write(*,*) 'stop rotHSw' ; FLUSH(6)
!call print_clock('rotHSw')
!call print_clock('rotHSw:hc')
!call print_clock('rotHSw:diag')
!call print_clock('rotHSw:evc')
!
RETURN
!
END SUBROUTINE rotate_HSpsi_gamma_gpu

286
KS_Solvers/DENSE/rotate_HSpsi_k_gpu.f90 Normal file
View File

@ -0,0 +1,286 @@
!
! Copyright (C) 2003-2007 Quantum ESPRESSO group
! This file is distributed under the terms of the
! GNU General Public License. See the file `License'
! in the root directory of the present distribution,
! or http://www.gnu.org/copyleft/gpl.txt .
!
!
#define ZERO ( 0.D0, 0.D0 )
!----------------------------------------------------------------------------
SUBROUTINE rotate_HSpsi_k_gpu( npwx, npw, nstart, nbnd, npol, psi_d, hpsi_d, overlap, spsi_d, e_d )
!----------------------------------------------------------------------------
!
! ... Serial version of rotate_wfc for colinear, k-point calculations
!
#if defined (__CUDA)
USE cudafor
#endif
USE util_param, ONLY : DP
USE mp_bands_util, ONLY : intra_bgrp_comm, inter_bgrp_comm, root_bgrp_id, nbgrp, my_bgrp_id, &
me_bgrp, root_bgrp
USE mp, ONLY : mp_sum, mp_barrier, mp_allgather, mp_type_create_column_section, mp_type_free
USE device_memcpy_m, ONLY: dev_memcpy, dev_memset
!
IMPLICIT NONE
!
INCLUDE 'laxlib.fh'
!
! ... I/O variables
!
INTEGER, INTENT(IN) :: &
npw, & ! dimension of the matrices (psi,Hpsi,Spsi) to be rotated
npwx, & ! leading dimension of the wavefunction-related matrices
nstart, & ! input number of states
nbnd, & ! output number of states
npol ! number of spin polarizations
COMPLEX(DP), INTENT(INOUT) :: psi_d(npwx*npol,nstart), hpsi_d(npwx*npol,nstart) ! input and output psi, Hpsi,
COMPLEX(DP), INTENT(INOUT), OPTIONAL :: spsi_d(npwx*npol,nstart) ! ... and optionally Spsi
LOGICAL, INTENT(IN) :: overlap ! if .FALSE. : spsi is not needed (and not used)
REAL(DP), INTENT(OUT) :: e_d(nbnd) ! eigenvalues of the reduced H matrix
!
! ... local variables
!
INTEGER :: kdim, kdmx
INTEGER :: n_start, n_end, my_n, recv_counts(nbgrp), displs(nbgrp), column_type
INTEGER :: ii, jj ! indexes for cuf kernel loops
!
! ... device variables
!
COMPLEX(DP), ALLOCATABLE :: aux_d(:,:)
COMPLEX(DP), ALLOCATABLE :: hh_d(:,:), ss_d(:,:), vv_d(:,:)
REAL(DP), ALLOCATABLE :: en_d(:)
#if defined (__CUDA)
attributes(device) :: aux_d, psi_d, hpsi_d, spsi_d
attributes(device) :: hh_d, ss_d, vv_d, en_d, e_d
#endif
!
IF ( overlap .AND..NOT.present(spsi_d) ) call errore( 'rotHSw','spsi_d array needed with overlap=.TRUE.',1)
!
call start_clock('rotHSw'); !write(*,*) 'start rotHSw' ; FLUSH(6)
!
if (npol == 2 .and. npw < npwx ) then ! pack wfcs so that pw's are contiguous
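! (the two spinor components of each column sit at offsets 0 and npwx;
! moving the second block to offset npw makes the kdim = npol*npw
! elements seen by the ZGEMMs below contiguous)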
call start_clock('rotHSw:move'); !write(*,*) 'start rotHSw:move' ; FLUSH(6)
ALLOCATE ( aux_d ( npwx, nstart ) )
!$cuf kernel do(2)
DO ii = 1, npw
DO jj = 1, nstart
aux_d(ii,jj) = psi_d (npwx+ii,jj)
END DO
END DO
!$cuf kernel do(2)
DO ii = 1, npw
DO jj = 1, nstart
psi_d (npw+ii,jj) = aux_d(ii,jj)
END DO
END DO
!$cuf kernel do(2)
DO ii = 1, npw
DO jj = 1, nstart
aux_d(ii,jj) = hpsi_d(npwx+ii,jj)
END DO
END DO
!$cuf kernel do(2)
DO ii = 1, npw
DO jj = 1, nstart
hpsi_d(npw+ii,jj) = aux_d(ii,jj)
END DO
END DO
!$cuf kernel do(2)
DO ii = 1, npw
DO jj = 1, nstart
aux_d(ii,jj) = spsi_d(npwx+ii,jj)
END DO
END DO
!$cuf kernel do(2)
DO ii = 1, npw
DO jj = 1, nstart
spsi_d(npw+ii,jj) = aux_d(ii,jj)
END DO
END DO
DEALLOCATE( aux_d )
call stop_clock('rotHSw:move'); !write(*,*) 'stop rotHSw:move' ; FLUSH(6)
end if
kdim = npol * npw
kdmx = npol * npwx
!
ALLOCATE( hh_d( nstart, nstart ) )
ALLOCATE( ss_d( nstart, nstart ) )
ALLOCATE( vv_d( nstart, nstart ) )
ALLOCATE( en_d( nstart ) )
!
! ... Set up the Hamiltonian and Overlap matrix on the subspace :
!
! ... H_ij = <psi_i| H |psi_j> S_ij = <psi_i| S |psi_j>
!
call start_clock('rotHSw:hc'); !write(*,*) 'start rotHSw:hc' ; FLUSH(6)
CALL mp_type_create_column_section(hh_d(1,1), 0, nstart, nstart, column_type)
CALL divide_all(inter_bgrp_comm,nstart,n_start,n_end,recv_counts,displs)
my_n = n_end - n_start + 1; !write (*,*) nstart,n_start,n_end
if (n_start .le. n_end) &
CALL gpu_ZGEMM( 'C','N', nstart, my_n, kdim, (1.D0,0.D0), psi_d, kdmx, hpsi_d(1,n_start), kdmx, (0.D0,0.D0), &
hh_d(1,n_start), nstart )
call start_clock('rotHSw:hc:s1')
CALL mp_sum( hh_d(:,n_start:n_end), intra_bgrp_comm ) ! this section only needs to be collected inside bgrp
call stop_clock('rotHSw:hc:s1')
!call start_clock('rotHSw:hc:b1'); CALL mp_barrier( inter_bgrp_comm ); call stop_clock('rotHSw:hc:b1')
call start_clock('rotHSw:hc:s2')
CALL mp_allgather(hh_d, column_type, recv_counts, displs, inter_bgrp_comm)
call stop_clock('rotHSw:hc:s2')
!
IF ( overlap ) THEN
!
if (n_start .le. n_end) &
CALL gpu_ZGEMM( 'C','N', nstart, my_n, kdim, (1.D0,0.D0), psi_d, kdmx, spsi_d(1,n_start), kdmx, &
(0.D0,0.D0), ss_d(1,n_start), nstart )
!
ELSE
!
if (n_start .le. n_end) &
CALL gpu_ZGEMM( 'C','N', nstart, my_n, kdim, (1.D0,0.D0), psi_d, kdmx, psi_d(1,n_start), kdmx, &
(0.D0,0.D0), ss_d(1,n_start), nstart )
!
END IF
call start_clock('rotHSw:hc:s3')
CALL mp_sum( ss_d(:,n_start:n_end), intra_bgrp_comm ) ! this section only needs to be collected inside bgrp
call stop_clock('rotHSw:hc:s3')
!call start_clock('rotHSw:hc:b2'); CALL mp_barrier( inter_bgrp_comm ); call stop_clock('rotHSw:hc:b2')
call start_clock('rotHSw:hc:s4')
CALL mp_allgather(ss_d, column_type, recv_counts, displs, inter_bgrp_comm)
call stop_clock('rotHSw:hc:s4')
CALL mp_type_free( column_type )
call stop_clock('rotHSw:hc'); !write(*,*) 'stop rotHSw:hc' ; FLUSH(6)
!
! ... Diagonalize
!
call start_clock('rotHSw:diag'); !write(*,*) 'start rotHSw:diag' ; FLUSH(6)
CALL diaghg( nstart, nbnd, hh_d, ss_d, nstart, en_d, vv_d, me_bgrp, root_bgrp, intra_bgrp_comm )
CALL dev_memcpy(e_d, en_d, [1,nbnd])
call stop_clock('rotHSw:diag'); !write(*,*) 'stop rotHSw:diag' ; FLUSH(6)
!
! ... update the basis set
!
call start_clock('rotHSw:evc'); !write(*,*) 'start rotHSw:evc' ; FLUSH(6)
CALL mp_type_create_column_section(psi_d(1,1), 0, npwx, npwx, column_type)
CALL divide_all(inter_bgrp_comm,nbnd,n_start,n_end,recv_counts,displs)
ALLOCATE( aux_d ( kdmx, nbnd ) )
my_n = n_end - n_start + 1; !write (*,*) nstart,n_start,n_end
if (n_start .le. n_end) &
CALL gpu_ZGEMM( 'N','N', kdim, my_n, nstart, (1.D0,0.D0), psi_d, kdmx, vv_d(1,n_start), nstart, &
(0.D0,0.D0), aux_d(1,n_start), kdmx )
CALL dev_memcpy(psi_d, aux_d, [1, kdmx], 1, [n_start,n_end])
!call start_clock('rotHSw:ev:b3'); CALL mp_barrier( inter_bgrp_comm ); call stop_clock('rotHSw:ev:b3')
call start_clock('rotHSw:ev:s5')
CALL mp_allgather(psi_d(:,1:nbnd), column_type, recv_counts, displs, inter_bgrp_comm)
call stop_clock('rotHSw:ev:s5')
if (n_start .le. n_end) &
CALL gpu_ZGEMM( 'N','N', kdim, my_n, nstart, (1.D0,0.D0), hpsi_d, kdmx, vv_d(1,n_start), nstart, &
(0.D0,0.D0), aux_d(1,n_start), kdmx )
CALL dev_memcpy(hpsi_d, aux_d, [1, kdmx], 1, [n_start,n_end]) !call start_clock('rotHSw:ev:b4'); CALL mp_barrier( inter_bgrp_comm ); call stop_clock('rotHSw:ev:b4')
call start_clock('rotHSw:ev:s6')
CALL mp_allgather(hpsi_d(:,1:nbnd), column_type, recv_counts, displs, inter_bgrp_comm)
call stop_clock('rotHSw:ev:s6')
IF (overlap) THEN
if (n_start .le. n_end) &
CALL gpu_ZGEMM( 'N','N', kdim, my_n, nstart, (1.D0,0.D0), spsi_d, kdmx, vv_d(1,n_start), &
nstart, (0.D0,0.D0), aux_d(1,n_start), kdmx )
CALL dev_memcpy(spsi_d, aux_d, [1, kdmx], 1, [n_start,n_end]) !call start_clock('rotHSw:ev:b5'); CALL mp_barrier( inter_bgrp_comm ); call stop_clock('rotHSw:ev:b5')
call start_clock('rotHSw:ev:s7')
CALL mp_allgather(spsi_d(:,1:nbnd), column_type, recv_counts, displs, inter_bgrp_comm)
call stop_clock('rotHSw:ev:s7')
ELSE IF (present(spsi_d)) THEN
CALL dev_memcpy(spsi_d, psi_d, [1, kdmx], 1, [n_start,n_end])
END IF
DEALLOCATE( aux_d )
CALL mp_type_free( column_type )
call stop_clock('rotHSw:evc'); !write(*,*) 'stop rotHSw:evc' ; FLUSH(6)
!
DEALLOCATE( vv_d )
DEALLOCATE( ss_d )
DEALLOCATE( hh_d )
DEALLOCATE( en_d )
!
call stop_clock('rotHSw'); !write(*,*) 'stop rotHSw' ; FLUSH(6)
!call print_clock('rotHSw')
!call print_clock('rotHSw:hc')
!call print_clock('rotHSw:diag')
!call print_clock('rotHSw:evc')
!
if (npol== 2 .and. npw < npwx ) then ! unpack wfcs to its original order
call start_clock('rotHSw:move'); !write(*,*) 'start rotHSw:move' ; FLUSH(6)
ALLOCATE ( aux_d ( npwx, nbnd ) )
!$cuf kernel do(2)
DO ii = 1, npw
DO jj = 1, nbnd
aux_d(ii,jj) = psi_d (npw+ii,jj)
END DO
END DO
!$cuf kernel do(2)
DO ii = 1, npw
DO jj = 1, nbnd
psi_d(npwx+ii,jj) = aux_d(ii,jj)
END DO
END DO
!$cuf kernel do(2)
DO ii = 1, npw
DO jj = 1, nbnd
aux_d(ii,jj) = hpsi_d(npw+ii,jj)
END DO
END DO
!$cuf kernel do(2)
DO ii = 1, npw
DO jj = 1, nbnd
hpsi_d(npwx+ii,jj) = aux_d(ii,jj)
END DO
END DO
!$cuf kernel do(2)
DO ii = 1, npw
DO jj = 1, nbnd
aux_d(ii,jj) = spsi_d(npw+ii,jj)
END DO
END DO
!$cuf kernel do(2)
DO ii = 1, npw
DO jj = 1, nbnd
spsi_d(npwx+ii,jj) = aux_d(ii,jj)
END DO
END DO
!$cuf kernel do(2)
DO ii = npw+1, npwx
DO jj = 1, nbnd
psi_d(ii,jj) = ZERO
END DO
END DO
!$cuf kernel do(2)
DO ii = npw+1, npwx
DO jj = 1, nbnd
hpsi_d(ii,jj) = ZERO
END DO
END DO
!$cuf kernel do(2)
DO ii = npw+1, npwx
DO jj = 1, nbnd
spsi_d(ii,jj) = ZERO
END DO
END DO
DEALLOCATE( aux_d )
call stop_clock('rotHSw:move'); !write(*,*) 'stop rotHSw:move' ; FLUSH(6)
end if
!
RETURN
!
END SUBROUTINE rotate_HSpsi_k_gpu

198
KS_Solvers/DENSE/rotate_wfc_gamma_gpu.f90 Normal file
View File

@ -0,0 +1,198 @@
!
! Copyright (C) 2003-2007 Quantum ESPRESSO group
! This file is distributed under the terms of the
! GNU General Public License. See the file `License'
! in the root directory of the present distribution,
! or http://www.gnu.org/copyleft/gpl.txt .
!
!
!----------------------------------------------------------------------------
SUBROUTINE rotate_wfc_gamma_gpu( h_psi_gpu, s_psi_gpu, overlap, &
npwx, npw, nstart, nbnd, psi_d, evc_d, e_d )
!----------------------------------------------------------------------------
!
! ... Serial version of rotate_wfc for Gamma-only calculations
! ... This version assumes real wavefunctions (k=0) with only
! ... half plane waves stored: psi_d(-G)=psi_d*(G), except G=0
!
USE LAXlib, ONLY : diaghg
#if defined(__CUDA)
USE cudafor
USE cublas
#else
#define cublasDGEMM dgemm
#endif
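! (without CUDA the cublasDGEMM calls below resolve to plain BLAS dgemm
! through the preprocessor macro above, so the same source serves both builds)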
USE util_param, ONLY : DP
USE mp_bands_util, ONLY : intra_bgrp_comm, inter_bgrp_comm, root_bgrp_id, &
nbgrp, my_bgrp_id, me_bgrp, root_bgrp
USE mp_bands_util, ONLY : gstart ! index of the first nonzero G
USE mp, ONLY : mp_sum
!
IMPLICIT NONE
!
!INCLUDE 'laxlib.fh'
!
! ... I/O variables
!
INTEGER :: npw, npwx, nstart, nbnd
! dimension of the matrix to be diagonalized
! leading dimension of matrix psi_d, as declared in the calling pgm unit
! input number of states
! output number of states
LOGICAL :: overlap
! if .FALSE. : S|psi_d> not needed
COMPLEX(DP) :: psi_d(npwx,nstart), evc_d(npwx,nbnd)
! input and output eigenvectors (may overlap)
REAL(DP) :: e_d(nbnd)
! eigenvalues
#if defined(__CUDA)
attributes(DEVICE) :: psi_d, evc_d, e_d
#endif
!
! ... local variables
!
INTEGER :: npw2, npwx2
COMPLEX(DP), ALLOCATABLE :: aux_d(:,:)
REAL(DP), ALLOCATABLE :: hr_d(:,:), sr_d(:,:), vr_d(:,:)
REAL(DP), ALLOCATABLE :: en_d(:)
#if defined(__CUDA)
attributes(DEVICE) :: aux_d, hr_d, sr_d, vr_d, en_d
#endif
INTEGER :: n_start, n_end, my_n, i, j
!
EXTERNAL h_psi_gpu, s_psi_gpu
! h_psi(npwx,npw,nvec,psi_d,hpsi)
! calculates H|psi_d>
! s_psi(npwx,npw,nvec,spsi)
! calculates S|psi_d> (if needed)
! Vectors psi_d,hpsi,spsi are dimensioned (npwx,npol,nvec)
npw2 = 2 * npw
npwx2 = 2 * npwx
IF ( gstart == -1 ) CALL errore( 'rotwfcg', 'gstart variable not initialized', 1 )
!
ALLOCATE( aux_d( npwx, nstart ) )
ALLOCATE( hr_d( nstart, nstart ) )
ALLOCATE( sr_d( nstart, nstart ) )
ALLOCATE( vr_d( nstart, nstart ) )
ALLOCATE( en_d( nstart ) )
call start_clock('rotwfcg'); !write(*,*) 'start rotwfcg' ; FLUSH(6)
!
! ... Set up the Hamiltonian and Overlap matrix on the subspace :
!
! ... H_ij = <psi_i| H |psi_j> S_ij = <psi_i| S |psi_j>
!
! ... set Im[ psi_d(G=0) ] - needed for numerical stability
!
IF ( gstart == 2 ) THEN
!$cuf kernel do(1)
DO i=1,nstart
psi_d(1,i) = CMPLX( DBLE( psi_d(1,i) ), 0.D0,kind=DP)
END DO
END IF
!
call start_clock('rotwfcg:hpsi'); !write(*,*) 'start rotwfcg:hpsi' ; FLUSH(6)
CALL h_psi_gpu( npwx, npw, nstart, psi_d, aux_d )
call stop_clock('rotwfcg:hpsi'); !write(*,*) 'stop rotwfcg:hpsi' ; FLUSH(6)
!
call start_clock('rotwfcg:hc'); !write(*,*) 'start rotwfcg:hc' ; FLUSH(6)
hr_d=0.D0
CALL divide(inter_bgrp_comm,nstart,n_start,n_end)
my_n = n_end - n_start + 1; !write (*,*) nstart,n_start,n_end
if (n_start .le. n_end) &
CALL cublasDGEMM( 'T','N', nstart, my_n, npw2, 2.D0, psi_d, &
npwx2, aux_d(1,n_start), npwx2, 0.D0, hr_d(1,n_start), nstart )
IF ( gstart == 2 ) call CGcudaDGER( nstart, my_n, -1.D0, psi_d, &
npwx2, aux_d(1,n_start), npwx2, hr_d(1,n_start), nstart )
CALL mp_sum( hr_d, inter_bgrp_comm )
!
CALL mp_sum( hr_d, intra_bgrp_comm )
!
sr_d=0.D0
IF ( overlap ) THEN
!
CALL s_psi_gpu( npwx, npw, nstart, psi_d, aux_d )
!
if (n_start .le. n_end) &
CALL cublasDGEMM( 'T','N', nstart, my_n, npw2, 2.D0, psi_d, &
npwx2, aux_d(1,n_start), npwx2, 0.D0, sr_d(1,n_start), nstart )
IF ( gstart == 2 ) CALL CGcudaDGER( nstart, my_n, -1.D0, psi_d, &
npwx2, aux_d(1,n_start), npwx2, sr_d(1,n_start), nstart )
!
ELSE
!
if (n_start .le. n_end) &
CALL cublasDGEMM( 'T','N', nstart, my_n, npw2, 2.D0, psi_d, &
npwx2, psi_d(1,n_start), npwx2, 0.D0, sr_d(1,n_start), nstart )
IF ( gstart == 2 ) CALL CGcudaDGER( nstart, my_n, -1.D0, psi_d, &
npwx2, psi_d(1,n_start), npwx2, sr_d(1,n_start), nstart )
!
END IF
CALL mp_sum( sr_d, inter_bgrp_comm )
!
CALL mp_sum( sr_d, intra_bgrp_comm )
call stop_clock('rotwfcg:hc'); !write(*,*) 'stop rotwfcg:hc' ; FLUSH(6)
!
! ... Diagonalize
!
call start_clock('rotwfcg:diag'); !write(*,*) 'start rotwfcg:diag' ; FLUSH(6)
CALL diaghg( nstart, nbnd, hr_d, sr_d, nstart, en_d, vr_d, me_bgrp, root_bgrp, intra_bgrp_comm )
call stop_clock('rotwfcg:diag'); !write(*,*) 'stop rotwfcg:diag' ; FLUSH(6)
call start_clock('rotwfcg:evc_d'); !write(*,*) 'start rotwfcg:evc_d' ; FLUSH(6)
!
!$cuf kernel do(1)
DO i=1, nbnd
e_d(i) = en_d(i)
END DO
!
! ... update the basis set
!
aux_d=(0.D0,0.D0)
if (n_start .le. n_end) &
CALL cublasDGEMM( 'N','N', npw2, nbnd, my_n, 1.D0, psi_d(1,n_start), &
npwx2, vr_d(n_start,1), nstart, 0.D0, aux_d, npwx2 )
CALL mp_sum( aux_d, inter_bgrp_comm )
!
!$cuf kernel do(2)
DO i=1, nbnd
DO j=1, npwx
evc_d(j,i) = aux_d(j,i)
END DO
END DO
call stop_clock('rotwfcg:evc_d'); !write(*,*) 'stop rotwfcg:evc_d' ; FLUSH(6)
!
DEALLOCATE( en_d )
DEALLOCATE( vr_d )
DEALLOCATE( sr_d )
DEALLOCATE( hr_d )
DEALLOCATE( aux_d )
call stop_clock('rotwfcg'); !write(*,*) 'stop rotwfcg' ; FLUSH(6)
!call print_clock('rotwfcg')
!call print_clock('rotwfcg:hpsi')
!call print_clock('rotwfcg:hc')
!call print_clock('rotwfcg:diag')
!call print_clock('rotwfcg:evc_d')
!
RETURN
!
END SUBROUTINE rotate_wfc_gamma_gpu
! In principle this can go away .......
SUBROUTINE CGcudaDGER ( M, N, ALPHA, X, INCX, Y, INCY, A, LDA )
#if defined(__CUDA)
use cudafor
use cublas
#endif
! .. Scalar Arguments ..
DOUBLE PRECISION :: ALPHA
INTEGER :: INCX, INCY, LDA, M, N
! .. Array Arguments ..
DOUBLE PRECISION :: A( LDA, * ), X( * ), Y( * )
#if defined(__CUDA)
attributes(device) :: A, X, Y
#endif
CALL DGER ( M, N, ALPHA, X, INCX, Y, INCY, A, LDA )
END SUBROUTINE CGcudaDGER

170
KS_Solvers/DENSE/rotate_wfc_k_gpu.f90 Normal file
View File

@ -0,0 +1,170 @@
!
! Copyright (C) 2001-2007 Quantum ESPRESSO group
! This file is distributed under the terms of the
! GNU General Public License. See the file `License'
! in the root directory of the present distribution,
! or http://www.gnu.org/copyleft/gpl.txt .
!
!
!----------------------------------------------------------------------------
SUBROUTINE rotate_wfc_k_gpu( h_psi_gpu, s_psi_gpu, overlap, &
npwx, npw, nstart, nbnd, npol, psi_d, evc_d, e_d )
!----------------------------------------------------------------------------
!
! ... Serial version of rotate_wfc for colinear, k-point calculations
!
USE LAXlib
#if defined(__CUDA)
USE cudafor
USE cublas
#endif
USE util_param, ONLY : DP
USE mp_bands_util, ONLY : intra_bgrp_comm, inter_bgrp_comm, root_bgrp_id,&
nbgrp, my_bgrp_id, me_bgrp, root_bgrp
USE mp, ONLY : mp_sum
!
IMPLICIT NONE
!
!INCLUDE 'laxlib.fh'
!
! ... I/O variables
!
INTEGER, INTENT(IN) :: npw, npwx, nstart, nbnd, npol
! dimension of the matrix to be diagonalized
! leading dimension of matrix psi, as declared in the calling pgm unit
! input number of states
! output number of states
! number of spin polarizations
LOGICAL :: overlap
! if .FALSE. : S|psi> not needed
COMPLEX(DP) :: psi_d(npwx*npol,nstart), evc_d(npwx*npol,nbnd)
! input and output eigenvectors (may overlap)
REAL(DP) :: e_d(nbnd)
! eigenvalues
#if defined(__CUDA)
attributes(DEVICE) :: psi_d, evc_d, e_d
#endif
!
! ... local variables
!
INTEGER :: kdim, kdmx
COMPLEX(DP), ALLOCATABLE :: aux_d(:,:)
COMPLEX(DP), ALLOCATABLE :: hc_d(:,:), sc_d(:,:), vc_d(:,:)
REAL(DP), ALLOCATABLE :: en_d(:)
#if defined(__CUDA)
attributes(DEVICE) :: aux_d, hc_d, sc_d, vc_d, en_d
#endif
INTEGER :: n_start, n_end, my_n, i, j
!
EXTERNAL h_psi_gpu, s_psi_gpu
! h_psi(npwx,npw,nvec,psi,hpsi)
! calculates H|psi>
! s_psi(npwx,npw,nvec,spsi)
! calculates S|psi> (if needed)
! Vectors psi,hpsi,spsi are dimensioned (npwx,npol,nvec)
IF ( npol == 1 ) THEN
!
kdim = npw
kdmx = npwx
!
ELSE
!
kdim = npwx*npol
kdmx = npwx*npol
!
END IF
!
ALLOCATE( aux_d(kdmx, nstart ) )
ALLOCATE( hc_d( nstart, nstart) )
ALLOCATE( sc_d( nstart, nstart) )
ALLOCATE( vc_d( nstart, nstart) )
ALLOCATE( en_d( nstart ) )
call start_clock('rotwfck'); !write(*,*) 'start rotwfck';FLUSH(6)
!
! ... Set up the Hamiltonian and Overlap matrix on the subspace :
!
! ... H_ij = <psi_i| H |psi_j> S_ij = <psi_i| S |psi_j>
!
call start_clock('rotwfck:hpsi'); !write(*,*) 'start rotwfck:hpsi';FLUSH(6)
CALL h_psi_gpu( npwx, npw, nstart, psi_d, aux_d )
call stop_clock('rotwfck:hpsi') ; !write(*,*) 'stop rotwfck:hpsi';FLUSH(6)
!
call start_clock('rotwfck:hc'); !write(*,*) 'start rotwfck:hc';FLUSH(6)
hc_d=(0.D0,0.D0)
CALL divide(inter_bgrp_comm,nstart,n_start,n_end)
my_n = n_end - n_start + 1; !write (*,*) nstart,n_start,n_end
if (n_start .le. n_end) &
call ZGEMM( 'C','N', nstart, my_n, kdim, (1.D0,0.D0), psi_d, &
kdmx, aux_d(1,n_start), kdmx, (0.D0,0.D0), hc_d(1,n_start), nstart )
CALL mp_sum( hc_d, inter_bgrp_comm )
!
CALL mp_sum( hc_d, intra_bgrp_comm )
!
sc_d=(0.D0,0.D0)
IF ( overlap ) THEN
!
CALL s_psi_gpu( npwx, npw, nstart, psi_d, aux_d )
if (n_start .le. n_end) &
CALL ZGEMM( 'C','N', nstart, my_n, kdim, (1.D0,0.D0), psi_d, &
kdmx, aux_d(1,n_start), kdmx, (0.D0,0.D0), sc_d(1,n_start), nstart )
!
ELSE
!
if (n_start .le. n_end) &
CALL ZGEMM( 'C','N', nstart, my_n, kdim, (1.D0,0.D0), psi_d, &
kdmx, psi_d(1,n_start), kdmx, (0.D0,0.D0), sc_d(1,n_start), nstart )
!
END IF
CALL mp_sum( sc_d, inter_bgrp_comm )
!
CALL mp_sum( sc_d, intra_bgrp_comm )
call stop_clock('rotwfck:hc'); !write(*,*) 'stop rotwfck:hc';FLUSH(6)
!
! ... Diagonalize
!
call start_clock('rotwfck:diag'); !write(*,*) 'start rotwfck:diag';FLUSH(6)
!! cdiaghg on GPU. See interface from LAXlib module
CALL diaghg( nstart, nbnd, hc_d, sc_d, nstart, en_d, vc_d, me_bgrp, root_bgrp, intra_bgrp_comm )
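  ! diaghg solves the generalized eigenproblem hc_d x = e sc_d x on the
  ! nstart-dimensional subspace and returns the nbnd lowest eigenvalues in en_d
  ! and the corresponding eigenvectors in vc_d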
call stop_clock('rotwfck:diag'); !write(*,*) 'stop rotwfck:diag';FLUSH(6)
call start_clock('rotwfck:evc'); !write(*,*) 'start rotwfck:evc';FLUSH(6)
!
!$cuf kernel do(1) <<<*,*>>>
DO i=1,nbnd
e_d(i) = en_d(i)
END DO
!
! ... update the basis set
!
aux_d=(0.D0,0.D0)
if (n_start .le. n_end) &
CALL ZGEMM( 'N','N', kdim, nbnd, my_n, (1.D0,0.D0), psi_d(1,n_start), &
kdmx, vc_d(n_start,1), nstart, (0.D0,0.D0), aux_d, kdmx )
CALL mp_sum( aux_d, inter_bgrp_comm )
!
!
!$cuf kernel do(1) <<<*,*>>>
DO i=1, nbnd
DO j=1, kdmx
evc_d(j,i) = aux_d(j,i)
END DO
END DO
!
call stop_clock('rotwfck:evc') ! ; write(*,*) 'stop rotwfck;evc';FLUSH(6)
!
DEALLOCATE( en_d )
DEALLOCATE( vc_d )
DEALLOCATE( sc_d )
DEALLOCATE( hc_d )
DEALLOCATE( aux_d )
call stop_clock('rotwfck'); !write(*,*) 'stop rotwfck';FLUSH(6)
!call print_clock('rotwfck')
!call print_clock('rotwfck:hpsi')
!call print_clock('rotwfck:hc')
!call print_clock('rotwfck:diag')
!call print_clock('rotwfck:evc')
!
RETURN
!
END SUBROUTINE rotate_wfc_k_gpu
!

View File

@ -9,6 +9,8 @@ DAVID = \
cegterg.o \
regterg.o
-include make.gpu
all : libdavid.a

File diff suppressed because it is too large

View File

@ -0,0 +1,13 @@
# Makefile for DAVID GPU
DAVID += \
cegterg_gpu.o \
regterg_gpu.o
ifdef PGI_POWER_WORKAROUND
# Circumvents bug in the Power implementation of PGI compilers
F90FLAGS:= -O1 $(filter-out -fast,$(F90FLAGS))
endif
cegterg_gpu.o : ../../UtilXlib/util_param.o
regterg_gpu.o : ../../UtilXlib/util_param.o

View File

@ -22,7 +22,6 @@ SUBROUTINE regterg( h_psi, s_psi, uspp, g_psi, &
! ... where H is a Hermitian operator, e is a real scalar,
! ... S is an uspp matrix, evc is a complex vector
! ... (real wavefunctions with only half plane waves stored)
USE util_param, ONLY : DP, stdout
USE mp_bands_util, ONLY : intra_bgrp_comm, inter_bgrp_comm, root_bgrp_id, &
nbgrp, my_bgrp_id, me_bgrp, root_bgrp

File diff suppressed because it is too large

View File

@ -32,7 +32,10 @@ DENSE/rotate_HSpsi_gamma.o \
DENSE/rotate_HSpsi_k.o \
DENSE/rotate_wfc_gamma.o \
DENSE/rotate_wfc_k.o
# add here other objects, e.g. NewSolver/*.o
-include make.gpu
# add here other objects, e.g. ParO/*.o NewSolver/*.o
libks_solvers.a: $(ALLOBJS)
$(AR) $(ARFLAGS) $@ $?

View File

@ -6,8 +6,11 @@ include ../../make.inc
MODFLAGS= $(MOD_FLAG) ../../ELPA/src $(MOD_FLAG) ../../LAXlib $(MOD_FLAG) ../../UtilXlib $(MOD_FLAG).
PPCG = \
generic_cublas.o \
ppcg_gamma.o \
ppcg_k.o
ppcg_gamma_gpu.o \
ppcg_k.o \
ppcg_k_gpu.o
all : libppcg.a

View File

@ -0,0 +1,221 @@
subroutine gpu_DGEMM (transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc)
#if defined(__CUDA)
USE cublas
#endif
implicit none
character*1 transa, transb
integer :: m, n, k, lda, ldb, ldc
DOUBLE PRECISION :: alpha, beta
DOUBLE PRECISION, dimension(lda, *) :: A
DOUBLE PRECISION, dimension(ldb, *) :: B
DOUBLE PRECISION, dimension(ldc, *) :: C
#if defined(__CUDA)
attributes(device) :: A, B, C
call cublasDGEMM(transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc)
#endif
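! NOTE: when __CUDA is not defined the body above is empty, so this wrapper
! (like the others in this file) is a no-op; no host BLAS fallback is provided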
return
end subroutine gpu_DGEMM
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
subroutine gpu_ZGEMM (transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc)
#if defined(__CUDA)
USE cublas
#endif
implicit none
character*1 transa, transb
integer :: m, n, k, lda, ldb, ldc
DOUBLE COMPLEX :: alpha, beta
DOUBLE COMPLEX, dimension(lda, *) :: A
DOUBLE COMPLEX, dimension(ldb, *) :: B
DOUBLE COMPLEX, dimension(ldc, *) :: C
#if defined(__CUDA)
attributes(device) :: A, B, C
call cublasZGEMM(transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc)
#endif
return
end subroutine gpu_ZGEMM
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
subroutine gpu_DGER (m, n, alpha, x, incx, y, incy, a, lda)
#if defined(__CUDA)
USE cublas
#endif
implicit none
integer :: m, n, lda, incx, incy
DOUBLE PRECISION :: alpha
DOUBLE PRECISION, dimension(lda, *) :: A
DOUBLE PRECISION, dimension(*) :: x, y
#if defined(__CUDA)
attributes(device) :: A, x, y
call cublasDGER(m, n, alpha, x, incx, y, incy, a, lda)
#endif
return
end subroutine gpu_DGER
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
function gpu_DDOT (n, dx, incx, dy, incy)
#if defined(__CUDA)
USE cublas
#endif
implicit none
DOUBLE PRECISION :: gpu_DDOT
integer :: n, incx, incy
DOUBLE PRECISION, dimension(*) :: dx, dy
#if defined(__CUDA)
attributes(device) :: dx, dy
gpu_DDOT=cublasDDOT(n, dx, incx, dy, incy)
#endif
return
end function gpu_DDOT
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
subroutine gpu_DTRSM(side, uplo, transa, diag, m, n, alpha, a, lda, b, ldb)
#if defined(__CUDA)
USE cublas
#endif
implicit none
character*1 :: side, uplo, transa, diag
integer :: m, n, lda, ldb
DOUBLE PRECISION :: alpha
DOUBLE PRECISION, dimension(lda, *) :: a
DOUBLE PRECISION, dimension(ldb, *) :: b
#if defined(__CUDA)
attributes(device) :: a, b
call cublasDTRSM(side, uplo, transa, diag, m, n, alpha, a, lda, b, ldb)
#endif
return
end subroutine gpu_DTRSM
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
SUBROUTINE gpu_threaded_memset(array, val, length)
!
#if defined(__CUDA)
USE cudafor
#endif
USE util_param, ONLY : DP
!
IMPLICIT NONE
!
INTEGER, INTENT(IN) :: length
REAL(DP), INTENT(OUT) :: array(length)
#if defined(__CUDA)
attributes(device) :: array
#endif
REAL(DP), INTENT(IN) :: val
!
INTEGER :: i
!
IF (length<=0) RETURN
!
!$cuf kernel do(1)
DO i=1, length
array(i) = val
ENDDO
!
END SUBROUTINE gpu_threaded_memset
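! Typical use (see gpu_threaded_assign below): zero a COMPLEX(DP) device buffer
! by viewing it as reals, e.g.
!   CALL gpu_threaded_memset( array_out, 0.d0, 2*kdimx*nact )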
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
SUBROUTINE gpu_threaded_assign(array_out, array_in, kdimx, nact, use_idx, idx, bgrp_root_only)
! assign (copy) a complex array in a threaded way
! array_out( 1:kdimx, 1:nact ) = array_in( 1:kdimx, 1:nact ) or
! array_out( 1:kdimx, 1:nact ) = array_in( 1:kdimx, idx(1:nact) )
! when use_idx is .true. (the index array idx is then used)
! if bgrp_root_only is .true. the assignment is made only by the
! MPI root process of the bgrp; on the other processes array_out is zeroed
#if defined(__CUDA)
USE cudafor
#endif
USE util_param, ONLY : DP
USE mp_bands_util, ONLY : root_bgrp_id, nbgrp, my_bgrp_id
!
IMPLICIT NONE
!
INTEGER, INTENT(IN) :: kdimx, nact
COMPLEX(DP), INTENT(OUT) :: array_out( kdimx, nact )
COMPLEX(DP), INTENT(IN) :: array_in ( kdimx, * )
INTEGER, INTENT(IN) :: idx( * )
#if defined(__CUDA)
attributes(device) :: array_out, array_in, idx
#endif
LOGICAL, INTENT(IN) :: bgrp_root_only
LOGICAL, INTENT(IN) :: use_idx
!
INTEGER, PARAMETER :: blocksz = 256
INTEGER :: nblock
INTEGER :: i, j
!
IF (kdimx <=0 .OR. nact<= 0) RETURN
!
IF (bgrp_root_only .AND. ( my_bgrp_id /= root_bgrp_id ) ) THEN
call gpu_threaded_memset( array_out, 0.d0, 2*kdimx*nact ) ! zero the device array, viewed as reals
RETURN
END IF
nblock = (kdimx - 1)/blocksz + 1
IF (use_idx ) THEN
!$cuf kernel do(2)
DO i=1, nact
DO j=1,nblock
array_out(1+(j-1)*blocksz:MIN(j*blocksz,kdimx), i ) = array_in(1+(j-1)*blocksz:MIN(j*blocksz,kdimx), idx( i ) )
ENDDO
ENDDO
ELSE
!$cuf kernel do(2)
DO i=1, nact
DO j=1,nblock
array_out(1+(j-1)*blocksz:MIN(j*blocksz,kdimx), i ) = array_in(1+(j-1)*blocksz:MIN(j*blocksz,kdimx), i )
ENDDO
ENDDO
END IF
!
END SUBROUTINE gpu_threaded_assign
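! Illustrative call (buffer names are placeholders): gather the nact columns of
! psi_d selected by idx_d into aux_d on every process of the band group:
!   CALL gpu_threaded_assign( aux_d, psi_d, kdimx, nact, .TRUE., idx_d, .FALSE. )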
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
SUBROUTINE gpu_threaded_backassign(array_out, idx, array_in, kdimx, nact, use_a2, a2_in )
! assign (copy) a complex array in a threaded way
! array_out( 1:kdimx, idx(1:nact) ) = array_in( 1:kdimx, 1:nact ) or
! array_out( 1:kdimx, idx(1:nact) ) = array_in( 1:kdimx, 1:nact ) + a2_in( 1:kdimx, idx(1:nact) )
! when use_a2 is .true.
! (the index array idx is mandatory here; otherwise one could use the previous routine)
#if defined(__CUDA)
USE cudafor
#endif
USE util_param, ONLY : DP
!
IMPLICIT NONE
!
INTEGER, INTENT(IN) :: kdimx, nact
COMPLEX(DP), INTENT(INOUT) :: array_out( kdimx, * ) ! we don't want to mess with unreferenced columns
COMPLEX(DP), INTENT(IN) :: array_in ( kdimx, nact )
COMPLEX(DP), INTENT(IN) :: a2_in ( kdimx, * )
INTEGER, INTENT(IN) :: idx( * )
#if defined(__CUDA)
attributes(device) :: array_out, array_in, a2_in, idx
#endif
LOGICAL, INTENT(IN) :: use_a2
!
INTEGER, PARAMETER :: blocksz = 256
INTEGER :: nblock
INTEGER :: i, j
!
IF (kdimx <=0 .OR. nact<= 0) RETURN
!
nblock = (kdimx - 1)/blocksz + 1
IF ( use_a2) THEN
!$cuf kernel do(2)
DO i=1, nact
DO j=1,nblock
array_out(1+(j-1)*blocksz:MIN(j*blocksz,kdimx), idx( i ) ) = &
array_in(1+(j-1)*blocksz:MIN(j*blocksz,kdimx), i ) + &
a2_in(1+(j-1)*blocksz:MIN(j*blocksz,kdimx), idx( i ) )
ENDDO
ENDDO
ELSE
!$cuf kernel do(2)
DO i=1, nact
DO j=1,nblock
array_out(1+(j-1)*blocksz:MIN(j*blocksz,kdimx), idx( i ) ) = array_in(1+(j-1)*blocksz:MIN(j*blocksz,kdimx), i )
ENDDO
ENDDO
END IF
!
END SUBROUTINE gpu_threaded_backassign
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

View File

@ -1,6 +1,14 @@
generic_cublas.o : ../../UtilXlib/mp_bands_util.o
generic_cublas.o : ../../UtilXlib/util_param.o
ppcg_gamma.o : ../../UtilXlib/mp.o
ppcg_gamma.o : ../../UtilXlib/mp_bands_util.o
ppcg_gamma.o : ../../UtilXlib/util_param.o
ppcg_gamma_gpu.o : ../../UtilXlib/mp.o
ppcg_gamma_gpu.o : ../../UtilXlib/mp_bands_util.o
ppcg_gamma_gpu.o : ../../UtilXlib/util_param.o
ppcg_k.o : ../../UtilXlib/mp.o
ppcg_k.o : ../../UtilXlib/mp_bands_util.o
ppcg_k.o : ../../UtilXlib/util_param.o
ppcg_k_gpu.o : ../../UtilXlib/mp.o
ppcg_k_gpu.o : ../../UtilXlib/mp_bands_util.o
ppcg_k_gpu.o : ../../UtilXlib/util_param.o

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -13,6 +13,8 @@ pcg_k.o \
bpcg_gamma.o \
bpcg_k.o
-include make.gpu
all : libparo.a
libparo.a: $(PARO)

View File

@ -0,0 +1,404 @@
! Copyright (C) 2015-2016 Aihui Zhou's group
!
! This file is distributed under the terms of the
! GNU General Public License. See the file `License'
! in the root directory of the present distribution,
! or http://www.gnu.org/copyleft/gpl.txt .
!
!-------------------------------------------------------------------------------
!
! We propose some parallel orbital-updating based plane-wave basis methods
! for electronic structure calculations, which aim at the solution of the corresponding eigenvalue
! problems. Compared to traditional plane-wave methods, our methods feature two-level
! parallelization, which gives them a great advantage in large-scale parallel calculations.
!
! The following is the parallel orbital-updating algorithm:
! 1. Choose initial $E_{\mathrm{cut}}^{(0)}$ and then obtain $V_{N_G^{0}}$, use the SCF method to solve
! the Kohn-Sham equation in $V_{G_0}$ and get the initial $(\lambda_i^{0},u_i^{0}), i=1, \cdots, N$
! and let $n=0$.
! 2. For $i=1,2,\ldots,N$, find $e_i^{n+1/2}\in V_{G_n}$ satisfying
! $$a(\rho_{in}^{n}; e_i^{n+1/2}, v) = -[(a(\rho_{in}^{n}; u_i^{n}, v) - \lambda_i^{n} (u_i^{n}, v))] $$
! in parallel, where $\rho_{in}^{n}$ is the input charge density obtained from the orbitals of the
! $n$-th iteration or of previous iterations.
! 3. Find $\{\lambda_i^{n+1},u_i^{n+1}\} \in \mathbf{R}\times \tilde{V}_N$ satisfying
! $$a(\tilde{\rho}; u_i^{n+1}, v) = ( \lambda_i^{n+1}u_i^{n+1}, v) \quad \forall v \in \tilde{V}_N$$
! where $\tilde{V}_N = \mathrm{span}\{e_1^{n+1/2},\ldots,e_N^{n+1/2},u_1^{n},\ldots,u_N^{n}\}$,
! $\tilde{\rho}(x)$ is the input charge density obtained from the previous orbits.
! 4. Convergence check: if not converged, set $n=n+1$, go to step 2; else, stop.
!
! Detailed information can be found in:
! X. Dai, X. Gong, A. Zhou, J. Zhu,
! A parallel orbital-updating approach for electronic structure calculations, arXiv:1405.0260 (2014).
! X. Dai, Z. Liu, X. Zhang, A. Zhou,
! A Parallel Orbital-updating Based Optimization Method for Electronic Structure Calculations,
! arXiv:1510.07230 (2015).
! Yan Pan, Xiaoying Dai, Xingao Gong, Stefano de Gironcoli, Gian-Marco Rignanese, and Aihui Zhou,
! A Parallel Orbital-updating Based Plane Wave Basis Method. J. Comp. Phys. 348, 482-492 (2017).
!
! The file is written mainly by Stefano de Gironcoli and Yan Pan.
!
! The following file is for solving step 2 of the parallel orbital updating algorithm.
!
! Ivan Carnimeo: GPU version
!
#define ZERO ( 0.D0, 0.D0 )
#define ONE ( 1.D0, 0.D0 )
!
!----------------------------------------------------------------------------
SUBROUTINE bpcg_gamma_gpu( hs_psi_gpu, g_1psi_gpu, psi0_d, spsi0_d, npw, npwx, nbnd, nvec, psi_d, hpsi_d, spsi_d, ethr, e_d, nhpsi )
!----------------------------------------------------------------------------
!
! Block Preconditioned Conjugate Gradient solution of the linear system
!
! [ H - e S ]|\tilde\psi> = Pc [ e S - H ] |psi>
!
! the search targets the space orthogonal to the current best wfcs (psi0);
! the solution is sought until the residual norm is a fixed fraction of the RHS norm;
! in this way, the more accurate the original problem is, the more accurately the correction is computed
!
! in order to avoid unnecessary HSpsi evaluations this version assumes psi, hpsi and spsi are all
! provided as input and returns their updated estimates for further use
!
! Ivan Carnimeo: GPU version
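!
! Schematically, each active vector l goes through a standard PCG step below:
!   gamma = <p|(H - e S)|p> ,   alpha = g0 / gamma
!   psi   = psi + alpha p   ,   z = K (b + e S psi - H psi)   (K = g_1psi preconditioner)
!   beta  = (g1 - g2) / g0      (Polak-Ribiere) ,   p = z + beta p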
!
#if defined (__CUDA)
USE cudafor
#endif
USE util_param, ONLY : DP, stdout
USE mp_bands_util, ONLY : intra_bgrp_comm, gstart
USE mp, ONLY : mp_sum
!
IMPLICIT NONE
!
! The following variables are temporary
COMPLEX(DP), INTENT(IN) :: psi0_d(npwx,nbnd) ! psi0 needed to compute the Pv projection
COMPLEX(DP), INTENT(IN) :: spsi0_d(npwx,nbnd) ! Spsi0 needed to compute the Pv projection
!
INTEGER, INTENT(IN) :: npw, npwx, nbnd, nvec ! input dimensions
REAL(DP), INTENT(IN) :: ethr ! threshold for convergence.
REAL(DP), INTENT(INOUT) :: e_d(nvec) ! current estimate of the target eigenvalues
COMPLEX(DP),INTENT(INOUT) :: psi_d(npwx,nvec),hpsi_d(npwx,nvec),spsi_d(npwx,nvec) !
! input: the current estimate of the wfcs
! output: the estimated correction vectors
INTEGER, INTENT(INOUT) :: nhpsi ! (updated) number of Hpsi evaluations
!
! ... LOCAL variables
!
INTEGER, PARAMETER :: maxter = 5 ! maximum number of CG iterations
!
REAL(DP), ALLOCATABLE :: g0(:), g1(:), g2(:), gamma(:), ethr_cg(:), ff(:), ff0(:)
INTEGER, ALLOCATABLE :: cg_iter(:)
REAL(DP) :: beta, ee
INTEGER :: npw2, npwx2, i, l, block_size, done, nactive, nnew, newdone
!
! ... DEVICE variables
!
EXTERNAL g_1psi_gpu, hs_psi_gpu
REAL(DP), EXTERNAL :: gpu_DDOT
COMPLEX(DP), ALLOCATABLE :: b_d(:,:), & ! RHS for testing
p_d(:,:), hp_d(:,:), sp_d(:,:), z_d(:,:) ! additional working vectors
REAL(DP), ALLOCATABLE :: spsi0vec_d (:,:) ! the product of spsi0 and a group of vectors
COMPLEX(DP) :: tmp, tmp_d
INTEGER :: ii, jj
REAL(DP), ALLOCATABLE :: alpha(:)
#if defined (__CUDA)
attributes(device) :: psi_d, spsi_d, hpsi_d
attributes(device) :: psi0_d, spsi0_d, spsi0vec_d
attributes(device) :: e_d, b_d, z_d, p_d, hp_d, sp_d
attributes(device) :: tmp_d
#endif
!
!
CALL start_clock( 'pcg' ); !write (6,*) ' enter pcg' , e(1:2) ; FLUSH(6)
!
npw2 = 2*npw
npwx2 = 2*npwx
block_size = min(nvec,64)
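! (64 is a heuristic cap: it bounds the device workspace while still batching
!  enough vectors per H*psi application)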
!
ALLOCATE( g0( block_size ), g1( block_size ), g2( block_size ), gamma( block_size ) )
ALLOCATE( ethr_cg( block_size ), ff( block_size ), ff0( block_size ), cg_iter( block_size ) )
ALLOCATE( z_d( npwx, block_size ), b_d( npwx, block_size ) )
ALLOCATE( p_d(npwx,block_size), hp_d(npwx,block_size), sp_d(npwx,block_size) )
ALLOCATE( spsi0vec_d(nbnd, block_size) )
ALLOCATE( alpha( block_size ) )
!
!
done = 0 ! the number of correction vectors already solved
nactive = 0 ! the number of correction vectors currently being updated
cg_iter = 0 ! how many iterations each active vector has completed (<= maxter)
MAIN_LOOP: & ! This is a continuous loop. It terminates only when nactive vanishes
DO
nnew = min(done+block_size,nvec)-(done+nactive) ! number of new corrections to be added to the search
if ( nnew > 0 ) then ! add nnew vectors to the active list
!write(6,*) ' nnew =', nnew
do l=nactive+1,nactive+nnew
i=l+done
!write(6,*) ' l =',l,' i =',i
!write (6,*) ' enter pcg' , e(i) ; FLUSH(6)
!$cuf kernel do(1)
DO ii = 1, npwx
b_d(ii,l) = e_d(i) * spsi_d(ii,i) - hpsi_d(ii,i) ! initial gradient and saved RHS for later
END DO
!$cuf kernel do(1)
DO ii = 1, npwx
z_d(ii,l) = b_d(ii,l)
END DO
call g_1psi_gpu(npwx,npw,z_d(:,l),e_d(i)) ! initial preconditioned gradient
end do
!- project on conduction bands
CALL start_clock( 'pcg:ortho' )
CALL gpu_DGEMM( 'T','N', nbnd,nnew,npw2, 2.D0, spsi0_d, npwx2, z_d(:,nactive+1), npwx2, 0.D0, spsi0vec_d, nbnd )
IF ( gstart == 2 ) CALL gpu_DGER( nbnd, nnew, -1.D0, spsi0_d, npwx2, z_d(:,nactive+1), npwx2, spsi0vec_d, nbnd )
CALL mp_sum( spsi0vec_d, intra_bgrp_comm )
CALL gpu_DGEMM( 'N','N', npw2,nnew,nbnd,-1.D0, psi0_d, npwx2, spsi0vec_d, nbnd, 1.D0, z_d(:,nactive+1), npwx2 )
CALL stop_clock( 'pcg:ortho' )
!-
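! (gamma-point trick: the wavefunctions are real in G-space, so <z|b> is
!  2*Re<z|b> over the stored half sphere, minus the doubly counted G=0 term
!  on the process that holds it, i.e. when gstart == 2; the same convention
!  explains the 2.D0/DGER corrections in the projection above)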
do l=nactive+1,nactive+nnew; i=l+done
g0(l) = 2.D0*gpu_DDOT(npw2,z_d(:,l),1,b_d(:,l),1)
IF (gstart==2) g0(l)=g0(l) - gpu_DDOT(2,z_d(1,l),1,b_d(1,l),1)
end do
CALL mp_sum( g0(nactive+1:nactive+nnew), intra_bgrp_comm ) ! g0 = < initial z | initial gradient b >
do l=nactive+1,nactive+nnew; i=l+done
!write(6,*) ' l =',l,' i =',i
ff(l) = 0.d0 ; ff0(l) = ff(l)
!write (6,*) 0, g0(l), ff(l)
! ethr_cg = ethr ! CG convergence threshold could be set from input but it is not ...
ethr_cg(l) = 1.0D-2 ! it makes more sense to fix the convergence of the CG solution to a
! fixed function of the RHS (see ethr_cg update later).
ethr_cg(l) = max ( 0.01*ethr, ethr_cg(l) * g0(l) ) ! here we set the convergence of the correction
!write(6,*) 'ethr_cg :', ethr_cg(l)
! zero the trial solution
psi_d(:,i) = ZERO
hpsi_d(:,i) = ZERO
spsi_d(:,i) = ZERO
! initial search direction
!$cuf kernel do(1)
DO ii = 1, npwx
p_d(ii,l) = z_d(ii,l)
END DO
cg_iter(l) = 0 ! this is a new correction vector, reset its iteration count
end do
nactive = nactive + nnew
end if
!write(6,*) ' done =',done, ' nactive =',nactive
! iterate: ! DO cg_iter = 1, maxter ! THIS IS THE ENTRY POINT OF THE PCG LOOP
if ( nactive == 0 ) EXIT MAIN_LOOP ! this is the only MAIN_LOOP EXIT condition
cg_iter(1:nactive) = cg_iter(1:nactive) + 1 ! update iteration counters
CALL start_clock( 'pcg:hs_1psi' )
! do l = 1, nactive ! THIS COULD/SHOULD BE A GLOBAL CALL (ONLY WITHIN ONE BGRP THOUGH)
! CALL hs_1psi( npwx, npw, p(:,l), hp(:,l), sp(:,l) ) ! apply H to a single wavefunction (no bgrp parallelization here!)
! end do
CALL hs_psi_gpu( npwx, npw, nactive, p_d, hp_d, sp_d ) ! apply H and S to the nactive search directions (no bgrp parallelization here!)
CALL stop_clock( 'pcg:hs_1psi' )
do l = 1, nactive; i=l+done
gamma(l) = 2.D0*gpu_DDOT(npw2,p_d(:,l),1,hp_d(:,l),1) - e_d(i) * 2.D0*gpu_DDOT(npw2,p_d(:,l),1,sp_d(:,l),1)
IF (gstart==2) gamma(l) = gamma(l) - gpu_DDOT(2,p_d(1,l),1,hp_d(1,l),1) + e_d(i) * gpu_DDOT(2,p_d(1,l),1,sp_d(1,l),1)
end do
CALL mp_sum( gamma(1:nactive), intra_bgrp_comm ) ! gamma = < p | hp - e sp >
do l = 1, nactive; i=l+done
!write(6,*) ' l =',l,' i =',i
alpha(l) = g0(l)/gamma(l)
!write(6,*) 'g0, gamma, alpha :', g0(l), gamma(l), alpha(l)
tmp = alpha(l)
!$cuf kernel do(1)
DO ii = 1, npwx
psi_d(ii,i) = psi_d(ii,i) + tmp * p_d(ii,l) ! updated solution
hpsi_d(ii,i) = hpsi_d(ii,i) + tmp * hp_d(ii,l) ! updated solution
spsi_d(ii,i) = spsi_d(ii,i) + tmp * sp_d(ii,l) ! updated solution
END DO
g2(l) = 2.D0 * ( gpu_DDOT(npw2,z_d(:,l),1,b_d(:,l),1) + &
e_d(i) * gpu_DDOT(npw2,z_d(:,l),1,spsi_d(:,i),1) - &
gpu_DDOT(npw2,z_d(:,l),1,hpsi_d(:,i),1) )
IF (gstart==2) g2(l) = g2(l) - gpu_DDOT(2,z_d(1,l),1,b_d(1,l),1) - &
e_d(i)*gpu_DDOT(2,z_d(1,l),1,spsi_d(1,i),1) + &
gpu_DDOT(2,z_d(1,l),1,hpsi_d(1,i),1)
end do
CALL mp_sum( g2(1:nactive), intra_bgrp_comm ) ! g2 = < old z | new gradient b + e spsi - hpsi >
do l = 1, nactive; i=l+done ! update the preconditioned gradient
!$cuf kernel do(1)
DO ii = 1, npwx
z_d(ii,l) = b_d(ii,l) + e_d(i) * spsi_d(ii,i) - hpsi_d(ii,i)
END DO
call g_1psi_gpu(npwx,npw,z_d(:,l),e_d(i))
end do
!- project on conduction bands
CALL start_clock( 'pcg:ortho' )
CALL gpu_DGEMM( 'T','N', nbnd,nactive,npw2, 2.D0, spsi0_d, npwx2, z_d, npwx2, 0.D0, spsi0vec_d, nbnd )
IF ( gstart == 2 ) CALL gpu_DGER( nbnd, nactive, -1.D0, spsi0_d, npwx2, z_d, npwx2, spsi0vec_d, nbnd )
CALL mp_sum( spsi0vec_d, intra_bgrp_comm )
CALL gpu_DGEMM( 'N','N', npw2,nactive,nbnd,-1.D0, psi0_d, npwx2, spsi0vec_d, nbnd, 1.D0, z_d, npwx2 )
CALL stop_clock( 'pcg:ortho' )
!-
do l = 1, nactive; i=l+done
g1(l) = 2.D0 * ( gpu_DDOT(npw2,z_d(:,l),1,b_d(:,l),1) + &
e_d(i) * gpu_DDOT(npw2,z_d(:,l),1,spsi_d(:,i),1) - &
gpu_DDOT(npw2,z_d(:,l),1,hpsi_d(:,i),1) )
IF (gstart==2) THEN
!$cuf kernel do(1)
do ii = 1, 1
tmp_d = b_d(1,l) + e_d(i)*spsi_d(1,i) - hpsi_d(1,i)
end do
g1(l) = g1(l) - gpu_DDOT(2,z_d(1,l),1,tmp_d,1)
END IF
end do
CALL mp_sum( g1(1:nactive), intra_bgrp_comm ) ! g1 = < new z | new gradient b + e spsi - hpsi >
do l = 1, nactive; i = l + done ! evaluate the function ff
ff(l) = - ( e_d(i)*gpu_DDOT(npw2,psi_d(:,i),1,spsi_d(:,i),1) - gpu_DDOT(npw2,psi_d(:,i),1,hpsi_d(:,i),1) ) &
- 2.D0 * gpu_DDOT(npw2,psi_d(:,i),1,b_d(:,l),1)
if (gstart==2) THEN
!$cuf kernel do(1)
do ii = 1, 1
tmp_d = e_d(i)*spsi_d(1,i) - hpsi_d(1,i) + 2.D0 * b_d(1,l)
end do
ff(l) = ff(l) + 0.5D0 * gpu_DDOT(2,psi_d(1,i),1,tmp_d,1)
END IF
end do
CALL mp_sum( ff(1:nactive), intra_bgrp_comm ) ! function minimum -0.5 < psi | e spsi - hpsi > - < psi | b >
newdone = 0 ! number of correction vectors that converge (or are done) at this iteration
do l = 1, nactive; i = l + done
!write (6,*) cg_iter(l), g1(l), ff(l), gamma(l)
IF ( ff(l) > ff0(l) .AND. ff0(l) < 0.d0 ) THEN
tmp = alpha(l)
!$cuf kernel do(1)
DO ii = 1, npwx
psi_d(ii,i) = psi_d(ii,i) - tmp * p_d(ii,l) ! fallback solution: if last iter failed to improve ff0
hpsi_d(ii,i) = hpsi_d(ii,i) - tmp * hp_d(ii,l)! exit without updating and ...
spsi_d(ii,i) = spsi_d(ii,i) - tmp * sp_d(ii,l)! hope next time it will be better
END DO
END IF
!write(6,*) 'g0, g1, g2 :', g0(l), g1(l), g2(l)
!write(6,*) 'ff0, ff : ', ff0(l), ff(l)
IF ( ABS ( g1(l) ) < ethr_cg(l) .OR. ( ff(l) > ff0(l) ) .OR. cg_iter(l) == maxter) THEN ! EXIT iterate
!write (6,*) ' exit pcg loop'
!write(6,*) ' l =',l,' i =',i
!if ( cg_iter(l) == maxter.and. ABS(g1(l)) > ethr_cg(l)) write (6,*) 'CG not converged maxter exceeded', cg_iter(l), g1(l), g0(l), ethr_cg(l)
!IF ( ABS ( g1(l) ) < ethr_cg(l)) write (6,*) 'CG correction converged ', cg_iter(l), g1(l), ethr_cg(l)
!IF ( ABS ( g1(l) ) > g0(l) ) write (6,*) 'CG not converged ', cg_iter(l), g1(l), g0(l), ethr_cg(l)
nhpsi = nhpsi + cg_iter(l) ! update nhpsi count
IF (.NOT. (ABS(g1(l))< ethr_cg(l) .OR. (ff(l)>ff0(l)) ) .AND. cg_iter(l)==maxter) nhpsi = nhpsi + 1 ! because this would be the count
newdone = newdone + 1 ! one more solution found (or no more active anyway)
!write(6,*) ' newdone = ', newdone
CALL start_clock( 'pcg:move' )
!write(6,*) ' swapping converged psi/hpsi/spsi i = ',i, " with i' = ",done+newdone
! swap the terminated vector with the first in the list of the active ones
!$cuf kernel do(1)
DO ii = 1, npwx
p_d (ii,l) = psi_d (ii,done+newdone)
psi_d (ii,done+newdone) = psi_d (ii,i)
psi_d (ii,i) = p_d (ii,l)
hp_d(ii,l) = hpsi_d(ii,done+newdone)
hpsi_d(ii,done+newdone) = hpsi_d(ii,i)
hpsi_d(ii,i) = hp_d(ii,l)
sp_d(ii,l) = spsi_d(ii,done+newdone)
spsi_d(ii,done+newdone) = spsi_d(ii,i)
spsi_d(ii,i) = sp_d(ii,l)
END DO
ee = e_d(done+newdone)
!$cuf kernel do(1)
DO ii = 1, 1
e_d(done+newdone) = e_d(i)
END DO
e_d(i) = ee
!write(6,*) ' overwrite converged p/hp/etc l = ',l, ' with newdone = ',newdone
! move information of the swapped active vector in the right place to keep going
!$cuf kernel do(1)
DO ii = 1, npwx
p_d(ii,l) = p_d(ii,newdone)
hp_d(ii,l) = hp_d(ii,newdone)
sp_d(ii,l) = sp_d(ii,newdone)
b_d(ii,l) = b_d(ii,newdone)
z_d(ii,l) = z_d(ii,newdone)
END DO
ff0(l) = ff0(newdone) ; ff(l) = ff(newdone)
alpha(l) = alpha(newdone)
g0(l) = g0(newdone) ; g1(l) = g1(newdone) ; g2(l) = g2(newdone)
cg_iter(l) = cg_iter(newdone) ; ethr_cg(l) = ethr_cg(newdone)
CALL stop_clock( 'pcg:move' )
ELSE
!write(6,*) ' l =',l,' i =',i
beta = (g1(l)-g2(l))/g0(l) ! Polak - Ribiere style update
g0(l) = g1(l) ! < new z | new gradient > -> < old z | old gradient >
!$cuf kernel do(1)
DO ii = 1, npwx
p_d(ii,l) = z_d(ii,l) + beta * p_d(ii,l) ! updated search direction
END DO
!write(6,*) 'beta :', beta
ff0(l) = ff(l) ! updated minimum value reached by the function
END IF
end do
IF ( newdone > 0 ) THEN
done = done + newdone
nactive = nactive - newdone
!write(6,*) ' there have been ', newdone, ' new converged solution'
!write(6,*) ' done = ', done, ' nactive =', nactive
CALL start_clock( 'pcg:move' )
do l=1, nactive
!write(6,*) ' l+newdone =',l+newdone,' -> l =',l
!$cuf kernel do(1)
DO ii = 1, npwx
p_d (ii,l) = p_d (ii,l+newdone)
hp_d(ii,l) = hp_d(ii,l+newdone)
sp_d(ii,l) = sp_d(ii,l+newdone)
b_d(ii,l) = b_d(ii,l+newdone)
z_d(ii,l) = z_d(ii,l+newdone)
END DO
ff0(l) = ff0(l+newdone) ; ff(l) = ff(l+newdone)
g0(l) = g0(l+newdone) ; g1(l) = g1(l+newdone) ; g2(l) = g2(l+newdone)
cg_iter(l) = cg_iter(l+newdone) ; ethr_cg(l) = ethr_cg(l+newdone)
end do
CALL stop_clock( 'pcg:move' )
END IF
! END DO iterate Here is where the pcg loop would terminate
END DO MAIN_LOOP
!write (6,*) ' exit pcg loop'
DEALLOCATE( b_d, z_d, p_d, hp_d, sp_d )
DEALLOCATE( ethr_cg, ff, ff0, cg_iter )
DEALLOCATE( g0, g1, g2, gamma )
DEALLOCATE( spsi0vec_d )
DEALLOCATE( alpha )
!
CALL stop_clock( 'pcg' )
!
RETURN
!
END SUBROUTINE bpcg_gamma_gpu
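! For reference, paro_gamma_new_gpu (later in this commit) invokes this routine as
!   CALL bpcg_gamma_gpu( hs_psi_gpu, g_1psi_gpu, psi_d, spsi_d, npw, npwx, nbnd, how_many, &
!                        psi_d(:,nbase+1), hpsi_d(:,nbase+1), spsi_d(:,nbase+1), ethr, ew_d(1), nhpsi )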

View File

@ -0,0 +1,389 @@
! Copyright (C) 2015-2016 Aihui Zhou's group
!
! This file is distributed under the terms of the
! GNU General Public License. See the file `License'
! in the root directory of the present distribution,
! or http://www.gnu.org/copyleft/gpl.txt .
!
!-------------------------------------------------------------------------------
!
! We propose some parallel orbital-updating based plane-wave basis methods
! for electronic structure calculations, which aim at the solution of the corresponding eigenvalue
! problems. Compared to traditional plane-wave methods, our methods feature two-level
! parallelization, which gives them a great advantage in large-scale parallel calculations.
!
! The following is the parallel orbital-updating algorithm:
! 1. Choose initial $E_{\mathrm{cut}}^{(0)}$ and then obtain $V_{N_G^{0}}$, use the SCF method to solve
! the Kohn-Sham equation in $V_{G_0}$ and get the initial $(\lambda_i^{0},u_i^{0}), i=1, \cdots, N$
! and let $n=0$.
! 2. For $i=1,2,\ldots,N$, find $e_i^{n+1/2}\in V_{G_n}$ satisfying
! $$a(\rho_{in}^{n}; e_i^{n+1/2}, v) = -[(a(\rho_{in}^{n}; u_i^{n}, v) - \lambda_i^{n} (u_i^{n}, v))] $$
! in parallel, where $\rho_{in}^{n}$ is the input charge density obtained from the orbitals of the
! $n$-th iteration or of previous iterations.
! 3. Find $\{\lambda_i^{n+1},u_i^{n+1}\} \in \mathbf{R}\times \tilde{V}_N$ satisfying
! $$a(\tilde{\rho}; u_i^{n+1}, v) = ( \lambda_i^{n+1}u_i^{n+1}, v) \quad \forall v \in \tilde{V}_N$$
! where $\tilde{V}_N = \mathrm{span}\{e_1^{n+1/2},\ldots,e_N^{n+1/2},u_1^{n},\ldots,u_N^{n}\}$,
! $\tilde{\rho}(x)$ is the input charge density obtained from the previous orbits.
! 4. Convergence check: if not converged, set $n=n+1$, go to step 2; else, stop.
!
! Detailed information can be found in:
! X. Dai, X. Gong, A. Zhou, J. Zhu,
! A parallel orbital-updating approach for electronic structure calculations, arXiv:1405.0260 (2014).
! X. Dai, Z. Liu, X. Zhang, A. Zhou,
! A Parallel Orbital-updating Based Optimization Method for Electronic Structure Calculations,
! arXiv:1510.07230 (2015).
! Yan Pan, Xiaoying Dai, Xingao Gong, Stefano de Gironcoli, Gian-Marco Rignanese, and Aihui Zhou,
! A Parallel Orbital-updating Based Plane Wave Basis Method. J. Comp. Phys. 348, 482-492 (2017).
!
! The file is written mainly by Stefano de Gironcoli and Yan Pan.
! * GPU version Ivan Carnimeo
!
! The following file is for solving step 2 of the parallel orbital updating algorithm.
!
#define ZERO ( 0.D0, 0.D0 )
#define ONE ( 1.D0, 0.D0 )
!
!----------------------------------------------------------------------------
SUBROUTINE bpcg_k_gpu( hs_psi_gpu, g_1psi_gpu, psi0_d, spsi0_d, npw, npwx, nbnd, npol, nvec, &
psi_d, hpsi_d, spsi_d, ethr, e_d, nhpsi )
!----------------------------------------------------------------------------
!
! Block Preconditioned Conjugate Gradient solution of the linear system
!
! [ H - e S ]|\tilde\psi> = Pc [ e S - H ] |psi>
!
! the search targets the space orthogonal to the current best wfcs (psi0);
! the solution is sought until the residual norm is a fixed fraction of the RHS norm;
! in this way, the more accurate the original problem is, the more accurately the correction is computed
!
! in order to avoid unnecessary HSpsi evaluations this version assumes psi, hpsi and spsi are all
! provided as input and returns their updated estimates for further use
!
#if defined (__CUDA)
USE cudafor
#endif
USE util_param, ONLY : DP, stdout
USE mp_bands_util, ONLY : intra_bgrp_comm
USE mp, ONLY : mp_sum
!
IMPLICIT NONE
!
! The following variables are temporary
COMPLEX(DP),INTENT(IN) :: psi0_d(npwx*npol,nbnd) ! psi0 needed to compute the Pv projection
COMPLEX(DP),INTENT(IN) :: spsi0_d(npwx*npol,nbnd) ! Spsi0 needed to compute the Pv projection
INTEGER, INTENT(IN) :: npw, npwx, nbnd, npol, nvec ! input dimensions
REAL(DP), INTENT(IN) :: ethr ! threshold for convergence.
REAL(DP), INTENT(INOUT) :: e_d(nvec) ! current estimate of the target eigenvalues
COMPLEX(DP),INTENT(INOUT) :: psi_d(npwx*npol,nvec),hpsi_d(npwx*npol,nvec),spsi_d(npwx*npol,nvec) !
! input: the current estimate of the wfcs
! output: the estimated correction vectors
INTEGER, INTENT(INOUT) :: nhpsi ! (updated) number of Hpsi evaluations
!
! ... LOCAL variables
!
INTEGER, PARAMETER :: maxter = 5 ! maximum number of CG iterations
!
COMPLEX(DP), ALLOCATABLE :: b_d(:,:), & ! RHS for testing
p_d(:,:), hp_d(:,:), sp_d(:,:), z_d(:,:) ! additional working vectors
COMPLEX(DP), ALLOCATABLE :: spsi0vec_d (:,:) ! the product of spsi0 and a group of vectors
REAL(DP), ALLOCATABLE :: g0(:), g1(:), g2(:), gamma(:), ethr_cg(:), ff(:), ff0(:)
REAL(DP), ALLOCATABLE :: alpha(:)
INTEGER, ALLOCATABLE :: cg_iter(:)
REAL(DP) :: beta, ee
INTEGER :: kdim, kdmx, i, l, block_size, done, nactive, nnew, newdone
!
REAL(DP), EXTERNAL :: gpu_DDOT
EXTERNAL g_1psi_gpu, hs_psi_gpu
! hs_1psi( npwx, npw, psi, hpsi, spsi )
! hs_psi( npwx, npw, nvec, psi, hpsi, spsi )
!
!
INTEGER :: ii, jj ! cuf kernel indices
REAL(DP) :: tmp
#if defined (__CUDA)
attributes(device) :: psi_d, hpsi_d, spsi_d, psi0_d, spsi0_d, spsi0vec_d
attributes(device) :: e_d
attributes(device) :: b_d, p_d, hp_d, sp_d, z_d
#endif
!
CALL start_clock( 'pcg' ); !write (6,*) ' enter pcg' , e(1:2), 'npol = ', npol ; FLUSH(6)
!
kdim = npwx*(npol-1) + npw
kdmx = npwx* npol
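! kdim: number of meaningful spinor components (both spin components packed
! with leading dimension npwx); kdmx: allocated leading dimension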
block_size = min(nvec,64)
!
ALLOCATE( g0( block_size ), g1( block_size ), g2( block_size ), gamma( block_size ) )
ALLOCATE( ethr_cg( block_size ), ff( block_size ), ff0( block_size ), cg_iter( block_size ) )
ALLOCATE( z_d( kdmx, block_size ), b_d( kdmx, block_size ) )
ALLOCATE( p_d(kdmx,block_size), hp_d(kdmx,block_size), sp_d(kdmx,block_size) )
ALLOCATE( spsi0vec_d(nbnd, block_size) )
ALLOCATE( alpha( block_size ) )
!
done = 0 ! the number of correction vectors already solved
nactive = 0 ! the number of correction vectors currently being updated
cg_iter = 0 ! how many iterations each active vector has completed (<= maxter)
MAIN_LOOP: & ! This is a continuous loop. It terminates only when nactive vanishes
DO
nnew = min(done+block_size,nvec)-(done+nactive) ! number of new corrections to be added to the search
if ( nnew > 0 ) then ! add nnew vectors to the active list
!write(6,*) ' nnew =', nnew
do l=nactive+1,nactive+nnew; i=l+done
!write(6,*) ' l =',l,' i =',i
!write (6,*) ' enter pcg' , e(i), 'npol = ', npol ; FLUSH(6)
!$cuf kernel do(1)
DO ii = 1, kdmx
b_d(ii,l) = e_d(i) * spsi_d(ii,i) - hpsi_d(ii,i) ! initial gradient and saved RHS for later
END DO
!$cuf kernel do(1)
DO ii = 1, kdmx
z_d(ii,l) = b_d(ii,l)
END DO
call g_1psi_gpu(npwx,npw,z_d(:,l),e_d(i)) ! initial preconditioned gradient
end do
!- project on conduction bands
CALL start_clock( 'pcg:ortho' )
CALL gpu_ZGEMM( 'C', 'N', nbnd, nnew, kdim, ONE, spsi0_d, kdmx, z_d(:,nactive+1), kdmx, ZERO, &
spsi0vec_d, nbnd )
CALL mp_sum( spsi0vec_d, intra_bgrp_comm )
CALL gpu_ZGEMM( 'N', 'N', kdim, nnew, nbnd, (-1.D0,0.D0), psi0_d, kdmx, spsi0vec_d, nbnd, ONE, &
z_d(:,nactive+1), kdmx )
CALL stop_clock( 'pcg:ortho' )
!-
do l=nactive+1,nactive+nnew; i=l+done
g0(l) = gpu_DDOT( 2*kdim, z_d(:,l), 1, b_d(:,l), 1 )
end do
CALL mp_sum( g0(nactive+1:nactive+nnew), intra_bgrp_comm ) ! g0 = < initial z | initial gradient b >
do l=nactive+1,nactive+nnew; i=l+done
!write(6,*) ' l =',l,' i =',i
ff(l) = 0.d0 ; ff0(l) = ff(l)
!write (6,*) 0, g0(l), ff(l)
! ethr_cg = ethr ! CG convergence threshold could be set from input but it is not ...
ethr_cg(l) = 1.0D-2 ! it makes more sense to fix the convergence of the CG solution to a
! fixed function of the RHS (see ethr_cg update later).
ethr_cg(l) = max ( 0.01*ethr, ethr_cg(l) * g0(l) ) ! here we set the convergence of the correction
!write(6,*) 'ethr_cg :', ethr_cg(l)
! zero the trial solution
!$cuf kernel do(1)
DO ii = 1, kdmx
psi_d(ii,i) = ZERO
hpsi_d(ii,i) = ZERO
spsi_d(ii,i) = ZERO
END DO
! initial search direction
!$cuf kernel do(1)
DO ii = 1, kdmx
p_d(ii,l) = z_d(ii,l)
END DO
cg_iter(l) = 0 ! this is a new correction vector, reset its iteration count
end do
nactive = nactive + nnew
end if
!write(6,*) ' done =',done, ' nactive =',nactive
! iterate: ! DO cg_iter = 1, maxter ! THIS IS THE ENTRY POINT OF THE PCG LOOP
if ( nactive == 0 ) EXIT MAIN_LOOP ! this is the only MAIN_LOOP EXIT condition
cg_iter(1:nactive) = cg_iter(1:nactive) + 1 ! update iteration counters
CALL start_clock( 'pcg:hs_1psi' )
! do l = 1, nactive ! THIS COULD/SHOULD BE A GLOBAL CALL (ONLY WITHIN ONE BGRP THOUGH)
! CALL hs_1psi( npwx, npw, p(:,l), hp(:,l), sp(:,l) ) ! apply H to a single wavefunction (no bgrp parallelization here!)
! end do
CALL hs_psi_gpu( npwx, npw, nactive, p_d, hp_d, sp_d ) ! apply H and S to the nactive search directions (no bgrp parallelization here!)
CALL stop_clock( 'pcg:hs_1psi' )
do l = 1, nactive; i=l+done
gamma(l) = gpu_DDOT( 2*kdim, p_d(:,l), 1, hp_d(:,l), 1 )
gamma(l) = gamma(l) - e_d(i) * gpu_DDOT( 2*kdim, p_d(:,l), 1, sp_d(:,l), 1 )
end do
CALL mp_sum( gamma(1:nactive), intra_bgrp_comm ) ! gamma = < p | hp - e sp >
do l = 1, nactive; i=l+done
!write(6,*) ' l =',l,' i =',i
alpha(l) = g0(l)/gamma(l)
!write(6,*) 'g0, gamma, alpha :', g0(l), gamma(l), alpha(l)
tmp = alpha(l)
!$cuf kernel do(1)
DO ii = 1, kdmx
psi_d(ii,i) = psi_d(ii,i) + tmp * p_d(ii,l) ! updated solution
hpsi_d(ii,i) = hpsi_d(ii,i) + tmp * hp_d(ii,l) ! updated solution
spsi_d(ii,i) = spsi_d(ii,i) + tmp * sp_d(ii,l) ! updated solution
END DO
g2(l) = gpu_DDOT(2*kdim,z_d(:,l),1,b_d(:,l),1) &
+ e_d(i) * gpu_DDOT(2*kdim,z_d(:,l),1,spsi_d(:,i),1) &
- gpu_DDOT(2*kdim,z_d(:,l),1,hpsi_d(:,i),1)
end do
CALL mp_sum( g2(1:nactive), intra_bgrp_comm ) ! g2 = < old z | new gradient b + e spsi - hpsi >
do l = 1, nactive; i=l+done ! update the preconditioned gradient
!$cuf kernel do(1)
DO ii = 1, kdmx
z_d(ii,l) = b_d(ii,l) + e_d(i) * spsi_d(ii,i) - hpsi_d(ii,i)
END DO
call g_1psi_gpu(npwx,npw,z_d(:,l),e_d(i))
end do
!- project on conduction bands
CALL start_clock( 'pcg:ortho' )
CALL gpu_ZGEMM( 'C', 'N', nbnd, nactive, kdim, ONE, spsi0_d, kdmx, z_d, kdmx, ZERO, spsi0vec_d, nbnd )
CALL mp_sum( spsi0vec_d, intra_bgrp_comm )
CALL gpu_ZGEMM( 'N', 'N', kdim, nactive, nbnd, (-1.D0,0.D0), psi0_d, kdmx, spsi0vec_d, nbnd, ONE, z_d, kdmx )
CALL stop_clock( 'pcg:ortho' )
!-
do l = 1, nactive; i=l+done
g1(l) = gpu_DDOT(2*kdim,z_d(:,l),1,b_d(:,l),1) &
+ e_d(i) * gpu_DDOT(2*kdim,z_d(:,l),1,spsi_d(:,i),1) &
- gpu_DDOT(2*kdim,z_d(:,l),1,hpsi_d(:,i),1)
end do
CALL mp_sum( g1(1:nactive), intra_bgrp_comm ) ! g1 = < new z | new gradient b + e spsi - hpsi >
do l = 1, nactive; i = l + done ! evaluate the function ff
ff(l) = -0.5_DP * ( e_d(i)*gpu_DDOT(2*kdim,psi_d(:,i),1,spsi_d(:,i),1)&
-gpu_DDOT(2*kdim,psi_d(:,i),1,hpsi_d(:,i),1) )
ff(l) = ff(l) - gpu_DDOT(2*kdim,psi_d(:,i),1,b_d(:,l),1)
end do
CALL mp_sum( ff(1:nactive), intra_bgrp_comm ) ! function minimum -0.5 < psi | e spsi - hpsi > - < psi | b >
newdone = 0 ! number of correction vectors that converge (or are done) at this iteration
do l = 1, nactive; i = l + done
!write (6,*) cg_iter(l), g1(l), ff(l), gamma(l)
IF ( ff(l) > ff0(l) .AND. ff0(l) < 0.d0 ) THEN
tmp = alpha(l)
!$cuf kernel do(1)
DO ii = 1, kdmx
psi_d(ii,i) = psi_d(ii,i) - tmp * p_d(ii,l) ! fallback solution: if last iter failed to improve ff0
hpsi_d(ii,i) = hpsi_d(ii,i) - tmp * hp_d(ii,l)! exit without updating and ...
spsi_d(ii,i) = spsi_d(ii,i) - tmp * sp_d(ii,l)! hope next time it will be better
END DO
END IF
!write(6,*) 'g0, g1, g2 :', g0(l), g1(l), g2(l)
!write(6,*) 'ff0, ff : ', ff0(l), ff(l)
IF ( ABS ( g1(l) ) < ethr_cg(l) .OR. ( ff(l) > ff0(l) ) .OR. cg_iter(l) == maxter) THEN ! EXIT iterate
!write (6,*) ' exit pcg loop'
!write(6,*) ' l =',l,' i =',i
!if ( cg_iter(l) == maxter.and. ABS(g1(l)) > ethr_cg(l)) write (6,*) 'CG not converged maxter exceeded', cg_iter(l), g1(l), g0(l), ethr_cg(l)
!IF ( ABS ( g1(l) ) < ethr_cg(l)) write (6,*) 'CG correction converged ', cg_iter(l), g1(l), ethr_cg(l)
!IF ( ABS ( g1(l) ) > g0(l) ) write (6,*) 'CG not converged ', cg_iter(l), g1(l), g0(l), ethr_cg(l)
nhpsi = nhpsi + cg_iter(l) ! update nhpsi count
IF (.NOT. (ABS(g1(l))< ethr_cg(l) .OR. (ff(l)>ff0(l)) ) .AND. cg_iter(l)==maxter) nhpsi = nhpsi + 1 ! because this would be the count
newdone = newdone + 1 ! one more solution found (or no more active anyway)
!write(6,*) ' newdone = ', newdone
CALL start_clock( 'pcg:move' )
!write(6,*) ' swapping converged psi/hpsi/spsi i = ',i, " with i' = ",done+newdone
! swap the terminated vector with the first in the list of the active ones
!$cuf kernel do(1)
DO ii = 1, kdmx
p_d (ii,l) = psi_d (ii,done+newdone)
psi_d (ii,done+newdone) = psi_d (ii,i)
psi_d (ii,i) = p_d (ii,l)
hp_d(ii,l) = hpsi_d(ii,done+newdone)
hpsi_d(ii,done+newdone) = hpsi_d(ii,i)
hpsi_d(ii,i) = hp_d(ii,l)
sp_d(ii,l) = spsi_d(ii,done+newdone)
spsi_d(ii,done+newdone) = spsi_d(ii,i)
spsi_d(ii,i) = sp_d(ii,l)
END DO
ee = e_d(done+newdone)
!$cuf kernel do(1)
DO ii = 1, 1
e_d(done+newdone) = e_d(i)
END DO
e_d(i) = ee
!write(6,*) ' overwrite converged p/hp/etc l = ',l, ' with newdone = ',newdone
! move information of the swapped active vector in the right place to keep going
!$cuf kernel do(1)
DO ii = 1, kdmx
p_d(ii,l) = p_d(ii,newdone)
hp_d(ii,l) = hp_d(ii,newdone)
sp_d(ii,l) = sp_d(ii,newdone)
b_d(ii,l) = b_d(ii,newdone)
z_d(ii,l) = z_d(ii,newdone)
END DO
alpha(l) = alpha(newdone)
ff0(l) = ff0(newdone)
ff(l) = ff(newdone)
g0(l) = g0(newdone)
g1(l) = g1(newdone)
g2(l) = g2(newdone)
cg_iter(l) = cg_iter(newdone)
ethr_cg(l) = ethr_cg(newdone)
CALL stop_clock( 'pcg:move' )
ELSE
!write(6,*) ' l =',l,' i =',i
beta = (g1(l)-g2(l))/g0(l) ! Polak - Ribiere style update
g0(l) = g1(l) ! < new z | new gradient > -> < old z | old gradient >
!$cuf kernel do(1)
DO ii = 1, kdmx
p_d(ii,l) = z_d(ii,l) + beta * p_d(ii,l) ! updated search direction
END DO
!write(6,*) 'beta :', beta
ff0(l) = ff(l) ! updated minimum value reached by the function
END IF
end do
IF ( newdone > 0 ) THEN
done = done + newdone
nactive = nactive - newdone
!write(6,*) ' there have been ', newdone, ' new converged solution'
!write(6,*) ' done = ', done, ' nactive =', nactive
CALL start_clock( 'pcg:move' )
do l=1, nactive
!write(6,*) ' l+newdone =',l+newdone,' -> l =',l
!$cuf kernel do(1)
DO ii = 1, kdmx
p_d (ii,l) = p_d (ii,l+newdone)
hp_d(ii,l) = hp_d(ii,l+newdone)
sp_d(ii,l) = sp_d(ii,l+newdone)
b_d(ii,l) = b_d(ii,l+newdone)
z_d(ii,l) = z_d(ii,l+newdone)
END DO
ff0(l) = ff0(l+newdone) ; ff(l) = ff(l+newdone)
g0(l) = g0(l+newdone) ; g1(l) = g1(l+newdone) ; g2(l) = g2(l+newdone)
cg_iter(l) = cg_iter(l+newdone) ; ethr_cg(l) = ethr_cg(l+newdone)
end do
CALL stop_clock( 'pcg:move' )
END IF
! END DO iterate Here is where the pcg loop would terminate
END DO MAIN_LOOP
!write (6,*) ' exit pcg loop'
DEALLOCATE( spsi0vec_d )
DEALLOCATE( b_d, p_d, hp_d, sp_d, z_d )
DEALLOCATE( ethr_cg, ff, ff0, cg_iter )
DEALLOCATE( g0, g1, g2, gamma )
DEALLOCATE( alpha )
!
CALL stop_clock( 'pcg' )
!
RETURN
!
END SUBROUTINE bpcg_k_gpu

KS_Solvers/ParO/make.gpu (new file, 13 lines)
View File

@ -0,0 +1,13 @@
PARO += \
paro_gamma_new_gpu.o \
paro_k_new_gpu.o \
bpcg_gamma_gpu.o \
bpcg_k_gpu.o
bpcg_gamma_gpu.o : ../../UtilXlib/mp.o
bpcg_gamma_gpu.o : ../../UtilXlib/mp_bands_util.o
bpcg_gamma_gpu.o : ../../UtilXlib/util_param.o
paro_gamma_new_gpu.o : ../../UtilXlib/mp.o
paro_gamma_new_gpu.o : ../../UtilXlib/mp_bands_util.o
paro_gamma_new_gpu.o : ../../UtilXlib/util_param.o

View File

@ -0,0 +1,327 @@
!
! Copyright (C) 2015-2016 Aihui Zhou's group
!
! This file is distributed under the terms of the
! GNU General Public License. See the file `License'
! in the root directory of the present distribution,
! or http://www.gnu.org/copyleft/gpl.txt .
!
!-------------------------------------------------------------------------------
!
! We propose some parallel orbital-updating based plane-wave basis methods
! for electronic structure calculations, which aim at the solution of the corresponding eigenvalue
! problems. Compared to traditional plane-wave methods, our methods feature two-level
! parallelization, which gives them a great advantage in large-scale parallel calculations.
!
! The following is the parallel orbital-updating algorithm:
! 1. Choose initial $E_{\mathrm{cut}}^{(0)}$ and then obtain $V_{N_G^{0}}$, use the SCF method to solve
! the Kohn-Sham equation in $V_{G_0}$ and get the initial $(\lambda_i^{0},u_i^{0}), i=1, \cdots, N$
! and let $n=0$.
! 2. For $i=1,2,\ldots,N$, find $e_i^{n+1/2}\in V_{G_n}$ satisfying
! $$a(\rho_{in}^{n}; e_i^{n+1/2}, v) = -[(a(\rho_{in}^{n}; u_i^{n}, v) - \lambda_i^{n} (u_i^{n}, v))] $$
! in parallel, where $\rho_{in}^{n}$ is the input charge density obtained from the orbitals of the
! $n$-th iteration or of previous iterations.
! 3. Find $\{\lambda_i^{n+1},u_i^{n+1}\} \in \mathbf{R}\times \tilde{V}_N$ satisfying
! $$a(\tilde{\rho}; u_i^{n+1}, v) = ( \lambda_i^{n+1}u_i^{n+1}, v) \quad \forall v \in \tilde{V}_N$$
! where $\tilde{V}_N = \mathrm{span}\{e_1^{n+1/2},\ldots,e_N^{n+1/2},u_1^{n},\ldots,u_N^{n}\}$,
! $\tilde{\rho}(x)$ is the input charge density obtained from the previous orbits.
! 4. Convergence check: if not converged, set $n=n+1$, go to step 2; else, stop.
!
! Detailed information can be found in:
! X. Dai, X. Gong, A. Zhou, J. Zhu,
! A parallel orbital-updating approach for electronic structure calculations, arXiv:1405.0260 (2014).
! X. Dai, Z. Liu, X. Zhang, A. Zhou,
! A Parallel Orbital-updating Based Optimization Method for Electronic Structure Calculations,
! arXiv:1510.07230 (2015).
! Yan Pan, Xiaoying Dai, Xingao Gong, Stefano de Gironcoli, Gian-Marco Rignanese, and Aihui Zhou,
! A Parallel Orbital-updating Based Plane Wave Basis Method. J. Comp. Phys. 348, 482-492 (2017).
!
! The file is written mainly by Stefano de Gironcoli and Yan Pan.
!
! GPU version by Ivan Carnimeo
!
!-------------------------------------------------------------------------------
SUBROUTINE paro_gamma_new_gpu( h_psi_gpu, s_psi_gpu, hs_psi_gpu, g_1psi_gpu, overlap, &
npwx, npw, nbnd, evc_d, eig_d, btype, ethr, notconv, nhpsi )
!-------------------------------------------------------------------------------
!paro_flag = 1: modified parallel orbital-updating method
!
#if defined (__CUDA)
USE cudafor
#endif
! global variables
USE util_param, ONLY : DP, stdout
USE mp_bands_util, ONLY : inter_bgrp_comm, nbgrp, my_bgrp_id
USE mp, ONLY : mp_sum, mp_allgather, mp_barrier, &
mp_type_create_column_section, mp_type_free
IMPLICIT NONE
!
INCLUDE 'laxlib.fh'
! I/O variables
LOGICAL, INTENT(IN) :: overlap
INTEGER, INTENT(IN) :: npw, npwx, nbnd
COMPLEX(DP), INTENT(INOUT) :: evc_d(npwx,nbnd)
REAL(DP), INTENT(IN) :: ethr
REAL(DP), INTENT(INOUT) :: eig_d(nbnd)
INTEGER, INTENT(IN) :: btype(nbnd)
INTEGER, INTENT(OUT) :: notconv, nhpsi
! INTEGER, INTENT(IN) :: paro_flag
! local variables (used in the call to cegterg )
!------------------------------------------------------------------------
EXTERNAL h_psi_gpu, s_psi_gpu, hs_psi_gpu, g_1psi_gpu
! subroutine h_psi (npwx,npw,nvec,evc,hpsi) computes H*evc using band parallelization
! subroutine s_psi (npwx,npw,nvec,evc,spsi) computes S*evc using band parallelization
! subroutine hs_1psi(npwx,npw,evc,hpsi,spsi) computes H*evc and S*evc for a single band
! subroutine g_1psi (npwx,npw,psi,eig) computes g*psi for a single band
!
! ... local variables
!
INTEGER :: itry, paro_ntr, nconv, nextra, nactive, nbase, ntrust, ndiag, nvecx, nproc_ortho
LOGICAL, ALLOCATABLE :: conv(:)
REAL(DP), PARAMETER :: extra_factor = 0.5 ! workspace is at most this factor larger than nbnd
INTEGER, PARAMETER :: min_extra = 4 ! but at least this much larger
INTEGER :: ibnd, ibnd_start, ibnd_end, how_many, lbnd, kbnd, last_unconverged, &
recv_counts(nbgrp), displs(nbgrp), column_type
INTEGER :: ii, jj, kk ! indexes for cuf kernel loops
!civn 2fix: these are needed only for __MPI = true (protate)
COMPLEX(DP), ALLOCATABLE :: psi(:,:), hpsi(:,:), spsi(:,:)
REAL(DP), ALLOCATABLE :: eig(:), ew(:)
!
!
! ... device variables
!
COMPLEX(DP), ALLOCATABLE :: psi_d(:,:), hpsi_d(:,:), spsi_d(:,:)
REAL(DP), ALLOCATABLE :: ew_d(:)
LOGICAL, ALLOCATABLE :: conv_d(:)
#if defined (__CUDA)
attributes(device) :: psi_d, hpsi_d, spsi_d
attributes(device) :: evc_d, eig_d, ew_d
attributes(device) :: conv_d
#endif
!
! ... init local variables
!
CALL laxlib_getval( nproc_ortho = nproc_ortho )
paro_ntr = 20
nvecx = nbnd + max ( nint ( extra_factor * nbnd ), min_extra )
!
CALL start_clock( 'paro_gamma' ); !write (6,*) ' enter paro diag'
CALL mp_type_create_column_section(evc_d(1,1), 0, npwx, npwx, column_type)
ALLOCATE ( conv(nbnd) )
ALLOCATE ( conv_d(nbnd) )
ALLOCATE ( psi_d(npwx,nvecx), hpsi_d(npwx,nvecx), spsi_d(npwx,nvecx), ew_d(nvecx) )
CALL start_clock( 'paro:init' );
conv(:) = .FALSE. ; nconv = COUNT ( conv(:) )
!$cuf kernel do(1)
do ii = 1, nbnd
conv_d(ii) = .FALSE.
end do
!$cuf kernel do(1)
DO ii = 1, npwx
psi_d(ii,1:nbnd) = evc_d(ii,1:nbnd) ! copy input evc into work vector
END DO
call h_psi_gpu (npwx,npw,nbnd,psi_d,hpsi_d) ! computes H*psi
call s_psi_gpu (npwx,npw,nbnd,psi_d,spsi_d) ! computes S*psi
nhpsi = 0 ; IF (my_bgrp_id==0) nhpsi = nbnd
CALL stop_clock( 'paro:init' );
#if defined(__MPI)
IF ( nproc_ortho == 1 ) THEN
#endif
CALL rotate_HSpsi_gamma_gpu ( npwx, npw, nbnd, nbnd, psi_d, hpsi_d, overlap, spsi_d, eig_d )
#if defined(__MPI)
ELSE
!civn 2fix
ALLOCATE ( psi(npwx,nvecx), hpsi(npwx,nvecx), spsi(npwx,nvecx), eig(nbnd) )
eig = eig_d
psi = psi_d
hpsi = hpsi_d
spsi = spsi_d
CALL protate_HSpsi_gamma( npwx, npw, nbnd, nbnd, psi, hpsi, overlap, spsi, eig )
eig_d = eig
psi_d = psi
hpsi_d = hpsi
spsi_d = spsi
DEALLOCATE ( psi, hpsi, spsi, eig )
ENDIF
#endif
!write (6,'(10f10.4)') psi(1:5,1:3)
!write (6,*) eig(1:nbnd)
ParO_loop : &
DO itry = 1,paro_ntr
!write (6,*) ' paro_itry =', itry, ethr
!----------------------------
nactive = nbnd - (nconv+1)/2 ! number of correction vectors to be computed (<nbnd)
notconv = nbnd - nconv ! number of needed roots
nextra = nactive - notconv ! number of extra vectors
nbase = nconv + nactive ! number of orbitals the correction should be orthogonal to (<2*nbnd)
ndiag = nbase + nactive ! dimension of the matrix to be diagonalized at this iteration (<2*nbnd)
!----------------------------
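! NB: the bookkeeping above is immediately recomputed below, this time capped
! by the available workspace nvecx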
nactive = min ( (nvecx-nconv)/2, nvecx-nbnd) ! number of corrections there is space for
notconv = nbnd - nconv ! number of needed roots
nextra = max ( nactive - notconv, 0 ) ! number of extra vectors, if any
nbase = max ( nconv + nactive , nbnd ) ! number of orbitals to be orthogonal to (<nvecx)
ntrust = min ( nconv + nactive , nbnd ) ! number of orbitals that will be actually corrected
ndiag = nbase + nactive ! dimension of the matrix to be diagonalized at this iteration (<nvecx)
!write (6,*) itry, notconv, conv
!write (6,*) ' nvecx, nbnd, nconv, notconv, nextra, nactive, nbase, ntrust, ndiag =', nvecx, nbnd, nconv, notconv, nextra, nactive, nbase, ntrust, ndiag
CALL divide_all(inter_bgrp_comm,nactive,ibnd_start,ibnd_end,recv_counts,displs)
how_many = ibnd_end - ibnd_start + 1
!write (6,*) nactive, ibnd_start, ibnd_end, recv_counts, displs
CALL start_clock( 'paro:pack' );
lbnd = 1; kbnd = 1
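! (unconverged roots are packed past nbase and distributed round-robin across
!  band groups; recv_counts/displs come from divide_all on inter_bgrp_comm)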
DO ibnd = 1, ntrust ! pack unconverged roots in the available space
IF (.NOT.conv(ibnd) ) THEN
!$cuf kernel do(1)
DO ii = 1, npwx
psi_d (ii,nbase+kbnd) = psi_d(ii,ibnd)
hpsi_d(ii,nbase+kbnd) = hpsi_d(ii,ibnd)
spsi_d(ii,nbase+kbnd) = spsi_d(ii,ibnd)
END DO
!$cuf kernel do(1)
DO ii = 1, 1
ew_d(kbnd) = eig_d(ibnd)
END DO
last_unconverged = ibnd
lbnd=lbnd+1 ; kbnd=kbnd+recv_counts(mod(lbnd-2,nbgrp)+1); if (kbnd>nactive) kbnd=kbnd+1-nactive
END IF
END DO
DO ibnd = nbnd+1, nbase ! add extra vectors when space allows
!$cuf kernel do(1)
DO ii = 1, npwx
psi_d (ii,nbase+kbnd) = psi_d(ii,ibnd)
hpsi_d(ii,nbase+kbnd) = hpsi_d(ii,ibnd)
spsi_d(ii,nbase+kbnd) = spsi_d(ii,ibnd)
END DO
!$cuf kernel do(1)
DO ii = 1, 1
ew_d(kbnd) = eig_d(last_unconverged)
END DO
lbnd=lbnd+1 ; kbnd=kbnd+recv_counts(mod(lbnd-2,nbgrp)+1); if (kbnd>nactive) kbnd=kbnd+1-nactive
END DO
!$cuf kernel do(2)
DO ii = 1, npwx
DO jj = nbase+1, nbase+how_many
kk = jj + ibnd_start - 1
psi_d (ii,jj) = psi_d (ii,kk)
hpsi_d(ii,jj) = hpsi_d(ii,kk)
spsi_d(ii,jj) = spsi_d(ii,kk)
END DO
END DO
!$cuf kernel do(1)
DO ii = 1, how_many
kk = ii + ibnd_start - 1
ew_d(ii) = ew_d(kk)
END DO
CALL stop_clock( 'paro:pack' );
! write (6,*) ' check nactive = ', lbnd, nactive
if (lbnd .ne. nactive+1 ) stop ' nactive check FAILED '
CALL bpcg_gamma_gpu(hs_psi_gpu, g_1psi_gpu, psi_d, spsi_d, npw, npwx, nbnd, how_many, &
psi_d(:,nbase+1), hpsi_d(:,nbase+1), spsi_d(:,nbase+1), ethr, ew_d(1), nhpsi)
CALL start_clock( 'paro:mp_bar' );
CALL mp_barrier(inter_bgrp_comm)
CALL stop_clock( 'paro:mp_bar' );
CALL start_clock( 'paro:mp_sum' );
!$cuf kernel do(2)
DO ii = 1, npwx
DO jj = nbase+1, nbase+how_many
kk = jj + ibnd_start - 1
psi_d(ii, kk) = psi_d(ii, jj)
hpsi_d(ii, kk) = hpsi_d(ii, jj)
spsi_d(ii, kk) = spsi_d(ii, jj)
END DO
END DO
CALL mp_allgather(psi_d (:,nbase+1:ndiag), column_type, recv_counts, displs, inter_bgrp_comm)
CALL mp_allgather(hpsi_d(:,nbase+1:ndiag), column_type, recv_counts, displs, inter_bgrp_comm)
CALL mp_allgather(spsi_d(:,nbase+1:ndiag), column_type, recv_counts, displs, inter_bgrp_comm)
CALL stop_clock( 'paro:mp_sum' );
#if defined(__MPI)
IF ( nproc_ortho == 1 ) THEN
#endif
CALL rotate_HSpsi_gamma_gpu ( npwx, npw, ndiag, ndiag, psi_d, hpsi_d, overlap, spsi_d, ew_d )
#if defined(__MPI)
ELSE
!civn 2fix
ALLOCATE ( psi(npwx,nvecx), hpsi(npwx,nvecx), spsi(npwx,nvecx), ew(nvecx) )
ew = ew_d
psi = psi_d
hpsi = hpsi_d
spsi = spsi_d
CALL protate_HSpsi_gamma( npwx, npw, ndiag, ndiag, psi, hpsi, overlap, spsi, ew )
ew_d = ew
psi_d = psi
hpsi_d = hpsi
spsi_d = spsi
DEALLOCATE ( psi, hpsi, spsi, ew )
ENDIF
#endif
!write (6,*) ' ew : ', ew(1:nbnd)
! only the first nbnd eigenvalues are relevant for convergence
! but only those that have actually been corrected should be trusted
conv(1:nbnd) = .FALSE.
!$cuf kernel do(1)
do ii = 1, nbnd
conv_d(ii) = .FALSE.
end do
!$cuf kernel do(1)
DO ii = 1, ntrust
conv_d(ii) = ABS(ew_d(ii) - eig_d(ii)).LT.ethr
END DO
conv = conv_d
nconv = COUNT(conv(1:ntrust)) ; notconv = nbnd - nconv
!$cuf kernel do(1)
DO ii = 1, nbnd
eig_d(ii) = ew_d(ii)
END DO
IF ( nconv == nbnd ) EXIT ParO_loop
END DO ParO_loop
!$cuf kernel do(2)
DO ii = 1, npwx
DO jj = 1, nbnd
evc_d(ii,jj) = psi_d(ii,jj)
END DO
END DO
CALL mp_sum(nhpsi,inter_bgrp_comm)
DEALLOCATE ( ew_d, conv )
DEALLOCATE ( conv_d )
DEALLOCATE ( psi_d, hpsi_d, spsi_d )
CALL mp_type_free( column_type )
CALL stop_clock( 'paro_gamma' ); !write (6,*) ' exit paro diag'
END SUBROUTINE paro_gamma_new_gpu

View File

@ -0,0 +1,325 @@
!
! Copyright (C) 2015-2016 Aihui Zhou's group
!
! This file is distributed under the terms of the
! GNU General Public License. See the file `License'
! in the root directory of the present distribution,
! or http://www.gnu.org/copyleft/gpl.txt .
!
!-------------------------------------------------------------------------------
!
! We propose some parallel orbital-updating based plane-wave basis methods
! for electronic structure calculations, which aim at the solution of the corresponding eigenvalue
! problems. Compared to traditional plane-wave methods, our methods feature two-level
! parallelization, which gives them a great advantage in large-scale parallel calculations.
!
! The following is the parallel orbital-updating algorithm:
! 1. Choose initial $E_{\mathrm{cut}}^{(0)}$ and then obtain $V_{N_G^{0}}$, use the SCF method to solve
! the Kohn-Sham equation in $V_{G_0}$ and get the initial $(\lambda_i^{0},u_i^{0}), i=1, \cdots, N$
! and let $n=0$.
! 2. For $i=1,2,\ldots,N$, find $e_i^{n+1/2}\in V_{G_n}$ satisfying
! $$a(\rho_{in}^{n}; e_i^{n+1/2}, v) = -[(a(\rho_{in}^{n}; u_i^{n}, v) - \lambda_i^{n} (u_i^{n}, v))] $$
! in parallel, where $\rho_{in}^{n}$ is the input charge density obtained from the orbitals of the
! $n$-th iteration or of previous iterations.
! 3. Find $\{\lambda_i^{n+1},u_i^{n+1}\} \in \mathbf{R}\times \tilde{V}_N$ satisfying
! $$a(\tilde{\rho}; u_i^{n+1}, v) = ( \lambda_i^{n+1}u_i^{n+1}, v) \quad \forall v \in \tilde{V}_N$$
! where $\tilde{V}_N = \mathrm{span}\{e_1^{n+1/2},\ldots,e_N^{n+1/2},u_1^{n},\ldots,u_N^{n}\}$,
! $\tilde{\rho}(x)$ is the input charge density obtained from the previous orbits.
! 4. Convergence check: if not converged, set $n=n+1$, go to step 2; else, stop.
!
! Detailed information can be found in:
! X. Dai, X. Gong, A. Zhou, J. Zhu,
! A parallel orbital-updating approach for electronic structure calculations, arXiv:1405.0260 (2014).
! X. Dai, Z. Liu, X. Zhang, A. Zhou,
! A Parallel Orbital-updating Based Optimization Method for Electronic Structure Calculations,
! arXiv:1510.07230 (2015).
! Yan Pan, Xiaoying Dai, Xingao Gong, Stefano de Gironcoli, Gian-Marco Rignanese, and Aihui Zhou,
! A Parallel Orbital-updating Based Plane Wave Basis Method. J. Comp. Phys. 348, 482-492 (2017).
!
! The file is written mainly by Stefano de Gironcoli and Yan Pan.
!
!-------------------------------------------------------------------------------
SUBROUTINE paro_k_new_gpu( h_psi_gpu, s_psi_gpu, hs_psi_gpu, g_1psi_gpu, overlap, &
npwx, npw, nbnd, npol, evc_d, eig_d, btype, ethr, notconv, nhpsi )
!-------------------------------------------------------------------------------
!paro_flag = 1: modified parallel orbital-updating method
#if defined (__CUDA)
USE cudafor
#endif
! global variables
USE util_param, ONLY : DP, stdout
USE mp_bands_util, ONLY : inter_bgrp_comm, nbgrp, my_bgrp_id
USE mp, ONLY : mp_sum, mp_allgather, mp_barrier, &
mp_type_create_column_section, mp_type_free
IMPLICIT NONE
!
INCLUDE 'laxlib.fh'
! I/O variables
LOGICAL, INTENT(IN) :: overlap
INTEGER, INTENT(IN) :: npw, npwx, nbnd, npol
COMPLEX(DP), INTENT(INOUT) :: evc_d(npwx*npol,nbnd)
REAL(DP), INTENT(IN) :: ethr
REAL(DP), INTENT(INOUT) :: eig_d(nbnd)
REAL(DP), ALLOCATABLE :: eig(:) ! copy for protate
INTEGER, INTENT(IN) :: btype(nbnd)
INTEGER, INTENT(OUT) :: notconv, nhpsi
! INTEGER, INTENT(IN) :: paro_flag
! local variables (used in the call to cegterg )
!------------------------------------------------------------------------
EXTERNAL h_psi_gpu, s_psi_gpu, hs_psi_gpu, g_1psi_gpu
! subroutine h_psi (npwx,npw,nvec,evc,hpsi) computes H*evc using band parallelization
! subroutine s_psi (npwx,npw,nvec,evc,spsi) computes S*evc using band parallelization
! subroutine hs_1psi(npwx,npw,evc,hpsi,spsi) computes H*evc and S*evc for a single band
! subroutine g_1psi (npwx,npw,psi,eig) computes g*psi for a single band
!
! ... local variables
!
INTEGER :: itry, paro_ntr, nconv, nextra, nactive, nbase, ntrust, ndiag, nvecx, nproc_ortho
LOGICAL, ALLOCATABLE :: conv(:)
REAL(DP), PARAMETER :: extra_factor = 0.5 ! workspace is at most this factor larger than nbnd
INTEGER, PARAMETER :: min_extra = 4 ! but at least this large
INTEGER :: ibnd, ibnd_start, ibnd_end, how_many, lbnd, kbnd, last_unconverged, &
recv_counts(nbgrp), displs(nbgrp), column_type
INTEGER :: ii, jj, kk ! indexes for cuf kernel loops
!
!civn 2fix: these are needed only for __MPI = true (protate)
REAL(DP), ALLOCATABLE :: ew(:)
COMPLEX(DP), ALLOCATABLE :: psi(:,:), hpsi(:,:), spsi(:,:)
!
! .. device variables
!
REAL(DP), ALLOCATABLE :: ew_d(:)
COMPLEX(DP), ALLOCATABLE :: psi_d(:,:), hpsi_d(:,:), spsi_d(:,:)
LOGICAL, ALLOCATABLE :: conv_d(:)
#if defined (__CUDA)
attributes(device) :: evc_d, eig_d
attributes(device) :: psi_d, hpsi_d, spsi_d, ew_d
attributes(device) :: conv_d
#endif
!
! ... init local variables
!
CALL laxlib_getval( nproc_ortho = nproc_ortho )
paro_ntr = 20
nvecx = nbnd + max ( nint ( extra_factor * nbnd ), min_extra )
!
CALL start_clock( 'paro_k' ); !write (6,*) ' enter paro diag'
CALL mp_type_create_column_section(evc_d(1,1), 0, npwx*npol, npwx*npol, column_type)
ALLOCATE ( ew_d(nvecx), conv(nbnd) )
ALLOCATE ( conv_d(nbnd) )
ALLOCATE ( psi_d(npwx*npol,nvecx), hpsi_d(npwx*npol,nvecx), spsi_d(npwx*npol,nvecx) )
CALL start_clock( 'paro:init' );
conv(:) = .FALSE. ; nconv = COUNT ( conv(:) )
!$cuf kernel do(1)
do ii = 1, nbnd
conv_d(ii) = .FALSE.
end do
!$cuf kernel do(2)
DO ii = 1, npwx*npol
DO jj = 1, nbnd
psi_d(ii,jj) = evc_d(ii,jj)
END DO
END DO
call h_psi_gpu (npwx,npw,nbnd,psi_d,hpsi_d) ! computes H*psi
call s_psi_gpu (npwx,npw,nbnd,psi_d,spsi_d) ! computes S*psi
nhpsi = 0 ; IF (my_bgrp_id==0) nhpsi = nbnd
CALL stop_clock( 'paro:init' );
#if defined(__MPI)
IF ( nproc_ortho == 1 ) THEN
#endif
CALL rotate_HSpsi_k_gpu ( npwx, npw, nbnd, nbnd, npol, psi_d, hpsi_d, overlap, spsi_d, eig_d )
#if defined(__MPI)
ELSE
!civn 2fix
ALLOCATE ( psi(npwx*npol,nvecx), hpsi(npwx*npol,nvecx), spsi(npwx*npol,nvecx), eig(nbnd) )
psi = psi_d
hpsi = hpsi_d
spsi = spsi_d
eig = eig_d
CALL protate_HSpsi_k( npwx, npw, nbnd, nbnd, npol, psi, hpsi, overlap, spsi, eig )
psi_d = psi
hpsi_d = hpsi
spsi_d = spsi
eig_d = eig
DEALLOCATE ( psi, hpsi, spsi, eig )
ENDIF
#endif
!write (6,'(10f10.4)') psi(1:5,1:3)
!write (6,*) eig(1:nbnd)
ParO_loop : &
DO itry = 1,paro_ntr
!write (6,*) ' paro_itry =', itry, ethr
!----------------------------
nactive = nbnd - (nconv+1)/2 ! number of correction vectors to be computed (<nbnd)
notconv = nbnd - nconv ! number of needed roots
nextra = nactive - notconv ! number of extra vectors
nbase = nconv + nactive ! number of orbitals the correction should be orthogonal to (<2*nbnd)
ndiag = nbase + nactive ! dimension of the matrix to be diagonalized at this iteration (<2*nbnd)
!---------------------------- (the estimates above are recomputed below, enforcing the available workspace nvecx)
nactive = min ( (nvecx-nconv)/2, nvecx-nbnd) ! number of corrections there is space for
notconv = nbnd - nconv ! number of needed roots
nextra = max ( nactive - notconv, 0 ) ! number of extra vectors, if any
nbase = max ( nconv + nactive , nbnd ) ! number of orbitals to be orthogonal to (<nvecx)
ntrust = min ( nconv + nactive , nbnd ) ! number of orbitals that will be actually corrected
ndiag = nbase + nactive ! dimension of the matrix to be diagonalized at this iteration (<nvecx)
!write (6,*) itry, notconv, conv
!write (6,*) ' nvecx, nbnd, nconv, notconv, nextra, nactive, nbase, ntrust, ndiag =', nvecx, nbnd, nconv, notconv, nextra, nactive, nbase, ntrust, ndiag
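! A worked example of the sizing above (illustrative only): with nbnd=8,
! nvecx = 8 + max(nint(0.5*8),4) = 12; if nconv=2 bands have converged,
!    nactive = min((12-2)/2, 12-8) = 4   corrections computed this iteration
!    notconv = 8 - 2              = 6    roots still needed
!    nextra  = max(4-6,0)         = 0    no extra vectors
!    nbase   = max(2+4,8)         = 8    so corrections go to columns 9..12
!    ntrust  = min(2+4,8)         = 6    bands actually corrected
!    ndiag   = 8 + 4              = 12   size of the Rayleigh-Ritz problem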
CALL divide_all(inter_bgrp_comm,nactive,ibnd_start,ibnd_end,recv_counts,displs)
how_many = ibnd_end - ibnd_start + 1
!write (6,*) nactive, ibnd_start, ibnd_end, recv_counts, displs
CALL start_clock( 'paro:pack' );
lbnd = 1; kbnd = 1
DO ibnd = 1, ntrust ! pack unconverged roots in the available space
IF (.NOT.conv(ibnd) ) THEN
!$cuf kernel do(1)
DO ii = 1, npwx*npol
psi_d (ii,nbase+kbnd) = psi_d(ii,ibnd)
hpsi_d(ii,nbase+kbnd) = hpsi_d(ii,ibnd)
spsi_d(ii,nbase+kbnd) = spsi_d(ii,ibnd)
END DO
!$cuf kernel do(1)
DO ii = 1, 1 ! single-iteration kernel: scalar copy on the device
ew_d(kbnd) = eig_d(ibnd)
END DO
last_unconverged = ibnd
lbnd=lbnd+1 ; kbnd=kbnd+recv_counts(mod(lbnd-2,nbgrp)+1); if (kbnd>nactive) kbnd=kbnd+1-nactive
END IF
END DO
DO ibnd = nbnd+1, nbase ! add extra vectors, if any
!$cuf kernel do(1)
DO ii = 1, npwx*npol
psi_d (ii,nbase+kbnd) = psi_d(ii,ibnd)
hpsi_d(ii,nbase+kbnd) = hpsi_d(ii,ibnd)
spsi_d(ii,nbase+kbnd) = spsi_d(ii,ibnd)
END DO
!$cuf kernel do(1)
DO ii = 1, 1 ! single-iteration kernel: scalar copy on the device
ew_d(kbnd) = eig_d(last_unconverged)
END DO
lbnd=lbnd+1 ; kbnd=kbnd+recv_counts(mod(lbnd-2,nbgrp)+1); if (kbnd>nactive) kbnd=kbnd+1-nactive
END DO
!$cuf kernel do(2)
DO jj = 1, how_many
kk = jj + ibnd_start - 1
DO ii = 1, npwx*npol
psi_d (ii,nbase+jj) = psi_d (ii,nbase+kk)
hpsi_d(ii,nbase+jj) = hpsi_d(ii,nbase+kk)
spsi_d(ii,nbase+jj) = spsi_d(ii,nbase+kk)
END DO
END DO
!$cuf kernel do(1)
DO ii = 1, how_many
ew_d(ii) = ew_d(ii+ibnd_start-1)
END DO
CALL stop_clock( 'paro:pack' );
!write (6,*) ' check nactive = ', lbnd, nactive, nconv
if (lbnd .ne. nactive+1 ) stop ' nactive check FAILED '
CALL bpcg_k_gpu(hs_psi_gpu, g_1psi_gpu, psi_d, spsi_d, npw, npwx, nbnd, npol, how_many, &
psi_d(:,nbase+1), hpsi_d(:,nbase+1), spsi_d(:,nbase+1), ethr, ew_d(1), nhpsi)
!
CALL start_clock( 'paro:mp_bar' );
CALL mp_barrier(inter_bgrp_comm)
CALL stop_clock( 'paro:mp_bar' );
CALL start_clock( 'paro:mp_sum' );
!$cuf kernel do(2)
DO ii = 1, npwx*npol
DO jj = nbase+1, nbase+how_many
kk = jj + ibnd_start - 1
psi_d (ii,kk) = psi_d (ii,jj)
hpsi_d(ii,kk) = hpsi_d(ii,jj)
spsi_d(ii,kk) = spsi_d(ii,jj)
END DO
END DO
CALL mp_allgather(psi_d (:,nbase+1:ndiag), column_type, recv_counts, displs, inter_bgrp_comm)
CALL mp_allgather(hpsi_d(:,nbase+1:ndiag), column_type, recv_counts, displs, inter_bgrp_comm)
CALL mp_allgather(spsi_d(:,nbase+1:ndiag), column_type, recv_counts, displs, inter_bgrp_comm)
CALL stop_clock( 'paro:mp_sum' );
#if defined(__MPI)
IF ( nproc_ortho == 1 ) THEN
#endif
CALL rotate_HSpsi_k_gpu ( npwx, npw, ndiag, ndiag, npol, psi_d, hpsi_d, overlap, spsi_d, ew_d )
#if defined(__MPI)
ELSE
!civn 2fix
ALLOCATE ( psi(npwx*npol,nvecx), hpsi(npwx*npol,nvecx), spsi(npwx*npol,nvecx), ew(nvecx) )
psi = psi_d
hpsi = hpsi_d
spsi = spsi_d
ew = ew_d
CALL protate_HSpsi_k( npwx, npw, ndiag, ndiag, npol, psi, hpsi, overlap, spsi, ew )
psi_d = psi
hpsi_d = hpsi
spsi_d = spsi
ew_d = ew
DEALLOCATE ( psi, hpsi, spsi, ew )
ENDIF
#endif
!write (6,*) ' ew : ', ew(1:nbnd)
! only the first nbnd eigenvalues are relevant for convergence
! but only those that have actually been corrected should be trusted
conv(1:nbnd) = .FALSE.
!$cuf kernel do(1)
do ii = 1, nbnd
conv_d(ii) = .FALSE.
end do
!$cuf kernel do(1)
DO ii = 1, ntrust
conv_d(ii) = ABS(ew_d(ii) - eig_d(ii)).LT.ethr
END DO
conv = conv_d
nconv = COUNT(conv(1:ntrust)) ; notconv = nbnd - nconv
!$cuf kernel do(1)
DO ii = 1, nbnd
eig_d(ii) = ew_d(ii)
END DO
IF ( nconv == nbnd ) EXIT ParO_loop
END DO ParO_loop
!$cuf kernel do(2)
DO ii = 1, npwx*npol
DO jj = 1, nbnd
evc_d(ii,jj) = psi_d(ii,jj)
END DO
END DO
CALL mp_sum(nhpsi,inter_bgrp_comm)
DEALLOCATE ( ew_d, conv )
DEALLOCATE ( conv_d )
DEALLOCATE ( psi_d, hpsi_d, spsi_d )
CALL mp_type_free( column_type )
CALL stop_clock( 'paro_k' ); !write (6,*) ' exit paro diag'
END SUBROUTINE paro_k_new_gpu

KS_Solvers/make.gpu Normal file

@ -0,0 +1,18 @@
# Makefile additions for the GPU versions of the iterative solvers (Davidson, CG, PPCG, ParO)
ALLOBJS += \
Davidson/cegterg_gpu.o \
Davidson/regterg_gpu.o \
DENSE/rotate_wfc_k_gpu.o \
DENSE/rotate_wfc_gamma_gpu.o \
CG/rcgdiagg_gpu.o \
CG/ccgdiagg_gpu.o \
PPCG/generic_cublas.o \
PPCG/ppcg_gamma_gpu.o \
PPCG/ppcg_k_gpu.o \
ParO/bpcg_gamma_gpu.o \
ParO/bpcg_k_gpu.o \
ParO/paro_gamma_new_gpu.o \
ParO/paro_k_new_gpu.o \
DENSE/rotate_HSpsi_gamma_gpu.o \
DENSE/rotate_HSpsi_k_gpu.o


@ -2,10 +2,13 @@
include ../make.inc
EXTLIBS=$(CUDA_LIBS) $(SCALAPACK_LIBS) $(LAPACK_LIBS) $(BLAS_LIBS) $(MPI_LIBS)
LAX = la_types.o \
la_error.o \
la_module.o \
la_helper.o \
cdiaghg.o \
rdiaghg.o \

LAXlib/la_module.f90 Normal file

@ -0,0 +1,442 @@
MODULE LAXlib
#ifdef __CUDA
USE cudafor
#endif
IMPLICIT NONE
!
INTERFACE diaghg
MODULE PROCEDURE cdiaghg_cpu_, rdiaghg_cpu_
#ifdef __CUDA
MODULE PROCEDURE cdiaghg_gpu_, rdiaghg_gpu_
#endif
END INTERFACE
!
INTERFACE pdiaghg
MODULE PROCEDURE pcdiaghg_, prdiaghg_
#ifdef __CUDA
MODULE PROCEDURE pcdiaghg__gpu, prdiaghg__gpu
#endif
END INTERFACE
!
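! A minimal usage sketch (illustrative only; the argument list follows the
! specific procedures declared below, and the optional last argument requests
! GPU offload when compiled with CUDA support):
!
!    CALL diaghg ( n, m, h, s, ldh, e, v, me_bgrp, root_bgrp, intra_bgrp_comm )
!    CALL diaghg ( n, m, h, s, ldh, e, v, me_bgrp, root_bgrp, intra_bgrp_comm, offload=.true. )
!    CALL pdiaghg( n, h, s, ldh, e, v, idesc )   ! distributed-data driver
!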
CONTAINS
!
!----------------------------------------------------------------------------
SUBROUTINE cdiaghg_cpu_( n, m, h, s, ldh, e, v, me_bgrp, root_bgrp, intra_bgrp_comm, offload )
!----------------------------------------------------------------------------
!
! ... calculates eigenvalues and eigenvectors of the generalized problem
! ... Hv=eSv, with H hermitean matrix, S overlap matrix.
! ... On output both matrices are unchanged
!
! ... LAPACK version - uses both ZHEGV and ZHEGVX
!
#if defined (__CUDA)
USE cudafor
#endif
!
IMPLICIT NONE
include 'laxlib_kinds.fh'
!
INTEGER, INTENT(IN) :: n, m, ldh
! dimension of the matrix to be diagonalized
! number of eigenstates to be calculated
! leading dimension of h, as declared in the calling pgm unit
COMPLEX(DP), INTENT(INOUT) :: h(ldh,n), s(ldh,n)
! actually intent(in) but compilers don't know and complain
! matrix to be diagonalized
! overlap matrix
REAL(DP), INTENT(OUT) :: e(n)
! eigenvalues
COMPLEX(DP), INTENT(OUT) :: v(ldh,m)
! eigenvectors (column-wise)
INTEGER, INTENT(IN) :: me_bgrp, root_bgrp, intra_bgrp_comm
!
LOGICAL, OPTIONAL :: offload
! optionally solve the eigenvalue problem on the GPU
LOGICAL :: loffload
!
#if defined(__CUDA)
COMPLEX(DP), ALLOCATABLE, DEVICE :: v_d(:,:), h_d(:,:), s_d(:,:)
REAL(DP), ALLOCATABLE, DEVICE :: e_d(:)
INTEGER :: info
#endif
!
loffload = .false.
!
! the following ifdef ensures no offload when not compiling for GPU
#if defined(__CUDA)
IF (PRESENT(offload)) loffload = offload
#endif
!
! ... always false when compiling without CUDA support
!
IF ( loffload ) THEN
#if defined(__CUDA)
!
ALLOCATE(s_d, source=s); ALLOCATE(h_d, source=h)
ALLOCATE(e_d(n), v_d(ldh,n))
!
CALL laxlib_cdiaghg_gpu(n, m, h_d, s_d, ldh, e_d, v_d, me_bgrp, root_bgrp, intra_bgrp_comm)
!
e = e_d
v(1:ldh,1:m) = v_d(1:ldh,1:m)
!
DEALLOCATE(h_d, s_d, e_d, v_d)
#endif
ELSE
CALL laxlib_cdiaghg(n, m, h, s, ldh, e, v, me_bgrp, root_bgrp, intra_bgrp_comm)
END IF
!
RETURN
!
END SUBROUTINE cdiaghg_cpu_
!
#if defined(__CUDA)
!----------------------------------------------------------------------------
SUBROUTINE cdiaghg_gpu_( n, m, h_d, s_d, ldh, e_d, v_d, me_bgrp, root_bgrp, intra_bgrp_comm, onhost )
!----------------------------------------------------------------------------
!
! ... calculates eigenvalues and eigenvectors of the generalized problem
! ... Hv=eSv, with H hermitean matrix, S overlap matrix.
! ... On output both matrices are unchanged
!
! ... LAPACK version - uses both ZHEGV and ZHEGVX
!
USE cudafor
!
IMPLICIT NONE
include 'laxlib_kinds.fh'
!
INTEGER, INTENT(IN) :: n, m, ldh
! dimension of the matrix to be diagonalized
! number of eigenstates to be calculated
! leading dimension of h, as declared in the calling pgm unit
COMPLEX(DP), DEVICE, INTENT(INOUT) :: h_d(ldh,n), s_d(ldh,n)
! actually intent(in) but compilers don't know and complain
! matrix to be diagonalized
! overlap matrix
REAL(DP), DEVICE, INTENT(OUT) :: e_d(n)
! eigenvalues
COMPLEX(DP), DEVICE, INTENT(OUT) :: v_d(ldh,n)
! eigenvectors (column-wise)
INTEGER, INTENT(IN) :: me_bgrp, root_bgrp, intra_bgrp_comm
! communicators
LOGICAL, OPTIONAL :: onhost
! optionally solve the eigenvalue problem on the CPU
LOGICAL :: lonhost
!
COMPLEX(DP), ALLOCATABLE :: v(:,:), h(:,:), s(:,:)
REAL(DP), ALLOCATABLE :: e(:)
!
INTEGER :: info
!
lonhost = .false.
!
IF (PRESENT(onhost)) lonhost = onhost
!
!
IF ( lonhost ) THEN
!
ALLOCATE(s, source=s_d); ALLOCATE(h, source=h_d)
ALLOCATE(e(n), v(ldh,m))
!
CALL laxlib_cdiaghg(n, m, h, s, ldh, e, v, me_bgrp, root_bgrp, intra_bgrp_comm)
!
e_d = e
v_d(1:ldh,1:m) = v(1:ldh,1:m)
!
DEALLOCATE(h, s, e, v)
ELSE
CALL laxlib_cdiaghg_gpu(n, m, h_d, s_d, ldh, e_d, v_d, me_bgrp, root_bgrp, intra_bgrp_comm)
END IF
!
RETURN
!
END SUBROUTINE cdiaghg_gpu_
#endif
!
!----------------------------------------------------------------------------
SUBROUTINE rdiaghg_cpu_( n, m, h, s, ldh, e, v, me_bgrp, root_bgrp, intra_bgrp_comm, offload )
!----------------------------------------------------------------------------
!
! ... general interface for rdiaghg
!
#if defined(__CUDA)
USE cudafor
#endif
!
IMPLICIT NONE
include 'laxlib_kinds.fh'
!
INTEGER, INTENT(IN) :: n, m, ldh
! dimension of the matrix to be diagonalized
! number of eigenstates to be calculated
! leading dimension of h, as declared in the calling pgm unit
REAL(DP), INTENT(INOUT) :: h(ldh,n), s(ldh,n)
! actually intent(in) but compilers don't know and complain
! matrix to be diagonalized
! overlap matrix
REAL(DP), INTENT(OUT) :: e(n)
! eigenvalues
REAL(DP), INTENT(OUT) :: v(ldh,m)
! eigenvectors (column-wise)
INTEGER, INTENT(IN) :: me_bgrp, root_bgrp, intra_bgrp_comm
! communicators
LOGICAL, OPTIONAL :: offload
! optionally solve the eigenvalue problem on the GPU
LOGICAL :: loffload
!
#if defined(__CUDA)
REAL(DP), ALLOCATABLE, DEVICE :: v_d(:,:), h_d(:,:), s_d(:,:)
REAL(DP), ALLOCATABLE, DEVICE :: e_d(:)
INTEGER :: info
#endif
!
loffload = .false.
!
! the following ifdef ensures no offload when not compiling for GPU
#if defined(__CUDA)
IF (PRESENT(offload)) loffload = offload
#endif
!
! ... always false when compiling without CUDA support
!
IF ( loffload ) THEN
#if defined(__CUDA)
!
ALLOCATE(s_d, source=s); ALLOCATE(h_d, source=h)
ALLOCATE(e_d(n), v_d(ldh,n))
!
CALL laxlib_rdiaghg_gpu(n, m, h_d, s_d, ldh, e_d, v_d, me_bgrp, root_bgrp, intra_bgrp_comm)
!
e = e_d
v(1:ldh,1:m) = v_d(1:ldh,1:m)
!
DEALLOCATE(h_d, s_d, e_d, v_d)
#endif
ELSE
CALL laxlib_rdiaghg(n, m, h, s, ldh, e, v, me_bgrp, root_bgrp, intra_bgrp_comm)
END IF
!
RETURN
!
END SUBROUTINE rdiaghg_cpu_
!
#if defined(__CUDA)
!----------------------------------------------------------------------------
SUBROUTINE rdiaghg_gpu_( n, m, h_d, s_d, ldh, e_d, v_d, me_bgrp, root_bgrp, intra_bgrp_comm, onhost )
!----------------------------------------------------------------------------
!
! ... General interface to rdiaghg_gpu
!
USE cudafor
!
IMPLICIT NONE
include 'laxlib_kinds.fh'
!
INTEGER, INTENT(IN) :: n, m, ldh
! dimension of the matrix to be diagonalized
! number of eigenstates to be calculated
! leading dimension of h, as declared in the calling pgm unit
REAL(DP), DEVICE, INTENT(INOUT) :: h_d(ldh,n), s_d(ldh,n)
! actually intent(in) but compilers don't know and complain
! matrix to be diagonalized
! overlap matrix
REAL(DP), DEVICE, INTENT(OUT) :: e_d(n)
! eigenvalues
REAL(DP), DEVICE, INTENT(OUT) :: v_d(ldh,n)
! eigenvectors (column-wise)
INTEGER, INTENT(IN) :: me_bgrp, root_bgrp, intra_bgrp_comm
! communicators
LOGICAL, OPTIONAL :: onhost
! optionally solve the eigenvalue problem on the CPU
LOGICAL :: lonhost
!
REAL(DP), ALLOCATABLE :: v(:,:), h(:,:), s(:,:)
REAL(DP), ALLOCATABLE :: e(:)
!
INTEGER :: info
!
lonhost = .false.
!
IF (PRESENT(onhost)) lonhost = onhost
!
!
IF ( lonhost ) THEN
!
ALLOCATE(s, source=s_d); ALLOCATE(h, source=h_d)
ALLOCATE(e(n), v(ldh,m))
!
CALL laxlib_rdiaghg(n, m, h, s, ldh, e, v, me_bgrp, root_bgrp, intra_bgrp_comm)
!
e_d = e
v_d(1:ldh,1:m) = v(1:ldh,1:m)
!
DEALLOCATE(h, s, e, v)
ELSE
CALL laxlib_rdiaghg_gpu(n, m, h_d, s_d, ldh, e_d, v_d, me_bgrp, root_bgrp, intra_bgrp_comm)
END IF
!
RETURN
!
END SUBROUTINE rdiaghg_gpu_
#endif
!
! === Parallel diagonalization interface subroutines
!
!
!----------------------------------------------------------------------------
SUBROUTINE prdiaghg_( n, h, s, ldh, e, v, idesc, offload )
!----------------------------------------------------------------------------
!
! ... calculates eigenvalues and eigenvectors of the generalized problem
! ... Hv=eSv, with H symmetric matrix, S overlap matrix.
! ... On output both matrices are unchanged
!
! ... Parallel version with full data distribution
!
IMPLICIT NONE
include 'laxlib_kinds.fh'
include 'laxlib_param.fh'
!
INTEGER, INTENT(IN) :: n, ldh
! dimension of the matrix to be diagonalized and number of eigenstates to be calculated
! leading dimension of h, as declared in the calling pgm unit
REAL(DP), INTENT(INOUT) :: h(ldh,ldh), s(ldh,ldh)
! matrix to be diagonalized
! overlap matrix
!
REAL(DP), INTENT(OUT) :: e(n)
! eigenvalues
REAL(DP), INTENT(OUT) :: v(ldh,ldh)
! eigenvectors (column-wise)
INTEGER, INTENT(IN) :: idesc(LAX_DESC_SIZE)
!
LOGICAL, OPTIONAL :: offload
! place-holder, offloading on GPU not implemented yet
LOGICAL :: loffload
CALL laxlib_prdiaghg( n, h, s, ldh, e, v, idesc)
END SUBROUTINE
!----------------------------------------------------------------------------
SUBROUTINE pcdiaghg_( n, h, s, ldh, e, v, idesc, offload )
!----------------------------------------------------------------------------
!
! ... calculates eigenvalues and eigenvectors of the generalized problem
! ... Hv=eSv, with H hermitean matrix, S overlap matrix.
! ... On output both matrices are unchanged
!
! ... Parallel version with full data distribution
!
!
IMPLICIT NONE
include 'laxlib_kinds.fh'
include 'laxlib_param.fh'
!
INTEGER, INTENT(IN) :: n, ldh
! dimension of the matrix to be diagonalized and number of eigenstates to be calculated
! leading dimension of h, as declared in the calling pgm unit
COMPLEX(DP), INTENT(INOUT) :: h(ldh,ldh), s(ldh,ldh)
! matrix to be diagonalized
! overlap matrix
!
REAL(DP), INTENT(OUT) :: e(n)
! eigenvalues
COMPLEX(DP), INTENT(OUT) :: v(ldh,ldh)
! eigenvectors (column-wise)
INTEGER, INTENT(IN) :: idesc(LAX_DESC_SIZE)
!
LOGICAL, OPTIONAL :: offload
! place-holder, offloading on GPU not implemented yet
LOGICAL :: loffload
CALL laxlib_pcdiaghg( n, h, s, ldh, e, v, idesc)
END SUBROUTINE
!
#if defined(__CUDA)
!----------------------------------------------------------------------------
SUBROUTINE prdiaghg__gpu( n, h_d, s_d, ldh, e_d, v_d, idesc, onhost )
!----------------------------------------------------------------------------
!
! ... calculates eigenvalues and eigenvectors of the generalized problem
! ... Hv=eSv, with H symmetric matrix, S overlap matrix.
! ... On output both matrices are unchanged
!
! ... Parallel version with full data distribution
!
IMPLICIT NONE
include 'laxlib_kinds.fh'
include 'laxlib_param.fh'
!
INTEGER, INTENT(IN) :: n, ldh
! dimension of the matrix to be diagonalized and number of eigenstates to be calculated
! leading dimension of h, as declared in the calling pgm unit
REAL(DP), INTENT(INOUT), DEVICE :: h_d(ldh,ldh), s_d(ldh,ldh)
! matrix to be diagonalized
! overlap matrix
!
REAL(DP), INTENT(OUT), DEVICE :: e_d(n)
! eigenvalues
REAL(DP), INTENT(OUT), DEVICE :: v_d(ldh,ldh)
! eigenvectors (column-wise)
INTEGER, INTENT(IN) :: idesc(LAX_DESC_SIZE)
!
LOGICAL, OPTIONAL :: onhost
! place-holder, prdiaghg on GPU not implemented yet
LOGICAL :: lonhost
!
REAL(DP), ALLOCATABLE :: v(:,:), h(:,:), s(:,:)
REAL(DP), ALLOCATABLE :: e(:)
ALLOCATE(h(ldh,ldh), s(ldh,ldh), e(n), v(ldh,ldh))
h = h_d; s = s_d;
CALL laxlib_prdiaghg( n, h, s, ldh, e, v, idesc)
e_d = e; v_d = v
DEALLOCATE(h,s,v,e)
!
END SUBROUTINE
!----------------------------------------------------------------------------
SUBROUTINE pcdiaghg__gpu( n, h_d, s_d, ldh, e_d, v_d, idesc, onhost )
!----------------------------------------------------------------------------
!
! ... calculates eigenvalues and eigenvectors of the generalized problem
! ... Hv=eSv, with H hermitean matrix, S overlap matrix.
! ... On output both matrices are unchanged
!
! ... Parallel version with full data distribution
!
IMPLICIT NONE
include 'laxlib_kinds.fh'
include 'laxlib_param.fh'
!
INTEGER, INTENT(IN) :: n, ldh
! dimension of the matrix to be diagonalized and number of eigenstates to be calculated
! leading dimension of h, as declared in the calling pgm unit
COMPLEX(DP), INTENT(INOUT), DEVICE :: h_d(ldh,ldh), s_d(ldh,ldh)
! matrix to be diagonalized
! overlap matrix
!
REAL(DP), INTENT(OUT), DEVICE :: e_d(n)
! eigenvalues
COMPLEX(DP), INTENT(OUT), DEVICE :: v_d(ldh,ldh)
! eigenvectors (column-wise)
INTEGER, INTENT(IN) :: idesc(LAX_DESC_SIZE)
!
LOGICAL, OPTIONAL :: onhost
! place-holder, pcdiaghg on GPU not implemented yet
LOGICAL :: lonhost
!
COMPLEX(DP), ALLOCATABLE :: v(:,:), h(:,:), s(:,:)
REAL(DP), ALLOCATABLE :: e(:)
ALLOCATE(h(ldh,ldh), s(ldh,ldh), e(n), v(ldh,ldh))
h = h_d; s = s_d;
CALL laxlib_pcdiaghg( n, h, s, ldh, e, v, idesc)
e_d = e; v_d = v
DEALLOCATE(h,s,v,e)
!
END SUBROUTINE
#endif
END MODULE LAXlib


@ -395,4 +395,16 @@ SUBROUTINE redist_row2col_x( n, a, b, ldx, nx, idesc )
REAL(DP) :: a(ldx,nx), b(ldx,nx)
INTEGER, INTENT(IN) :: idesc(LAX_DESC_SIZE)
END SUBROUTINE
#if defined (__CUDA)
SUBROUTINE redist_row2col_gpu_x( n, a, b, ldx, nx, idesc )
IMPLICIT NONE
include 'laxlib_kinds.fh'
include 'laxlib_param.fh'
INTEGER, INTENT(IN) :: n
INTEGER, INTENT(IN) :: ldx, nx
REAL(DP), DEVICE :: a(:,:)
REAL(DP), DEVICE :: b(:,:)
INTEGER, INTENT(IN) :: idesc(LAX_DESC_SIZE)
END SUBROUTINE
#endif
END INTERFACE


@ -7,6 +7,7 @@ la_error.o : la_param.o
la_helper.o : dspev_drv.o
la_helper.o : la_param.o
la_helper.o : la_types.o
la_module.o : la_param.o
la_helper.o : mp_diag.o
mp_diag.o : la_param.o
ptoolkit.o : la_param.o


@ -3719,13 +3719,10 @@ SUBROUTINE redist_row2col_gpu_x( n, a, b, ldx, nx, idesc )
INTEGER, INTENT(IN) :: n
INTEGER, INTENT(IN) :: ldx, nx
REAL(DP), DEVICE :: a(:,:)
REAL(DP) :: b(:,:)
REAL(DP), DEVICE :: b(:,:)
INTEGER, INTENT(IN) :: idesc(LAX_DESC_SIZE)
!
REAL(DP), ALLOCATABLE :: wrk(:,:)
#if defined(__GPU_MPI)
ATTRIBUTES(DEVICE) :: wrk
#endif
REAL(DP), ALLOCATABLE :: a_h(:,:), b_h(:,:)
INTEGER :: ierr
INTEGER :: np, rowid, colid
INTEGER :: comm
@ -3778,30 +3775,30 @@ SUBROUTINE redist_row2col_gpu_x( n, a, b, ldx, nx, idesc )
CALL lax_error__( " redist_row2col_gpu ", " in MPI_BARRIER ", ABS( ierr ) )
!
#if defined(__GPU_MPI)
ALLOCATE( wrk(SIZE(b,1),SIZE(b,2)), STAT=ierr )
IF( ierr /= 0 ) &
CALL lax_error__( " redist_row2col_gpu ", " allocating wrk ", ABS( ierr ) )
!
ierr = cudaDeviceSynchronize()
CALL MPI_SENDRECV(a, ldx*nx, MPI_DOUBLE_PRECISION, idest, np+np+1, &
wrk, ldx*nx, MPI_DOUBLE_PRECISION, isour, np+np+1, comm, istatus, ierr)
IF( ierr /= 0 ) &
CALL lax_error__( " redist_row2col_gpu ", " in MPI_SENDRECV ", ABS( ierr ) )
b = wrk
!
DEALLOCATE( wrk )
#else
ALLOCATE( wrk, SOURCE=a, STAT=ierr )
IF( ierr /= 0 ) &
CALL lax_error__( " redist_row2col_gpu ", " allocating wrk ", ABS( ierr ) )
!
CALL MPI_SENDRECV(wrk, ldx*nx, MPI_DOUBLE_PRECISION, idest, np+np+1, &
b, ldx*nx, MPI_DOUBLE_PRECISION, isour, np+np+1, comm, istatus, ierr)
IF( ierr /= 0 ) &
CALL lax_error__( " redist_row2col_gpu ", " in MPI_SENDRECV ", ABS( ierr ) )
#else
ALLOCATE( a_h, SOURCE=a, STAT=ierr )
IF( ierr /= 0 ) &
CALL lax_error__( " redist_row2col_gpu ", " allocating a_h ", ABS( ierr ) )
ALLOCATE( b_h, MOLD=b, STAT=ierr )
IF( ierr /= 0 ) &
CALL lax_error__( " redist_row2col_gpu ", " allocating b_h ", ABS( ierr ) )
!
DEALLOCATE( wrk )
CALL MPI_SENDRECV(a_h, ldx*nx, MPI_DOUBLE_PRECISION, idest, np+np+1, &
b_h, ldx*nx, MPI_DOUBLE_PRECISION, isour, np+np+1, comm, istatus, ierr)
IF( ierr /= 0 ) &
CALL lax_error__( " redist_row2col_gpu ", " in MPI_SENDRECV ", ABS( ierr ) )
!
b = b_h
!
DEALLOCATE( a_h, b_h )
#endif
!
#else

LAXlib/tests/.gitattributes vendored Normal file

@ -0,0 +1 @@
*.bin binary

LAXlib/tests/Makefile Normal file

@ -0,0 +1,38 @@
# Makefile for the LAXlib tests
include ../../make.inc
MODFLAGS= $(MOD_FLAG)../../UtilXlib $(MOD_FLAG)../../ELPA/src $(MOD_FLAG).. $(MOD_FLAG).
SRCS = test_diaghg_1.f90 \
test_diaghg_2.f90 \
test_diaghg_3.f90 \
test_diaghg_4.f90 \
test_diaghg_gpu_1.f90 \
test_diaghg_gpu_2.f90 \
test_diaghg_gpu_3.f90 \
test_diaghg_gpu_4.f90
EXECS = $(SRCS:.f90=.x)
UTILS = tester.o mp_world.o utils.o test_io.o test_helpers.o
all: common $(EXECS)
common: $(UTILS)
%.x: %.o
$(LD) $(LDFLAGS) $< $(UTILS) -o $@ ../libqela.a \
../../UtilXlib/libutil.a $(LIBOBJS) $(QELIBS)
clean :
- /bin/rm -f *.o *.a *.d *.i *~ *_tmp.f90 *.mod *.L *.x
# .PHONY forces execution of a rule irrespective of the presence of an
# updated file with the same name of the rule.
.PHONY: all clean

LAXlib/tests/README.md Normal file

@ -0,0 +1,10 @@
# LAXlib testing suite
To run the tests, first run `./configure` in the QE top-level directory to
generate a valid `make.inc`.
You may also download large eigenvalue problems (into this directory) with:

    wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=1EAB3xkoD-i9p4nW6NJDED3WaEK8ZCcf4' -O SiGeK1.bin
    wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=13lFkDbv99V8fqiXER1N2IzoJ_EhuGtt9' -O SiGeK2.bin
Finally `make` and run all `.x` executable files.
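
For example (a sketch only; executable names follow the `SRCS` list in the
Makefile, and the MPI launcher may differ on your system):

    make
    ./test_diaghg_1.x
    mpirun -np 4 ./test_diaghg_4.x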

LAXlib/tests/ZnOG1.bin Normal file

Binary file not shown.

LAXlib/tests/ZnOG2.bin Normal file

Binary file not shown.

LAXlib/tests/ZnOK1.bin Normal file

Binary file not shown.

LAXlib/tests/ZnOK2.bin Normal file

Binary file not shown.

LAXlib/tests/mp_world.f90 Normal file

@ -0,0 +1,89 @@
!----------------------------------------------------------------------------
MODULE mp_world
!----------------------------------------------------------------------------
!
#if defined(__MPI)
USE mpi
#endif
USE mp, ONLY : mp_barrier, mp_start, mp_end, mp_stop
USE mp, ONLY : mp_count_nodes
!
IMPLICIT NONE
SAVE
!
! ... World group - all QE routines using mp_world_start to start MPI
! ... will work in the communicator passed as input to mp_world_start
!
INTEGER :: nnode = 1 ! number of nodes
INTEGER :: nproc = 1 ! number of processors
INTEGER :: mpime = 0 ! processor index (starts from 0 to nproc-1)
INTEGER :: root = 0 ! index of the root processor
INTEGER :: world_comm = 0 ! communicator
!
! ... library_mode =.true. if QE is called as a library by an external code
! ... if true, MPI_Init() is not called when starting MPI,
! ... MPI_Finalize() is not called when stopping MPI
!
!
#if defined(__MPI)
LOGICAL :: library_mode = .FALSE.
#endif
PRIVATE
PUBLIC :: nnode, nproc, mpime, root, world_comm, mp_world_start, mp_world_end
!
CONTAINS
!
!-----------------------------------------------------------------------
SUBROUTINE mp_world_start ( my_world_comm )
!-----------------------------------------------------------------------
!
IMPLICIT NONE
INTEGER, INTENT(IN) :: my_world_comm
#if defined(__MPI)
INTEGER :: ierr
#endif
#if defined(_OPENMP)
INTEGER :: PROVIDED
#endif
!
world_comm = my_world_comm
!
! ... check if mpi is already initialized (library mode) or not
!
#if defined(__MPI)
CALL MPI_Initialized ( library_mode, ierr)
IF (ierr/=0) CALL mp_stop( 8000 )
IF (.NOT. library_mode ) THEN
#if defined(_OPENMP)
CALL MPI_Init_thread(MPI_THREAD_FUNNELED, PROVIDED, ierr)
#else
CALL MPI_Init(ierr)
#endif
IF (ierr/=0) CALL mp_stop( 8001 )
END IF
#endif
!
CALL mp_start( nproc, mpime, world_comm )
!CALL mp_count_nodes ( nnode, world_comm )
!
RETURN
!
END SUBROUTINE mp_world_start
!
!-----------------------------------------------------------------------
SUBROUTINE mp_world_end ( )
!-----------------------------------------------------------------------
#if defined(__MPI)
INTEGER :: ierr
#endif
!
CALL mp_barrier( world_comm )
CALL mp_end ( world_comm )
#if defined(__MPI)
CALL mpi_finalize(ierr)
IF (ierr/=0) CALL mp_stop( 8002 )
#endif
!
END SUBROUTINE mp_world_end
!
END MODULE mp_world


@ -0,0 +1,122 @@
program test_diaghg
#if defined(__MPI)
USE MPI
#endif
USE mp, ONLY : mp_bcast
USE mp_world, ONLY : mp_world_start, mp_world_end, mpime, &
root, nproc, world_comm
USE mp_bands_util, ONLY : me_bgrp, root_bgrp, intra_bgrp_comm
USE tester
IMPLICIT NONE
!
TYPE(tester_t) :: test
INTEGER :: world_group = 0
!
CALL test%init()
#if defined(__MPI)
world_group = MPI_COMM_WORLD
#endif
CALL mp_world_start(world_group)
!
me_bgrp = mpime; root_bgrp=root; intra_bgrp_comm=world_comm
!
CALL real_1(test)
!
CALL complex_1(test)
!
CALL collect_results(test)
!
CALL mp_world_end()
!
IF (mpime .eq. 0) CALL test%print()
!
CONTAINS
!
SUBROUTINE real_1(test)
USE LAXlib
USE la_param, ONLY : DP
implicit none
!
TYPE(tester_t) :: test
! real variables
real(DP) :: h(2,2)
real(DP) :: s(2,2)
real(DP) :: e(2)
real(DP) :: v(2,2)
h = 0.d0
h(1,1) = 1.d0
h(2,2) = 1.d0
s = 0.d0
s(1,1) = 1.d0
s(2,2) = 1.d0
!
v = 0.d0
e = 0.d0
CALL diaghg( 2, 2, h, s, 2, e, v, .false. )
!
CALL test%assert_close( e, [1.d0, 1.d0] )
CALL test%assert_close( RESHAPE(v, [4]), [1.d0, 0.d0, 0.d0, 1.d0] )
CALL test%assert_close( RESHAPE(h, [4]), [1.d0, 0.d0, 0.d0, 1.d0] )
CALL test%assert_close( RESHAPE(s, [4]), [1.d0, 0.d0, 0.d0, 1.d0] )
!
v = 0.d0
e = 0.d0
CALL diaghg( 2, 2, h, s, 2, e, v, .true. )
!
CALL test%assert_close( e, [1.d0, 1.d0] )
CALL test%assert_close( RESHAPE(v, [4]), [1.d0, 0.d0, 0.d0, 1.d0] )
CALL test%assert_close( RESHAPE(h, [4]), [1.d0, 0.d0, 0.d0, 1.d0] )
CALL test%assert_close( RESHAPE(s, [4]), [1.d0, 0.d0, 0.d0, 1.d0] )
!
END SUBROUTINE real_1
!
SUBROUTINE complex_1(test)
USE LAXlib
USE la_param, ONLY : DP
implicit none
!
TYPE(tester_t) :: test
! complex variables
complex(DP) :: h(2,2)
complex(DP) :: h_save(2,2)
complex(DP) :: s(2,2)
complex(DP) :: s_save(2,2)
real(DP) :: e(2)
complex(DP) :: v(2,2)
!
h = 0.d0
h(1,1) = (1.d0, 0.d0)
h(1,2) = (0.d0, -2.d0)
h(2,1) = ( 0.d0, 2.d0)
h(2,2) = ( 5.d0, 0.d0)
s = 0.d0
s(1,1) = (1.d0, 0.d0)
s(2,2) = (1.d0, 0.d0)
!
h_save = h
s_save = s
!
v = (0.d0, 0.d0)
e = 0.d0
CALL diaghg( 2, 2, h, s, 2, e, v, .false. )
! 0.1715728752538099, 5.82842712474619
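! (quick check: trace = 6 and det = 1*5 - (-2i)*(2i) = 1, hence
!  lambda = 3 -+ sqrt(3**2 - 1) = 3 -+ 2*sqrt(2), matching the values above)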
CALL test%assert_close( e, [0.1715728752538099d0, 5.82842712474619d0] )
CALL test%assert_close( v(:,1), [( 0.d0, -0.9238795325112867d0), (-0.3826834323650898d0, 0.d0)] )
CALL test%assert_close( v(:,2), [( 0.d0, -0.3826834323650898d0), ( 0.9238795325112867d0, 0.d0)] )
CALL test%assert_close( RESHAPE(h, [4]), RESHAPE(h_save, [4]))
CALL test%assert_close( RESHAPE(s, [4]), RESHAPE(s_save, [4]))
!
v = (0.d0, 0.d0)
e = 0.d0
CALL diaghg( 2, 2, h, s, 2, e, v, .true. )
!
CALL test%assert_close( e, [0.1715728752538099d0, 5.82842712474619d0] )
CALL test%assert_close( v(:,1), [( 0.d0, -0.9238795325112867d0), (-0.3826834323650898d0, 0.d0)] )
CALL test%assert_close( v(:,2), [( 0.d0, -0.3826834323650898d0), ( 0.9238795325112867d0, 0.d0)] )
CALL test%assert_close( RESHAPE(h, [4]), RESHAPE(h_save, [4]))
CALL test%assert_close( RESHAPE(s, [4]), RESHAPE(s_save, [4]))
!
END SUBROUTINE complex_1
end program test_diaghg


@ -0,0 +1,110 @@
program test_diaghg_2
#if defined(__MPI)
USE MPI
#endif
USE mp, ONLY : mp_bcast
USE mp_world, ONLY : mp_world_start, mp_world_end, mpime, &
root, nproc, world_comm
USE mp_bands_util, ONLY : me_bgrp, root_bgrp, intra_bgrp_comm
USE tester
IMPLICIT NONE
!
TYPE(tester_t) :: test
INTEGER :: world_group = 0
!
CALL test%init()
#if defined(__MPI)
world_group = MPI_COMM_WORLD
#endif
CALL mp_world_start(world_group)
!
me_bgrp = mpime; root_bgrp=root; intra_bgrp_comm=world_comm
!
CALL complex_1(test)
!
CALL collect_results(test)
!
CALL mp_world_end()
!
IF (mpime .eq. 0) CALL test%print()
!
CONTAINS
!
SUBROUTINE complex_1(test)
USE LAXlib
USE la_param, ONLY : DP
implicit none
!
TYPE(tester_t) :: test
!
integer, parameter :: m_size=1024
complex(DP) :: h(m_size,m_size)
complex(DP) :: h_save(m_size,m_size)
complex(DP) :: s(m_size,m_size)
complex(DP) :: s_save(m_size,m_size)
real(DP) :: e(m_size)
complex(DP) :: v(m_size,m_size)
real(DP) :: e_save(m_size)
complex(DP) :: v_save(m_size,m_size)
!
CALL hermitian(m_size, h)
CALL hermitian(m_size, s)
!
h_save = h
s_save = s
!
v = (0.d0, 0.d0)
e = 0.d0
CALL diaghg( m_size, m_size, h, s, m_size, e, v, .false. )
!
CALL test%assert_close( RESHAPE(h, [m_size*m_size]), RESHAPE(h_save, [m_size*m_size]))
CALL test%assert_close( RESHAPE(s, [m_size*m_size]), RESHAPE(s_save, [m_size*m_size]))
!
e_save = e
v_save = v
!
v = (0.d0, 0.d0)
e = 0.d0
CALL diaghg( m_size, m_size, h, s, m_size, e, v, .true. )
!
CALL test%assert_close( RESHAPE(h, [m_size*m_size]), RESHAPE(h_save, [m_size*m_size]))
CALL test%assert_close( RESHAPE(s, [m_size*m_size]), RESHAPE(s_save, [m_size*m_size]))
test%tolerance32=1.d-5
test%tolerance64=1.d-14
CALL test%assert_close( e, e_save)
!
END SUBROUTINE complex_1
!
SUBROUTINE hermitian(mSize, M)
USE la_param, ONLY : DP
IMPLICIT NONE
integer, intent(in) :: msize
complex(dp), intent(out) :: M(:,:)
!
real(dp), allocatable :: rnd(:)
complex(dp), allocatable :: tmp(:,:)
INTEGER :: h, k, j
!
ALLOCATE(rnd(mSize*(mSize+1)))
CALL RANDOM_NUMBER(rnd)
rnd = 1.d0*rnd - 5.d-1
!
M = (0.d0, 0.d0)
j = 1
DO k=1,mSize
DO h=1,mSize
IF(h>k) THEN
M(h,k) = CMPLX(rnd(j), rnd(j+1), kind=DP) ! kind=DP avoids silent loss of precision
M(k,h) = CONJG(M(h,k))
j=j+2;
ELSE IF(k == h) THEN
M(k,h) = CMPLX(mSize, 0.d0, kind=DP)
END IF
END DO
END DO
!
END SUBROUTINE hermitian
end program test_diaghg_2


@ -0,0 +1,174 @@
program test_diaghg_3
#if defined(__MPI)
USE MPI
#endif
USE mp, ONLY : mp_bcast
USE mp_world, ONLY : mp_world_start, mp_world_end, mpime, &
root, nproc, world_comm
USE mp_bands_util, ONLY : me_bgrp, root_bgrp, intra_bgrp_comm
USE tester
IMPLICIT NONE
!
TYPE(tester_t) :: test
INTEGER :: world_group = 0
!
CALL test%init()
test%tolerance64=1.d-8
!
#if defined(__MPI)
world_group = MPI_COMM_WORLD
#endif
CALL mp_world_start(world_group)
!
me_bgrp = mpime; root_bgrp=root; intra_bgrp_comm=world_comm
!
CALL real_1(test)
!
CALL complex_1(test)
!
CALL collect_results(test)
!
CALL mp_world_end()
!
IF (mpime .eq. 0) CALL test%print()
!
CONTAINS
!
SUBROUTINE real_1(test)
USE mp_world, ONLY : mpime
USE LAXlib
USE la_param, ONLY : DP
USE test_io
implicit none
!
TYPE(tester_t) :: test
!
integer :: ldh, n, m
real(DP), allocatable :: h(:,:)
real(DP), allocatable :: h_save(:,:)
real(DP), allocatable :: s(:,:)
real(DP), allocatable :: s_save(:,:)
real(DP), allocatable :: e(:)
real(DP), allocatable :: v(:,:)
real(DP), allocatable :: e_save(:)
real(DP), allocatable :: v_save(:,:)
!
character(len=20) :: inputs(2)
integer :: i, j, info
!
inputs = ["ZnOG1.bin", "ZnOG2.bin"]
!
DO i=1, SIZE(inputs)
!
CALL read_problem(inputs(i), ldh, n, m, h, s, e, v, info)
!
IF (info /= 0) THEN
IF (mpime == 0) print *, "Test with ", inputs(i), " skipped. Input not found."
CYCLE
END IF
!
ALLOCATE(h_save, SOURCE=h)
ALLOCATE(s_save, SOURCE=s)
ALLOCATE(e_save, SOURCE=e)
ALLOCATE(v_save, SOURCE=v)
!
h_save = h
s_save = s
!
v = (0.d0, 0.d0)
e = 0.d0
CALL diaghg( n, m, h, s, ldh, e, v, .false. )
!
test%tolerance64=1.d-6 ! check this
DO j = 1, m
CALL test%assert_close( v(1:n, j), v_save(1:n, j))
END DO
test%tolerance64=1.d-8 ! check this
CALL test%assert_close( e(1:m), e_save(1:m) )
!
!
h = h_save
s = s_save
v = (0.d0, 0.d0)
e = 0.d0
CALL diaghg( n, m, h, s, ldh, e, v, .true. )
!
DO j = 1, m
!CALL test%assert_close( v(1:n, j), v_save(1:n, j))
END DO
CALL test%assert_close( e(1:m), e_save(1:m))
DEALLOCATE(h,s,e,v,h_save,s_save,e_save,v_save)
END DO
!
END SUBROUTINE real_1
!
SUBROUTINE complex_1(test)
USE mp_world, ONLY : mpime
USE LAXlib
USE la_param, ONLY : DP
USE test_io
implicit none
!
TYPE(tester_t) :: test
!
integer :: ldh, n, m
complex(DP), allocatable :: h(:,:)
complex(DP), allocatable :: h_save(:,:)
complex(DP), allocatable :: s(:,:)
complex(DP), allocatable :: s_save(:,:)
real(DP), allocatable :: e(:)
complex(DP), allocatable :: v(:,:)
real(DP), allocatable :: e_save(:)
complex(DP), allocatable :: v_save(:,:)
!
character(len=20) :: inputs(4)
integer :: i, j, info
!
inputs = ["ZnOK1.bin ", &
"ZnOK2.bin ", &
"SiGeK1.bin", &
"SiGeK2.bin"]
!
DO i=1, SIZE(inputs)
!
CALL read_problem(inputs(i), ldh, n, m, h, s, e, v, info)
!
IF (info /= 0) THEN
IF (mpime == 0) print *, "Test with ", inputs(i), " skipped. Input not found."
CYCLE
END IF
!
ALLOCATE(h_save, SOURCE=h)
ALLOCATE(s_save, SOURCE=s)
ALLOCATE(e_save, SOURCE=e)
ALLOCATE(v_save, SOURCE=v)
!
h_save = h
s_save = s
!
v = (0.d0, 0.d0)
e = 0.d0
CALL diaghg( n, m, h, s, ldh, e, v, .false. )
!
DO j = 1, m
CALL test%assert_close( v(1:n, j), v_save(1:n, j))
END DO
CALL test%assert_close( e(1:m), e_save(1:m) )
!
!
h = h_save
s = s_save
v = (0.d0, 0.d0)
e = 0.d0
CALL diaghg( n, m, h, s, ldh, e, v, .true. )
!
DO j = 1, m
!CALL test%assert_close( v(1:n, j), v_save(1:n, j))
END DO
CALL test%assert_close( e(1:m), e_save(1:m))
DEALLOCATE(h,s,e,v,h_save,s_save,e_save,v_save)
END DO
!
END SUBROUTINE complex_1
end program test_diaghg_3


@ -0,0 +1,312 @@
! This test uses the internal parallel diagonalization algorithm of LAXlib
! to solve the problems stored in binary files:
!
! - ZnOG1.bin
! - ZnOG2.bin
! - ZnOK1.bin
! - ZnOK2.bin
! - SiGeK1.bin
! - SiGeK2.bin
!
! If the ScaLAPACK or ELPA driver is used, the test is skipped.
!
#if ! defined(__SCALAPACK)
program test_diaghg_4
#if defined(__MPI)
USE MPI
#endif
USE mp, ONLY : mp_bcast
USE mp_world, ONLY : mp_world_start, mp_world_end, mpime, &
root, nproc, world_comm
USE mp_bands_util, ONLY : me_bgrp, root_bgrp, intra_bgrp_comm
USE tester
IMPLICIT NONE
!
TYPE(tester_t) :: test
INTEGER :: world_group = 0
!
CALL test%init()
test%tolerance64=1.d-8
!
#if defined(__MPI)
world_group = MPI_COMM_WORLD
#endif
CALL mp_world_start(world_group)
!
me_bgrp = mpime; root_bgrp=root; intra_bgrp_comm=world_comm
!
CALL parallel_real_1(test)
!
CALL parallel_complex_1(test)
!
CALL collect_results(test)
!
CALL mp_world_end()
!
IF (mpime .eq. 0) CALL test%print()
!
CONTAINS
!
SUBROUTINE parallel_real_1(test)
USE mp_world, ONLY : mpime
USE LAXlib
USE descriptors, ONLY : la_descriptor, descla_init, descla_local_dims
USE la_param, ONLY : DP
USE test_io
implicit none
!
TYPE(tester_t) :: test
!
TYPE(la_descriptor) :: desc
integer :: ldh, n, m
real(DP), allocatable :: h(:,:), hdst(:,:) !< full and distributed Hpsi
real(DP), allocatable :: h_save(:,:) !< full Hpsi, used to check consistency across calls
real(DP), allocatable :: s(:,:), sdst(:,:) !< full and distributed Spsi
real(DP), allocatable :: s_save(:,:) !< full Spsi, used to check consistency across calls
real(DP), allocatable :: e(:) !< full set of eigenvalues
real(DP), allocatable :: v(:,:), vdst(:,:) !< full and distributed eigenvectors
real(DP), allocatable :: e_save(:) !< full set of eigenvalues, used for checks
real(DP), allocatable :: v_save(:,:) !< full set of eigenvectors, used for checks
!
character(len=20) :: inputs(2)
integer :: l, i, j, ii, jj, info, nrdst
logical :: la_proc
!
inputs = ["ZnOG1.bin", "ZnOG2.bin"]
!
DO l=1, SIZE(inputs)
!
CALL read_problem(inputs(l), ldh, n, m, h, s, e, v, info)
!
IF (info /= 0) THEN
IF (mpime == 0) print *, "Test with ", inputs(l), " skipped. Input not found."
CYCLE
END IF
!
ALLOCATE(h_save, SOURCE=h)
ALLOCATE(s_save, SOURCE=s)
ALLOCATE(e_save, SOURCE=e)
ALLOCATE(v_save, SOURCE=v)
!
h_save = h
s_save = s
!
CALL init_parallel_diag(desc, n)
!
la_proc = ( desc%active_node > 0 ) ! initialize in both cases, not only when active
nrdst = desc%nrcx
IF (.not. la_proc) nrdst = 1
!
v = (0.d0, 0.d0)
e = 0.d0
print *, nrdst, n, m
ALLOCATE( hdst( nrdst , nrdst ), STAT=info )
ALLOCATE( sdst( nrdst , nrdst ), STAT=info )
ALLOCATE( vdst( nrdst , nrdst ), STAT=info )
!
IF (la_proc) THEN
DO j = 1, desc%nc ! number of columns in the local block
DO i = 1, desc%nr ! number of rows in the local block
ii = i + desc%ir - 1 ! global row index of local row i
jj = j + desc%ic - 1 ! global column index of local column j
hdst(i, j) = h(ii, jj)
sdst(i, j) = s(ii, jj)
END DO
END DO
END IF
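! Illustrative example of the mapping above (assumed values, not from the
! inputs): with n=1000 on a 2x2 ortho grid, nrcx=500, and the task in grid
! row 2, column 1 has desc%ir=501 and desc%ic=1, i.e. it stores the global
! block h(501:1000, 1:500) in hdst(1:500, 1:500).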
!
CALL pdiaghg( n, hdst, sdst, nrdst, e, vdst, desc, .false. )
!
DO j = 1, m
!CALL test%assert_close( v(1:n, j), v_save(1:n, j))
END DO
CALL test%assert_close( e(1:m), e_save(1:m) )
!
!
v = (0.d0, 0.d0)
e = 0.d0
CALL pdiaghg( n, hdst, sdst, nrdst, e, vdst, desc, .true. )
!
DO j = 1, m
!CALL test%assert_close( v(1:n, j), v_save(1:n, j))
END DO
CALL test%assert_close( e(1:m), e_save(1:m))
!
DEALLOCATE(h,s,e,v,h_save,s_save,e_save,v_save, hdst, sdst, vdst)
END DO
!
END SUBROUTINE parallel_real_1
!
SUBROUTINE parallel_complex_1(test)
USE mp_world, ONLY : mpime
USE descriptors, ONLY : la_descriptor, descla_init, descla_local_dims
USE LAXlib
USE la_param, ONLY : DP
USE test_io
implicit none
!
TYPE(tester_t) :: test
!
integer :: ldh, n, m
complex(DP), allocatable :: h(:,:), hdst(:,:) !< full and distributed Hpsi
complex(DP), allocatable :: h_save(:,:) !< full Hpsi, used to check consistency across calls
complex(DP), allocatable :: s(:,:), sdst(:,:) !< full and distributed Spsi
complex(DP), allocatable :: s_save(:,:) !< full Spsi, used to check consistency across calls
real(DP), allocatable :: e(:) !< full set of eigenvalues
complex(DP), allocatable :: v(:,:), vdst(:,:) !< full and distributed eigenvectors
real(DP), allocatable :: e_save(:) !< full set of eigenvalues, used for checks
complex(DP), allocatable :: v_save(:,:) !< full set of eigenvectors, used for checks
TYPE(la_descriptor) :: desc
!
character(len=20) :: inputs(4)
integer :: l, i, j, ii, jj, info, nrdst
logical :: la_proc
!
inputs = ["ZnOK1.bin ", &
"ZnOK2.bin ", &
"SiGeK1.bin", &
"SiGeK2.bin"]
!
DO l=1, SIZE(inputs)
!
CALL read_problem(inputs(l), ldh, n, m, h, s, e, v, info)
!
IF (info /= 0) THEN
IF (mpime == 0) print *, "Test with ", inputs(l), " skipped. Input not found."
CYCLE
END IF
!
ALLOCATE(h_save, SOURCE=h)
ALLOCATE(s_save, SOURCE=s)
ALLOCATE(e_save, SOURCE=e)
ALLOCATE(v_save, SOURCE=v)
!
h_save = h
s_save = s
!
CALL init_parallel_diag(desc, n)
!
la_proc = ( desc%active_node > 0 ) ! initialize in both cases, not only when active
nrdst = desc%nrcx
IF (.not. la_proc) nrdst = 1
!
v = (0.d0, 0.d0)
e = 0.d0
print *, nrdst, n, m
ALLOCATE( hdst( nrdst , nrdst ), STAT=info )
ALLOCATE( sdst( nrdst , nrdst ), STAT=info )
ALLOCATE( vdst( nrdst , nrdst ), STAT=info )
!
IF (la_proc) THEN
DO j = 1, desc%nc ! number of columns in the local block
DO i = 1, desc%nr ! number of rows in the local block
ii = i + desc%ir - 1 ! global row index of local row i
jj = j + desc%ic - 1 ! global column index of local column j
hdst(i, j) = h(ii, jj)
sdst(i, j) = s(ii, jj)
END DO
END DO
END IF
!
CALL pdiaghg( n, hdst, sdst, nrdst, e, vdst, desc, .false. )
!
DO j = 1, m
!CALL test%assert_close( v(1:n, j), v_save(1:n, j))
END DO
CALL test%assert_close( e(1:m), e_save(1:m) )
!
!
v = (0.d0, 0.d0)
e = 0.d0
CALL pdiaghg( n, hdst, sdst, nrdst, e, vdst, desc, .true. )
!
DO j = 1, m
!CALL test%assert_close( v(1:n, j), v_save(1:n, j))
END DO
CALL test%assert_close( e(1:m), e_save(1:m))
!
DEALLOCATE(h,s,e,v,h_save,s_save,e_save,v_save, hdst, sdst, vdst)
END DO
!
END SUBROUTINE parallel_complex_1
!
SUBROUTINE init_parallel_diag(desc, n)
USE mp_world, ONLY : mpime, nproc, world_comm
USE mp_diag, ONLY : ortho_parent_comm
USE descriptors, ONLY : la_descriptor, descla_init, descla_local_dims
USE LAXlib
USE la_param, ONLY : DP
implicit none
!
TYPE(la_descriptor) :: desc
INTEGER :: n ! global dimension of the matrix
!
INTEGER :: ierr = 0
INTEGER :: color, key
!
INTEGER :: np_ortho(2) = 1 ! size of the processor grid used in ortho
INTEGER :: me_ortho(2) = 0 ! coordinates of the processors
INTEGER :: me_ortho1 = 0 ! task id for the ortho group
INTEGER :: nproc_ortho = 1 ! size of the ortho group:
INTEGER :: ortho_comm = 0 ! communicator for the ortho group
INTEGER :: ortho_row_comm = 0 ! communicator for the ortho row group
INTEGER :: ortho_col_comm = 0 ! communicator for the ortho col group
INTEGER :: ortho_comm_id = 0 ! id of the ortho_comm
!
ortho_parent_comm = world_comm
!
#if defined __MPI
!
CALL grid2d_dims( 'S', nproc, np_ortho(1), np_ortho(2) )
!
nproc_ortho = np_ortho(1) * np_ortho(2)
!
! here we choose the first "nproc_ortho" processors
!
color = 0
IF( mpime < nproc_ortho ) color = 1
!
key = mpime
!
! initialize the communicator for the new group by splitting the input
! communicator
!
CALL mpi_comm_split( MPI_COMM_WORLD , color, key, ortho_comm, ierr )
!
! Compute the coordinates of the processors, in row-major order
!
CALL mpi_comm_rank( ortho_comm, me_ortho1, ierr)
!
IF( mpime == 0 .AND. me_ortho1 /= 0 ) &
CALL lax_error__( " init_ortho_group ", " wrong root task in ortho group ", ierr )
!
if( color == 1 ) then
! this task belongs to the ortho group: compute its coordinates
ortho_comm_id = 1
CALL GRID2D_COORDS( 'R', me_ortho1, np_ortho(1), np_ortho(2), me_ortho(1), me_ortho(2) )
CALL GRID2D_RANK( 'R', np_ortho(1), np_ortho(2), me_ortho(1), me_ortho(2), ierr )
IF( ierr /= me_ortho1 ) &
CALL lax_error__( " init_ortho_group ", " wrong task coordinates in ortho group ", ierr )
IF( me_ortho1 /= mpime ) &
CALL lax_error__( " init_ortho_group ", " wrong rank assignment in ortho group ", ierr )
CALL mpi_comm_split( ortho_comm , me_ortho(2), me_ortho(1), ortho_col_comm, ierr )
CALL mpi_comm_split( ortho_comm , me_ortho(1), me_ortho(2), ortho_row_comm, ierr )
else
! this task does NOT belong to the ortho group: set dummy values
ortho_comm_id = 0
me_ortho(1) = me_ortho1
me_ortho(2) = me_ortho1
endif
#else
ortho_comm_id = 1
#endif
CALL descla_init( desc, n, n, np_ortho, me_ortho, ortho_comm, -1, ortho_comm_id )
END SUBROUTINE init_parallel_diag
end program test_diaghg_4
#else
program test_diaghg_4
end program test_diaghg_4
#endif


@ -0,0 +1,175 @@
#if defined(__CUDA)
program test_diaghg_gpu
#if defined(__MPI)
USE MPI
#endif
USE mp, ONLY : mp_bcast
USE mp_world, ONLY : mp_world_start, mp_world_end, mpime, &
root, nproc, world_comm
USE mp_bands_util, ONLY : me_bgrp, root_bgrp, intra_bgrp_comm
USE tester
IMPLICIT NONE
!
TYPE(tester_t) :: test
INTEGER :: world_group = 0
!
CALL test%init()
#if defined(__MPI)
world_group = MPI_COMM_WORLD
#endif
CALL mp_world_start(world_group)
!
me_bgrp = mpime; root_bgrp=root; intra_bgrp_comm=world_comm
!
CALL real_1(test)
!
CALL complex_1(test)
!
CALL collect_results(test)
!
CALL mp_world_end()
!
IF (mpime .eq. 0) CALL test%print()
!
CONTAINS
!
SUBROUTINE real_1(test)
USE LAXlib
USE la_param, ONLY : DP
USE cudafor
implicit none
!
TYPE(tester_t) :: test
! variables on device
real(DP), device :: h_d(2,2)
real(DP), device :: s_d(2,2)
real(DP), device :: e_d(2)
real(DP), device :: v_d(2,2)
! variables on host
real(DP) :: h_h(2,2)
real(DP) :: s_h(2,2)
real(DP) :: e_h(2)
real(DP) :: v_h(2,2)
h_h = 0.d0
h_h(1,1) = 1.d0
h_h(2,2) = 1.d0
h_d = h_h
s_h = 0.d0
s_h(1,1) = 1.d0
s_h(2,2) = 1.d0
s_d = s_h
!
v_d = 0.d0
e_d = 0.d0
CALL diaghg( 2, 2, h_d, s_d, 2, e_d, v_d, .false. )
v_h = v_d
e_h = e_d
h_h = h_d
s_h = s_d
!
CALL test%assert_close( e_h, [1.d0, 1.d0] )
CALL test%assert_close( RESHAPE(v_h, [4]), [1.d0, 0.d0, 0.d0, 1.d0] )
CALL test%assert_close( RESHAPE(h_h, [4]), [1.d0, 0.d0, 0.d0, 1.d0] )
CALL test%assert_close( RESHAPE(s_h, [4]), [1.d0, 0.d0, 0.d0, 1.d0] )
!
v_d = 0.d0
e_d = 0.d0
CALL diaghg( 2, 2, h_d, s_d, 2, e_d, v_d, .true. )
v_h = v_d
e_h = e_d
h_h = h_d
s_h = s_d
!
CALL test%assert_close( e_h, [1.d0, 1.d0] )
CALL test%assert_close( RESHAPE(v_h, [4]), [1.d0, 0.d0, 0.d0, 1.d0] )
CALL test%assert_close( RESHAPE(h_h, [4]), [1.d0, 0.d0, 0.d0, 1.d0] )
CALL test%assert_close( RESHAPE(s_h, [4]), [1.d0, 0.d0, 0.d0, 1.d0] )
!
END SUBROUTINE real_1
!
SUBROUTINE complex_1(test)
USE LAXlib
USE la_param, ONLY : DP
USE cudafor
implicit none
!
TYPE(tester_t) :: test
! variables on device
complex(DP), device :: h_d(2,2)
complex(DP), device :: s_d(2,2)
complex(DP), device :: v_d(2,2)
real(DP), device :: e_d(2)
! variables on host
complex(DP) :: h_h(2,2)
complex(DP) :: s_h(2,2)
complex(DP) :: v_h(2,2)
real(DP) :: e_h(2)
complex(DP) :: s_save(2,2)
complex(DP) :: h_save(2,2)
!
h_h = 0.d0
h_h(1,1) = (1.d0, 0.d0)
h_h(1,2) = (0.d0, -2.d0)
h_h(2,1) = ( 0.d0, 2.d0)
h_h(2,2) = ( 5.d0, 0.d0)
s_h = 0.d0
s_h(1,1) = (1.d0, 0.d0)
s_h(2,2) = (1.d0, 0.d0)
!
! save for later comparison
h_save = h_h
s_save = s_h
!
! Update device
h_d = h_h
s_d = s_h
!
v_h = (0.d0, 0.d0)
e_h = 0.d0
v_d = v_h; e_d = e_h
!
CALL diaghg( 2, 2, h_d, s_d, 2, e_d, v_d, .false. )
v_h = v_d
e_h = e_d
h_h = h_d
s_h = s_d
!
! 0.1715728752538099, 5.82842712474619
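! (quick check: trace = 6 and det = 1*5 - (-2i)*(2i) = 1, hence
!  lambda = 3 -+ sqrt(3**2 - 1) = 3 -+ 2*sqrt(2), matching the values above)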
CALL test%assert_close( e_h, [0.1715728752538099d0, 5.82842712474619d0] )
CALL test%assert_close( v_h(:,1), [( 0.d0, -0.9238795325112867d0), (-0.3826834323650898d0, 0.d0)] )
CALL test%assert_close( v_h(:,2), [( 0.d0, -0.3826834323650898d0), ( 0.9238795325112867d0, 0.d0)] )
CALL test%assert_close( RESHAPE(h_h, [4]), RESHAPE(h_save, [4]))
CALL test%assert_close( RESHAPE(s_h, [4]), RESHAPE(s_save, [4]))
!
v_h = (0.d0, 0.d0)
e_h = 0.d0
!
! Update device
h_d = h_h
s_d = s_h
v_d = v_h; e_d = e_h
CALL diaghg( 2, 2, h_d, s_d, 2, e_d, v_d, .true. )
v_h = v_d
e_h = e_d
h_h = h_d
s_h = s_d
!
CALL test%assert_close( e_h, [0.1715728752538099d0, 5.82842712474619d0] )
CALL test%assert_close( v_h(:,1), [( 0.d0, -0.9238795325112867d0), (-0.3826834323650898d0, 0.d0)] )
CALL test%assert_close( v_h(:,2), [( 0.d0, -0.3826834323650898d0), ( 0.9238795325112867d0, 0.d0)] )
CALL test%assert_close( RESHAPE(h_h, [4]), RESHAPE(h_save, [4]))
CALL test%assert_close( RESHAPE(s_h, [4]), RESHAPE(s_save, [4]))
!
END SUBROUTINE complex_1
end program test_diaghg_gpu
#else
program test_diaghg_gpu
end program test_diaghg_gpu
#endif


@ -0,0 +1,151 @@
#if defined(__CUDA)
program test_diaghg_gpu_2
#if defined(__MPI)
USE MPI
#endif
USE mp, ONLY : mp_bcast
USE mp_world, ONLY : mp_world_start, mp_world_end, mpime, &
root, nproc, world_comm
USE mp_bands_util, ONLY : me_bgrp, root_bgrp, intra_bgrp_comm
USE tester
IMPLICIT NONE
!
TYPE(tester_t) :: test
INTEGER :: world_group = 0
!
CALL test%init()
#if defined(__MPI)
world_group = MPI_COMM_WORLD
#endif
CALL mp_world_start(world_group)
!
me_bgrp = mpime; root_bgrp=root; intra_bgrp_comm=world_comm
!
CALL complex_1(test)
!
CALL collect_results(test)
!
CALL mp_world_end()
!
IF (mpime .eq. 0) CALL test%print()
!
CONTAINS
!
SUBROUTINE complex_1(test)
USE LAXlib
USE la_param, ONLY : DP
USE cudafor
implicit none
!
TYPE(tester_t) :: test
!
integer, parameter :: m_size=1024
integer :: i
complex(DP) :: h(m_size,m_size)
complex(DP), DEVICE :: h_d(m_size,m_size)
complex(DP) :: h_save(m_size,m_size)
complex(DP) :: s(m_size,m_size)
complex(DP), DEVICE :: s_d(m_size,m_size)
complex(DP) :: s_save(m_size,m_size)
real(DP) :: e(m_size), e_(m_size)
real(DP), DEVICE :: e_d(m_size)
complex(DP) :: v(m_size,m_size)
complex(DP), DEVICE :: v_d(m_size,m_size)
real(DP) :: e_save(m_size)
complex(DP) :: v_save(m_size,m_size)
!
CALL hermitian(m_size, h)
CALL hermitian(m_size, s)
!
h_save = h
s_save = s
h_d = h ! copy H and S to device
s_d = s ! <----------|
!
v = (0.d0, 0.d0)
e = 0.d0
v_d = (0.d0, 0.d0)
e_d = 0.d0
!
! 1. Compare same algorithm starting from data on device ...
CALL diaghg( m_size, m_size-1, h_d, s_d, m_size, e_d, v_d )
e_save = e_d
v_save = v_d
! 2. ... and on the host, this will trigger the same subroutine used above
CALL diaghg( m_size, m_size-1, h, s, m_size, e, v, .true. )
!
CALL test%assert_close( RESHAPE(h, [m_size*m_size]), RESHAPE(h_save, [m_size*m_size]))
CALL test%assert_close( RESHAPE(s, [m_size*m_size]), RESHAPE(s_save, [m_size*m_size]))
test%tolerance32=1.d-5
test%tolerance64=1.d-10
DO i=1, m_size-1
CALL test%assert_close( v(1:m_size,i), v_save(1:m_size,i))
CALL test%assert_close( e(i), e_save(i) )
END DO
!
! reset data
h = h_save
s = s_save
v = (0.d0, 0.d0)
e = 0.d0
!
! 3. repeat the same task but with CPU subroutine now.
! note that it uses a different algorithm and produces slightly
! different eigenvectors.
!
CALL diaghg( m_size, m_size-1, h, s, m_size, e, v )
h = h_save; s = s_save
!
! Solve again with the same algorithm used in the GPU version;
! this is needed to compare eigenvectors.
CALL solve_with_zhegvd(m_size, h, s, m_size, e_)
!
test%tolerance32=1.d-5
test%tolerance64=1.d-10
DO i=1, m_size-1
! compare eigenvectors obtained in 1. with LAPACK zhegvd
CALL test%assert_close( v_save(1:m_size,i), h(1:m_size,i) )
! compare eigenvalues obtained with zhegvd, 1. and 3.
CALL test%assert_close( e(i), e_save(i) )
CALL test%assert_close( e_(i), e_save(i) )
END DO
!
END SUBROUTINE complex_1
!
SUBROUTINE hermitian(mSize, M)
USE la_param, ONLY : DP
IMPLICIT NONE
integer, intent(in) :: msize
complex(dp), intent(out) :: M(:,:)
!
real(dp), allocatable :: rnd(:)
complex(dp), allocatable :: tmp(:,:)
INTEGER :: h, k, j
!
ALLOCATE(rnd(mSize*(mSize+1)))
CALL RANDOM_NUMBER(rnd)
rnd = 1.d0*rnd - 5.d-1
!
M = (0.d0, 0.d0)
j = 1
DO k=1,mSize
DO h=1,mSize
IF(h>k) THEN
M(h,k) = CMPLX(rnd(j), rnd(j+1), kind=DP) ! kind=DP avoids silent loss of precision
M(k,h) = CONJG(M(h,k))
j=j+2;
ELSE IF(k == h) THEN
M(k,h) = CMPLX(mSize, 0.d0, kind=DP)
END IF
END DO
END DO
!
END SUBROUTINE hermitian
!
end program test_diaghg_gpu_2
#else
program test_diaghg_gpu_2
end program test_diaghg_gpu_2
#endif


@ -0,0 +1,283 @@
#ifdef __CUDA
program test_diaghg_gpu_3
#if defined(__MPI)
USE MPI
#endif
USE mp, ONLY : mp_bcast
USE mp_world, ONLY : mp_world_start, mp_world_end, mpime, &
root, nproc, world_comm
USE mp_bands_util, ONLY : me_bgrp, root_bgrp, intra_bgrp_comm
USE tester
IMPLICIT NONE
!
TYPE(tester_t) :: test
INTEGER :: world_group = 0
!
CALL test%init()
test%tolerance64=1.d-8
!
#if defined(__MPI)
world_group = MPI_COMM_WORLD
#endif
CALL mp_world_start(world_group)
!
me_bgrp = mpime; root_bgrp=root; intra_bgrp_comm=world_comm
!
CALL real_1(test)
!
CALL complex_1(test)
!
CALL collect_results(test)
!
CALL mp_world_end()
!
IF (mpime .eq. 0) CALL test%print()
!
CONTAINS
!
SUBROUTINE real_1(test)
USE mp_world, ONLY : mpime
USE LAXlib
USE la_param, ONLY : DP
USE test_io
implicit none
!
TYPE(tester_t) :: test
!
integer :: ldh, n, m
real(DP), allocatable :: h(:,:)
real(DP), allocatable :: h_save(:,:)
real(DP), allocatable :: s(:,:)
real(DP), allocatable :: s_save(:,:)
real(DP), allocatable :: e(:)
real(DP), allocatable :: v(:,:)
real(DP), allocatable :: e_save(:)
real(DP), allocatable :: v_save(:,:)
real(DP), allocatable :: e_dc(:) ! stores results from divide and conquer
real(DP), allocatable :: v_dc(:,:) !
! device copies
real(DP), allocatable, device :: h_d(:,:)
real(DP), allocatable, device :: s_d(:,:)
real(DP), allocatable, device :: e_d(:)
real(DP), allocatable, device :: v_d(:,:)
!
!
character(len=20) :: inputs(2)
integer :: i, j, info
!
inputs = ["ZnOG1.bin", "ZnOG2.bin"]
!
DO i=1, SIZE(inputs)
!
CALL read_problem(inputs(i), ldh, n, m, h, s, e, v, info)
!
IF (info /= 0) THEN
IF (mpime == 0) print *, "Test with ", inputs(i), " skipped. Input not found."
CYCLE
END IF
!
ALLOCATE(h_save, SOURCE=h)
ALLOCATE(s_save, SOURCE=s)
ALLOCATE(e_save, SOURCE=e)
ALLOCATE(v_save, SOURCE=v)
!
h_save = h
s_save = s
!
v = (0.d0, 0.d0)
e = 0.d0
CALL diaghg( n, m, h, s, ldh, e, v, .false. )
!
test%tolerance64=1.d-6 ! <- check this
DO j = 1, m
CALL test%assert_close( v(1:n, j), v_save(1:n, j))
END DO
test%tolerance64=1.d-8 ! <- check this
CALL test%assert_close( e(1:m), e_save(1:m) )
!
!
v = (0.d0, 0.d0)
e = 0.d0
CALL diaghg( n, m, h, s, ldh, e, v, .true. )
!
ALLOCATE(v_dc, SOURCE=h_save)
ALLOCATE(e_dc(n))
s = s_save
CALL solve_with_dsygvd(n, v_dc, s, ldh, e_dc)
s = s_save
!
DO j = 1, m
CALL test%assert_close( v(1:n, j), v_dc(1:n, j))
END DO
CALL test%assert_close( e(1:m), e_save(1:m))
!
! GPU data & subroutines
!
v = (0.d0, 0.d0)
e = 0.d0
s = s_save
h = h_save
ALLOCATE(e_d, SOURCE=e); ALLOCATE(v_d, SOURCE=v)
ALLOCATE(h_d, SOURCE=h); ALLOCATE(s_d, SOURCE=s)
!
CALL diaghg( n, m, h_d, s_d, ldh, e_d, v_d, .false. )
!
v(1:n, 1:m) = v_d(1:n, 1:m)
e(1:m) = e_d(1:m)
DO j = 1, m
CALL test%assert_close( v(1:n, j), v_dc(1:n, j))
END DO
CALL test%assert_close( e(1:m), e_save(1:m) )
!
!
v_d = (0.d0, 0.d0)
e_d = 0.d0
s_d = s_save
h_d = h_save
!
! Start from data on the GPU and diagonalize on the CPU
CALL diaghg( n, m, h_d, s_d, ldh, e_d, v_d, .true. )
!
v(1:n, 1:m) = v_d(1:n, 1:m)
e(1:m) = e_d(1:m)
!
test%tolerance64=1.d-6 ! <- check this
DO j = 1, m
CALL test%assert_close( v(1:n, j), v_save(1:n, j))
END DO
test%tolerance64=1.d-8 ! <- check this
!
CALL test%assert_close( e(1:m), e_save(1:m))
!
DEALLOCATE(h_d, s_d, e_d, v_d)
DEALLOCATE(h,s,e,v,h_save,s_save,e_save,v_save, v_dc, e_dc)
END DO
!
END SUBROUTINE real_1
!
SUBROUTINE complex_1(test)
USE mp_world, ONLY : mpime
USE LAXlib
USE la_param, ONLY : DP
USE test_io
implicit none
!
TYPE(tester_t) :: test
!
integer :: ldh, n, m
complex(DP), allocatable :: h(:,:)
complex(DP), allocatable :: h_save(:,:)
complex(DP), allocatable :: s(:,:)
complex(DP), allocatable :: s_save(:,:)
real(DP), allocatable :: e(:)
complex(DP), allocatable :: v(:,:)
real(DP), allocatable :: e_save(:)
complex(DP), allocatable :: v_save(:,:)
real(DP), allocatable :: e_dc(:) ! stores results from divide and conquer
complex(DP), allocatable :: v_dc(:,:) !
! device copies
complex(DP), allocatable, device :: h_d(:,:)
complex(DP), allocatable, device :: s_d(:,:)
real(DP), allocatable, device :: e_d(:)
complex(DP), allocatable, device :: v_d(:,:)
!
!
character(len=20) :: inputs(4)
integer :: i, j, info
!
inputs = ["ZnOK1.bin ", &
"ZnOK2.bin ", &
"SiGeK1.bin", &
"SiGeK2.bin"]
!
DO i=1, SIZE(inputs)
!
CALL read_problem(inputs(i), ldh, n, m, h, s, e, v, info)
!
IF (info /= 0) THEN
IF (mpime == 0) print *, "Test with ", inputs(i), " skipped. Input not found."
CYCLE
END IF
!
ALLOCATE(h_save, SOURCE=h)
ALLOCATE(s_save, SOURCE=s)
ALLOCATE(e_save, SOURCE=e)
ALLOCATE(v_save, SOURCE=v)
ALLOCATE(e_dc(n))
ALLOCATE(v_dc, SOURCE=h)
!
h_save = h
s_save = s
!
! == Check CPU interface without and with offloading ==
!
v = (0.d0, 0.d0)
e = 0.d0
CALL diaghg( n, m, h, s, ldh, e, v, .false. )
!
DO j = 1, m
CALL test%assert_close( v(1:n, j), v_save(1:n, j) )
END DO
CALL test%assert_close( e(1:m), e_save(1:m) )
!
!
h = h_save; s = s_save
v = (0.d0, 0.d0)
e = 0.d0
CALL diaghg( n, m, h, s, ldh, e, v, .true. )
!
! N.B.: GPU eigensolver uses a different algorithm: zhegvd
s = s_save; e_dc = 0.d0
CALL solve_with_zhegvd(n, v_dc, s, ldh, e_dc)
!
DO j = 1, m
CALL test%assert_close( v(1:n, j), v_dc(1:n, j))
END DO
CALL test%assert_close( e(1:m), e_save(1:m))
!
! GPU data & subroutines
!
v = (0.d0, 0.d0)
e = 0.d0
s = s_save
h = h_save
ALLOCATE(e_d, SOURCE=e); ALLOCATE(v_d, SOURCE=v)
ALLOCATE(h_d, SOURCE=h); ALLOCATE(s_d, SOURCE=s)
!
CALL diaghg( n, m, h_d, s_d, ldh, e_d, v_d, .false. )
!
v(1:n, 1:m) = v_d(1:n, 1:m)
e(1:m) = e_d(1:m)
DO j = 1, m
!CALL test%assert_close( v(1:n, j), v_save(1:n, j))
END DO
CALL test%assert_close( e(1:m), e_save(1:m) )
!
!
v_d = (0.d0, 0.d0)
e_d = 0.d0
s_d = s_save
h_d = h_save
CALL diaghg( n, m, h_d, s_d, ldh, e_d, v_d, .true. )
!
v(1:n, 1:m) = v_d(1:n, 1:m)
e(1:m) = e_d(1:m)
!
DO j = 1, m
!CALL test%assert_close( v(1:n, j), v_save(1:n, j))
END DO
CALL test%assert_close( e(1:m), e_save(1:m))
!
DEALLOCATE(h_d, s_d, e_d, v_d)
DEALLOCATE(h,s,e,v,h_save,s_save,e_save,v_save)
DEALLOCATE(e_dc, v_dc)
END DO
!
END SUBROUTINE complex_1
!
end program test_diaghg_gpu_3
#else
program test_diaghg_gpu_3
end program test_diaghg_gpu_3
#endif


@ -0,0 +1,345 @@
! This test uses the internal parallel diagonalization algorithm of LAXlib
! (GPU interface) to solve the problems stored in binary files:
!
! - ZnOG1.bin
! - ZnOG2.bin
! - ZnOK1.bin
! - ZnOK2.bin
! - SiGeK1.bin
! - SiGeK2.bin
!
! If the ScaLAPACK or ELPA driver is used, the test is skipped.
!
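! For reference, each *.bin file is a sequential unformatted file whose records
! are, in order: n, m, ldh, then the arrays h(ldh,n), s(ldh,n), e(n), v(ldh,m),
! as read by read_problem in the test_io module.
!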
#if ( ! defined(__SCALAPACK) ) && defined(__CUDA)
program test_diaghg_gpu_4
#if defined(__MPI)
USE MPI
#endif
USE mp, ONLY : mp_bcast
USE mp_world, ONLY : mp_world_start, mp_world_end, mpime, &
root, nproc, world_comm
USE mp_bands_util, ONLY : me_bgrp, root_bgrp, intra_bgrp_comm
USE tester
IMPLICIT NONE
!
TYPE(tester_t) :: test
INTEGER :: world_group = 0
!
CALL test%init()
test%tolerance64=1.d-8
!
#if defined(__MPI)
world_group = MPI_COMM_WORLD
#endif
CALL mp_world_start(world_group)
!
me_bgrp = mpime; root_bgrp=root; intra_bgrp_comm=world_comm
!
CALL parallel_real_1(test)
!
CALL parallel_complex_1(test)
!
CALL collect_results(test)
!
CALL mp_world_end()
!
IF (mpime .eq. 0) CALL test%print()
!
CONTAINS
!
SUBROUTINE parallel_real_1(test)
USE cudafor
USE mp_world, ONLY : mpime
USE LAXlib
USE descriptors, ONLY : la_descriptor, descla_init, descla_local_dims
USE la_param, ONLY : DP
USE test_io
implicit none
!
TYPE(tester_t) :: test
!
TYPE(la_descriptor) :: desc
integer :: ldh, n, m
real(DP), allocatable :: h(:,:), hdst(:,:) !< full and distributed Hpsi
real(DP), allocatable, device :: hdst_d(:,:) !< distributed Hpsi on device
real(DP), allocatable :: h_save(:,:) !< full Hpsi, used to check consistence across calls
real(DP), allocatable :: s(:,:), sdst(:,:) !< full and distributed Spsi
real(DP), allocatable, device :: sdst_d(:,:) !< distributed Spsi on device
real(DP), allocatable :: s_save(:,:) !< full Spsi, used to check consistence across calls
real(DP), allocatable :: e(:) !< full set of eigenvalues
real(DP), allocatable, device :: e_d(:) !< full set of eigenvalues
real(DP), allocatable :: v(:,:), vdst(:,:) !< full and distributed eigenvectors
real(DP), allocatable, device :: vdst_d(:,:) !< full and distributed eigenvectors
real(DP), allocatable :: e_save(:) !< full set of eigenvalues, used for checks
real(DP), allocatable :: v_save(:,:) !< full set of eigenvectors, used for checks
!
character(len=20) :: inputs(2)
integer :: l, i, j, ii, jj, info, nrdst
logical :: la_proc
!
inputs = ["ZnOG1.bin", "ZnOG2.bin"]
!
DO l=1, SIZE(inputs)
!
! Read the problem from the specified input file.
! This will also allocate h, s, e and v.
CALL read_problem(inputs(l), ldh, n, m, h, s, e, v, info)
!
IF (info /= 0) THEN
IF (mpime == 0) print *, "Test with ", inputs(l), " skipped. Input not found."
CYCLE
END IF
!
ALLOCATE(h_save, SOURCE=h)
ALLOCATE(s_save, SOURCE=s)
ALLOCATE(e_save, SOURCE=e)
ALLOCATE(v_save, SOURCE=v)
!
h_save = h
s_save = s
!
CALL init_parallel_diag(desc, n)
!
la_proc = ( desc%active_node > 0 ) ! selects processors involved in the diagonalization
nrdst = desc%nrcx ! stores the square distributed matrix size
IF (.not. la_proc) nrdst = 1 ! dummy value to avoid zero-sized allocations
!
v = 0.d0
e = 0.d0
ALLOCATE( hdst( nrdst , nrdst ), STAT=info )
ALLOCATE( sdst( nrdst , nrdst ), STAT=info )
ALLOCATE( vdst( nrdst , nrdst ), STAT=info )
!
IF (la_proc) THEN
DO j = 1, desc%nc ! number of columns in the local block of lambda
DO i = 1, desc%nr ! number of rows in the local block of lambda
ii = i + desc%ir - 1 ! global index of row i (desc%ir is the global index of the first local row)
jj = j + desc%ic - 1 ! global index of column j (desc%ic is the global index of the first local column)
hdst(i, j) = h(ii, jj)
sdst(i, j) = s(ii, jj)
END DO
END DO
END IF
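!
! Illustrative example: for n = 4 on a 2x2 grid with nrcx = 2, the task whose
! block starts at (ir, ic) = (3, 1) owns global rows 3:4 and columns 1:2,
! so hdst(1,1) = h(3,1) and hdst(2,2) = h(4,2).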
!
ALLOCATE(hdst_d, SOURCE=hdst)
ALLOCATE(sdst_d, SOURCE=sdst)
ALLOCATE(vdst_d( nrdst , nrdst ), e_d(n))
e(1:n) = 0.d0
e_d(1:n) = 0.d0
!
CALL pdiaghg( n, hdst_d, sdst_d, nrdst, e_d, vdst_d, desc, .false. )
!
e(1:n) = e_d
!
DO j = 1, m
!CALL test%assert_close( v(1:n, j), v_save(1:n, j))
END DO
CALL test%assert_close( e(1:m), e_save(1:m) )
!
!
e(1:n) = 0.d0
e_d(1:n) = 0.d0
!
CALL pdiaghg( n, hdst_d, sdst_d, nrdst, e_d, vdst_d, desc, .true. )
!
e(1:n) = e_d
!
DO j = 1, m
!CALL test%assert_close( v(1:n, j), v_save(1:n, j))
END DO
CALL test%assert_close( e(1:m), e_save(1:m))
!
DEALLOCATE(h,s,e,v,h_save,s_save,e_save,v_save, hdst, sdst, vdst)
DEALLOCATE(e_d, hdst_d, sdst_d, vdst_d)
END DO
!
END SUBROUTINE parallel_real_1
!
SUBROUTINE parallel_complex_1(test)
USE cudafor
USE mp_world, ONLY : mpime
USE descriptors, ONLY : la_descriptor, descla_init, descla_local_dims
USE LAXlib
USE la_param, ONLY : DP
USE test_io
implicit none
!
TYPE(tester_t) :: test
!
integer :: ldh, n, m
complex(DP), allocatable :: h(:,:), hdst(:,:) !< full and distributed Hpsi
complex(DP), allocatable, device :: hdst_d(:,:) !< distributed Hpsi on device
complex(DP), allocatable :: h_save(:,:) !< full Hpsi, used to check consistence across calls
complex(DP), allocatable :: s(:,:), sdst(:,:) !< full and distributed Spsi
complex(DP), allocatable, device :: sdst_d(:,:) !< distributed Spsi on device
complex(DP), allocatable :: s_save(:,:) !< full Spsi, used to check consistence across calls
real(DP), allocatable :: e(:) !< full set of eigenvalues
real(DP), allocatable, device :: e_d(:) !< full set of eigenvalues
complex(DP), allocatable :: v(:,:), vdst(:,:) !< full and distributed eigenvectors
complex(DP), allocatable, device :: vdst_d(:,:) !< full and distributed eigenvectors
real(DP), allocatable :: e_save(:) !< full set of eigenvalues, used for checks
complex(DP), allocatable :: v_save(:,:) !< full set of eigenvectors, used for checks
TYPE(la_descriptor) :: desc
!
character(len=20) :: inputs(4)
integer :: l, i, j, ii, jj, info, nrdst
logical :: la_proc
!
inputs = ["ZnOK1.bin ", &
"ZnOK2.bin ", &
"SiGeK1.bin", &
"SiGeK2.bin"]
!
DO l=1, SIZE(inputs)
!
! Read the problem from the specified input file.
! This will also allocate h, s, e and v.
CALL read_problem(inputs(l), ldh, n, m, h, s, e, v, info)
!
IF (info /= 0) THEN
IF (mpime == 0) print *, "Test with ", inputs(l), " skipped. Input not found."
CYCLE
END IF
!
ALLOCATE(h_save, SOURCE=h)
ALLOCATE(s_save, SOURCE=s)
ALLOCATE(e_save, SOURCE=e)
ALLOCATE(v_save, SOURCE=v)
!
h_save = h
s_save = s
!
CALL init_parallel_diag(desc, n)
!
la_proc = ( desc%active_node > 0 )
nrdst = desc%nrcx
IF (.not. la_proc) nrdst = 1
!
v = (0.d0, 0.d0)
e = 0.d0
!
ALLOCATE( hdst( nrdst , nrdst ), STAT=info )
ALLOCATE( sdst( nrdst , nrdst ), STAT=info )
ALLOCATE( vdst( nrdst , nrdst ), STAT=info )
!
IF (la_proc) THEN
DO j = 1, desc%nc ! number of columns in the local block of lambda
DO i = 1, desc%nr ! number of rows in the local block of lambda
ii = i + desc%ir - 1 ! global index of row i (desc%ir is the global index of the first local row)
jj = j + desc%ic - 1 ! global index of column j (desc%ic is the global index of the first local column)
hdst(i, j) = h(ii, jj)
sdst(i, j) = s(ii, jj)
END DO
END DO
END IF
!
ALLOCATE(hdst_d, SOURCE=hdst)
ALLOCATE(sdst_d, SOURCE=sdst)
ALLOCATE(vdst_d( nrdst , nrdst ), e_d(n))
!
e_d(1:n) = 0.d0
CALL pdiaghg( n, hdst_d, sdst_d, nrdst, e_d, vdst_d, desc, .false. )
e = e_d
!
DO j = 1, m
!CALL test%assert_close( v(1:n, j), v_save(1:n, j))
END DO
CALL test%assert_close( e(1:m), e_save(1:m) )
!
!
e_d(1:n) = 0.d0
CALL pdiaghg( n, hdst_d, sdst_d, nrdst, e_d, vdst_d, desc, .true. )
!
e = e_d
DO j = 1, m
!CALL test%assert_close( v(1:n, j), v_save(1:n, j))
END DO
CALL test%assert_close( e(1:m), e_save(1:m))
!
DEALLOCATE(h,s,e,v,h_save,s_save,e_save,v_save, hdst, sdst, vdst)
DEALLOCATE(e_d, hdst_d, sdst_d, vdst_d)
END DO
!
END SUBROUTINE parallel_complex_1
!
SUBROUTINE init_parallel_diag(desc, n)
USE mp_world, ONLY : mpime, nproc, world_comm
USE mp_diag, ONLY : ortho_parent_comm
USE descriptors, ONLY : la_descriptor, descla_init, descla_local_dims
USE LAXlib
USE la_param, ONLY : DP
implicit none
!
TYPE(la_descriptor) :: desc
INTEGER :: n ! global dimension of the matrix
!
INTEGER :: ierr = 0
INTEGER :: color, key
!
INTEGER :: np_ortho(2) = 1 ! size of the processor grid used in ortho
INTEGER :: me_ortho(2) = 0 ! coordinates of the processors
INTEGER :: me_ortho1 = 0 ! task id for the ortho group
INTEGER :: nproc_ortho = 1 ! size of the ortho group:
INTEGER :: ortho_comm = 0 ! communicator for the ortho group
INTEGER :: ortho_row_comm = 0 ! communicator for the ortho row group
INTEGER :: ortho_col_comm = 0 ! communicator for the ortho col group
INTEGER :: ortho_comm_id = 0 ! id of the ortho_comm
!
ortho_parent_comm = world_comm
!
#if defined __MPI
!
CALL grid2d_dims( 'S', nproc, np_ortho(1), np_ortho(2) )
!
nproc_ortho = np_ortho(1) * np_ortho(2)
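!
! Illustrative example (assuming grid2d_dims('S', ...) returns the most nearly
! square grid that fits nproc): with nproc = 6 one gets np_ortho = (/ 2, 3 /)
! and nproc_ortho = 6, so every task joins the ortho group; with nproc = 5 a
! 2x2 grid leaves one task outside the group.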
!
! here we choose the first "nproc_ortho" processors
!
color = 0
IF( mpime < nproc_ortho ) color = 1
!
key = mpime
!
! initialize the communicator for the new group by splitting the input
! communicator
!
CALL mpi_comm_split( MPI_COMM_WORLD , color, key, ortho_comm, ierr )
!
! Compute the coordinates of the processors, in row-major order
!
CALL mpi_comm_rank( ortho_comm, me_ortho1, ierr)
!
IF( mpime == 0 .AND. me_ortho1 /= 0 ) &
CALL lax_error__( " init_ortho_group ", " wrong root task in ortho group ", ierr )
!
if( color == 1 ) then
! this task belongs to the ortho group: compute its coordinates
ortho_comm_id = 1
CALL GRID2D_COORDS( 'R', me_ortho1, np_ortho(1), np_ortho(2), me_ortho(1), me_ortho(2) )
CALL GRID2D_RANK( 'R', np_ortho(1), np_ortho(2), me_ortho(1), me_ortho(2), ierr )
IF( ierr /= me_ortho1 ) &
CALL lax_error__( " init_ortho_group ", " wrong task coordinates in ortho group ", ierr )
IF( me_ortho1 /= mpime ) &
CALL lax_error__( " init_ortho_group ", " wrong rank assignment in ortho group ", ierr )
CALL mpi_comm_split( ortho_comm , me_ortho(2), me_ortho(1), ortho_col_comm, ierr )
CALL mpi_comm_split( ortho_comm , me_ortho(1), me_ortho(2), ortho_row_comm, ierr )
else
! this task does NOT belong to the ortho group: set dummy values
ortho_comm_id = 0
me_ortho(1) = me_ortho1
me_ortho(2) = me_ortho1
endif
#else
ortho_comm_id = 1
#endif
CALL descla_init( desc, n, n, np_ortho, me_ortho, ortho_comm, -1, ortho_comm_id )
END SUBROUTINE init_parallel_diag
end program test_diaghg_gpu_4
#else
program test_diaghg_gpu_4
end program test_diaghg_gpu_4
#endif


@ -0,0 +1,76 @@
!
SUBROUTINE solve_with_zhegvd(n, v, s, ldh, e)
USE la_param, ONLY : DP
IMPLICIT NONE
!
complex(DP) :: v(ldh,n)
complex(DP) :: s(ldh,n)
real(DP) :: e(n)
INTEGER :: n
!
INTEGER :: lwork, lrwork, liwork, info, ldh
!
REAL(DP) :: abstol
INTEGER, ALLOCATABLE :: iwork(:), ifail(:)
REAL(DP), ALLOCATABLE :: rwork(:)
COMPLEX(DP), ALLOCATABLE :: work(:)
! various work space
!
ALLOCATE(work(1), rwork(1), iwork(1))
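! Workspace query (standard LAPACK convention): calling ZHEGVD with
! lwork = lrwork = liwork = -1 performs no diagonalization and returns the
! optimal workspace sizes in work(1), rwork(1) and iwork(1).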
CALL ZHEGVD( 1, 'V', 'U', n, v, ldh, &
s, ldh, e, work, -1, rwork, -1, iwork, -1, info )
IF (info /= 0) print *, "Workspace not computed!"
lwork = INT(work(1))
lrwork = INT(rwork(1))
liwork = iwork(1)
DEALLOCATE(work, rwork, iwork)
ALLOCATE(work(lwork), rwork(lrwork), iwork(liwork))
CALL ZHEGVD( 1, 'V', 'U', n, v, ldh, &
s, ldh, e, work, lwork, rwork, lrwork, iwork, liwork, info )
DEALLOCATE(work, rwork, iwork)
!
END SUBROUTINE solve_with_zhegvd
!
SUBROUTINE solve_with_dsygvd(n, v, s, ldh, e)
USE la_param, ONLY : DP
IMPLICIT NONE
!
REAL(DP) :: v(ldh,n)
REAL(DP) :: s(ldh,n)
real(DP) :: e(n)
INTEGER :: n
!
INTEGER :: lwork, liwork, info, ldh
!
REAL(DP) :: abstol
INTEGER, ALLOCATABLE :: iwork(:), ifail(:)
REAL(DP), ALLOCATABLE :: work(:)
! various work space
!
ALLOCATE(work(1), iwork(1))
CALL dsygvd( 1, 'V', 'U', n, v, ldh, &
s, ldh, e, work, -1, iwork, -1, info )
IF (info /= 0) print *, "Workspace not computed!"
lwork = INT(work(1))
liwork = iwork(1)
DEALLOCATE(work, iwork)
ALLOCATE(work(lwork), iwork(liwork))
!
CALL dsygvd( 1, 'V', 'U', n, v, ldh, &
s, ldh, e, work, lwork, iwork, liwork, info )
!
DEALLOCATE(work, iwork)
!
END SUBROUTINE solve_with_dsygvd
!
! TODO: add check for the eigenvalue problem

LAXlib/tests/test_io.f90 Normal file

@ -0,0 +1,104 @@
MODULE test_io
!
IMPLICIT NONE
!
INTERFACE read_problem
MODULE PROCEDURE read_cmplx_problem, read_real_problem
END INTERFACE
!
CONTAINS
!
SUBROUTINE read_cmplx_problem(fname, ldh, n, m, h, s, e, v, info)
USE la_param, ONLY : DP
IMPLICIT NONE
character(len=*), intent(in) :: fname
integer, intent(out) :: ldh, n, m
complex(dp), allocatable, intent(inout) :: h(:,:)
complex(dp), allocatable, intent(inout) :: s(:,:)
real(dp), allocatable, intent(inout) :: e(:)
complex(dp), allocatable, intent(inout) :: v(:,:)
integer, intent(out) :: info
!
real(dp) :: aux1, aux2
integer :: i, j, t
logical :: exist
!
character(len=20):: fname_
!
info = 0
fname_ = trim(fname)
!
print *, "reading ", fname_
inquire(file=fname_, exist=exist)
!
if (.not. exist ) then
info=1
return
end if
!
open (unit=15, file=fname_, status='old', form='unformatted', action='read')
!
t = 0
read (15) n
read (15) m
read (15) ldh
!
t = t + 3
!
ALLOCATE(h(ldh, n), s(ldh, n), e(n), v(ldh, m))
!
READ(15) h
READ(15) s
READ(15) e
READ(15) v
!
close(15)
END SUBROUTINE read_cmplx_problem
!
SUBROUTINE read_real_problem(fname, ldh, n, m, h, s, e, v, info)
USE la_param, ONLY : DP
IMPLICIT NONE
character(len=*), intent(in) :: fname
integer, intent(out) :: ldh, n, m
real(dp), allocatable, intent(inout) :: h(:,:)
real(dp), allocatable, intent(inout) :: s(:,:)
real(dp), allocatable, intent(inout) :: e(:)
real(dp), allocatable, intent(inout) :: v(:,:)
integer, intent(out) :: info
!
real(dp) :: aux1, aux2
integer :: i, j, t
logical :: exist
!
character(len=20):: fname_
!
info = 0
fname_ = trim(fname)
!
print *, "reading ", fname_
inquire(file=fname_, exist=exist)
!
if (.not. exist ) then
info=1
return
end if
!
open (unit=15, file=fname_, status='old', form='unformatted', action='read')
!
t = 0
read (15) n
read (15) m
read (15) ldh
!
t = t + 3
!
ALLOCATE(h(ldh, n), s(ldh, n), e(n), v(ldh, m))
!
READ(15) h
READ(15) s
READ(15) e
READ(15) v
!
close(15)
END SUBROUTINE read_real_problem
END MODULE test_io

LAXlib/tests/tester.f90 Normal file

@ -0,0 +1,862 @@
! This file is part of fortran_tester
! Copyright 2015 Pierre de Buyl and Peter Colberg
! 2016 Pierre de Buyl and Stefano Szaghi
! 2018 Pierre de Buyl and Pietro Bonfa
! License: BSD
!> Routines to test Fortran programs
!!
!! fortran_tester is a pure-Fortran module. It provides a datatype to hold test results and
!! routines to test for equality, closeness, and positivity of variables. The routines are
!! overloaded and the resulting interface consists of a small number of names.
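!!
!! Minimal usage sketch (illustrative):
!!
!!   type(tester_t) :: test
!!   call test%init()
!!   call test%assert_equal(1, 1)
!!   call test%assert_close(1.0d0, 1.0d0)
!!   call test%print()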
module tester
use, intrinsic :: iso_fortran_env, only : int8, int16, int32, int64, real32, real64
implicit none
private
public :: tester_t
!> The main **tester** class.
type :: tester_t
integer(int32) :: n_errors=0_int32 !< Number of errors.
integer(int32) :: n_tests=0_int32 !< Number of tests.
real(real32) :: tolerance32=2._real32*epsilon(1._real32) !< Real tolerance, 32 bits.
real(real64) :: tolerance64=2._real64*epsilon(1._real64) !< Real tolerance, 64 bits.
contains
procedure :: init !< Initialize the tester.
procedure :: print !< Print tests results.
generic, public :: assert_equal => &
assert_equal_i8, &
assert_equal_i16, &
assert_equal_i32, &
assert_equal_i64, &
assert_equal_r32, &
assert_equal_r64, &
assert_equal_c32, &
assert_equal_c64, &
assert_equal_l, &
assert_equal_i8_1, &
assert_equal_i16_1, &
assert_equal_i32_1, &
assert_equal_i64_1, &
assert_equal_r32_1, &
assert_equal_r64_1, &
assert_equal_c32_1, &
assert_equal_c64_1, &
assert_equal_l_1 !< Check if two values (integer, real, complex or logical) are equal.
procedure, private :: assert_equal_i8 !< Check if two integers (8 bits) are equal.
procedure, private :: assert_equal_i16 !< Check if two integers (16 bits) are equal.
procedure, private :: assert_equal_i32 !< Check if two integers (32 bits) are equal.
procedure, private :: assert_equal_i64 !< Check if two integers (64 bits) are equal.
procedure, private :: assert_equal_r32 !< Check if two reals (32 bits) are equal.
procedure, private :: assert_equal_r64 !< Check if two reals (64 bits) are equal.
procedure, private :: assert_equal_c32 !< Check if two complex numbers (32 bits) are equal.
procedure, private :: assert_equal_c64 !< Check if two complex numbers (64 bits) are equal.
procedure, private :: assert_equal_l !< Check if two logicals are equal.
procedure, private :: assert_equal_i8_1 !< Check if two integer (8 bits) arrays (rank 1) are equal.
procedure, private :: assert_equal_i16_1 !< Check if two integer (16 bits) arrays (rank 1) are equal.
procedure, private :: assert_equal_i32_1 !< Check if two integer (32 bits) arrays (rank 1) are equal.
procedure, private :: assert_equal_i64_1 !< Check if two integer (64 bits) arrays (rank 1) are equal.
procedure, private :: assert_equal_r32_1 !< Check if two real (32 bits) arrays (rank 1) are equal.
procedure, private :: assert_equal_r64_1 !< Check if two real (64 bits) arrays (rank 1) are equal.
procedure, private :: assert_equal_c32_1 !< Check if two complex (32 bits) arrays (rank 1) are equal.
procedure, private :: assert_equal_c64_1 !< Check if two complex (64 bits) arrays (rank 1) are equal.
procedure, private :: assert_equal_l_1 !< Check if two logical arrays (rank 1) are equal.
generic, public :: assert_positive => &
assert_positive_i8, &
assert_positive_i16, &
assert_positive_i32, &
assert_positive_i64, &
assert_positive_r32, &
assert_positive_r64, &
assert_positive_i8_1, &
assert_positive_i16_1, &
assert_positive_i32_1, &
assert_positive_i64_1, &
assert_positive_r32_1, &
assert_positive_r64_1 !< Check if a number (integer or real) is positive.
procedure, private :: assert_positive_i8 !< Check if an integer (8 bits) is positive.
procedure, private :: assert_positive_i16 !< Check if an integer (16 bits) is positive.
procedure, private :: assert_positive_i32 !< Check if an integer (32 bits) is positive.
procedure, private :: assert_positive_i64 !< Check if an integer (64 bits) is positive.
procedure, private :: assert_positive_r32 !< Check if a real (32 bits) is positive.
procedure, private :: assert_positive_r64 !< Check if a real (64 bits) is positive.
procedure, private :: assert_positive_i8_1 !< Check if an integer (8 bits) array (rank 1) is positive.
procedure, private :: assert_positive_i16_1 !< Check if an integer (16 bits) array (rank 1) is positive.
procedure, private :: assert_positive_i32_1 !< Check if an integer (32 bits) array (rank 1) is positive.
procedure, private :: assert_positive_i64_1 !< Check if an integer (64 bits) array (rank 1) is positive.
procedure, private :: assert_positive_r32_1 !< Check if a real (32 bits) array (rank 1) is positive.
procedure, private :: assert_positive_r64_1 !< Check if a real (64 bits) array (rank 1) is positive.
generic, public :: assert_close => &
assert_close_r32, &
assert_close_r64, &
assert_close_c32, &
assert_close_c64, &
assert_close_r32_1, &
assert_close_r64_1, &
assert_close_c32_1, &
assert_close_c64_1 !< Check if two values (real or complex) are close with respect to a tolerance.
procedure, private :: assert_close_r32 !< Check if two reals (32 bits) are close with respect to a tolerance.
procedure, private :: assert_close_r64 !< Check if two reals (64 bits) are close with respect to a tolerance.
procedure, private :: assert_close_c32 !< Check if two complex numbers (32 bits) are close with respect to a tolerance.
procedure, private :: assert_close_c64 !< Check if two complex numbers (64 bits) are close with respect to a tolerance.
procedure, private :: assert_close_r32_1 !< Check if two real (32 bits) arrays (rank 1) are close with respect to a tolerance.
procedure, private :: assert_close_r64_1 !< Check if two real (64 bits) arrays (rank 1) are close with respect to a tolerance.
procedure, private :: assert_close_c32_1 !< Check if two complex (32 bits) arrays (rank 1) are close with respect to a tolerance.
procedure, private :: assert_close_c64_1 !< Check if two complex (64 bits) arrays (rank 1) are close with respect to a tolerance.
end type tester_t
contains
!> Initialize the tester.
subroutine init(this, tolerance32, tolerance64)
class(tester_t), intent(out) :: this !< The tester.
real(real32), intent(in), optional :: tolerance32 !< Real tolerance, 32 bits.
real(real64), intent(in), optional :: tolerance64 !< Real tolerance, 64 bits.
this% n_errors = 0
this% n_tests = 0
if (present(tolerance64)) then
this% tolerance64 = tolerance64
else
this% tolerance64 = 2._real64*epsilon(1._real64)
end if
if (present(tolerance32)) then
this% tolerance32 = tolerance32
else
this% tolerance32 = 2._real32*epsilon(1._real32)
end if
end subroutine init
!> Print tests results.
subroutine print(this, errorstop)
class(tester_t), intent(in) :: this !< The tester.
logical, intent(in), optional :: errorstop !< Flag to activate error stop if one test fails.
logical :: do_errorstop
if (present(errorstop)) then
do_errorstop = errorstop
else
do_errorstop = .true.
end if
write(*,*) 'fortran_tester:', this% n_errors, ' error(s) for', this% n_tests, 'test(s)'
if (this% n_errors == 0) then
write(*,*) 'fortran_tester: all tests succeeded'
else
write(*,*) 'fortran_tester: tests failed'
if (do_errorstop) then
stop 1
end if
end if
end subroutine print
!> Check if two integers (8 bits) are equal.
subroutine assert_equal_i8(this, i1, i2, fail)
class(tester_t), intent(inout) :: this !< The tester.
integer(int8), intent(in) :: i1 !< Value to compare.
integer(int8), intent(in) :: i2 !< Value to compare.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if (i1 .ne. i2) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end subroutine assert_equal_i8
!> Check if two integers (16 bits) are equal.
subroutine assert_equal_i16(this, i1, i2, fail)
class(tester_t), intent(inout) :: this !< The tester.
integer(int16), intent(in) :: i1 !< Value to compare.
integer(int16), intent(in) :: i2 !< Value to compare.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if (i1 .ne. i2) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end subroutine assert_equal_i16
!> Check if two integers (32 bits) are equal.
subroutine assert_equal_i32(this, i1, i2, fail)
class(tester_t), intent(inout) :: this !< The tester.
integer(int32), intent(in) :: i1 !< Value to compare.
integer(int32), intent(in) :: i2 !< Value to compare.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if (i1 .ne. i2) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end subroutine assert_equal_i32
!> Check if two integers (64 bits) are equal.
subroutine assert_equal_i64(this, i1, i2, fail)
class(tester_t), intent(inout) :: this !< The tester.
integer(int64), intent(in) :: i1 !< Value to compare.
integer(int64), intent(in) :: i2 !< Value to compare.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if (i1 .ne. i2) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end subroutine assert_equal_i64
!> Check if two reals (32 bits) are equal.
subroutine assert_equal_r32(this, r1, r2, fail)
class(tester_t), intent(inout) :: this !< The tester.
real(real32), intent(in) :: r1 !< Value to compare.
real(real32), intent(in) :: r2 !< Value to compare.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if (r1 .ne. r2) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end subroutine assert_equal_r32
!> Check if two reals (64 bits) are equal.
subroutine assert_equal_r64(this, r1, r2, fail)
class(tester_t), intent(inout) :: this !< The tester.
real(real64), intent(in) :: r1 !< Value to compare.
real(real64), intent(in) :: r2 !< Value to compare.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if (r1 .ne. r2) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end subroutine assert_equal_r64
!> Check if two complex numbers (32 bits) are equal.
subroutine assert_equal_c32(this, c1, c2, fail)
class(tester_t), intent(inout) :: this !< The tester.
complex(real32), intent(in) :: c1 !< Value to compare.
complex(real32), intent(in) :: c2 !< Value to compare.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if (c1 .ne. c2) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end subroutine assert_equal_c32
!> Check if two complex numbers (64 bits) are equal.
subroutine assert_equal_c64(this, c1, c2, fail)
class(tester_t), intent(inout) :: this !< The tester.
complex(real64), intent(in) :: c1 !< Value to compare.
complex(real64), intent(in) :: c2 !< Value to compare.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if (c1 .ne. c2) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end subroutine assert_equal_c64
!> Check if two logicals are equal.
subroutine assert_equal_l(this, l1, l2, fail)
class(tester_t), intent(inout) :: this !< The tester.
logical, intent(in) :: l1 !< Value to compare.
logical, intent(in) :: l2 !< Value to compare.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if (l1 .neqv. l2) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end subroutine assert_equal_l
!> Check if two integer (8 bits) arrays (rank 1) are equal.
subroutine assert_equal_i8_1(this, i1, i2, fail)
class(tester_t), intent(inout) :: this !< The tester.
integer(int8), dimension(:), intent(in) :: i1 !< Value to compare.
integer(int8), dimension(:), intent(in) :: i2 !< Value to compare.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if ( size(i1) .ne. size(i2) ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
else
if ( maxval(abs(i1-i2)) > 0 ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end if
end subroutine assert_equal_i8_1
!> Check if two integer (16 bits) arrays (rank 1) are equal.
subroutine assert_equal_i16_1(this, i1, i2, fail)
class(tester_t), intent(inout) :: this !< The tester.
integer(int16), dimension(:), intent(in) :: i1 !< Value to compare.
integer(int16), dimension(:), intent(in) :: i2 !< Value to compare.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if ( size(i1) .ne. size(i2) ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
else
if ( maxval(abs(i1-i2)) > 0 ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end if
end subroutine assert_equal_i16_1
!> Check if two integer (32 bits) arrays (rank 1) are equal.
subroutine assert_equal_i32_1(this, i1, i2, fail)
class(tester_t), intent(inout) :: this !< The tester.
integer(int32), dimension(:), intent(in) :: i1 !< Value to compare.
integer(int32), dimension(:), intent(in) :: i2 !< Value to compare.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if ( size(i1) .ne. size(i2) ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
else
if ( maxval(abs(i1-i2)) > 0 ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end if
end subroutine assert_equal_i32_1
!> Check if two integer (64 bits) arrays (rank 1) are equal.
subroutine assert_equal_i64_1(this, i1, i2, fail)
class(tester_t), intent(inout) :: this !< The tester.
integer(int64), dimension(:), intent(in) :: i1 !< Value to compare.
integer(int64), dimension(:), intent(in) :: i2 !< Value to compare.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if ( size(i1) .ne. size(i2) ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
else
if ( maxval(abs(i1-i2)) > 0 ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end if
end subroutine assert_equal_i64_1
!> Check if two real (32 bits) arrays (rank 1) are equal.
subroutine assert_equal_r32_1(this, r1, r2, fail)
class(tester_t), intent(inout) :: this !< The tester.
real(real32), dimension(:), intent(in) :: r1 !< Value to compare.
real(real32), dimension(:), intent(in) :: r2 !< Value to compare.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if ( size(r1) .ne. size(r2) ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
else
if ( maxval(abs(r1-r2)) > 0 ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end if
end subroutine assert_equal_r32_1
!> Check if two real (64 bits) arrays (rank 1) are equal.
subroutine assert_equal_r64_1(this, r1, r2, fail)
class(tester_t), intent(inout) :: this !< The tester.
real(real64), dimension(:), intent(in) :: r1 !< Value to compare.
real(real64), dimension(:), intent(in) :: r2 !< Value to compare.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if ( size(r1) .ne. size(r2) ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
else
if ( maxval(abs(r1-r2)) > 0 ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end if
end subroutine assert_equal_r64_1
!> Check if two complex (32 bits) arrays (rank 1) are equal.
subroutine assert_equal_c32_1(this, c1, c2, fail)
class(tester_t), intent(inout) :: this !< The tester.
complex(real32), dimension(:), intent(in) :: c1 !< Value to compare.
complex(real32), dimension(:), intent(in) :: c2 !< Value to compare.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if ( size(c1) .ne. size(c2) ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
else
if ( maxval(abs(c1-c2)) > 0 ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end if
end subroutine assert_equal_c32_1
!> Check if two complex (64 bits) arrays (rank 1) are equal.
subroutine assert_equal_c64_1(this, c1, c2, fail)
class(tester_t), intent(inout) :: this !< The tester.
complex(real64), dimension(:), intent(in) :: c1 !< Value to compare.
complex(real64), dimension(:), intent(in) :: c2 !< Value to compare.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if ( size(c1) .ne. size(c2) ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
else
if ( maxval(abs(c1-c2)) > 0 ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end if
end subroutine assert_equal_c64_1
!> Check if two logical arrays (rank 1) are equal.
subroutine assert_equal_l_1(this, l1, l2, fail)
class(tester_t), intent(inout) :: this !< The tester.
logical, intent(in), dimension(:) :: l1 !< Value to compare.
logical, intent(in), dimension(:) :: l2 !< Value to compare.
logical, intent(in), optional :: fail !< Fail flag.
integer :: k
this% n_tests = this% n_tests + 1
if ( size(l1) .ne. size(l2) ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
else
do k = 1, size(l1)
if (l1(k) .neqv. l2(k)) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
exit
end if
end do
end if
end subroutine assert_equal_l_1
!> Check if an integer (8 bits) is positive.
subroutine assert_positive_i8(this, i, fail)
class(tester_t), intent(inout) :: this !< The tester.
integer(int8), intent(in) :: i !< Value to check.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if (i < 0) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end subroutine assert_positive_i8
!> Check if an integer (16 bits) is positive.
subroutine assert_positive_i16(this, i, fail)
class(tester_t), intent(inout) :: this !< The tester.
integer(int16), intent(in) :: i !< Value to check.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if (i < 0) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end subroutine assert_positive_i16
!> Check if an integer (32 bits) is positive.
subroutine assert_positive_i32(this, i, fail)
class(tester_t), intent(inout) :: this !< The tester.
integer(int32), intent(in) :: i !< Value to check.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if (i < 0) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end subroutine assert_positive_i32
!> Check if an integer (64 bits) is positive.
subroutine assert_positive_i64(this, i, fail)
class(tester_t), intent(inout) :: this !< The tester.
integer(int64), intent(in) :: i !< Value to check.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if (i < 0) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end subroutine assert_positive_i64
!> Check if a real (32 bits) is positive.
subroutine assert_positive_r32(this, r, fail)
class(tester_t), intent(inout) :: this !< The tester.
real(real32), intent(in) :: r !< Value to check.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if (r < 0) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end subroutine assert_positive_r32
!> Check if a real (64 bits) is positive.
subroutine assert_positive_r64(this, r, fail)
class(tester_t), intent(inout) :: this !< The tester.
real(real64), intent(in) :: r !< Value to check.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if (r < 0) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end subroutine assert_positive_r64
!> Check if an integer (8 bits) array (rank 1) is positive.
subroutine assert_positive_i8_1(this, i, fail)
class(tester_t), intent(inout) :: this !< The tester.
integer(int8), dimension(:), intent(in) :: i !< Value to check.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if ( minval(i) < 0 ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end subroutine assert_positive_i8_1
!> Check if an integer (16 bits) array (rank 1) is positive.
subroutine assert_positive_i16_1(this, i, fail)
class(tester_t), intent(inout) :: this !< The tester.
integer(int16), dimension(:), intent(in) :: i !< Value to check.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if ( minval(i) < 0 ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end subroutine assert_positive_i16_1
!> Check if an integer (32 bits) array (rank 1) is positive.
subroutine assert_positive_i32_1(this, i, fail)
class(tester_t), intent(inout) :: this !< The tester.
integer(int32), dimension(:), intent(in) :: i !< Value to check.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if ( minval(i) < 0 ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end subroutine assert_positive_i32_1
!> Check if an integer (64 bits) array (rank 1) is positive.
subroutine assert_positive_i64_1(this, i, fail)
class(tester_t), intent(inout) :: this !< The tester.
integer(int64), dimension(:), intent(in) :: i !< Value to check.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if ( minval(i) < 0 ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end subroutine assert_positive_i64_1
!> Check if a real (32 bits) array (rank 1) is positive.
subroutine assert_positive_r32_1(this, r, fail)
class(tester_t), intent(inout) :: this !< The tester.
real(real32), dimension(:), intent(in) :: r !< Value to check.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if ( minval(r) < 0 ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end subroutine assert_positive_r32_1
!> Check if a real (64 bits) array (rank 1) is positive.
subroutine assert_positive_r64_1(this, r, fail)
class(tester_t), intent(inout) :: this !< The tester.
real(real64), dimension(:), intent(in) :: r !< Value to check.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if ( minval(r) < 0 ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end subroutine assert_positive_r64_1
!> Check if two reals (32 bits) are close with respect to a tolerance.
subroutine assert_close_r32(this, r1, r2, fail)
class(tester_t), intent(inout) :: this !< The tester.
real(real32), intent(in) :: r1 !< Value to compare.
real(real32), intent(in) :: r2 !< Value to compare.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if ( abs(r1-r2) > this% tolerance32 ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end subroutine assert_close_r32
!> Check if two reals (64 bits) are close with respect to a tolerance.
subroutine assert_close_r64(this, r1, r2, fail)
class(tester_t), intent(inout) :: this !< The tester.
real(real64), intent(in) :: r1 !< Value to compare.
real(real64), intent(in) :: r2 !< Value to compare.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if ( abs(r1-r2) > this% tolerance64 ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end subroutine assert_close_r64
!> Check if two real (32 bits) arrays (rank 1) are close with respect to a tolerance.
subroutine assert_close_r32_1(this, r1, r2, fail)
class(tester_t), intent(inout) :: this !< The tester.
real(real32), intent(in), dimension(:) :: r1 !< Value to compare.
real(real32), intent(in), dimension(:) :: r2 !< Value to compare.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if ( size(r1) .ne. size(r2) ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
else
if ( maxval(abs(r1-r2)) > this% tolerance32 ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end if
end subroutine assert_close_r32_1
!> Check if two real (64 bits) arrays (rank 1) are close with respect to a tolerance.
subroutine assert_close_r64_1(this, r1, r2, fail)
class(tester_t), intent(inout) :: this !< The tester.
real(real64), intent(in), dimension(:) :: r1 !< Value to compare.
real(real64), intent(in), dimension(:) :: r2 !< Value to compare.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if ( size(r1) .ne. size(r2) ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
else
if ( maxval(abs(r1-r2)) > this% tolerance64 ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end if
end subroutine assert_close_r64_1
!> Check if two complex numbers (32 bits) are close with respect to a tolerance.
subroutine assert_close_c32(this, c1, c2, fail)
class(tester_t), intent(inout) :: this !< The tester.
complex(real32), intent(in) :: c1 !< Value to compare.
complex(real32), intent(in) :: c2 !< Value to compare.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if ( abs(c1-c2) > this% tolerance32 ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end subroutine assert_close_c32
!> Check if two complex numbers (64 bits) are close with respect to a tolerance.
subroutine assert_close_c64(this, c1, c2, fail)
class(tester_t), intent(inout) :: this !< The tester.
complex(real64), intent(in) :: c1 !< Value to compare.
complex(real64), intent(in) :: c2 !< Value to compare.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if ( abs(c1-c2) > this% tolerance64 ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end subroutine assert_close_c64
!> Check if two complex (32 bits) arrays (rank 1) are close with respect to a tolerance.
subroutine assert_close_c32_1(this, c1, c2, fail)
class(tester_t), intent(inout) :: this !< The tester.
complex(real32), intent(in), dimension(:) :: c1 !< Value to compare.
complex(real32), intent(in), dimension(:) :: c2 !< Value to compare.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if ( size(c1) .ne. size(c2) ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
else
if ( maxval(abs(c1-c2)) > this% tolerance32 ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end if
end subroutine assert_close_c32_1
!> Check if two complex (64 bits) arrays (rank 1) are close with respect to a tolerance.
subroutine assert_close_c64_1(this, c1, c2, fail)
class(tester_t), intent(inout) :: this !< The tester.
complex(real64), intent(in), dimension(:) :: c1 !< Value to compare.
complex(real64), intent(in), dimension(:) :: c2 !< Value to compare.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if ( size(c1) .ne. size(c2) ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
else
if ( maxval(abs(c1-c2)) > this% tolerance64 ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end if
end subroutine assert_close_c64_1
end module tester

LAXlib/tests/utils.f90 Normal file

@ -0,0 +1,53 @@
SUBROUTINE collect_results(test)
#if defined(__MPI)
USE mpi
#endif
USE tester
IMPLICIT NONE
!
TYPE(tester_t) :: test
INTEGER :: itottests, itoterr, ierr, me
!
#if defined(__MPI)
!
CALL MPI_REDUCE(test%n_errors, itoterr, 1, MPI_INTEGER, MPI_SUM, &
0, MPI_COMM_WORLD, ierr)
! Fail in case MPI fails...
IF (ierr /= 0) CALL test%assert_equal(0, ierr)
!
CALL MPI_REDUCE(test%n_tests, itottests, 1, MPI_INTEGER, MPI_SUM, &
0, MPI_COMM_WORLD, ierr)
! Fail in case MPI fails...
IF (ierr /= 0) CALL test%assert_equal(0, ierr)
!
test%n_tests = itottests
test%n_errors = itoterr
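!
! Note: MPI_REDUCE defines the reduced totals only on rank 0, so the counters
! on the other ranks are not meaningful from here on; only rank 0 prints them.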
!
IF (ierr /= 0) CALL test%assert_equal(0, ierr)
!
CALL MPI_Comm_rank(MPI_COMM_WORLD, me, ierr)
!
IF (ierr /= 0) CALL test%assert_equal(0, ierr)
!
#endif
END SUBROUTINE collect_results
SUBROUTINE no_test
#if defined(__MPI)
USE mpi
#endif
USE tester
IMPLICIT NONE
!TYPE(tester_t) :: test
INTEGER :: ierr
!
#if defined(__MPI)
CALL MPI_Init(ierr)
#endif
!CALL test%init()
!CALL print_results(test)
#if defined(__MPI)
CALL mpi_finalize(ierr)
#endif
END SUBROUTINE no_test


@ -141,6 +141,8 @@ w0gauss.o \
w1gauss.o \
deviatoric.o
-include make.gpu
TLDEPS= libfox libutil libla libfft librxc
all : libqemod.a

Modules/becmod_gpu.f90 Normal file

@ -0,0 +1,387 @@
!
! Copyright (C) 2002-2011 Quantum ESPRESSO group
! This file is distributed under the terms of the
! GNU General Public License. See the file `License'
! in the root directory of the present distribution,
! or http://www.gnu.org/copyleft/gpl.txt .
!
#define DIMS1D(my_array) lbound(my_array,1):ubound(my_array,1)
#define DIMS2D(my_array) lbound(my_array,1):ubound(my_array,1),lbound(my_array,2):ubound(my_array,2)
#define DIMS3D(my_array) lbound(my_array,1):ubound(my_array,1),lbound(my_array,2):ubound(my_array,2),lbound(my_array,3):ubound(my_array,3)
#define DIMS4D(my_array) lbound(my_array,1):ubound(my_array,1),lbound(my_array,2):ubound(my_array,2),lbound(my_array,3):ubound(my_array,3),lbound(my_array,4):ubound(my_array,4)
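! These macros expand to explicit bounds taken from the source array, so a
! device mirror allocated with them keeps the same index ranges as its host
! counterpart (used instead of MOLD=, which not all supported compilers accept).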
!=----------------------------------------------------------------------------=!
MODULE becmod_gpum
!=----------------------------------------------------------------------------=!
#if defined(__CUDA)
USE cudafor
#endif
IMPLICIT NONE
SAVE
INTEGER, PARAMETER :: DP = selected_real_kind(14,200)
INTEGER, PARAMETER :: sgl = selected_real_kind(6,30)
INTEGER, PARAMETER :: i4b = selected_int_kind(9)
INTEGER, PARAMETER :: i8b = selected_int_kind(18)
#if defined(__DEBUG)
INTEGER :: iverbosity = 1
#else
INTEGER :: iverbosity = 0
#endif
!
TYPE bec_type_d
#if defined(__CUDA)
REAL(DP), ALLOCATABLE, DEVICE :: r_d(:, :)
#else
REAL(DP), ALLOCATABLE :: r_d(:, :)
#endif
#if defined(__CUDA)
COMPLEX(DP), ALLOCATABLE, DEVICE :: k_d(:, :)
#else
COMPLEX(DP), ALLOCATABLE :: k_d(:, :)
#endif
#if defined(__CUDA)
COMPLEX(DP), ALLOCATABLE, DEVICE :: nc_d(:, :, :)
#else
COMPLEX(DP), ALLOCATABLE :: nc_d(:, :, :)
#endif
INTEGER :: comm
INTEGER :: nbnd
INTEGER :: nproc
INTEGER :: mype
INTEGER :: nbnd_loc
INTEGER :: ibnd_begin
END TYPE bec_type_d
!
TYPE (bec_type_d), TARGET :: becp_d ! <beta|psi>
!
LOGICAL :: becp_r_ood = .false. ! used to flag out of date variables
LOGICAL :: becp_d_r_d_ood = .false. ! used to flag out of date variables
LOGICAL :: becp_k_ood = .false. ! used to flag out of date variables
LOGICAL :: becp_d_k_d_ood = .false. ! used to flag out of date variables
LOGICAL :: becp_nc_ood = .false. ! used to flag out of date variables
LOGICAL :: becp_d_nc_d_ood = .false. ! used to flag out of date variables
!
CONTAINS
!
SUBROUTINE using_becp_r(intento, debug_info)
!
! intento is used to specify what the variable will be used for :
! 0 -> in , the variable needs to be synchronized but won't be changed
! 1 -> inout , the variable needs to be synchronized AND will be changed
! 2 -> out , NO NEED to synchronize the variable, everything will be overwritten
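!
! Typical call pattern (illustrative): CALL using_becp_r(0) before reading
! becp%r on the host, CALL using_becp_r(2) before overwriting it completely;
! using_becp_r_d plays the symmetric role for the device copy becp_d%r_d.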
!
USE becmod, ONLY : becp
implicit none
INTEGER, INTENT(IN) :: intento
CHARACTER(len=*), INTENT(IN), OPTIONAL :: debug_info
!
#if defined(__CUDA) || defined(__CUDA_GNU)
INTEGER :: intento_
intento_ = intento
!
IF (PRESENT(debug_info) ) print *, debug_info
!
IF (becp_r_ood) THEN
IF (.not. allocated(becp_d%r_d)) THEN
CALL errore('using_r_d', 'PANIC: sync of becp%r from becp_d%r_d with unallocated array. Bye!!', 1)
stop
END IF
IF (.not. allocated(becp%r)) THEN
IF (intento_ /= 2) THEN
print *, "WARNING: sync of becp%r with unallocated array and intento /= 2? Changed to 2!"
intento_ = 2
END IF
END IF
IF (intento_ < 2) THEN
IF ( iverbosity > 0 ) print *, "Really copied becp%r D->H"
becp%r = becp_d%r_d
END IF
! ALWAYS update auxiliary variables
!IF ( becp%comm /= becp_d%comm ) &
! print *, "WARNING: auxiliary variable becp%comm changed"
becp%comm = becp_d%comm
!IF ( becp%nbnd /= becp_d%nbnd ) &
! print *, "WARNING: auxiliary variable becp%nbnd changed"
becp%nbnd = becp_d%nbnd
!IF ( becp%nproc /= becp_d%nproc ) &
! print *, "WARNING: auxiliary variable becp%nproc changed"
becp%nproc = becp_d%nproc
!IF ( becp%mype /= becp_d%mype ) &
! print *, "WARNING: auxiliary variable becp%mype changed"
becp%mype = becp_d%mype
!IF ( becp%nbnd_loc /= becp_d%nbnd_loc ) &
! print *, "WARNING: auxiliary variable becp%nbnd_loc changed"
becp%nbnd_loc = becp_d%nbnd_loc
!IF ( becp%ibnd_begin /= becp_d%ibnd_begin ) &
! print *, "WARNING: auxiliary variable becp%ibnd_begin changed"
becp%ibnd_begin = becp_d%ibnd_begin
!
becp_r_ood = .false.
ENDIF
IF (intento_ > 0) becp_d_r_d_ood = .true.
#endif
END SUBROUTINE using_becp_r
!
SUBROUTINE using_becp_r_d(intento, debug_info)
!
USE becmod, ONLY : becp
implicit none
INTEGER, INTENT(IN) :: intento
CHARACTER(len=*), INTENT(IN), OPTIONAL :: debug_info
!
#if defined(__CUDA) || defined(__CUDA_GNU)
!
IF (PRESENT(debug_info) ) print *, debug_info
!
IF (.not. allocated(becp%r)) THEN
IF (intento /= 2) print *, "WARNING: sync of becp%r_d with unallocated array and intento /= 2?"
IF (allocated(becp_d%r_d)) DEALLOCATE(becp_d%r_d)
becp_d_r_d_ood = .false.
RETURN
END IF
! here we know that r is allocated, check if size is 0
IF ( SIZE(becp%r) == 0 ) THEN
print *, "Refusing to allocate 0 dimensional array becp_d%r_d. If used, code will crash."
RETURN
END IF
!
IF (becp_d_r_d_ood) THEN
IF ( allocated(becp_d%r_d) .and. (SIZE(becp_d%r_d)/=SIZE(becp%r))) deallocate(becp_d%r_d)
IF (.not. allocated(becp_d%r_d)) ALLOCATE(becp_d%r_d(DIMS2D(becp%r))) ! MOLD does not work on all compilers
IF (intento < 2) THEN
IF ( iverbosity > 0 ) print *, "Really copied becp%r H->D"
becp_d%r_d = becp%r
END IF
! ALWAYS update auxiliary variables
becp_d%comm = becp%comm
becp_d%nbnd = becp%nbnd
becp_d%nproc = becp%nproc
becp_d%mype = becp%mype
becp_d%nbnd_loc = becp%nbnd_loc
becp_d%ibnd_begin = becp%ibnd_begin
!
becp_d_r_d_ood = .false.
ENDIF
IF (intento > 0) becp_r_ood = .true.
#else
CALL errore('using_becp_d%r_d', 'Trying to use device data without device-compiled code!', 1)
#endif
END SUBROUTINE using_becp_r_d
!
SUBROUTINE using_becp_k(intento, debug_info)
!
! intento is used to specify what the variable will be used for :
! 0 -> in , the variable needs to be synchronized but won't be changed
! 1 -> inout , the variable needs to be synchronized AND will be changed
! 2 -> out , NO NEED to synchronize the variable, everything will be overwritten
!
USE becmod, ONLY : becp
implicit none
INTEGER, INTENT(IN) :: intento
CHARACTER(len=*), INTENT(IN), OPTIONAL :: debug_info
!
#if defined(__CUDA) || defined(__CUDA_GNU)
INTEGER :: intento_
intento_ = intento
!
IF (PRESENT(debug_info) ) print *, debug_info
!
IF (becp_k_ood) THEN
IF (.not. allocated(becp_d%k_d)) THEN
CALL errore('using_k_d', 'PANIC: sync of becp%k from becp_d%k_d with unallocated array. Bye!!', 1)
stop
END IF
IF (.not. allocated(becp%k)) THEN
IF (intento_ /= 2) THEN
print *, "WARNING: sync of becp%k with unallocated array and intento /= 2? Changed to 2!"
intento_ = 2
END IF
END IF
IF (intento_ < 2) THEN
IF ( iverbosity > 0 ) print *, "Really copied becp%k D->H"
becp%k = becp_d%k_d
END IF
! ALWAYS update auxiliary variables
!IF ( becp%comm /= becp_d%comm ) &
! print *, "WARNING: auxiliary variable becp%comm changed"
becp%comm = becp_d%comm
!IF ( becp%nbnd /= becp_d%nbnd ) &
! print *, "WARNING: auxiliary variable becp%nbnd changed"
becp%nbnd = becp_d%nbnd
!IF ( becp%nproc /= becp_d%nproc ) &
! print *, "WARNING: auxiliary variable becp%nproc changed"
becp%nproc = becp_d%nproc
!IF ( becp%mype /= becp_d%mype ) &
! print *, "WARNING: auxiliary variable becp%mype changed"
becp%mype = becp_d%mype
!IF ( becp%nbnd_loc /= becp_d%nbnd_loc ) &
! print *, "WARNING: auxiliary variable becp%nbnd_loc changed"
becp%nbnd_loc = becp_d%nbnd_loc
!IF ( becp%ibnd_begin /= becp_d%ibnd_begin ) &
! print *, "WARNING: auxiliary variable becp%ibnd_begin changed"
becp%ibnd_begin = becp_d%ibnd_begin
!
becp_k_ood = .false.
ENDIF
IF (intento_ > 0) becp_d_k_d_ood = .true.
#endif
END SUBROUTINE using_becp_k
!
SUBROUTINE using_becp_k_d(intento, debug_info)
!
USE becmod, ONLY : becp
implicit none
INTEGER, INTENT(IN) :: intento
CHARACTER(len=*), INTENT(IN), OPTIONAL :: debug_info
!
#if defined(__CUDA) || defined(__CUDA_GNU)
!
IF (PRESENT(debug_info) ) print *, debug_info
!
IF (.not. allocated(becp%k)) THEN
IF (intento /= 2) print *, "WARNING: sync of becp%k_d with unallocated array and intento /= 2?"
IF (allocated(becp_d%k_d)) DEALLOCATE(becp_d%k_d)
becp_d_k_d_ood = .false.
RETURN
END IF
! here we know that k is allocated, check if size is 0
IF ( SIZE(becp%k) == 0 ) THEN
print *, "Refusing to allocate 0 dimensional array becp_d%k_d. If used, code will crash."
RETURN
END IF
!
IF (becp_d_k_d_ood) THEN
IF ( allocated(becp_d%k_d) .and. (SIZE(becp_d%k_d)/=SIZE(becp%k))) deallocate(becp_d%k_d)
IF (.not. allocated(becp_d%k_d)) ALLOCATE(becp_d%k_d(DIMS2D(becp%k))) ! MOLD does not work on all compilers
IF (intento < 2) THEN
IF ( iverbosity > 0 ) print *, "Really copied becp%k H->D"
becp_d%k_d = becp%k
END IF
! ALWAYS update auxiliary variables
becp_d%comm = becp%comm
becp_d%nbnd = becp%nbnd
becp_d%nproc = becp%nproc
becp_d%mype = becp%mype
becp_d%nbnd_loc = becp%nbnd_loc
becp_d%ibnd_begin = becp%ibnd_begin
!
becp_d_k_d_ood = .false.
ENDIF
IF (intento > 0) becp_k_ood = .true.
#else
CALL errore('using_becp_d%k_d', 'Trying to use device data without device-compiled code!', 1)
#endif
END SUBROUTINE using_becp_k_d
!
SUBROUTINE using_becp_nc(intento, debug_info)
!
! intento is used to specify what the variable will be used for :
! 0 -> in , the variable needs to be synchronized but won't be changed
! 1 -> inout , the variable needs to be synchronized AND will be changed
! 2 -> out , NO NEED to synchronize the variable, everything will be overwritten
!
USE becmod, ONLY : becp
implicit none
INTEGER, INTENT(IN) :: intento
CHARACTER(len=*), INTENT(IN), OPTIONAL :: debug_info
!
#if defined(__CUDA) || defined(__CUDA_GNU)
INTEGER :: intento_
intento_ = intento
!
IF (PRESENT(debug_info) ) print *, debug_info
!
IF (becp_nc_ood) THEN
IF (.not. allocated(becp_d%nc_d)) THEN
CALL errore('using_nc_d', 'PANIC: sync of becp%nc from becp_d%nc_d with unallocated array. Bye!!', 1)
stop
END IF
IF (.not. allocated(becp%nc)) THEN
IF (intento_ /= 2) THEN
print *, "WARNING: sync of becp%nc with unallocated array and intento /= 2? Changed to 2!"
intento_ = 2
END IF
END IF
IF (intento_ < 2) THEN
IF ( iverbosity > 0 ) print *, "Really copied becp%nc D->H"
becp%nc = becp_d%nc_d
END IF
! ALWAYS update auxiliary variables
!IF ( becp%comm /= becp_d%comm ) &
! print *, "WARNING: auxiliary variable becp%comm changed"
becp%comm = becp_d%comm
!IF ( becp%nbnd /= becp_d%nbnd ) &
! print *, "WARNING: auxiliary variable becp%nbnd changed"
becp%nbnd = becp_d%nbnd
!IF ( becp%nproc /= becp_d%nproc ) &
! print *, "WARNING: auxiliary variable becp%nproc changed"
becp%nproc = becp_d%nproc
!IF ( becp%mype /= becp_d%mype ) &
! print *, "WARNING: auxiliary variable becp%mype changed"
becp%mype = becp_d%mype
!IF ( becp%nbnd_loc /= becp_d%nbnd_loc ) &
! print *, "WARNING: auxiliary variable becp%nbnd_loc changed"
becp%nbnd_loc = becp_d%nbnd_loc
!IF ( becp%ibnd_begin /= becp_d%ibnd_begin ) &
! print *, "WARNING: auxiliary variable becp%ibnd_begin changed"
becp%ibnd_begin = becp_d%ibnd_begin
!
becp_nc_ood = .false.
ENDIF
IF (intento_ > 0) becp_d_nc_d_ood = .true.
#endif
END SUBROUTINE using_becp_nc
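  !
  ! A minimal, hypothetical usage sketch of the intento protocol documented
  ! above; the routine name and the host-side update are illustrative only,
  ! not part of the original module.
  SUBROUTINE example_becp_nc_host_update()
     USE kinds,  ONLY : DP
     USE becmod, ONLY : becp
     IMPLICIT NONE
     ! intento = 1 (inout): refresh the host copy if stale and mark the
     ! device copy out of date, since the host data is modified below.
     CALL using_becp_nc(1, 'example_becp_nc_host_update')
     IF ( allocated(becp%nc) ) becp%nc = becp%nc * (0.5_DP, 0.0_DP)
  END SUBROUTINE example_becp_nc_host_update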
!
SUBROUTINE using_becp_nc_d(intento, debug_info)
!
USE becmod, ONLY : becp
implicit none
INTEGER, INTENT(IN) :: intento
CHARACTER(len=*), INTENT(IN), OPTIONAL :: debug_info
!
#if defined(__CUDA) || defined(__CUDA_GNU)
!
IF (PRESENT(debug_info) ) print *, debug_info
!
IF (.not. allocated(becp%nc)) THEN
IF (intento /= 2) print *, "WARNING: sync of becp%nc_d with unallocated array and intento /= 2?"
IF (allocated(becp_d%nc_d)) DEALLOCATE(becp_d%nc_d)
becp_d_nc_d_ood = .false.
RETURN
END IF
! here we know that nc is allocated, check if size is 0
IF ( SIZE(becp%nc) == 0 ) THEN
print *, "Refusing to allocate 0 dimensional array becp_d%nc_d. If used, code will crash."
RETURN
END IF
!
IF (becp_d_nc_d_ood) THEN
IF ( allocated(becp_d%nc_d) .and. (SIZE(becp_d%nc_d)/=SIZE(becp%nc))) deallocate(becp_d%nc_d)
IF (.not. allocated(becp_d%nc_d)) ALLOCATE(becp_d%nc_d(DIMS3D(becp%nc))) ! MOLD does not work on all compilers
IF (intento < 2) THEN
IF ( iverbosity > 0 ) print *, "Really copied becp%nc H->D"
becp_d%nc_d = becp%nc
END IF
! ALWAYS update auxiliary variables
becp_d%comm = becp%comm
becp_d%nbnd = becp%nbnd
becp_d%nproc = becp%nproc
becp_d%mype = becp%mype
becp_d%nbnd_loc = becp%nbnd_loc
becp_d%ibnd_begin = becp%ibnd_begin
!
becp_d_nc_d_ood = .false.
ENDIF
IF (intento > 0) becp_nc_ood = .true.
#else
     CALL errore('using_becp_d%nc_d', 'Trying to use device data without device-compiled code!', 1)
#endif
END SUBROUTINE using_becp_nc_d
!
SUBROUTINE deallocate_becmod_gpu
IF( ALLOCATED( becp_d%r_d ) ) DEALLOCATE( becp_d%r_d )
IF( ALLOCATED( becp_d%k_d ) ) DEALLOCATE( becp_d%k_d )
IF( ALLOCATED( becp_d%nc_d ) ) DEALLOCATE( becp_d%nc_d )
END SUBROUTINE deallocate_becmod_gpu
!=----------------------------------------------------------------------------=!
END MODULE becmod_gpum
!=----------------------------------------------------------------------------=!

609
Modules/becmod_subs_gpu.f90 Normal file
View File

@ -0,0 +1,609 @@
!
! Copyright (C) 2001-2007 PWSCF group
! This file is distributed under the terms of the
! GNU General Public License. See the file `License'
! in the root directory of the present distribution,
! or http://www.gnu.org/copyleft/gpl.txt .
!
!----------------------------------------------------------------------------
!
MODULE becmod_subs_gpum
! NOTA BENE : THE SUBROUTINES IN THIS FILE ARE ONLY PARTIALLY TESTED!
!
! ... *bec* contain <beta|psi> - used in h_psi, s_psi, many other places
! ... calbec( npw, beta, psi, betapsi [, nbnd ] ) is an interface calculating
! ... betapsi(i,j) = <beta(i)|psi(j)> (the sum is over npw components)
! ... or betapsi(i,s,j)= <beta(i)|psi(s,j)> (s=polarization index)
!
USE kinds, ONLY : DP
USE control_flags, ONLY : gamma_only, smallmem
USE gvect, ONLY : gstart
USE noncollin_module, ONLY : noncolin, npol
USE becmod_gpum, ONLY : bec_type_d
!
SAVE
!
PRIVATE
!
INTERFACE calbec_gpu
!
MODULE PROCEDURE calbec_k_gpu, calbec_gamma_gpu, calbec_gamma_nocomm_gpu, calbec_nc_gpu, calbec_bec_type_gpu
!
END INTERFACE
INTERFACE becscal_gpu
!
MODULE PROCEDURE becscal_nck_gpu, becscal_gamma_gpu
!
END INTERFACE
!
PUBLIC :: allocate_bec_type_gpu, deallocate_bec_type_gpu, calbec_gpu, &
beccopy_gpu, becscal_gpu, is_allocated_bec_type_gpu, &
synchronize_bec_type_gpu, &
using_becp_auto, using_becp_d_auto
!
CONTAINS
!-----------------------------------------------------------------------
SUBROUTINE calbec_bec_type_gpu ( npw, beta_d, psi_d, betapsi_d, nbnd )
!-----------------------------------------------------------------------
    !
USE mp_bands, ONLY: intra_bgrp_comm
USE mp, ONLY: mp_get_comm_null
!
IMPLICIT NONE
COMPLEX (DP), INTENT (in) :: beta_d(:,:), psi_d(:,:)
TYPE (bec_type_d), TARGET, INTENT (inout) :: betapsi_d ! NB: must be INOUT otherwise
                                                            ! the allocated array is lost
INTEGER, INTENT (in) :: npw
INTEGER, OPTIONAL :: nbnd
#if defined(__CUDA)
attributes(DEVICE) :: beta_d, psi_d
#endif
!
INTEGER :: local_nbnd
INTEGER, EXTERNAL :: ldim_block, gind_block
INTEGER :: m_loc, m_begin, ip
REAL(DP), ALLOCATABLE :: dtmp_d(:,:) ! replace this with buffers !
INTEGER :: i, j, nkb
REAL(DP), POINTER :: betapsi_d_r_d(:,:)
#if defined(__CUDA)
attributes(DEVICE) :: dtmp_d, betapsi_d_r_d
#endif
!
IF ( present (nbnd) ) THEN
local_nbnd = nbnd
ELSE
local_nbnd = size ( psi_d, 2)
ENDIF
IF ( gamma_only ) THEN
!
IF( betapsi_d%comm == mp_get_comm_null() ) THEN
!
CALL calbec_gamma_gpu ( npw, beta_d, psi_d, betapsi_d%r_d, local_nbnd, intra_bgrp_comm )
!
ELSE
!
ALLOCATE( dtmp_d( SIZE( betapsi_d%r_d, 1 ), SIZE( betapsi_d%r_d, 2 ) ) )
!
DO ip = 0, betapsi_d%nproc - 1
m_loc = ldim_block( betapsi_d%nbnd , betapsi_d%nproc, ip )
m_begin = gind_block( 1, betapsi_d%nbnd, betapsi_d%nproc, ip )
IF( ( m_begin + m_loc - 1 ) > local_nbnd ) m_loc = local_nbnd - m_begin + 1
IF( m_loc > 0 ) THEN
CALL calbec_gamma_gpu ( npw, beta_d, psi_d(:,m_begin:m_begin+m_loc-1), dtmp_d, m_loc, betapsi_d%comm )
IF( ip == betapsi_d%mype ) THEN
nkb = SIZE( betapsi_d%r_d, 1 )
betapsi_d_r_d => betapsi_d%r_d
!$cuf kernel do(2) <<<*,*>>>
DO j=1,m_loc
DO i=1, nkb
betapsi_d_r_d(i,j) = dtmp_d(i,j)
END DO
END DO
END IF
END IF
END DO
DEALLOCATE( dtmp_d )
!
END IF
!
ELSEIF ( noncolin) THEN
!
CALL calbec_nc_gpu ( npw, beta_d, psi_d, betapsi_d%nc_d, local_nbnd )
!
ELSE
!
CALL calbec_k_gpu ( npw, beta_d, psi_d, betapsi_d%k_d, local_nbnd )
!
ENDIF
!
RETURN
!
END SUBROUTINE calbec_bec_type_gpu
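    !-----------------------------------------------------------------------
    ! A hedged usage sketch of the calbec_gpu generic interface; the routine
    ! name is illustrative, not part of the original file. It assumes beta_d
    ! and psi_d were filled on the device by the caller and that betapsi_d
    ! was allocated with allocate_bec_type_gpu, so that r_d/k_d/nc_d is
    ! consistent with gamma_only and noncolin.
    SUBROUTINE example_calbec_gpu( npw, beta_d, psi_d, betapsi_d )
      IMPLICIT NONE
      INTEGER, INTENT(IN) :: npw
      COMPLEX(DP), INTENT(IN) :: beta_d(:,:), psi_d(:,:)
      TYPE(bec_type_d), TARGET, INTENT(INOUT) :: betapsi_d
#if defined(__CUDA)
      attributes(DEVICE) :: beta_d, psi_d
#endif
      ! the generic name dispatches on the type of betapsi_d and on
      ! gamma_only / noncolin, exactly as calbec_bec_type_gpu above
      CALL calbec_gpu( npw, beta_d, psi_d, betapsi_d )
    END SUBROUTINE example_calbec_gpu
    !-----------------------------------------------------------------------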
!-----------------------------------------------------------------------
SUBROUTINE calbec_gamma_nocomm_gpu ( npw, beta_d, psi_d, betapsi_d, nbnd )
!-----------------------------------------------------------------------
USE mp_bands, ONLY: intra_bgrp_comm
IMPLICIT NONE
COMPLEX (DP), INTENT (in) :: beta_d(:,:), psi_d(:,:)
REAL (DP), INTENT (out) :: betapsi_d(:,:)
INTEGER, INTENT (in) :: npw
INTEGER, OPTIONAL :: nbnd
INTEGER :: m
#if defined(__CUDA)
attributes(DEVICE) :: beta_d, psi_d, betapsi_d
#endif
IF ( present (nbnd) ) THEN
m = nbnd
ELSE
m = size ( psi_d, 2)
ENDIF
CALL calbec_gamma_gpu ( npw, beta_d, psi_d, betapsi_d, m, intra_bgrp_comm )
RETURN
!
END SUBROUTINE calbec_gamma_nocomm_gpu
!-----------------------------------------------------------------------
SUBROUTINE calbec_gamma_gpu ( npw, beta_d, psi_d, betapsi_d, nbnd, comm )
!-----------------------------------------------------------------------
!
! ... matrix times matrix with summation index (k=1,npw) running on
! ... half of the G-vectors or PWs - assuming k=0 is the G=0 component:
    ! ... betapsi(i,j) = 2Re(\sum_k beta^*(i,k)psi(k,j)) + beta^*(i,0)psi(0,j)
    ! ... (a host-side reference sketch follows this subroutine)
!
USE mp, ONLY : mp_sum, mp_size
#if defined(__CUDA)
USE cudafor
USE cublas
#endif
IMPLICIT NONE
COMPLEX (DP), INTENT (in) :: beta_d(:,:), psi_d(:,:)
REAL (DP), INTENT (out) :: betapsi_d(:,:)
INTEGER, INTENT (in) :: npw
INTEGER, INTENT (in) :: nbnd
INTEGER, INTENT (in) :: comm
!
#if defined(__CUDA)
attributes(DEVICE) :: beta_d, psi_d, betapsi_d
#endif
INTEGER :: nkb, npwx, m
INTEGER :: i,j
!
m = nbnd
!
nkb = size (beta_d, 2)
IF ( nkb == 0 ) RETURN
!
CALL start_clock( 'calbec' )
IF ( npw == 0 ) betapsi_d(:,:)=0.0_DP
npwx= size (beta_d, 1)
IF ( npwx /= size (psi_d, 1) ) CALL errore ('calbec', 'size mismatch', 1)
IF ( npwx < npw ) CALL errore ('calbec', 'size mismatch', 2)
#if defined(DEBUG)
WRITE (*,*) 'calbec gamma'
WRITE (*,*) nkb, size (betapsi_d,1) , m , size (betapsi_d, 2)
#endif
IF ( nkb /= size (betapsi_d,1) .or. m > size (betapsi_d, 2) ) &
CALL errore ('calbec', 'size mismatch', 3)
!
IF ( m == 1 ) THEN
!
CALL cudaDGEMV( 'C', 2*npw, nkb, 2.0_DP, beta_d, 2*npwx, psi_d, 1, 0.0_DP, &
betapsi_d, 1 )
IF ( gstart == 2 ) THEN
!betapsi_d(:,1) = betapsi_d(:,1) - beta_d(1,:)*psi_d(1,1)
!$cuf kernel do(1) <<<*,*>>>
DO i=1, nkb
betapsi_d(i,1) = betapsi_d(i,1) - DBLE(beta_d(1,i)*psi_d(1,1))
END DO
END IF
!
ELSE
!
CALL DGEMM( 'C', 'N', nkb, m, 2*npw, 2.0_DP, beta_d, 2*npwx, psi_d, &
2*npwx, 0.0_DP, betapsi_d, nkb )
IF ( gstart == 2 ) &
CALL cudaDGER( nkb, m, -1.0_DP, beta_d, 2*npwx, psi_d, 2*npwx, betapsi_d, nkb )
!
ENDIF
!
IF (mp_size(comm) > 1) CALL mp_sum( betapsi_d( :, 1:m ), comm )
!
CALL stop_clock( 'calbec' )
!
RETURN
!
END SUBROUTINE calbec_gamma_gpu
!
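    !-----------------------------------------------------------------------
    ! A hedged, host-side reference for the formula documented in
    ! calbec_gamma_gpu; illustrative, not part of the original file. DGEMM
    ! accumulates 2*Re(beta^H psi) over the stored PWs and, when gstart == 2,
    ! cudaDGER removes the double-counted real G = 0 (k = 1) term once.
    SUBROUTINE calbec_gamma_reference( npw, beta, psi, betapsi )
      IMPLICIT NONE
      INTEGER, INTENT(IN) :: npw
      COMPLEX(DP), INTENT(IN) :: beta(:,:), psi(:,:)
      REAL(DP), INTENT(OUT) :: betapsi(:,:)
      INTEGER :: i, j, k
      DO j = 1, SIZE(betapsi,2)
         DO i = 1, SIZE(betapsi,1)
            betapsi(i,j) = 0.0_DP
            DO k = 1, npw
               betapsi(i,j) = betapsi(i,j) + 2.0_DP*DBLE( CONJG(beta(k,i))*psi(k,j) )
            END DO
            IF ( gstart == 2 ) &
               betapsi(i,j) = betapsi(i,j) - DBLE( CONJG(beta(1,i))*psi(1,j) )
         END DO
      END DO
    END SUBROUTINE calbec_gamma_reference
    !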
!-----------------------------------------------------------------------
SUBROUTINE calbec_k_gpu ( npw, beta_d, psi_d, betapsi_d, nbnd )
!-----------------------------------------------------------------------
!
! ... matrix times matrix with summation index (k=1,npw) running on
! ... G-vectors or PWs : betapsi(i,j) = \sum_k beta^*(i,k) psi(k,j)
!
USE mp_bands, ONLY : intra_bgrp_comm
USE mp, ONLY : mp_sum, mp_size
#if defined(__CUDA)
USE cudafor
USE cublas
#endif
IMPLICIT NONE
COMPLEX (DP), INTENT (in) :: beta_d(:,:), psi_d(:,:)
COMPLEX (DP), INTENT (out) :: betapsi_d(:,:)
INTEGER, INTENT (in) :: npw
INTEGER, OPTIONAL :: nbnd
!
INTEGER :: nkb, npwx, m
!
#if defined(__CUDA)
attributes(device) :: beta_d, psi_d, betapsi_d
#endif
nkb = size (beta_d, 2)
IF ( nkb == 0 ) RETURN
!
CALL start_clock( 'calbec' )
IF ( npw == 0 ) betapsi_d(:,:)=(0.0_DP,0.0_DP)
npwx= size (beta_d, 1)
IF ( npwx /= size (psi_d, 1) ) CALL errore ('calbec', 'size mismatch', 1)
IF ( npwx < npw ) CALL errore ('calbec', 'size mismatch', 2)
IF ( present (nbnd) ) THEN
m = nbnd
ELSE
m = size ( psi_d, 2)
ENDIF
#if defined(DEBUG)
WRITE (*,*) 'calbec k'
WRITE (*,*) nkb, size (betapsi_d,1) , m , size (betapsi_d, 2)
#endif
IF ( nkb /= size (betapsi_d,1) .or. m > size (betapsi_d, 2) ) &
CALL errore ('calbec', 'size mismatch', 3)
!
IF ( m == 1 ) THEN
!
CALL ZGEMV( 'C', npw, nkb, (1.0_DP,0.0_DP), beta_d, npwx, psi_d, 1, &
(0.0_DP, 0.0_DP), betapsi_d, 1 )
!
ELSE
!
CALL ZGEMM( 'C', 'N', nkb, m, npw, (1.0_DP,0.0_DP), &
beta_d, npwx, psi_d, npwx, (0.0_DP,0.0_DP), betapsi_d, nkb )
!
ENDIF
!
IF (mp_size(intra_bgrp_comm) > 1) CALL mp_sum( betapsi_d( :, 1:m ), intra_bgrp_comm )
!
CALL stop_clock( 'calbec' )
!
RETURN
!
END SUBROUTINE calbec_k_gpu
!
!-----------------------------------------------------------------------
SUBROUTINE calbec_nc_gpu ( npw, beta_d, psi_d, betapsi_d, nbnd )
!-----------------------------------------------------------------------
!
! ... matrix times matrix with summation index (k below) running on
! ... G-vectors or PWs corresponding to two different polarizations:
! ... betapsi(i,1,j) = \sum_k=1,npw beta^*(i,k) psi(k,j)
    ! ... betapsi(i,2,j) = \sum_k=1,npw beta^*(i,k) psi(k+npwx,j)
    ! ... (a host-side reference sketch follows this subroutine)
!
USE mp_bands, ONLY : intra_bgrp_comm
USE mp, ONLY : mp_sum, mp_size
#if defined(__CUDA)
USE cudafor
USE cublas
#endif
IMPLICIT NONE
COMPLEX (DP), INTENT (in) :: beta_d(:,:), psi_d(:,:)
COMPLEX (DP), INTENT (out) :: betapsi_d(:,:,:)
INTEGER, INTENT (in) :: npw
INTEGER, OPTIONAL :: nbnd
!
INTEGER :: nkb, npwx, npol, m
!
#if defined(__CUDA)
attributes(device) :: beta_d, psi_d, betapsi_d
#endif
nkb = size (beta_d, 2)
IF ( nkb == 0 ) RETURN
!
CALL start_clock ('calbec')
IF ( npw == 0 ) betapsi_d(:,:,:)=(0.0_DP,0.0_DP)
npwx= size (beta_d, 1)
IF ( 2*npwx /= size (psi_d, 1) ) CALL errore ('calbec', 'size mismatch', 1)
IF ( npwx < npw ) CALL errore ('calbec', 'size mismatch', 2)
IF ( present (nbnd) ) THEN
m = nbnd
ELSE
m = size ( psi_d, 2)
ENDIF
npol= size (betapsi_d, 2)
#if defined(DEBUG)
WRITE (*,*) 'calbec nc'
WRITE (*,*) nkb, size (betapsi_d,1) , m , size (betapsi_d, 3)
#endif
IF ( nkb /= size (betapsi_d,1) .or. m > size (betapsi_d, 3) ) &
CALL errore ('calbec', 'size mismatch', 3)
!
CALL ZGEMM ('C', 'N', nkb, m*npol, npw, (1.0_DP, 0.0_DP), beta_d, &
npwx, psi_d, npwx, (0.0_DP, 0.0_DP), betapsi_d, nkb)
!
IF (mp_size(intra_bgrp_comm) > 1) CALL mp_sum( betapsi_d( :, :, 1:m ), intra_bgrp_comm )
!
CALL stop_clock( 'calbec' )
!
RETURN
!
END SUBROUTINE calbec_nc_gpu
!
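    !-----------------------------------------------------------------------
    ! A hedged reference for the noncollinear layout documented in
    ! calbec_nc_gpu; illustrative, not part of the original file. psi stores
    ! the two spin components stacked along the first dimension, so column j
    ! holds psi(1:npw,j) and psi(npwx+1:npwx+npw,j), and a single ZGEMM with
    ! m*npol columns produces betapsi(:,1:npol,1:m).
    SUBROUTINE calbec_nc_reference( npw, npwx, beta, psi, betapsi )
      IMPLICIT NONE
      INTEGER, INTENT(IN) :: npw, npwx
      COMPLEX(DP), INTENT(IN)  :: beta(:,:), psi(:,:)
      COMPLEX(DP), INTENT(OUT) :: betapsi(:,:,:)
      INTEGER :: i, s, j, k
      DO j = 1, SIZE(betapsi,3)
         DO s = 1, SIZE(betapsi,2)
            DO i = 1, SIZE(betapsi,1)
               betapsi(i,s,j) = (0.0_DP, 0.0_DP)
               DO k = 1, npw
                  betapsi(i,s,j) = betapsi(i,s,j) + CONJG(beta(k,i)) * psi((s-1)*npwx+k, j)
               END DO
            END DO
         END DO
      END DO
    END SUBROUTINE calbec_nc_reference
    !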
!
!-----------------------------------------------------------------------
FUNCTION is_allocated_bec_type_gpu (bec_d) RESULT (isalloc)
!-----------------------------------------------------------------------
IMPLICIT NONE
TYPE (bec_type_d) :: bec_d
LOGICAL :: isalloc
isalloc = (allocated(bec_d%r_d) .or. allocated(bec_d%nc_d) .or. allocated(bec_d%k_d))
RETURN
!
!-----------------------------------------------------------------------
END FUNCTION is_allocated_bec_type_gpu
!-----------------------------------------------------------------------
!
!-----------------------------------------------------------------------
SUBROUTINE allocate_bec_type_gpu ( nkb, nbnd, bec_d, comm )
!-----------------------------------------------------------------------
USE mp, ONLY: mp_size, mp_rank, mp_get_comm_null
USE device_memcpy_m, ONLY : dev_memset
IMPLICIT NONE
TYPE (bec_type_d) :: bec_d
INTEGER, INTENT (in) :: nkb, nbnd
INTEGER, INTENT (in), OPTIONAL :: comm
INTEGER :: ierr, nbnd_siz
INTEGER, EXTERNAL :: ldim_block, gind_block
!
nbnd_siz = nbnd
bec_d%comm = mp_get_comm_null()
bec_d%nbnd = nbnd
bec_d%mype = 0
bec_d%nproc = 1
bec_d%nbnd_loc = nbnd
bec_d%ibnd_begin = 1
!
IF( PRESENT( comm ) .AND. gamma_only .AND. smallmem ) THEN
bec_d%comm = comm
bec_d%nproc = mp_size( comm )
IF( bec_d%nproc > 1 ) THEN
nbnd_siz = nbnd / bec_d%nproc
IF( MOD( nbnd, bec_d%nproc ) /= 0 ) nbnd_siz = nbnd_siz + 1
bec_d%mype = mp_rank( bec_d%comm )
bec_d%nbnd_loc = ldim_block( bec_d%nbnd , bec_d%nproc, bec_d%mype )
bec_d%ibnd_begin = gind_block( 1, bec_d%nbnd, bec_d%nproc, bec_d%mype )
END IF
END IF
!
IF ( gamma_only ) THEN
!
ALLOCATE( bec_d%r_d( nkb, nbnd_siz ), STAT=ierr )
IF( ierr /= 0 ) &
CALL errore( ' allocate_bec_type ', ' cannot allocate bec_d%r ', ABS(ierr) )
!
CALL dev_memset(bec_d%r_d, 0.0D0, (/1,nkb/), 1, (/1, nbnd_siz/), 1)
!
ELSEIF ( noncolin) THEN
!
ALLOCATE( bec_d%nc_d( nkb, npol, nbnd_siz ), STAT=ierr )
IF( ierr /= 0 ) &
CALL errore( ' allocate_bec_type ', ' cannot allocate bec_d%nc ', ABS(ierr) )
!
CALL dev_memset(bec_d%nc_d, (0.0D0,0.0D0), (/1, nkb/), 1, (/1, npol/), 1, (/1, nbnd_siz/), 1)
!
ELSE
!
ALLOCATE( bec_d%k_d( nkb, nbnd_siz ), STAT=ierr )
IF( ierr /= 0 ) &
CALL errore( ' allocate_bec_type ', ' cannot allocate bec_d%k ', ABS(ierr) )
!
       CALL dev_memset(bec_d%k_d, (0.0D0,0.0D0), (/1, nkb/), 1, (/1, nbnd_siz/), 1)
!
ENDIF
!
RETURN
!
END SUBROUTINE allocate_bec_type_gpu
!-----------------------------------------------------------------------
!
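    ! A minimal, hypothetical lifetime sketch; the routine name is
    ! illustrative, not part of the original file. nkb and nbnd are assumed
    ! to come from the caller's basis and band setup.
    SUBROUTINE example_bec_type_gpu_lifetime( nkb, nbnd )
      IMPLICIT NONE
      INTEGER, INTENT(IN) :: nkb, nbnd
      TYPE(bec_type_d) :: bec_d
      CALL allocate_bec_type_gpu( nkb, nbnd, bec_d )  ! device array zero-initialized
      ! ... fill bec_d with calbec_gpu and consume it on the device ...
      CALL deallocate_bec_type_gpu( bec_d )
    END SUBROUTINE example_bec_type_gpu_lifetime
    !-----------------------------------------------------------------------
    !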
!-----------------------------------------------------------------------
SUBROUTINE deallocate_bec_type_gpu (bec_d)
!-----------------------------------------------------------------------
!
USE mp, ONLY: mp_get_comm_null
IMPLICIT NONE
TYPE (bec_type_d) :: bec_d
!
bec_d%comm = mp_get_comm_null()
bec_d%nbnd = 0
!
IF (allocated(bec_d%r_d)) DEALLOCATE(bec_d%r_d)
IF (allocated(bec_d%nc_d)) DEALLOCATE(bec_d%nc_d)
IF (allocated(bec_d%k_d)) DEALLOCATE(bec_d%k_d)
!
RETURN
!
END SUBROUTINE deallocate_bec_type_gpu
!-----------------------------------------------------------------------
!
!-----------------------------------------------------------------------
SUBROUTINE synchronize_bec_type_gpu (bec_d, bec, what)
!-----------------------------------------------------------------------
!
! ... Updates a device or host version of a bec_type variable.
! ... Direction 'h' updates host version, direction 'd' the device
    ! ... version. A hedged usage sketch follows this subroutine.
!
USE mp, ONLY: mp_get_comm_null
USE becmod, ONLY : bec_type
IMPLICIT NONE
TYPE (bec_type_d) :: bec_d
TYPE (bec_type) :: bec
CHARACTER, INTENT(IN) :: what
!
IF ( gamma_only ) THEN
!
IF (.not. (allocated(bec_d%r_d) .and. allocated(bec%r))) &
CALL errore('becmod_gpu', 'Unallocated array',1)
SELECT CASE(what)
CASE('d')
bec_d%r_d = bec%r
CASE('h')
bec%r = bec_d%r_d
CASE DEFAULT
CALL errore('becmod_gpu', 'Invalid command',2)
END SELECT
!
ELSEIF ( noncolin) THEN
!
IF (.not. (allocated(bec_d%nc_d) .and. allocated(bec%nc))) &
CALL errore('becmod_gpu', 'Unallocated array',3)
SELECT CASE(what)
CASE('d')
bec_d%nc_d = bec%nc
CASE('h')
bec%nc = bec_d%nc_d
CASE DEFAULT
CALL errore('becmod_gpu', 'Invalid command',4)
END SELECT
!
ELSE
!
IF (.not. (allocated(bec_d%k_d) .and. allocated(bec%k))) &
CALL errore('becmod_gpu', 'Unallocated array',5)
SELECT CASE(what)
CASE('d')
bec_d%k_d = bec%k
CASE('h')
bec%k = bec_d%k_d
CASE DEFAULT
CALL errore('becmod_gpu', 'Invalid command',6)
END SELECT
!
ENDIF
!
RETURN
!
END SUBROUTINE synchronize_bec_type_gpu
!-----------------------------------------------------------------------
!
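    ! A hedged usage sketch for synchronize_bec_type_gpu; illustrative, not
    ! part of the original file. It assumes bec and bec_d were allocated
    ! consistently (same shape, same gamma_only/noncolin branch).
    SUBROUTINE example_synchronize_bec( bec_d, bec )
      USE becmod, ONLY : bec_type
      IMPLICIT NONE
      TYPE(bec_type_d) :: bec_d
      TYPE(bec_type)   :: bec
      CALL synchronize_bec_type_gpu( bec_d, bec, 'h' )  ! device -> host
      ! ... host-side work on bec ...
      CALL synchronize_bec_type_gpu( bec_d, bec, 'd' )  ! host -> device
    END SUBROUTINE example_synchronize_bec
    !-----------------------------------------------------------------------
    !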
!-----------------------------------------------------------------------
SUBROUTINE beccopy_gpu(bec, bec1, nkb, nbnd)
#if defined(__CUDA)
USE cudafor
USE cublas
#endif
IMPLICIT NONE
TYPE(bec_type_d), INTENT(in) :: bec
TYPE(bec_type_d) :: bec1
INTEGER, INTENT(in) :: nkb, nbnd
IF (gamma_only) THEN
CALL dcopy(nkb*nbnd, bec%r_d, 1, bec1%r_d, 1)
ELSEIF (noncolin) THEN
CALL zcopy(nkb*npol*nbnd, bec%nc_d, 1, bec1%nc_d, 1)
ELSE
CALL zcopy(nkb*nbnd, bec%k_d, 1, bec1%k_d, 1)
ENDIF
RETURN
END SUBROUTINE beccopy_gpu
SUBROUTINE becscal_nck_gpu(alpha, bec_d, nkb, nbnd)
#if defined(__CUDA)
USE cudafor
USE cublas
#endif
IMPLICIT NONE
TYPE(bec_type_d), INTENT(INOUT) :: bec_d
COMPLEX(DP), INTENT(IN) :: alpha
INTEGER, INTENT(IN) :: nkb, nbnd
IF (gamma_only) THEN
CALL errore('becscal_nck','called in the wrong case',1)
ELSEIF (noncolin) THEN
CALL zscal(nkb*npol*nbnd, alpha, bec_d%nc_d, 1)
ELSE
CALL zscal(nkb*nbnd, alpha, bec_d%k_d, 1)
ENDIF
RETURN
END SUBROUTINE becscal_nck_gpu
SUBROUTINE becscal_gamma_gpu(alpha, bec_d, nkb, nbnd)
#if defined(__CUDA)
USE cudafor
USE cublas
#endif
IMPLICIT NONE
TYPE(bec_type_d), INTENT(INOUT) :: bec_d
REAL(DP), INTENT(IN) :: alpha
INTEGER, INTENT(IN) :: nkb, nbnd
IF (gamma_only) THEN
CALL dscal(nkb*nbnd, alpha, bec_d%r_d, 1)
ELSE
CALL errore('becscal_gamma','called in the wrong case',1)
ENDIF
RETURN
END SUBROUTINE becscal_gamma_gpu
!
SUBROUTINE using_becp_auto(intento)
USE becmod_gpum, ONLY : using_becp_r
USE becmod_gpum, ONLY : using_becp_k
USE becmod_gpum, ONLY : using_becp_nc
IMPLICIT NONE
INTEGER, INTENT(IN) :: intento
!
!
IF ( gamma_only ) THEN
!
CALL using_becp_r(intento)
!
ELSEIF ( noncolin) THEN
!
CALL using_becp_nc(intento)
!
ELSE
!
CALL using_becp_k(intento)
!
ENDIF
END SUBROUTINE using_becp_auto
!
SUBROUTINE using_becp_d_auto(intento)
USE becmod_gpum, ONLY : using_becp_r_d
USE becmod_gpum, ONLY : using_becp_k_d
USE becmod_gpum, ONLY : using_becp_nc_d
IMPLICIT NONE
INTEGER, INTENT(IN) :: intento
!
!
IF ( gamma_only ) THEN
!
CALL using_becp_r_d(intento)
!
ELSEIF ( noncolin) THEN
!
CALL using_becp_nc_d(intento)
!
ELSE
!
CALL using_becp_k_d(intento)
!
ENDIF
END SUBROUTINE using_becp_d_auto
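  !
  ! A minimal, hypothetical sketch of the intended call pattern for the two
  ! wrappers above; the routine name is illustrative, not part of the
  ! original module.
  SUBROUTINE example_becp_auto_pattern()
    IMPLICIT NONE
    CALL using_becp_auto(1)    ! host copy inout: sync D->H, device copy marked stale
    ! ... modify becp on the host here ...
    CALL using_becp_d_auto(0)  ! device copy in: refresh H->D before device use
  END SUBROUTINE example_becp_auto_pattern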
END MODULE becmod_subs_gpum

View File

@ -38,6 +38,7 @@ MODULE command_line_options
INTEGER :: nargs = 0
! ... QE arguments read from command line
INTEGER :: nimage_= 1, npool_= 1, ndiag_ = 0, nband_= 1, ntg_= 1, nyfft_ = 1, nmany_ = 1
LOGICAL :: pencil_decomposition_ = .false.
! ... Indicate if using library init
LOGICAL :: library_init = .FALSE.
! ... input file name read from command line
@ -125,6 +126,14 @@ CONTAINS
READ ( arg, *, ERR = 15, END = 15) ntg_ ! read the argument as ntg_
nyfft_ = ntg_ ! set nyfft_ equal to ntg_
narg = narg + 1
         CASE ( '-pd', '-use_pd', '-pencil_decomposition', '-use_pencil_decomposition' )
IF (read_string) THEN
CALL my_getarg ( input_command_line, narg, arg )
ELSE
CALL get_command_argument ( narg, arg )
ENDIF
READ ( arg, *, ERR = 15, END = 15) pencil_decomposition_
narg = narg + 1
CASE ( '-nb', '-nband', '-nbgrp', '-nband_group')
IF (read_string) THEN
CALL my_getarg ( input_command_line, narg, arg )
@ -170,6 +179,7 @@ CONTAINS
CALL mp_bcast( nyfft_ , root, world_comm )
CALL mp_bcast( nband_ , root, world_comm )
CALL mp_bcast( ndiag_ , root, world_comm )
CALL mp_bcast( pencil_decomposition_ , root, world_comm )
END SUBROUTINE get_command_line
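   ! A sketch of the accepted syntax (the executable name is illustrative):
   !     pw.x -pd .true.
   ! READ(arg,*) parses the logical value, and the mp_bcast in
   ! get_command_line propagates pencil_decomposition_ to all ranks.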
!
@ -216,17 +226,18 @@ CONTAINS
END SUBROUTINE my_getarg
SUBROUTINE set_command_line ( nimage, npool, ntg, nmany, nyfft, nband, ndiag)
SUBROUTINE set_command_line ( nimage, npool, ntg, nmany, nyfft, nband, ndiag, pencil_decomposition)
! directly set command line options without going through the command line
IMPLICIT NONE
INTEGER, INTENT(IN), OPTIONAL :: nimage, npool, ntg, nmany, nyfft, nband, ndiag
   INTEGER, INTENT(IN), OPTIONAL :: nimage, npool, ntg, nmany, nyfft, nband, ndiag
   LOGICAL, INTENT(IN), OPTIONAL :: pencil_decomposition
!
IF ( PRESENT(nimage) ) nimage_ = nimage
IF ( PRESENT(npool) ) npool_ = npool
IF ( PRESENT(nyfft) ) nyfft_ = nyfft
IF ( PRESENT(nband) ) nband_ = nband
IF ( PRESENT(ndiag) ) ndiag_ = ndiag
IF ( PRESENT(pencil_decomposition) ) pencil_decomposition_ = pencil_decomposition
IF ( PRESENT(ntg) .and. PRESENT(nmany) ) THEN
! ERROR!!!!
ELSEIF ( PRESENT(ntg) ) THEN

View File

@ -250,6 +250,12 @@ MODULE control_flags
LOGICAL, PUBLIC :: &
do_makov_payne = .FALSE. ! if .TRUE. makov-payne correction for isolated
! system is used
LOGICAL, PUBLIC :: &
use_gpu = .FALSE. ! if .TRUE. selects the accelerated version of the subroutines
! when available
INTEGER, PUBLIC :: &
many_fft = 16 ! the size of FFT batches in vloc_psi and
                             ! sum_band. Only used in accelerated subroutines.
!
INTEGER :: ortho_max = 0 ! maximum number of iterations in routine ortho
REAL(DP) :: ortho_eps = 0.0_DP ! threshold for convergence in routine ortho
@ -276,6 +282,7 @@ MODULE control_flags
LOGICAL, PUBLIC :: treinit_gvecs = .FALSE.
LOGICAL, PUBLIC :: diagonalize_on_host = .FALSE.
!
! ... end of module-scope declarations
!

View File

@ -0,0 +1,39 @@
!----------------------------------------------
! ... this file contains a number of subroutines optionally interfaced
! ... to cublas
!----------------------------------------------
SUBROUTINE cudaDGEMV(TRANS,M,N,ALPHA,A,LDA,X,INCX,BETA,Y,INCY)
#if defined(__CUDA)
use cudafor
use cublas
#endif
implicit none
DOUBLE PRECISION :: ALPHA,BETA
INTEGER :: INCX,INCY,LDA,M,N
CHARACTER :: TRANS
DOUBLE PRECISION :: A(LDA,*),X(*),Y(*)
#if defined(__CUDA)
attributes(device) :: A, X, Y
#endif
!
call DGEMV(TRANS,M,N,ALPHA,A,LDA,X,INCX,BETA,Y,INCY)
!
END SUBROUTINE cudaDGEMV
SUBROUTINE cudaDGER ( M, N, ALPHA, X, INCX, Y, INCY, A, LDA )
#if defined(__CUDA)
use cudafor
use cublas
#endif
! .. Scalar Arguments ..
DOUBLE PRECISION :: ALPHA
INTEGER :: INCX, INCY, LDA, M, N
! .. Array Arguments ..
DOUBLE PRECISION :: A( LDA, * ), X( * ), Y( * )
#if defined(__CUDA)
attributes(device) :: A, X, Y
#endif
CALL DGER ( M, N, ALPHA, X, INCX, Y, INCY, A, LDA )
END SUBROUTINE cudaDGER
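!
! A hedged usage sketch for the wrappers above; illustrative, not part of the
! original file. Without __CUDA the arrays live on the host and the call
! reduces to plain DGEMV; with __CUDA the caller's arrays would carry the
! device attribute, as in the wrapper interfaces.
SUBROUTINE example_cudaDGEMV
  implicit none
  DOUBLE PRECISION :: A(4,3), X(4), Y(3)
  A = 1.0D0 ; X = 1.0D0 ; Y = 0.0D0
  ! Y = 2 * A^T * X  ->  every Y(i) = 8
  CALL cudaDGEMV( 'T', 4, 3, 2.0D0, A, 4, X, 1, 0.0D0, Y, 1 )
END SUBROUTINE example_cudaDGEMV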

111
Modules/dylmr2_gpu.f90 Normal file
View File

@ -0,0 +1,111 @@
!
! Copyright (C) 2001 PWSCF group
! This file is distributed under the terms of the
! GNU General Public License. See the file `License'
! in the root directory of the present distribution,
! or http://www.gnu.org/copyleft/gpl.txt .
!
!-----------------------------------------------------------------------
SUBROUTINE dylmr2_gpu( nylm, ngy, g_d, gg_d, dylm_d, ipol )
!-----------------------------------------------------------------------
  !! Compute \( \partial Y_{lm}(G) / \partial G_{ipol} \)
!! using simple numerical derivation (SdG).
!! The spherical harmonics are calculated in ylmr2.
!
USE kinds, ONLY : DP
!
IMPLICIT NONE
!
INTEGER, INTENT(IN) :: nylm
! input: number of spherical harmonics
INTEGER, INTENT(IN) :: ngy
! input: the number of g vectors to compute
INTEGER, INTENT(IN) :: ipol
! input: desired polarization
REAL(DP), INTENT(IN) :: g_d(3,ngy)
!! input: the coordinates of g vectors
REAL(DP), INTENT(IN) :: gg_d(ngy)
!! input: the moduli of g vectors
REAL(DP), INTENT(OUT) :: dylm_d(ngy,nylm)
!! output: the spherical harmonics derivatives
!
! ... local variables
!
INTEGER :: ig, lm, i
! counter on g vectors
! counter on l,m component
!
INTEGER :: apol, bpol
!
REAL(DP), PARAMETER :: delta = 1.E-6_DP
REAL(DP), ALLOCATABLE :: dg_d(:), dgi_d(:), gx_d(:,:)
REAL(DP), ALLOCATABLE :: ggx_d(:), ylmaux_d(:,:)
! dg is the finite increment for numerical derivation:
! dg = delta |G| = delta * sqrt(gg)
! dgi= 1 /(delta * sqrt(gg))
! gx = g +/- dg
! ggx = gx^2
!
#if defined(__CUDA)
attributes(DEVICE) :: g_d, gg_d, dylm_d, gx_d, ggx_d, dg_d, &
dgi_d, ylmaux_d
#endif
!
!
ALLOCATE( gx_d(3,ngy), ggx_d(ngy), dg_d(ngy) )
ALLOCATE( dgi_d(ngy), ylmaux_d(ngy,nylm) )
!$cuf kernel do (1) <<<*,*>>>
DO ig = 1, ngy
dg_d(ig) = delta * SQRT(gg_d(ig) )
IF (gg_d(ig) > 1.E-9_DP) THEN
dgi_d(ig) = 1._DP / dg_d(ig)
ELSE
dgi_d(ig) = 0._DP
ENDIF
ENDDO
!
IF ( ipol==1 ) THEN
apol = 2 ; bpol = 3
ELSEIF ( ipol==2 ) THEN
apol = 1 ; bpol = 3
ELSEIF ( ipol==3 ) THEN
apol = 1 ; bpol = 2
ENDIF
!
!$cuf kernel do (1) <<<*,*>>>
DO ig = 1, ngy
gx_d(apol,ig) = g_d(apol,ig)
gx_d(bpol,ig) = g_d(bpol,ig)
gx_d(ipol,ig) = g_d(ipol,ig) + dg_d(ig)
ggx_d(ig) = gx_d(1,ig) * gx_d(1,ig) + &
gx_d(2,ig) * gx_d(2,ig) + &
gx_d(3,ig) * gx_d(3,ig)
ENDDO
!
CALL ylmr2_gpu( nylm, ngy, gx_d, ggx_d, dylm_d )
!
!$cuf kernel do (1) <<<*,*>>>
DO ig = 1, ngy
gx_d(ipol,ig) = g_d(ipol,ig) - dg_d(ig)
ggx_d(ig) = gx_d(1,ig) * gx_d(1,ig) + &
gx_d(2,ig) * gx_d(2,ig) + &
gx_d(3,ig) * gx_d(3,ig)
ENDDO
!
CALL ylmr2_gpu( nylm, ngy, gx_d, ggx_d, ylmaux_d )
!
!$cuf kernel do (2) <<<*,*>>>
DO lm = 1, nylm
DO ig = 1, ngy
dylm_d(ig,lm) = (dylm_d(ig,lm)-ylmaux_d(ig,lm)) * 0.5_DP * dgi_d(ig)
ENDDO
ENDDO
!
DEALLOCATE( gx_d, ggx_d, dg_d, dgi_d, ylmaux_d )
!
RETURN
!
END SUBROUTINE dylmr2_gpu
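!
! The two ylmr2_gpu evaluations above realize the symmetric difference quotient
!
!    dY_lm/dG_ipol ~ [ Y_lm(G + dg*e_ipol) - Y_lm(G - dg*e_ipol) ] / (2*dg) ,
!
! with dg = delta*|G| and e_ipol the Cartesian unit vector selected by ipol;
! the factor 0.5_DP * dgi_d(ig) in the last kernel is precisely 1/(2*dg).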

View File

@ -10,6 +10,9 @@
!------------------------------------------------------------------------------!
USE kinds, ONLY: DP
#if defined (__CUDA)
USE cudafor
#endif
!
IMPLICIT NONE
SAVE
@ -43,9 +46,13 @@
REAL(DP) :: qbac = 0.0_DP ! background neutralizing charge
INTEGER, ALLOCATABLE :: ispin(:) ! spin of each state
REAL(DP), ALLOCATABLE :: f_bgrp(:) ! occupation numbers ( at gamma )
REAL(DP), ALLOCATABLE :: f_d(:) ! occupation numbers ( at gamma )
INTEGER, ALLOCATABLE :: ispin_bgrp(:) ! spin of each state
INTEGER, ALLOCATABLE :: ibgrp_g2l(:) ! local index of the i-th global band index
#if defined (__CUDA)
ATTRIBUTES( DEVICE ) :: f_d
#endif
!
!------------------------------------------------------------------------------!
CONTAINS
@ -397,6 +404,7 @@
IF( ALLOCATED( f ) ) DEALLOCATE( f )
IF( ALLOCATED( ispin ) ) DEALLOCATE( ispin )
IF( ALLOCATED( f_bgrp ) ) DEALLOCATE( f_bgrp )
IF( ALLOCATED( f_d ) ) DEALLOCATE( f_d )
IF( ALLOCATED( ispin_bgrp ) ) DEALLOCATE( ispin_bgrp )
IF( ALLOCATED( ibgrp_g2l ) ) DEALLOCATE( ibgrp_g2l )
telectrons_base_initval = .FALSE.
@ -455,6 +463,10 @@
END DO
END DO
#if defined (__CUDA)
ALLOCATE( f_d, SOURCE = f_bgrp )
#endif
RETURN
END SUBROUTINE distribute_bands

View File

@ -174,6 +174,8 @@ CONTAINS
& "395502 (2009);", &
&/9X,"""P. Giannozzi et al., J. Phys.:Condens. Matter 29 ",&
& "465901 (2017);", &
&/9X,"""P. Giannozzi et al., J. Chem. Phys. 152 ",&
& "154105 (2020);", &
&/9X," URL http://www.quantum-espresso.org"", ", &
&/5X,"in publications or presentations arising from this work. More details at",&
&/5x,"http://www.quantum-espresso.org/quote")' )

Some files were not shown because too many files have changed in this diff.