mirror of https://gitlab.com/QEF/q-e.git
1626 lines
50 KiB
Fortran
1626 lines
50 KiB
Fortran
!
|
|
! Copyright (C) Quantum ESPRESSO group
|
|
!
|
|
! This file is distributed under the terms of the
|
|
! GNU General Public License. See the file `License'
|
|
! in the root directory of the present distribution,
|
|
! or http://www.gnu.org/copyleft/gpl.txt .
|
|
!
|
|
! by P. Bonfa', F. Affinito and C. Cavazzoni, Cineca
|
|
! & S. de Gironcoli, SISSA
|
|
|
|
module timers
|
|
save
|
|
LOGICAL :: ignore_time = .true. ! This is used to avoid collection of initialization times
|
|
REAL*8 :: times(21) = 0.d0 ! Array hosting various timers
|
|
CONTAINS
|
|
function mpi_wall_time ()
|
|
USE fft_param
|
|
real*8 :: mpi_wall_time
|
|
#if defined(__MPI)
|
|
mpi_wall_time = MPI_WTIME()
|
|
#else
|
|
! standard way to get the wall time, sometimes with very low precision
|
|
integer :: cr, nc
|
|
real*8, save :: t0 = -1.0
|
|
!
|
|
call system_clock(count_rate=cr)
|
|
call system_clock(count=nc)
|
|
if ( t0 < 0.0 ) t0 = dble(nc)/cr
|
|
mpi_wall_time = dble(nc)/cr - t0
|
|
#endif
|
|
end function mpi_wall_time
|
|
end module
|
|
|
|
program test
|
|
!! This mini-app provides a tool for testing and benchmarking the FFT drivers
|
|
!! contained in the FFTXlib.
|
|
!!
|
|
!! The mini-app mimics the workload of vloc_psi (a charge-density transformation
|
|
!! from complex-to-real space and back and contribution to h).
|
|
!!
|
|
!! To compile the test program, once you have properly edit the make.sys file
|
|
!! included in the FFTXlib and type:
|
|
!!
|
|
!! make TEST
|
|
!!
|
|
!! N.B.: do not run the make command alone, otherwise the FFT times will
|
|
!! not be present in the final summary.
|
|
!!
|
|
!! Then you can run your FFT tests using command like:
|
|
!!
|
|
!! mpirun -np 4 ./fft_test.x -ecutwfc 80 -alat 20 -nbnd 128 -ntg 4 -gamma .true.
|
|
!!
|
|
!! or, in case of serial build
|
|
!!
|
|
!! ./fft_test.x -ecutwfc 80 -alat 20 -nbnd 128 -ntg 4
|
|
!!
|
|
!! Command line arguments:
|
|
!!
|
|
!!-ecutwfc Plane wave energy cut off
|
|
!!
|
|
!!-ecutrho Potential energy cut off (not used)
|
|
!!
|
|
!!-alat Lattice parameter
|
|
!!
|
|
!!-nbnd Number of bands (fft cycles)
|
|
!!
|
|
!!-ntg Number of task groups
|
|
!!
|
|
!!-gamma Enables gamma point trick. Should be about 2 times faster
|
|
!!
|
|
!!-pd If .true. uses pencil decomposition, otherwise uses slab decomposition
|
|
!!
|
|
!!-av1 x y z First lattice vector, in atomic units. N.B.: when using -av1, -alat is ignored!
|
|
!!
|
|
!!-av2 x y z Second lattice vector, in atomic units. N.B.: when using -av2, -alat is ignored!
|
|
!!
|
|
!!-av3 x y z Third lattice vector, in atomic units. N.B.: when using -av3, -alat is ignored!
|
|
!!
|
|
!!-kmax kx ky kz Reciprocal lattice vector inside the BZ with maximum norm. Used to calculate max(|G+K|). (2pi/a)^2 units.
|
|
!!
|
|
!! Timings of different stages of execution are provided at the end of the
|
|
!! run.
|
|
!! In the present version, a preliminar implementation with non-blocking MPI
|
|
!! calls as been implemented. This version requires the precompilation flags
|
|
!! -D__NON_BLOCKING_SCATTER
|
|
!!
|
|
USE fft_types
|
|
USE stick_base
|
|
USE fft_parallel
|
|
USE fft_support
|
|
USE fft_helper_subroutines
|
|
USE fft_interfaces, ONLY:fwfft, invfft
|
|
USE timers
|
|
!
|
|
IMPLICIT NONE
|
|
!
|
|
ENUM, BIND(C)
|
|
ENUMERATOR :: STAMP_BEGIN = 1
|
|
ENUMERATOR :: STAMP_PSI = 2
|
|
ENUMERATOR :: STAMP_INVFFT = 3
|
|
ENUMERATOR :: STAMP_VLOC = 4
|
|
ENUMERATOR :: STAMP_FWFFT = 5
|
|
ENUMERATOR :: STAMP_HPSI = 6
|
|
END ENUM
|
|
INTEGER, PARAMETER :: NUM_STAMPS = 6
|
|
!
|
|
TYPE(fft_type_descriptor) :: dfftp, dffts, dfft3d
|
|
!
|
|
TYPE(sticks_map) :: smap
|
|
INTEGER :: nx = 128
|
|
!! grid points along x (modified after)
|
|
INTEGER :: ny = 128
|
|
!! grid points along y (modified after)
|
|
INTEGER :: nz = 256
|
|
!! grid points along z (modified after)
|
|
!
|
|
INTEGER :: mype, npes, comm, root
|
|
!! MPI handles
|
|
INTEGER :: ntgs
|
|
!! number of taskgroups
|
|
INTEGER :: nbnd
|
|
!! number of bands
|
|
LOGICAL :: iope
|
|
!! I/O process
|
|
INTEGER :: ierr, i, j, ncount, ib
|
|
INTEGER :: incr=1, right_nr3
|
|
INTEGER :: ngw_, ngm_, ngs_
|
|
REAL*8 :: gcutm, gkcut, gcutms
|
|
REAL*8 :: ecutm, ecutw, ecutms
|
|
REAL*8 :: ecutrho, dual, kmax(3)
|
|
!! cut-off for density
|
|
REAL*8 :: ecutwfc
|
|
!! cut-off for the wave-function
|
|
REAL*8 :: tpiba, alat, alat_in
|
|
!! lattice parameters
|
|
REAL*8 :: time(NUM_STAMPS)
|
|
REAL*8 :: my_time(NUM_STAMPS)
|
|
REAL*8 :: time_min(NUM_STAMPS)
|
|
REAL*8 :: time_max(NUM_STAMPS)
|
|
REAL*8 :: time_avg(NUM_STAMPS)
|
|
REAL*8 :: wall
|
|
REAL*8 :: wall_avg
|
|
!
|
|
LOGICAL :: gamma_only = .false.
|
|
!! if calculations require only gamma point
|
|
LOGICAL :: use_tg
|
|
!! if calculations use task group
|
|
LOGICAL :: use_pd = .false.
|
|
!! if calculations use pencil decomposition
|
|
LOGICAL :: lpara
|
|
REAL*8 :: at(3, 3), bg(3, 3)
|
|
REAL(DP), PARAMETER :: pi = 4.0_DP * atan(1.0_DP)
|
|
!
|
|
COMPLEX(DP), ALLOCATABLE :: tg_psic(:)
|
|
COMPLEX(DP), ALLOCATABLE :: psic(:)
|
|
COMPLEX(DP), ALLOCATABLE :: psi(:, :)
|
|
!! fake argument returned by the FFT
|
|
REAL(DP), ALLOCATABLE :: v(:)
|
|
REAL(DP), ALLOCATABLE :: tg_v(:)
|
|
COMPLEX(DP), ALLOCATABLE :: hpsi(:, :)
|
|
!! array representing the potential
|
|
INTEGER :: ngms, ngsx, ngms_g
|
|
INTEGER, ALLOCATABLE :: mill(:, :), ig_l2g(:)
|
|
REAL(DP), ALLOCATABLE :: g(:, :), gg(:)
|
|
INTEGER :: ngm, ngmx, ngm_g, gstart
|
|
INTEGER :: k, group_size
|
|
INTEGER :: many_fft
|
|
!
|
|
!
|
|
integer :: nargs
|
|
CHARACTER(LEN=80) :: arg
|
|
!
|
|
#if defined(_OPENMP)
|
|
INTEGER :: PROVIDED
|
|
#endif
|
|
!
|
|
! ........
|
|
!
|
|
! default parameter (32 water molecules)
|
|
!
|
|
kmax = HUGE(0.d0) !not set
|
|
ecutwfc = 80.0d0
|
|
ecutrho = 0.d0
|
|
alat_in = 18.65
|
|
ntgs = 1
|
|
nbnd = 1
|
|
many_fft= 1
|
|
!
|
|
at(1, :) = (/0.5d0, 1.0d0, 0.0d0/)
|
|
at(2, :) = (/0.5d0, 0.0d0, 0.5d0/)
|
|
at(3, :) = (/0.0d0, 0.5d0, 1.5d0/)
|
|
!
|
|
DO i = 1, command_argument_count()
|
|
CALL getarg(i, arg)
|
|
IF ((trim(arg) == "-h") .or. (trim(arg) == "--help")) THEN
|
|
print *, "Example usage: mpirun -np 4 ./fft_test.x -ecutwfc 80 -alat 20 -nbnd 128 -ntg 4 -gamma .true."
|
|
print *, ""
|
|
print *, "For detailed information see test.f90"
|
|
STOP
|
|
END IF
|
|
END DO
|
|
|
|
nargs = command_argument_count()
|
|
do i = 1, nargs - 1
|
|
CALL get_command_argument(i, arg)
|
|
IF (TRIM(arg) == '-ecutrho') THEN
|
|
CALL get_command_argument(i + 1, arg)
|
|
READ (arg, *) ecutrho
|
|
END IF
|
|
IF (TRIM(arg) == '-ecutwfc') THEN
|
|
CALL get_command_argument(i + 1, arg)
|
|
READ (arg, *) ecutwfc
|
|
END IF
|
|
IF (TRIM(arg) == '-alat') THEN
|
|
CALL get_command_argument(i + 1, arg)
|
|
READ (arg, *) alat_in
|
|
END IF
|
|
IF (TRIM(arg) == '-av1') THEN
|
|
CALL get_command_argument(i + 1, arg)
|
|
READ (arg, *) at(1, 1)
|
|
CALL get_command_argument(i + 2, arg)
|
|
READ (arg, *) at(2, 1)
|
|
CALL get_command_argument(i + 3, arg)
|
|
READ (arg, *) at(3, 1)
|
|
alat_in = 1.0
|
|
END IF
|
|
IF (TRIM(arg) == '-av2') THEN
|
|
CALL get_command_argument(i + 1, arg)
|
|
READ (arg, *) at(1, 2)
|
|
CALL get_command_argument(i + 2, arg)
|
|
READ (arg, *) at(2, 2)
|
|
CALL get_command_argument(i + 3, arg)
|
|
READ (arg, *) at(3, 2)
|
|
alat_in = 1.0
|
|
END IF
|
|
IF (TRIM(arg) == '-av3') THEN
|
|
CALL get_command_argument(i + 1, arg)
|
|
READ (arg, *) at(1, 3)
|
|
CALL get_command_argument(i + 2, arg)
|
|
READ (arg, *) at(2, 3)
|
|
CALL get_command_argument(i + 3, arg)
|
|
READ (arg, *) at(3, 3)
|
|
alat_in = 1.0
|
|
END IF
|
|
IF (TRIM(arg) == '-kmax') THEN
|
|
CALL get_command_argument(i + 1, arg)
|
|
READ (arg, *) kmax(1)
|
|
CALL get_command_argument(i + 2, arg)
|
|
READ (arg, *) kmax(2)
|
|
CALL get_command_argument(i + 3, arg)
|
|
READ (arg, *) kmax(3)
|
|
alat_in = 1.0
|
|
END IF
|
|
IF (TRIM(arg) == '-ntg') THEN
|
|
CALL get_command_argument(i + 1, arg)
|
|
READ (arg, *) ntgs
|
|
END IF
|
|
IF (TRIM(arg) == '-nbnd') THEN
|
|
CALL get_command_argument(i + 1, arg)
|
|
READ (arg, *) nbnd
|
|
END IF
|
|
IF (TRIM(arg) == '-gamma') THEN
|
|
CALL get_command_argument(i + 1, arg)
|
|
READ (arg, *) gamma_only
|
|
END IF
|
|
IF (TRIM(arg) == '-pd') THEN
|
|
CALL get_command_argument(i + 1, arg)
|
|
READ (arg, *) use_pd
|
|
END IF
|
|
IF ((TRIM(arg) == '-howmany').or.(TRIM(arg) == '-nh')) THEN
|
|
CALL get_command_argument(i + 1, arg)
|
|
READ (arg, *) many_fft
|
|
END IF
|
|
end do
|
|
if (ecutrho == 0.d0) ecutrho = 4.0d0*ecutwfc
|
|
|
|
#if defined(__MPI)
|
|
|
|
#if defined(_OPENMP)
|
|
IF (many_fft > 1) THEN
|
|
CALL MPI_Init_thread(MPI_THREAD_MULTIPLE, PROVIDED, ierr)
|
|
ELSE
|
|
CALL MPI_Init_thread(MPI_THREAD_FUNNELED, PROVIDED, ierr)
|
|
ENDIF
|
|
#else
|
|
CALL MPI_Init(ierr)
|
|
#endif
|
|
CALL mpi_comm_rank(MPI_COMM_WORLD, mype, ierr)
|
|
CALL mpi_comm_size(MPI_COMM_WORLD, npes, ierr)
|
|
comm = MPI_COMM_WORLD
|
|
root = 0
|
|
IF (mype == root) THEN
|
|
iope = .true.
|
|
ELSE
|
|
iope = .false.
|
|
ENDIF
|
|
!
|
|
! Broadcast input parameter first
|
|
!
|
|
CALL MPI_BCAST(ecutrho, 1, MPI_DOUBLE_PRECISION, 0, MPI_COMM_WORLD, ierr)
|
|
CALL MPI_BCAST(ecutwfc, 1, MPI_DOUBLE_PRECISION, 0, MPI_COMM_WORLD, ierr)
|
|
CALL MPI_BCAST(alat_in, 1, MPI_DOUBLE_PRECISION, 0, MPI_COMM_WORLD, ierr)
|
|
CALL MPI_BCAST(ntgs, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr)
|
|
CALL MPI_BCAST(nbnd, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr)
|
|
!
|
|
|
|
#else
|
|
|
|
mype = 0
|
|
npes = 1
|
|
comm = 0
|
|
ntgs = 1
|
|
root = 0
|
|
iope = .true.
|
|
|
|
#endif
|
|
!
|
|
! -------- INITIALIZE DIMENSIONS AND DESCRIPTORS
|
|
!
|
|
!
|
|
ecutw = ecutwfc
|
|
! dual
|
|
ecutm = ecutrho
|
|
ecutms = ecutrho
|
|
!
|
|
at = at*alat_in
|
|
!
|
|
alat = SQRT ( at(1,1)**2+at(2,1)**2+at(3,1)**2 )
|
|
!
|
|
at(:,:) = at(:,:) / alat
|
|
!
|
|
tpiba = 2.0d0*pi/alat
|
|
!
|
|
call recips(at(1, 1), at(1, 2), at(1, 3), bg(1, 1), bg(1, 2), bg(1, 3))
|
|
!
|
|
!
|
|
IF (gamma_only) kmax = 0.d0
|
|
!
|
|
IF (ALL(kmax < HUGE(0.d0))) THEN
|
|
gkcut = sqrt ( sum(kmax(1:3)**2) ) + sqrt (ecutw/ tpiba**2)
|
|
ELSE
|
|
! use max(bg)/2 as an estimate of the largest k-point
|
|
gkcut = 0.5d0 * max ( &
|
|
sqrt (sum(bg (1:3, 1)**2) ), &
|
|
sqrt (sum(bg (1:3, 2)**2) ), &
|
|
sqrt (sum(bg (1:3, 3)**2) ) ) + sqrt (ecutw/ tpiba**2)
|
|
END IF
|
|
!
|
|
gkcut = gkcut**2 ! wave function cut-off
|
|
!
|
|
dual = ecutrho / ecutwfc
|
|
gcutm = dual * ecutwfc / tpiba**2 ! potential cut-off
|
|
!
|
|
IF ( dual > 4.000000001_dp ) THEN
|
|
gcutms = 4.D0 * ecutwfc / tpiba**2 ! smooth mesh cut-off
|
|
ELSE
|
|
gcutms = gcutm ! smooth mesh cut-off
|
|
END IF
|
|
!
|
|
if( mype == 0 ) then
|
|
|
|
write (*, *) '+-----------------------------------+'
|
|
write (*, *) '| QE FFT |'
|
|
write (*, *) '| testing & timing |'
|
|
write (*, *) '| by Carlo Cavazzoni |'
|
|
write (*, *) '+-----------------------------------+'
|
|
write (*, *)
|
|
write (*, *) 'alat = ', alat
|
|
write (*, *) 'Ecutwfc = ', ecutw
|
|
write (*, *) 'Ecutrho = ', ecutm
|
|
write (*, *) 'Ecuts = ', ecutms
|
|
write (*, *) 'Gcutrho = ', SQRT(gcutm)
|
|
write (*, *) 'Gcuts = ', SQRT(gcutms)
|
|
write (*, *) 'Gcutwfc = ', SQRT(gkcut)
|
|
write (*, *) 'Num bands = ', nbnd
|
|
write (*, *) 'Num procs = ', npes
|
|
write (*, *) 'Num Task Group = ', ntgs
|
|
write (*, *) 'Num Many FFTs = ', many_fft
|
|
write (*, *) 'Gamma trick = ', gamma_only
|
|
write (*, *) 'Pencil decomp = ', use_pd
|
|
end if
|
|
!
|
|
nx = 2*int(sqrt(gcutm)*sqrt(at(1, 1)**2 + at(2, 1)**2 + at(3, 1)**2)) + 1
|
|
ny = 2*int(sqrt(gcutm)*sqrt(at(1, 2)**2 + at(2, 2)**2 + at(3, 2)**2)) + 1
|
|
nz = 2*int(sqrt(gcutm)*sqrt(at(1, 3)**2 + at(2, 3)**2 + at(3, 3)**2)) + 1
|
|
!
|
|
!
|
|
if (mype == 0) then
|
|
write (*, *) 'nx = ', nx, ' ny = ', ny, ' nz = ', nz
|
|
end if
|
|
!
|
|
IF (gamma_only) incr = 2
|
|
dffts%has_task_groups = (ntgs > 1)
|
|
use_tg = dffts%has_task_groups
|
|
lpara = (npes > 1)
|
|
!
|
|
dffts%rho_clock_label='ffts'
|
|
dffts%wave_clock_label='fftw'
|
|
!
|
|
CALL fft_type_init(dffts, smap, "wave", gamma_only, lpara, comm, at, bg, gkcut, gcutms/gkcut, &
|
|
nyfft=ntgs, nmany=many_fft, use_pd=use_pd)
|
|
dfftp%rho_clock_label='fft'
|
|
CALL fft_type_init(dfftp, smap, "rho", gamma_only, lpara, comm, at, bg, gcutm, 4.d0, nyfft=ntgs, &
|
|
nmany=many_fft, use_pd=use_pd)
|
|
!
|
|
CALL fft_base_info(mype == 0, dffts, dfftp)
|
|
if (mype == 0) then
|
|
write (*, *) 'dffts: nr1 = ', dffts%nr1, ' nr2 = ', dffts%nr2, ' nr3 = ', dffts%nr3
|
|
write (*, *) ' nr1x= ', dffts%nr1x, ' nr2x= ', dffts%nr2x, ' nr3x= ', dffts%nr3x
|
|
end if
|
|
!
|
|
ngw_ = dffts%nwl(dffts%mype + 1)
|
|
ngs_ = dffts%ngl(dffts%mype + 1)
|
|
ngm_ = dfftp%ngl(dfftp%mype + 1)
|
|
!
|
|
IF (gamma_only) THEN
|
|
ngw_ = (ngw_ + 1)/2
|
|
ngs_ = (ngs_ + 1)/2
|
|
ngm_ = (ngm_ + 1)/2
|
|
END IF
|
|
!
|
|
ngms = ngs_
|
|
ngm = ngm_
|
|
#if defined(__MPI)
|
|
CALL MPI_ALLREDUCE(ngms, ngsx, 1, MPI_INTEGER, MPI_MAX, MPI_COMM_WORLD, ierr)
|
|
CALL MPI_ALLREDUCE(ngms, ngms_g, 1, MPI_INTEGER, MPI_SUM, MPI_COMM_WORLD, ierr)
|
|
CALL MPI_ALLREDUCE(ngm, ngmx, 1, MPI_INTEGER, MPI_MAX, MPI_COMM_WORLD, ierr)
|
|
CALL MPI_ALLREDUCE(ngm, ngm_g, 1, MPI_INTEGER, MPI_SUM, MPI_COMM_WORLD, ierr)
|
|
#else
|
|
ngsx = ngms
|
|
ngms_g= ngms
|
|
ngmx = ngm
|
|
ngm_g = ngm
|
|
#endif
|
|
!
|
|
! -------- ALLOCATE
|
|
!
|
|
IF (many_fft > 1) THEN
|
|
ALLOCATE (psic(dffts%nnr*many_fft))
|
|
ELSE
|
|
ALLOCATE (psic(dffts%nnr))
|
|
ENDIF
|
|
ALLOCATE (psi(ngms, nbnd))
|
|
ALLOCATE (hpsi(ngms, nbnd))
|
|
ALLOCATE (v(dffts%nnr))
|
|
ALLOCATE (mill(3, ngm))
|
|
ALLOCATE (g(3, ngm))
|
|
ALLOCATE (gg(ngm))
|
|
ALLOCATE (ig_l2g(ngm))
|
|
!
|
|
! -------- GENERATE G-VECTORS
|
|
!
|
|
! smallmem = .flase.
|
|
IF( .false. ) THEN
|
|
CALL ggen( dfftp, gamma_only, at, bg, gcutm, ngm_g, ngm, &
|
|
g, gg, mill, ig_l2g, gstart, .TRUE. )
|
|
ELSE
|
|
CALL ggen( dfftp, gamma_only, at, bg, gcutm, ngm_g, ngm, &
|
|
g, gg, mill, ig_l2g, gstart, .FALSE. )
|
|
END IF
|
|
CALL ggens( dffts, gamma_only, at, g, gg, mill, gcutms, ngms )
|
|
!
|
|
! -------- RESET TIMERS
|
|
!
|
|
time = 0.0d0
|
|
my_time = 0.0d0
|
|
time_min = 0.0d0
|
|
time_max = 0.0d0
|
|
time_avg = 0.0d0
|
|
!
|
|
! Test FFT for wave functions - First calls may be biased by MPI and FFT initialization
|
|
!
|
|
#if defined(__MPI)
|
|
CALL MPI_BARRIER(MPI_COMM_WORLD, ierr)
|
|
#endif
|
|
!
|
|
IF (use_tg) THEN
|
|
ALLOCATE (tg_psic(dffts%nnr_tg))
|
|
CALL invfft('tgWave', tg_psic, dffts)
|
|
DEALLOCATE (tg_psic)
|
|
ELSE
|
|
CALL invfft('Wave', psic, dffts)
|
|
END IF
|
|
!
|
|
IF (use_tg) THEN
|
|
ALLOCATE (tg_psic(dffts%nnr_tg))
|
|
CALL fwfft('tgWave', tg_psic, dffts)
|
|
DEALLOCATE (tg_psic)
|
|
ELSE
|
|
CALL fwfft('Wave', psic, dffts)
|
|
END IF
|
|
! Now for real,
|
|
!
|
|
! -------- RECORD TIMES
|
|
!
|
|
wall = mpi_wall_time ()
|
|
ignore_time = .false.
|
|
!
|
|
! -------- INITIALIZE WAVE FUNCTIONS psi
|
|
!
|
|
psi = 0.0d0
|
|
!
|
|
! -------- INITIALIZE POTENTIAL v
|
|
!
|
|
v = 1.0d0
|
|
!
|
|
IF (use_tg) THEN
|
|
!
|
|
ALLOCATE (tg_v(dffts%nnr_tg))
|
|
ALLOCATE (tg_psic(dffts%nnr_tg))
|
|
!
|
|
CALL tg_gather(dffts, v, tg_v)
|
|
! incr is already set to 2 for gamma_only
|
|
incr = incr*fftx_ntgrp(dffts)
|
|
!
|
|
ENDIF
|
|
|
|
!
|
|
! Execute FFT calls once more and Take time
|
|
!
|
|
ncount = 0
|
|
!
|
|
!
|
|
!
|
|
IF (use_tg) THEN
|
|
DO ib = 1, nbnd, incr
|
|
!
|
|
time(STAMP_BEGIN) = mpi_wall_time()
|
|
!
|
|
call prepare_psi_tg(ib, nbnd, ngms, psi, tg_psic, dffts, gamma_only)
|
|
time(STAMP_PSI) = mpi_wall_time()
|
|
!
|
|
CALL invfft('tgWave', tg_psic, dffts)
|
|
time(STAMP_INVFFT) = mpi_wall_time()
|
|
!
|
|
CALL tg_get_group_nr3(dffts, right_nr3)
|
|
!
|
|
DO j = 1, dffts%nr1x*dffts%nr2x*right_nr3
|
|
tg_psic(j) = tg_psic(j)*tg_v(j)
|
|
ENDDO
|
|
!
|
|
time(STAMP_VLOC) = mpi_wall_time()
|
|
!
|
|
CALL fwfft('tgWave', tg_psic, dffts)
|
|
time(STAMP_FWFFT) = mpi_wall_time()
|
|
!
|
|
CALL accumulate_hpsi_tg(ib, nbnd, ngms, hpsi, tg_psic, dffts, gamma_only)
|
|
time(STAMP_HPSI) = mpi_wall_time()
|
|
!
|
|
DO i = 2, 6
|
|
my_time(i) = my_time(i) + (time(i) - time(i - 1))
|
|
END DO
|
|
!
|
|
ncount = ncount + 1
|
|
!
|
|
ENDDO
|
|
ELSEIF (many_fft > 1) THEN
|
|
DO ib = 1, nbnd, many_fft
|
|
!
|
|
time(STAMP_BEGIN) = mpi_wall_time()
|
|
!
|
|
group_size = MIN(many_fft, nbnd - (ib -1))
|
|
!
|
|
DO k=0, group_size - 1
|
|
!call prepare_psi(ib, nbnd, ngms, psi, psic(1+(k-1)*dffts%nnr:), dffts, gamma_only)
|
|
call prepare_psi(ib, nbnd, ngms, psi, psic, dffts, gamma_only)
|
|
ENDDO
|
|
time(STAMP_PSI) = mpi_wall_time()
|
|
!
|
|
CALL invfft('Wave', psic, dffts, howmany=group_size)
|
|
time(STAMP_INVFFT) = mpi_wall_time()
|
|
!
|
|
DO j = 1, dffts%nnr
|
|
psic(j) = psic(j)*v(j)
|
|
ENDDO
|
|
time(STAMP_VLOC) = mpi_wall_time()
|
|
!
|
|
CALL fwfft('Wave', psic, dffts, howmany=group_size)
|
|
time(STAMP_FWFFT) = mpi_wall_time()
|
|
!
|
|
DO k=0, group_size - 1
|
|
!CALL accumulate_hpsi(ib, nbnd, ngms, hpsi, psic(1+(k-1)*dffts%nnr:), dffts, gamma_only)
|
|
CALL accumulate_hpsi(ib, nbnd, ngms, hpsi, psic, dffts, gamma_only)
|
|
ENDDO
|
|
time(STAMP_HPSI) = mpi_wall_time()
|
|
!
|
|
DO i = 2, 6
|
|
my_time(i) = my_time(i) + (time(i) - time(i - 1))
|
|
END DO
|
|
!
|
|
ncount = ncount + 1
|
|
!
|
|
ENDDO
|
|
ELSE
|
|
DO ib = 1, nbnd, incr
|
|
!
|
|
time(STAMP_BEGIN) = mpi_wall_time()
|
|
!
|
|
call prepare_psi(ib, nbnd, ngms, psi, psic, dffts, gamma_only)
|
|
time(STAMP_PSI) = mpi_wall_time()
|
|
!
|
|
CALL invfft('Wave', psic, dffts)
|
|
time(STAMP_INVFFT) = mpi_wall_time()
|
|
!
|
|
DO j = 1, dffts%nnr
|
|
psic(j) = psic(j)*v(j)
|
|
ENDDO
|
|
time(STAMP_VLOC) = mpi_wall_time()
|
|
!
|
|
CALL fwfft('Wave', psic, dffts)
|
|
time(STAMP_FWFFT) = mpi_wall_time()
|
|
!
|
|
CALL accumulate_hpsi(ib, nbnd, ngms, hpsi, psic, dffts, gamma_only)
|
|
time(STAMP_HPSI) = mpi_wall_time()
|
|
!
|
|
DO i = 2, 6
|
|
my_time(i) = my_time(i) + (time(i) - time(i - 1))
|
|
END DO
|
|
!
|
|
ncount = ncount + 1
|
|
!
|
|
ENDDO
|
|
ENDIF
|
|
!
|
|
wall = mpi_wall_time() - wall
|
|
|
|
! -------- DEALLOCATE
|
|
!
|
|
DEALLOCATE (psic)
|
|
DEALLOCATE (psi)
|
|
DEALLOCATE (hpsi)
|
|
DEALLOCATE (v)
|
|
DEALLOCATE (mill)
|
|
DEALLOCATE (g)
|
|
DEALLOCATE (gg)
|
|
DEALLOCATE (ig_l2g)
|
|
IF (use_tg) THEN
|
|
!
|
|
DEALLOCATE (tg_psic)
|
|
DEALLOCATE (tg_v)
|
|
!
|
|
ENDIF
|
|
! --------------------
|
|
!
|
|
#if defined(__MPI)
|
|
CALL MPI_ALLREDUCE(my_time, time_min, 10, MPI_DOUBLE_PRECISION, MPI_MIN, MPI_COMM_WORLD, ierr)
|
|
CALL MPI_ALLREDUCE(my_time, time_max, 10, MPI_DOUBLE_PRECISION, MPI_MAX, MPI_COMM_WORLD, ierr)
|
|
CALL MPI_ALLREDUCE(my_time, time_avg, 10, MPI_DOUBLE_PRECISION, MPI_SUM, MPI_COMM_WORLD, ierr)
|
|
CALL MPI_ALLREDUCE(wall, wall_avg, 1, MPI_DOUBLE_PRECISION, MPI_SUM, MPI_COMM_WORLD, ierr)
|
|
#else
|
|
time_min = time
|
|
time_max = time
|
|
time_avg = time
|
|
#endif
|
|
|
|
time_avg = time_avg / npes
|
|
wall_avg = wall_avg / npes
|
|
|
|
if( mype == 0 ) then
|
|
|
|
write(*,*) '**** QE 3DFFT Timing ****'
|
|
write(*,*) 'grid size = ', dffts%nr1, dffts%nr2, dffts%nr3
|
|
write(*,*) 'num proc = ', npes
|
|
write(*,*) 'num band = ', nbnd
|
|
write(*,*) 'num task group = ', ntgs
|
|
write(*,*) 'num many ffts = ', many_fft
|
|
write(*,*) 'num fft cycles = ', ncount
|
|
|
|
write(*,100)
|
|
write(*,1)
|
|
write(*,100)
|
|
write(*,2) time_min(STAMP_PSI), time_max(STAMP_PSI), time_avg(STAMP_PSI)
|
|
write(*,3) time_min(STAMP_INVFFT), time_max(STAMP_INVFFT), time_avg(STAMP_INVFFT)
|
|
write(*,4) time_min(STAMP_VLOC), time_max(STAMP_VLOC), time_avg(STAMP_VLOC)
|
|
write(*,5) time_min(STAMP_FWFFT), time_max(STAMP_FWFFT), time_avg(STAMP_FWFFT)
|
|
write(*,6) time_min(STAMP_HPSI), time_max(STAMP_HPSI), time_avg(STAMP_HPSI)
|
|
write(*,7) wall
|
|
write(*,100)
|
|
|
|
100 FORMAT(' +--------------------+----------------+-----------------+----------------+' )
|
|
1 FORMAT(' |FFT TEST subroutine | sec. min | sec. max | sec. avg |' )
|
|
2 FORMAT(' |prepare_psi | ', D14.5, ' | ', D14.5, ' | ', D14.5, ' |' )
|
|
3 FORMAT(' |invfft | ', D14.5, ' | ', D14.5, ' | ', D14.5, ' |' )
|
|
4 FORMAT(' |workload | ', D14.5, ' | ', D14.5, ' | ', D14.5 , ' |')
|
|
5 FORMAT(' |fwfft | ', D14.5, ' | ', D14.5, ' | ', D14.5 , ' |')
|
|
6 FORMAT(' |accumulate_hpsi | ', D14.5, ' | ', D14.5, ' | ', D14.5 , ' |')
|
|
7 FORMAT(' |wall time | ', D14.5, ' |')
|
|
|
|
end if
|
|
! now print FFT clocks
|
|
call print_clock(mype, npes, ncount)
|
|
|
|
CALL fft_type_deallocate(dffts)
|
|
CALL fft_type_deallocate(dfftp)
|
|
CALL sticks_map_deallocate( smap )
|
|
|
|
#if defined(__MPI)
|
|
CALL mpi_finalize(ierr)
|
|
#endif
|
|
|
|
contains
|
|
|
|
!
|
|
! Copyright (C) 2001 PWSCF group
|
|
! This file is distributed under the terms of the
|
|
! GNU General Public License. See the file `License'
|
|
! in the root directory of the present distribution,
|
|
! or http://www.gnu.org/copyleft/gpl.txt .
|
|
!
|
|
!
|
|
!---------------------------------------------------------------------
|
|
|
|
subroutine recips (a1, a2, a3, b1, b2, b3)
|
|
!---------------------------------------------------------------------
|
|
!
|
|
! This routine generates the reciprocal lattice vectors b1,b2,b3
|
|
! given the real space vectors a1,a2,a3. The b's are units of 2 pi/a.
|
|
!
|
|
! first the input variables
|
|
!
|
|
implicit none
|
|
real(DP) :: a1 (3), a2 (3), a3 (3), b1 (3), b2 (3), b3 (3)
|
|
! input: first direct lattice vector
|
|
! input: second direct lattice vector
|
|
! input: third direct lattice vector
|
|
! output: first reciprocal lattice vector
|
|
! output: second reciprocal lattice vector
|
|
! output: third reciprocal lattice vector
|
|
!
|
|
! then the local variables
|
|
!
|
|
real(DP) :: den, s
|
|
! the denominator
|
|
! the sign of the permutations
|
|
integer :: iperm, i, j, k, l, ipol
|
|
! counter on the permutations
|
|
!\
|
|
! Auxiliary variables
|
|
!/
|
|
!
|
|
! Counter on the polarizations
|
|
!
|
|
! first we compute the denominator
|
|
!
|
|
den = 0
|
|
i = 1
|
|
j = 2
|
|
k = 3
|
|
s = 1.d0
|
|
100 do iperm = 1, 3
|
|
den = den + s * a1 (i) * a2 (j) * a3 (k)
|
|
l = i
|
|
i = j
|
|
j = k
|
|
k = l
|
|
enddo
|
|
i = 2
|
|
j = 1
|
|
k = 3
|
|
s = - s
|
|
if (s.lt.0.d0) goto 100
|
|
!
|
|
! here we compute the reciprocal vectors
|
|
!
|
|
i = 1
|
|
j = 2
|
|
k = 3
|
|
do ipol = 1, 3
|
|
b1 (ipol) = (a2 (j) * a3 (k) - a2 (k) * a3 (j) ) / den
|
|
b2 (ipol) = (a3 (j) * a1 (k) - a3 (k) * a1 (j) ) / den
|
|
b3 (ipol) = (a1 (j) * a2 (k) - a1 (k) * a2 (j) ) / den
|
|
l = i
|
|
i = j
|
|
j = k
|
|
k = l
|
|
enddo
|
|
return
|
|
end subroutine recips
|
|
|
|
SUBROUTINE ggen ( dfftp, gamma_only, at, bg, gcutm, ngm_g, ngm, &
|
|
g, gg, mill, ig_l2g, gstart, no_global_sort )
|
|
!----------------------------------------------------------------------
|
|
!
|
|
! This routine generates all the reciprocal lattice vectors
|
|
! contained in the sphere of radius gcutm. Furthermore it
|
|
! computes the indices nl which give the correspondence
|
|
! between the fft mesh points and the array of g vectors.
|
|
!
|
|
USE fft_types, ONLY: fft_stick_index, fft_type_descriptor
|
|
USE fft_ggen, ONLY : fft_set_nl
|
|
USE fft_param
|
|
!USE mp, ONLY: mp_rank, mp_size, mp_sum
|
|
!USE constants, ONLY : eps8
|
|
!
|
|
IMPLICIT NONE
|
|
!
|
|
TYPE(fft_type_descriptor),INTENT(INOUT) :: dfftp
|
|
LOGICAL, INTENT(IN) :: gamma_only
|
|
REAL(DP), INTENT(IN) :: at(3,3), bg(3,3), gcutm
|
|
INTEGER, INTENT(IN) :: ngm_g
|
|
INTEGER, INTENT(INOUT) :: ngm
|
|
REAL(DP), INTENT(OUT) :: g(:,:), gg(:)
|
|
INTEGER, INTENT(OUT) :: mill(:,:), ig_l2g(:), gstart
|
|
! if no_global_sort is present (and it is true) G vectors are sorted only
|
|
! locally and not globally. In this case no global array needs to be
|
|
! allocated and sorted: saves memory and a lot of time for large systems.
|
|
!
|
|
LOGICAL, INTENT(IN) :: no_global_sort
|
|
!
|
|
! here a few local variables
|
|
!
|
|
REAL(DP) :: tx(3), ty(3), t(3)
|
|
REAL(DP), ALLOCATABLE :: tt(:)
|
|
INTEGER :: ngm_save, n1, n2, n3, ngm_offset, ngm_max, ngm_local
|
|
!
|
|
REAL(DP), ALLOCATABLE :: g2sort_g(:)
|
|
! array containing only g vectors for the current processor
|
|
INTEGER, ALLOCATABLE :: mill_unsorted(:,:)
|
|
! array containing all g vectors generators, on all processors
|
|
! (replicated data). When no_global_sort is present and .true.,
|
|
! only g-vectors for the current processor are stored
|
|
INTEGER, ALLOCATABLE :: igsrt(:), g2l(:)
|
|
!
|
|
INTEGER :: ni, nj, nk, i, j, k, ipol, ng, igl, indsw, ierr
|
|
INTEGER :: istart, jstart, kstart
|
|
INTEGER :: mype, npe
|
|
LOGICAL :: global_sort, is_local
|
|
INTEGER, ALLOCATABLE :: ngmpe(:)
|
|
!
|
|
! The 'no_global_sort' is not optional in this case.
|
|
! This differs from the version present in QE distribution.
|
|
global_sort = .NOT. no_global_sort
|
|
!
|
|
IF( .NOT. global_sort ) THEN
|
|
ngm_max = ngm
|
|
ELSE
|
|
ngm_max = ngm_g
|
|
END IF
|
|
!
|
|
! save current value of ngm
|
|
!
|
|
ngm_save = ngm
|
|
!
|
|
ngm = 0
|
|
ngm_local = 0
|
|
!
|
|
! set the total number of fft mesh points and and initial value of gg
|
|
! The choice of gcutm is due to the fact that we have to order the
|
|
! vectors after computing them.
|
|
!
|
|
gg(:) = gcutm + 1.d0
|
|
!
|
|
! and computes all the g vectors inside a sphere
|
|
!
|
|
ALLOCATE( mill_unsorted( 3, ngm_save ) )
|
|
ALLOCATE( igsrt( ngm_max ) )
|
|
ALLOCATE( g2l( ngm_max ) )
|
|
ALLOCATE( g2sort_g( ngm_max ) )
|
|
!
|
|
g2sort_g(:) = 1.0d20
|
|
!
|
|
! allocate temporal array
|
|
!
|
|
ALLOCATE( tt( dfftp%nr3 ) )
|
|
!
|
|
! max miller indices (same convention as in module stick_set)
|
|
!
|
|
ni = (dfftp%nr1-1)/2
|
|
nj = (dfftp%nr2-1)/2
|
|
nk = (dfftp%nr3-1)/2
|
|
!
|
|
! gamma-only: exclude space with x < 0
|
|
!
|
|
IF ( gamma_only ) THEN
|
|
istart = 0
|
|
ELSE
|
|
istart = -ni
|
|
ENDIF
|
|
!
|
|
iloop: DO i = istart, ni
|
|
!
|
|
! gamma-only: exclude plane with x = 0, y < 0
|
|
!
|
|
IF ( gamma_only .and. i == 0 ) THEN
|
|
jstart = 0
|
|
ELSE
|
|
jstart = -nj
|
|
ENDIF
|
|
!
|
|
tx(1:3) = i * bg(1:3,1)
|
|
!
|
|
jloop: DO j = jstart, nj
|
|
!
|
|
IF ( .NOT. global_sort ) THEN
|
|
IF ( fft_stick_index( dfftp, i, j ) == 0 ) CYCLE jloop
|
|
is_local = .TRUE.
|
|
ELSE
|
|
IF ( dfftp%lpara .AND. fft_stick_index( dfftp, i, j ) == 0) THEN
|
|
is_local = .FALSE.
|
|
ELSE
|
|
is_local = .TRUE.
|
|
END IF
|
|
END IF
|
|
!
|
|
! gamma-only: exclude line with x = 0, y = 0, z < 0
|
|
!
|
|
IF ( gamma_only .and. i == 0 .and. j == 0 ) THEN
|
|
kstart = 0
|
|
ELSE
|
|
kstart = -nk
|
|
ENDIF
|
|
!
|
|
ty(1:3) = tx(1:3) + j * bg(1:3,2)
|
|
!
|
|
! compute all the norm square
|
|
!
|
|
DO k = kstart, nk
|
|
!
|
|
t(1) = ty(1) + k * bg(1,3)
|
|
t(2) = ty(2) + k * bg(2,3)
|
|
t(3) = ty(3) + k * bg(3,3)
|
|
tt(k-kstart+1) = t(1)**2 + t(2)**2 + t(3)**2
|
|
ENDDO
|
|
!
|
|
! save all the norm square within cutoff
|
|
!
|
|
DO k = kstart, nk
|
|
IF (tt(k-kstart+1) <= gcutm) THEN
|
|
ngm = ngm + 1
|
|
IF (ngm > ngm_max) CALL fftx_error__ ('ggen 1', 'too many g-vectors', ngm)
|
|
IF ( tt(k-kstart+1) > eps8 ) THEN
|
|
g2sort_g(ngm) = tt(k-kstart+1)
|
|
ELSE
|
|
g2sort_g(ngm) = 0.d0
|
|
ENDIF
|
|
IF (is_local) THEN
|
|
ngm_local = ngm_local + 1
|
|
mill_unsorted( :, ngm_local ) = (/ i,j,k /)
|
|
g2l(ngm) = ngm_local
|
|
ELSE
|
|
g2l(ngm) = 0
|
|
ENDIF
|
|
ENDIF
|
|
ENDDO
|
|
ENDDO jloop
|
|
ENDDO iloop
|
|
IF (ngm /= ngm_max) &
|
|
CALL fftx_error__ ('ggen', 'g-vectors missing !', abs(ngm - ngm_max))
|
|
!
|
|
igsrt(1) = 0
|
|
IF( .NOT. global_sort ) THEN
|
|
CALL hpsort_eps( ngm, g2sort_g, igsrt, eps8 )
|
|
ELSE
|
|
CALL hpsort_eps( ngm_g, g2sort_g, igsrt, eps8 )
|
|
END IF
|
|
DEALLOCATE( g2sort_g, tt )
|
|
|
|
IF( .NOT. global_sort ) THEN
|
|
!
|
|
! compute adeguate offsets in order to avoid overlap between
|
|
! g vectors once they are gathered on a single (global) array
|
|
!
|
|
ngm_offset = 0
|
|
#if defined (__MPI)
|
|
!mype = mp_rank( dfftp%comm )
|
|
!npe = mp_size( dfftp%comm )
|
|
CALL MPI_COMM_RANK(dfftp%comm, mype, ierr)
|
|
CALL MPI_COMM_SIZE(dfftp%comm, npe, ierr)
|
|
ALLOCATE( ngmpe( npe ) )
|
|
ngmpe = 0
|
|
ngmpe( mype + 1 ) = ngm
|
|
!CALL mp_sum( ngmpe, dfftp%comm )
|
|
CALL MPI_ALLREDUCE( MPI_IN_PLACE, ngmpe, 1, MPI_INTEGER, MPI_SUM, dfftp%comm, ierr )
|
|
DO ng = 1, mype
|
|
ngm_offset = ngm_offset + ngmpe( ng )
|
|
END DO
|
|
DEALLOCATE( ngmpe )
|
|
#endif
|
|
!
|
|
END IF
|
|
|
|
ngm = 0
|
|
!
|
|
ngloop: DO ng = 1, ngm_max
|
|
!
|
|
IF (g2l(igsrt(ng))>0) THEN
|
|
! fetch the indices
|
|
i = mill_unsorted(1, g2l(igsrt(ng)))
|
|
j = mill_unsorted(2, g2l(igsrt(ng)))
|
|
k = mill_unsorted(3, g2l(igsrt(ng)))
|
|
!
|
|
ngm = ngm + 1
|
|
!
|
|
! Here map local and global g index !!! N.B: :
|
|
! the global G vectors arrangement depends on the number of processors
|
|
!
|
|
IF( .NOT. global_sort ) THEN
|
|
ig_l2g( ngm ) = ng + ngm_offset
|
|
ELSE
|
|
ig_l2g( ngm ) = ng
|
|
END IF
|
|
|
|
g(1:3, ngm) = i * bg (:, 1) + j * bg (:, 2) + k * bg (:, 3)
|
|
gg(ngm) = sum(g(1:3, ngm)**2)
|
|
ENDIF
|
|
ENDDO ngloop
|
|
|
|
DEALLOCATE( igsrt, g2l )
|
|
|
|
IF (ngm /= ngm_save) &
|
|
CALL fftx_error__ ('ggen', 'g-vectors (ngm) missing !', abs(ngm - ngm_save))
|
|
!
|
|
! determine first nonzero g vector
|
|
!
|
|
IF (gg(1).le.eps8) THEN
|
|
gstart=2
|
|
ELSE
|
|
gstart=1
|
|
ENDIF
|
|
!
|
|
! Now set nl and nls with the correct fft correspondence
|
|
!
|
|
CALL fft_set_nl( dfftp, at, g, mill )
|
|
!
|
|
END SUBROUTINE ggen
|
|
!
|
|
!-----------------------------------------------------------------------
|
|
SUBROUTINE ggens( dffts, gamma_only, at, g, gg, mill, gcutms, ngms, &
|
|
gs, ggs )
|
|
!-----------------------------------------------------------------------
|
|
!
|
|
! Initialize number and indices of g-vectors for a subgrid,
|
|
! for exactly the same ordering as for the original FFT grid
|
|
!
|
|
!--------------------------------------------------------------------
|
|
!
|
|
USE fft_types, ONLY: fft_stick_index, fft_type_descriptor
|
|
USE fft_ggen, ONLY : fft_set_nl
|
|
USE fft_param
|
|
!
|
|
IMPLICIT NONE
|
|
!
|
|
LOGICAL, INTENT(IN) :: gamma_only
|
|
TYPE (fft_type_descriptor), INTENT(INOUT) :: dffts
|
|
! primitive lattice vectors
|
|
REAL(dp), INTENT(IN) :: at(3,3)
|
|
! G-vectors in FFT grid
|
|
REAL(dp), INTENT(IN) :: g(:,:), gg(:)
|
|
! Miller indices for G-vectors of FFT grid
|
|
INTEGER, INTENT(IN) :: mill(:,:)
|
|
! cutoff for subgrid
|
|
REAL(DP), INTENT(IN):: gcutms
|
|
! Local number of G-vectors in subgrid
|
|
INTEGER, INTENT(OUT):: ngms
|
|
! Optionally: G-vectors and modules
|
|
REAL(DP), INTENT(OUT), POINTER, OPTIONAL:: gs(:,:), ggs(:)
|
|
!
|
|
INTEGER :: i, ng, ngm
|
|
!
|
|
ngm = SIZE(gg)
|
|
ngms = dffts%ngm
|
|
IF ( ngms > ngm ) CALL fftx_error__ ('ggens','wrong number of G-vectors',1)
|
|
!
|
|
IF ( PRESENT(gs) ) ALLOCATE ( gs(3,ngms) )
|
|
IF ( PRESENT(ggs)) ALLOCATE ( ggs(ngms) )
|
|
ng = 0
|
|
DO i = 1, ngm
|
|
IF ( gg(i) > gcutms ) exit
|
|
IF ( PRESENT(gs) ) gs (:,i) = g(:,i)
|
|
IF ( PRESENT(ggs)) ggs(i) = gg(i)
|
|
ng = i
|
|
END DO
|
|
IF ( ng /= ngms ) CALL fftx_error__ ('ggens','mismatch in number of G-vectors',2)
|
|
!
|
|
CALL fft_set_nl ( dffts, at, g )
|
|
!
|
|
END SUBROUTINE ggens
|
|
!
|
|
SUBROUTINE fft_base_info( ionode, dffts, dfftp )
|
|
USE fft_types, ONLY: fft_type_descriptor
|
|
implicit none
|
|
LOGICAL, INTENT(IN) :: ionode
|
|
TYPE(fft_type_descriptor), INTENT(IN) :: dfftp, dffts
|
|
!
|
|
! Display fft basic information
|
|
!
|
|
IF (ionode) THEN
|
|
WRITE( *,*)
|
|
IF ( dfftp%nproc > 1 ) THEN
|
|
WRITE( stdout, '(5X,"Parallelization info")')
|
|
ELSE
|
|
WRITE( stdout, '(5X,"G-vector sticks info")')
|
|
ENDIF
|
|
WRITE( *, '(5X,"--------------------")')
|
|
WRITE( *, '(5X,"sticks: dense smooth PW", &
|
|
& 5X,"G-vecs: dense smooth PW")')
|
|
IF ( dfftp%nproc > 1 ) THEN
|
|
WRITE( *,'(5X,"Min",4X,2I8,I7,12X,2I9,I8)') &
|
|
minval(dfftp%nsp), minval(dffts%nsp), minval(dffts%nsw), &
|
|
minval(dfftp%ngl), minval(dffts%ngl), minval(dffts%nwl)
|
|
WRITE( *,'(5X,"Max",4X,2I8,I7,12X,2I9,I8)') &
|
|
maxval(dfftp%nsp), maxval(dffts%nsp), maxval(dffts%nsw), &
|
|
maxval(dfftp%ngl), maxval(dffts%ngl), maxval(dffts%nwl)
|
|
END IF
|
|
WRITE( *,'(5X,"Sum",4X,2I8,I7,12X,2I9,I8)') &
|
|
sum(dfftp%nsp), sum(dffts%nsp), sum(dffts%nsw), &
|
|
sum(dfftp%ngl), sum(dffts%ngl), sum(dffts%nwl)
|
|
ENDIF
|
|
|
|
IF(ionode) WRITE( *,*)
|
|
|
|
RETURN
|
|
END SUBROUTINE fft_base_info
|
|
end program test
|
|
|
|
subroutine start_clock(label)
|
|
use timers
|
|
use fft_param
|
|
implicit none
|
|
character(len=*) :: label
|
|
if (ignore_time) RETURN
|
|
select case (label)
|
|
case ("cft_1z")
|
|
times(1) = times(1) - mpi_wall_time()
|
|
case ("cft_2xy")
|
|
times(2) = times(2) - mpi_wall_time()
|
|
case ("cgather")
|
|
times(3) = times(3) - mpi_wall_time()
|
|
case ("cgather_grid")
|
|
times(4) = times(4) - mpi_wall_time()
|
|
case ("cscatter_grid")
|
|
times(5) = times(5) - mpi_wall_time()
|
|
case ("cscatter_sym")
|
|
times(6) = times(6) - mpi_wall_time()
|
|
case ("fft")
|
|
times(7) = times(7) - mpi_wall_time()
|
|
case ("fft_scatt_tg")
|
|
times(8) = times(8) - mpi_wall_time()
|
|
case ("fft_scatt_xy")
|
|
times(9) = times(9) - mpi_wall_time()
|
|
case ("fft_scatt_yz")
|
|
times(10) = times(10) - mpi_wall_time()
|
|
case ("fftb")
|
|
times(11) = times(11) - mpi_wall_time()
|
|
case ("fftc")
|
|
times(12) = times(12) - mpi_wall_time()
|
|
case ("fftcw")
|
|
times(13) = times(13) - mpi_wall_time()
|
|
case ("ffts")
|
|
times(14) = times(14) - mpi_wall_time()
|
|
case ("fftw")
|
|
times(15) = times(15) - mpi_wall_time()
|
|
case ("rgather_grid")
|
|
times(16) = times(16) - mpi_wall_time()
|
|
case ("rscatter_grid")
|
|
times(17) = times(17) - mpi_wall_time()
|
|
case ("fft_scatter") !alt version compatibility
|
|
times(18) = times(18) - mpi_wall_time()
|
|
case ("ALLTOALL") !alt version compatibility
|
|
times(19) = times(19) - mpi_wall_time()
|
|
case ("fft_scatt_many_yz") !alt version compatibility
|
|
times(20) = times(20) - mpi_wall_time()
|
|
case ("fft_scatt_many_xy") !alt version compatibility
|
|
times(21) = times(21) - mpi_wall_time()
|
|
case default
|
|
write (*, *) "Error, label not found", label
|
|
end select
|
|
end subroutine
|
|
|
|
subroutine stop_clock(label)
|
|
use timers
|
|
use fft_param
|
|
implicit none
|
|
character(len=*) :: label
|
|
if (ignore_time) RETURN
|
|
select case (label)
|
|
case ("cft_1z")
|
|
times(1) = times(1) + mpi_wall_time()
|
|
case ("cft_2xy")
|
|
times(2) = times(2) + mpi_wall_time()
|
|
case ("cgather")
|
|
times(3) = times(3) + mpi_wall_time()
|
|
case ("cgather_grid")
|
|
times(4) = times(4) + mpi_wall_time()
|
|
case ("cscatter_grid")
|
|
times(5) = times(5) + mpi_wall_time()
|
|
case ("cscatter_sym")
|
|
times(6) = times(6) + mpi_wall_time()
|
|
case ("fft")
|
|
times(7) = times(7) + mpi_wall_time()
|
|
case ("fft_scatt_tg")
|
|
times(8) = times(8) + mpi_wall_time()
|
|
case ("fft_scatt_xy")
|
|
times(9) = times(9) + mpi_wall_time()
|
|
case ("fft_scatt_yz")
|
|
times(10) = times(10) + mpi_wall_time()
|
|
case ("fftb")
|
|
times(11) = times(11) + mpi_wall_time()
|
|
case ("fftc")
|
|
times(12) = times(12) + mpi_wall_time()
|
|
case ("fftcw")
|
|
times(13) = times(13) + mpi_wall_time()
|
|
case ("ffts")
|
|
times(14) = times(14) + mpi_wall_time()
|
|
case ("fftw")
|
|
times(15) = times(15) + mpi_wall_time()
|
|
case ("rgather_grid")
|
|
times(16) = times(16) + mpi_wall_time()
|
|
case ("rscatter_grid")
|
|
times(17) = times(17) + mpi_wall_time()
|
|
case ("fft_scatter") !alt version compatibility
|
|
times(18) = times(18) + mpi_wall_time()
|
|
case ("ALLTOALL") !alt version compatibility
|
|
times(19) = times(19) + mpi_wall_time()
|
|
case ("fft_scatt_many_yz") !alt version compatibility
|
|
times(20) = times(20) + mpi_wall_time()
|
|
case ("fft_scatt_many_xy") !alt version compatibility
|
|
times(21) = times(21) + mpi_wall_time()
|
|
case default
|
|
write (*, *) "Error, label not found", label
|
|
end select
|
|
end subroutine
|
|
!
|
|
subroutine print_clock(mype, npes, ncount)
|
|
use timers
|
|
use fft_param
|
|
implicit none
|
|
integer, intent(in) :: mype, npes, ncount
|
|
REAL*8 :: time_min(21)
|
|
REAL*8 :: time_max(21)
|
|
REAL*8 :: time_avg(21)
|
|
integer :: ierr
|
|
|
|
#if defined(__MPI)
|
|
CALL MPI_ALLREDUCE(times, time_min, 21, MPI_DOUBLE_PRECISION, MPI_MIN, MPI_COMM_WORLD, ierr)
|
|
CALL MPI_ALLREDUCE(times, time_max, 21, MPI_DOUBLE_PRECISION, MPI_MAX, MPI_COMM_WORLD, ierr)
|
|
CALL MPI_ALLREDUCE(times, time_avg, 21, MPI_DOUBLE_PRECISION, MPI_SUM, MPI_COMM_WORLD, ierr)
|
|
time_avg = time_avg/npes
|
|
#else
|
|
time_min(:) = times(:)
|
|
time_max(:) = times(:)
|
|
time_avg(:) = times(:)
|
|
#endif
|
|
|
|
if (mype == 0) then
|
|
write (*, 10100)
|
|
write (*, 101)
|
|
write (*, 10100)
|
|
if (times(1) > 0.d0) write (*, 102) time_min(1), time_max(1), time_avg(1)
|
|
if (times(2) > 0.d0) write (*, 103) time_min(2), time_max(2), time_avg(2)
|
|
if (times(3) > 0.d0) write (*, 104) time_min(3), time_max(3), time_avg(3)
|
|
if (times(4) > 0.d0) write (*, 105) time_min(4), time_max(4), time_avg(4)
|
|
if (times(5) > 0.d0) write (*, 106) time_min(5), time_max(5), time_avg(5)
|
|
if (times(6) > 0.d0) write (*, 107) time_min(6), time_max(6), time_avg(6)
|
|
if (times(7) > 0.d0) write (*, 108) time_min(7), time_max(7), time_avg(7)
|
|
if (times(8) > 0.d0) write (*, 109) time_min(8), time_max(8), time_avg(8)
|
|
if (times(9) > 0.d0) write (*, 1010) time_min(9), time_max(9), time_avg(9)
|
|
if (times(10) > 0.d0) write (*, 1011) time_min(10), time_max(10), time_avg(10)
|
|
if (times(11) > 0.d0) write (*, 1012) time_min(11), time_max(11), time_avg(11)
|
|
if (times(12) > 0.d0) write (*, 1013) time_min(12), time_max(12), time_avg(12)
|
|
if (times(13) > 0.d0) write (*, 1014) time_min(13), time_max(13), time_avg(13)
|
|
if (times(14) > 0.d0) write (*, 1015) time_min(14), time_max(14), time_avg(14)
|
|
if (times(20) > 0.d0) write (*, 1021) time_min(20), time_max(20), time_avg(20)
|
|
if (times(21) > 0.d0) write (*, 1022) time_min(21), time_max(21), time_avg(21)
|
|
if (times(15) > 0.d0) write (*, 1016) time_min(15), time_max(15), time_avg(15)
|
|
if (times(16) > 0.d0) write (*, 1017) time_min(16), time_max(16), time_avg(16)
|
|
if (times(17) > 0.d0) write (*, 1018) time_min(17), time_max(17), time_avg(17)
|
|
if (times(18) > 0.d0) write (*, 1019) time_min(18), time_max(18), time_avg(18)
|
|
if (times(19) > 0.d0) write (*, 1020) time_min(19), time_max(19), time_avg(19)
|
|
write (*, 10100)
|
|
end if
|
|
10100 FORMAT(' +--------------------+----------------+-----------------+----------------+' )
|
|
101 FORMAT(' |FFT subroutine | sec. min | sec. max | sec. avg |' )
|
|
102 FORMAT(' |cft_1z | ', D14.5, ' | ', D14.5, ' | ', D14.5, ' |' )
|
|
103 FORMAT(' |cft_2xy | ', D14.5, ' | ', D14.5, ' | ', D14.5, ' |' )
|
|
104 FORMAT(' |cgather | ', D14.5, ' | ', D14.5, ' | ', D14.5 , ' |')
|
|
105 FORMAT(' |cgather_grid | ', D14.5, ' | ', D14.5, ' | ', D14.5 , ' |')
|
|
106 FORMAT(' |cscatter_grid | ', D14.5, ' | ', D14.5, ' | ', D14.5 , ' |')
|
|
107 FORMAT(' |cscatter_sym | ', D14.5, ' | ', D14.5, ' | ', D14.5 , ' |')
|
|
108 FORMAT(' |fft | ', D14.5, ' | ', D14.5, ' | ', D14.5 , ' |')
|
|
109 FORMAT(' |fft_scatt_tg | ', D14.5, ' | ', D14.5, ' | ', D14.5 , ' |')
|
|
1010 FORMAT(' |fft_scatt_xy | ', D14.5, ' | ', D14.5, ' | ', D14.5 , ' |')
|
|
1011 FORMAT(' |fft_scatt_yz | ', D14.5, ' | ', D14.5, ' | ', D14.5 , ' |')
|
|
1012 FORMAT(' |fftb | ', D14.5, ' | ', D14.5, ' | ', D14.5 , ' |')
|
|
1013 FORMAT(' |fftc | ', D14.5, ' | ', D14.5, ' | ', D14.5 , ' |')
|
|
1014 FORMAT(' |fftcw | ', D14.5, ' | ', D14.5, ' | ', D14.5 , ' |')
|
|
1015 FORMAT(' |ffts | ', D14.5, ' | ', D14.5, ' | ', D14.5 , ' |')
|
|
1016 FORMAT(' |fftw | ', D14.5, ' | ', D14.5, ' | ', D14.5 , ' |')
|
|
1017 FORMAT(' |rgather_grid | ', D14.5, ' | ', D14.5, ' | ', D14.5 , ' |')
|
|
1018 FORMAT(' |rscatter_grid | ', D14.5, ' | ', D14.5, ' | ', D14.5 , ' |')
|
|
1019 FORMAT(' |fft_scatter | ', D14.5, ' | ', D14.5, ' | ', D14.5 , ' |')
|
|
1020 FORMAT(' |ALLTOALL | ', D14.5, ' | ', D14.5, ' | ', D14.5 , ' |')
|
|
1021 FORMAT(' |fft_scatt_many_yz | ', D14.5, ' | ', D14.5, ' | ', D14.5 , ' |')
|
|
1022 FORMAT(' |fft_scatt_many_xy | ', D14.5, ' | ', D14.5, ' | ', D14.5 , ' |')
|
|
|
|
end subroutine
|
|
|
|
subroutine start_clock_gpu(label)
|
|
implicit none
|
|
character(len=*) :: label
|
|
call start_clock(label)
|
|
end subroutine
|
|
|
|
subroutine stop_clock_gpu(label)
|
|
implicit none
|
|
character(len=*) :: label
|
|
call stop_clock(label)
|
|
end subroutine
|
|
!
|
|
! Copyright (C) 2001 PWSCF group
|
|
! This file is distributed under the terms of the
|
|
! GNU General Public License. See the file `License'
|
|
! in the root directory of the present distribution,
|
|
! or http://www.gnu.org/copyleft/gpl.txt .
|
|
!
|
|
!---------------------------------------------------------------------
|
|
subroutine hpsort_eps(n, ra, ind, eps)
|
|
!---------------------------------------------------------------------
|
|
! sort an array ra(1:n) into ascending order using heapsort algorithm,
|
|
! and considering two elements being equal if their values differ
|
|
! for less than "eps".
|
|
! n is input, ra is replaced on output by its sorted rearrangement.
|
|
! create an index table (ind) by making an exchange in the index array
|
|
! whenever an exchange is made on the sorted data array (ra).
|
|
! in case of equal values in the data array (ra) the values in the
|
|
! index array (ind) are used to order the entries.
|
|
! if on input ind(1) = 0 then indices are initialized in the routine,
|
|
! if on input ind(1) != 0 then indices are assumed to have been
|
|
! initialized before entering the routine and these
|
|
! indices are carried around during the sorting process
|
|
!
|
|
! no work space needed !
|
|
! free us from machine-dependent sorting-routines !
|
|
!
|
|
! adapted from Numerical Recipes pg. 329 (new edition)
|
|
!
|
|
USE fft_param
|
|
implicit none
|
|
!-input/output variables
|
|
integer, intent(in) :: n
|
|
integer, intent(inout) :: ind (*)
|
|
real(DP), intent(inout) :: ra (*)
|
|
real(DP), intent(in) :: eps
|
|
!-local variables
|
|
integer :: i, ir, j, l, iind
|
|
real(DP) :: rra
|
|
! initialize index array
|
|
if (ind (1) .eq.0) then
|
|
do i = 1, n
|
|
ind (i) = i
|
|
enddo
|
|
endif
|
|
! nothing to order
|
|
if (n.lt.2) return
|
|
! initialize indices for hiring and retirement-promotion phase
|
|
l = n / 2 + 1
|
|
|
|
ir = n
|
|
|
|
sorting: do
|
|
|
|
! still in hiring phase
|
|
if ( l .gt. 1 ) then
|
|
l = l - 1
|
|
rra = ra (l)
|
|
iind = ind (l)
|
|
! in retirement-promotion phase.
|
|
else
|
|
! clear a space at the end of the array
|
|
rra = ra (ir)
|
|
!
|
|
iind = ind (ir)
|
|
! retire the top of the heap into it
|
|
ra (ir) = ra (1)
|
|
!
|
|
ind (ir) = ind (1)
|
|
! decrease the size of the corporation
|
|
ir = ir - 1
|
|
! done with the last promotion
|
|
if ( ir .eq. 1 ) then
|
|
! the least competent worker at all !
|
|
ra (1) = rra
|
|
!
|
|
ind (1) = iind
|
|
exit sorting
|
|
endif
|
|
endif
|
|
! wheter in hiring or promotion phase, we
|
|
i = l
|
|
! set up to place rra in its proper level
|
|
j = l + l
|
|
!
|
|
do while ( j .le. ir )
|
|
if ( j .lt. ir ) then
|
|
! compare to better underling
|
|
if ( abs(ra(j)-ra(j+1)).ge.eps ) then
|
|
if (ra(j).lt.ra(j+1)) j = j + 1
|
|
else
|
|
! this means ra(j) == ra(j+1) within tolerance
|
|
if (ind (j) .lt.ind (j + 1) ) j = j + 1
|
|
endif
|
|
endif
|
|
! demote rra
|
|
if ( abs(rra - ra(j)).ge.eps ) then
|
|
if (rra.lt.ra(j)) then
|
|
ra (i) = ra (j)
|
|
ind (i) = ind (j)
|
|
i = j
|
|
j = j + j
|
|
else
|
|
! set j to terminate do-while loop
|
|
j = ir + 1
|
|
end if
|
|
else
|
|
!this means rra == ra(j) within tolerance
|
|
! demote rra
|
|
if (iind.lt.ind (j) ) then
|
|
ra (i) = ra (j)
|
|
ind (i) = ind (j)
|
|
i = j
|
|
j = j + j
|
|
else
|
|
! set j to terminate do-while loop
|
|
j = ir + 1
|
|
endif
|
|
end if
|
|
enddo
|
|
ra(i) = rra
|
|
ind(i) = iind
|
|
|
|
end do sorting
|
|
!
|
|
end subroutine hpsort_eps
|
|
|
|
subroutine prepare_psi_tg(ibnd, nbnd, ngms, psi, tg_psi, dffts, gamma_only)
|
|
USE fft_param
|
|
USE fft_types
|
|
USE fft_helper_subroutines
|
|
implicit none
|
|
integer, intent(in) :: ibnd, nbnd, ngms
|
|
TYPE(fft_type_descriptor), intent(in) :: dffts
|
|
complex(DP) :: tg_psi(dffts%nnr_tg)
|
|
complex(DP) :: psi(ngms, nbnd)
|
|
logical, intent(in) :: gamma_only
|
|
integer ioff, idx, j, ntgrp, right_nnr
|
|
!
|
|
tg_psi(:) = ( 0.D0, 0.D0 )
|
|
ioff = 0
|
|
!
|
|
CALL tg_get_nnr( dffts, right_nnr )
|
|
ntgrp = fftx_ntgrp(dffts)
|
|
!
|
|
IF (gamma_only) THEN
|
|
DO idx = 1, 2*ntgrp, 2
|
|
!
|
|
! ... 2*ntgrp ffts at the same time
|
|
!
|
|
IF( idx + ibnd - 1 < nbnd ) THEN
|
|
DO j = 1, ngms
|
|
tg_psi(dffts%nl (j)+ioff)= psi(j,idx+ibnd-1)+&
|
|
(0.0d0,1.d0) * psi(j,idx+ibnd)
|
|
tg_psi(dffts%nlm(j)+ioff)=CONJG(psi(j,idx+ibnd-1) -&
|
|
(0.0d0,1.d0) * psi(j,idx+ibnd) )
|
|
END DO
|
|
ELSE IF( idx + ibnd - 1 == nbnd ) THEN
|
|
DO j = 1, ngms
|
|
tg_psi(dffts%nl (j)+ioff)= psi(j,idx+ibnd-1)
|
|
tg_psi(dffts%nlm(j)+ioff)=CONJG( psi(j,idx+ibnd-1) )
|
|
END DO
|
|
END IF
|
|
|
|
ioff = ioff + right_nnr
|
|
|
|
END DO
|
|
ELSE
|
|
!
|
|
DO idx = 1, ntgrp
|
|
IF( idx + ibnd - 1 <= nbnd ) THEN
|
|
!$omp parallel do
|
|
DO j = 1, ngms
|
|
! here we forget about igk
|
|
tg_psi(dffts%nl(j) +ioff) = psi(j,idx+ibnd-1)
|
|
ENDDO
|
|
!$omp end parallel do
|
|
ENDIF
|
|
ioff = ioff + right_nnr
|
|
ENDDO
|
|
END IF
|
|
end subroutine prepare_psi_tg
|
|
|
|
subroutine prepare_psi( ibnd, nbnd, ngms, psi, psic, dffts, gamma_only)
|
|
USE fft_param
|
|
USE fft_types
|
|
USE fft_helper_subroutines
|
|
implicit none
|
|
integer, intent(in) :: ibnd, nbnd, ngms
|
|
TYPE(fft_type_descriptor), intent(in) :: dffts
|
|
complex(DP) :: psic( dffts%nnr )
|
|
complex(DP) :: psi( ngms, nbnd )
|
|
logical, intent(in) :: gamma_only
|
|
integer :: j
|
|
psic(:) = (0.d0, 0.d0)
|
|
IF (gamma_only) THEN
|
|
IF (ibnd < nbnd) THEN
|
|
! two ffts at the same time
|
|
DO j = 1, ngms
|
|
psic(dffts%nl (j))= psi(j,ibnd) + (0.0d0,1.d0)*psi(j,ibnd+1)
|
|
psic(dffts%nlm(j))=conjg(psi(j,ibnd) - (0.0d0,1.d0)*psi(j,ibnd+1))
|
|
ENDDO
|
|
ELSE
|
|
DO j = 1, ngms
|
|
psic (dffts%nl (j)) = psi(j, ibnd)
|
|
psic (dffts%nlm(j)) = conjg(psi(j, ibnd))
|
|
ENDDO
|
|
ENDIF
|
|
ELSE
|
|
DO j = 1, ngms
|
|
! here we forget about igk
|
|
psic (dffts%nl (j)) = psi(j, ibnd)
|
|
END DO
|
|
END IF
|
|
end subroutine prepare_psi
|
|
|
|
subroutine accumulate_hpsi( ibnd, nbnd, ngms, hpsi, psic, dffts, gamma_only)
|
|
USE fft_types
|
|
USE fft_param
|
|
USE fft_helper_subroutines
|
|
implicit none
|
|
integer, intent(in) :: ibnd, nbnd, ngms
|
|
TYPE(fft_type_descriptor) :: dffts
|
|
complex(DP) :: psic( dffts%nnr )
|
|
complex(DP) :: hpsi( ngms, nbnd )
|
|
logical, intent(in) :: gamma_only
|
|
integer j
|
|
complex(DP) :: fp, fm
|
|
!
|
|
!
|
|
! addition to the total product
|
|
!
|
|
IF (gamma_only) THEN
|
|
IF (ibnd < nbnd) THEN
|
|
! two ffts at the same time
|
|
DO j = 1, ngms
|
|
fp = (psic (dffts%nl(j)) + psic (dffts%nlm(j)))*0.5d0
|
|
fm = (psic (dffts%nl(j)) - psic (dffts%nlm(j)))*0.5d0
|
|
hpsi (j, ibnd) = hpsi (j, ibnd) + &
|
|
cmplx( dble(fp), aimag(fm),kind=DP)
|
|
hpsi (j, ibnd+1) = hpsi (j, ibnd+1) + &
|
|
cmplx(aimag(fp),- dble(fm),kind=DP)
|
|
ENDDO
|
|
ELSE
|
|
DO j = 1, ngms
|
|
hpsi (j, ibnd) = hpsi (j, ibnd) + psic (dffts%nl(j))
|
|
ENDDO
|
|
ENDIF
|
|
ELSE
|
|
DO j = 1, ngms ! here we forget about igk_k
|
|
hpsi (j, ibnd) = hpsi (j, ibnd) + psic (dffts%nl(j))
|
|
ENDDO
|
|
END IF
|
|
!
|
|
end subroutine accumulate_hpsi
|
|
subroutine accumulate_hpsi_tg( ibnd, nbnd, ngms, hpsi, tg_psic, dffts, gamma_only)
|
|
USE fft_types
|
|
USE fft_param
|
|
USE fft_helper_subroutines
|
|
implicit none
|
|
integer, intent(in) :: ibnd, nbnd, ngms
|
|
TYPE(fft_type_descriptor) :: dffts
|
|
complex(DP) :: tg_psic( dffts%nnr_tg )
|
|
complex(DP) :: hpsi( ngms, nbnd )
|
|
logical, intent(in) :: gamma_only
|
|
integer ioff, idx, j, right_inc
|
|
complex(DP) :: fp, fm
|
|
!
|
|
! addition to the total product
|
|
!
|
|
ioff = 0
|
|
!
|
|
CALL tg_get_recip_inc( dffts, right_inc )
|
|
!
|
|
IF (gamma_only) THEN
|
|
DO idx = 1, 2*fftx_ntgrp(dffts), 2
|
|
!
|
|
IF( idx + ibnd - 1 < nbnd ) THEN
|
|
DO j = 1, ngms
|
|
fp= ( tg_psic( dffts%nl(j) + ioff ) + &
|
|
tg_psic( dffts%nlm(j) + ioff ) ) * 0.5d0
|
|
fm= ( tg_psic( dffts%nl(j) + ioff ) - &
|
|
tg_psic( dffts%nlm(j) + ioff ) ) * 0.5d0
|
|
hpsi (j, ibnd+idx-1) = hpsi (j, ibnd+idx-1) + &
|
|
cmplx( dble(fp), aimag(fm),kind=DP)
|
|
hpsi (j, ibnd+idx ) = hpsi (j, ibnd+idx ) + &
|
|
cmplx(aimag(fp),- dble(fm),kind=DP)
|
|
ENDDO
|
|
ELSEIF( idx + ibnd - 1 == nbnd ) THEN
|
|
DO j = 1, ngms
|
|
hpsi (j, ibnd+idx-1) = hpsi (j, ibnd+idx-1) + &
|
|
tg_psic( dffts%nl(j) + ioff )
|
|
ENDDO
|
|
ENDIF
|
|
!
|
|
ioff = ioff + right_inc
|
|
!
|
|
ENDDO
|
|
ELSE
|
|
DO idx = 1, fftx_ntgrp(dffts)
|
|
!
|
|
IF( idx + ibnd - 1 <= nbnd ) THEN
|
|
!$omp parallel do
|
|
DO j = 1, ngms
|
|
hpsi (j, ibnd+idx-1) = hpsi (j, ibnd+idx-1) + &
|
|
tg_psic( dffts%nl(j) + ioff )
|
|
! we forgot about igk above
|
|
ENDDO
|
|
!$omp end parallel do
|
|
ENDIF
|
|
!
|
|
ioff = ioff + right_inc
|
|
!
|
|
ENDDO
|
|
END IF
|
|
!
|
|
end subroutine accumulate_hpsi_tg
|