quantum-espresso/FFTXlib/test.f90

1387 lines
42 KiB
Fortran

!
! Copyright (C) Quantum ESPRESSO group
!
! This file is distributed under the terms of the
! GNU General Public License. See the file `License'
! in the root directory of the present distribution,
! or http://www.gnu.org/copyleft/gpl.txt .
!
! by P. Bonfa', F. Affinito and C. Cavazzoni, Cineca
! & S. de Gironcoli, SISSA
module timers
save
LOGICAL :: ignore_time = .true.
REAL*8 :: times(20) = 0.d0
end module
program test
!! This mini-app provides a tool for testing and benchmarking the FFT drivers
!! contained in the FFTXlib.
!!
!! The mini-app mimics the workload of vloc_psi (a charge-density transformation
!! from complex-to-real space and back and contribution to h).
!!
!! To compile the test program, once you have properly edit the make.sys file
!! included in the FFTXlib and type:
!!
!! make TEST
!!
!! N.B.: do not run the make command alone, otherwise the FFT times will
!! not be present in the final summary.
!!
!! Then you can run your FFT tests using command like:
!!
!! mpirun -np 4 ./fft_test.x -ecutwfc 80 -alat 20 -nbnd 128 -ntg 4 -gamma .true.
!!
!! or, in case of serial build
!!
!! ./fft_test.x -ecutwfc 80 -alat 20 -nbnd 128 -ntg 4
!!
!! Command line arguments:
!!
!!-ecutwfc Plane wave energy cut off
!!
!!-alat Lattice parameter
!!
!!-nbnd Number of bands (fft cycles)
!!
!!-ntg Number of task groups
!!
!!-gamma Enables gamma point trick. Should be about 2 times faster.
!!
!! Timings of different stages of execution are provided at the end of the
!! run.
!! In the present version, a preliminar implementation with non-blocking MPI
!! calls as been implemented. This version requires the precompilation flags
!! -D__NON_BLOCKING_SCATTER
!!
USE fft_types
USE stick_base
USE fft_parallel
USE fft_support
USE fft_helper_subroutines
USE fft_interfaces, ONLY:fwfft, invfft
USE timers
IMPLICIT NONE
!
TYPE(fft_type_descriptor) :: dfftp, dffts, dfft3d
!
TYPE(sticks_map) :: smap
INTEGER :: nx = 128
!! grid points along x (modified after)
INTEGER :: ny = 128
!! grid points along y (modified after)
INTEGER :: nz = 256
!! grid points along z (modified after)
!
INTEGER :: mype, npes, comm, root
!! MPI handles
INTEGER :: ntgs
!! number of taskgroups
INTEGER :: nbnd
!! number of bands
LOGICAL :: iope
!! I/O process
INTEGER :: ierr, i, j, ncount, ib
INTEGER :: incr=1, right_nr3
INTEGER :: ngw_, ngm_, ngs_
REAL*8 :: gcutm, gkcut, gcutms
REAL*8 :: ecutm, ecutw, ecutms
REAL*8 :: ecutrho
!! cut-off for density
REAL*8 :: ecutwfc
!! cut-off for the wave-function
REAL*8 :: tpiba, alat, alat_in
!! lattice parameters
REAL*8 :: time(100)
REAL*8 :: my_time(100)
REAL*8 :: time_min(100)
REAL*8 :: time_max(100)
REAL*8 :: time_avg(100)
REAL*8 :: wall
REAL*8 :: wall_avg
!
LOGICAL :: gamma_only = .false.
LOGICAL :: use_tg
!! if calculations require only gamma point
REAL*8 :: at(3, 3), bg(3, 3)
REAL(DP), PARAMETER :: pi = 3.14159265358979323846_DP
!
COMPLEX(DP), ALLOCATABLE :: tg_psic(:)
COMPLEX(DP), ALLOCATABLE :: psic(:)
COMPLEX(DP), ALLOCATABLE :: psi(:, :)
!! fake argument returned by the FFT
REAL(DP), ALLOCATABLE :: v(:)
REAL(DP), ALLOCATABLE :: tg_v(:)
COMPLEX(DP), ALLOCATABLE :: hpsi(:, :)
!! array representing the potential
INTEGER, ALLOCATABLE :: nls(:), nlsm(:)
INTEGER :: ngms, ngsx, ngms_g
INTEGER, ALLOCATABLE :: mill(:, :), nl(:), nlm(:), ig_l2g(:)
REAL(DP), ALLOCATABLE :: g(:, :), gg(:)
INTEGER :: ngm, ngmx, ngm_g, gstart
!
!
integer :: nargs
CHARACTER(LEN=80) :: arg
!
#if defined(_OPENMP)
INTEGER :: PROVIDED
#endif
!
! ........
!
! default parameter (32 water molecules)
!
ecutwfc = 80.0d0
ecutrho = 0.d0
alat_in = 18.65
ntgs = 1
nbnd = 1
!
nargs = command_argument_count()
do i = 1, nargs - 1
CALL get_command_argument(i, arg)
IF (TRIM(arg) == '-ecutrho') THEN
CALL get_command_argument(i + 1, arg)
READ (arg, *) ecutrho
END IF
IF (TRIM(arg) == '-ecutwfc') THEN
CALL get_command_argument(i + 1, arg)
READ (arg, *) ecutwfc
END IF
IF (TRIM(arg) == '-alat') THEN
CALL get_command_argument(i + 1, arg)
READ (arg, *) alat_in
END IF
IF (TRIM(arg) == '-ntg') THEN
CALL get_command_argument(i + 1, arg)
READ (arg, *) ntgs
END IF
IF (TRIM(arg) == '-nbnd') THEN
CALL get_command_argument(i + 1, arg)
READ (arg, *) nbnd
END IF
IF (TRIM(arg) == '-gamma') THEN
CALL get_command_argument(i + 1, arg)
READ (arg, *) gamma_only
END IF
end do
if (ecutrho == 0.d0) ecutrho = 4.0d0*ecutwfc
#if defined(__MPI)
#if defined(_OPENMP)
CALL MPI_Init_thread(MPI_THREAD_FUNNELED, PROVIDED, ierr)
#else
CALL MPI_Init(ierr)
#endif
CALL mpi_comm_rank(MPI_COMM_WORLD, mype, ierr)
CALL mpi_comm_size(MPI_COMM_WORLD, npes, ierr)
comm = MPI_COMM_WORLD
root = 0
IF (mype == root) THEN
iope = .true.
ELSE
iope = .false.
ENDIF
#else
mype = 0
npes = 1
comm = 0
ntgs = 1
root = 0
iope = .true.
#endif
!
! Broadcast input parameter first
!
CALL MPI_BCAST(ecutrho, 1, MPI_DOUBLE_PRECISION, 0, MPI_COMM_WORLD, ierr)
CALL MPI_BCAST(ecutwfc, 1, MPI_DOUBLE_PRECISION, 0, MPI_COMM_WORLD, ierr)
CALL MPI_BCAST(alat_in, 1, MPI_DOUBLE_PRECISION, 0, MPI_COMM_WORLD, ierr)
CALL MPI_BCAST(ntgs, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr)
CALL MPI_BCAST(nbnd, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr)
!
!
! -------- INITIALIZE DIMENSIONS AND DESCRIPTORS
!
!
ecutw = ecutwfc
! dual
ecutm = ecutrho
ecutms = ecutrho
!
at(1, :) = (/0.5d0, 1.0d0, 0.0d0/)
at(2, :) = (/0.5d0, 0.0d0, 0.5d0/)
at(3, :) = (/0.0d0, 0.5d0, 1.5d0/)
!
at = at*alat_in
!
alat = SQRT(at(1, 1)**2 + at(2, 1)**2 + at(3, 1)**2)
!
tpiba = 2.0d0*pi/alat
!
gcutm = ecutm / tpiba**2 ! potential cut-off
gcutms= ecutms / tpiba**2 ! smooth mesh cut-off
gkcut = ecutw / tpiba**2 ! wave function cut-off
!
if( mype == 0 ) then
write (*, *) '+-----------------------------------+'
write (*, *) '| QE FFT |'
write (*, *) '| testing & timing |'
write (*, *) '| by Carlo Cavazzoni |'
write (*, *) '+-----------------------------------+'
write (*, *)
write (*, *) 'alat = ', alat
write (*, *) 'Ecutwfc = ', ecutw
write (*, *) 'Ecutrho = ', ecutm
write (*, *) 'Ecuts = ', ecutms
write (*, *) 'Gcutrho = ', SQRT(gcutm)
write (*, *) 'Gcuts = ', SQRT(gcutms)
write (*, *) 'Gcutwfc = ', SQRT(gkcut)
write (*, *) 'Num bands = ', nbnd
write (*, *) 'Num procs = ', npes
write (*, *) 'Num Task Group = ', ntgs
write (*, *) 'Gamma trick = ', gamma_only
end if
!
at = at/alat
!
call recips(at(1, 1), at(1, 2), at(1, 3), bg(1, 1), bg(1, 2), bg(1, 3))
!
nx = 2*int(sqrt(gcutm)*sqrt(at(1, 1)**2 + at(2, 1)**2 + at(3, 1)**2)) + 1
ny = 2*int(sqrt(gcutm)*sqrt(at(1, 2)**2 + at(2, 2)**2 + at(3, 2)**2)) + 1
nz = 2*int(sqrt(gcutm)*sqrt(at(1, 3)**2 + at(2, 3)**2 + at(3, 3)**2)) + 1
!
if (mype == 0) then
write (*, *) 'nx = ', nx, ' ny = ', ny, ' nz = ', nz
end if
!
IF (gamma_only) incr = 2
dffts%have_task_groups = (ntgs > 1)
use_tg = dffts%have_task_groups
!
CALL fft_type_init(dffts, smap, "wave", gamma_only, .true., comm, at, bg, gkcut, gcutms/gkcut, nyfft=ntgs)
CALL fft_type_init(dfftp, smap, "rho", gamma_only, .true., comm, at, bg, gcutm, 4.d0, nyfft=ntgs)
!
if (mype == 0) then
write (*, *) 'dffts: nr1 = ', dffts%nr1, ' nr2 = ', dffts%nr2, ' nr3 = ', dffts%nr3
write (*, *) ' nr1x= ', dffts%nr1x, ' nr2x= ', dffts%nr2x, ' nr3x= ', dffts%nr3x
end if
!
ngw_ = dffts%nwl(dffts%mype + 1)
ngs_ = dffts%ngl(dffts%mype + 1)
ngm_ = dfftp%ngl(dfftp%mype + 1)
!
IF (gamma_only) THEN
ngw_ = (ngw_ + 1)/2
ngs_ = (ngs_ + 1)/2
ngm_ = (ngm_ + 1)/2
END IF
!
ngms = ngs_
CALL MPI_ALLREDUCE(ngms, ngsx, 1, MPI_INTEGER, MPI_MAX, MPI_COMM_WORLD, ierr)
CALL MPI_ALLREDUCE(ngms, ngms_g, 1, MPI_INTEGER, MPI_SUM, MPI_COMM_WORLD, ierr)
ngm = ngm_
CALL MPI_ALLREDUCE(ngm, ngmx, 1, MPI_INTEGER, MPI_MAX, MPI_COMM_WORLD, ierr)
CALL MPI_ALLREDUCE(ngm, ngm_g, 1, MPI_INTEGER, MPI_SUM, MPI_COMM_WORLD, ierr)
!
! -------- ALLOCATE
!
ALLOCATE (psic(dffts%nnr))
ALLOCATE (psi(ngms, nbnd))
ALLOCATE (hpsi(ngms, nbnd))
ALLOCATE (v(dffts%nnr))
ALLOCATE (nls(ngms))
ALLOCATE (nlsm(ngms))
ALLOCATE (nl(ngm))
ALLOCATE (nlm(ngm))
ALLOCATE (mill(3, ngm))
ALLOCATE (g(3, ngm))
ALLOCATE (gg(ngm))
ALLOCATE (ig_l2g(ngm))
!
! -------- GENERATE G-VECTORS
!
call ggen(gamma_only, at, bg, .true., ngm, ngms, ngm_g, ngms_g, mill, &
& nl, nls, nlm, nlsm, gg, g, ig_l2g, gstart, gcutm, gcutms, dfftp, dffts)
!
! -------- RESET TIMERS
!
time = 0.0d0
my_time = 0.0d0
time_min = 0.0d0
time_max = 0.0d0
time_avg = 0.0d0
!
! Test FFT for wave functions - First calls may be biased by MPI and FFT initialization
!
CALL MPI_BARRIER(MPI_COMM_WORLD, ierr)
!
IF (use_tg) THEN
ALLOCATE (tg_psic(dffts%nnr_tg))
CALL invfft('tgWave', tg_psic, dffts)
DEALLOCATE (tg_psic)
ELSE
CALL invfft('Wave', psic, dffts)
END IF
!
IF (use_tg) THEN
ALLOCATE (tg_psic(dffts%nnr_tg))
CALL fwfft('tgWave', tg_psic, dffts)
DEALLOCATE (tg_psic)
ELSE
CALL fwfft('Wave', psic, dffts)
END IF
! Now for real,
!
! -------- RECORD TIMES
!
wall = MPI_WTIME()
ignore_time = .false.
!
! -------- INITIALIZE WAVE FUNCTIONS psi
!
psi = 0.0d0
!
! -------- INITIALIZE POTENTIAL v
!
v = 1.0d0
!
IF (use_tg) THEN
!
ALLOCATE (tg_v(dffts%nnr_tg))
ALLOCATE (tg_psic(dffts%nnr_tg))
!
CALL tg_gather(dffts, v, tg_v)
! incr is already set to 2 for gamma_only
incr = incr*fftx_ntgrp(dffts)
!
ENDIF
!
! Execute FFT calls once more and Take time
!
ncount = 0
!
!
!
DO ib = 1, nbnd, incr
!
time(1) = MPI_WTIME()
!
IF (use_tg) THEN
!
call prepare_psi_tg(ib, nbnd, ngms, psi, tg_psic, nls, nlsm, dffts, gamma_only)
time(2) = MPI_WTIME()
!
CALL invfft('tgWave', tg_psic, dffts);
time(3) = MPI_WTIME()
!
CALL tg_get_group_nr3(dffts, right_nr3)
!
DO j = 1, dffts%nr1x*dffts%nr2x*right_nr3
tg_psic(j) = tg_psic(j)*tg_v(j)
ENDDO
!
time(4) = MPI_WTIME()
!
CALL fwfft('tgWave', tg_psic, dffts);
time(5) = MPI_WTIME()
!
CALL accumulate_hpsi_tg(ib, nbnd, ngms, hpsi, tg_psic, nls, nlsm, dffts, gamma_only)
time(6) = MPI_WTIME()
ELSE
!
call prepare_psi(ib, nbnd, ngms, psi, psic, nls, nlsm, dffts, gamma_only)
time(2) = MPI_WTIME()
!
CALL invfft('Wave', psic, dffts); time(3) = MPI_WTIME()
!
DO j = 1, dffts%nnr
psic(j) = psic(j)*v(j)
ENDDO
time(4) = MPI_WTIME()
!
CALL fwfft('Wave', psic, dffts);
time(5) = MPI_WTIME()
!
CALL accumulate_hpsi(ib, nbnd, ngms, hpsi, psic, nls, nlsm, dffts, gamma_only)
time(6) = MPI_WTIME()
!
ENDIF
!
do i = 2, 6
my_time(i) = my_time(i) + (time(i) - time(i - 1))
end do
!
ncount = ncount + 1
!
enddo
!
wall = MPI_WTIME() - wall
DEALLOCATE (psic)
DEALLOCATE (hpsi)
IF (use_tg) THEN
!
DEALLOCATE (tg_psic)
DEALLOCATE (tg_v)
!
ENDIF
#if defined(__MPI)
CALL MPI_ALLREDUCE(my_time, time_min, 10, MPI_DOUBLE_PRECISION, MPI_MIN, MPI_COMM_WORLD, ierr)
CALL MPI_ALLREDUCE(my_time, time_max, 10, MPI_DOUBLE_PRECISION, MPI_MAX, MPI_COMM_WORLD, ierr)
CALL MPI_ALLREDUCE(my_time, time_avg, 10, MPI_DOUBLE_PRECISION, MPI_SUM, MPI_COMM_WORLD, ierr)
CALL MPI_ALLREDUCE(wall, wall_avg, 1, MPI_DOUBLE_PRECISION, MPI_SUM, MPI_COMM_WORLD, ierr)
#else
time_min = time
time_max = time
time_avg = time
#endif
time_avg = time_avg / npes
wall_avg = wall_avg / npes
if( mype == 0 ) then
write(*,*) '**** QE 3DFFT Timing ****'
write(*,*) 'grid size = ', dffts%nr1, dffts%nr2, dffts%nr3
write(*,*) 'num proc = ', npes
write(*,*) 'num band = ', nbnd
write(*,*) 'num task group = ', ntgs
write(*,*) 'num fft cycles = ', ncount
write(*,100)
write(*,1)
write(*,100)
write(*,2) time_min(2), time_max(2), time_avg(2)
write(*,3) time_min(3), time_max(3), time_avg(3)
write(*,4) time_min(4), time_max(4), time_avg(4)
write(*,5) time_min(5), time_max(5), time_avg(5)
write(*,6) time_min(6), time_max(6), time_avg(6)
write(*,7) wall
write(*,100)
100 FORMAT(' +--------------------+----------------+-----------------+----------------+' )
1 FORMAT(' |FFT TEST subroutine | sec. min | sec. max | sec. avg |' )
2 FORMAT(' |prepare_psi | ', D14.5, ' | ', D14.3, ' | ', D14.3, ' |' )
3 FORMAT(' |invfft | ', D14.5, ' | ', D14.3, ' | ', D14.3, ' |' )
4 FORMAT(' |workload | ', D14.5, ' | ', D14.3, ' | ', D14.3 , ' |')
5 FORMAT(' |fwfft | ', D14.5, ' | ', D14.3, ' | ', D14.3 , ' |')
6 FORMAT(' |accumulate_hpsi | ', D14.5, ' | ', D14.3, ' | ', D14.3 , ' |')
7 FORMAT(' |wall time | ', D14.5, ' |')
end if
! now print FFT clocks
call print_clock(mype, npes, ncount)
CALL fft_type_deallocate(dffts)
CALL fft_type_deallocate(dfftp)
CALL fft_type_deallocate(dfft3d)
#if defined(__MPI)
CALL mpi_finalize(ierr)
#endif
contains
!
! Copyright (C) 2001 PWSCF group
! This file is distributed under the terms of the
! GNU General Public License. See the file `License'
! in the root directory of the present distribution,
! or http://www.gnu.org/copyleft/gpl.txt .
!
!
!---------------------------------------------------------------------
subroutine recips (a1, a2, a3, b1, b2, b3)
!---------------------------------------------------------------------
!
! This routine generates the reciprocal lattice vectors b1,b2,b3
! given the real space vectors a1,a2,a3. The b's are units of 2 pi/a.
!
! first the input variables
!
implicit none
real(DP) :: a1 (3), a2 (3), a3 (3), b1 (3), b2 (3), b3 (3)
! input: first direct lattice vector
! input: second direct lattice vector
! input: third direct lattice vector
! output: first reciprocal lattice vector
! output: second reciprocal lattice vector
! output: third reciprocal lattice vector
!
! then the local variables
!
real(DP) :: den, s
! the denominator
! the sign of the permutations
integer :: iperm, i, j, k, l, ipol
! counter on the permutations
!\
! Auxiliary variables
!/
!
! Counter on the polarizations
!
! first we compute the denominator
!
den = 0
i = 1
j = 2
k = 3
s = 1.d0
100 do iperm = 1, 3
den = den + s * a1 (i) * a2 (j) * a3 (k)
l = i
i = j
j = k
k = l
enddo
i = 2
j = 1
k = 3
s = - s
if (s.lt.0.d0) goto 100
!
! here we compute the reciprocal vectors
!
i = 1
j = 2
k = 3
do ipol = 1, 3
b1 (ipol) = (a2 (j) * a3 (k) - a2 (k) * a3 (j) ) / den
b2 (ipol) = (a3 (j) * a1 (k) - a3 (k) * a1 (j) ) / den
b3 (ipol) = (a1 (j) * a2 (k) - a1 (k) * a2 (j) ) / den
l = i
i = j
j = k
k = l
enddo
return
end subroutine recips
end program test
subroutine start_clock(label)
use timers
use mpi, ONLY:MPI_WTIME
implicit none
character(len=*) :: label
if (ignore_time) RETURN
select case (label)
case ("cft_1z")
times(1) = times(1) - MPI_WTIME()
case ("cft_2xy")
times(2) = times(2) - MPI_WTIME()
case ("cgather")
times(3) = times(3) - MPI_WTIME()
case ("cgather_grid")
times(4) = times(4) - MPI_WTIME()
case ("cscatter_grid")
times(5) = times(5) - MPI_WTIME()
case ("cscatter_sym")
times(6) = times(6) - MPI_WTIME()
case ("fft")
times(7) = times(7) - MPI_WTIME()
case ("fft_scatt_tg")
times(8) = times(8) - MPI_WTIME()
case ("fft_scatt_xy")
times(9) = times(9) - MPI_WTIME()
case ("fft_scatt_yz")
times(10) = times(10) - MPI_WTIME()
case ("fftb")
times(11) = times(11) - MPI_WTIME()
case ("fftc")
times(12) = times(12) - MPI_WTIME()
case ("fftcw")
times(13) = times(13) - MPI_WTIME()
case ("ffts")
times(14) = times(14) - MPI_WTIME()
case ("fftw")
times(15) = times(15) - MPI_WTIME()
case ("rgather_grid")
times(16) = times(16) - MPI_WTIME()
case ("rscatter_grid")
times(17) = times(17) - MPI_WTIME()
case ("fft_scatter") !alt version compatibility
times(18) = times(18) - MPI_WTIME()
case ("ALLTOALL") !alt version compatibility
times(19) = times(19) - MPI_WTIME()
case default
write (*, *) "Error, label not found", label
end select
end subroutine
subroutine stop_clock(label)
use timers
use mpi, ONLY:MPI_WTIME
implicit none
character(len=*) :: label
if (ignore_time) RETURN
select case (label)
case ("cft_1z")
times(1) = times(1) + MPI_WTIME()
case ("cft_2xy")
times(2) = times(2) + MPI_WTIME()
case ("cgather")
times(3) = times(3) + MPI_WTIME()
case ("cgather_grid")
times(4) = times(4) + MPI_WTIME()
case ("cscatter_grid")
times(5) = times(5) + MPI_WTIME()
case ("cscatter_sym")
times(6) = times(6) + MPI_WTIME()
case ("fft")
times(7) = times(7) + MPI_WTIME()
case ("fft_scatt_tg")
times(8) = times(8) + MPI_WTIME()
case ("fft_scatt_xy")
times(9) = times(9) + MPI_WTIME()
case ("fft_scatt_yz")
times(10) = times(10) + MPI_WTIME()
case ("fftb")
times(11) = times(11) + MPI_WTIME()
case ("fftc")
times(12) = times(12) + MPI_WTIME()
case ("fftcw")
times(13) = times(13) + MPI_WTIME()
case ("ffts")
times(14) = times(14) + MPI_WTIME()
case ("fftw")
times(15) = times(15) + MPI_WTIME()
case ("rgather_grid")
times(16) = times(16) + MPI_WTIME()
case ("rscatter_grid")
times(17) = times(17) + MPI_WTIME()
case ("fft_scatter") !alt version compatibility
times(18) = times(18) + MPI_WTIME()
case ("ALLTOALL") !alt version compatibility
times(19) = times(19) + MPI_WTIME()
case default
write (*, *) "Error, label not found", label
end select
end subroutine
!
subroutine print_clock(mype, npes, ncount)
use timers
use mpi
implicit none
integer, intent(in) :: mype, npes, ncount
REAL*8 :: time_min(20)
REAL*8 :: time_max(20)
REAL*8 :: time_avg(20)
integer :: ierr
#if defined(__MPI)
CALL MPI_ALLREDUCE(times, time_min, 20, MPI_DOUBLE_PRECISION, MPI_MIN, MPI_COMM_WORLD, ierr)
CALL MPI_ALLREDUCE(times, time_max, 20, MPI_DOUBLE_PRECISION, MPI_MAX, MPI_COMM_WORLD, ierr)
CALL MPI_ALLREDUCE(times, time_avg, 20, MPI_DOUBLE_PRECISION, MPI_SUM, MPI_COMM_WORLD, ierr)
time_avg = time_avg/npes
#else
time_min(:) = times(:)
time_max(:) = times(:)
time_avg(:) = times(:)
#endif
if (mype == 0) then
write (*, 10100)
write (*, 101)
write (*, 10100)
if (times(1) > 0.d0) write (*, 102) time_min(1), time_max(1), time_avg(1)
if (times(2) > 0.d0) write (*, 103) time_min(2), time_max(2), time_avg(2)
if (times(3) > 0.d0) write (*, 104) time_min(3), time_max(3), time_avg(3)
if (times(4) > 0.d0) write (*, 105) time_min(4), time_max(4), time_avg(4)
if (times(5) > 0.d0) write (*, 106) time_min(5), time_max(5), time_avg(5)
if (times(6) > 0.d0) write (*, 107) time_min(6), time_max(6), time_avg(6)
if (times(7) > 0.d0) write (*, 108) time_min(7), time_max(7), time_avg(7)
if (times(8) > 0.d0) write (*, 109) time_min(8), time_max(8), time_avg(8)
if (times(9) > 0.d0) write (*, 1010) time_min(9), time_max(9), time_avg(9)
if (times(10) > 0.d0) write (*, 1011) time_min(10), time_max(10), time_avg(10)
if (times(11) > 0.d0) write (*, 1012) time_min(11), time_max(11), time_avg(11)
if (times(12) > 0.d0) write (*, 1013) time_min(12), time_max(12), time_avg(12)
if (times(13) > 0.d0) write (*, 1014) time_min(13), time_max(13), time_avg(13)
if (times(14) > 0.d0) write (*, 1015) time_min(14), time_max(14), time_avg(14)
if (times(15) > 0.d0) write (*, 1016) time_min(15), time_max(15), time_avg(15)
if (times(16) > 0.d0) write (*, 1017) time_min(16), time_max(16), time_avg(16)
if (times(17) > 0.d0) write (*, 1018) time_min(17), time_max(17), time_avg(17)
if (times(18) > 0.d0) write (*, 1019) time_min(18), time_max(18), time_avg(18)
if (times(19) > 0.d0) write (*, 1020) time_min(19), time_max(19), time_avg(19)
write (*, 10100)
end if
10100 FORMAT(' +--------------------+----------------+-----------------+----------------+' )
101 FORMAT(' |FFT subroutine | sec. min | sec. max | sec. avg |' )
102 FORMAT(' |cft_1z | ', D14.5, ' | ', D14.3, ' | ', D14.3, ' |' )
103 FORMAT(' |cft_2xy | ', D14.5, ' | ', D14.3, ' | ', D14.3, ' |' )
104 FORMAT(' |cgather | ', D14.5, ' | ', D14.3, ' | ', D14.3 , ' |')
105 FORMAT(' |cgather_grid | ', D14.5, ' | ', D14.3, ' | ', D14.3 , ' |')
106 FORMAT(' |cscatter_grid | ', D14.5, ' | ', D14.3, ' | ', D14.3 , ' |')
107 FORMAT(' |cscatter_sym | ', D14.5, ' | ', D14.3, ' | ', D14.3 , ' |')
108 FORMAT(' |fft | ', D14.5, ' | ', D14.3, ' | ', D14.3 , ' |')
109 FORMAT(' |fft_scatt_tg | ', D14.5, ' | ', D14.3, ' | ', D14.3 , ' |')
1010 FORMAT(' |fft_scatt_xy | ', D14.5, ' | ', D14.3, ' | ', D14.3 , ' |')
1011 FORMAT(' |fft_scatt_yz | ', D14.5, ' | ', D14.3, ' | ', D14.3 , ' |')
1012 FORMAT(' |fftb | ', D14.5, ' | ', D14.3, ' | ', D14.3 , ' |')
1013 FORMAT(' |fftc | ', D14.5, ' | ', D14.3, ' | ', D14.3 , ' |')
1014 FORMAT(' |fftcw | ', D14.5, ' | ', D14.3, ' | ', D14.3 , ' |')
1015 FORMAT(' |ffts | ', D14.5, ' | ', D14.3, ' | ', D14.3 , ' |')
1016 FORMAT(' |fftw | ', D14.5, ' | ', D14.3, ' | ', D14.3 , ' |')
1017 FORMAT(' |rgather_grid | ', D14.5, ' | ', D14.3, ' | ', D14.3 , ' |')
1018 FORMAT(' |rscatter_grid | ', D14.5, ' | ', D14.3, ' | ', D14.3 , ' |')
1019 FORMAT(' |fft_scatter | ', D14.5, ' | ', D14.3, ' | ', D14.3 , ' |')
1020 FORMAT(' |ALLTOALL | ', D14.5, ' | ', D14.3, ' | ', D14.3 , ' |')
end subroutine
!
!-----------------------------------------------------------------------
!-----------------------------------------------------------------------
SUBROUTINE ggen ( gamma_only, at, bg, no_global_sort, ngm, ngms, ngm_g, ngms_g, mill, &
& nl, nls, nlm, nlsm, gg, g, ig_l2g, gstart, gcutm, gcutms, dfftp, dffts )
!----------------------------------------------------------------------
!
! This routine generates all the reciprocal lattice vectors
! contained in the sphere of radius gcutm. Furthermore it
! computes the indices nl which give the correspondence
! between the fft mesh points and the array of g vectors.
!
USE fft_types
USE fft_param
!
IMPLICIT NONE
!
REAL(DP), PARAMETER :: eps8 = 1.0E-8_DP
!
LOGICAL, INTENT(IN) :: gamma_only
REAL(DP), INTENT(IN) :: at(3,3), bg(3,3), gcutm, gcutms
LOGICAL, INTENT(IN) :: no_global_sort
! if no_global_sort is present (and it is true) G vectors are sorted only
! locally and not globally. In this case no global array needs to be
! allocated and sorted: saves memory and a lot of time for large systems.
INTEGER :: ngm, ngms, ngm_g, ngms_g, gstart
INTEGER :: mill(3,ngm), nlm(ngm), nlsm(ngms), ig_l2g(ngm_g)
INTEGER :: nl(ngm), nls(ngms)
REAL(DP) :: gg(ngm), g(3,ngm)
TYPE(fft_type_descriptor) :: dfftp, dffts
!
! here a few local variables
!
REAL(DP) :: t (3), tt
INTEGER :: ngm_save, ngms_save, n1, n2, n3, n1s, n2s, n3s, ngm_offset, ngm_max, ngms_max
INTEGER :: ierr
!
REAL(DP), ALLOCATABLE :: g2sort_g(:)
! array containing all g vectors, on all processors: replicated data
! when no_global_sort is present (and it is true) only g vectors for the current processor are stored
INTEGER, ALLOCATABLE :: mill_g(:,:), mill_unsorted(:,:)
! array containing all g vectors generators, on all processors: replicated data
! when no_global_sort is present (and it is true) only g vectors for the current processor are stored
INTEGER, ALLOCATABLE :: igsrt(:)
!
INTEGER :: m1, m2, mc
INTEGER :: ni, nj, nk, i, j, k, ng
INTEGER :: mype, npe, comm
LOGICAL :: global_sort
INTEGER, ALLOCATABLE :: ngmpe(:)
!
global_sort = .NOT. no_global_sort
!
comm = dfftp%comm
mype = dfftp%mype
npe = dfftp%nproc
!
IF( .NOT. global_sort ) THEN
ALLOCATE( ngmpe( npe ) )
ngmpe = 0
ngm_max = ngm
ngms_max = ngms
ELSE
ngm_max = ngm_g
ngms_max = ngms_g
END IF
!
! save current value of ngm and ngms
!
ngm_save = ngm
ngms_save = ngms
!
ngm = 0
ngms = 0
!
! counters
!
! set the total number of fft mesh points and and initial value of gg
! The choice of gcutm is due to the fact that we have to order the
! vectors after computing them.
!
gg(:) = gcutm + 1.d0
!
! and computes all the g vectors inside a sphere
!
ALLOCATE( mill_g( 3, ngm_max ),mill_unsorted( 3, ngm_max ) )
ALLOCATE( igsrt( ngm_max ) )
ALLOCATE( g2sort_g( ngm_max ) )
!
g2sort_g(:) = 1.0d20
!
! max miller indices (same convention as in module stick_set)
!
ni = (dfftp%nr1-1)/2
nj = (dfftp%nr2-1)/2
nk = (dfftp%nr3-1)/2
!
iloop: DO i = -ni, ni
!
! gamma-only: exclude space with x < 0
!
IF ( gamma_only .and. i < 0) CYCLE iloop
jloop: DO j = -nj, nj
!
! gamma-only: exclude plane with x = 0, y < 0
!
IF ( gamma_only .and. i == 0 .and. j < 0) CYCLE jloop
IF( .NOT. global_sort ) THEN
m1 = mod (i, dfftp%nr1) + 1
IF (m1 < 1) m1 = m1 + dfftp%nr1
m2 = mod (j, dfftp%nr2) + 1
IF (m2 < 1) m2 = m2 + dfftp%nr2
mc = m1 + (m2 - 1) * dfftp%nr1x
IF ( dfftp%isind ( mc ) == 0) CYCLE jloop
END IF
kloop: DO k = -nk, nk
!
! gamma-only: exclude line with x = 0, y = 0, z < 0
!
IF ( gamma_only .and. i == 0 .and. j == 0 .and. k < 0) CYCLE kloop
t(:) = i * bg (:,1) + j * bg (:,2) + k * bg (:,3)
!tt = sum(t(:)**2)
tt = t(1)**2+t(2)**2+t(3)**2
IF (tt <= gcutm) THEN
ngm = ngm + 1
IF (tt <= gcutms) ngms = ngms + 1
IF (ngm > ngm_max) CALL fftx_error__ ('ggen 1', 'too many g-vectors', ngm)
mill_unsorted( :, ngm ) = (/ i,j,k /)
IF ( tt > eps8 ) THEN
g2sort_g(ngm) = tt
ELSE
g2sort_g(ngm) = 0.d0
ENDIF
ENDIF
ENDDO kloop
ENDDO jloop
ENDDO iloop
IF( .NOT. global_sort ) THEN
ngmpe( mype + 1 ) = ngm
CALL MPI_ALLREDUCE( MPI_IN_PLACE, ngmpe, 1, MPI_INTEGER, MPI_SUM, comm, ierr )
END IF
IF (ngm /= ngm_max) &
CALL fftx_error__ ('ggen', 'g-vectors missing !', abs(ngm - ngm_max))
IF (ngms /= ngms_max) &
CALL fftx_error__ ('ggen', 'smooth g-vectors missing !', abs(ngms - ngms_max))
igsrt(1) = 0
IF( .NOT. global_sort ) THEN
CALL hpsort_eps( ngm, g2sort_g, igsrt, eps8 )
ELSE
CALL hpsort_eps( ngm_g, g2sort_g, igsrt, eps8 )
END IF
mill_g(1,:) = mill_unsorted(1,igsrt(:))
mill_g(2,:) = mill_unsorted(2,igsrt(:))
mill_g(3,:) = mill_unsorted(3,igsrt(:))
DEALLOCATE( g2sort_g, igsrt, mill_unsorted )
IF( .NOT. global_sort ) THEN
! compute adeguate offsets in order to avoid overlap between
! g vectors once they are gathered on a single (global) array
!
ngm_offset = 0
DO ng = 1, mype
ngm_offset = ngm_offset + ngmpe( ng )
END DO
END IF
ngm = 0
ngms = 0
!
ngloop: DO ng = 1, ngm_max
i = mill_g(1, ng)
j = mill_g(2, ng)
k = mill_g(3, ng)
#if defined(__MPI)
IF( global_sort ) THEN
m1 = mod (i, dfftp%nr1) + 1
IF (m1 < 1) m1 = m1 + dfftp%nr1
m2 = mod (j, dfftp%nr2) + 1
IF (m2 < 1) m2 = m2 + dfftp%nr2
mc = m1 + (m2 - 1) * dfftp%nr1x
IF ( dfftp%isind ( mc ) == 0) CYCLE ngloop
END IF
#endif
ngm = ngm + 1
! Here map local and global g index !!!
! N.B. the global G vectors arrangement depends on the number of processors
!
IF( .NOT. global_sort ) THEN
ig_l2g( ngm ) = ng + ngm_offset
ELSE
ig_l2g( ngm ) = ng
END IF
g (1:3, ngm) = i * bg (:, 1) + j * bg (:, 2) + k * bg (:, 3)
gg (ngm) = sum(g (1:3, ngm)**2)
IF (gg (ngm) <= gcutms) ngms = ngms + 1
IF (ngm > ngm_save) CALL fftx_error__ ('ggen 2', 'too many g-vectors', ngm)
ENDDO ngloop
IF (ngm /= ngm_save) &
CALL fftx_error__ ('ggen', 'g-vectors (ngm) missing !', abs(ngm - ngm_save))
IF (ngms /= ngms_save) &
CALL fftx_error__ ('ggen', 'g-vectors (ngms) missing !', abs(ngm - ngms_save))
!
! determine first nonzero g vector
!
IF (gg(1).le.eps8) THEN
gstart=2
ELSE
gstart=1
ENDIF
!
! Now set nl and nls with the correct fft correspondence
!
DO ng = 1, ngm
n1 = nint (sum(g (:, ng) * at (:, 1))) + 1
mill (1,ng) = n1 - 1
n1s = n1
IF (n1<1) n1 = n1 + dfftp%nr1
IF (n1s<1) n1s = n1s + dffts%nr1
n2 = nint (sum(g (:, ng) * at (:, 2))) + 1
mill (2,ng) = n2 - 1
n2s = n2
IF (n2<1) n2 = n2 + dfftp%nr2
IF (n2s<1) n2s = n2s + dffts%nr2
n3 = nint (sum(g (:, ng) * at (:, 3))) + 1
mill (3,ng) = n3 - 1
n3s = n3
IF (n3<1) n3 = n3 + dfftp%nr3
IF (n3s<1) n3s = n3s + dffts%nr3
IF (n1>dfftp%nr1 .or. n2>dfftp%nr2 .or. n3>dfftp%nr3) &
CALL fftx_error__('ggen','Mesh too small?',ng)
#if defined (__MPI) && !defined (__USE_3D_FFT)
nl (ng) = n3 + ( dfftp%isind (n1 + (n2 - 1) * dfftp%nr1x) - 1) * dfftp%nr3x
IF (ng <= ngms) &
nls (ng) = n3s + ( dffts%isind (n1s+(n2s-1)*dffts%nr1x) - 1 ) * dffts%nr3x
#else
nl (ng) = n1 + (n2 - 1) * dfftp%nr1x + (n3 - 1) * dfftp%nr1x * dfftp%nr2x
IF (ng <= ngms) &
nls (ng) = n1s + (n2s - 1) * dffts%nr1x + (n3s - 1) * dffts%nr1x * dffts%nr2x
#endif
ENDDO
!
DEALLOCATE( mill_g )
IF ( gamma_only) CALL index_minusg( ngm, ngms, nlm, nlsm, mill, dfftp, dffts )
IF( ALLOCATED( ngmpe ) ) DEALLOCATE( ngmpe )
END SUBROUTINE ggen
!
!-----------------------------------------------------------------------
SUBROUTINE index_minusg(ngm, ngms, nlm, nlsm, mill, dfftp, dffts)
!----------------------------------------------------------------------
!
! compute indices nlm and nlms giving the correspondence
! between the fft mesh points and -G (for gamma-only calculations)
!
USE fft_types
!
IMPLICIT NONE
!
TYPE(fft_type_descriptor) :: dfftp, dffts
INTEGER :: ngm, ngms
INTEGER :: mill(3, ngm), nlm(ngm), nlsm(ngms)
!
INTEGER :: n1, n2, n3, n1s, n2s, n3s, ng
!
DO ng = 1, ngm
n1 = -mill(1, ng) + 1
n1s = n1
IF (n1 < 1) THEN
n1 = n1 + dfftp%nr1
n1s = n1s+dffts%nr1
END IF
n2 = -mill(2, ng) + 1
n2s = n2
IF (n2 < 1) THEN
n2 = n2 + dfftp%nr2
n2s = n2s+dffts%nr2
END IF
n3 = -mill(3, ng) + 1
n3s = n3
IF (n3 < 1) THEN
n3 = n3 + dfftp%nr3
n3s = n3s+dffts%nr3
END IF
IF (n1 > dfftp%nr1 .or. n2 > dfftp%nr2 .or. n3 > dfftp%nr3) THEN
CALL fftx_error__('index_minusg', 'Mesh too small?', ng)
ENDIF
#if defined (__MPI) && !defined (__USE_3D_FFT)
nlm(ng) = n3 + (dfftp%isind(n1 + (n2 - 1)*dfftp%nr1x) - 1)*dfftp%nr3x
IF (ng <= ngms) &
nlsm(ng) = n3s+(dffts%isind(n1s+(n2s-1)*dffts%nr1x) - 1)*dffts%nr3x
#else
nlm(ng) = n1 + (n2 - 1)*dfftp%nr1x+(n3 - 1)*dfftp%nr1x*dfftp%nr2x
IF (ng <= ngms) &
nlsm(ng) = n1s+(n2s-1)*dffts%nr1x+(n3s-1)*dffts%nr1x*dffts%nr2x
#endif
ENDDO
END SUBROUTINE index_minusg
!
! Copyright (C) 2001 PWSCF group
! This file is distributed under the terms of the
! GNU General Public License. See the file `License'
! in the root directory of the present distribution,
! or http://www.gnu.org/copyleft/gpl.txt .
!
!---------------------------------------------------------------------
subroutine hpsort_eps(n, ra, ind, eps)
!---------------------------------------------------------------------
! sort an array ra(1:n) into ascending order using heapsort algorithm,
! and considering two elements being equal if their values differ
! for less than "eps".
! n is input, ra is replaced on output by its sorted rearrangement.
! create an index table (ind) by making an exchange in the index array
! whenever an exchange is made on the sorted data array (ra).
! in case of equal values in the data array (ra) the values in the
! index array (ind) are used to order the entries.
! if on input ind(1) = 0 then indices are initialized in the routine,
! if on input ind(1) != 0 then indices are assumed to have been
! initialized before entering the routine and these
! indices are carried around during the sorting process
!
! no work space needed !
! free us from machine-dependent sorting-routines !
!
! adapted from Numerical Recipes pg. 329 (new edition)
!
USE fft_param
implicit none
!-input/output variables
integer, intent(in) :: n
integer, intent(inout) :: ind (*)
real(DP), intent(inout) :: ra (*)
real(DP), intent(in) :: eps
!-local variables
integer :: i, ir, j, l, iind
real(DP) :: rra
! initialize index array
if (ind (1) .eq.0) then
do i = 1, n
ind (i) = i
enddo
endif
! nothing to order
if (n.lt.2) return
! initialize indices for hiring and retirement-promotion phase
l = n / 2 + 1
ir = n
sorting: do
! still in hiring phase
if ( l .gt. 1 ) then
l = l - 1
rra = ra (l)
iind = ind (l)
! in retirement-promotion phase.
else
! clear a space at the end of the array
rra = ra (ir)
!
iind = ind (ir)
! retire the top of the heap into it
ra (ir) = ra (1)
!
ind (ir) = ind (1)
! decrease the size of the corporation
ir = ir - 1
! done with the last promotion
if ( ir .eq. 1 ) then
! the least competent worker at all !
ra (1) = rra
!
ind (1) = iind
exit sorting
endif
endif
! wheter in hiring or promotion phase, we
i = l
! set up to place rra in its proper level
j = l + l
!
do while ( j .le. ir )
if ( j .lt. ir ) then
! compare to better underling
if ( abs(ra(j)-ra(j+1)).ge.eps ) then
if (ra(j).lt.ra(j+1)) j = j + 1
else
! this means ra(j) == ra(j+1) within tolerance
if (ind (j) .lt.ind (j + 1) ) j = j + 1
endif
endif
! demote rra
if ( abs(rra - ra(j)).ge.eps ) then
if (rra.lt.ra(j)) then
ra (i) = ra (j)
ind (i) = ind (j)
i = j
j = j + j
else
! set j to terminate do-while loop
j = ir + 1
end if
else
!this means rra == ra(j) within tolerance
! demote rra
if (iind.lt.ind (j) ) then
ra (i) = ra (j)
ind (i) = ind (j)
i = j
j = j + j
else
! set j to terminate do-while loop
j = ir + 1
endif
end if
enddo
ra(i) = rra
ind(i) = iind
end do sorting
!
end subroutine hpsort_eps
subroutine prepare_psi_tg(ibnd, nbnd, ngms, psi, tg_psi, nls, nlsm, dffts, gamma_only)
USE fft_param
USE fft_types
USE fft_helper_subroutines
implicit none
integer, intent(in) :: ibnd, nbnd, ngms
TYPE(fft_type_descriptor), intent(in) :: dffts
complex(DP) :: tg_psi(dffts%nnr_tg)
complex(DP) :: psi(ngms, nbnd)
integer, intent(in) :: nls(ngms), nlsm(ngms)
logical, intent(in) :: gamma_only
integer ioff, idx, j, ntgrp, right_nnr
!
tg_psi(:) = ( 0.D0, 0.D0 )
ioff = 0
!
CALL tg_get_nnr( dffts, right_nnr )
ntgrp = fftx_ntgrp(dffts)
!
IF (gamma_only) THEN
DO idx = 1, 2*ntgrp, 2
!
! ... 2*ntgrp ffts at the same time
!
IF( idx + ibnd - 1 < nbnd ) THEN
DO j = 1, ngms
tg_psi(nls (j)+ioff)= psi(j,idx+ibnd-1)+&
(0.0d0,1.d0) * psi(j,idx+ibnd)
tg_psi(nlsm(j)+ioff)=CONJG(psi(j,idx+ibnd-1) -&
(0.0d0,1.d0) * psi(j,idx+ibnd) )
END DO
ELSE IF( idx + ibnd - 1 == nbnd ) THEN
DO j = 1, ngms
tg_psi(nls (j)+ioff)= psi(j,idx+ibnd-1)
tg_psi(nlsm(j)+ioff)=CONJG( psi(j,idx+ibnd-1) )
END DO
END IF
ioff = ioff + right_nnr
END DO
ELSE
!
DO idx = 1, ntgrp
IF( idx + ibnd - 1 <= nbnd ) THEN
!$omp parallel do
DO j = 1, ngms
! here we forget about igk
tg_psi(nls (j+ioff)) = psi(j,idx+ibnd-1)
ENDDO
!$omp end parallel do
ENDIF
ioff = ioff + right_nnr
ENDDO
END IF
end subroutine prepare_psi_tg
subroutine prepare_psi( ibnd, nbnd, ngms, psi, psic, nls, nlsm, dffts, gamma_only)
USE fft_param
USE fft_types
USE fft_helper_subroutines
implicit none
integer, intent(in) :: ibnd, nbnd, ngms
TYPE(fft_type_descriptor), intent(in) :: dffts
complex(DP) :: psic( dffts%nnr )
complex(DP) :: psi( ngms, nbnd )
integer, intent(in) :: nls(ngms), nlsm(ngms)
logical, intent(in) :: gamma_only
integer :: j
psic(:) = (0.d0, 0.d0)
IF (gamma_only) THEN
IF (ibnd < nbnd) THEN
! two ffts at the same time
DO j = 1, ngms
psic(nls (j))= psi(j,ibnd) + (0.0d0,1.d0)*psi(j,ibnd+1)
psic(nlsm(j))=conjg(psi(j,ibnd) - (0.0d0,1.d0)*psi(j,ibnd+1))
ENDDO
ELSE
DO j = 1, ngms
psic (nls (j)) = psi(j, ibnd)
psic (nlsm(j)) = conjg(psi(j, ibnd))
ENDDO
ENDIF
ELSE
DO j = 1, ngms
! here we forget about igk
psic (nls (j)) = psi(j, ibnd)
END DO
END IF
end subroutine prepare_psi
subroutine accumulate_hpsi( ibnd, nbnd, ngms, hpsi, psic, nls, nlsm, dffts, gamma_only)
USE fft_types
USE fft_param
USE fft_helper_subroutines
implicit none
integer, intent(in) :: ibnd, nbnd, ngms
integer, intent(in) :: nls(ngms), nlsm(ngms)
TYPE(fft_type_descriptor) :: dffts
complex(DP) :: psic( dffts%nnr )
complex(DP) :: hpsi( ngms, nbnd )
logical, intent(in) :: gamma_only
integer j
complex(DP) :: fp, fm
!
!
! addition to the total product
!
IF (gamma_only) THEN
IF (ibnd < nbnd) THEN
! two ffts at the same time
DO j = 1, ngms
fp = (psic (nls(j)) + psic (nlsm(j)))*0.5d0
fm = (psic (nls(j)) - psic (nlsm(j)))*0.5d0
hpsi (j, ibnd) = hpsi (j, ibnd) + &
cmplx( dble(fp), aimag(fm),kind=DP)
hpsi (j, ibnd+1) = hpsi (j, ibnd+1) + &
cmplx(aimag(fp),- dble(fm),kind=DP)
ENDDO
ELSE
DO j = 1, ngms
hpsi (j, ibnd) = hpsi (j, ibnd) + psic (nls(j))
ENDDO
ENDIF
ELSE
DO j = 1, ngms ! here we forget about igk_k
hpsi (j, ibnd) = hpsi (j, ibnd) + psic (nls(j))
ENDDO
END IF
!
end subroutine accumulate_hpsi
subroutine accumulate_hpsi_tg( ibnd, nbnd, ngms, hpsi, tg_psic, nls, nlsm, dffts, gamma_only)
USE fft_types
USE fft_param
USE fft_helper_subroutines
implicit none
integer, intent(in) :: ibnd, nbnd, ngms
integer, intent(in) :: nls(ngms), nlsm(ngms)
TYPE(fft_type_descriptor) :: dffts
complex(DP) :: tg_psic( dffts%nnr_tg )
complex(DP) :: hpsi( ngms, nbnd )
logical, intent(in) :: gamma_only
integer ioff, idx, j, right_inc
complex(DP) :: fp, fm
!
! addition to the total product
!
ioff = 0
!
CALL tg_get_recip_inc( dffts, right_inc )
!
IF (gamma_only) THEN
DO idx = 1, 2*fftx_ntgrp(dffts), 2
!
IF( idx + ibnd - 1 < nbnd ) THEN
DO j = 1, ngms
fp= ( tg_psic( nls(j) + ioff ) + &
tg_psic( nlsm(j) + ioff ) ) * 0.5d0
fm= ( tg_psic( nls(j) + ioff ) - &
tg_psic( nlsm(j) + ioff ) ) * 0.5d0
hpsi (j, ibnd+idx-1) = hpsi (j, ibnd+idx-1) + &
cmplx( dble(fp), aimag(fm),kind=DP)
hpsi (j, ibnd+idx ) = hpsi (j, ibnd+idx ) + &
cmplx(aimag(fp),- dble(fm),kind=DP)
ENDDO
ELSEIF( idx + ibnd - 1 == nbnd ) THEN
DO j = 1, ngms
hpsi (j, ibnd+idx-1) = hpsi (j, ibnd+idx-1) + &
tg_psic( nls(j) + ioff )
ENDDO
ENDIF
!
ioff = ioff + right_inc
!
ENDDO
ELSE
DO idx = 1, fftx_ntgrp(dffts)
!
IF( idx + ibnd - 1 <= nbnd ) THEN
!$omp parallel do
DO j = 1, ngms
hpsi (j, ibnd+idx-1) = hpsi (j, ibnd+idx-1) + &
tg_psic( nls(j) + ioff )
! we forgot about igk above
ENDDO
!$omp end parallel do
ENDIF
!
ioff = ioff + right_inc
!
ENDDO
END IF
!
end subroutine accumulate_hpsi_tg