mirror of https://gitlab.com/QEF/q-e.git
1387 lines
42 KiB
Fortran
1387 lines
42 KiB
Fortran
!
|
|
! Copyright (C) Quantum ESPRESSO group
|
|
!
|
|
! This file is distributed under the terms of the
|
|
! GNU General Public License. See the file `License'
|
|
! in the root directory of the present distribution,
|
|
! or http://www.gnu.org/copyleft/gpl.txt .
|
|
!
|
|
! by P. Bonfa', F. Affinito and C. Cavazzoni, Cineca
|
|
! & S. de Gironcoli, SISSA
|
|
|
|
module timers
|
|
save
|
|
LOGICAL :: ignore_time = .true.
|
|
REAL*8 :: times(20) = 0.d0
|
|
end module
|
|
|
|
program test
|
|
!! This mini-app provides a tool for testing and benchmarking the FFT drivers
|
|
!! contained in the FFTXlib.
|
|
!!
|
|
!! The mini-app mimics the workload of vloc_psi (a charge-density transformation
|
|
!! from complex-to-real space and back and contribution to h).
|
|
!!
|
|
!! To compile the test program, once you have properly edit the make.sys file
|
|
!! included in the FFTXlib and type:
|
|
!!
|
|
!! make TEST
|
|
!!
|
|
!! N.B.: do not run the make command alone, otherwise the FFT times will
|
|
!! not be present in the final summary.
|
|
!!
|
|
!! Then you can run your FFT tests using command like:
|
|
!!
|
|
!! mpirun -np 4 ./fft_test.x -ecutwfc 80 -alat 20 -nbnd 128 -ntg 4 -gamma .true.
|
|
!!
|
|
!! or, in case of serial build
|
|
!!
|
|
!! ./fft_test.x -ecutwfc 80 -alat 20 -nbnd 128 -ntg 4
|
|
!!
|
|
!! Command line arguments:
|
|
!!
|
|
!!-ecutwfc Plane wave energy cut off
|
|
!!
|
|
!!-alat Lattice parameter
|
|
!!
|
|
!!-nbnd Number of bands (fft cycles)
|
|
!!
|
|
!!-ntg Number of task groups
|
|
!!
|
|
!!-gamma Enables gamma point trick. Should be about 2 times faster.
|
|
!!
|
|
!! Timings of different stages of execution are provided at the end of the
|
|
!! run.
|
|
!! In the present version, a preliminar implementation with non-blocking MPI
|
|
!! calls as been implemented. This version requires the precompilation flags
|
|
!! -D__NON_BLOCKING_SCATTER
|
|
!!
|
|
USE fft_types
|
|
USE stick_base
|
|
USE fft_parallel
|
|
USE fft_support
|
|
USE fft_helper_subroutines
|
|
USE fft_interfaces, ONLY:fwfft, invfft
|
|
USE timers
|
|
IMPLICIT NONE
|
|
!
|
|
TYPE(fft_type_descriptor) :: dfftp, dffts, dfft3d
|
|
!
|
|
TYPE(sticks_map) :: smap
|
|
INTEGER :: nx = 128
|
|
!! grid points along x (modified after)
|
|
INTEGER :: ny = 128
|
|
!! grid points along y (modified after)
|
|
INTEGER :: nz = 256
|
|
!! grid points along z (modified after)
|
|
!
|
|
INTEGER :: mype, npes, comm, root
|
|
!! MPI handles
|
|
INTEGER :: ntgs
|
|
!! number of taskgroups
|
|
INTEGER :: nbnd
|
|
!! number of bands
|
|
LOGICAL :: iope
|
|
!! I/O process
|
|
INTEGER :: ierr, i, j, ncount, ib
|
|
INTEGER :: incr=1, right_nr3
|
|
INTEGER :: ngw_, ngm_, ngs_
|
|
REAL*8 :: gcutm, gkcut, gcutms
|
|
REAL*8 :: ecutm, ecutw, ecutms
|
|
REAL*8 :: ecutrho
|
|
!! cut-off for density
|
|
REAL*8 :: ecutwfc
|
|
!! cut-off for the wave-function
|
|
REAL*8 :: tpiba, alat, alat_in
|
|
!! lattice parameters
|
|
REAL*8 :: time(100)
|
|
REAL*8 :: my_time(100)
|
|
REAL*8 :: time_min(100)
|
|
REAL*8 :: time_max(100)
|
|
REAL*8 :: time_avg(100)
|
|
REAL*8 :: wall
|
|
REAL*8 :: wall_avg
|
|
!
|
|
LOGICAL :: gamma_only = .false.
|
|
LOGICAL :: use_tg
|
|
!! if calculations require only gamma point
|
|
REAL*8 :: at(3, 3), bg(3, 3)
|
|
REAL(DP), PARAMETER :: pi = 3.14159265358979323846_DP
|
|
!
|
|
COMPLEX(DP), ALLOCATABLE :: tg_psic(:)
|
|
COMPLEX(DP), ALLOCATABLE :: psic(:)
|
|
COMPLEX(DP), ALLOCATABLE :: psi(:, :)
|
|
!! fake argument returned by the FFT
|
|
REAL(DP), ALLOCATABLE :: v(:)
|
|
REAL(DP), ALLOCATABLE :: tg_v(:)
|
|
COMPLEX(DP), ALLOCATABLE :: hpsi(:, :)
|
|
!! array representing the potential
|
|
INTEGER, ALLOCATABLE :: nls(:), nlsm(:)
|
|
INTEGER :: ngms, ngsx, ngms_g
|
|
INTEGER, ALLOCATABLE :: mill(:, :), nl(:), nlm(:), ig_l2g(:)
|
|
REAL(DP), ALLOCATABLE :: g(:, :), gg(:)
|
|
INTEGER :: ngm, ngmx, ngm_g, gstart
|
|
!
|
|
!
|
|
integer :: nargs
|
|
CHARACTER(LEN=80) :: arg
|
|
!
|
|
#if defined(_OPENMP)
|
|
INTEGER :: PROVIDED
|
|
#endif
|
|
!
|
|
! ........
|
|
!
|
|
! default parameter (32 water molecules)
|
|
!
|
|
ecutwfc = 80.0d0
|
|
ecutrho = 0.d0
|
|
alat_in = 18.65
|
|
ntgs = 1
|
|
nbnd = 1
|
|
!
|
|
nargs = command_argument_count()
|
|
do i = 1, nargs - 1
|
|
CALL get_command_argument(i, arg)
|
|
IF (TRIM(arg) == '-ecutrho') THEN
|
|
CALL get_command_argument(i + 1, arg)
|
|
READ (arg, *) ecutrho
|
|
END IF
|
|
IF (TRIM(arg) == '-ecutwfc') THEN
|
|
CALL get_command_argument(i + 1, arg)
|
|
READ (arg, *) ecutwfc
|
|
END IF
|
|
IF (TRIM(arg) == '-alat') THEN
|
|
CALL get_command_argument(i + 1, arg)
|
|
READ (arg, *) alat_in
|
|
END IF
|
|
IF (TRIM(arg) == '-ntg') THEN
|
|
CALL get_command_argument(i + 1, arg)
|
|
READ (arg, *) ntgs
|
|
END IF
|
|
IF (TRIM(arg) == '-nbnd') THEN
|
|
CALL get_command_argument(i + 1, arg)
|
|
READ (arg, *) nbnd
|
|
END IF
|
|
IF (TRIM(arg) == '-gamma') THEN
|
|
CALL get_command_argument(i + 1, arg)
|
|
READ (arg, *) gamma_only
|
|
END IF
|
|
end do
|
|
if (ecutrho == 0.d0) ecutrho = 4.0d0*ecutwfc
|
|
|
|
#if defined(__MPI)
|
|
|
|
#if defined(_OPENMP)
|
|
CALL MPI_Init_thread(MPI_THREAD_FUNNELED, PROVIDED, ierr)
|
|
#else
|
|
CALL MPI_Init(ierr)
|
|
#endif
|
|
CALL mpi_comm_rank(MPI_COMM_WORLD, mype, ierr)
|
|
CALL mpi_comm_size(MPI_COMM_WORLD, npes, ierr)
|
|
comm = MPI_COMM_WORLD
|
|
root = 0
|
|
IF (mype == root) THEN
|
|
iope = .true.
|
|
ELSE
|
|
iope = .false.
|
|
ENDIF
|
|
|
|
#else
|
|
|
|
mype = 0
|
|
npes = 1
|
|
comm = 0
|
|
ntgs = 1
|
|
root = 0
|
|
iope = .true.
|
|
|
|
#endif
|
|
!
|
|
! Broadcast input parameter first
|
|
!
|
|
CALL MPI_BCAST(ecutrho, 1, MPI_DOUBLE_PRECISION, 0, MPI_COMM_WORLD, ierr)
|
|
CALL MPI_BCAST(ecutwfc, 1, MPI_DOUBLE_PRECISION, 0, MPI_COMM_WORLD, ierr)
|
|
CALL MPI_BCAST(alat_in, 1, MPI_DOUBLE_PRECISION, 0, MPI_COMM_WORLD, ierr)
|
|
CALL MPI_BCAST(ntgs, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr)
|
|
CALL MPI_BCAST(nbnd, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr)
|
|
!
|
|
!
|
|
! -------- INITIALIZE DIMENSIONS AND DESCRIPTORS
|
|
!
|
|
!
|
|
ecutw = ecutwfc
|
|
! dual
|
|
ecutm = ecutrho
|
|
ecutms = ecutrho
|
|
!
|
|
at(1, :) = (/0.5d0, 1.0d0, 0.0d0/)
|
|
at(2, :) = (/0.5d0, 0.0d0, 0.5d0/)
|
|
at(3, :) = (/0.0d0, 0.5d0, 1.5d0/)
|
|
!
|
|
at = at*alat_in
|
|
!
|
|
alat = SQRT(at(1, 1)**2 + at(2, 1)**2 + at(3, 1)**2)
|
|
!
|
|
tpiba = 2.0d0*pi/alat
|
|
!
|
|
gcutm = ecutm / tpiba**2 ! potential cut-off
|
|
gcutms= ecutms / tpiba**2 ! smooth mesh cut-off
|
|
gkcut = ecutw / tpiba**2 ! wave function cut-off
|
|
!
|
|
if( mype == 0 ) then
|
|
|
|
write (*, *) '+-----------------------------------+'
|
|
write (*, *) '| QE FFT |'
|
|
write (*, *) '| testing & timing |'
|
|
write (*, *) '| by Carlo Cavazzoni |'
|
|
write (*, *) '+-----------------------------------+'
|
|
write (*, *)
|
|
write (*, *) 'alat = ', alat
|
|
write (*, *) 'Ecutwfc = ', ecutw
|
|
write (*, *) 'Ecutrho = ', ecutm
|
|
write (*, *) 'Ecuts = ', ecutms
|
|
write (*, *) 'Gcutrho = ', SQRT(gcutm)
|
|
write (*, *) 'Gcuts = ', SQRT(gcutms)
|
|
write (*, *) 'Gcutwfc = ', SQRT(gkcut)
|
|
write (*, *) 'Num bands = ', nbnd
|
|
write (*, *) 'Num procs = ', npes
|
|
write (*, *) 'Num Task Group = ', ntgs
|
|
write (*, *) 'Gamma trick = ', gamma_only
|
|
end if
|
|
!
|
|
at = at/alat
|
|
!
|
|
call recips(at(1, 1), at(1, 2), at(1, 3), bg(1, 1), bg(1, 2), bg(1, 3))
|
|
!
|
|
nx = 2*int(sqrt(gcutm)*sqrt(at(1, 1)**2 + at(2, 1)**2 + at(3, 1)**2)) + 1
|
|
ny = 2*int(sqrt(gcutm)*sqrt(at(1, 2)**2 + at(2, 2)**2 + at(3, 2)**2)) + 1
|
|
nz = 2*int(sqrt(gcutm)*sqrt(at(1, 3)**2 + at(2, 3)**2 + at(3, 3)**2)) + 1
|
|
!
|
|
if (mype == 0) then
|
|
write (*, *) 'nx = ', nx, ' ny = ', ny, ' nz = ', nz
|
|
end if
|
|
!
|
|
IF (gamma_only) incr = 2
|
|
dffts%have_task_groups = (ntgs > 1)
|
|
use_tg = dffts%have_task_groups
|
|
!
|
|
CALL fft_type_init(dffts, smap, "wave", gamma_only, .true., comm, at, bg, gkcut, gcutms/gkcut, nyfft=ntgs)
|
|
CALL fft_type_init(dfftp, smap, "rho", gamma_only, .true., comm, at, bg, gcutm, 4.d0, nyfft=ntgs)
|
|
!
|
|
if (mype == 0) then
|
|
write (*, *) 'dffts: nr1 = ', dffts%nr1, ' nr2 = ', dffts%nr2, ' nr3 = ', dffts%nr3
|
|
write (*, *) ' nr1x= ', dffts%nr1x, ' nr2x= ', dffts%nr2x, ' nr3x= ', dffts%nr3x
|
|
end if
|
|
!
|
|
ngw_ = dffts%nwl(dffts%mype + 1)
|
|
ngs_ = dffts%ngl(dffts%mype + 1)
|
|
ngm_ = dfftp%ngl(dfftp%mype + 1)
|
|
!
|
|
IF (gamma_only) THEN
|
|
ngw_ = (ngw_ + 1)/2
|
|
ngs_ = (ngs_ + 1)/2
|
|
ngm_ = (ngm_ + 1)/2
|
|
END IF
|
|
!
|
|
ngms = ngs_
|
|
CALL MPI_ALLREDUCE(ngms, ngsx, 1, MPI_INTEGER, MPI_MAX, MPI_COMM_WORLD, ierr)
|
|
CALL MPI_ALLREDUCE(ngms, ngms_g, 1, MPI_INTEGER, MPI_SUM, MPI_COMM_WORLD, ierr)
|
|
ngm = ngm_
|
|
CALL MPI_ALLREDUCE(ngm, ngmx, 1, MPI_INTEGER, MPI_MAX, MPI_COMM_WORLD, ierr)
|
|
CALL MPI_ALLREDUCE(ngm, ngm_g, 1, MPI_INTEGER, MPI_SUM, MPI_COMM_WORLD, ierr)
|
|
!
|
|
! -------- ALLOCATE
|
|
!
|
|
ALLOCATE (psic(dffts%nnr))
|
|
ALLOCATE (psi(ngms, nbnd))
|
|
ALLOCATE (hpsi(ngms, nbnd))
|
|
ALLOCATE (v(dffts%nnr))
|
|
ALLOCATE (nls(ngms))
|
|
ALLOCATE (nlsm(ngms))
|
|
ALLOCATE (nl(ngm))
|
|
ALLOCATE (nlm(ngm))
|
|
ALLOCATE (mill(3, ngm))
|
|
ALLOCATE (g(3, ngm))
|
|
ALLOCATE (gg(ngm))
|
|
ALLOCATE (ig_l2g(ngm))
|
|
!
|
|
! -------- GENERATE G-VECTORS
|
|
!
|
|
call ggen(gamma_only, at, bg, .true., ngm, ngms, ngm_g, ngms_g, mill, &
|
|
& nl, nls, nlm, nlsm, gg, g, ig_l2g, gstart, gcutm, gcutms, dfftp, dffts)
|
|
!
|
|
! -------- RESET TIMERS
|
|
!
|
|
time = 0.0d0
|
|
my_time = 0.0d0
|
|
time_min = 0.0d0
|
|
time_max = 0.0d0
|
|
time_avg = 0.0d0
|
|
!
|
|
! Test FFT for wave functions - First calls may be biased by MPI and FFT initialization
|
|
!
|
|
CALL MPI_BARRIER(MPI_COMM_WORLD, ierr)
|
|
!
|
|
IF (use_tg) THEN
|
|
ALLOCATE (tg_psic(dffts%nnr_tg))
|
|
CALL invfft('tgWave', tg_psic, dffts)
|
|
DEALLOCATE (tg_psic)
|
|
ELSE
|
|
CALL invfft('Wave', psic, dffts)
|
|
END IF
|
|
!
|
|
IF (use_tg) THEN
|
|
ALLOCATE (tg_psic(dffts%nnr_tg))
|
|
CALL fwfft('tgWave', tg_psic, dffts)
|
|
DEALLOCATE (tg_psic)
|
|
ELSE
|
|
CALL fwfft('Wave', psic, dffts)
|
|
END IF
|
|
! Now for real,
|
|
!
|
|
! -------- RECORD TIMES
|
|
!
|
|
wall = MPI_WTIME()
|
|
ignore_time = .false.
|
|
!
|
|
! -------- INITIALIZE WAVE FUNCTIONS psi
|
|
!
|
|
psi = 0.0d0
|
|
!
|
|
! -------- INITIALIZE POTENTIAL v
|
|
!
|
|
v = 1.0d0
|
|
!
|
|
IF (use_tg) THEN
|
|
!
|
|
ALLOCATE (tg_v(dffts%nnr_tg))
|
|
ALLOCATE (tg_psic(dffts%nnr_tg))
|
|
!
|
|
CALL tg_gather(dffts, v, tg_v)
|
|
! incr is already set to 2 for gamma_only
|
|
incr = incr*fftx_ntgrp(dffts)
|
|
!
|
|
ENDIF
|
|
|
|
!
|
|
! Execute FFT calls once more and Take time
|
|
!
|
|
ncount = 0
|
|
!
|
|
!
|
|
!
|
|
DO ib = 1, nbnd, incr
|
|
!
|
|
time(1) = MPI_WTIME()
|
|
!
|
|
IF (use_tg) THEN
|
|
!
|
|
call prepare_psi_tg(ib, nbnd, ngms, psi, tg_psic, nls, nlsm, dffts, gamma_only)
|
|
time(2) = MPI_WTIME()
|
|
!
|
|
CALL invfft('tgWave', tg_psic, dffts);
|
|
time(3) = MPI_WTIME()
|
|
!
|
|
CALL tg_get_group_nr3(dffts, right_nr3)
|
|
!
|
|
DO j = 1, dffts%nr1x*dffts%nr2x*right_nr3
|
|
tg_psic(j) = tg_psic(j)*tg_v(j)
|
|
ENDDO
|
|
!
|
|
time(4) = MPI_WTIME()
|
|
!
|
|
CALL fwfft('tgWave', tg_psic, dffts);
|
|
time(5) = MPI_WTIME()
|
|
!
|
|
CALL accumulate_hpsi_tg(ib, nbnd, ngms, hpsi, tg_psic, nls, nlsm, dffts, gamma_only)
|
|
time(6) = MPI_WTIME()
|
|
ELSE
|
|
!
|
|
call prepare_psi(ib, nbnd, ngms, psi, psic, nls, nlsm, dffts, gamma_only)
|
|
time(2) = MPI_WTIME()
|
|
!
|
|
CALL invfft('Wave', psic, dffts); time(3) = MPI_WTIME()
|
|
!
|
|
DO j = 1, dffts%nnr
|
|
psic(j) = psic(j)*v(j)
|
|
ENDDO
|
|
time(4) = MPI_WTIME()
|
|
!
|
|
CALL fwfft('Wave', psic, dffts);
|
|
time(5) = MPI_WTIME()
|
|
!
|
|
CALL accumulate_hpsi(ib, nbnd, ngms, hpsi, psic, nls, nlsm, dffts, gamma_only)
|
|
time(6) = MPI_WTIME()
|
|
!
|
|
ENDIF
|
|
!
|
|
do i = 2, 6
|
|
my_time(i) = my_time(i) + (time(i) - time(i - 1))
|
|
end do
|
|
!
|
|
ncount = ncount + 1
|
|
!
|
|
enddo
|
|
!
|
|
wall = MPI_WTIME() - wall
|
|
|
|
DEALLOCATE (psic)
|
|
DEALLOCATE (hpsi)
|
|
IF (use_tg) THEN
|
|
!
|
|
DEALLOCATE (tg_psic)
|
|
DEALLOCATE (tg_v)
|
|
!
|
|
ENDIF
|
|
|
|
#if defined(__MPI)
|
|
CALL MPI_ALLREDUCE(my_time, time_min, 10, MPI_DOUBLE_PRECISION, MPI_MIN, MPI_COMM_WORLD, ierr)
|
|
CALL MPI_ALLREDUCE(my_time, time_max, 10, MPI_DOUBLE_PRECISION, MPI_MAX, MPI_COMM_WORLD, ierr)
|
|
CALL MPI_ALLREDUCE(my_time, time_avg, 10, MPI_DOUBLE_PRECISION, MPI_SUM, MPI_COMM_WORLD, ierr)
|
|
CALL MPI_ALLREDUCE(wall, wall_avg, 1, MPI_DOUBLE_PRECISION, MPI_SUM, MPI_COMM_WORLD, ierr)
|
|
#else
|
|
time_min = time
|
|
time_max = time
|
|
time_avg = time
|
|
#endif
|
|
|
|
time_avg = time_avg / npes
|
|
wall_avg = wall_avg / npes
|
|
|
|
if( mype == 0 ) then
|
|
|
|
write(*,*) '**** QE 3DFFT Timing ****'
|
|
write(*,*) 'grid size = ', dffts%nr1, dffts%nr2, dffts%nr3
|
|
write(*,*) 'num proc = ', npes
|
|
write(*,*) 'num band = ', nbnd
|
|
write(*,*) 'num task group = ', ntgs
|
|
write(*,*) 'num fft cycles = ', ncount
|
|
|
|
write(*,100)
|
|
write(*,1)
|
|
write(*,100)
|
|
write(*,2) time_min(2), time_max(2), time_avg(2)
|
|
write(*,3) time_min(3), time_max(3), time_avg(3)
|
|
write(*,4) time_min(4), time_max(4), time_avg(4)
|
|
write(*,5) time_min(5), time_max(5), time_avg(5)
|
|
write(*,6) time_min(6), time_max(6), time_avg(6)
|
|
write(*,7) wall
|
|
write(*,100)
|
|
|
|
100 FORMAT(' +--------------------+----------------+-----------------+----------------+' )
|
|
1 FORMAT(' |FFT TEST subroutine | sec. min | sec. max | sec. avg |' )
|
|
2 FORMAT(' |prepare_psi | ', D14.5, ' | ', D14.3, ' | ', D14.3, ' |' )
|
|
3 FORMAT(' |invfft | ', D14.5, ' | ', D14.3, ' | ', D14.3, ' |' )
|
|
4 FORMAT(' |workload | ', D14.5, ' | ', D14.3, ' | ', D14.3 , ' |')
|
|
5 FORMAT(' |fwfft | ', D14.5, ' | ', D14.3, ' | ', D14.3 , ' |')
|
|
6 FORMAT(' |accumulate_hpsi | ', D14.5, ' | ', D14.3, ' | ', D14.3 , ' |')
|
|
7 FORMAT(' |wall time | ', D14.5, ' |')
|
|
|
|
end if
|
|
! now print FFT clocks
|
|
call print_clock(mype, npes, ncount)
|
|
|
|
CALL fft_type_deallocate(dffts)
|
|
CALL fft_type_deallocate(dfftp)
|
|
CALL fft_type_deallocate(dfft3d)
|
|
|
|
#if defined(__MPI)
|
|
CALL mpi_finalize(ierr)
|
|
#endif
|
|
|
|
|
|
contains
|
|
|
|
|
|
|
|
!
|
|
! Copyright (C) 2001 PWSCF group
|
|
! This file is distributed under the terms of the
|
|
! GNU General Public License. See the file `License'
|
|
! in the root directory of the present distribution,
|
|
! or http://www.gnu.org/copyleft/gpl.txt .
|
|
!
|
|
!
|
|
!---------------------------------------------------------------------
|
|
|
|
subroutine recips (a1, a2, a3, b1, b2, b3)
|
|
!---------------------------------------------------------------------
|
|
!
|
|
! This routine generates the reciprocal lattice vectors b1,b2,b3
|
|
! given the real space vectors a1,a2,a3. The b's are units of 2 pi/a.
|
|
!
|
|
! first the input variables
|
|
!
|
|
implicit none
|
|
real(DP) :: a1 (3), a2 (3), a3 (3), b1 (3), b2 (3), b3 (3)
|
|
! input: first direct lattice vector
|
|
! input: second direct lattice vector
|
|
! input: third direct lattice vector
|
|
! output: first reciprocal lattice vector
|
|
! output: second reciprocal lattice vector
|
|
! output: third reciprocal lattice vector
|
|
!
|
|
! then the local variables
|
|
!
|
|
real(DP) :: den, s
|
|
! the denominator
|
|
! the sign of the permutations
|
|
integer :: iperm, i, j, k, l, ipol
|
|
! counter on the permutations
|
|
!\
|
|
! Auxiliary variables
|
|
!/
|
|
!
|
|
! Counter on the polarizations
|
|
!
|
|
! first we compute the denominator
|
|
!
|
|
den = 0
|
|
i = 1
|
|
j = 2
|
|
k = 3
|
|
s = 1.d0
|
|
100 do iperm = 1, 3
|
|
den = den + s * a1 (i) * a2 (j) * a3 (k)
|
|
l = i
|
|
i = j
|
|
j = k
|
|
k = l
|
|
enddo
|
|
i = 2
|
|
j = 1
|
|
k = 3
|
|
s = - s
|
|
if (s.lt.0.d0) goto 100
|
|
!
|
|
! here we compute the reciprocal vectors
|
|
!
|
|
i = 1
|
|
j = 2
|
|
k = 3
|
|
do ipol = 1, 3
|
|
b1 (ipol) = (a2 (j) * a3 (k) - a2 (k) * a3 (j) ) / den
|
|
b2 (ipol) = (a3 (j) * a1 (k) - a3 (k) * a1 (j) ) / den
|
|
b3 (ipol) = (a1 (j) * a2 (k) - a1 (k) * a2 (j) ) / den
|
|
l = i
|
|
i = j
|
|
j = k
|
|
k = l
|
|
enddo
|
|
return
|
|
end subroutine recips
|
|
|
|
|
|
end program test
|
|
|
|
subroutine start_clock(label)
|
|
use timers
|
|
use mpi, ONLY:MPI_WTIME
|
|
implicit none
|
|
character(len=*) :: label
|
|
if (ignore_time) RETURN
|
|
select case (label)
|
|
case ("cft_1z")
|
|
times(1) = times(1) - MPI_WTIME()
|
|
case ("cft_2xy")
|
|
times(2) = times(2) - MPI_WTIME()
|
|
case ("cgather")
|
|
times(3) = times(3) - MPI_WTIME()
|
|
case ("cgather_grid")
|
|
times(4) = times(4) - MPI_WTIME()
|
|
case ("cscatter_grid")
|
|
times(5) = times(5) - MPI_WTIME()
|
|
case ("cscatter_sym")
|
|
times(6) = times(6) - MPI_WTIME()
|
|
case ("fft")
|
|
times(7) = times(7) - MPI_WTIME()
|
|
case ("fft_scatt_tg")
|
|
times(8) = times(8) - MPI_WTIME()
|
|
case ("fft_scatt_xy")
|
|
times(9) = times(9) - MPI_WTIME()
|
|
case ("fft_scatt_yz")
|
|
times(10) = times(10) - MPI_WTIME()
|
|
case ("fftb")
|
|
times(11) = times(11) - MPI_WTIME()
|
|
case ("fftc")
|
|
times(12) = times(12) - MPI_WTIME()
|
|
case ("fftcw")
|
|
times(13) = times(13) - MPI_WTIME()
|
|
case ("ffts")
|
|
times(14) = times(14) - MPI_WTIME()
|
|
case ("fftw")
|
|
times(15) = times(15) - MPI_WTIME()
|
|
case ("rgather_grid")
|
|
times(16) = times(16) - MPI_WTIME()
|
|
case ("rscatter_grid")
|
|
times(17) = times(17) - MPI_WTIME()
|
|
case ("fft_scatter") !alt version compatibility
|
|
times(18) = times(18) - MPI_WTIME()
|
|
case ("ALLTOALL") !alt version compatibility
|
|
times(19) = times(19) - MPI_WTIME()
|
|
case default
|
|
write (*, *) "Error, label not found", label
|
|
end select
|
|
end subroutine
|
|
|
|
subroutine stop_clock(label)
|
|
use timers
|
|
use mpi, ONLY:MPI_WTIME
|
|
implicit none
|
|
character(len=*) :: label
|
|
if (ignore_time) RETURN
|
|
select case (label)
|
|
case ("cft_1z")
|
|
times(1) = times(1) + MPI_WTIME()
|
|
case ("cft_2xy")
|
|
times(2) = times(2) + MPI_WTIME()
|
|
case ("cgather")
|
|
times(3) = times(3) + MPI_WTIME()
|
|
case ("cgather_grid")
|
|
times(4) = times(4) + MPI_WTIME()
|
|
case ("cscatter_grid")
|
|
times(5) = times(5) + MPI_WTIME()
|
|
case ("cscatter_sym")
|
|
times(6) = times(6) + MPI_WTIME()
|
|
case ("fft")
|
|
times(7) = times(7) + MPI_WTIME()
|
|
case ("fft_scatt_tg")
|
|
times(8) = times(8) + MPI_WTIME()
|
|
case ("fft_scatt_xy")
|
|
times(9) = times(9) + MPI_WTIME()
|
|
case ("fft_scatt_yz")
|
|
times(10) = times(10) + MPI_WTIME()
|
|
case ("fftb")
|
|
times(11) = times(11) + MPI_WTIME()
|
|
case ("fftc")
|
|
times(12) = times(12) + MPI_WTIME()
|
|
case ("fftcw")
|
|
times(13) = times(13) + MPI_WTIME()
|
|
case ("ffts")
|
|
times(14) = times(14) + MPI_WTIME()
|
|
case ("fftw")
|
|
times(15) = times(15) + MPI_WTIME()
|
|
case ("rgather_grid")
|
|
times(16) = times(16) + MPI_WTIME()
|
|
case ("rscatter_grid")
|
|
times(17) = times(17) + MPI_WTIME()
|
|
case ("fft_scatter") !alt version compatibility
|
|
times(18) = times(18) + MPI_WTIME()
|
|
case ("ALLTOALL") !alt version compatibility
|
|
times(19) = times(19) + MPI_WTIME()
|
|
case default
|
|
write (*, *) "Error, label not found", label
|
|
end select
|
|
end subroutine
|
|
!
|
|
subroutine print_clock(mype, npes, ncount)
|
|
use timers
|
|
use mpi
|
|
implicit none
|
|
integer, intent(in) :: mype, npes, ncount
|
|
REAL*8 :: time_min(20)
|
|
REAL*8 :: time_max(20)
|
|
REAL*8 :: time_avg(20)
|
|
integer :: ierr
|
|
|
|
#if defined(__MPI)
|
|
CALL MPI_ALLREDUCE(times, time_min, 20, MPI_DOUBLE_PRECISION, MPI_MIN, MPI_COMM_WORLD, ierr)
|
|
CALL MPI_ALLREDUCE(times, time_max, 20, MPI_DOUBLE_PRECISION, MPI_MAX, MPI_COMM_WORLD, ierr)
|
|
CALL MPI_ALLREDUCE(times, time_avg, 20, MPI_DOUBLE_PRECISION, MPI_SUM, MPI_COMM_WORLD, ierr)
|
|
time_avg = time_avg/npes
|
|
#else
|
|
time_min(:) = times(:)
|
|
time_max(:) = times(:)
|
|
time_avg(:) = times(:)
|
|
#endif
|
|
|
|
if (mype == 0) then
|
|
write (*, 10100)
|
|
write (*, 101)
|
|
write (*, 10100)
|
|
if (times(1) > 0.d0) write (*, 102) time_min(1), time_max(1), time_avg(1)
|
|
if (times(2) > 0.d0) write (*, 103) time_min(2), time_max(2), time_avg(2)
|
|
if (times(3) > 0.d0) write (*, 104) time_min(3), time_max(3), time_avg(3)
|
|
if (times(4) > 0.d0) write (*, 105) time_min(4), time_max(4), time_avg(4)
|
|
if (times(5) > 0.d0) write (*, 106) time_min(5), time_max(5), time_avg(5)
|
|
if (times(6) > 0.d0) write (*, 107) time_min(6), time_max(6), time_avg(6)
|
|
if (times(7) > 0.d0) write (*, 108) time_min(7), time_max(7), time_avg(7)
|
|
if (times(8) > 0.d0) write (*, 109) time_min(8), time_max(8), time_avg(8)
|
|
if (times(9) > 0.d0) write (*, 1010) time_min(9), time_max(9), time_avg(9)
|
|
if (times(10) > 0.d0) write (*, 1011) time_min(10), time_max(10), time_avg(10)
|
|
if (times(11) > 0.d0) write (*, 1012) time_min(11), time_max(11), time_avg(11)
|
|
if (times(12) > 0.d0) write (*, 1013) time_min(12), time_max(12), time_avg(12)
|
|
if (times(13) > 0.d0) write (*, 1014) time_min(13), time_max(13), time_avg(13)
|
|
if (times(14) > 0.d0) write (*, 1015) time_min(14), time_max(14), time_avg(14)
|
|
if (times(15) > 0.d0) write (*, 1016) time_min(15), time_max(15), time_avg(15)
|
|
if (times(16) > 0.d0) write (*, 1017) time_min(16), time_max(16), time_avg(16)
|
|
if (times(17) > 0.d0) write (*, 1018) time_min(17), time_max(17), time_avg(17)
|
|
if (times(18) > 0.d0) write (*, 1019) time_min(18), time_max(18), time_avg(18)
|
|
if (times(19) > 0.d0) write (*, 1020) time_min(19), time_max(19), time_avg(19)
|
|
write (*, 10100)
|
|
end if
|
|
10100 FORMAT(' +--------------------+----------------+-----------------+----------------+' )
|
|
101 FORMAT(' |FFT subroutine | sec. min | sec. max | sec. avg |' )
|
|
102 FORMAT(' |cft_1z | ', D14.5, ' | ', D14.3, ' | ', D14.3, ' |' )
|
|
103 FORMAT(' |cft_2xy | ', D14.5, ' | ', D14.3, ' | ', D14.3, ' |' )
|
|
104 FORMAT(' |cgather | ', D14.5, ' | ', D14.3, ' | ', D14.3 , ' |')
|
|
105 FORMAT(' |cgather_grid | ', D14.5, ' | ', D14.3, ' | ', D14.3 , ' |')
|
|
106 FORMAT(' |cscatter_grid | ', D14.5, ' | ', D14.3, ' | ', D14.3 , ' |')
|
|
107 FORMAT(' |cscatter_sym | ', D14.5, ' | ', D14.3, ' | ', D14.3 , ' |')
|
|
108 FORMAT(' |fft | ', D14.5, ' | ', D14.3, ' | ', D14.3 , ' |')
|
|
109 FORMAT(' |fft_scatt_tg | ', D14.5, ' | ', D14.3, ' | ', D14.3 , ' |')
|
|
1010 FORMAT(' |fft_scatt_xy | ', D14.5, ' | ', D14.3, ' | ', D14.3 , ' |')
|
|
1011 FORMAT(' |fft_scatt_yz | ', D14.5, ' | ', D14.3, ' | ', D14.3 , ' |')
|
|
1012 FORMAT(' |fftb | ', D14.5, ' | ', D14.3, ' | ', D14.3 , ' |')
|
|
1013 FORMAT(' |fftc | ', D14.5, ' | ', D14.3, ' | ', D14.3 , ' |')
|
|
1014 FORMAT(' |fftcw | ', D14.5, ' | ', D14.3, ' | ', D14.3 , ' |')
|
|
1015 FORMAT(' |ffts | ', D14.5, ' | ', D14.3, ' | ', D14.3 , ' |')
|
|
1016 FORMAT(' |fftw | ', D14.5, ' | ', D14.3, ' | ', D14.3 , ' |')
|
|
1017 FORMAT(' |rgather_grid | ', D14.5, ' | ', D14.3, ' | ', D14.3 , ' |')
|
|
1018 FORMAT(' |rscatter_grid | ', D14.5, ' | ', D14.3, ' | ', D14.3 , ' |')
|
|
1019 FORMAT(' |fft_scatter | ', D14.5, ' | ', D14.3, ' | ', D14.3 , ' |')
|
|
1020 FORMAT(' |ALLTOALL | ', D14.5, ' | ', D14.3, ' | ', D14.3 , ' |')
|
|
|
|
end subroutine
|
|
!
|
|
!-----------------------------------------------------------------------
|
|
!-----------------------------------------------------------------------
|
|
SUBROUTINE ggen ( gamma_only, at, bg, no_global_sort, ngm, ngms, ngm_g, ngms_g, mill, &
|
|
& nl, nls, nlm, nlsm, gg, g, ig_l2g, gstart, gcutm, gcutms, dfftp, dffts )
|
|
!----------------------------------------------------------------------
|
|
!
|
|
! This routine generates all the reciprocal lattice vectors
|
|
! contained in the sphere of radius gcutm. Furthermore it
|
|
! computes the indices nl which give the correspondence
|
|
! between the fft mesh points and the array of g vectors.
|
|
!
|
|
USE fft_types
|
|
USE fft_param
|
|
!
|
|
IMPLICIT NONE
|
|
!
|
|
REAL(DP), PARAMETER :: eps8 = 1.0E-8_DP
|
|
!
|
|
LOGICAL, INTENT(IN) :: gamma_only
|
|
REAL(DP), INTENT(IN) :: at(3,3), bg(3,3), gcutm, gcutms
|
|
LOGICAL, INTENT(IN) :: no_global_sort
|
|
! if no_global_sort is present (and it is true) G vectors are sorted only
|
|
! locally and not globally. In this case no global array needs to be
|
|
! allocated and sorted: saves memory and a lot of time for large systems.
|
|
INTEGER :: ngm, ngms, ngm_g, ngms_g, gstart
|
|
INTEGER :: mill(3,ngm), nlm(ngm), nlsm(ngms), ig_l2g(ngm_g)
|
|
INTEGER :: nl(ngm), nls(ngms)
|
|
REAL(DP) :: gg(ngm), g(3,ngm)
|
|
TYPE(fft_type_descriptor) :: dfftp, dffts
|
|
!
|
|
! here a few local variables
|
|
!
|
|
REAL(DP) :: t (3), tt
|
|
INTEGER :: ngm_save, ngms_save, n1, n2, n3, n1s, n2s, n3s, ngm_offset, ngm_max, ngms_max
|
|
INTEGER :: ierr
|
|
!
|
|
REAL(DP), ALLOCATABLE :: g2sort_g(:)
|
|
! array containing all g vectors, on all processors: replicated data
|
|
! when no_global_sort is present (and it is true) only g vectors for the current processor are stored
|
|
INTEGER, ALLOCATABLE :: mill_g(:,:), mill_unsorted(:,:)
|
|
! array containing all g vectors generators, on all processors: replicated data
|
|
! when no_global_sort is present (and it is true) only g vectors for the current processor are stored
|
|
INTEGER, ALLOCATABLE :: igsrt(:)
|
|
!
|
|
INTEGER :: m1, m2, mc
|
|
INTEGER :: ni, nj, nk, i, j, k, ng
|
|
INTEGER :: mype, npe, comm
|
|
LOGICAL :: global_sort
|
|
INTEGER, ALLOCATABLE :: ngmpe(:)
|
|
!
|
|
global_sort = .NOT. no_global_sort
|
|
!
|
|
comm = dfftp%comm
|
|
mype = dfftp%mype
|
|
npe = dfftp%nproc
|
|
!
|
|
IF( .NOT. global_sort ) THEN
|
|
ALLOCATE( ngmpe( npe ) )
|
|
ngmpe = 0
|
|
ngm_max = ngm
|
|
ngms_max = ngms
|
|
ELSE
|
|
ngm_max = ngm_g
|
|
ngms_max = ngms_g
|
|
END IF
|
|
!
|
|
! save current value of ngm and ngms
|
|
!
|
|
ngm_save = ngm
|
|
ngms_save = ngms
|
|
!
|
|
ngm = 0
|
|
ngms = 0
|
|
!
|
|
! counters
|
|
!
|
|
! set the total number of fft mesh points and and initial value of gg
|
|
! The choice of gcutm is due to the fact that we have to order the
|
|
! vectors after computing them.
|
|
!
|
|
gg(:) = gcutm + 1.d0
|
|
!
|
|
! and computes all the g vectors inside a sphere
|
|
!
|
|
ALLOCATE( mill_g( 3, ngm_max ),mill_unsorted( 3, ngm_max ) )
|
|
ALLOCATE( igsrt( ngm_max ) )
|
|
ALLOCATE( g2sort_g( ngm_max ) )
|
|
!
|
|
g2sort_g(:) = 1.0d20
|
|
!
|
|
! max miller indices (same convention as in module stick_set)
|
|
!
|
|
ni = (dfftp%nr1-1)/2
|
|
nj = (dfftp%nr2-1)/2
|
|
nk = (dfftp%nr3-1)/2
|
|
!
|
|
iloop: DO i = -ni, ni
|
|
!
|
|
! gamma-only: exclude space with x < 0
|
|
!
|
|
IF ( gamma_only .and. i < 0) CYCLE iloop
|
|
jloop: DO j = -nj, nj
|
|
!
|
|
! gamma-only: exclude plane with x = 0, y < 0
|
|
!
|
|
IF ( gamma_only .and. i == 0 .and. j < 0) CYCLE jloop
|
|
|
|
IF( .NOT. global_sort ) THEN
|
|
m1 = mod (i, dfftp%nr1) + 1
|
|
IF (m1 < 1) m1 = m1 + dfftp%nr1
|
|
m2 = mod (j, dfftp%nr2) + 1
|
|
IF (m2 < 1) m2 = m2 + dfftp%nr2
|
|
mc = m1 + (m2 - 1) * dfftp%nr1x
|
|
IF ( dfftp%isind ( mc ) == 0) CYCLE jloop
|
|
END IF
|
|
|
|
kloop: DO k = -nk, nk
|
|
!
|
|
! gamma-only: exclude line with x = 0, y = 0, z < 0
|
|
!
|
|
IF ( gamma_only .and. i == 0 .and. j == 0 .and. k < 0) CYCLE kloop
|
|
t(:) = i * bg (:,1) + j * bg (:,2) + k * bg (:,3)
|
|
!tt = sum(t(:)**2)
|
|
tt = t(1)**2+t(2)**2+t(3)**2
|
|
IF (tt <= gcutm) THEN
|
|
ngm = ngm + 1
|
|
IF (tt <= gcutms) ngms = ngms + 1
|
|
IF (ngm > ngm_max) CALL fftx_error__ ('ggen 1', 'too many g-vectors', ngm)
|
|
mill_unsorted( :, ngm ) = (/ i,j,k /)
|
|
IF ( tt > eps8 ) THEN
|
|
g2sort_g(ngm) = tt
|
|
ELSE
|
|
g2sort_g(ngm) = 0.d0
|
|
ENDIF
|
|
ENDIF
|
|
ENDDO kloop
|
|
ENDDO jloop
|
|
ENDDO iloop
|
|
|
|
IF( .NOT. global_sort ) THEN
|
|
ngmpe( mype + 1 ) = ngm
|
|
CALL MPI_ALLREDUCE( MPI_IN_PLACE, ngmpe, 1, MPI_INTEGER, MPI_SUM, comm, ierr )
|
|
END IF
|
|
IF (ngm /= ngm_max) &
|
|
CALL fftx_error__ ('ggen', 'g-vectors missing !', abs(ngm - ngm_max))
|
|
IF (ngms /= ngms_max) &
|
|
CALL fftx_error__ ('ggen', 'smooth g-vectors missing !', abs(ngms - ngms_max))
|
|
|
|
igsrt(1) = 0
|
|
IF( .NOT. global_sort ) THEN
|
|
CALL hpsort_eps( ngm, g2sort_g, igsrt, eps8 )
|
|
ELSE
|
|
CALL hpsort_eps( ngm_g, g2sort_g, igsrt, eps8 )
|
|
END IF
|
|
mill_g(1,:) = mill_unsorted(1,igsrt(:))
|
|
mill_g(2,:) = mill_unsorted(2,igsrt(:))
|
|
mill_g(3,:) = mill_unsorted(3,igsrt(:))
|
|
DEALLOCATE( g2sort_g, igsrt, mill_unsorted )
|
|
|
|
IF( .NOT. global_sort ) THEN
|
|
! compute adeguate offsets in order to avoid overlap between
|
|
! g vectors once they are gathered on a single (global) array
|
|
!
|
|
ngm_offset = 0
|
|
DO ng = 1, mype
|
|
ngm_offset = ngm_offset + ngmpe( ng )
|
|
END DO
|
|
END IF
|
|
|
|
ngm = 0
|
|
ngms = 0
|
|
!
|
|
ngloop: DO ng = 1, ngm_max
|
|
|
|
i = mill_g(1, ng)
|
|
j = mill_g(2, ng)
|
|
k = mill_g(3, ng)
|
|
|
|
#if defined(__MPI)
|
|
IF( global_sort ) THEN
|
|
m1 = mod (i, dfftp%nr1) + 1
|
|
IF (m1 < 1) m1 = m1 + dfftp%nr1
|
|
m2 = mod (j, dfftp%nr2) + 1
|
|
IF (m2 < 1) m2 = m2 + dfftp%nr2
|
|
mc = m1 + (m2 - 1) * dfftp%nr1x
|
|
IF ( dfftp%isind ( mc ) == 0) CYCLE ngloop
|
|
END IF
|
|
#endif
|
|
|
|
ngm = ngm + 1
|
|
|
|
! Here map local and global g index !!!
|
|
! N.B. the global G vectors arrangement depends on the number of processors
|
|
!
|
|
IF( .NOT. global_sort ) THEN
|
|
ig_l2g( ngm ) = ng + ngm_offset
|
|
ELSE
|
|
ig_l2g( ngm ) = ng
|
|
END IF
|
|
|
|
g (1:3, ngm) = i * bg (:, 1) + j * bg (:, 2) + k * bg (:, 3)
|
|
gg (ngm) = sum(g (1:3, ngm)**2)
|
|
|
|
IF (gg (ngm) <= gcutms) ngms = ngms + 1
|
|
IF (ngm > ngm_save) CALL fftx_error__ ('ggen 2', 'too many g-vectors', ngm)
|
|
ENDDO ngloop
|
|
|
|
IF (ngm /= ngm_save) &
|
|
CALL fftx_error__ ('ggen', 'g-vectors (ngm) missing !', abs(ngm - ngm_save))
|
|
IF (ngms /= ngms_save) &
|
|
CALL fftx_error__ ('ggen', 'g-vectors (ngms) missing !', abs(ngm - ngms_save))
|
|
!
|
|
! determine first nonzero g vector
|
|
!
|
|
IF (gg(1).le.eps8) THEN
|
|
gstart=2
|
|
ELSE
|
|
gstart=1
|
|
ENDIF
|
|
!
|
|
! Now set nl and nls with the correct fft correspondence
|
|
!
|
|
DO ng = 1, ngm
|
|
n1 = nint (sum(g (:, ng) * at (:, 1))) + 1
|
|
mill (1,ng) = n1 - 1
|
|
n1s = n1
|
|
IF (n1<1) n1 = n1 + dfftp%nr1
|
|
IF (n1s<1) n1s = n1s + dffts%nr1
|
|
|
|
n2 = nint (sum(g (:, ng) * at (:, 2))) + 1
|
|
mill (2,ng) = n2 - 1
|
|
n2s = n2
|
|
IF (n2<1) n2 = n2 + dfftp%nr2
|
|
IF (n2s<1) n2s = n2s + dffts%nr2
|
|
|
|
n3 = nint (sum(g (:, ng) * at (:, 3))) + 1
|
|
mill (3,ng) = n3 - 1
|
|
n3s = n3
|
|
IF (n3<1) n3 = n3 + dfftp%nr3
|
|
IF (n3s<1) n3s = n3s + dffts%nr3
|
|
|
|
IF (n1>dfftp%nr1 .or. n2>dfftp%nr2 .or. n3>dfftp%nr3) &
|
|
CALL fftx_error__('ggen','Mesh too small?',ng)
|
|
|
|
#if defined (__MPI) && !defined (__USE_3D_FFT)
|
|
nl (ng) = n3 + ( dfftp%isind (n1 + (n2 - 1) * dfftp%nr1x) - 1) * dfftp%nr3x
|
|
IF (ng <= ngms) &
|
|
nls (ng) = n3s + ( dffts%isind (n1s+(n2s-1)*dffts%nr1x) - 1 ) * dffts%nr3x
|
|
#else
|
|
nl (ng) = n1 + (n2 - 1) * dfftp%nr1x + (n3 - 1) * dfftp%nr1x * dfftp%nr2x
|
|
IF (ng <= ngms) &
|
|
nls (ng) = n1s + (n2s - 1) * dffts%nr1x + (n3s - 1) * dffts%nr1x * dffts%nr2x
|
|
#endif
|
|
ENDDO
|
|
!
|
|
DEALLOCATE( mill_g )
|
|
|
|
IF ( gamma_only) CALL index_minusg( ngm, ngms, nlm, nlsm, mill, dfftp, dffts )
|
|
|
|
IF( ALLOCATED( ngmpe ) ) DEALLOCATE( ngmpe )
|
|
|
|
END SUBROUTINE ggen
|
|
|
|
!
|
|
!-----------------------------------------------------------------------
|
|
SUBROUTINE index_minusg(ngm, ngms, nlm, nlsm, mill, dfftp, dffts)
|
|
!----------------------------------------------------------------------
|
|
!
|
|
! compute indices nlm and nlms giving the correspondence
|
|
! between the fft mesh points and -G (for gamma-only calculations)
|
|
!
|
|
USE fft_types
|
|
!
|
|
IMPLICIT NONE
|
|
!
|
|
TYPE(fft_type_descriptor) :: dfftp, dffts
|
|
INTEGER :: ngm, ngms
|
|
INTEGER :: mill(3, ngm), nlm(ngm), nlsm(ngms)
|
|
!
|
|
INTEGER :: n1, n2, n3, n1s, n2s, n3s, ng
|
|
!
|
|
DO ng = 1, ngm
|
|
n1 = -mill(1, ng) + 1
|
|
n1s = n1
|
|
IF (n1 < 1) THEN
|
|
n1 = n1 + dfftp%nr1
|
|
n1s = n1s+dffts%nr1
|
|
END IF
|
|
|
|
n2 = -mill(2, ng) + 1
|
|
n2s = n2
|
|
IF (n2 < 1) THEN
|
|
n2 = n2 + dfftp%nr2
|
|
n2s = n2s+dffts%nr2
|
|
END IF
|
|
n3 = -mill(3, ng) + 1
|
|
n3s = n3
|
|
IF (n3 < 1) THEN
|
|
n3 = n3 + dfftp%nr3
|
|
n3s = n3s+dffts%nr3
|
|
END IF
|
|
|
|
IF (n1 > dfftp%nr1 .or. n2 > dfftp%nr2 .or. n3 > dfftp%nr3) THEN
|
|
CALL fftx_error__('index_minusg', 'Mesh too small?', ng)
|
|
ENDIF
|
|
|
|
#if defined (__MPI) && !defined (__USE_3D_FFT)
|
|
nlm(ng) = n3 + (dfftp%isind(n1 + (n2 - 1)*dfftp%nr1x) - 1)*dfftp%nr3x
|
|
IF (ng <= ngms) &
|
|
nlsm(ng) = n3s+(dffts%isind(n1s+(n2s-1)*dffts%nr1x) - 1)*dffts%nr3x
|
|
#else
|
|
nlm(ng) = n1 + (n2 - 1)*dfftp%nr1x+(n3 - 1)*dfftp%nr1x*dfftp%nr2x
|
|
IF (ng <= ngms) &
|
|
nlsm(ng) = n1s+(n2s-1)*dffts%nr1x+(n3s-1)*dffts%nr1x*dffts%nr2x
|
|
#endif
|
|
ENDDO
|
|
|
|
END SUBROUTINE index_minusg
|
|
!
|
|
! Copyright (C) 2001 PWSCF group
|
|
! This file is distributed under the terms of the
|
|
! GNU General Public License. See the file `License'
|
|
! in the root directory of the present distribution,
|
|
! or http://www.gnu.org/copyleft/gpl.txt .
|
|
!
|
|
!---------------------------------------------------------------------
|
|
subroutine hpsort_eps(n, ra, ind, eps)
|
|
!---------------------------------------------------------------------
|
|
! sort an array ra(1:n) into ascending order using heapsort algorithm,
|
|
! and considering two elements being equal if their values differ
|
|
! for less than "eps".
|
|
! n is input, ra is replaced on output by its sorted rearrangement.
|
|
! create an index table (ind) by making an exchange in the index array
|
|
! whenever an exchange is made on the sorted data array (ra).
|
|
! in case of equal values in the data array (ra) the values in the
|
|
! index array (ind) are used to order the entries.
|
|
! if on input ind(1) = 0 then indices are initialized in the routine,
|
|
! if on input ind(1) != 0 then indices are assumed to have been
|
|
! initialized before entering the routine and these
|
|
! indices are carried around during the sorting process
|
|
!
|
|
! no work space needed !
|
|
! free us from machine-dependent sorting-routines !
|
|
!
|
|
! adapted from Numerical Recipes pg. 329 (new edition)
|
|
!
|
|
USE fft_param
|
|
implicit none
|
|
!-input/output variables
|
|
integer, intent(in) :: n
|
|
integer, intent(inout) :: ind (*)
|
|
real(DP), intent(inout) :: ra (*)
|
|
real(DP), intent(in) :: eps
|
|
!-local variables
|
|
integer :: i, ir, j, l, iind
|
|
real(DP) :: rra
|
|
! initialize index array
|
|
if (ind (1) .eq.0) then
|
|
do i = 1, n
|
|
ind (i) = i
|
|
enddo
|
|
endif
|
|
! nothing to order
|
|
if (n.lt.2) return
|
|
! initialize indices for hiring and retirement-promotion phase
|
|
l = n / 2 + 1
|
|
|
|
ir = n
|
|
|
|
sorting: do
|
|
|
|
! still in hiring phase
|
|
if ( l .gt. 1 ) then
|
|
l = l - 1
|
|
rra = ra (l)
|
|
iind = ind (l)
|
|
! in retirement-promotion phase.
|
|
else
|
|
! clear a space at the end of the array
|
|
rra = ra (ir)
|
|
!
|
|
iind = ind (ir)
|
|
! retire the top of the heap into it
|
|
ra (ir) = ra (1)
|
|
!
|
|
ind (ir) = ind (1)
|
|
! decrease the size of the corporation
|
|
ir = ir - 1
|
|
! done with the last promotion
|
|
if ( ir .eq. 1 ) then
|
|
! the least competent worker at all !
|
|
ra (1) = rra
|
|
!
|
|
ind (1) = iind
|
|
exit sorting
|
|
endif
|
|
endif
|
|
! wheter in hiring or promotion phase, we
|
|
i = l
|
|
! set up to place rra in its proper level
|
|
j = l + l
|
|
!
|
|
do while ( j .le. ir )
|
|
if ( j .lt. ir ) then
|
|
! compare to better underling
|
|
if ( abs(ra(j)-ra(j+1)).ge.eps ) then
|
|
if (ra(j).lt.ra(j+1)) j = j + 1
|
|
else
|
|
! this means ra(j) == ra(j+1) within tolerance
|
|
if (ind (j) .lt.ind (j + 1) ) j = j + 1
|
|
endif
|
|
endif
|
|
! demote rra
|
|
if ( abs(rra - ra(j)).ge.eps ) then
|
|
if (rra.lt.ra(j)) then
|
|
ra (i) = ra (j)
|
|
ind (i) = ind (j)
|
|
i = j
|
|
j = j + j
|
|
else
|
|
! set j to terminate do-while loop
|
|
j = ir + 1
|
|
end if
|
|
else
|
|
!this means rra == ra(j) within tolerance
|
|
! demote rra
|
|
if (iind.lt.ind (j) ) then
|
|
ra (i) = ra (j)
|
|
ind (i) = ind (j)
|
|
i = j
|
|
j = j + j
|
|
else
|
|
! set j to terminate do-while loop
|
|
j = ir + 1
|
|
endif
|
|
end if
|
|
enddo
|
|
ra(i) = rra
|
|
ind(i) = iind
|
|
|
|
end do sorting
|
|
!
|
|
end subroutine hpsort_eps
|
|
|
|
subroutine prepare_psi_tg(ibnd, nbnd, ngms, psi, tg_psi, nls, nlsm, dffts, gamma_only)
|
|
USE fft_param
|
|
USE fft_types
|
|
USE fft_helper_subroutines
|
|
implicit none
|
|
integer, intent(in) :: ibnd, nbnd, ngms
|
|
TYPE(fft_type_descriptor), intent(in) :: dffts
|
|
complex(DP) :: tg_psi(dffts%nnr_tg)
|
|
complex(DP) :: psi(ngms, nbnd)
|
|
integer, intent(in) :: nls(ngms), nlsm(ngms)
|
|
logical, intent(in) :: gamma_only
|
|
integer ioff, idx, j, ntgrp, right_nnr
|
|
!
|
|
tg_psi(:) = ( 0.D0, 0.D0 )
|
|
ioff = 0
|
|
!
|
|
CALL tg_get_nnr( dffts, right_nnr )
|
|
ntgrp = fftx_ntgrp(dffts)
|
|
!
|
|
IF (gamma_only) THEN
|
|
DO idx = 1, 2*ntgrp, 2
|
|
!
|
|
! ... 2*ntgrp ffts at the same time
|
|
!
|
|
IF( idx + ibnd - 1 < nbnd ) THEN
|
|
DO j = 1, ngms
|
|
tg_psi(nls (j)+ioff)= psi(j,idx+ibnd-1)+&
|
|
(0.0d0,1.d0) * psi(j,idx+ibnd)
|
|
tg_psi(nlsm(j)+ioff)=CONJG(psi(j,idx+ibnd-1) -&
|
|
(0.0d0,1.d0) * psi(j,idx+ibnd) )
|
|
END DO
|
|
ELSE IF( idx + ibnd - 1 == nbnd ) THEN
|
|
DO j = 1, ngms
|
|
tg_psi(nls (j)+ioff)= psi(j,idx+ibnd-1)
|
|
tg_psi(nlsm(j)+ioff)=CONJG( psi(j,idx+ibnd-1) )
|
|
END DO
|
|
END IF
|
|
|
|
ioff = ioff + right_nnr
|
|
|
|
END DO
|
|
ELSE
|
|
!
|
|
DO idx = 1, ntgrp
|
|
IF( idx + ibnd - 1 <= nbnd ) THEN
|
|
!$omp parallel do
|
|
DO j = 1, ngms
|
|
! here we forget about igk
|
|
tg_psi(nls (j+ioff)) = psi(j,idx+ibnd-1)
|
|
ENDDO
|
|
!$omp end parallel do
|
|
ENDIF
|
|
ioff = ioff + right_nnr
|
|
ENDDO
|
|
END IF
|
|
end subroutine prepare_psi_tg
|
|
|
|
subroutine prepare_psi( ibnd, nbnd, ngms, psi, psic, nls, nlsm, dffts, gamma_only)
|
|
USE fft_param
|
|
USE fft_types
|
|
USE fft_helper_subroutines
|
|
implicit none
|
|
integer, intent(in) :: ibnd, nbnd, ngms
|
|
TYPE(fft_type_descriptor), intent(in) :: dffts
|
|
complex(DP) :: psic( dffts%nnr )
|
|
complex(DP) :: psi( ngms, nbnd )
|
|
integer, intent(in) :: nls(ngms), nlsm(ngms)
|
|
logical, intent(in) :: gamma_only
|
|
integer :: j
|
|
psic(:) = (0.d0, 0.d0)
|
|
IF (gamma_only) THEN
|
|
IF (ibnd < nbnd) THEN
|
|
! two ffts at the same time
|
|
DO j = 1, ngms
|
|
psic(nls (j))= psi(j,ibnd) + (0.0d0,1.d0)*psi(j,ibnd+1)
|
|
psic(nlsm(j))=conjg(psi(j,ibnd) - (0.0d0,1.d0)*psi(j,ibnd+1))
|
|
ENDDO
|
|
ELSE
|
|
DO j = 1, ngms
|
|
psic (nls (j)) = psi(j, ibnd)
|
|
psic (nlsm(j)) = conjg(psi(j, ibnd))
|
|
ENDDO
|
|
ENDIF
|
|
ELSE
|
|
DO j = 1, ngms
|
|
! here we forget about igk
|
|
psic (nls (j)) = psi(j, ibnd)
|
|
END DO
|
|
END IF
|
|
end subroutine prepare_psi
|
|
|
|
|
|
subroutine accumulate_hpsi( ibnd, nbnd, ngms, hpsi, psic, nls, nlsm, dffts, gamma_only)
|
|
USE fft_types
|
|
USE fft_param
|
|
USE fft_helper_subroutines
|
|
implicit none
|
|
integer, intent(in) :: ibnd, nbnd, ngms
|
|
integer, intent(in) :: nls(ngms), nlsm(ngms)
|
|
TYPE(fft_type_descriptor) :: dffts
|
|
complex(DP) :: psic( dffts%nnr )
|
|
complex(DP) :: hpsi( ngms, nbnd )
|
|
logical, intent(in) :: gamma_only
|
|
integer j
|
|
complex(DP) :: fp, fm
|
|
!
|
|
!
|
|
! addition to the total product
|
|
!
|
|
IF (gamma_only) THEN
|
|
IF (ibnd < nbnd) THEN
|
|
! two ffts at the same time
|
|
DO j = 1, ngms
|
|
fp = (psic (nls(j)) + psic (nlsm(j)))*0.5d0
|
|
fm = (psic (nls(j)) - psic (nlsm(j)))*0.5d0
|
|
hpsi (j, ibnd) = hpsi (j, ibnd) + &
|
|
cmplx( dble(fp), aimag(fm),kind=DP)
|
|
hpsi (j, ibnd+1) = hpsi (j, ibnd+1) + &
|
|
cmplx(aimag(fp),- dble(fm),kind=DP)
|
|
ENDDO
|
|
ELSE
|
|
DO j = 1, ngms
|
|
hpsi (j, ibnd) = hpsi (j, ibnd) + psic (nls(j))
|
|
ENDDO
|
|
ENDIF
|
|
ELSE
|
|
DO j = 1, ngms ! here we forget about igk_k
|
|
hpsi (j, ibnd) = hpsi (j, ibnd) + psic (nls(j))
|
|
ENDDO
|
|
END IF
|
|
!
|
|
end subroutine accumulate_hpsi
|
|
subroutine accumulate_hpsi_tg( ibnd, nbnd, ngms, hpsi, tg_psic, nls, nlsm, dffts, gamma_only)
|
|
USE fft_types
|
|
USE fft_param
|
|
USE fft_helper_subroutines
|
|
implicit none
|
|
integer, intent(in) :: ibnd, nbnd, ngms
|
|
integer, intent(in) :: nls(ngms), nlsm(ngms)
|
|
TYPE(fft_type_descriptor) :: dffts
|
|
complex(DP) :: tg_psic( dffts%nnr_tg )
|
|
complex(DP) :: hpsi( ngms, nbnd )
|
|
logical, intent(in) :: gamma_only
|
|
integer ioff, idx, j, right_inc
|
|
complex(DP) :: fp, fm
|
|
!
|
|
! addition to the total product
|
|
!
|
|
ioff = 0
|
|
!
|
|
CALL tg_get_recip_inc( dffts, right_inc )
|
|
!
|
|
IF (gamma_only) THEN
|
|
DO idx = 1, 2*fftx_ntgrp(dffts), 2
|
|
!
|
|
IF( idx + ibnd - 1 < nbnd ) THEN
|
|
DO j = 1, ngms
|
|
fp= ( tg_psic( nls(j) + ioff ) + &
|
|
tg_psic( nlsm(j) + ioff ) ) * 0.5d0
|
|
fm= ( tg_psic( nls(j) + ioff ) - &
|
|
tg_psic( nlsm(j) + ioff ) ) * 0.5d0
|
|
hpsi (j, ibnd+idx-1) = hpsi (j, ibnd+idx-1) + &
|
|
cmplx( dble(fp), aimag(fm),kind=DP)
|
|
hpsi (j, ibnd+idx ) = hpsi (j, ibnd+idx ) + &
|
|
cmplx(aimag(fp),- dble(fm),kind=DP)
|
|
ENDDO
|
|
ELSEIF( idx + ibnd - 1 == nbnd ) THEN
|
|
DO j = 1, ngms
|
|
hpsi (j, ibnd+idx-1) = hpsi (j, ibnd+idx-1) + &
|
|
tg_psic( nls(j) + ioff )
|
|
ENDDO
|
|
ENDIF
|
|
!
|
|
ioff = ioff + right_inc
|
|
!
|
|
ENDDO
|
|
ELSE
|
|
DO idx = 1, fftx_ntgrp(dffts)
|
|
!
|
|
IF( idx + ibnd - 1 <= nbnd ) THEN
|
|
!$omp parallel do
|
|
DO j = 1, ngms
|
|
hpsi (j, ibnd+idx-1) = hpsi (j, ibnd+idx-1) + &
|
|
tg_psic( nls(j) + ioff )
|
|
! we forgot about igk above
|
|
ENDDO
|
|
!$omp end parallel do
|
|
ENDIF
|
|
!
|
|
ioff = ioff + right_inc
|
|
!
|
|
ENDDO
|
|
END IF
|
|
!
|
|
end subroutine accumulate_hpsi_tg
|