Merge branch 'fix_fft_test' into 'develop'

fix fft unit tests

See merge request QEF/q-e!1552
This commit is contained in:
Ye Luo 2021-10-22 15:12:23 +00:00
commit 4fd0d2c167
6 changed files with 626 additions and 90 deletions

View File

@ -1,6 +1,6 @@
set(test_common_src tester.f90 utils.f90)
set(test_common_src tester.f90 utils.f90 sort.f90 recips.f90)
qe_add_library(qe_fftx_test_common ${test_common_src})
target_link_libraries(qe_fftx_test_common PRIVATE qe_mpi_fortran)
target_link_libraries(qe_fftx_test_common PRIVATE qe_fftx qe_mpi_fortran)
set(source_names fft_scalar_gpu fft_scatter_mod_gpu fwinv_gpu)
foreach(NAME ${source_names})

View File

@ -3,6 +3,7 @@
include ../../make.inc
MODFLAGS = $(MOD_FLAG).. $(MOD_FLAG).
LDFLAGS += -Mcudalib=cufft
SRCS = test_fft_scalar_gpu.f90 \
test_fft_scatter_mod_gpu.f90 \
@ -12,10 +13,10 @@ EXECS = $(SRCS:.f90=.x)
all: common $(EXECS)
common: tester.o utils.o
common: sort.o tester.o utils.o recips.o
%.x: %.o
$(LD) $(LDFLAGS) $< utils.o tester.o -o $@ ../libqefft.a $(FFT_LIBS) $(BLAS_LIBS) $(MPI_LIBS) $(LD_LIBS)
$(LD) $(LDFLAGS) $< utils.o tester.o sort.o recips.o -o $@ ../libqefft.a $(FFT_LIBS) $(BLAS_LIBS) $(MPI_LIBS) $(LD_LIBS)
clean:
- /bin/rm -f *.o *.a *.d *.i *~ *_tmp.f90 *.mod *.L *.x rnd_seed*

67
FFTXlib/tests/recips.f90 Normal file
View File

@ -0,0 +1,67 @@
SUBROUTINE recips (a1, a2, a3, b1, b2, b3)
!---------------------------------------------------------------------
!
! This routine generates the reciprocal lattice vectors b1,b2,b3
! given the real space vectors a1,a2,a3. The b's are units of 2 pi/a.
!
! first the input variables
!
USE fft_param, ONLY : DP
implicit none
real(DP) :: a1 (3), a2 (3), a3 (3), b1 (3), b2 (3), b3 (3)
! input: first direct lattice vector
! input: second direct lattice vector
! input: third direct lattice vector
! output: first reciprocal lattice vector
! output: second reciprocal lattice vector
! output: third reciprocal lattice vector
!
! then the local variables
!
real(DP) :: den, s
! the denominator
! the sign of the permutations
integer :: iperm, i, j, k, l, ipol
! counter on the permutations
!\
! Auxiliary variables
!/
!
! Counter on the polarizations
!
! first we compute the denominator
!
den = 0
i = 1
j = 2
k = 3
s = 1.d0
100 do iperm = 1, 3
den = den + s * a1 (i) * a2 (j) * a3 (k)
l = i
i = j
j = k
k = l
enddo
i = 2
j = 1
k = 3
s = - s
if (s.lt.0.d0) goto 100
!
! here we compute the reciprocal vectors
!
i = 1
j = 2
k = 3
do ipol = 1, 3
b1 (ipol) = (a2 (j) * a3 (k) - a2 (k) * a3 (j) ) / den
b2 (ipol) = (a3 (j) * a1 (k) - a3 (k) * a1 (j) ) / den
b3 (ipol) = (a1 (j) * a2 (k) - a1 (k) * a2 (j) ) / den
l = i
i = j
j = k
k = l
enddo
return
end subroutine recips

117
FFTXlib/tests/sort.f90 Normal file
View File

@ -0,0 +1,117 @@
subroutine hpsort_eps(n, ra, ind, eps)
!---------------------------------------------------------------------
! sort an array ra(1:n) into ascending order using heapsort algorithm,
! and considering two elements being equal if their values differ
! for less than "eps".
! n is input, ra is replaced on output by its sorted rearrangement.
! create an index table (ind) by making an exchange in the index array
! whenever an exchange is made on the sorted data array (ra).
! in case of equal values in the data array (ra) the values in the
! index array (ind) are used to order the entries.
! if on input ind(1) = 0 then indices are initialized in the routine,
! if on input ind(1) != 0 then indices are assumed to have been
! initialized before entering the routine and these
! indices are carried around during the sorting process
!
! no work space needed !
! free us from machine-dependent sorting-routines !
!
! adapted from Numerical Recipes pg. 329 (new edition)
!
USE fft_param
implicit none
!-input/output variables
integer, intent(in) :: n
integer, intent(inout) :: ind (*)
real(DP), intent(inout) :: ra (*)
real(DP), intent(in) :: eps
!-local variables
integer :: i, ir, j, l, iind
real(DP) :: rra
! initialize index array
if (ind (1) .eq.0) then
do i = 1, n
ind (i) = i
enddo
endif
! nothing to order
if (n.lt.2) return
! initialize indices for hiring and retirement-promotion phase
l = n / 2 + 1
ir = n
sorting: do
! still in hiring phase
if ( l .gt. 1 ) then
l = l - 1
rra = ra (l)
iind = ind (l)
! in retirement-promotion phase.
else
! clear a space at the end of the array
rra = ra (ir)
!
iind = ind (ir)
! retire the top of the heap into it
ra (ir) = ra (1)
!
ind (ir) = ind (1)
! decrease the size of the corporation
ir = ir - 1
! done with the last promotion
if ( ir .eq. 1 ) then
! the least competent worker at all !
ra (1) = rra
!
ind (1) = iind
exit sorting
endif
endif
! wheter in hiring or promotion phase, we
i = l
! set up to place rra in its proper level
j = l + l
!
do while ( j .le. ir )
if ( j .lt. ir ) then
! compare to better underling
if ( abs(ra(j)-ra(j+1)).ge.eps ) then
if (ra(j).lt.ra(j+1)) j = j + 1
else
! this means ra(j) == ra(j+1) within tolerance
if (ind (j) .lt.ind (j + 1) ) j = j + 1
endif
endif
! demote rra
if ( abs(rra - ra(j)).ge.eps ) then
if (rra.lt.ra(j)) then
ra (i) = ra (j)
ind (i) = ind (j)
i = j
j = j + j
else
! set j to terminate do-while loop
j = ir + 1
end if
else
!this means rra == ra(j) within tolerance
! demote rra
if (iind.lt.ind (j) ) then
ra (i) = ra (j)
ind (i) = ind (j)
i = j
j = j + j
else
! set j to terminate do-while loop
j = ir + 1
endif
end if
enddo
ra(i) = rra
ind(i) = iind
end do sorting
!
end subroutine hpsort_eps

View File

@ -20,9 +20,9 @@ program test_fft_scatter_mod_gpu
!
#if defined(__MPI)
#if defined(_OPENMP)
CALL MPI_Init_thread(MPI_THREAD_FUNNELED,level, ierr)
CALL MPI_Init_thread(MPI_THREAD_FUNNELED,level, ierr)
#else
CALL MPI_Init(ierr)
CALL MPI_Init(ierr)
#endif
#endif
!
@ -32,24 +32,27 @@ program test_fft_scatter_mod_gpu
!
test%tolerance64 = 1.d-14
!
CALL save_random_seed("test_fft_scatter_mod_gpu", mp%me)
!
DO i = 1, mp%n
IF (MOD(mp%n,i) == 0 ) THEN
! gamma case
CALL test_fft_scatter_xy_gpu_1(mp, test, .true., i)
! k case
CALL test_fft_scatter_xy_gpu_1(mp, test, .false., i)
!
! gamma case
CALL test_fft_scatter_yz_gpu_1(mp, test, .true., i)
! k case
CALL test_fft_scatter_yz_gpu_1(mp, test, .false., i)
END IF
END DO
CALL test_fft_scatter_many_yz_gpu_1(mp, test, .true., 1)
CALL test_fft_scatter_many_yz_gpu_1(mp, test, .false., 1)
IF (mp%n > 1) THEN
!
CALL save_random_seed("test_fft_scatter_mod_gpu", mp%me)
!
DO i = 1, mp%n
IF (MOD(mp%n,i) == 0 ) THEN
! gamma case
CALL test_fft_scatter_xy_gpu_1(mp, test, .true., i)
! k case
CALL test_fft_scatter_xy_gpu_1(mp, test, .false., i)
!
! gamma case
CALL test_fft_scatter_yz_gpu_1(mp, test, .true., i)
! k case
CALL test_fft_scatter_yz_gpu_1(mp, test, .false., i)
END IF
END DO
CALL test_fft_scatter_many_yz_gpu_1(mp, test, .true., 1)
CALL test_fft_scatter_many_yz_gpu_1(mp, test, .false., 1)
!
ENDIF
!
CALL collect_results(test)
!

View File

@ -1,4 +1,9 @@
#if defined(__CUDA)
#define DEVATTR ,DEVICE
#else
#define DEVATTR
#endif
program test_fwinv_gpu
#if defined(__MPI) && defined(__MPI_MODULE)
USE mpi
@ -26,29 +31,40 @@ program test_fwinv_gpu
#endif
#endif
!
! Prepare MPI and communicators
CALL mpi_data_init(mp%me, mp%n, mp%root, mp%comm)
!
CALL test%init()
!
test%tolerance64 = 1.d-12
! A rather large threshold is necessary to match the results of all
! possible implementations.
test%tolerance64 = 1.d-10
!
CALL save_random_seed("test_fwinv_gpu", mp%me)
!
DO i = 1, mp%n
IF (MOD(mp%n,i) == 0 ) THEN
! test R -> G, gamma case
CALL test_fwfft_gpu_1(mp, test, .true., i)
! test R -> G, k case
CALL test_fwfft_gpu_1(mp, test, .false., i)
!
! test G -> R, gamma case
CALL test_invfft_gpu_1(mp, test, .true., i)
! test G -> R, k case
CALL test_invfft_gpu_1(mp, test, .false., i)
END IF
END DO
!
#if defined(__CUDA)
! the batched FFT is only implemented for GPU,
! will segault if called on non-implemented version.
CALL test_fwfft_many_gpu_1(mp, test, .true., 1)
CALL test_fwfft_many_gpu_1(mp, test, .false., 1)
!
CALL test_invfft_many_gpu_1(mp, test, .true., 1)
CALL test_invfft_many_gpu_1(mp, test, .false., 1)
#endif
!
CALL collect_results(test)
!
@ -71,30 +87,6 @@ program test_fwinv_gpu
#endif
END SUBROUTINE mpi_data_init
!
SUBROUTINE calc_bg(at, bg)
USE fft_param, ONLY : DP
implicit none
REAL(DP), PARAMETER :: pi=4.D0*DATAN(1.D0)
REAL(DP), INTENT(IN) :: at(3,3)
REAL(DP), INTENT(OUT) :: bg(3,3)
REAL(DP) :: ucvol
!
ucvol=at(1,1)*(at(2,2)*at(3,3)-at(3,2)*at(2,3))+&
& at(2,1)*(at(3,2)*at(1,2)-at(1,2)*at(3,3))+&
at(3,1)*(at(1,2)*at(2,3)-at(2,2)*at(1,3))
!calculate reciprocal-space lattice vectors
bg(1,1)=2.0*pi*(at(2,2)*at(3,3)-at(3,2)*at(2,3))/ucvol
bg(2,1)=2.0*pi*(at(3,2)*at(1,3)-at(1,2)*at(3,3))/ucvol
bg(3,1)=2.0*pi*(at(1,2)*at(2,3)-at(2,2)*at(1,3))/ucvol
bg(1,2)=2.0*pi*(at(2,3)*at(3,1)-at(3,3)*at(2,1))/ucvol
bg(2,2)=2.0*pi*(at(3,3)*at(1,1)-at(1,3)*at(3,1))/ucvol
bg(3,2)=2.0*pi*(at(1,3)*at(2,1)-at(2,3)*at(1,1))/ucvol
bg(1,3)=2.0*pi*(at(2,1)*at(3,2)-at(3,1)*at(2,2))/ucvol
bg(2,3)=2.0*pi*(at(3,1)*at(1,2)-at(1,1)*at(3,2))/ucvol
bg(3,3)=2.0*pi*(at(1,1)*at(2,2)-at(2,1)*at(1,2))/ucvol
END SUBROUTINE calc_bg
!
SUBROUTINE fft_desc_init(dfft, smap, flavor, gamma_only, parallel, comm, nyfft)
USE stick_base
USE fft_types, ONLY : fft_type_descriptor, fft_type_init
@ -106,17 +98,279 @@ program test_fwinv_gpu
LOGICAL :: gamma_only
LOGICAL :: parallel
INTEGER :: comm, nyfft
INTEGER :: ngm, ngm_g
!
REAL(DP) :: at(3,3), bg(3,3)
REAL(DP) :: at(3,3), bg(3,3), alat, tpiba
REAL(DP), ALLOCATABLE :: g(:, :)
!
REAL(DP), PARAMETER :: gcut = 80.d0
REAL(DP), PARAMETER :: dual = 4.d0
REAL(DP), PARAMETER :: pi=4.D0*DATAN(1.D0)
!
! Define a direct lattice
at = RESHAPE((/10.d0, 0.d0, 0.d0, 0.d0, 10.d0, 0.d0, 0.d0, 0.d0, 10.d0/), shape(at))
CALL calc_bg(at, bg)
!
CALL fft_type_init(dfft, smap, flavor, gamma_only, parallel, comm, at, bg, 12.d0, 6.d0, &
alat = SQRT ( at(1,1)**2+at(2,1)**2+at(3,1)**2 )
!
! Lattice must be defined in units of alat
at(:,:) = at(:,:) / alat
!
tpiba = 2.0d0*pi/alat
!
! And the related recuprocal space
CALL recips(at(1, 1), at(1, 2), at(1, 3), bg(1, 1), bg(1, 2), bg(1, 3))
!
! In a FFT of flavor='wave' the dual, here set to 4.0d0, will multiply gcut to obtain
! the cutoff for hosting "charges" i.e. wfc**2.
!
! Inputs are:
! dfft -> the fft type containing all details about the distributed grid
! smap -> the map of the sticks used to distribute the data in rec. space
! flavor -> can be be 'wave' or 'rho'
! gamma_only -> setup the complex FFT to perform two real FFTs at a time
! parallel -> true if more than 1 process will take part in the FFT, false otherwise
! comm -> the mpi communicator to be used for messages among the processes performing the FFTs,
! basically alltoall communications to perform (partial) transposition of the data
! at -> the real space lattice, needs to be define in units of alat
! bg -> the reciprocal lattice
! gcut -> the cutoff for the plane wave expansion for flavor='wave'
! dual -> the dimension of the sphere of g vectors (generally meant
! to be large enough to host products of wavefunctions)
! nyfft -> data are decomposed along Z in real space when parallel=.true.,
! along X in real space when parallel=.false., and as sticks in reciprocal
! space. nyfft describes a second level of division along another dimension in real space
! or, when enabled, as portions of multiple bands. ???
! nmany -> maximum number of bands to be transformed simultaneously ???
!
CALL fft_type_init(dfft, smap, flavor, gamma_only, parallel, comm, at, bg, gcut, dual, &
& nyfft=nyfft, nmany=1)
!
! Create a map between g vectors and distributed grid point in the fft.
! We will not he gvectors, but the map, stored in dfft$nl and dfft%nlm (nl minus, for gamma case)
! will be used to check only physically relevant numbers.
!
ALLOCATE (g(3, dfft%ngm))
!
! Largest g vector size
ngm = dfft%ngm
!
#if defined(__MPI)
CALL MPI_ALLREDUCE(ngm, ngm_g, 1, MPI_INTEGER, MPI_SUM, MPI_COMM_WORLD, ierr)
#else
ngm_g = ngm
#endif
! Generate G vectors and map global g vectors to local FFT points.
CALL ggen(dfft, gamma_only, at, bg, 4.d0*gcut, ngm_g, ngm, g, .false.)
!
DEALLOCATE(g)
!
END SUBROUTINE fft_desc_init
!
SUBROUTINE ggen ( dfftp, gamma_only, at, bg, gcutm, ngm_g, ngm, g, no_global_sort )
!----------------------------------------------------------------------
!
! This routine generates all the reciprocal lattice vectors
! contained in the sphere of radius gcutm. Furthermore it
! computes the indices nl which give the correspondence
! between the fft mesh points and the array of g vectors.
!
USE fft_types, ONLY: fft_stick_index, fft_type_descriptor
USE fft_ggen, ONLY : fft_set_nl
USE fft_param
!USE mp, ONLY: mp_rank, mp_size, mp_sum
!USE constants, ONLY : eps8
!
IMPLICIT NONE
!
TYPE(fft_type_descriptor),INTENT(INOUT) :: dfftp
LOGICAL, INTENT(IN) :: gamma_only
REAL(DP), INTENT(IN) :: at(3,3), bg(3,3), gcutm
INTEGER, INTENT(IN) :: ngm_g
INTEGER, INTENT(INOUT) :: ngm
REAL(DP), INTENT(OUT) :: g(:,:)
! if no_global_sort is present (and it is true) G vectors are sorted only
! locally and not globally. In this case no global array needs to be
! allocated and sorted: saves memory and a lot of time for large systems.
!
LOGICAL, INTENT(IN) :: no_global_sort
!
! here a few local variables
!
REAL(DP) :: tx(3), ty(3), t(3)
REAL(DP), ALLOCATABLE :: tt(:)
INTEGER :: ngm_save, ngm_max, ngm_local
!
REAL(DP), ALLOCATABLE :: g2sort_g(:)
! array containing only g vectors for the current processor
INTEGER, ALLOCATABLE :: mill_unsorted(:,:)
! array containing all g vectors generators, on all processors
! (replicated data). When no_global_sort is present and .true.,
! only g-vectors for the current processor are stored
INTEGER, ALLOCATABLE :: igsrt(:), g2l(:)
!
INTEGER :: ni, nj, nk, i, j, k, ng
INTEGER :: istart, jstart, kstart
LOGICAL :: global_sort, is_local
!
! The 'no_global_sort' is not optional in this case.
! This differs from the version present in QE distribution.
global_sort = .NOT. no_global_sort
!
IF( .NOT. global_sort ) THEN
ngm_max = ngm
ELSE
ngm_max = ngm_g
END IF
!
! save current value of ngm
!
ngm_save = ngm
!
ngm = 0
ngm_local = 0
!
! set the total number of fft mesh points and initial value of gg
! The choice of gcutm is due to the fact that we have to order the
! vectors after computing them.
!
!
! and computes all the g vectors inside a sphere
!
ALLOCATE( mill_unsorted( 3, ngm_save ) )
ALLOCATE( igsrt( ngm_max ) )
ALLOCATE( g2l( ngm_max ) )
ALLOCATE( g2sort_g( ngm_max ) )
!
g2sort_g(:) = 1.0d20
!
! allocate temporal array
!
ALLOCATE( tt( dfftp%nr3 ) )
!
! max miller indices (same convention as in module stick_set)
!
ni = (dfftp%nr1-1)/2
nj = (dfftp%nr2-1)/2
nk = (dfftp%nr3-1)/2
!
! gamma-only: exclude space with x < 0
!
IF ( gamma_only ) THEN
istart = 0
ELSE
istart = -ni
ENDIF
!
iloop: DO i = istart, ni
!
! gamma-only: exclude plane with x = 0, y < 0
!
IF ( gamma_only .and. i == 0 ) THEN
jstart = 0
ELSE
jstart = -nj
ENDIF
!
tx(1:3) = i * bg(1:3,1)
!
jloop: DO j = jstart, nj
!
IF ( .NOT. global_sort ) THEN
IF ( fft_stick_index( dfftp, i, j ) == 0 ) CYCLE jloop
is_local = .TRUE.
ELSE
IF ( dfftp%lpara .AND. fft_stick_index( dfftp, i, j ) == 0) THEN
is_local = .FALSE.
ELSE
is_local = .TRUE.
END IF
END IF
!
! gamma-only: exclude line with x = 0, y = 0, z < 0
!
IF ( gamma_only .and. i == 0 .and. j == 0 ) THEN
kstart = 0
ELSE
kstart = -nk
ENDIF
!
ty(1:3) = tx(1:3) + j * bg(1:3,2)
!
! compute all the norm square
!
DO k = kstart, nk
!
t(1) = ty(1) + k * bg(1,3)
t(2) = ty(2) + k * bg(2,3)
t(3) = ty(3) + k * bg(3,3)
tt(k-kstart+1) = t(1)**2 + t(2)**2 + t(3)**2
ENDDO
!
! save all the norm square within cutoff
!
DO k = kstart, nk
IF (tt(k-kstart+1) <= gcutm) THEN
ngm = ngm + 1
IF (ngm > ngm_max) CALL fftx_error__ ('ggen 1', 'too many g-vectors', ngm)
IF ( tt(k-kstart+1) > eps8 ) THEN
g2sort_g(ngm) = tt(k-kstart+1)
ELSE
g2sort_g(ngm) = 0.d0
ENDIF
IF (is_local) THEN
ngm_local = ngm_local + 1
mill_unsorted( :, ngm_local ) = (/ i,j,k /)
g2l(ngm) = ngm_local
ELSE
g2l(ngm) = 0
ENDIF
ENDIF
ENDDO
ENDDO jloop
ENDDO iloop
IF (ngm /= ngm_max) &
CALL fftx_error__ ('ggen', 'g-vectors missing !', abs(ngm - ngm_max))
!
igsrt(1) = 0
IF( .NOT. global_sort ) THEN
CALL hpsort_eps( ngm, g2sort_g, igsrt, eps8 )
ELSE
CALL hpsort_eps( ngm_g, g2sort_g, igsrt, eps8 )
END IF
DEALLOCATE( g2sort_g, tt )
!
ngm = 0
!
ngloop: DO ng = 1, ngm_max
!
IF (g2l(igsrt(ng))>0) THEN
! fetch the indices
i = mill_unsorted(1, g2l(igsrt(ng)))
j = mill_unsorted(2, g2l(igsrt(ng)))
k = mill_unsorted(3, g2l(igsrt(ng)))
!
ngm = ngm + 1
!
! Here map local and global g index !!! N.B: :
! the global G vectors arrangement depends on the number of processors
!
g(1:3, ngm) = i * bg (:, 1) + j * bg (:, 2) + k * bg (:, 3)
!
ENDIF
ENDDO ngloop
DEALLOCATE( igsrt, g2l )
IF (ngm /= ngm_save) &
CALL fftx_error__ ('ggen', 'g-vectors (ngm) missing !', abs(ngm - ngm_save))
!
! Now set nl and nls with the correct fft correspondence
!
CALL fft_set_nl( dfftp, at, g)
!
END SUBROUTINE ggen
!
SUBROUTINE fft_desc_finalize(dfft, smap)
USE fft_types, ONLY : fft_type_descriptor, fft_type_deallocate
USE stick_base, ONLY : sticks_map, sticks_map_deallocate
@ -129,10 +383,9 @@ program test_fwinv_gpu
END SUBROUTINE fft_desc_finalize
!
SUBROUTINE fill_random(c, c_d, n)
USE cudafor
USE fft_param, ONLY : DP
implicit none
complex(DP), device :: c_d(:)
complex(DP) DEVATTR :: c_d(:)
complex(DP) :: c(:)
integer, intent(in) :: n
!
@ -140,13 +393,26 @@ program test_fwinv_gpu
!
ALLOCATE (rnd_aux(2*n))
CALL RANDOM_NUMBER(rnd_aux)
c = CMPLX(rnd_aux(1:n), rnd_aux(n:2*n))
c(1:n) = CMPLX(rnd_aux(1:n), rnd_aux(n+1:2*n), kind=DP)
c_d = c
DEALLOCATE(rnd_aux)
END SUBROUTINE fill_random
!
SUBROUTINE fill_random_cpu(c, n)
USE fft_param, ONLY : DP
implicit none
complex(DP) :: c(:)
integer, intent(in) :: n
!
real(DP), ALLOCATABLE :: rnd_aux(:)
!
ALLOCATE (rnd_aux(2*n))
CALL RANDOM_NUMBER(rnd_aux)
c(1:n) = CMPLX(rnd_aux(1:n), rnd_aux(n+1:2*n), kind=DP)
DEALLOCATE(rnd_aux)
END SUBROUTINE fill_random_cpu
!
SUBROUTINE test_fwfft_gpu_1(mp, test, gamma_only, ny)
USE cudafor
USE fft_param, ONLY : DP
USE fft_types, ONLY : fft_type_descriptor
USE stick_base, ONLY : sticks_map
@ -162,9 +428,12 @@ program test_fwinv_gpu
!
LOGICAL :: parallel
COMPLEX(DP), ALLOCATABLE :: data_in(:), aux(:)
COMPLEX(DP), ALLOCATABLE, DEVICE :: data_in_d(:)
COMPLEX(DP), ALLOCATABLE DEVATTR :: data_in_d(:)
INTEGER :: i
!
! task groups not implemented in 2D decomposition. Need to check the other case
IF ( ny .gt. 1 ) return
parallel = mp%n .gt. 1
CALL fft_desc_init(dfft, smap, 'wave', gamma_only, parallel, mp%comm, nyfft=ny)
dfft%rho_clock_label='bla' ; dfft%wave_clock_label='bla'
@ -172,7 +441,7 @@ program test_fwinv_gpu
! Test 1
!
IF ( ny .gt. 1 ) THEN
! Allocate variables
! Allocate variables and fill realspace data with random numbers
ALLOCATE(data_in(dfft%nnr_tg), aux(dfft%nnr_tg))
ALLOCATE(data_in_d(dfft%nnr_tg))
CALL fill_random(data_in, data_in_d, dfft%nnr_tg)
@ -180,20 +449,30 @@ program test_fwinv_gpu
CALL fwfft( 'tgWave' , data_in, dfft, 1 )
CALL fwfft( 'tgWave' , data_in_d, dfft, 1 )
ELSE
! Allocate variables and fill realspace data with random numbers
ALLOCATE(data_in(dfft%nnr), aux(dfft%nnr))
ALLOCATE(data_in_d(dfft%nnr))
CALL fill_random(data_in, data_in_d, dfft%nnr)
CALL fill_random(data_in, data_in_d, dfft%nnr)
!
CALL fwfft( 'Wave' , data_in, dfft, 1 )
CALL fwfft( 'Wave' , data_in_d, dfft, 1 )
ENDIF
! data from GPU is moved to an auxiliary array to compare the results of the GPU
! and the CPU implementation on the host
aux = data_in_d
! Check
CALL test%assert_close( data_in(1:dfft%ngw), aux(1:dfft%ngw) )
!
! Check, only the values relevant for a wavefunction FFT are considered
DO i=1,dfft%ngw
IF (gamma_only) CALL test%assert_close( data_in(dfft%nlm(i)), aux(dfft%nlm(i)) )
IF (.not. gamma_only) CALL test%assert_close( data_in(dfft%nl(i)), aux(dfft%nl(i)) )
ENDDO
!
DEALLOCATE(data_in, data_in_d, aux)
!
!
! Test 2
!
DEALLOCATE(data_in, data_in_d, aux)
! Same as above
ALLOCATE(data_in(dfft%nnr), aux(dfft%nnr))
ALLOCATE(data_in_d(dfft%nnr))
CALL fill_random(data_in, data_in_d, dfft%nnr)
@ -201,8 +480,13 @@ program test_fwinv_gpu
CALL fwfft( 'Rho' , data_in, dfft, 1 )
CALL fwfft( 'Rho' , data_in_d, dfft, 1 )
aux = data_in_d
! Check
CALL test%assert_close( data_in(1:dfft%ngm), aux(1:dfft%ngm) )
!
! Check, only the values relevant for a product of wavefunctions are considered
!
DO i=1,dfft%ngm
IF (gamma_only) CALL test%assert_close( data_in(dfft%nlm(i)), aux(dfft%nlm(i)) )
IF (.not. gamma_only) CALL test%assert_close( data_in(dfft%nl(i)), aux(dfft%nl(i)) )
ENDDO
!
CALL fft_desc_finalize(dfft, smap)
DEALLOCATE(data_in, data_in_d, aux)
@ -210,7 +494,7 @@ program test_fwinv_gpu
END SUBROUTINE test_fwfft_gpu_1
!
SUBROUTINE test_invfft_gpu_1(mp, test, gamma_only, ny)
USE cudafor
!
USE fft_param, ONLY : DP
USE fft_types, ONLY : fft_type_descriptor
USE stick_base, ONLY : sticks_map
@ -226,8 +510,13 @@ program test_fwinv_gpu
!
LOGICAL :: parallel
COMPLEX(DP), ALLOCATABLE :: data_in(:), aux(:)
COMPLEX(DP), ALLOCATABLE, DEVICE :: data_in_d(:)
COMPLEX(DP), ALLOCATABLE DEVATTR :: data_in_d(:)
integer :: i
!
! task groups not implemented in 2D decomposition. Need to check the other case
IF ( ny .gt. 1 ) return
parallel = mp%n .gt. 1
CALL fft_desc_init(dfft, smap, 'wave', gamma_only, parallel, mp%comm, nyfft=ny)
dfft%rho_clock_label='bla' ; dfft%wave_clock_label='bla'
@ -238,19 +527,32 @@ program test_fwinv_gpu
! Allocate variables
ALLOCATE(data_in(dfft%nnr_tg), aux(dfft%nnr_tg))
ALLOCATE(data_in_d(dfft%nnr_tg))
!
! Data here is not correctly filled, but this test is disabled.
! This is left as TODO!!!
!
CALL fill_random(data_in, data_in_d, dfft%nnr_tg)
!
CALL invfft( 'tgWave' , data_in, dfft, 1 )
CALL invfft( 'tgWave' , data_in_d, dfft, 1 )
ELSE
!
! Allocate variables
ALLOCATE(data_in(dfft%nnr), aux(dfft%nnr))
ALLOCATE(data_in_d(dfft%nnr))
CALL fill_random(data_in, data_in_d, dfft%nnr)
!
! Prepare input data, only vectors of wavefunctions
data_in = (0.d0, 0.d0)
CALL fill_random_cpu(aux, dfft%ngw)
DO i=1, dfft%ngw
IF (gamma_only) data_in(dfft%nlm(i)) = aux(i)
IF (.not. gamma_only) data_in(dfft%nl(i)) = aux(i)
ENDDO
! copy to gpu and cleanup aux
data_in_d = data_in
aux = (0.d0, 0.d0)
!
CALL invfft( 'Wave' , data_in, dfft, 1 )
CALL invfft( 'Wave' , data_in_d, dfft, 1 )
CALL invfft( 'Wave' , data_in_d, dfft, 1 )
ENDIF
aux = data_in_d
! Check
@ -261,7 +563,16 @@ program test_fwinv_gpu
DEALLOCATE(data_in, data_in_d, aux)
ALLOCATE(data_in(dfft%nnr), aux(dfft%nnr))
ALLOCATE(data_in_d(dfft%nnr))
CALL fill_random(data_in, data_in_d, dfft%nnr)
! Prepare input data
data_in = (0.d0, 0.d0)
CALL fill_random_cpu(aux, dfft%ngm)
DO i=1, dfft%ngm
IF (gamma_only) data_in(dfft%nlm(i)) = aux(i)
IF (.not. gamma_only) data_in(dfft%nl(i)) = aux(i)
ENDDO
! copy to gpu and cleanup aux
data_in_d = data_in
aux = (0.d0, 0.d0)
!
CALL invfft( 'Rho' , data_in, dfft, 1 )
CALL invfft( 'Rho' , data_in_d, dfft, 1 )
@ -275,7 +586,7 @@ program test_fwinv_gpu
END SUBROUTINE test_invfft_gpu_1
!
SUBROUTINE test_fwfft_many_gpu_1(mp, test, gamma_only, ny)
USE cudafor
!
USE fft_param, ONLY : DP
USE fft_types, ONLY : fft_type_descriptor
USE stick_base, ONLY : sticks_map
@ -291,9 +602,9 @@ program test_fwinv_gpu
!
LOGICAL :: parallel
COMPLEX(DP), ALLOCATABLE :: data_in(:), aux(:)
COMPLEX(DP), ALLOCATABLE, DEVICE :: data_in_d(:)
COMPLEX(DP), ALLOCATABLE DEVATTR :: data_in_d(:)
integer, parameter :: howmany=4
INTEGER :: i, start
INTEGER :: i, ii, start
!
parallel = mp%n .gt. 1
CALL fft_desc_init(dfft, smap, 'wave', gamma_only, parallel, mp%comm, nyfft=ny)
@ -314,9 +625,13 @@ program test_fwinv_gpu
DO i=0,howmany-1
start = i*dfft%nnr
CALL fwfft( 'Wave' , data_in(1+start:), dfft, 1 )
aux(start+1:start+dfft%nnr) = data_in_d(start+1:start+dfft%nnr)
! Check
CALL test%assert_close( data_in(start+1:start+dfft%ngw), aux(start+1:start+dfft%ngw) )
aux(1:dfft%nnr) = data_in_d(start+1:start+dfft%nnr)
! Check, only the values relevant for a wavefunction FFT are considered
DO ii=1,dfft%ngw
IF (gamma_only) CALL test%assert_close( data_in(dfft%nlm(ii)+start), aux(dfft%nlm(ii)) )
IF (.not. gamma_only) CALL test%assert_close( data_in(dfft%nl(ii)+start), aux(dfft%nl(ii)) )
ENDDO
!
END DO
!
ENDIF
@ -331,11 +646,19 @@ program test_fwinv_gpu
!
CALL fwfft( 'Rho' , data_in_d, dfft, howmany)
DO i=0,howmany-1
!
start = i*dfft%nnr
!
! This will FFT the content of data_in starting from start+1 and for nnr elements
CALL fwfft( 'Rho' , data_in(1+start:), dfft, 1 )
aux(1:dfft%nnr) = data_in_d(start+1:start+dfft%nnr)
! Check
CALL test%assert_close( data_in(start+1:start+dfft%ngm), aux(1:dfft%ngm) )
!
! Same check as above, but remember that data_in starts from "start"
DO ii=1,dfft%ngm
IF (gamma_only) CALL test%assert_close( data_in(dfft%nlm(ii)+start), aux(dfft%nlm(ii)) )
IF (.not. gamma_only) CALL test%assert_close( data_in(dfft%nl(ii)+start), aux(dfft%nl(ii)) )
ENDDO
!
END DO
!
CALL fft_desc_finalize(dfft, smap)
@ -344,7 +667,6 @@ program test_fwinv_gpu
END SUBROUTINE test_fwfft_many_gpu_1
!
SUBROUTINE test_invfft_many_gpu_1(mp, test, gamma_only, ny)
USE cudafor
USE fft_param, ONLY : DP
USE fft_types, ONLY : fft_type_descriptor
USE stick_base, ONLY : sticks_map
@ -360,8 +682,7 @@ program test_fwinv_gpu
!
LOGICAL :: parallel
COMPLEX(DP), ALLOCATABLE :: data_in(:), aux(:)
COMPLEX(DP), ALLOCATABLE, DEVICE :: data_in_d(:)
INTEGER(kind = cuda_stream_kind) :: strm = 0
COMPLEX(DP), ALLOCATABLE DEVATTR :: data_in_d(:)
integer, parameter :: howmany=4
integer :: start, i
!
@ -379,10 +700,23 @@ program test_fwinv_gpu
! Allocate variables
ALLOCATE(data_in(howmany*dfft%nnr), aux(howmany*dfft%nnr))
ALLOCATE(data_in_d(howmany*dfft%nnr))
CALL fill_random(data_in, data_in_d, howmany*dfft%nnr)
!
!CALL invfft( 'Wave' , data_in, dfft, 1 )
CALL invfft( 'Wave' , data_in_d, dfft, howmany=howmany, stream=strm )
data_in = (0.d0, 0.d0)
CALL fill_random_cpu(aux, dfft%ngw)
DO i=1, dfft%ngw
IF (gamma_only) data_in(dfft%nlm(i)) = aux(i)
IF (.not. gamma_only) data_in(dfft%nl(i)) = aux(i)
ENDDO
! copy data to simulate multiple bands
DO i=0,howmany-1
start = i*dfft%nnr
data_in(start+1:start+dfft%nnr) = data_in(1:dfft%nnr)
ENDDO
! copy to gpu and cleanup aux
data_in_d = data_in
aux = (0.d0, 0.d0)
!
CALL invfft( 'Wave' , data_in_d, dfft, howmany=howmany ) !, stream=strm )
DO i=0,howmany-1
start = i*dfft%nnr
CALL invfft( 'Wave' , data_in(1+start:), dfft, 1 )
@ -397,7 +731,25 @@ program test_fwinv_gpu
DEALLOCATE(data_in, data_in_d, aux)
ALLOCATE(data_in(dfft%nnr*howmany), aux(dfft%nnr))
ALLOCATE(data_in_d(dfft%nnr*howmany))
CALL fill_random(data_in, data_in_d, dfft%nnr*howmany)
data_in = (0.d0, 0.d0)
CALL fill_random_cpu(aux, dfft%ngm)
!
! Prepare vectors assuming that a product of wfcs in reciprocal space
! is stored in data_in
!
DO i=1, dfft%ngm
IF (gamma_only) data_in(dfft%nlm(i)) = aux(i)
IF (.not. gamma_only) data_in(dfft%nl(i)) = aux(i)
ENDDO
! copy data to simulate multiple bands
DO i=0,howmany-1
start = i*dfft%nnr
data_in(start+1:start+dfft%nnr) = data_in(1:dfft%nnr)
ENDDO
! copy to gpu input data and cleanup aux
data_in_d = data_in
aux = (0.d0, 0.d0)
!
CALL invfft( 'Rho' , data_in_d, dfft, howmany )
!
@ -415,10 +767,6 @@ program test_fwinv_gpu
END SUBROUTINE test_invfft_many_gpu_1
end program test_fwinv_gpu
#else
program test_fwinv_gpu
end program test_fwinv_gpu
#endif
!
! Dummy
SUBROUTINE stop_clock(label)