mirror of https://gitlab.com/QEF/q-e.git
568 lines
18 KiB
Fortran
568 lines
18 KiB
Fortran
#if defined(__CUDA)
|
|
#define DEVATTR ,DEVICE
|
|
#else
|
|
#define DEVATTR
|
|
#endif
|
|
|
|
#if defined(__CUDA) || defined(__OPENMP_GPU)
|
|
program test_fft_scatter_mod_gpu
|
|
#if defined(__MPI) && defined(__MPI_MODULE)
|
|
USE mpi
|
|
#endif
|
|
USE tester
|
|
IMPLICIT NONE
|
|
#if defined(__MPI) && ! defined(__MPI_MODULE)
|
|
INCLUDE 'mpif.h'
|
|
#endif
|
|
! MPI type
|
|
type mpi_t
|
|
integer :: me, n, root, comm
|
|
end type mpi_t
|
|
TYPE(mpi_t) :: mp
|
|
!
|
|
TYPE(tester_t) :: test
|
|
!
|
|
INTEGER :: ierr, level, i
|
|
!
|
|
#if defined(__MPI)
|
|
#if defined(_OPENMP)
|
|
CALL MPI_Init_thread(MPI_THREAD_FUNNELED,level, ierr)
|
|
#else
|
|
CALL MPI_Init(ierr)
|
|
#endif
|
|
#endif
|
|
!
|
|
CALL mpi_data_init(mp%me, mp%n, mp%root, mp%comm)
|
|
!
|
|
CALL test%init()
|
|
!
|
|
test%tolerance64 = 1.d-14
|
|
!
|
|
IF (mp%n > 1) THEN
|
|
!
|
|
CALL save_random_seed("test_fft_scatter_mod_gpu", mp%me)
|
|
!
|
|
DO i = 1, mp%n
|
|
IF (MOD(mp%n,i) == 0 ) THEN
|
|
! gamma case
|
|
CALL test_fft_scatter_xy_gpu_1(mp, test, .true., i)
|
|
! k case
|
|
CALL test_fft_scatter_xy_gpu_1(mp, test, .false., i)
|
|
!
|
|
! gamma case
|
|
CALL test_fft_scatter_yz_gpu_1(mp, test, .true., i)
|
|
! k case
|
|
CALL test_fft_scatter_yz_gpu_1(mp, test, .false., i)
|
|
END IF
|
|
END DO
|
|
CALL test_fft_scatter_many_yz_gpu_1(mp, test, .true., 1)
|
|
CALL test_fft_scatter_many_yz_gpu_1(mp, test, .false., 1)
|
|
!
|
|
ENDIF
|
|
!
|
|
CALL collect_results(test)
|
|
!
|
|
IF (mp%me == mp%root) CALL test%print()
|
|
!
|
|
#if defined(__MPI)
|
|
CALL MPI_Finalize(ierr)
|
|
#endif
|
|
CONTAINS
|
|
!
|
|
SUBROUTINE mpi_data_init(mpme, npes, mproot, comm)
|
|
implicit none
|
|
integer, intent(out) :: mpme, npes, mproot, comm
|
|
integer :: ierr
|
|
mpme=0; npes=1; mproot=0; comm=0
|
|
#if defined(__MPI)
|
|
CALL mpi_comm_rank(MPI_COMM_WORLD, mpme, ierr)
|
|
CALL mpi_comm_size(MPI_COMM_WORLD, npes, ierr)
|
|
comm = MPI_COMM_WORLD
|
|
#endif
|
|
END SUBROUTINE mpi_data_init
|
|
SUBROUTINE fft_desc_init(dfft, smap, flavor, gamma_only, parallel, comm, nyfft)
|
|
USE fft_types, ONLY : fft_type_descriptor, fft_type_init
|
|
USE stick_base, ONLY : sticks_map
|
|
USE fft_param, ONLY : DP
|
|
implicit none
|
|
TYPE(fft_type_descriptor) :: dfft
|
|
TYPE(sticks_map) :: smap
|
|
CHARACTER(LEN=*), INTENT(IN) :: flavor
|
|
LOGICAL :: gamma_only
|
|
LOGICAL :: parallel
|
|
INTEGER :: comm, nyfft
|
|
REAL(DP), PARAMETER :: pi=4.D0*DATAN(1.D0)
|
|
!
|
|
REAL(DP) :: at(3,3), bg(3,3)
|
|
!
|
|
at = RESHAPE((/1.d0, 0.d0, 0.d0, 0.d0, 1.d0, 0.d0, 0.d0, 0.d0, 1.d0/), shape(at))
|
|
bg = RESHAPE((/1.d0, 0.d0, 0.d0, 0.d0, 1.d0, 0.d0, 0.d0, 0.d0, 1.d0/), shape(bg))
|
|
bg = 2.d0*pi
|
|
!
|
|
CALL fft_type_init(dfft, smap, flavor, gamma_only, parallel, comm, at, bg, 12.d0, 4.d0, &
|
|
& nyfft=nyfft, nmany=1)
|
|
!
|
|
END SUBROUTINE fft_desc_init
|
|
|
|
SUBROUTINE fft_desc_finalize(dfft, smap)
|
|
USE fft_types, ONLY : fft_type_descriptor, fft_type_deallocate
|
|
USE stick_base, ONLY : sticks_map, sticks_map_deallocate
|
|
implicit none
|
|
TYPE(fft_type_descriptor) :: dfft
|
|
TYPE(sticks_map) :: smap
|
|
!
|
|
CALL fft_type_deallocate(dfft)
|
|
CALL sticks_map_deallocate( smap )
|
|
END SUBROUTINE fft_desc_finalize
|
|
!
|
|
SUBROUTINE fill_random(c, c_d, n)
|
|
#if defined(__CUDA)
|
|
USE cudafor
|
|
#endif
|
|
USE fft_param, ONLY : DP
|
|
implicit none
|
|
complex(DP) DEVATTR :: c_d(:)
|
|
complex(DP) :: c(:)
|
|
integer, intent(in) :: n
|
|
!
|
|
real(DP), ALLOCATABLE :: rnd_aux(:)
|
|
!
|
|
ALLOCATE (rnd_aux(2*n))
|
|
CALL RANDOM_NUMBER(rnd_aux)
|
|
c = CMPLX(rnd_aux(1:n), rnd_aux(n+1:2*n))
|
|
c_d = c
|
|
!$omp target update to(c_d)
|
|
DEALLOCATE(rnd_aux)
|
|
END SUBROUTINE fill_random
|
|
!
|
|
SUBROUTINE test_fft_scatter_xy_gpu_1(mp, test, gamma_only, ny)
|
|
#if defined(__CUDA)
|
|
USE cudafor
|
|
USE fft_scatter_gpu, ONLY : fft_scatter_xy_gpu
|
|
#elif defined(__OPENMP_GPU)
|
|
USE fft_scatter_omp, ONLY : fft_scatter_xy_omp
|
|
#endif
|
|
USE fft_param, ONLY : DP
|
|
USE fft_types, ONLY : fft_type_descriptor
|
|
USE stick_base, ONLY : sticks_map
|
|
USE fft_scatter, ONLY : fft_scatter_xy
|
|
implicit none
|
|
TYPE(mpi_t) :: mp
|
|
TYPE(tester_t) :: test
|
|
!
|
|
TYPE(fft_type_descriptor) :: dfft
|
|
TYPE(sticks_map) :: smap
|
|
LOGICAL, INTENT(IN) :: gamma_only
|
|
INTEGER, INTENT(IN) :: ny
|
|
!
|
|
LOGICAL :: parallel
|
|
COMPLEX(DP), ALLOCATABLE :: scatter_in(:), scatter_out(:)
|
|
COMPLEX(DP), ALLOCATABLE DEVATTR :: scatter_in_d(:), scatter_out_d(:)
|
|
#if !defined(__OPENMP_GPU)
|
|
COMPLEX(DP), ALLOCATABLE :: aux(:)
|
|
integer(kind = cuda_stream_kind) :: stream = 0
|
|
#endif
|
|
integer :: fft_sign = 2
|
|
integer :: vsiz, nr1p_, compare_len, me2
|
|
!
|
|
parallel = mp%n .gt. 1
|
|
|
|
CALL fft_desc_init(dfft, smap, "wave", gamma_only, parallel, mp%comm, nyfft=ny)
|
|
me2 = dfft%mype2 + 1
|
|
vsiz = dfft%nnr
|
|
compare_len = dfft%nr1x * dfft%my_nr2p * dfft%my_nr3p
|
|
if (ny > 1) then
|
|
! When using task groups, wave FFTs are not distributed along Y
|
|
fft_sign = 3
|
|
vsiz = dfft%nnr_tg
|
|
compare_len = dfft%nr1x * dfft%nr2x * dfft%my_nr3p
|
|
end if
|
|
!
|
|
! Allocate variables
|
|
ALLOCATE(scatter_in(vsiz), scatter_out(vsiz))
|
|
#if !defined(__OPENMP_GPU)
|
|
ALLOCATE(aux(vsiz))
|
|
#endif
|
|
ALLOCATE(scatter_in_d(vsiz), scatter_out_d(vsiz))
|
|
!
|
|
! Test 1
|
|
!$omp target enter data map(alloc:scatter_in_d,scatter_out_d)
|
|
CALL fill_random(scatter_in, scatter_in_d, vsiz)
|
|
!
|
|
CALL fft_scatter_xy( dfft, scatter_in, scatter_out, vsiz, fft_sign )
|
|
#if !defined(__OPENMP_GPU)
|
|
CALL fft_scatter_xy_gpu( dfft, scatter_in_d, scatter_out_d, vsiz, fft_sign, stream )
|
|
#else
|
|
CALL fft_scatter_xy_omp( dfft, scatter_in_d, scatter_out_d, vsiz, fft_sign )
|
|
#endif
|
|
!$omp target update from(scatter_out_d(1:compare_len))
|
|
#if !defined(__OPENMP_GPU)
|
|
aux(1:compare_len) = scatter_out_d(1:compare_len)
|
|
#endif
|
|
!
|
|
! Check
|
|
#if !defined(__OPENMP_GPU)
|
|
CALL test%assert_close( scatter_out(1:compare_len), aux(1:compare_len) )
|
|
#else
|
|
CALL test%assert_close( scatter_out(1:compare_len), scatter_out_d(1:compare_len) )
|
|
#endif
|
|
!
|
|
! Test 2
|
|
CALL fill_random(scatter_in, scatter_in_d, vsiz)
|
|
!
|
|
CALL fft_scatter_xy( dfft, scatter_out, scatter_in, vsiz, -1*fft_sign )
|
|
#if !defined(__OPENMP_GPU)
|
|
CALL fft_scatter_xy_gpu( dfft, scatter_out_d, scatter_in_d, vsiz, -1*fft_sign, stream )
|
|
#else
|
|
CALL fft_scatter_xy_omp( dfft, scatter_out_d, scatter_in_d, vsiz, -1*fft_sign )
|
|
#endif
|
|
!
|
|
compare_len = dfft%nr2x * dfft%nr1w(me2) * dfft%my_nr3p
|
|
IF (ny > 1) compare_len = dfft%nr2x * dfft%nr1w_tg * dfft%my_nr3p
|
|
!$omp target update from(scatter_out_d(1:compare_len))
|
|
!
|
|
#if !defined(__OPENMP_GPU)
|
|
aux(1:compare_len) = scatter_out_d(1:compare_len)
|
|
#endif
|
|
! Check
|
|
#if !defined(__OPENMP_GPU)
|
|
CALL test%assert_close( scatter_out(1:compare_len), aux(1:compare_len) )
|
|
#else
|
|
CALL test%assert_close( scatter_out(1:compare_len), scatter_out_d(1:compare_len) )
|
|
#endif
|
|
!
|
|
CALL fft_desc_finalize(dfft, smap)
|
|
!$omp target exit data map(delete:scatter_in_d, scatter_out_d)
|
|
#if !defined(__OPENMP_GPU)
|
|
DEALLOCATE(scatter_in, scatter_out, aux, scatter_in_d, scatter_out_d)
|
|
#else
|
|
DEALLOCATE(scatter_in, scatter_out, scatter_in_d, scatter_out_d)
|
|
#endif
|
|
!
|
|
END SUBROUTINE test_fft_scatter_xy_gpu_1
|
|
!
|
|
SUBROUTINE test_fft_scatter_yz_gpu_1(mp, test, gamma_only, ny)
|
|
!
|
|
! This test checks wave fft scatter, with parallel = .true. if
|
|
! called with more than 1 MPI.
|
|
!
|
|
#if defined(__CUDA)
|
|
USE cudafor
|
|
USE fft_scatter_gpu, ONLY : fft_scatter_yz_gpu
|
|
#elif defined(__OPENMP_GPU)
|
|
USE fft_scatter_omp, ONLY : fft_scatter_yz_omp
|
|
#endif
|
|
USE fft_param, ONLY : DP
|
|
USE fft_types, ONLY : fft_type_descriptor
|
|
USE stick_base, ONLY : sticks_map
|
|
USE fft_scatter, ONLY : fft_scatter_yz
|
|
implicit none
|
|
TYPE(mpi_t) :: mp
|
|
TYPE(tester_t) :: test
|
|
!
|
|
TYPE(fft_type_descriptor) :: dfft
|
|
TYPE(sticks_map) :: smap
|
|
LOGICAL, INTENT(IN) :: gamma_only
|
|
INTEGER, INTENT(IN) :: ny
|
|
!
|
|
LOGICAL :: parallel
|
|
COMPLEX(DP), ALLOCATABLE :: scatter_in(:), scatter_out(:)
|
|
COMPLEX(DP), ALLOCATABLE DEVATTR :: scatter_in_d(:), scatter_out_d(:)
|
|
#if !defined(__OPENMP_GPU)
|
|
COMPLEX(DP), ALLOCATABLE :: aux(:)
|
|
integer(kind = cuda_stream_kind) :: stream = 0
|
|
#endif
|
|
integer :: fft_sign = 2
|
|
integer :: vsiz, compare_len, my_nr1p_
|
|
!
|
|
parallel = mp%n .gt. 1
|
|
CALL fft_desc_init(dfft, smap, "wave", gamma_only, parallel, mp%comm, nyfft=ny)
|
|
vsiz = dfft%nnr
|
|
my_nr1p_ = count(dfft%ir1w > 0)
|
|
if (ny > 1) then
|
|
fft_sign = 3
|
|
vsiz = dfft%nnr_tg
|
|
my_nr1p_ = count(dfft%ir1w_tg > 0)
|
|
end if
|
|
!
|
|
! Allocate variables
|
|
ALLOCATE(scatter_in(vsiz), scatter_out(vsiz))
|
|
#if !defined(__OPENMP_GPU)
|
|
ALLOCATE(aux(vsiz))
|
|
#endif
|
|
ALLOCATE(scatter_in_d(vsiz), scatter_out_d(vsiz))
|
|
!
|
|
! Test 1
|
|
!$omp target enter data map(alloc:scatter_in_d,scatter_out_d)
|
|
CALL fill_random(scatter_in, scatter_in_d, vsiz)
|
|
!
|
|
CALL fft_scatter_yz( dfft, scatter_in, scatter_out, vsiz, fft_sign )
|
|
#if !defined(__OPENMP_GPU)
|
|
CALL fft_scatter_yz_gpu( dfft, scatter_in_d, scatter_out_d, vsiz, fft_sign )
|
|
#else
|
|
CALL fft_scatter_yz_omp( dfft, scatter_in_d, scatter_out_d, vsiz, fft_sign )
|
|
#endif
|
|
! Set the number of elements that should be strictly equivalent in the
|
|
! two implementations.
|
|
compare_len = dfft%my_nr3p*my_nr1p_*dfft%nr2x
|
|
!$omp target update from(scatter_out_d(1:compare_len))
|
|
#if !defined(__OPENMP_GPU)
|
|
aux(1:compare_len) = scatter_out_d(1:compare_len)
|
|
#endif
|
|
! Check
|
|
#if !defined(__OPENMP_GPU)
|
|
CALL test%assert_close( scatter_out(1:compare_len), aux(1:compare_len) )
|
|
#else
|
|
CALL test%assert_close( scatter_out(1:compare_len), scatter_out_d(1:compare_len) )
|
|
#endif
|
|
!
|
|
! Test 2
|
|
CALL fill_random(scatter_in, scatter_in_d, vsiz)
|
|
!
|
|
CALL fft_scatter_yz( dfft, scatter_out, scatter_in, vsiz, -1*fft_sign )
|
|
#if !defined(__OPENMP_GPU)
|
|
CALL fft_scatter_yz_gpu( dfft, scatter_out_d, scatter_in_d, vsiz, -1*fft_sign )
|
|
#else
|
|
CALL fft_scatter_yz_omp( dfft, scatter_out_d, scatter_in_d, vsiz, -1*fft_sign )
|
|
#endif
|
|
!
|
|
compare_len = dfft%nsw(mp%me+1)*dfft%nr3x
|
|
!$omp target update from(scatter_out_d(1:compare_len))
|
|
#if !defined(__OPENMP_GPU)
|
|
aux(1:compare_len) = scatter_out_d(1:compare_len)
|
|
#endif
|
|
! Check
|
|
#if !defined(__OPENMP_GPU)
|
|
CALL test%assert_close( scatter_out(1:compare_len), aux(1:compare_len) )
|
|
#else
|
|
CALL test%assert_close( scatter_out(1:compare_len), scatter_out_d(1:compare_len) )
|
|
#endif
|
|
!
|
|
CALL fft_desc_finalize(dfft, smap)
|
|
!$omp target exit data map(delete:scatter_in_d, scatter_out_d)
|
|
#if !defined(__OPENMP_GPU)
|
|
DEALLOCATE(scatter_in, scatter_out, aux, scatter_in_d, scatter_out_d)
|
|
#else
|
|
DEALLOCATE(scatter_in, scatter_out, scatter_in_d, scatter_out_d)
|
|
#endif
|
|
!
|
|
END SUBROUTINE test_fft_scatter_yz_gpu_1
|
|
!
|
|
SUBROUTINE test_fft_scatter_many_yz_gpu_1(mp, test, gamma_only, ny)
|
|
!
|
|
! This test checks wave fft scatter, with parallel = .true. if
|
|
! called with more than 1 MPI.
|
|
!
|
|
#if defined(__CUDA)
|
|
USE cudafor
|
|
USE fft_scatter_gpu, ONLY : fft_scatter_yz_gpu, fft_scatter_many_yz_gpu
|
|
#elif defined(__OPENMP_GPU)
|
|
USE fft_scatter_omp, ONLY : fft_scatter_yz_omp, fft_scatter_many_yz_omp
|
|
#endif
|
|
USE fft_param, ONLY : DP
|
|
USE fft_types, ONLY : fft_type_descriptor
|
|
USE stick_base, ONLY : sticks_map
|
|
USE fft_scatter, ONLY : fft_scatter_yz
|
|
implicit none
|
|
TYPE(mpi_t) :: mp
|
|
TYPE(tester_t) :: test
|
|
!
|
|
TYPE(fft_type_descriptor) :: dfft
|
|
TYPE(sticks_map) :: smap
|
|
LOGICAL, INTENT(IN) :: gamma_only
|
|
INTEGER, INTENT(IN) :: ny
|
|
INTEGER, PARAMETER :: howmany = 4
|
|
!
|
|
LOGICAL :: parallel
|
|
COMPLEX(DP), ALLOCATABLE :: scatter_in(:), scatter_in_cpy(:), scatter_out(:)
|
|
COMPLEX(DP), ALLOCATABLE DEVATTR :: scatter_in_d(:), scatter_out_d(:)
|
|
#if !defined(__OPENMP_GPU)
|
|
COMPLEX(DP), ALLOCATABLE :: aux(:)
|
|
COMPLEX(DP), ALLOCATABLE DEVATTR :: aux_d(:)
|
|
#endif
|
|
! convenient variables for slices
|
|
integer :: i, l, start_in, end_in, start_out, end_out, nstick_zx, n3, n3x, vsiz
|
|
integer :: start_sl, end_sl
|
|
!
|
|
parallel = mp%n .gt. 1
|
|
IF (ny > 1) print *, 'scatter_many does not support task grouping'
|
|
CALL fft_desc_init(dfft, smap, "wave", gamma_only, parallel, mp%comm, nyfft=ny)
|
|
!
|
|
! Allocate variables
|
|
vsiz = dfft%nnr*howmany
|
|
ALLOCATE(scatter_in(vsiz), scatter_in_cpy(vsiz), scatter_out(vsiz))
|
|
ALLOCATE(scatter_in_d(vsiz), scatter_out_d(vsiz))
|
|
#if !defined(__OPENMP_GPU)
|
|
ALLOCATE(aux(vsiz), aux_d(vsiz))
|
|
#endif
|
|
!
|
|
! How FFT allocates bunches in this case:
|
|
nstick_zx = MAXVAL(dfft%nsw)
|
|
n3 = dfft%nr3
|
|
n3x = dfft%nr3x
|
|
|
|
! Test 1
|
|
!$omp target enter data map(alloc:scatter_in_d, scatter_out_d)
|
|
CALL fill_random(scatter_in, scatter_in_d, vsiz)
|
|
scatter_in_cpy = scatter_in
|
|
!
|
|
!print *, 'dfft%nnr, nr3, nr2, nr1 : ', dfft%nnr, dfft%nr3, dfft%nr2, dfft%nr1
|
|
! Test 1.1, compare scatter of slices
|
|
DO i=0,howmany-1
|
|
start_in = i*dfft%nnr + 1
|
|
end_in = (i+1)*dfft%nnr
|
|
start_out = start_in
|
|
!
|
|
end_out = (start_out-1) + dfft%my_nr3p*dfft%nr1w(dfft%mype2 +1)*dfft%nr2x
|
|
!
|
|
CALL fft_scatter_yz( dfft, scatter_in(start_in:end_in), scatter_out(start_out:end_out), dfft%nnr, 2 )
|
|
#if !defined(__OPENMP_GPU)
|
|
CALL fft_scatter_yz_gpu( dfft, scatter_in_d(start_in:end_in), scatter_out_d(start_in:end_out), dfft%nnr, 2 )
|
|
#else
|
|
CALL fft_scatter_yz_omp( dfft, scatter_in_d(start_in:end_in), scatter_out_d(start_in:end_out), dfft%nnr, 2 )
|
|
#endif
|
|
!$omp target update from(scatter_out_d(start_out:end_out))
|
|
#if !defined(__OPENMP_GPU)
|
|
aux(start_out:end_out) = scatter_out_d(start_out:end_out)
|
|
#endif
|
|
! Check
|
|
#if !defined(__OPENMP_GPU)
|
|
CALL test%assert_close( scatter_out(start_out:end_out), aux(start_out:end_out) )
|
|
#else
|
|
CALL test%assert_close( scatter_out(start_out:end_out), scatter_out_d(start_out:end_out) )
|
|
#endif
|
|
END DO
|
|
!
|
|
scatter_in = scatter_in_cpy
|
|
! Store data as expected in input
|
|
DO i=0,howmany-1
|
|
start_in = i*dfft%nnr + 1
|
|
end_in = (i+1)*dfft%nnr
|
|
start_out= i*nstick_zx*n3x+1
|
|
end_out = (i+1)*nstick_zx*n3
|
|
scatter_in_d(start_out:end_out) = scatter_in(start_in:start_in+nstick_zx*n3)
|
|
END DO
|
|
#if !defined(__OPENMP_GPU)
|
|
CALL fft_scatter_many_yz_gpu ( dfft, scatter_in_d, scatter_out_d, vsiz, 2, howmany )
|
|
#else
|
|
!$omp target update to(scatter_in_d)
|
|
CALL fft_scatter_many_yz_omp ( dfft, scatter_in_d, scatter_out_d, vsiz, 2, howmany )
|
|
!$omp target update from(scatter_out_d)
|
|
#endif
|
|
|
|
DO i=0,howmany-1
|
|
start_out = i*dfft%nnr + 1
|
|
end_out = (start_out-1) + dfft%my_nr3p*dfft%nr1w(dfft%mype2 +1)*dfft%nr2x
|
|
!
|
|
#if !defined(__OPENMP_GPU)
|
|
aux(start_out:end_out) = scatter_out_d(start_out:end_out)
|
|
!
|
|
CALL test%assert_close( scatter_out(start_out:end_out), aux(start_out:end_out) )
|
|
#else
|
|
CALL test%assert_close( scatter_out(start_out:end_out), scatter_out_d(start_out:end_out) )
|
|
#endif
|
|
END DO
|
|
!
|
|
!
|
|
! Test 2
|
|
CALL fill_random(scatter_in, scatter_in_d, vsiz)
|
|
scatter_in_cpy = scatter_in
|
|
!
|
|
DO i=0,howmany-1
|
|
! Input data for fft_scatter_yz call
|
|
start_in = i*dfft%nnr + 1
|
|
end_in = (i+1)*dfft%nnr
|
|
!
|
|
! Where to store output data
|
|
start_out = start_in
|
|
end_out = end_in
|
|
CALL fft_scatter_yz( dfft, scatter_out(start_out:end_out), scatter_in(start_in:end_in), dfft%nnr, -2 )
|
|
#if !defined(__OPENMP_GPU)
|
|
CALL fft_scatter_yz_gpu( dfft, scatter_out_d(start_out:end_out), scatter_in_d(start_in:end_in), dfft%nnr, -2 )
|
|
#else
|
|
CALL fft_scatter_yz_omp( dfft, scatter_out_d(start_out:end_out), scatter_in_d(start_in:end_in), dfft%nnr, -2 )
|
|
#endif
|
|
!$omp target update from(scatter_out_d(start_out:end_out))
|
|
!
|
|
#if !defined(__OPENMP_GPU)
|
|
aux(start_out:end_out) = scatter_out_d(start_out:end_out)
|
|
#endif
|
|
!
|
|
! Check only relevant part
|
|
end_out = start_out + dfft%nsw(mp%me+1)*n3
|
|
#if !defined(__OPENMP_GPU)
|
|
CALL test%assert_close( scatter_out(start_out:end_out), aux(start_out:end_out) )
|
|
#else
|
|
CALL test%assert_close( scatter_out(start_out:end_out), scatter_out_d(start_out:end_out) )
|
|
#endif
|
|
END DO
|
|
!
|
|
!
|
|
! Now repeat the test, but doing all the FFTs in a single shot
|
|
scatter_in_d(1:vsiz) = scatter_in_cpy(1:vsiz)
|
|
!$omp target update to(scatter_in_d)
|
|
!
|
|
#if !defined(__OPENMP_GPU)
|
|
CALL fft_scatter_many_yz_gpu ( dfft, scatter_out_d, scatter_in_d, vsiz, -2, howmany )
|
|
#else
|
|
CALL fft_scatter_many_yz_omp ( dfft, scatter_out_d, scatter_in_d, vsiz, -2, howmany )
|
|
#endif
|
|
|
|
DO i=0,howmany-1
|
|
! Extract data from GPU. Data are spaced by nstick_zx*n3x
|
|
start_out = 1 + i*nstick_zx*n3x
|
|
end_out = (i+1)*nstick_zx*n3x
|
|
!$omp target update from(scatter_out_d(start_out:end_out))
|
|
#if !defined(__OPENMP_GPU)
|
|
aux(start_out:end_out) = scatter_out_d(start_out:end_out)
|
|
#endif
|
|
!
|
|
start_in = i*dfft%nnr + 1
|
|
end_in = i*dfft%nnr + dfft%nsw(mp%me+1)*n3x !nstick_zx*nx3!
|
|
!
|
|
! Extract only data tofft_scatter_yz compare the two methods. Results from the
|
|
! previous call to fft_scatter_yz are separated by nnr, while the
|
|
! new results are separated by nstick_zx*n3x. We read start_out
|
|
! and add from there dfft%nsw(mp%me+1)*n3x to be compared (don't forget the -1!)
|
|
start_sl = start_out
|
|
end_sl = (start_out - 1) + dfft%nsw(mp%me+1)*n3x
|
|
#if !defined(__OPENMP_GPU)
|
|
CALL test%assert_close( aux(start_sl:end_sl), scatter_out(start_in:end_in) )
|
|
#else
|
|
CALL test%assert_close( scatter_out_d(start_sl:end_sl), scatter_out(start_in:end_in) )
|
|
#endif
|
|
END DO
|
|
!
|
|
CALL fft_desc_finalize(dfft, smap)
|
|
!$omp target exit data map(delete:scatter_in_d, scatter_out_d)
|
|
#if !defined(__OPENMP_GPU)
|
|
DEALLOCATE(scatter_in, scatter_in_cpy, scatter_out, aux, scatter_in_d, scatter_out_d)
|
|
#else
|
|
DEALLOCATE(scatter_in, scatter_in_cpy, scatter_out, scatter_in_d, scatter_out_d)
|
|
#endif
|
|
!
|
|
END SUBROUTINE test_fft_scatter_many_yz_gpu_1
|
|
|
|
end program test_fft_scatter_mod_gpu
|
|
#else
|
|
program test_fft_scatter_mod_gpu
|
|
end program test_fft_scatter_mod_gpu
|
|
#endif
|
|
!
|
|
! Dummy
|
|
SUBROUTINE stop_clock(label)
|
|
CHARACTER(*) :: label
|
|
END SUBROUTINE stop_clock
|
|
!
|
|
SUBROUTINE start_clock(label)
|
|
CHARACTER(*) :: label
|
|
END SUBROUTINE start_clock
|
|
!
|
|
SUBROUTINE stop_clock_gpu(label)
|
|
CHARACTER(*) :: label
|
|
END SUBROUTINE stop_clock_gpu
|
|
!
|
|
SUBROUTINE start_clock_gpu(label)
|
|
CHARACTER(*) :: label
|
|
END SUBROUTINE start_clock_gpu
|