mirror of https://gitlab.com/QEF/q-e.git
213 lines
8.2 KiB
Fortran
213 lines
8.2 KiB
Fortran
!----------------------------------------------------------------------------------------------------------------
|
|
!-real version
|
|
SUBROUTINE tg_gather( dffts, v, tg_v )
|
|
!
|
|
USE fft_param
|
|
USE fft_types, ONLY : fft_type_descriptor
|
|
|
|
IMPLICIT NONE
|
|
|
|
TYPE(fft_type_descriptor), INTENT(in) :: dffts
|
|
REAL(DP), INTENT(IN) :: v(dffts%nnr)
|
|
REAL(DP), INTENT(OUT) :: tg_v(dffts%nnr_tg)
|
|
|
|
INTEGER :: nxyp, ir3, off, tg_off
|
|
INTEGER :: i, nsiz, ierr
|
|
|
|
nxyp = dffts%nr1x*dffts%my_nr2p
|
|
!
|
|
! The potential in v is distributed so that each Z-plane is shared among nproc2 processors.
|
|
! We collect the data of whole planes in tg_v to be used with task group distributed wfcs.
|
|
!
|
|
tg_v(:) = (0.d0,0.d0)
|
|
do ir3 =1, dffts%my_nr3p
|
|
off = dffts%nr1x*dffts%my_nr2p*(ir3-1)
|
|
tg_off = dffts%nr1x*dffts%nr2x *(ir3-1) + dffts%nr1x*dffts%my_i0r2p
|
|
tg_v(tg_off+1:tg_off+nxyp) = v(off+1:off+nxyp)
|
|
end do
|
|
!write (6,*) ' tg_v ', dffts%my_i0r2p, dffts%my_nr2p
|
|
!write (6,'(20f12.7)') (v(dffts%my_i0r2p+i+dffts%nr1x*(i-1)), i=1,dffts%my_nr2p)
|
|
!write (6,'(20f12.7)') (tg_v(i+dffts%nr1x*(i-1)), i=1,dffts%nr2x)
|
|
#if defined(__MPI)
|
|
!used to be CALL mp_sum(tg_v, dffts%comm2 )
|
|
nsiz =dffts%nnr_tg
|
|
CALL MPI_ALLREDUCE( MPI_IN_PLACE, tg_v, nsiz, MPI_DOUBLE_PRECISION, MPI_SUM, dffts%comm2, ierr )
|
|
IF( ierr /= 0 ) CALL fftx_error__( ' tg_gather ', ' MPI_ALLREDUCE ', abs( ierr ) )
|
|
!- could be done (more efficintly?) with an ALLgatherv but the loigc of the ALLREDUCE is simpler
|
|
! CALL MPI_Allgatherv( v(1), nsiz, MPI_DOUBLE_PRECISION, &
|
|
! tg_v(1), recv_cnt, recv_displ, MPI_DOUBLE_PRECISION, dffts%comm2, IERR)
|
|
! IF( ierr /= 0 ) CALL fftx_error__( ' tg_gather ', ' MPI_Allgatherv ', abs( ierr ) )
|
|
#endif
|
|
!write (6,'(20f12.7)') (tg_v(i+dffts%nr1x*(i-1)), i=1,dffts%nr1x)
|
|
RETURN
|
|
END SUBROUTINE tg_gather
|
|
|
|
!-complex version of previous routine
|
|
SUBROUTINE tg_cgather( dffts, v, tg_v )
|
|
!
|
|
USE fft_param
|
|
USE fft_types, ONLY : fft_type_descriptor
|
|
|
|
IMPLICIT NONE
|
|
|
|
TYPE(fft_type_descriptor), INTENT(in) :: dffts
|
|
COMPLEX(DP), INTENT(IN) :: v(dffts%nnr)
|
|
COMPLEX(DP), INTENT(OUT) :: tg_v(dffts%nnr_tg)
|
|
|
|
INTEGER :: nxyp, ir3, off, tg_off
|
|
INTEGER :: i, nsiz, ierr
|
|
|
|
nxyp = dffts%nr1x*dffts%my_nr2p
|
|
!
|
|
! The potential in v is distributed so that each Z-plane is shared among nproc2 processors.
|
|
! We collect the data of whole planes in tg_v to be used with task group distributed wfcs.
|
|
!
|
|
tg_v(:) = (0.d0,0.d0)
|
|
do ir3 =1, dffts%my_nr3p
|
|
off = dffts%nr1x*dffts%my_nr2p*(ir3-1)
|
|
tg_off = dffts%nr1x*dffts%nr2x *(ir3-1) + dffts%nr1x*dffts%my_i0r2p
|
|
tg_v(tg_off+1:tg_off+nxyp) = v(off+1:off+nxyp)
|
|
end do
|
|
!write (6,*) ' tg_v ', dffts%my_i0r2p, dffts%my_nr2p
|
|
!write (6,'(20f12.7)') (v(dffts%my_i0r2p+i+dffts%nr1x*(i-1)), i=1,dffts%my_nr2p)
|
|
!write (6,'(20f12.7)') (tg_v(i+dffts%nr1x*(i-1)), i=1,dffts%nr2x)
|
|
#if defined(__MPI)
|
|
!used to be CALL mp_sum(tg_v, dffts%comm2 )
|
|
nsiz =2 * dffts%nnr_tg
|
|
CALL MPI_ALLREDUCE( MPI_IN_PLACE, tg_v, nsiz, MPI_DOUBLE_PRECISION, MPI_SUM, dffts%comm2, ierr )
|
|
IF( ierr /= 0 ) CALL fftx_error__( ' tg_gather ', ' MPI_ALLREDUCE ', abs( ierr ) )
|
|
!- could be done (more efficintly?) with an ALLgatherv but the loigc of the ALLREDUCE is simpler
|
|
! CALL MPI_Allgatherv( v(1), nsiz, MPI_DOUBLE_PRECISION, &
|
|
! tg_v(1), recv_cnt, recv_displ, MPI_DOUBLE_PRECISION, dffts%comm2, IERR)
|
|
! IF( ierr /= 0 ) CALL fftx_error__( ' tg_gather ', ' MPI_Allgatherv ', abs( ierr ) )
|
|
#endif
|
|
!write (6,'(20f12.7)') (tg_v(i+dffts%nr1x*(i-1)), i=1,dffts%nr1x)
|
|
RETURN
|
|
END SUBROUTINE tg_cgather
|
|
|
|
#if defined(__CUDA)
|
|
! === GPU CODE ===
|
|
!----------------------------------------------------------------------------------------------------------------
|
|
!-real version
|
|
SUBROUTINE tg_gather_gpu( dffts, v_d, tg_v_d )
|
|
!
|
|
USE fft_param
|
|
USE fft_types, ONLY : fft_type_descriptor
|
|
|
|
IMPLICIT NONE
|
|
!
|
|
TYPE(fft_type_descriptor), INTENT(in) :: dffts
|
|
REAL(DP), DEVICE, INTENT(IN) :: v_d(dffts%nnr)
|
|
REAL(DP), DEVICE, INTENT(OUT) :: tg_v_d(dffts%nnr_tg)
|
|
!
|
|
REAL(DP), ALLOCATABLE :: tg_v_mpi(:)
|
|
INTEGER :: nxyp, ir3, off, tg_off
|
|
INTEGER :: i, nsiz, ierr
|
|
INTEGER :: nr1x, nr2x, my_nr2p, my_i0r2p
|
|
|
|
nxyp = dffts%nr1x*dffts%my_nr2p
|
|
|
|
! This mapping improves readability but, most importantly, it is needed
|
|
! in the cuf kernels (as of PGI 18.10) since otherwise, when variables from
|
|
! `dffts` appear in the body of do loops, the compiler generates incorrect GPU code.
|
|
nr1x = dffts%nr1x
|
|
nr2x = dffts%nr2x
|
|
my_nr2p = dffts%my_nr2p
|
|
my_i0r2p = dffts%my_i0r2p
|
|
!
|
|
! The potential in v is distributed so that each Z-plane is shared among nproc2 processors.
|
|
! We collect the data of whole planes in tg_v to be used with task group distributed wfcs.
|
|
!
|
|
tg_v_d(:) = (0.d0,0.d0)
|
|
!$cuf kernel do(2) <<<*,*>>>
|
|
do ir3 =1, dffts%my_nr3p
|
|
do i=1, nxyp
|
|
off = nr1x*my_nr2p*(ir3-1)
|
|
tg_off = nr1x*nr2x *(ir3-1) + nr1x*my_i0r2p
|
|
tg_v_d(tg_off + i) = v_d(off+i)
|
|
end do
|
|
end do
|
|
|
|
!write (6,*) ' tg_v ', dffts%my_i0r2p, dffts%my_nr2p
|
|
!write (6,'(20f12.7)') (v(dffts%my_i0r2p+i+dffts%nr1x*(i-1)), i=1,dffts%my_nr2p)
|
|
!write (6,'(20f12.7)') (tg_v(i+dffts%nr1x*(i-1)), i=1,dffts%nr2x)
|
|
#if defined(__MPI)
|
|
!used to be CALL mp_sum(tg_v, dffts%comm2 )
|
|
nsiz =dffts%nnr_tg
|
|
ALLOCATE(tg_v_mpi(dffts%nnr_tg))
|
|
tg_v_mpi = tg_v_d
|
|
CALL MPI_ALLREDUCE( MPI_IN_PLACE, tg_v_mpi, nsiz, MPI_DOUBLE_PRECISION, MPI_SUM, dffts%comm2, ierr )
|
|
IF( ierr /= 0 ) CALL fftx_error__( ' tg_gather ', ' MPI_ALLREDUCE ', abs( ierr ) )
|
|
tg_v_d = tg_v_mpi
|
|
DEALLOCATE(tg_v_mpi)
|
|
|
|
!- could be done (more efficintly?) with an ALLgatherv but the loigc of the ALLREDUCE is simpler
|
|
! CALL MPI_Allgatherv( v(1), nsiz, MPI_DOUBLE_PRECISION, &
|
|
! tg_v(1), recv_cnt, recv_displ, MPI_DOUBLE_PRECISION, dffts%comm2, IERR)
|
|
! IF( ierr /= 0 ) CALL fftx_error__( ' tg_gather ', ' MPI_Allgatherv ', abs( ierr ) )
|
|
#endif
|
|
!write (6,'(20f12.7)') (tg_v(i+dffts%nr1x*(i-1)), i=1,dffts%nr1x)
|
|
RETURN
|
|
END SUBROUTINE tg_gather_gpu
|
|
|
|
!-complex version of previous routine
|
|
SUBROUTINE tg_cgather_gpu( dffts, v_d, tg_v_d )
|
|
!
|
|
USE fft_param
|
|
USE fft_types, ONLY : fft_type_descriptor
|
|
|
|
IMPLICIT NONE
|
|
|
|
TYPE(fft_type_descriptor), INTENT(in) :: dffts
|
|
COMPLEX(DP), DEVICE, INTENT(IN) :: v_d(dffts%nnr)
|
|
COMPLEX(DP), DEVICE, INTENT(OUT) :: tg_v_d(dffts%nnr_tg)
|
|
|
|
COMPLEX(DP), ALLOCATABLE :: tg_v_mpi(:)
|
|
INTEGER :: nxyp, ir3, off, tg_off
|
|
INTEGER :: i, nsiz, ierr
|
|
INTEGER :: nr1x, nr2x, my_nr2p, my_i0r2p
|
|
|
|
nxyp = dffts%nr1x*dffts%my_nr2p
|
|
|
|
! This mapping improves readability but, most importantly, it is needed
|
|
! in the cuf kernels (as of PGI 18.10) since otherwise, when variables from
|
|
! `dffts` appear in the body of do loops, the compiler generates incorrect GPU code.
|
|
nr1x = dffts%nr1x
|
|
nr2x = dffts%nr2x
|
|
my_nr2p = dffts%my_nr2p
|
|
my_i0r2p = dffts%my_i0r2p
|
|
!
|
|
! The potential in v is distributed so that each Z-plane is shared among nproc2 processors.
|
|
! We collect the data of whole planes in tg_v to be used with task group distributed wfcs.
|
|
!
|
|
tg_v_d(:) = (0.d0,0.d0)
|
|
!$cuf kernel do(2) <<<*,*>>>
|
|
do ir3 =1, dffts%my_nr3p
|
|
do i=1, nxyp
|
|
off = nr1x*my_nr2p*(ir3-1)
|
|
tg_off = nr1x*nr2x *(ir3-1) + nr1x*my_i0r2p
|
|
tg_v_d(tg_off + i) = v_d(off+i)
|
|
end do
|
|
end do
|
|
!write (6,*) ' tg_v ', dffts%my_i0r2p, dffts%my_nr2p
|
|
!write (6,'(20f12.7)') (v(dffts%my_i0r2p+i+dffts%nr1x*(i-1)), i=1,dffts%my_nr2p)
|
|
!write (6,'(20f12.7)') (tg_v(i+dffts%nr1x*(i-1)), i=1,dffts%nr2x)
|
|
#if defined(__MPI)
|
|
!used to be CALL mp_sum(tg_v, dffts%comm2 )
|
|
nsiz =2 * dffts%nnr_tg
|
|
ALLOCATE(tg_v_mpi(dffts%nnr_tg))
|
|
tg_v_mpi = tg_v_d
|
|
CALL MPI_ALLREDUCE( MPI_IN_PLACE, tg_v_mpi, nsiz, MPI_DOUBLE_PRECISION, MPI_SUM, dffts%comm2, ierr )
|
|
IF( ierr /= 0 ) CALL fftx_error__( ' tg_gather ', ' MPI_ALLREDUCE ', abs( ierr ) )
|
|
tg_v_d = tg_v_mpi
|
|
DEALLOCATE(tg_v_mpi)
|
|
!- could be done (more efficintly?) with an ALLgatherv but the loigc of the ALLREDUCE is simpler
|
|
! CALL MPI_Allgatherv( v(1), nsiz, MPI_DOUBLE_PRECISION, &
|
|
! tg_v(1), recv_cnt, recv_displ, MPI_DOUBLE_PRECISION, dffts%comm2, IERR)
|
|
! IF( ierr /= 0 ) CALL fftx_error__( ' tg_gather ', ' MPI_Allgatherv ', abs( ierr ) )
|
|
#endif
|
|
!write (6,'(20f12.7)') (tg_v(i+dffts%nr1x*(i-1)), i=1,dffts%nr1x)
|
|
RETURN
|
|
END SUBROUTINE tg_cgather_gpu
|
|
#endif
|