mirror of https://gitlab.com/QEF/q-e.git
A few more comments
This commit is contained in:
parent
ce1779b3bf
commit
a48893ff6f
|
@ -485,6 +485,8 @@ SUBROUTINE many_cft3s_gpu( f_d, dfft, isgn, batchsize )
|
|||
!
|
||||
IF ( isgn > 0 ) THEN
|
||||
DO j = 0, batchsize-1, dfft%subbatchsize
|
||||
! determine whether the FFTs that are left are less than the maximum
|
||||
! subbatchsize size.
|
||||
currsize = min(dfft%subbatchsize, batchsize - j)
|
||||
!
|
||||
IF ( isgn /= 2 ) THEN
|
||||
|
@ -497,10 +499,13 @@ SUBROUTINE many_cft3s_gpu( f_d, dfft, isgn, batchsize )
|
|||
!
|
||||
ENDIF
|
||||
!
|
||||
! perform the FFT along one direction and, at the same time,
|
||||
! read data spaced by dfft%nnr and store in in the output
|
||||
! with spacing ncpx*nx3, making it easy to bach communication.
|
||||
DO i = 0, currsize - 1
|
||||
CALL cft_1z_gpu( f_d((j+i)*dfft%nnr + 1:), sticks(me_p), n3, nx3, isgn, aux_d(j*dfft%nnr + i*ncpx*nx3 +1:), dfft%a2a_comp )
|
||||
ENDDO
|
||||
|
||||
!
|
||||
i = cudaEventRecord(dfft%bevents(j/dfft%subbatchsize + 1), dfft%a2a_comp)
|
||||
i = cudaStreamWaitEvent(dfft%bstreams(j/dfft%subbatchsize + 1), dfft%bevents(j/dfft%subbatchsize + 1), 0)
|
||||
|
||||
|
|
|
@ -450,6 +450,8 @@ SUBROUTINE fft_scatter_gpu_batch ( dfft, f_in_d, f_in, nr3x, nxx_, f_aux_d, f_au
|
|||
CALL start_clock ('a2a_fw')
|
||||
|
||||
istat = cudaDeviceSynchronize()
|
||||
! Here the data are sent to all processors involved in the FFT.
|
||||
! We avoid sending the block of data to be transposed that we already own.
|
||||
DO iter = 2, nprocp
|
||||
IF(IAND(nprocp, nprocp-1) == 0) THEN
|
||||
sorc = IEOR( me-1, iter-1 )
|
||||
|
@ -479,14 +481,18 @@ SUBROUTINE fft_scatter_gpu_batch ( dfft, f_in_d, f_in, nr3x, nxx_, f_aux_d, f_au
|
|||
#endif
|
||||
|
||||
ENDDO
|
||||
|
||||
!
|
||||
! here we move to f_in the portion that we did not send (was already in our hands)
|
||||
! this copy is overlapped with communication that is taking place at the same time
|
||||
! and is eventually completed...
|
||||
#ifdef __GPU_MPI
|
||||
istat = cudaMemcpyAsync( f_in_d( (me-1)*sendsiz + 1), f_aux_d((me-1)*sendsiz + 1), sendsiz, stream=dfft%a2a_comp )
|
||||
IF( istat /= cudaSuccess ) CALL fftx_error__ ('fft_scatter', 'cudaMemcpyAsync failed: ', istat)
|
||||
#else
|
||||
f_in((me-1)*sendsiz + 1 : me*sendsiz) = f_aux((me-1)*sendsiz + 1 : me*sendsiz)
|
||||
#endif
|
||||
|
||||
!
|
||||
! ...here, where we wait for it to finish.
|
||||
CALL MPI_WAITALL(2*nprocp-2, srh, MPI_STATUSES_IGNORE, ierr)
|
||||
IF( abs(ierr) /= 0 ) CALL fftx_error__ ('fft_scatter', 'MPI_WAITALL info<>0', abs(ierr) )
|
||||
istat = cudaDeviceSynchronize()
|
||||
|
|
Loading…
Reference in New Issue