A few more comments

This commit is contained in:
Pietro Bonfa 2020-11-26 22:41:22 +01:00
parent ce1779b3bf
commit a48893ff6f
2 changed files with 14 additions and 3 deletions

View File

@ -485,6 +485,8 @@ SUBROUTINE many_cft3s_gpu( f_d, dfft, isgn, batchsize )
!
IF ( isgn > 0 ) THEN
DO j = 0, batchsize-1, dfft%subbatchsize
! determine whether the FFTs that are left are less than the maximum
! subbatchsize size.
currsize = min(dfft%subbatchsize, batchsize - j)
!
IF ( isgn /= 2 ) THEN
@ -497,10 +499,13 @@ SUBROUTINE many_cft3s_gpu( f_d, dfft, isgn, batchsize )
!
ENDIF
!
! perform the FFT along one direction and, at the same time,
! read data spaced by dfft%nnr and store in in the output
! with spacing ncpx*nx3, making it easy to bach communication.
DO i = 0, currsize - 1
CALL cft_1z_gpu( f_d((j+i)*dfft%nnr + 1:), sticks(me_p), n3, nx3, isgn, aux_d(j*dfft%nnr + i*ncpx*nx3 +1:), dfft%a2a_comp )
ENDDO
!
i = cudaEventRecord(dfft%bevents(j/dfft%subbatchsize + 1), dfft%a2a_comp)
i = cudaStreamWaitEvent(dfft%bstreams(j/dfft%subbatchsize + 1), dfft%bevents(j/dfft%subbatchsize + 1), 0)

View File

@ -450,6 +450,8 @@ SUBROUTINE fft_scatter_gpu_batch ( dfft, f_in_d, f_in, nr3x, nxx_, f_aux_d, f_au
CALL start_clock ('a2a_fw')
istat = cudaDeviceSynchronize()
! Here the data are sent to all processors involved in the FFT.
! We avoid sending the block of data to be transposed that we already own.
DO iter = 2, nprocp
IF(IAND(nprocp, nprocp-1) == 0) THEN
sorc = IEOR( me-1, iter-1 )
@ -479,14 +481,18 @@ SUBROUTINE fft_scatter_gpu_batch ( dfft, f_in_d, f_in, nr3x, nxx_, f_aux_d, f_au
#endif
ENDDO
!
! here we move to f_in the portion that we did not send (was already in our hands)
! this copy is overlapped with communication that is taking place at the same time
! and is eventually completed...
#ifdef __GPU_MPI
istat = cudaMemcpyAsync( f_in_d( (me-1)*sendsiz + 1), f_aux_d((me-1)*sendsiz + 1), sendsiz, stream=dfft%a2a_comp )
IF( istat /= cudaSuccess ) CALL fftx_error__ ('fft_scatter', 'cudaMemcpyAsync failed: ', istat)
#else
f_in((me-1)*sendsiz + 1 : me*sendsiz) = f_aux((me-1)*sendsiz + 1 : me*sendsiz)
#endif
!
! ...here, where we wait for it to finish.
CALL MPI_WAITALL(2*nprocp-2, srh, MPI_STATUSES_IGNORE, ierr)
IF( abs(ierr) /= 0 ) CALL fftx_error__ ('fft_scatter', 'MPI_WAITALL info<>0', abs(ierr) )
istat = cudaDeviceSynchronize()