diff --git a/FFTXlib/fft_scalar.cuFFT.f90 b/FFTXlib/fft_scalar.cuFFT.f90 index 9a417c4d5..cc4a851ce 100644 --- a/FFTXlib/fft_scalar.cuFFT.f90 +++ b/FFTXlib/fft_scalar.cuFFT.f90 @@ -98,8 +98,6 @@ #endif IF (isign < 0) THEN - !print *,"exec cufft FWD",nz,ldz,nsl - !call flush(6) istat = cufftExecZ2Z( cufft_planz( ip), c_d(1), c_d(1), CUFFT_FORWARD ) tscale = 1.0_DP / nz IF (is_inplace) THEN @@ -114,12 +112,10 @@ END DO END IF ELSE IF (isign > 0) THEN - !print *,"exec cufft INV",nz,ldz,nsl - !call flush(6) IF (is_inplace) THEN - istat = cufftExecZ2Z( cufft_planz( ip), c_d(1), c_d(1), CUFFT_INVERSE ) !CUFFT_FORWARD ) + istat = cufftExecZ2Z( cufft_planz( ip), c_d(1), c_d(1), CUFFT_INVERSE ) ELSE - istat = cufftExecZ2Z( cufft_planz( ip), c_d(1), cout_d(1), CUFFT_INVERSE ) !CUFFT_FORWARD ) + istat = cufftExecZ2Z( cufft_planz( ip), c_d(1), cout_d(1), CUFFT_INVERSE ) END IF END IF @@ -163,10 +159,6 @@ DATA_DIM, STRIDE, DIST, & CUFFT_Z2Z, BATCH ) -#if defined(__CUDA_DEBUG) - print *,"INIT CUFFT Z PLAN: ",nz,"x",nsl,"x",ldz -#endif - #ifdef TRACK_FLOPS zflops( icurrent ) = 5.0d0 * REAL( nz ) * log( REAL( nz ) )/log( 2.d0 ) #endif @@ -428,10 +420,6 @@ DATA_DIM, STRIDE, DIST, & CUFFT_Z2Z, BATCH ) -#if defined(__CUDA_DEBUG) - print *,"INIT CUFFT ALL_XY PLAN: ",nx,"x",ny,"x",nzl,"ldx:",ldx,"batch:",batch_1,batch_2 -#endif - #else INTEGER, PARAMETER :: RANK=1 INTEGER :: FFT_DIM_X(RANK), DATA_DIM_X(RANK), FFT_DIM_Y(RANK), DATA_DIM_Y(RANK) @@ -455,9 +443,6 @@ IF( cufft_plan_y( 1, icurrent) /= 0 ) istat = cufftDestroy( cufft_plan_y(1,icurrent) ) IF( cufft_plan_y( 2, icurrent) /= 0 ) istat = cufftDestroy( cufft_plan_y(2,icurrent) ) -#if defined(__CUDA_DEBUG) - print *,"INIT CUFFT XY PLAN: ",nx,"x",ny,"x",nzl,"ldx:",ldx,"batch:",batch_1,batch_2 -#endif istat = cufftPlanMany( cufft_plan_x( icurrent), RANK, FFT_DIM_X, & DATA_DIM_X, STRIDE_X, DIST_X, & @@ -567,11 +552,9 @@ DO i=1, ldx*ldy*ldz*howmany f_d( i ) = f_d( i ) * tscale END DO -! 
call ZDSCAL( nx * ny * nz, tscale, f_d(1), 1) ELSE IF( isign > 0 ) THEN -! call FFTW_INPLACE_DRV_3D( bw_plan(ip), 1, f_d(1), 1, 1 ) istat = cufftExecZ2Z( cufft_plan_3d(ip), f_d(1), f_d(1), CUFFT_INVERSE ) END IF @@ -642,6 +625,9 @@ ! This routine is implemented only for fftw, essl, acml ! If not implemented, cfft3d is called instead ! + ! NB: this version is much slower than the 3D FFT of the + ! entire data. + ! !---------------------------------------------------------------------- ! implicit none diff --git a/FFTXlib/fft_types.f90 b/FFTXlib/fft_types.f90 index 7890bb50d..4914c7d7d 100644 --- a/FFTXlib/fft_types.f90 +++ b/FFTXlib/fft_types.f90 @@ -153,21 +153,27 @@ MODULE fft_types INTEGER :: grid_id #if defined(__CUDA) + ! These CUDA streams are used in the 1D+1D+1D GPU implementation INTEGER(kind=cuda_stream_kind), allocatable, dimension(:) :: stream_scatter_yz INTEGER(kind=cuda_stream_kind), allocatable, dimension(:) :: stream_many - INTEGER :: nstream_many = 16 - + ! These CUDA streams (and events) are used in the 1D+2D GPU implementation INTEGER(kind=cuda_stream_kind) :: a2a_comp INTEGER(kind=cuda_stream_kind), allocatable, dimension(:) :: bstreams TYPE(cudaEvent), allocatable, dimension(:) :: bevents - + ! + ! These variables define the dimension of batches and subbatches in + ! * the 1D+2D GPU implementation: INTEGER :: batchsize = 16 ! how many ffts to batch together INTEGER :: subbatchsize = 4 ! size of subbatch for pipelining - + ! * the 1D+1D+1D implementation: + INTEGER :: nstream_many = 16 ! this should be replaced by batchsize + ! since it has the same meaning. + ! #if defined(__IPC) INTEGER :: IPC_PEER(16) ! This is used for IPC that is not imlpemented yet. #endif - INTEGER, ALLOCATABLE :: srh(:,:) ! Isend/recv handles by subbatch + INTEGER, ALLOCATABLE :: srh(:,:) ! These are non-blocking send/recv handles that are used to + ! overlap computation and communication of FFT subbatches. 
#endif COMPLEX(DP), ALLOCATABLE, DIMENSION(:) :: aux #if defined(__FFT_OPENMP_TASKS) diff --git a/external/devxlib b/external/devxlib index e5392b772..a9f7a1b01 160000 --- a/external/devxlib +++ b/external/devxlib @@ -1 +1 @@ -Subproject commit e5392b772497f8597f6c3b2851e0c17da756bb64 +Subproject commit a9f7a1b01ab10e00cae22a5dca4f73ebf7e4917d diff --git a/external/fox b/external/fox index 6fef49bcf..819745f58 160000 --- a/external/fox +++ b/external/fox @@ -1 +1 @@ -Subproject commit 6fef49bcfc4a380432f15734ed0ca1f0b0388977 +Subproject commit 819745f5849de5c9de516be133ab206691738257