quantum-espresso/FFTXlib/fft_parallel_2d.f90

!
! Copyright (C) Quantum ESPRESSO group
!
! This file is distributed under the terms of the
! GNU General Public License. See the file `License'
! in the root directory of the present distribution,
! or http://www.gnu.org/copyleft/gpl.txt .
!
!=---------------------------------------------------------------------==!
!
!
!     Parallel 3D FFT high level Driver
!     ( Charge density and Wave Functions )
!
!     Written and maintained by Carlo Cavazzoni
!     Last update Apr. 2009
!
!!=---------------------------------------------------------------------==!
!
MODULE fft_parallel_2d
!
#ifdef __CUDA
   USE cudafor
#endif
   !
   USE fft_param
   IMPLICIT NONE
   SAVE
   !
!
CONTAINS
!
!  General purpose driver, including Task groups parallelization
!
!----------------------------------------------------------------------------
SUBROUTINE tg_cft3s( f, dfft, isgn )
  !----------------------------------------------------------------------------
  !
  !! ... isgn = +-1 : parallel 3d fft for rho and for the potential
  !                  NOT IMPLEMENTED WITH TASK GROUPS
  !! ... isgn = +-2 : parallel 3d fft for wavefunctions
  !
  !! ... isgn = +   : G-space to R-space, output = \sum_G f(G)exp(+iG*R)
  !! ...              fft along z using pencils        (cft_1z)
  !! ...              transpose across nodes           (fft_scatter)
  !! ...                 and reorder
  ! ...              fft along y (using planes) and x (cft_2xy)
  ! ... isgn = -   : R-space to G-space, output = \int_R f(R)exp(-iG*R)/Omega
  ! ...              fft along x and y(using planes)  (cft_2xy)
  ! ...              transpose across nodes           (fft_scatter)
  ! ...                 and reorder
  ! ...              fft along z using pencils        (cft_1z)
  !
  ! ...  The array "planes" signals whether a fft is needed along y :
  ! ...    planes(i)=0 : column f(i,*,*) empty , don't do fft along y
  ! ...    planes(i)=1 : column f(i,*,*) filled, fft along y needed
  ! ...  "empty" = no active components are present in f(i,*,*)
  ! ...            after (isgn>0) or before (isgn<0) the fft on z direction
  !
  ! ...  Note that if isgn=+/-1 (fft on rho and pot.) all fft's are needed
  ! ...  and all planes(i) are set to 1
  !
  ! This driver is based on code written by Stefano de Gironcoli for PWSCF.
  ! Task Group added by Costas Bekas, Oct. 2005, adapted from the CPMD code
  ! (Alessandro Curioni) and revised by Carlo Cavazzoni 2007.
  !
  USE fft_scalar, ONLY : cft_1z, cft_2xy
  USE fft_scatter_2d,   ONLY : fft_scatter
  USE fft_types,  ONLY : fft_type_descriptor
  !
  IMPLICIT NONE
  !
  COMPLEX(DP), INTENT(inout)    :: f( : )  ! array containing data to be transformed
  TYPE (fft_type_descriptor), INTENT(in) :: dfft
                                           ! descriptor of fft data layout
  INTEGER, INTENT(in)           :: isgn    ! fft direction
  !
  !
  INTEGER                    :: me_p
  INTEGER                    :: n1, n2, n3, nx1, nx2, nx3
  COMPLEX(DP), ALLOCATABLE   :: aux (:)
  INTEGER                    :: planes( dfft%nr1x )
  !LOGICAL                    :: use_tg
  !
  !
  IF (dfft%has_task_groups) CALL fftx_error__( ' tg_cft3s ', ' task groups on large mesh not implemented ', 1 )
  !
  n1  = dfft%nr1
  n2  = dfft%nr2
  n3  = dfft%nr3
  nx1 = dfft%nr1x
  nx2 = dfft%nr2x
  nx3 = dfft%nr3x
  !
  ALLOCATE( aux( dfft%nnr ) )
  !
  me_p = dfft%mype + 1
  !
  IF ( isgn > 0 ) THEN
     !
     IF ( isgn /= 2 ) THEN
        !
        CALL cft_1z( f, dfft%nsp( me_p ), n3, nx3, isgn, aux )
        !
        planes = dfft%iplp
        !
     ELSE
        !
        CALL cft_1z( f, dfft%nsw( me_p ), n3, nx3, isgn, aux )
        !
        planes = dfft%iplw
        !
     ENDIF
     !
     CALL fw_scatter( isgn ) ! forward scatter from stick to planes
     !
     CALL cft_2xy( f, dfft%my_nr3p, n1, n2, nx1, nx2, isgn, planes )
     !
  ELSE
     !
     IF ( isgn == -1 ) THEN
        !
        planes = dfft%iplp
        !
     ELSE IF ( isgn == -2 ) THEN
        !
        planes = dfft%iplw
        !
     ENDIF
     !
     CALL cft_2xy( f, dfft%my_nr3p, n1, n2, nx1, nx2, isgn, planes )
     !
     CALL bw_scatter( isgn )
     !
     IF ( isgn /= -2 ) THEN
        !
        CALL cft_1z( aux, dfft%nsp( me_p ), n3, nx3, isgn, f )
        !
     ELSE
        !
        CALL cft_1z( aux, dfft%nsw( me_p ), n3, nx3, isgn, f )
        !
     ENDIF
     !
  ENDIF
  !
  DEALLOCATE( aux )
  !
  RETURN
  !
CONTAINS
  !
  SUBROUTINE fw_scatter( iopt )

     !Transpose data for the 2-D FFT on the x-y plane
     !
     !NOGRP*dfft%nnr: The length of aux and f
     !nr3x: The length of each Z-stick
     !aux: input - output
     !f: working space
     !isgn: type of scatter
     !dfft%nsw(me) holds the number of Z-sticks proc. me has.
     !dfft%nr3p: number of planes per processor
     !
     !
     USE fft_scatter_2d, ONLY: fft_scatter
     !
     INTEGER, INTENT(in) :: iopt
     !
     IF( iopt == 2 ) THEN
        !
        CALL fft_scatter( dfft, aux, nx3, dfft%nnr, f, dfft%nsw, dfft%nr3p, iopt )
        !
     ELSEIF( iopt == 1 ) THEN
        !
        CALL fft_scatter( dfft, aux, nx3, dfft%nnr, f, dfft%nsp, dfft%nr3p, iopt )
        !
     ENDIF
     !
     RETURN
  END SUBROUTINE fw_scatter

  !

  SUBROUTINE bw_scatter( iopt )
     !
     USE fft_scatter_2d, ONLY: fft_scatter
     !
     INTEGER, INTENT(in) :: iopt
     !
     IF( iopt == -2 ) THEN
        !
        CALL fft_scatter( dfft, aux, nx3, dfft%nnr, f, dfft%nsw, dfft%nr3p, iopt )
        !
     ELSEIF( iopt == -1 ) THEN
        !
        CALL fft_scatter( dfft, aux, nx3, dfft%nnr, f, dfft%nsp, dfft%nr3p, iopt )
        !
     ENDIF
     !
     RETURN
  END SUBROUTINE bw_scatter
  !
END SUBROUTINE tg_cft3s
!
!
!
#if defined(__CUDA)
!----------------------------------------------------------------------------
SUBROUTINE tg_cft3s_gpu( f_d, dfft, isgn )
  !----------------------------------------------------------------------------
  !
  !! ... isgn = +-1 : parallel 3d fft for rho and for the potential
  !                  NOT IMPLEMENTED WITH TASK GROUPS
  !! ... isgn = +-2 : parallel 3d fft for wavefunctions
  !
  !! ... isgn = +   : G-space to R-space, output = \sum_G f(G)exp(+iG*R)
  !! ...              fft along z using pencils        (cft_1z)
  !! ...              transpose across nodes           (fft_scatter)
  !! ...                 and reorder
  ! ...              fft along y (using planes) and x (cft_2xy)
  ! ... isgn = -   : R-space to G-space, output = \int_R f(R)exp(-iG*R)/Omega
  ! ...              fft along x and y(using planes)  (cft_2xy)
  ! ...              transpose across nodes           (fft_scatter)
  ! ...                 and reorder
  ! ...              fft along z using pencils        (cft_1z)
  !
  ! ...  The array "planes" signals whether a fft is needed along y :
  ! ...    planes(i)=0 : column f(i,*,*) empty , don't do fft along y
  ! ...    planes(i)=1 : column f(i,*,*) filled, fft along y needed
  ! ...  "empty" = no active components are present in f(i,*,*)
  ! ...            after (isgn>0) or before (isgn<0) the fft on z direction
  !
  ! ...  Note that if isgn=+/-1 (fft on rho and pot.) all fft's are needed
  ! ...  and all planes(i) are set to 1
  !
  ! This driver is based on code written by Stefano de Gironcoli for PWSCF.
  ! Task Group added by Costas Bekas, Oct. 2005, adapted from the CPMD code
  ! (Alessandro Curioni) and revised by Carlo Cavazzoni 2007.
  !
  USE fft_scalar, ONLY : cft_1z_gpu, cft_2xy_gpu
  USE fft_scatter_2d_gpu,   ONLY : fft_scatter_gpu
  USE fft_types,  ONLY : fft_type_descriptor
  USE fft_buffers, ONLY : check_buffers_size, &
                            f_h => pin_space_scatter_in, &
                            aux_h => pin_space_scatter_out, &
                            aux_d => dev_space_fftparallel
  !
  IMPLICIT NONE
  !
  TYPE (fft_type_descriptor), INTENT(in) :: dfft
  COMPLEX(DP), DEVICE, INTENT(inout)    :: f_d( dfft%nnr ) ! array containing data to be transformed
                                           ! descriptor of fft data layout
  INTEGER, INTENT(in)           :: isgn    ! fft direction
  !
  !
  INTEGER                    :: me_p, istat
  INTEGER                    :: n1, n2, n3, nx1, nx2, nx3
  COMPLEX(DP), ALLOCATABLE   :: yf(:)
  INTEGER                    :: planes( dfft%nr1x )
  INTEGER(kind = cuda_stream_kind) :: stream  = 0
  !
  !
  n1  = dfft%nr1
  n2  = dfft%nr2
  n3  = dfft%nr3
  nx1 = dfft%nr1x
  nx2 = dfft%nr2x
  nx3 = dfft%nr3x
  !
  IF( dfft%has_task_groups ) CALL fftx_error__( ' tg_cft3s ', ' task groups in 2D + 1D decomposition not implemented ', 1 )
  !
  CALL check_buffers_size(dfft)
  !
  me_p = dfft%mype + 1
  !
  IF ( isgn > 0 ) THEN
     !
     IF ( isgn /= 2 ) THEN
        !
        CALL cft_1z_gpu( f_d, dfft%nsp( me_p ), n3, nx3, isgn, aux_d, stream )
        !
        planes = dfft%iplp
        !
     ELSE
        !
        CALL cft_1z_gpu( f_d, dfft%nsw( me_p ), n3, nx3, isgn, aux_d, stream )
        !
        planes = dfft%iplw
        !
     ENDIF
     !
     CALL fw_scatter_gpu( isgn ) ! forward scatter from stick to planes
     !
     CALL cft_2xy_gpu( f_d, aux_d, dfft%my_nr3p, n1, n2, nx1, nx2, isgn, stream, planes )
     !
  ELSE
     !
     IF ( isgn /= -2 ) THEN
        !
        planes = dfft%iplp
        !
     ELSE
        !
        planes = dfft%iplw
        !
     ENDIF

     CALL cft_2xy_gpu( f_d, aux_d, dfft%my_nr3p, n1, n2, nx1, nx2, isgn, stream, planes)
     !
     CALL bw_scatter_gpu( isgn )
     !
     !f_d = (0.d0, 0.d0)
     !
     IF ( isgn /= -2 ) THEN
        !
        CALL cft_1z_gpu( aux_d, dfft%nsp( me_p ), n3, nx3, isgn, f_d, stream )
        !
     ELSE
        !
        CALL cft_1z_gpu( aux_d, dfft%nsw( me_p ), n3, nx3, isgn, f_d, stream )
        !
     ENDIF
     !
  ENDIF
  !
  RETURN
  !
CONTAINS
  !
  SUBROUTINE fw_scatter_gpu( iopt )

     !Transpose data for the 2-D FFT on the x-y plane
     !
     !NOGRP*dfft%nnr: The length of aux and f
     !nr3x: The length of each Z-stick
     !aux: input - output
     !f: working space
     !isgn: type of scatter
     !dfft%nsw(me) holds the number of Z-sticks proc. me has.
     !dfft%nr3p: number of planes per processor
     !
     !
     USE fft_scatter_2d_gpu, ONLY: fft_scatter_gpu
     !
     INTEGER, INTENT(in) :: iopt
     !
     IF( iopt == 2 ) THEN
        !
        CALL fft_scatter_gpu( dfft, aux_d, aux_h, nx3, dfft%nnr, f_d, f_h, dfft%nsw, dfft%nr3p, iopt )
        !
     ELSEIF( iopt == 1 ) THEN
        !
        CALL fft_scatter_gpu( dfft, aux_d, aux_h, nx3, dfft%nnr, f_d, f_h, dfft%nsp, dfft%nr3p, iopt )
        !
     ENDIF
     !
     RETURN
  END SUBROUTINE fw_scatter_gpu

  !

  SUBROUTINE bw_scatter_gpu( iopt )
     !
     USE fft_scatter_2d_gpu, ONLY: fft_scatter_gpu
     !
     INTEGER, INTENT(in) :: iopt
     !
     IF( iopt == -2 ) THEN
        !
        CALL fft_scatter_gpu( dfft, aux_d, aux_h, nx3, dfft%nnr, f_d, f_h, dfft%nsw, dfft%nr3p, iopt )
        !
     ELSEIF( iopt == -1 ) THEN
        !
        CALL fft_scatter_gpu( dfft, aux_d, aux_h, nx3, dfft%nnr, f_d, f_h, dfft%nsp, dfft%nr3p, iopt )
        !
     ENDIF
     !
     RETURN
  END SUBROUTINE bw_scatter_gpu
  !
END SUBROUTINE tg_cft3s_gpu

SUBROUTINE many_cft3s_gpu( f_d, dfft, isgn, batchsize )
  !----------------------------------------------------------------------------
  !
  !
  !! ... isgn = +-1 : parallel 3d fft for rho and for the potential
  !                  NOT IMPLEMENTED WITH TASK GROUPS
  !! ... isgn = +-2 : parallel 3d fft for wavefunctions
  !
  !! ... isgn = +   : G-space to R-space, output = \sum_G f(G)exp(+iG*R)
  !! ...              fft along z using pencils        (cft_1z)
  !! ...              transpose across nodes           (fft_scatter)
  !! ...                 and reorder
  ! ...              fft along y (using planes) and x (cft_2xy)
  ! ... isgn = -   : R-space to G-space, output = \int_R f(R)exp(-iG*R)/Omega
  ! ...              fft along x and y(using planes)  (cft_2xy)
  ! ...              transpose across nodes           (fft_scatter)
  ! ...                 and reorder
  ! ...              fft along z using pencils        (cft_1z)
  !
  ! ...  The array "planes" signals whether a fft is needed along y :
  ! ...    planes(i)=0 : column f(i,*,*) empty , don't do fft along y
  ! ...    planes(i)=1 : column f(i,*,*) filled, fft along y needed
  ! ...  "empty" = no active components are present in f(i,*,*)
  ! ...            after (isgn>0) or before (isgn<0) the fft on z direction
  !
  ! ...  Note that if isgn=+/-1 (fft on rho and pot.) all fft's are needed
  ! ...  and all planes(i) are set to 1
  !
  ! ...  batchsize : number of 3D FFTs contained in f_d to be transformed.
  ! ...              Must be 1 < batchsize <= dfft%batchsize.
  !
  ! This driver is based on code written by Stefano de Gironcoli for PWSCF.
  ! Task Group added by Costas Bekas, Oct. 2005, adapted from the CPMD code
  ! (Alessandro Curioni) and revised by Carlo Cavazzoni 2007.
  !
  ! The GPU version is based on code written by Josh Romero, Everett Phillips
  ! and Massimiliano Fatica and revised by Pietro Bonfà.
  !
  ! The current version performs batchsize FFTs and overlaps computation
  ! with MPI communications and data transfers between host and device.
  !
  USE fft_scalar, ONLY : cft_1z_gpu, cft_2xy_gpu
  USE fft_scatter_2d_gpu,   ONLY : fft_scatter_many_columns_to_planes_send, &
                                   fft_scatter_many_columns_to_planes_store, &
                                   fft_scatter_many_planes_to_columns_send, &
                                   fft_scatter_many_planes_to_columns_store
  USE fft_types,  ONLY : fft_type_descriptor
  USE fft_buffers, ONLY : check_buffers_size, &
                            f_h => pin_space_scatter_in, &
                            aux_h => pin_space_scatter_out, &
                            aux_d => dev_space_fftparallel, &
                            aux2_h => pin_space_scatter_dblbuffer, &
                            aux2_d => dev_space_scatter_dblbuffer
  !
  IMPLICIT NONE
  !
  TYPE (fft_type_descriptor), INTENT(in) :: dfft
                                           ! descriptor of fft data layout
  INTEGER, INTENT(in)           :: isgn    ! fft direction
  INTEGER, INTENT(in)           :: batchsize
  COMPLEX(DP), DEVICE, INTENT(inout)    :: f_d( batchsize * dfft%nnr ) ! array containing data to be transformed
  !
  INTEGER                    :: me_p, istat, i, j, currsize
  INTEGER                    :: n1, n2, n3, nx1, nx2, nx3, ncpx, nppx, proc
  COMPLEX(DP), ALLOCATABLE   :: yf(:)
  INTEGER                    :: planes( dfft%nr1x )
  INTEGER                    :: sticks( dfft%nproc  )
  INTEGER(kind = cuda_stream_kind) :: stream  = 0
  !
  !
  n1  = dfft%nr1
  n2  = dfft%nr2
  n3  = dfft%nr3
  nx1 = dfft%nr1x
  nx2 = dfft%nr2x
  nx3 = dfft%nr3x
  !
  CALL check_buffers_size(dfft, batchsize)
  !
  me_p = dfft%mype + 1
  !
  ncpx = 0
  nppx = 0
  DO proc = 1, dfft%nproc
     IF ( abs(isgn) == 2 ) ncpx = max( ncpx, dfft%nsw ( proc ) )
     IF ( abs(isgn) == 1 ) ncpx = max( ncpx, dfft%nsp ( proc ) )
     nppx = max( nppx, dfft%nr3p ( proc ) )
  ENDDO
  IF ( abs(isgn) == 2 ) sticks = dfft%nsw
  IF ( abs(isgn) == 1 ) sticks = dfft%nsp
  !
  IF ( (abs(isgn) /= 2) .and. (abs(isgn) /= 1) ) &
     CALL fftx_error__( ' many_cft3s_gpu ', ' abs(isgn) /= 1 or 2 not implemented ', isgn )
  !
  IF (dfft%nproc <= 1) CALL fftx_error__( ' many_cft3s_gpu ', ' this subroutine should never be called with nproc= ', dfft%nproc )
  !
  ! FFTs are done in sub-batches of dfft%subbatchsize (default is 4)
  ! When a sub-batch has been transformed in a direction or a plane,
  ! communication between device and host is started and the next subbatch is transformed.
  ! Later, the subbatch is received on target MPI process and transformed
  ! overlapping computation with MPI communication.
  !
  IF ( isgn > 0 ) THEN
     DO j = 0, batchsize-1, dfft%subbatchsize
       ! determine whether the FFTs that are left are less than the maximum
       ! subbatchsize size.
       currsize = min(dfft%subbatchsize, batchsize - j)
       !
       IF ( isgn /= 2 ) THEN
          !
          planes = dfft%iplp
          !
       ELSE
          !
          planes = dfft%iplw
          !
       ENDIF
       !
       ! perform the FFT along one direction and, at the same time,
       ! read data spaced by dfft%nnr and store in in the output
       ! with spacing ncpx*nx3, making it easy to bach communication.
       DO i = 0, currsize - 1
         CALL cft_1z_gpu( f_d((j+i)*dfft%nnr + 1:), sticks(me_p), n3, nx3, isgn, aux_d(j*dfft%nnr + i*ncpx*nx3 +1:), dfft%a2a_comp )
       ENDDO
       !
       i = cudaEventRecord(dfft%bevents(j/dfft%subbatchsize + 1), dfft%a2a_comp)
       i = cudaStreamWaitEvent(dfft%bstreams(j/dfft%subbatchsize + 1), dfft%bevents(j/dfft%subbatchsize + 1), 0)

       IF (j > 0) i = cudaStreamWaitEvent(dfft%bstreams(j/dfft%subbatchsize + 1), dfft%bevents(j/dfft%subbatchsize), 0)

       CALL fft_scatter_many_columns_to_planes_store( dfft, aux_d(j*dfft%nnr + 1:), aux_h(j*dfft%nnr + 1:), nx3, dfft%nnr, f_d(j*dfft%nnr + 1:), &
         f_h(j*dfft%nnr + 1:), aux2_d(j*dfft%nnr + 1:), aux2_h(j*dfft%nnr + 1:), sticks, dfft%nr3p, isgn, currsize, j/dfft%subbatchsize + 1 )

     ENDDO

     DO j = 0, batchsize-1, dfft%subbatchsize
       currsize = min(dfft%subbatchsize, batchsize - j)

       CALL fft_scatter_many_columns_to_planes_send( dfft, aux_d(j*dfft%nnr + 1:), aux_h(j*dfft%nnr + 1:), nx3, dfft%nnr, f_d(j*dfft%nnr + 1:), &
         f_h(j*dfft%nnr + 1:), aux2_d(j*dfft%nnr + 1:), aux2_h(j*dfft%nnr + 1:), sticks, dfft%nr3p, isgn, currsize, j/dfft%subbatchsize + 1 )

       IF (currsize == dfft%subbatchsize) THEN
         CALL cft_2xy_gpu( f_d(j*dfft%nnr + 1:), aux_d(j*dfft%nnr + 1:), currsize * nppx, n1, n2, nx1, nx2, isgn, dfft%a2a_comp, planes )
       ELSE
         DO i = 0, currsize - 1
           CALL cft_2xy_gpu( f_d((j+i)*dfft%nnr + 1:), aux_d((j+i)*dfft%nnr + 1:), dfft%nr3p( me_p ), n1, n2, nx1, nx2, isgn,  &
           dfft%a2a_comp, planes )
         ENDDO
       ENDIF

     ENDDO

!     i = cudaDeviceSynchronize()

     !
  ELSE
!     i = cudaDeviceSynchronize()

     DO j = 0, batchsize-1, dfft%subbatchsize
       currsize = min(dfft%subbatchsize, batchsize - j)
       !
       IF ( isgn /= -2 ) THEN
          !
          planes = dfft%iplp
          !
       ELSE
          !
          planes = dfft%iplw
          !
       ENDIF

       IF (currsize == dfft%subbatchsize) THEN
         CALL cft_2xy_gpu( f_d(j*dfft%nnr + 1:), aux_d(j*dfft%nnr + 1:), currsize * nppx, n1, n2, nx1, nx2, isgn, dfft%a2a_comp, planes )
       ELSE
         DO i = 0, currsize - 1
           CALL cft_2xy_gpu( f_d((j+i)*dfft%nnr + 1:), aux_d((j+i)*dfft%nnr + 1:), dfft%nr3p( me_p ), n1, n2, nx1, nx2, isgn, dfft%a2a_comp, planes )
         ENDDO
       ENDIF

       IF (j > 0) i = cudaStreamWaitEvent(dfft%bstreams(j/dfft%subbatchsize + 1), dfft%bevents(j/dfft%subbatchsize), 0)

       CALL fft_scatter_many_planes_to_columns_store( dfft, aux_d(j*dfft%nnr + 1:), aux_h(j*dfft%nnr + 1:), nx3, dfft%nnr, f_d(j*dfft%nnr + 1:), &
         f_h(j*dfft%nnr + 1:), aux2_d(j*dfft%nnr + 1:), aux2_h(j*dfft%nnr + 1:), sticks, dfft%nr3p, isgn, currsize, j/dfft%subbatchsize + 1 )

     ENDDO

     DO j = 0, batchsize-1, dfft%subbatchsize
       currsize = min(dfft%subbatchsize, batchsize - j)

       CALL fft_scatter_many_planes_to_columns_send( dfft, aux_d(j*dfft%nnr + 1:), aux_h(j*dfft%nnr + 1:), nx3, dfft%nnr, f_d(j*dfft%nnr + 1:), &
         f_h(j*dfft%nnr + 1:), aux2_d(j*dfft%nnr + 1:), aux2_h(j*dfft%nnr + 1:), sticks, dfft%nr3p, isgn, currsize, j/dfft%subbatchsize + 1 )


       i = cudaEventRecord(dfft%bevents(j/dfft%subbatchsize + 1), dfft%bstreams(j/dfft%subbatchsize + 1))
       i = cudaStreamWaitEvent(dfft%a2a_comp, dfft%bevents(j/dfft%subbatchsize + 1), 0)

       DO i = 0, currsize - 1
         CALL cft_1z_gpu( aux_d(j*dfft%nnr + i*ncpx*nx3 + 1:), sticks( me_p ), n3, nx3, isgn, f_d((j+i)*dfft%nnr + 1:), dfft%a2a_comp )
       ENDDO

     ENDDO
!    i = cudaDeviceSynchronize()
  ENDIF
  !
  RETURN
  !
END SUBROUTINE many_cft3s_gpu
#endif
!
END MODULE fft_parallel_2d
Add GPU support and 1D+2D decomposition to FFTXlib 2020-10-02 03:32:07 +08:00			`!`
			`! Copyright (C) Quantum ESPRESSO group`
			`!`
			`! This file is distributed under the terms of the`
			! GNU General Public License. See the file `License'
			`! in the root directory of the present distribution,`
			`! or http://www.gnu.org/copyleft/gpl.txt .`
			`!`
			`!=---------------------------------------------------------------------==!`
			`!`
			`!`
			`! Parallel 3D FFT high level Driver`
			`! ( Charge density and Wave Functions )`
			`!`
			`! Written and maintained by Carlo Cavazzoni`
			`! Last update Apr. 2009`
			`!`
			`!!=---------------------------------------------------------------------==!`
			`!`
			`MODULE fft_parallel_2d`
			`!`
			`#ifdef __CUDA`
			`USE cudafor`
			`#endif`
			`!`
			`USE fft_param`
			`IMPLICIT NONE`
			`SAVE`
			`!`
			`!`
			`CONTAINS`
			`!`
			`! General purpose driver, including Task groups parallelization`
			`!`
			`!----------------------------------------------------------------------------`
			`SUBROUTINE tg_cft3s( f, dfft, isgn )`
			`!----------------------------------------------------------------------------`
			`!`
			`!! ... isgn = +-1 : parallel 3d fft for rho and for the potential`
			`! NOT IMPLEMENTED WITH TASK GROUPS`
			`!! ... isgn = +-2 : parallel 3d fft for wavefunctions`
			`!`
			`!! ... isgn = + : G-space to R-space, output = \sum_G f(G)exp(+iG*R)`
			`!! ... fft along z using pencils (cft_1z)`
			`!! ... transpose across nodes (fft_scatter)`
			`!! ... and reorder`
			`! ... fft along y (using planes) and x (cft_2xy)`
			`! ... isgn = - : R-space to G-space, output = \int_R f(R)exp(-iG*R)/Omega`
			`! ... fft along x and y(using planes) (cft_2xy)`
			`! ... transpose across nodes (fft_scatter)`
			`! ... and reorder`
			`! ... fft along z using pencils (cft_1z)`
			`!`
			`! ... The array "planes" signals whether a fft is needed along y :`
			`! ... planes(i)=0 : column f(i,,) empty , don't do fft along y`
			`! ... planes(i)=1 : column f(i,,) filled, fft along y needed`
			`! ... "empty" = no active components are present in f(i,,)`
			`! ... after (isgn>0) or before (isgn<0) the fft on z direction`
			`!`
			`! ... Note that if isgn=+/-1 (fft on rho and pot.) all fft's are needed`
			`! ... and all planes(i) are set to 1`
			`!`
			`! This driver is based on code written by Stefano de Gironcoli for PWSCF.`
			`! Task Group added by Costas Bekas, Oct. 2005, adapted from the CPMD code`
			`! (Alessandro Curioni) and revised by Carlo Cavazzoni 2007.`
			`!`
			`USE fft_scalar, ONLY : cft_1z, cft_2xy`
			`USE fft_scatter_2d, ONLY : fft_scatter`
			`USE fft_types, ONLY : fft_type_descriptor`
			`!`
			`IMPLICIT NONE`
			`!`
			`COMPLEX(DP), INTENT(inout) :: f( : ) ! array containing data to be transformed`
			`TYPE (fft_type_descriptor), INTENT(in) :: dfft`
			`! descriptor of fft data layout`
			`INTEGER, INTENT(in) :: isgn ! fft direction`
			`!`
			`!`
			`INTEGER :: me_p`
			`INTEGER :: n1, n2, n3, nx1, nx2, nx3`
			`COMPLEX(DP), ALLOCATABLE :: aux (:)`
			`INTEGER :: planes( dfft%nr1x )`
			`!LOGICAL :: use_tg`
			`!`
			`!`
			`IF (dfft%has_task_groups) CALL fftx_error__( ' tg_cft3s ', ' task groups on large mesh not implemented ', 1 )`
			`!`
			`n1 = dfft%nr1`
			`n2 = dfft%nr2`
			`n3 = dfft%nr3`
			`nx1 = dfft%nr1x`
			`nx2 = dfft%nr2x`
			`nx3 = dfft%nr3x`
			`!`
			`ALLOCATE( aux( dfft%nnr ) )`
			`!`
			`me_p = dfft%mype + 1`
			`!`
			`IF ( isgn > 0 ) THEN`
			`!`
			`IF ( isgn /= 2 ) THEN`
			`!`
			`CALL cft_1z( f, dfft%nsp( me_p ), n3, nx3, isgn, aux )`
			`!`
			`planes = dfft%iplp`
			`!`
			`ELSE`
			`!`
			`CALL cft_1z( f, dfft%nsw( me_p ), n3, nx3, isgn, aux )`
			`!`
			`planes = dfft%iplw`
			`!`
			`ENDIF`
			`!`
			`CALL fw_scatter( isgn ) ! forward scatter from stick to planes`
			`!`
			`CALL cft_2xy( f, dfft%my_nr3p, n1, n2, nx1, nx2, isgn, planes )`
			`!`
			`ELSE`
			`!`
			`IF ( isgn == -1 ) THEN`
			`!`
			`planes = dfft%iplp`
			`!`
			`ELSE IF ( isgn == -2 ) THEN`
			`!`
			`planes = dfft%iplw`
			`!`
			`ENDIF`
			`!`
			`CALL cft_2xy( f, dfft%my_nr3p, n1, n2, nx1, nx2, isgn, planes )`
			`!`
			`CALL bw_scatter( isgn )`
			`!`
			`IF ( isgn /= -2 ) THEN`
			`!`
			`CALL cft_1z( aux, dfft%nsp( me_p ), n3, nx3, isgn, f )`
			`!`
			`ELSE`
			`!`
			`CALL cft_1z( aux, dfft%nsw( me_p ), n3, nx3, isgn, f )`
			`!`
			`ENDIF`
			`!`
			`ENDIF`
			`!`
			`DEALLOCATE( aux )`
			`!`
			`RETURN`
			`!`
			`CONTAINS`
			`!`
			`SUBROUTINE fw_scatter( iopt )`

			`!Transpose data for the 2-D FFT on the x-y plane`
			`!`
			`!NOGRP*dfft%nnr: The length of aux and f`
			`!nr3x: The length of each Z-stick`
			`!aux: input - output`
			`!f: working space`
			`!isgn: type of scatter`
			`!dfft%nsw(me) holds the number of Z-sticks proc. me has.`
			`!dfft%nr3p: number of planes per processor`
			`!`
			`!`
			`USE fft_scatter_2d, ONLY: fft_scatter`
			`!`
			`INTEGER, INTENT(in) :: iopt`
			`!`
			`IF( iopt == 2 ) THEN`
			`!`
			`CALL fft_scatter( dfft, aux, nx3, dfft%nnr, f, dfft%nsw, dfft%nr3p, iopt )`
			`!`
			`ELSEIF( iopt == 1 ) THEN`
			`!`
			`CALL fft_scatter( dfft, aux, nx3, dfft%nnr, f, dfft%nsp, dfft%nr3p, iopt )`
			`!`
			`ENDIF`
			`!`
			`RETURN`
			`END SUBROUTINE fw_scatter`

			`!`

			`SUBROUTINE bw_scatter( iopt )`
			`!`
			`USE fft_scatter_2d, ONLY: fft_scatter`
			`!`
			`INTEGER, INTENT(in) :: iopt`
			`!`
			`IF( iopt == -2 ) THEN`
			`!`
			`CALL fft_scatter( dfft, aux, nx3, dfft%nnr, f, dfft%nsw, dfft%nr3p, iopt )`
			`!`
			`ELSEIF( iopt == -1 ) THEN`
			`!`
			`CALL fft_scatter( dfft, aux, nx3, dfft%nnr, f, dfft%nsp, dfft%nr3p, iopt )`
			`!`
			`ENDIF`
			`!`
			`RETURN`
			`END SUBROUTINE bw_scatter`
			`!`
			`END SUBROUTINE tg_cft3s`
			`!`
			`!`
			`!`
			`#if defined(__CUDA)`
			`!----------------------------------------------------------------------------`
			`SUBROUTINE tg_cft3s_gpu( f_d, dfft, isgn )`
			`!----------------------------------------------------------------------------`
			`!`
			`!! ... isgn = +-1 : parallel 3d fft for rho and for the potential`
			`! NOT IMPLEMENTED WITH TASK GROUPS`
			`!! ... isgn = +-2 : parallel 3d fft for wavefunctions`
			`!`
			`!! ... isgn = + : G-space to R-space, output = \sum_G f(G)exp(+iG*R)`
			`!! ... fft along z using pencils (cft_1z)`
			`!! ... transpose across nodes (fft_scatter)`
			`!! ... and reorder`
			`! ... fft along y (using planes) and x (cft_2xy)`
			`! ... isgn = - : R-space to G-space, output = \int_R f(R)exp(-iG*R)/Omega`
			`! ... fft along x and y(using planes) (cft_2xy)`
			`! ... transpose across nodes (fft_scatter)`
			`! ... and reorder`
			`! ... fft along z using pencils (cft_1z)`
			`!`
			`! ... The array "planes" signals whether a fft is needed along y :`
			`! ... planes(i)=0 : column f(i,,) empty , don't do fft along y`
			`! ... planes(i)=1 : column f(i,,) filled, fft along y needed`
			`! ... "empty" = no active components are present in f(i,,)`
			`! ... after (isgn>0) or before (isgn<0) the fft on z direction`
			`!`
			`! ... Note that if isgn=+/-1 (fft on rho and pot.) all fft's are needed`
			`! ... and all planes(i) are set to 1`
			`!`
			`! This driver is based on code written by Stefano de Gironcoli for PWSCF.`
			`! Task Group added by Costas Bekas, Oct. 2005, adapted from the CPMD code`
			`! (Alessandro Curioni) and revised by Carlo Cavazzoni 2007.`
			`!`
			`USE fft_scalar, ONLY : cft_1z_gpu, cft_2xy_gpu`
			`USE fft_scatter_2d_gpu, ONLY : fft_scatter_gpu`
			`USE fft_types, ONLY : fft_type_descriptor`
			`USE fft_buffers, ONLY : check_buffers_size, &`
			`f_h => pin_space_scatter_in, &`
			`aux_h => pin_space_scatter_out, &`
			`aux_d => dev_space_fftparallel`
			`!`
			`IMPLICIT NONE`
			`!`
			`TYPE (fft_type_descriptor), INTENT(in) :: dfft`
			`COMPLEX(DP), DEVICE, INTENT(inout) :: f_d( dfft%nnr ) ! array containing data to be transformed`
			`! descriptor of fft data layout`
			`INTEGER, INTENT(in) :: isgn ! fft direction`
			`!`
			`!`
			`INTEGER :: me_p, istat`
			`INTEGER :: n1, n2, n3, nx1, nx2, nx3`
			`COMPLEX(DP), ALLOCATABLE :: yf(:)`
			`INTEGER :: planes( dfft%nr1x )`
			`INTEGER(kind = cuda_stream_kind) :: stream = 0`
			`!`
			`!`
			`n1 = dfft%nr1`
			`n2 = dfft%nr2`
			`n3 = dfft%nr3`
			`nx1 = dfft%nr1x`
			`nx2 = dfft%nr2x`
			`nx3 = dfft%nr3x`
			`!`
			`IF( dfft%has_task_groups ) CALL fftx_error__( ' tg_cft3s ', ' task groups in 2D + 1D decomposition not implemented ', 1 )`
			`!`
			`CALL check_buffers_size(dfft)`
			`!`
			`me_p = dfft%mype + 1`
			`!`
			`IF ( isgn > 0 ) THEN`
			`!`
			`IF ( isgn /= 2 ) THEN`
			`!`
			`CALL cft_1z_gpu( f_d, dfft%nsp( me_p ), n3, nx3, isgn, aux_d, stream )`
			`!`
			`planes = dfft%iplp`
			`!`
			`ELSE`
			`!`
			`CALL cft_1z_gpu( f_d, dfft%nsw( me_p ), n3, nx3, isgn, aux_d, stream )`
			`!`
			`planes = dfft%iplw`
			`!`
			`ENDIF`
			`!`
			`CALL fw_scatter_gpu( isgn ) ! forward scatter from stick to planes`
			`!`
			`CALL cft_2xy_gpu( f_d, aux_d, dfft%my_nr3p, n1, n2, nx1, nx2, isgn, stream, planes )`
			`!`
			`ELSE`
			`!`
			`IF ( isgn /= -2 ) THEN`
			`!`
			`planes = dfft%iplp`
			`!`
			`ELSE`
			`!`
			`planes = dfft%iplw`
			`!`
			`ENDIF`

			`CALL cft_2xy_gpu( f_d, aux_d, dfft%my_nr3p, n1, n2, nx1, nx2, isgn, stream, planes)`
			`!`
			`CALL bw_scatter_gpu( isgn )`
			`!`
			`!f_d = (0.d0, 0.d0)`
			`!`
			`IF ( isgn /= -2 ) THEN`
			`!`
			`CALL cft_1z_gpu( aux_d, dfft%nsp( me_p ), n3, nx3, isgn, f_d, stream )`
			`!`
			`ELSE`
			`!`
			`CALL cft_1z_gpu( aux_d, dfft%nsw( me_p ), n3, nx3, isgn, f_d, stream )`
			`!`
			`ENDIF`
			`!`
			`ENDIF`
			`!`
			`RETURN`
			`!`
			`CONTAINS`
			`!`
			`SUBROUTINE fw_scatter_gpu( iopt )`

			`!Transpose data for the 2-D FFT on the x-y plane`
			`!`
			`!NOGRP*dfft%nnr: The length of aux and f`
			`!nr3x: The length of each Z-stick`
			`!aux: input - output`
			`!f: working space`
			`!isgn: type of scatter`
			`!dfft%nsw(me) holds the number of Z-sticks proc. me has.`
			`!dfft%nr3p: number of planes per processor`
			`!`
			`!`
			`USE fft_scatter_2d_gpu, ONLY: fft_scatter_gpu`
			`!`
			`INTEGER, INTENT(in) :: iopt`
			`!`
			`IF( iopt == 2 ) THEN`
			`!`
			`CALL fft_scatter_gpu( dfft, aux_d, aux_h, nx3, dfft%nnr, f_d, f_h, dfft%nsw, dfft%nr3p, iopt )`
			`!`
			`ELSEIF( iopt == 1 ) THEN`
			`!`
			`CALL fft_scatter_gpu( dfft, aux_d, aux_h, nx3, dfft%nnr, f_d, f_h, dfft%nsp, dfft%nr3p, iopt )`
			`!`
			`ENDIF`
			`!`
			`RETURN`
			`END SUBROUTINE fw_scatter_gpu`

			`!`

			`SUBROUTINE bw_scatter_gpu( iopt )`
			`!`
			`USE fft_scatter_2d_gpu, ONLY: fft_scatter_gpu`
			`!`
			`INTEGER, INTENT(in) :: iopt`
			`!`
			`IF( iopt == -2 ) THEN`
			`!`
			`CALL fft_scatter_gpu( dfft, aux_d, aux_h, nx3, dfft%nnr, f_d, f_h, dfft%nsw, dfft%nr3p, iopt )`
			`!`
			`ELSEIF( iopt == -1 ) THEN`
			`!`
			`CALL fft_scatter_gpu( dfft, aux_d, aux_h, nx3, dfft%nnr, f_d, f_h, dfft%nsp, dfft%nr3p, iopt )`
			`!`
			`ENDIF`
			`!`
			`RETURN`
			`END SUBROUTINE bw_scatter_gpu`
			`!`
			`END SUBROUTINE tg_cft3s_gpu`

			`SUBROUTINE many_cft3s_gpu( f_d, dfft, isgn, batchsize )`
			`!----------------------------------------------------------------------------`
			`!`
Well begun is half done? 2020-11-12 16:48:39 +08:00			`!`
Add GPU support and 1D+2D decomposition to FFTXlib 2020-10-02 03:32:07 +08:00			`!! ... isgn = +-1 : parallel 3d fft for rho and for the potential`
			`! NOT IMPLEMENTED WITH TASK GROUPS`
			`!! ... isgn = +-2 : parallel 3d fft for wavefunctions`
			`!`
			`!! ... isgn = + : G-space to R-space, output = \sum_G f(G)exp(+iG*R)`
			`!! ... fft along z using pencils (cft_1z)`
			`!! ... transpose across nodes (fft_scatter)`
			`!! ... and reorder`
			`! ... fft along y (using planes) and x (cft_2xy)`
			`! ... isgn = - : R-space to G-space, output = \int_R f(R)exp(-iG*R)/Omega`
			`! ... fft along x and y(using planes) (cft_2xy)`
			`! ... transpose across nodes (fft_scatter)`
			`! ... and reorder`
			`! ... fft along z using pencils (cft_1z)`
			`!`
			`! ... The array "planes" signals whether a fft is needed along y :`
			`! ... planes(i)=0 : column f(i,,) empty , don't do fft along y`
			`! ... planes(i)=1 : column f(i,,) filled, fft along y needed`
			`! ... "empty" = no active components are present in f(i,,)`
			`! ... after (isgn>0) or before (isgn<0) the fft on z direction`
			`!`
			`! ... Note that if isgn=+/-1 (fft on rho and pot.) all fft's are needed`
			`! ... and all planes(i) are set to 1`
			`!`
Fix ambiguous description of batched FFT 2020-11-27 05:14:57 +08:00			`! ... batchsize : number of 3D FFTs contained in f_d to be transformed.`
			`! ... Must be 1 < batchsize <= dfft%batchsize.`
Well begun is half done? 2020-11-12 16:48:39 +08:00			`!`
Add GPU support and 1D+2D decomposition to FFTXlib 2020-10-02 03:32:07 +08:00			`! This driver is based on code written by Stefano de Gironcoli for PWSCF.`
			`! Task Group added by Costas Bekas, Oct. 2005, adapted from the CPMD code`
			`! (Alessandro Curioni) and revised by Carlo Cavazzoni 2007.`
			`!`
Well begun is half done? 2020-11-12 16:48:39 +08:00			`! The GPU version is based on code written by Josh Romero, Everett Phillips`
			`! and Massimiliano Fatica and revised by Pietro Bonfà.`
			`!`
			`! The current version performs batchsize FFTs and overlaps computation`
			`! with MPI communications and data transfers between host and device.`
			`!`
Add GPU support and 1D+2D decomposition to FFTXlib 2020-10-02 03:32:07 +08:00			`USE fft_scalar, ONLY : cft_1z_gpu, cft_2xy_gpu`
			`USE fft_scatter_2d_gpu, ONLY : fft_scatter_many_columns_to_planes_send, &`
			`fft_scatter_many_columns_to_planes_store, &`
			`fft_scatter_many_planes_to_columns_send, &`
			`fft_scatter_many_planes_to_columns_store`
			`USE fft_types, ONLY : fft_type_descriptor`
			`USE fft_buffers, ONLY : check_buffers_size, &`
			`f_h => pin_space_scatter_in, &`
			`aux_h => pin_space_scatter_out, &`
			`aux_d => dev_space_fftparallel, &`
			`aux2_h => pin_space_scatter_dblbuffer, &`
			`aux2_d => dev_space_scatter_dblbuffer`
			`!`
			`IMPLICIT NONE`
			`!`
			`TYPE (fft_type_descriptor), INTENT(in) :: dfft`
			`! descriptor of fft data layout`
			`INTEGER, INTENT(in) :: isgn ! fft direction`
			`INTEGER, INTENT(in) :: batchsize`
			`COMPLEX(DP), DEVICE, INTENT(inout) :: f_d( batchsize * dfft%nnr ) ! array containing data to be transformed`
			`!`
			`INTEGER :: me_p, istat, i, j, currsize`
			`INTEGER :: n1, n2, n3, nx1, nx2, nx3, ncpx, nppx, proc`
			`COMPLEX(DP), ALLOCATABLE :: yf(:)`
			`INTEGER :: planes( dfft%nr1x )`
			`INTEGER :: sticks( dfft%nproc )`
			`INTEGER(kind = cuda_stream_kind) :: stream = 0`
			`!`
			`!`
			`n1 = dfft%nr1`
			`n2 = dfft%nr2`
			`n3 = dfft%nr3`
			`nx1 = dfft%nr1x`
			`nx2 = dfft%nr2x`
			`nx3 = dfft%nr3x`
			`!`
			`CALL check_buffers_size(dfft, batchsize)`
			`!`
			`me_p = dfft%mype + 1`
			`!`
			`ncpx = 0`
			`nppx = 0`
			`DO proc = 1, dfft%nproc`
			`IF ( abs(isgn) == 2 ) ncpx = max( ncpx, dfft%nsw ( proc ) )`
			`IF ( abs(isgn) == 1 ) ncpx = max( ncpx, dfft%nsp ( proc ) )`
			`nppx = max( nppx, dfft%nr3p ( proc ) )`
			`ENDDO`
			`IF ( abs(isgn) == 2 ) sticks = dfft%nsw`
			`IF ( abs(isgn) == 1 ) sticks = dfft%nsp`
			`!`
			`IF ( (abs(isgn) /= 2) .and. (abs(isgn) /= 1) ) &`
			`CALL fftx_error__( ' many_cft3s_gpu ', ' abs(isgn) /= 1 or 2 not implemented ', isgn )`
			`!`
			`IF (dfft%nproc <= 1) CALL fftx_error__( ' many_cft3s_gpu ', ' this subroutine should never be called with nproc= ', dfft%nproc )`
			`!`
Well begun is half done? 2020-11-12 16:48:39 +08:00			`! FFTs are done in sub-batches of dfft%subbatchsize (default is 4)`
			`! When a sub-batch has been transformed in a direction or a plane,`
			`! communication between device and host is started and the next subbatch is transformed.`
			`! Later, the subbatch is received on target MPI process and transformed`
			`! overlapping computation with MPI communication.`
			`!`
Add GPU support and 1D+2D decomposition to FFTXlib 2020-10-02 03:32:07 +08:00			`IF ( isgn > 0 ) THEN`
			`DO j = 0, batchsize-1, dfft%subbatchsize`
A few more comments 2020-11-27 05:41:22 +08:00			`! determine whether the FFTs that are left are less than the maximum`
			`! subbatchsize size.`
Add GPU support and 1D+2D decomposition to FFTXlib 2020-10-02 03:32:07 +08:00			`currsize = min(dfft%subbatchsize, batchsize - j)`
			`!`
			`IF ( isgn /= 2 ) THEN`
			`!`
			`planes = dfft%iplp`
			`!`
			`ELSE`
			`!`
			`planes = dfft%iplw`
			`!`
			`ENDIF`
			`!`
A few more comments 2020-11-27 05:41:22 +08:00			`! perform the FFT along one direction and, at the same time,`
			`! read data spaced by dfft%nnr and store in in the output`
			`! with spacing ncpx*nx3, making it easy to bach communication.`
Add GPU support and 1D+2D decomposition to FFTXlib 2020-10-02 03:32:07 +08:00			`DO i = 0, currsize - 1`
			`CALL cft_1z_gpu( f_d((j+i)dfft%nnr + 1:), sticks(me_p), n3, nx3, isgn, aux_d(jdfft%nnr + incpxnx3 +1:), dfft%a2a_comp )`
			`ENDDO`
A few more comments 2020-11-27 05:41:22 +08:00			`!`
Add GPU support and 1D+2D decomposition to FFTXlib 2020-10-02 03:32:07 +08:00			`i = cudaEventRecord(dfft%bevents(j/dfft%subbatchsize + 1), dfft%a2a_comp)`
			`i = cudaStreamWaitEvent(dfft%bstreams(j/dfft%subbatchsize + 1), dfft%bevents(j/dfft%subbatchsize + 1), 0)`

			`IF (j > 0) i = cudaStreamWaitEvent(dfft%bstreams(j/dfft%subbatchsize + 1), dfft%bevents(j/dfft%subbatchsize), 0)`

			`CALL fft_scatter_many_columns_to_planes_store( dfft, aux_d(jdfft%nnr + 1:), aux_h(jdfft%nnr + 1:), nx3, dfft%nnr, f_d(j*dfft%nnr + 1:), &`
			`f_h(jdfft%nnr + 1:), aux2_d(jdfft%nnr + 1:), aux2_h(j*dfft%nnr + 1:), sticks, dfft%nr3p, isgn, currsize, j/dfft%subbatchsize + 1 )`

			`ENDDO`

			`DO j = 0, batchsize-1, dfft%subbatchsize`
			`currsize = min(dfft%subbatchsize, batchsize - j)`

			`CALL fft_scatter_many_columns_to_planes_send( dfft, aux_d(jdfft%nnr + 1:), aux_h(jdfft%nnr + 1:), nx3, dfft%nnr, f_d(j*dfft%nnr + 1:), &`
			`f_h(jdfft%nnr + 1:), aux2_d(jdfft%nnr + 1:), aux2_h(j*dfft%nnr + 1:), sticks, dfft%nr3p, isgn, currsize, j/dfft%subbatchsize + 1 )`

			`IF (currsize == dfft%subbatchsize) THEN`
			`CALL cft_2xy_gpu( f_d(jdfft%nnr + 1:), aux_d(jdfft%nnr + 1:), currsize * nppx, n1, n2, nx1, nx2, isgn, dfft%a2a_comp, planes )`
			`ELSE`
			`DO i = 0, currsize - 1`
			`CALL cft_2xy_gpu( f_d((j+i)dfft%nnr + 1:), aux_d((j+i)dfft%nnr + 1:), dfft%nr3p( me_p ), n1, n2, nx1, nx2, isgn, &`
			`dfft%a2a_comp, planes )`
			`ENDDO`
			`ENDIF`

			`ENDDO`

			`! i = cudaDeviceSynchronize()`

			`!`
			`ELSE`
			`! i = cudaDeviceSynchronize()`

			`DO j = 0, batchsize-1, dfft%subbatchsize`
			`currsize = min(dfft%subbatchsize, batchsize - j)`
			`!`
			`IF ( isgn /= -2 ) THEN`
			`!`
			`planes = dfft%iplp`
			`!`
			`ELSE`
			`!`
			`planes = dfft%iplw`
			`!`
			`ENDIF`

			`IF (currsize == dfft%subbatchsize) THEN`
			`CALL cft_2xy_gpu( f_d(jdfft%nnr + 1:), aux_d(jdfft%nnr + 1:), currsize * nppx, n1, n2, nx1, nx2, isgn, dfft%a2a_comp, planes )`
			`ELSE`
			`DO i = 0, currsize - 1`
			`CALL cft_2xy_gpu( f_d((j+i)dfft%nnr + 1:), aux_d((j+i)dfft%nnr + 1:), dfft%nr3p( me_p ), n1, n2, nx1, nx2, isgn, dfft%a2a_comp, planes )`
			`ENDDO`
			`ENDIF`

			`IF (j > 0) i = cudaStreamWaitEvent(dfft%bstreams(j/dfft%subbatchsize + 1), dfft%bevents(j/dfft%subbatchsize), 0)`

			`CALL fft_scatter_many_planes_to_columns_store( dfft, aux_d(jdfft%nnr + 1:), aux_h(jdfft%nnr + 1:), nx3, dfft%nnr, f_d(j*dfft%nnr + 1:), &`
			`f_h(jdfft%nnr + 1:), aux2_d(jdfft%nnr + 1:), aux2_h(j*dfft%nnr + 1:), sticks, dfft%nr3p, isgn, currsize, j/dfft%subbatchsize + 1 )`

			`ENDDO`

			`DO j = 0, batchsize-1, dfft%subbatchsize`
			`currsize = min(dfft%subbatchsize, batchsize - j)`

			`CALL fft_scatter_many_planes_to_columns_send( dfft, aux_d(jdfft%nnr + 1:), aux_h(jdfft%nnr + 1:), nx3, dfft%nnr, f_d(j*dfft%nnr + 1:), &`
			`f_h(jdfft%nnr + 1:), aux2_d(jdfft%nnr + 1:), aux2_h(j*dfft%nnr + 1:), sticks, dfft%nr3p, isgn, currsize, j/dfft%subbatchsize + 1 )`


			`i = cudaEventRecord(dfft%bevents(j/dfft%subbatchsize + 1), dfft%bstreams(j/dfft%subbatchsize + 1))`
			`i = cudaStreamWaitEvent(dfft%a2a_comp, dfft%bevents(j/dfft%subbatchsize + 1), 0)`

			`DO i = 0, currsize - 1`
			`CALL cft_1z_gpu( aux_d(jdfft%nnr + incpxnx3 + 1:), sticks( me_p ), n3, nx3, isgn, f_d((j+i)dfft%nnr + 1:), dfft%a2a_comp )`
			`ENDDO`

			`ENDDO`
			`! i = cudaDeviceSynchronize()`
			`ENDIF`
			`!`
			`RETURN`
			`!`
			`END SUBROUTINE many_cft3s_gpu`
			`#endif`
			`!`
			`END MODULE fft_parallel_2d`