!
! Copyright (C) Quantum ESPRESSO group
!
! This file is distributed under the terms of the
! GNU General Public License. See the file `License'
! in the root directory of the present distribution,
! or http://www.gnu.org/copyleft/gpl.txt .
!
!----------------------------------------------------------------------
! FFT base Module.
! Written by Carlo Cavazzoni, modified by Paolo Giannozzi
!----------------------------------------------------------------------
!
!=----------------------------------------------------------------------=!
   MODULE scatter_mod
!=----------------------------------------------------------------------=!

        USE fft_types, ONLY: fft_type_descriptor
        USE task_groups, ONLY: task_groups_descriptor

        IMPLICIT NONE
#include "fft_param.f90"

        INTERFACE gather_grid
           MODULE PROCEDURE gather_real_grid, gather_complex_grid
        END INTERFACE

        INTERFACE scatter_grid
           MODULE PROCEDURE scatter_real_grid, scatter_complex_grid
        END INTERFACE

        SAVE

        PRIVATE

        PUBLIC :: fft_type_descriptor
        PUBLIC :: fft_scatter, gather_grid, scatter_grid
        PUBLIC :: cgather_sym, cgather_sym_many, cscatter_sym_many
        PUBLIC :: maps_sticks_to_3d

!=----------------------------------------------------------------------=!
      CONTAINS
!=----------------------------------------------------------------------=!
!
!
#if ! defined __NON_BLOCKING_SCATTER
!
!   ALLTOALL based SCATTER, should be better on network
!   with a defined topology, like on bluegene and cray machine
!
!-----------------------------------------------------------------------
SUBROUTINE fft_scatter ( dfft, f_in, nr3x, nxx_, f_aux, ncp_, npp_, isgn, dtgs )
  !-----------------------------------------------------------------------
  !
  ! transpose the fft grid across nodes
  ! a) From columns to planes (isgn > 0)
  !
  !    "columns" (or "pencil") representation:
  !    processor "me" has ncp_(me) contiguous columns along z
  !    Each column has nr3x elements for a fft of order nr3
  !    nr3x can be =nr3+1 in order to reduce memory conflicts.
  !
  !    The transpose take places in two steps:
  !    1) on each processor the columns are divided into slices along z
  !       that are stored contiguously. On processor "me", slices for
  !       processor "proc" are npp_(proc)*ncp_(me) big
  !    2) all processors communicate to exchange slices
  !       (all columns with z in the slice belonging to "me"
  !        must be received, all the others must be sent to "proc")
  !    Finally one gets the "planes" representation:
  !    processor "me" has npp_(me) complete xy planes
  !    f_in  contains input columns, is destroyed on output
  !    f_aux contains output planes
  !
  !  b) From planes to columns (isgn < 0)
  !
  !    Quite the same in the opposite direction
  !    f_aux contains input planes, is destroyed on output
  !    f_in  contains output columns
  !
  !  If optional argument "dtgs" is present the subroutines performs
  !  the trasposition using the Task Groups distribution
  !
  IMPLICIT NONE

  TYPE (fft_type_descriptor), INTENT(in) :: dfft
  INTEGER, INTENT(in)           :: nr3x, nxx_, isgn, ncp_ (:), npp_ (:)
  COMPLEX (DP), INTENT(inout)   :: f_in (nxx_), f_aux (nxx_)
  TYPE (task_groups_descriptor), OPTIONAL, INTENT(in) :: dtgs

#if defined(__MPI)

  INTEGER :: k, offset, proc, ierr, me, nprocp, gproc, gcomm, i, kdest, kfrom
  INTEGER :: me_p, nppx, mc, j, npp, nnp, ii, it, ip, ioff, sendsiz, ncpx, ipp, nblk, nsiz
  !
  LOGICAL :: use_tg_

  !
  !  Task Groups

  use_tg_ = .false.

  IF( present( dtgs ) ) use_tg_ = .true.

  me     = dfft%mype + 1
  !
  IF( use_tg_ ) THEN
    !  This is the number of procs. in the plane-wave group
     nprocp = dtgs%npgrp
  ELSE
     nprocp = dfft%nproc
  ENDIF
  !
  CALL start_clock ('fft_scatter')
  !
  ncpx = 0
  nppx = 0
  IF( use_tg_ ) THEN
     ncpx   = dtgs%tg_ncpx
     nppx   = dtgs%tg_nppx
     gcomm  = dtgs%pgrp_comm
  ELSE
     DO proc = 1, nprocp
        ncpx = max( ncpx, ncp_ ( proc ) )
        nppx = max( nppx, npp_ ( proc ) )
     ENDDO
     IF ( dfft%nproc == 1 ) THEN
        nppx = dfft%nr3x
     END IF
  ENDIF
  sendsiz = ncpx * nppx
  !

  ierr = 0
  IF (isgn.gt.0) THEN

     IF (nprocp==1) GO TO 10
     !
     ! "forward" scatter from columns to planes
     !
     ! step one: store contiguously the slices
     !
     offset = 0

     DO proc = 1, nprocp
        IF( use_tg_ ) THEN
           gproc = dtgs%nplist(proc)+1
        ELSE
           gproc = proc
        ENDIF
        !
        kdest = ( proc - 1 ) * sendsiz
        kfrom = offset
        !
        DO k = 1, ncp_ (me)
           DO i = 1, npp_ ( gproc )
              f_aux ( kdest + i ) =  f_in ( kfrom + i )
           ENDDO
           kdest = kdest + nppx
           kfrom = kfrom + nr3x
        ENDDO
        offset = offset + npp_ ( gproc )
     ENDDO

     !
     ! maybe useless; ensures that no garbage is present in the output
     !
     !! f_in = 0.0_DP
     !
     ! step two: communication
     !
     IF( use_tg_ ) THEN
        gcomm = dtgs%pgrp_comm
     ELSE
        gcomm = dfft%comm
     ENDIF

     CALL mpi_alltoall (f_aux(1), sendsiz, MPI_DOUBLE_COMPLEX, f_in(1), sendsiz, MPI_DOUBLE_COMPLEX, gcomm, ierr)

     IF( abs(ierr) /= 0 ) CALL fftx_error__ ('fft_scatter', 'info<>0', abs(ierr) )
     !
10   CONTINUE
     !
     f_aux = (0.d0, 0.d0)
     !
     IF( isgn == 1 ) THEN

        DO ip = 1, dfft%nproc
           ioff = dfft%iss( ip )
           it = ( ip - 1 ) * sendsiz
           DO i = 1, dfft%nsp( ip )
              mc = dfft%ismap( i + ioff )
              DO j = 1, dfft%npp( me )
                 f_aux( mc + ( j - 1 ) * dfft%nnp ) = f_in( j + it )
              ENDDO
              it = it + nppx
           ENDDO
        ENDDO

     ELSE

        IF( use_tg_ ) THEN
           npp  = dtgs%tg_npp( me )
           nnp  = dfft%nr1x * dfft%nr2x
        ELSE
           npp  = dfft%npp( me )
           nnp  = dfft%nnp
        ENDIF

        IF( use_tg_ ) THEN
           nblk = dfft%nproc / dtgs%nogrp
           nsiz = dtgs%nogrp
        ELSE
           nblk = dfft%nproc 
           nsiz = 1
        END IF
        !
        ip = 1
        !
        DO gproc = 1, nblk
           !
           ii = 0
           !
           DO ipp = 1, nsiz
              !
              ioff = dfft%iss( ip )
              !
              DO i = 1, dfft%nsw( ip )
                 !
                 mc = dfft%ismap( i + ioff )
                 !
                 it = ii * nppx + ( gproc - 1 ) * sendsiz
                 !
                 DO j = 1, npp
                    f_aux( mc + ( j - 1 ) * nnp ) = f_in( j + it )
                 ENDDO
                 !
                 ii = ii + 1
                 !
              ENDDO
              !
              ip = ip + 1
              !
           ENDDO
           !
        ENDDO

     END IF

  ELSE
     !
     !  "backward" scatter from planes to columns
     !
     IF( isgn == -1 ) THEN

        DO ip = 1, dfft%nproc
           ioff = dfft%iss( ip )
           it = ( ip - 1 ) * sendsiz
           DO i = 1, dfft%nsp( ip )
              mc = dfft%ismap( i + ioff )
              DO j = 1, dfft%npp( me )
                 f_in( j + it ) = f_aux( mc + ( j - 1 ) * dfft%nnp )
              ENDDO
              it = it + nppx
           ENDDO
        ENDDO

     ELSE

        IF( use_tg_ ) THEN
           npp  = dtgs%tg_npp( me )
           nnp  = dfft%nr1x * dfft%nr2x
        ELSE
           npp  = dfft%npp( me )
           nnp  = dfft%nnp
        ENDIF

        IF( use_tg_ ) THEN
           nblk = dtgs%nproc / dtgs%nogrp
           nsiz = dtgs%nogrp
        ELSE
           nblk = dfft%nproc 
           nsiz = 1
        END IF
        !
        ip = 1
        !
        DO gproc = 1, nblk
           !
           ii = 0
           !
           DO ipp = 1, nsiz
              !
              ioff = dfft%iss( ip )
              !
              DO i = 1, dfft%nsw( ip )
                 !
                 mc = dfft%ismap( i + ioff )
                 !
                 it = ii * nppx + ( gproc - 1 ) * sendsiz
                 !
                 DO j = 1, npp
                    f_in( j + it ) = f_aux( mc + ( j - 1 ) * nnp )
                 ENDDO
                 !
                 ii = ii + 1
                 !
              ENDDO
              !
              ip = ip + 1
              !
           ENDDO
           !
        ENDDO

     END IF

     IF( nprocp == 1 ) GO TO 20
     !
     !  step two: communication
     !
     IF( use_tg_ ) THEN
        gcomm = dtgs%pgrp_comm
     ELSE
        gcomm = dfft%comm
     ENDIF

     CALL mpi_alltoall (f_in(1), sendsiz, MPI_DOUBLE_COMPLEX, f_aux(1), sendsiz, MPI_DOUBLE_COMPLEX, gcomm, ierr)

     IF( abs(ierr) /= 0 ) CALL fftx_error__ ('fft_scatter', 'info<>0', abs(ierr) )
     !
     !  step one: store contiguously the columns
     !
     !! f_in = 0.0_DP
     !
     offset = 0

     DO proc = 1, nprocp
        IF( use_tg_ ) THEN
           gproc = dtgs%nplist(proc)+1
        ELSE
           gproc = proc
        ENDIF
        !
        kdest = ( proc - 1 ) * sendsiz
        kfrom = offset 
        !
        DO k = 1, ncp_ (me)
           DO i = 1, npp_ ( gproc )  
              f_in ( kfrom + i ) = f_aux ( kdest + i )
           ENDDO
           kdest = kdest + nppx
           kfrom = kfrom + nr3x
        ENDDO
        offset = offset + npp_ ( gproc )
     ENDDO

20   CONTINUE

  ENDIF

  CALL stop_clock ('fft_scatter')

#endif

  RETURN

END SUBROUTINE fft_scatter
!
#else
!
!   NON BLOCKING SCATTER, should be better on switched network
!   like infiniband, ethernet, myrinet
!
!-----------------------------------------------------------------------
SUBROUTINE fft_scatter ( dfft, f_in, nr3x, nxx_, f_aux, ncp_, npp_, isgn, dtgs )
  !-----------------------------------------------------------------------
  !
  ! transpose the fft grid across nodes
  ! a) From columns to planes (isgn > 0)
  !
  !    "columns" (or "pencil") representation:
  !    processor "me" has ncp_(me) contiguous columns along z
  !    Each column has nr3x elements for a fft of order nr3
  !    nr3x can be =nr3+1 in order to reduce memory conflicts.
  !
  !    The transpose take places in two steps:
  !    1) on each processor the columns are divided into slices along z
  !       that are stored contiguously. On processor "me", slices for
  !       processor "proc" are npp_(proc)*ncp_(me) big
  !    2) all processors communicate to exchange slices
  !       (all columns with z in the slice belonging to "me"
  !        must be received, all the others must be sent to "proc")
  !    Finally one gets the "planes" representation:
  !    processor "me" has npp_(me) complete xy planes
  !
  !  b) From planes to columns (isgn < 0)
  !
  !  Quite the same in the opposite direction
  !
  !  The output is overwritten on f_in ; f_aux is used as work space
  !
  !  If optional argument "dtgs" is present the subroutines performs
  !  the trasposition using the Task Groups distribution
  !
  IMPLICIT NONE

  TYPE (fft_type_descriptor), INTENT(in) :: dfft
  INTEGER, INTENT(in)           :: nr3x, nxx_, isgn, ncp_ (:), npp_ (:)
  COMPLEX (DP), INTENT(inout)   :: f_in (nxx_), f_aux (nxx_)
  TYPE (task_groups_descriptor), OPTIONAL, INTENT(in) :: dtgs

#if defined(__MPI)

  INTEGER :: k, offset, proc, ierr, me, nprocp, gproc, gcomm, i, kdest, kfrom
  INTEGER :: me_p, nppx, mc, j, npp, nnp, ii, it, ip, ioff, sendsiz, ncpx, ipp, nblk, nsiz, ijp
  INTEGER :: sh(dfft%nproc), rh(dfft%nproc)
  INTEGER :: istat( MPI_STATUS_SIZE )
  !
  INTEGER, SAVE, ALLOCATABLE :: indmap(:,:)
  INTEGER, SAVE, ALLOCATABLE :: indmap_bw(:)
  INTEGER, SAVE  :: nijp
  INTEGER, SAVE  :: dimref(4) = 0
  INTEGER, SAVE  :: dimref_bw(4) = 0
  !
  LOGICAL :: use_tg_
  !
  CALL start_clock ('fft_scatter')

  use_tg_ = .false.

  IF( present( dtgs ) ) use_tg_ = .true.

  me     = dfft%mype + 1
  !
  ncpx = 0
  nppx = 0
  IF( use_tg_ ) THEN
     !  This is the number of procs. in the plane-wave group
     nprocp = dtgs%npgrp
     ncpx   = dtgs%tg_ncpx
     nppx   = dtgs%tg_nppx
     gcomm  = dtgs%pgrp_comm
  ELSE
     nprocp = dfft%nproc
     DO proc = 1, nprocp
        ncpx = max( ncpx, ncp_ ( proc ) )
        nppx = max( nppx, npp_ ( proc ) )
     ENDDO
     IF ( dfft%nproc == 1 ) THEN
        nppx = dfft%nr3x
     END IF
     gcomm = dfft%comm
  ENDIF
  ! 
  sendsiz = ncpx * nppx
  !
  IF ( isgn .gt. 0 ) THEN
     !
     ! "forward" scatter from columns to planes
     !
     ! step one: store contiguously the slices
     !
     offset = 0

     IF( use_tg_ ) THEN
        DO proc = 1, nprocp
           gproc = dtgs%nplist(proc)+1
           kdest = ( proc - 1 ) * sendsiz
           kfrom = offset 
           DO k = 1, ncp_ (me)
              DO i = 1, npp_ ( gproc )
                 f_aux ( kdest + i ) =  f_in ( kfrom + i )
              ENDDO
              kdest = kdest + nppx
              kfrom = kfrom + nr3x
           ENDDO
           offset = offset + npp_ ( gproc )
           ! post the non-blocking send, f_aux can't be overwritten until operation has completed
           CALL mpi_isend( f_aux( (proc-1)*sendsiz + 1 ), sendsiz, MPI_DOUBLE_COMPLEX, proc-1, me, gcomm, sh( proc ), ierr )
           ! IF( abs(ierr) /= 0 ) CALL fftx_error__ ('fft_scatter', ' forward send info<>0', abs(ierr) )
        ENDDO
     ELSE
        DO proc = 1, nprocp
           kdest = ( proc - 1 ) * sendsiz
           kfrom = offset 
           DO k = 1, ncp_ (me)
              DO i = 1, npp_ ( proc )
                 f_aux ( kdest + i ) =  f_in ( kfrom + i )
              ENDDO
              kdest = kdest + nppx
              kfrom = kfrom + nr3x
           ENDDO
           offset = offset + npp_ ( proc )
           ! post the non-blocking send, f_aux can't be overwritten until operation has completed
           CALL mpi_isend( f_aux( (proc-1)*sendsiz + 1 ), sendsiz, MPI_DOUBLE_COMPLEX, proc-1, me, gcomm, sh( proc ), ierr )
           ! IF( abs(ierr) /= 0 ) CALL fftx_error__ ('fft_scatter', ' forward send info<>0', abs(ierr) )
        ENDDO
     ENDIF
     !
     ! step two: receive
     !
     DO proc = 1, nprocp
        !
        ! now post the receive
        !
        CALL mpi_irecv( f_in( (proc-1)*sendsiz + 1 ), sendsiz, MPI_DOUBLE_COMPLEX, proc-1, MPI_ANY_TAG, gcomm, rh( proc ), ierr )
        !IF( abs(ierr) /= 0 ) CALL fftx_error__ ('fft_scatter', ' forward receive info<>0', abs(ierr) )
        !
        !
     ENDDO
     !
     ! maybe useless; ensures that no garbage is present in the output
     !
     !f_in( nprocp*sendsiz + 1 : size( f_in )  ) = 0.0_DP
     !
     call mpi_waitall( nprocp, sh, MPI_STATUSES_IGNORE, ierr )
     !
     f_aux = (0.d0, 0.d0)
     !
     call mpi_waitall( nprocp, rh, MPI_STATUSES_IGNORE, ierr )
     !
     IF( isgn == 1 ) THEN

        DO ip = 1, dfft%nproc
           ioff = dfft%iss( ip )
           it = ( ip - 1 ) * sendsiz
           DO i = 1, dfft%nsp( ip )
              mc = dfft%ismap( i + ioff )
              DO j = 1, dfft%npp( me )
                 f_aux( mc + ( j - 1 ) * dfft%nnp ) = f_in( j + it )
              ENDDO
              it = it + nppx
           ENDDO
        ENDDO

     ELSE

        IF( use_tg_ ) THEN
           npp  = dtgs%tg_npp( me )
           nnp  = dfft%nr1x * dfft%nr2x
           nblk = dtgs%nproc / dtgs%nogrp
           nsiz = dtgs%nogrp
        ELSE
           npp  = dfft%npp( me )
           nnp  = dfft%nnp
           nblk = dfft%nproc 
           nsiz = 1
        ENDIF
        !
        IF( ( dimref(1) .ne. npp ) .or. ( dimref(2) .ne. nnp ) .or. &
            ( dimref(3) .ne. nblk ) .or. ( dimref(4) .ne. nsiz ) ) THEN
           !
           IF( ALLOCATED( indmap ) )  &
              DEALLOCATE( indmap )
           ALLOCATE( indmap(2,SIZE(f_aux)) )
           !
           ijp = 0
           !
           DO gproc = 1, nblk
              ii = 0
              DO ipp = 1, nsiz
                 ioff = dfft%iss( (gproc-1)*nsiz + ipp )
                 DO i = 1, dfft%nsw( (gproc-1)*nsiz + ipp )
                    mc = dfft%ismap( i + ioff )
                    it = ii * nppx + (gproc-1) * sendsiz
                    DO j = 1, npp
                       ijp = ijp + 1
                       indmap(1,ijp) = mc + ( j - 1 ) * nnp
                       indmap(2,ijp) = j + it 
                    ENDDO
                    ii = ii + 1
                 ENDDO
              ENDDO
           ENDDO
           !
           nijp = ijp
           CALL fftsort( nijp, indmap )
           dimref(1) = npp
           dimref(2) = nnp
           dimref(3) = nblk
           dimref(4) = nsiz
           !
        END IF
        !
        DO ijp = 1, nijp
           f_aux( indmap(1,ijp) ) = f_in( indmap(2,ijp) )
        END DO
        !
     END IF

  ELSE
     !
     !  "backward" scatter from planes to columns
     !
     IF( isgn == -1 ) THEN

        nblk = dfft%nproc 

        DO ip = 1, dfft%nproc
           ioff = dfft%iss( ip )
           it = ( ip - 1 ) * sendsiz
           DO i = 1, dfft%nsp( ip )
              mc = dfft%ismap( i + ioff )
              DO j = 1, dfft%npp( me )
                 f_in( j + it ) = f_aux( mc + ( j - 1 ) * dfft%nnp )
              ENDDO
              it = it + nppx
           ENDDO

           CALL mpi_isend( f_in( (ip-1)*sendsiz + 1 ), sendsiz, MPI_DOUBLE_COMPLEX, ip-1, me, gcomm, sh( ip ), ierr )
           ! IF( abs(ierr) /= 0 ) CALL fftx_error__ ('fft_scatter', ' backward send info<>0', abs(ierr) )

        ENDDO

        DO ip = 1, dfft%nproc
           CALL mpi_irecv( f_aux( (ip-1)*sendsiz + 1 ), sendsiz, MPI_DOUBLE_COMPLEX, ip-1, MPI_ANY_TAG, gcomm, rh(ip), ierr )
           ! IF( abs(ierr) /= 0 ) CALL fftx_error__ ('fft_scatter', ' backward receive info<>0', abs(ierr) )
        ENDDO

     ELSE

        IF( use_tg_ ) THEN
           npp  = dtgs%tg_npp( me )
           nnp  = dfft%nr1x * dfft%nr2x
           nblk = dtgs%nproc / dtgs%nogrp
           nsiz = dtgs%nogrp
        ELSE
           npp  = dfft%npp( me )
           nnp  = dfft%nnp
           nblk = dfft%nproc 
           nsiz = 1
        ENDIF
        !
        IF( ( dimref_bw(1) .ne. npp ) .or. ( dimref_bw(2) .ne. nnp ) .or. &
            ( dimref_bw(3) .ne. nblk ) .or. ( dimref_bw(4) .ne. nsiz ) ) THEN
           !
           IF( ALLOCATED( indmap_bw ) )  &
              DEALLOCATE( indmap_bw )
           !
           ALLOCATE( indmap_bw(SIZE(f_aux)) )
           !
           ijp = 0
           DO gproc = 1, nblk
              ii = 0
              DO ipp = 1, nsiz
                 ioff = dfft%iss(  (gproc-1)*nsiz + ipp  )
                 DO i = 1, dfft%nsw(  (gproc-1)*nsiz + ipp  )
                    mc = dfft%ismap( i + ioff )
                    it = ii * nppx + ( gproc - 1 ) * sendsiz
                    DO j = 1, npp
                       indmap_bw( j + ijp ) = mc + ( j - 1 ) * nnp
                    ENDDO
                    ijp = ijp + npp 
                    ii = ii + 1
                 ENDDO
              ENDDO
           ENDDO

           dimref_bw(1) = npp
           dimref_bw(2) = nnp
           dimref_bw(3) = nblk
           dimref_bw(4) = nsiz

        END IF


        ijp = 0
        DO gproc = 0, nblk-1
           ii = gproc * sendsiz
           DO ipp = gproc*nsiz+1, gproc*nsiz+nsiz
              DO i = 1, dfft%nsw(  ipp  )
                 DO j = 1, npp
                    f_in( j + ii ) = f_aux( indmap_bw( j + ijp ) )
                 ENDDO
                 ijp = ijp + npp
                 ii = ii + nppx 
              ENDDO
           ENDDO
           CALL mpi_isend( f_in( gproc*sendsiz + 1 ), sendsiz, MPI_DOUBLE_COMPLEX, gproc, me, gcomm, sh( gproc+1 ), ierr )
           ! IF( abs(ierr) /= 0 ) CALL fftx_error__ ('fft_scatter', ' backward send info<>0', abs(ierr) )
        ENDDO

        DO gproc = 1, nblk
           CALL mpi_irecv( f_aux( (gproc-1)*sendsiz + 1 ), sendsiz, MPI_DOUBLE_COMPLEX, gproc-1, MPI_ANY_TAG, gcomm, rh(gproc), ierr )
           ! IF( abs(ierr) /= 0 ) CALL fftx_error__ ('fft_scatter', ' backward receive info<>0', abs(ierr) )
        ENDDO

     END IF
     !
     call mpi_waitall( nblk, rh, MPI_STATUSES_IGNORE, ierr )
     call mpi_waitall( nblk, sh, MPI_STATUSES_IGNORE, ierr )
     !
     offset = 0

     IF( use_tg_ ) THEN
        DO proc = 1, nprocp
           gproc = dtgs%nplist(proc) + 1
           kdest = ( proc - 1 ) * sendsiz
           kfrom = offset 
           DO k = 1, ncp_ (me)
              DO i = 1, npp_ ( gproc )  
                 f_in ( kfrom + i ) = f_aux ( kdest + i )
              ENDDO
              kdest = kdest + nppx
              kfrom = kfrom + nr3x
           ENDDO
           offset = offset + npp_ ( gproc )
        ENDDO
     ELSE
        DO proc = 1, nprocp
           kdest = ( proc - 1 ) * sendsiz 
           kfrom = offset 
           DO k = 1, ncp_ (me)
              DO i = 1, npp_ ( proc )  
                 f_in ( kfrom + i ) = f_aux ( kdest + i )
              ENDDO
              kdest = kdest + nppx
              kfrom = kfrom + nr3x
           ENDDO
           offset = offset + npp_ ( proc )
        ENDDO
     ENDIF

  ENDIF

  CALL stop_clock ('fft_scatter')

#endif

  RETURN

END SUBROUTINE fft_scatter
!
#endif
!
!
SUBROUTINE maps_sticks_to_3d( dffts, dtgs, f_in, nxx_, f_aux, isgn )
  !
  ! this subroutine copy sticks stored in 1D array into the 3D array
  ! to be used with 3D FFT. 
  ! This is meant for the use of 3D scalar FFT in parallel build 
  ! once the data have been "rotated" to have a single band in a single task 
  !
  IMPLICIT NONE

  TYPE (fft_type_descriptor), INTENT(in) :: dffts
  TYPE (task_groups_descriptor), INTENT(in) :: dtgs
  INTEGER, INTENT(in)           :: nxx_, isgn
  COMPLEX (DP), INTENT(in)      :: f_in (nxx_)
  COMPLEX (DP), INTENT(out)     :: f_aux (nxx_)

  INTEGER :: ijp, ii, i, j, it, ioff, ipp, mc, jj, ip, gproc, nr12x
  !
  f_aux = 0.0d0
  !
  IF( isgn == 2 ) THEN
     ip = 1
     nr12x = dffts%nr1x * dffts%nr2x
     DO gproc = 1, dtgs%nproc / dtgs%nogrp
        ii = 0
        DO ipp = 1, dtgs%nogrp
           ioff = dffts%iss( ip )
           DO i = 1, dffts%nsw( ip )
              mc = dffts%ismap( i + ioff )
              it = ( ii + ( gproc - 1 ) * dtgs%tg_ncpx ) * dtgs%tg_nppx
              DO j = 1, dtgs%tg_npp( dffts%mype + 1 )
                 f_aux( mc + ( j - 1 ) * nr12x ) = f_in( j + it )
              ENDDO
              ii = ii + 1
           ENDDO
           ip = ip + 1
        ENDDO
     ENDDO
  ELSE
     CALL fftx_error__ (' maps_sticks_to_3d ', ' isgn .ne. 2  not implemented ', 999 )
  END IF
  RETURN
END SUBROUTINE maps_sticks_to_3d
!
!
!----------------------------------------------------------------------------
SUBROUTINE gather_real_grid ( dfft, f_in, f_out )
  !----------------------------------------------------------------------------
  !
  ! ... gathers a distributed real-space FFT grid to dfft%root, that is,
  ! ... the first processor of input descriptor dfft - version for real arrays
  !
  ! ... REAL*8  f_in  = distributed variable (dfft%nnr)
  ! ... REAL*8  f_out = gathered variable (dfft%nr1x*dfft%nr2x*dfft%nr3x)
  !
  IMPLICIT NONE
  !
  REAL(DP), INTENT(in) :: f_in (:)
  REAL(DP), INTENT(inout):: f_out(:)
  TYPE ( fft_type_descriptor ), INTENT(IN) :: dfft
  !
#if defined(__MPI)
  !
  INTEGER :: proc, info
  ! ... the following are automatic arrays
  INTEGER :: displs(0:dfft%nproc-1), recvcount(0:dfft%nproc-1)
  !
  IF( size( f_in ) < dfft%nnr ) &
     CALL fftx_error__( ' gather_grid ', ' f_in too small ', dfft%nnr-size( f_in ) )
  !
  CALL start_clock( 'gather_grid' )
  !
  DO proc = 0, ( dfft%nproc - 1 )
     !
     recvcount(proc) = dfft%nnp * dfft%npp(proc+1)
     IF ( proc == 0 ) THEN
        displs(proc) = 0
     ELSE
        displs(proc) = displs(proc-1) + recvcount(proc-1)
     ENDIF
     !
  ENDDO
  !
  ! ... the following check should be performed only on processor dfft%root
  ! ... otherwise f_out must be allocated on all processors even if not used
  !
  info = size( f_out ) - displs( dfft%nproc-1 ) - recvcount( dfft%nproc-1 )
  IF( info < 0 ) &
     CALL fftx_error__( ' gather_grid ', ' f_out too small ', -info )
  !
  info = 0
  !
  CALL MPI_GATHERV( f_in, recvcount(dfft%mype), MPI_DOUBLE_PRECISION, f_out, &
                    recvcount, displs, MPI_DOUBLE_PRECISION, dfft%root,      &
                    dfft%comm, info )
  !
  CALL fftx_error__( 'gather_grid', 'info<>0', info )
  !
  CALL stop_clock( 'gather_grid' )
  !
#else
  CALL fftx_error__('gather_grid', 'do not use in serial execution', 1)
#endif
  !
  RETURN
  !
END SUBROUTINE gather_real_grid

!----------------------------------------------------------------------------
SUBROUTINE gather_complex_grid ( dfft, f_in, f_out )
  !----------------------------------------------------------------------------
  !
  ! ... gathers a distributed real-space FFT grid to dfft%root, that is,
  ! ... the first processor of input descriptor dfft - complex arrays
  !
  ! ... COMPLEX*16  f_in  = distributed variable (dfft%nnr)
  ! ... COMPLEX*16  f_out = gathered variable (dfft%nr1x*dfft%nr2x*dfft%nr3x)
  !
  IMPLICIT NONE
  !
  COMPLEX(DP), INTENT(in) :: f_in (:)
  COMPLEX(DP), INTENT(inout):: f_out(:)
  TYPE ( fft_type_descriptor ), INTENT(IN) :: dfft
  !
#if defined(__MPI)
  !
  INTEGER :: proc, info
  ! ... the following are automatic arrays
  INTEGER :: displs(0:dfft%nproc-1), recvcount(0:dfft%nproc-1)
  !
  IF( 2*size( f_in ) < dfft%nnr ) &
     CALL fftx_error__( ' gather_grid ', ' f_in too small ', dfft%nnr-size( f_in ) )
  !
  CALL start_clock( 'gather_grid' )
  !
  DO proc = 0, ( dfft%nproc - 1 )
     !
     recvcount(proc) = 2*dfft%nnp * dfft%npp(proc+1)
     IF ( proc == 0 ) THEN
        displs(proc) = 0
     ELSE
        displs(proc) = displs(proc-1) + recvcount(proc-1)
     ENDIF
     !
  ENDDO
  !
  ! ... the following check should be performed only on processor dfft%root
  ! ... otherwise f_out must be allocated on all processors even if not used
  !
  info = 2*size( f_out ) - displs( dfft%nproc - 1 ) - recvcount( dfft%nproc-1 )
  IF( info < 0 ) &
     CALL fftx_error__( ' gather_grid ', ' f_out too small ', -info )
  !
  info = 0
  !
  CALL MPI_GATHERV( f_in, recvcount(dfft%mype), MPI_DOUBLE_PRECISION, f_out, &
                    recvcount, displs, MPI_DOUBLE_PRECISION, dfft%root,      &
                    dfft%comm, info )
  !
  CALL fftx_error__( 'gather_grid', 'info<>0', info )
  !
  CALL stop_clock( 'gather_grid' )
  !
#else
  CALL fftx_error__('gather_grid', 'do not use in serial execution', 1)
#endif
  !
  RETURN
  !
END SUBROUTINE gather_complex_grid

!----------------------------------------------------------------------------
SUBROUTINE scatter_real_grid ( dfft, f_in, f_out )
  !----------------------------------------------------------------------------
  !
  ! ... scatters a real-space FFT grid from dfft%root, first processor of
  ! ... input descriptor dfft, to all others - opposite of "gather_grid"
  !
  ! ... REAL*8  f_in  = gathered variable (dfft%nr1x*dfft%nr2x*dfft%nr3x)
  ! ... REAL*8  f_out = distributed variable (dfft%nnr)
  !
  IMPLICIT NONE
  !
  REAL(DP), INTENT(in) :: f_in (:)
  REAL(DP), INTENT(inout):: f_out(:)
  TYPE ( fft_type_descriptor ), INTENT(IN) :: dfft
  !
#if defined(__MPI)
  !
  INTEGER :: proc, info
  ! ... the following are automatic arrays
  INTEGER :: displs(0:dfft%nproc-1), sendcount(0:dfft%nproc-1)
  !
  IF( size( f_out ) < dfft%nnr ) &
     CALL fftx_error__( ' scatter_grid ', ' f_out too small ', dfft%nnr-size( f_in ) )
  !
  CALL start_clock( 'scatter_grid' )
  !
  DO proc = 0, ( dfft%nproc - 1 )
     !
     sendcount(proc) = dfft%nnp * dfft%npp(proc+1)
     IF ( proc == 0 ) THEN
        displs(proc) = 0
     ELSE
        displs(proc) = displs(proc-1) + sendcount(proc-1)
     ENDIF
     !
  ENDDO
  !
  ! ... the following check should be performed only on processor dfft%root
  ! ... otherwise f_in must be allocated on all processors even if not used
  !
  info = size( f_in ) - displs( dfft%nproc - 1 ) - sendcount( dfft%nproc - 1 )
  IF( info < 0 ) &
     CALL fftx_error__( ' scatter_grid ', ' f_in too small ', -info )
  !
  info = 0
  !
  CALL MPI_SCATTERV( f_in, sendcount, displs, MPI_DOUBLE_PRECISION,   &
                     f_out, sendcount(dfft%mype), MPI_DOUBLE_PRECISION, &
                     dfft%root, dfft%comm, info )
  !
  CALL fftx_error__( 'scatter_grid', 'info<>0', info )
  !
  IF ( sendcount(dfft%mype) /= dfft%nnr ) &
     f_out(sendcount(dfft%mype)+1:dfft%nnr) = 0.D0
  !
  CALL stop_clock( 'scatter_grid' )
  !
#else
  CALL fftx_error__('scatter_grid', 'do not use in serial execution', 1)
#endif
  !
  RETURN
  !
END SUBROUTINE scatter_real_grid
!----------------------------------------------------------------------------
SUBROUTINE scatter_complex_grid ( dfft, f_in, f_out )
  !----------------------------------------------------------------------------
  !
  ! ... scatters a real-space FFT grid from dfft%root, first processor of
  ! ... input descriptor dfft, to all others - opposite of "gather_grid"
  !
  ! ... COMPLEX*16  f_in  = gathered variable (dfft%nr1x*dfft%nr2x*dfft%nr3x)
  ! ... COMPLEX*16  f_out = distributed variable (dfft%nnr)
  !
  IMPLICIT NONE
  !
  COMPLEX(DP), INTENT(in) :: f_in (:)
  COMPLEX(DP), INTENT(inout):: f_out(:)
  TYPE ( fft_type_descriptor ), INTENT(IN) :: dfft
  !
#if defined(__MPI)
  !
  INTEGER :: proc, info
  ! ... the following are automatic arrays
  INTEGER :: displs(0:dfft%nproc-1), sendcount(0:dfft%nproc-1)
  !
  IF( 2*size( f_out ) < dfft%nnr ) &
     CALL fftx_error__( ' scatter_grid ', ' f_out too small ', dfft%nnr-size( f_in ) )
  !
  CALL start_clock( 'scatter_grid' )
  !
  DO proc = 0, ( dfft%nproc - 1 )
     !
     sendcount(proc) = 2*dfft%nnp * dfft%npp(proc+1)
     IF ( proc == 0 ) THEN
        displs(proc) = 0
     ELSE
        displs(proc) = displs(proc-1) + sendcount(proc-1)
     ENDIF
     !
  ENDDO
  !
  ! ... the following check should be performed only on processor dfft%root
  ! ... otherwise f_in must be allocated on all processors even if not used
  !
  info = 2*size( f_in ) - displs( dfft%nproc - 1 ) - sendcount( dfft%nproc - 1 )
  IF( info < 0 ) &
     CALL fftx_error__( ' scatter_grid ', ' f_in too small ', -info )
  !
  info = 0
  !
  CALL MPI_SCATTERV( f_in, sendcount, displs, MPI_DOUBLE_PRECISION,   &
                     f_out, sendcount(dfft%mype), MPI_DOUBLE_PRECISION, &
                     dfft%root, dfft%comm, info )
  !
  CALL fftx_error__( 'scatter_grid', 'info<>0', info )
  !
  IF ( sendcount(dfft%mype) /= dfft%nnr ) &
     f_out(sendcount(dfft%mype)+1:dfft%nnr) = 0.D0
  !
  CALL stop_clock( 'scatter_grid' )
  !
#else
  CALL fftx_error__('scatter_grid', 'do not use in serial execution', 1)
#endif
  !
  RETURN
  !
END SUBROUTINE scatter_complex_grid
!
! ... "gather"-like subroutines
!
!-----------------------------------------------------------------------
SUBROUTINE cgather_sym( dfftp, f_in, f_out )
  !-----------------------------------------------------------------------
  !
  ! ... gather complex data for symmetrization (used in phonon code)
  ! ... Differs from gather_grid because mpi_allgatherv is used instead
  ! ... of mpi_gatherv - all data is gathered on ALL processors
  ! ... COMPLEX*16  f_in  = distributed variable (nrxx)
  ! ... COMPLEX*16  f_out = gathered variable (nr1x*nr2x*nr3x)
  !
  IMPLICIT NONE
  !
  TYPE (fft_type_descriptor), INTENT(in) :: dfftp
  COMPLEX(DP) :: f_in( : ), f_out(:)
  !
#if defined(__MPI)
  !
  INTEGER :: proc, info
  INTEGER :: displs(0:dfftp%nproc-1), recvcount(0:dfftp%nproc-1)
  !
  !
  CALL start_clock( 'cgather' )
  !
  DO proc = 0, ( dfftp%nproc - 1 )
     !
     recvcount(proc) = 2 * dfftp%nnp * dfftp%npp(proc+1)
     IF ( proc == 0 ) THEN
        displs(proc) = 0
     ELSE
        displs(proc) = displs(proc-1) + recvcount(proc-1)
     ENDIF
     !
  ENDDO
  !
  CALL MPI_BARRIER( dfftp%comm, info )
  !
  CALL MPI_ALLGATHERV( f_in, recvcount(dfftp%mype), MPI_DOUBLE_PRECISION, &
                       f_out, recvcount, displs, MPI_DOUBLE_PRECISION, &
                       dfftp%comm, info )
  !
  CALL fftx_error__( 'cgather_sym', 'info<>0', info )
  !
  CALL stop_clock( 'cgather' )
  !
#else
  CALL fftx_error__('cgather_sym', 'do not use in serial execution', 1)
#endif
  !
  RETURN
  !
END SUBROUTINE cgather_sym
!
!
!-----------------------------------------------------------------------
SUBROUTINE cgather_sym_many( dfftp, f_in, f_out, nbnd, nbnd_proc, start_nbnd_proc )
  !-----------------------------------------------------------------------
  !
  ! ... Written by A. Dal Corso
  !
  ! ... This routine generalizes cgather_sym, receiveng nbnd complex 
  ! ... distributed functions and collecting nbnd_proc(dfftp%mype+1) 
  ! ... functions in each processor.
  ! ... start_nbnd_proc(dfftp%mype+1), says where the data for each processor
  ! ... start in the distributed variable
  ! ... COMPLEX*16  f_in  = distributed variable (nrxx,nbnd)
  ! ... COMPLEX*16  f_out = gathered variable (nr1x*nr2x*nr3x, 
  !                                             nbnd_proc(dfftp%mype+1))
  !
  IMPLICIT NONE
  !
  TYPE (fft_type_descriptor), INTENT(in) :: dfftp
  INTEGER :: nbnd, nbnd_proc(dfftp%nproc), start_nbnd_proc(dfftp%nproc)
  COMPLEX(DP) :: f_in(dfftp%nnr,nbnd)
  COMPLEX(DP) :: f_out(dfftp%nnp*dfftp%nr3x,nbnd_proc(dfftp%mype+1))
  !
#if defined(__MPI)
  !
  INTEGER :: proc, info
  INTEGER :: ibnd, jbnd
  INTEGER :: displs(0:dfftp%nproc-1), recvcount(0:dfftp%nproc-1)
  !
  !
  CALL start_clock( 'cgather' )
  !
  DO proc = 0, ( dfftp%nproc - 1 )
     !
     recvcount(proc) = 2 * dfftp%nnp * dfftp%npp(proc+1)
     !
     IF ( proc == 0 ) THEN
        !
        displs(proc) = 0
        !
     ELSE
        !
        displs(proc) = displs(proc-1) + recvcount(proc-1)
        !
     ENDIF
     !
  ENDDO
  !
  CALL MPI_BARRIER( dfftp%comm, info )
  !
  DO proc = 0, dfftp%nproc - 1
     DO ibnd = 1, nbnd_proc(proc+1)
        jbnd = start_nbnd_proc(proc+1) + ibnd - 1
        CALL MPI_GATHERV( f_in(1,jbnd), recvcount(dfftp%mype), &
                        MPI_DOUBLE_PRECISION, f_out(1,ibnd), recvcount, &
                        displs, MPI_DOUBLE_PRECISION, proc, dfftp%comm, info )
     END DO
  END DO
  !
  CALL fftx_error__( 'cgather_sym_many', 'info<>0', info )
  !
!  CALL mp_barrier( dfftp%comm )
  !
  CALL stop_clock( 'cgather' )
  !
#else
  CALL fftx_error__('cgather_sym_many', 'do not use in serial execution', 1)
#endif
  !
  RETURN
  !
END SUBROUTINE cgather_sym_many
!
!----------------------------------------------------------------------------
SUBROUTINE cscatter_sym_many( dfftp, f_in, f_out, target_ibnd, nbnd, nbnd_proc, &
                              start_nbnd_proc   )
  !----------------------------------------------------------------------------
  !
  ! ... Written by A. Dal Corso
  !
  ! ... generalizes cscatter_sym. It assumes that each processor has
  ! ... a certain number of bands (nbnd_proc(dfftp%mype+1)). The processor 
  ! ... that has target_ibnd scatters it to all the other processors 
  ! ... that receive a distributed part of the target function. 
  ! ... start_nbnd_proc(dfftp%mype+1) is used to identify the processor
  ! ... that has the required band
  !
  ! ... COMPLEX*16  f_in  = gathered variable (nr1x*nr2x*nr3x,
  !                                                nbnd_proc(dfftp%mype+1) )
  ! ... COMPLEX*16  f_out = distributed variable (nrxx)
  !
  IMPLICIT NONE
  !
  TYPE (fft_type_descriptor), INTENT(in) :: dfftp
  INTEGER :: nbnd, nbnd_proc(dfftp%nproc), start_nbnd_proc(dfftp%nproc)
  COMPLEX(DP) :: f_in(dfftp%nnp*dfftp%nr3x,nbnd_proc(dfftp%mype+1))
  COMPLEX(DP) :: f_out(dfftp%nnr)
  INTEGER :: target_ibnd
  !
#if defined(__MPI)
  !
  INTEGER :: proc, info
  INTEGER :: displs(0:dfftp%nproc-1), sendcount(0:dfftp%nproc-1)
  INTEGER :: ibnd, jbnd
  !
  !
  CALL start_clock( 'cscatter_sym' )
  !
  DO proc = 0, ( dfftp%nproc - 1 )
     !
     sendcount(proc) = 2 * dfftp%nnp * dfftp%npp(proc+1)
     !
     IF ( proc == 0 ) THEN
        !
        displs(proc) = 0
        !
     ELSE
        !
        displs(proc) = displs(proc-1) + sendcount(proc-1)
        !
     ENDIF
     !
  ENDDO
  !
  f_out = (0.0_DP, 0.0_DP)
  !
  CALL MPI_BARRIER( dfftp%comm, info )
  !
  DO proc = 0, dfftp%nproc - 1
     DO ibnd = 1, nbnd_proc(proc+1)
        jbnd = start_nbnd_proc(proc+1) + ibnd - 1
        IF (jbnd==target_ibnd) &
        CALL MPI_SCATTERV( f_in(1,ibnd), sendcount, displs, &
               MPI_DOUBLE_PRECISION, f_out, sendcount(dfftp%mype), &
               MPI_DOUBLE_PRECISION, proc, dfftp%comm, info )
     ENDDO
  ENDDO
  !
  CALL fftx_error__( 'cscatter_sym_many', 'info<>0', info )
  !
  CALL stop_clock( 'cscatter_sym' )
  !
#else
  CALL fftx_error__('cscatter_sym_many', 'do not use in serial execution', 1)
#endif
  !
  RETURN
  !
END SUBROUTINE cscatter_sym_many


!=----------------------------------------------------------------------=!
   END MODULE scatter_mod
!=----------------------------------------------------------------------=!
!
!
!---------------------------------------------------------------------
subroutine fftsort (n, ia)  
  !---------------------------------------------------------------------
  ! sort an integer array ia(1:n) into ascending order using heapsort algorithm.
  ! n is input, ia is replaced on output by its sorted rearrangement.
  ! create an index table (ind) by making an exchange in the index array
  ! whenever an exchange is made on the sorted data array (ia).
  ! in case of equal values in the data array (ia) the values in the
  ! index array (ind) are used to order the entries.
  ! if on input ind(1)  = 0 then indices are initialized in the routine,
  ! if on input ind(1) != 0 then indices are assumed to have been
  !                initialized before entering the routine and these
  !                indices are carried around during the sorting process
  !
  ! no work space needed !
  ! free us from machine-dependent sorting-routines !
  !
  ! adapted from Numerical Recipes pg. 329 (new edition)
  !
  implicit none  
  !-input/output variables
  integer :: n  
  integer :: ia (2,n)  
  !-local variables
  integer :: i, ir, j, l
  integer :: iia(2)  
  ! nothing to order
  if (n.lt.2) return  
  ! initialize indices for hiring and retirement-promotion phase
  l = n / 2 + 1  
  ir = n  
10 continue  
  ! still in hiring phase
  if (l.gt.1) then  
     l = l - 1  
     iia(:) = ia (:,l)  
     ! in retirement-promotion phase.
  else  
     ! clear a space at the end of the array
     iia(:) = ia (:,ir)  
     !
     ! retire the top of the heap into it
     ia (:,ir) = ia (:,1)  
     !
     ! decrease the size of the corporation
     ir = ir - 1  
     ! done with the last promotion
     if (ir.eq.1) then  
        ! the least competent worker at all !
        ia (:,1) = iia(:)  
        !
        return  
     endif
  endif
  ! wheter in hiring or promotion phase, we
  i = l  
  ! set up to place iia in its proper level
  j = l + l  
  !
  do while (j.le.ir)  
     if (j.lt.ir) then  
        if (ia (1,j) .lt. ia (1,j + 1) ) then  
           j = j + 1  
        endif
     endif
     ! demote iia
     if (iia(1).lt.ia (1,j) ) then  
        ia (:,i) = ia (:,j)  
        i = j  
        j = j + j  
     else  
        ! set j to terminate do-while loop
        j = ir + 1  
     endif
  enddo
  ia (:,i) = iia(:)  
  goto 10  
  !
end subroutine fftsort