!
! Copyright (C) 2001 PWSCF group
! This file is distributed under the terms of the
! GNU General Public License. See the file `License'
! in the root directory of the present distribution,
! or http://www.gnu.org/copyleft/gpl.txt .
!
!
!-----------------------------------------------------------------------
subroutine reduce (size, ps)
  !-----------------------------------------------------------------------
  !
  !     sums a distributed variable s(size) over the processors.
  !     This version uses a fixed-length buffer of appropriate (?) size
  !                  uses shmem for the t3d/t3e, MPI otherwhise
  !
#undef SHMEM
  !
#include "machine.h"
#ifdef __PARA
  use para
#endif
  USE kinds, only : DP
  implicit none
  integer :: size

  real (kind=DP) :: ps (size)
#ifdef __PARA
  include 'mpif.h'
  integer :: info, n, nbuf
#define MAXB 10000
  real (kind=DP) :: buff (MAXB)
#ifdef SHMEM
  include 'mpp/shmem.fh'
  integer :: pWrkSync, pWrkData, start
  common / SH_SYNC / pWrkSync (SHMEM_BARRIER_SYNC_SIZE)
  common / SH_DATA / pWrkData (1024 * 1024)
  data pWrkData / 1048576 * 0 /
  data pWrkSync / SHMEM_BARRIER_SYNC_SIZE * SHMEM_SYNC_VALUE /
  !DIR$ CACHE_ALIGN /SH_SYNC/
  !DIR$ CACHE_ALIGN /SH_DATA/
#endif
  if (nprocp.le.1) return
  if (size.le.0) return
  call start_clock ('reduce')
  !
  !  syncronize processes - maybe unneeded on t3d but necessary on t3e !!!
  !
  call mpi_barrier (MPI_COMM_POOL, info)

  call errore ('reduce', 'error in barrier', info)

  nbuf = size / MAXB
#ifdef SHMEM
  start = (mypool - 1) * nprocp
#endif
  do n = 1, nbuf
#ifdef SHMEM
     call SHMEM_REAL8_SUM_TO_ALL (buff, ps (1 + (n - 1) * MAXB), &
          MAXB, start, 0, nprocp, pWrkData, pWrkSync)
#else
     call mpi_allreduce (ps (1 + (n - 1) * MAXB), buff, MAXB, &
          MPI_REAL8, MPI_SUM, MPI_COMM_POOL, info)
     call errore ('reduce', 'error in allreduce1', info)
#endif
     call DCOPY (MAXB, buff, 1, ps (1 + (n - 1) * MAXB), 1)
  enddo
  !
  !    possible remaining elements < maxb
  !
  if (size-nbuf * MAXB.gt.0) then
#ifdef SHMEM
     call SHMEM_REAL8_SUM_TO_ALL (buff, ps (1 + nbuf * MAXB), &
          size-nbuf * MAXB, start, 0, nprocp, pWrkData, pWrkSync)
#else
     call mpi_allreduce (ps (1 + nbuf * MAXB), buff, size-nbuf * &
          MAXB, MPI_REAL8, MPI_SUM, MPI_COMM_POOL, info)
     call errore ('reduce', 'error in allreduce2', info)
#endif
     call DCOPY (size-nbuf * MAXB, buff, 1, ps (1 + nbuf * MAXB), &
          1)

  endif
  call stop_clock ('reduce')
#endif
  return
end subroutine reduce
!
!-----------------------------------------------------------------------
subroutine ireduce (size, is)
  !-----------------------------------------------------------------------
  !
  !     sums a distributed variable is(size) over the processors.
  !
#include "machine.h"
#ifdef __PARA
  use para
#endif
  implicit none

  integer :: size, is (size)
#ifdef __PARA
  include 'mpif.h'
  integer :: info, n, m, nbuf
#define MAXI 500

  integer :: buff (MAXI)
  if (nprocp.le.1) return
  if (size.le.0) return
  !
  !  syncronize processes
  !
  call mpi_barrier (MPI_COMM_POOL, info)

  call errore ('reduce', 'error in barrier', info)

  nbuf = size / MAXI
  do n = 1, nbuf
     call mpi_allreduce (is (1 + (n - 1) * MAXI), buff, MAXI, &
          MPI_INTEGER, MPI_SUM, MPI_COMM_POOL, info)
     call errore ('ireduce', 'error in allreduce 1', info)
     do m = 1, MAXI
        is (m + (n - 1) * MAXI) = buff (m)
     enddo
  enddo
  !
  !    possible remaining elements < MAXI
  !
  if (size-nbuf * MAXI.gt.0) then
     call mpi_allreduce (is (1 + nbuf * MAXI), buff, size-nbuf * &
          MAXI, MPI_INTEGER, MPI_SUM, MPI_COMM_POOL, info)
     call errore ('reduce', 'error in allreduce 2', info)
     do m = 1, size-nbuf * MAXI
        is (m + nbuf * MAXI) = buff (m)
     enddo

  endif
#endif
  return
end subroutine ireduce