mirror of https://gitlab.com/QEF/q-e.git
!
! Copyright (C) 2002-2013 Quantum ESPRESSO group
! This file is distributed under the terms of the
! GNU General Public License. See the file `License'
! in the root directory of the present distribution,
! or http://www.gnu.org/copyleft/gpl.txt .
!
! This module contains interfaces to most low-level MPI operations:
! initialization and stopping, broadcast, parallel sum, etc.
!
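! A minimal usage sketch (hypothetical driver code, not part of this
! module; the communicator handle `world_comm` and all variable names
! are illustrative only):
!
!    USE mp, ONLY : mp_start, mp_bcast, mp_sum, mp_end
!    INTEGER :: nproc, mpime, ndata
!    CALL mp_start( nproc, mpime, world_comm )  ! query size and rank
!    IF ( mpime == 0 ) ndata = 10
!    CALL mp_bcast( ndata, 0, world_comm )      ! broadcast from task 0
!    CALL mp_sum( ndata, world_comm )           ! sum over all tasks, in place
!    CALL mp_end( world_comm )
!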
!------------------------------------------------------------------------------!
MODULE mp
!------------------------------------------------------------------------------!
  USE util_param, ONLY : DP, stdout, i8b
#if defined(__CUDA)
  USE cudafor, ONLY : cudamemcpy, cudamemcpy2d, &
                    & cudaMemcpyDeviceToDevice, &
                    & cudaDeviceSynchronize
#endif
  !
  IMPLICIT NONE
  PRIVATE
  !
  PUBLIC :: mp_start, mp_abort, mp_stop, mp_end, &
    mp_bcast, mp_sum, mp_max, mp_min, mp_rank, mp_size, &
    mp_gather, mp_alltoall, mp_get, mp_put, &
    mp_barrier, mp_report, mp_group_free, &
    mp_root_sum, mp_comm_free, mp_comm_create, mp_comm_group, &
    mp_group_create, mp_comm_split, mp_set_displs, &
    mp_circular_shift_left, mp_circular_shift_left_start, &
    mp_get_comm_null, mp_get_comm_self, mp_count_nodes, &
    mp_type_create_column_section, mp_type_create_row_section, mp_type_free, &
    mp_allgather, mp_waitall, mp_testall
  !
  INTERFACE mp_bcast
    MODULE PROCEDURE mp_bcast_i1, mp_bcast_r1, mp_bcast_c1, &
      mp_bcast_z, mp_bcast_zv, &
      mp_bcast_iv, mp_bcast_i8v, mp_bcast_rv, mp_bcast_cv, mp_bcast_l, mp_bcast_rm, &
      mp_bcast_cm, mp_bcast_im, mp_bcast_it, mp_bcast_i4d, mp_bcast_rt, mp_bcast_lv, &
      mp_bcast_lm, mp_bcast_r4d, mp_bcast_r5d, mp_bcast_ct, mp_bcast_c4d, &
      mp_bcast_c5d, mp_bcast_c6d
#if defined(__CUDA)
    MODULE PROCEDURE mp_bcast_i1_gpu, mp_bcast_r1_gpu, mp_bcast_c1_gpu, &
      !mp_bcast_z_gpu, mp_bcast_zv_gpu, &
      mp_bcast_iv_gpu, mp_bcast_rv_gpu, mp_bcast_cv_gpu, mp_bcast_l_gpu, mp_bcast_rm_gpu, &
      mp_bcast_cm_gpu, mp_bcast_im_gpu, mp_bcast_it_gpu, mp_bcast_i4d_gpu, mp_bcast_rt_gpu, mp_bcast_lv_gpu, &
      mp_bcast_lm_gpu, mp_bcast_r4d_gpu, mp_bcast_r5d_gpu, mp_bcast_ct_gpu, mp_bcast_c4d_gpu, &
      mp_bcast_c5d_gpu, mp_bcast_c6d_gpu
#endif
  END INTERFACE
  !
  INTERFACE mp_sum
    MODULE PROCEDURE mp_sum_i1, mp_sum_iv, mp_sum_i8v, mp_sum_im, mp_sum_it, mp_sum_i4, mp_sum_i5, &
      mp_sum_r1, mp_sum_rv, mp_sum_rm, mp_sum_rm_nc, mp_sum_rt, mp_sum_r4d, &
      mp_sum_c1, mp_sum_cv, mp_sum_cm, mp_sum_cm_nc, mp_sum_ct, mp_sum_c4d, &
      mp_sum_c5d, mp_sum_c6d, mp_sum_rmm, mp_sum_cmm, mp_sum_r5d, &
      mp_sum_r6d
#if defined(__CUDA)
    MODULE PROCEDURE mp_sum_i1_gpu, mp_sum_iv_gpu, mp_sum_im_gpu, mp_sum_it_gpu, &
      mp_sum_r1_gpu, mp_sum_rv_gpu, mp_sum_rm_gpu, mp_sum_rm_nc_gpu, mp_sum_rt_gpu, mp_sum_r4d_gpu, &
      mp_sum_c1_gpu, mp_sum_cv_gpu, mp_sum_cm_gpu, mp_sum_cm_nc_gpu, mp_sum_ct_gpu, mp_sum_c4d_gpu, &
      mp_sum_c5d_gpu, mp_sum_c6d_gpu, mp_sum_rmm_gpu, mp_sum_cmm_gpu, mp_sum_r5d_gpu, &
      mp_sum_r6d_gpu
#endif
  END INTERFACE
  !
  INTERFACE mp_root_sum
    MODULE PROCEDURE mp_root_sum_rm, mp_root_sum_cm
#if defined(__CUDA)
    MODULE PROCEDURE mp_root_sum_rm_gpu, mp_root_sum_cm_gpu
#endif
  END INTERFACE
  !
  INTERFACE mp_get
    MODULE PROCEDURE mp_get_r1, mp_get_rv, mp_get_cv, mp_get_i1, mp_get_iv, mp_get_rm, mp_get_cm
#if defined(__CUDA)
    MODULE PROCEDURE mp_get_r1_gpu, mp_get_rv_gpu, mp_get_cv_gpu, mp_get_i1_gpu, mp_get_iv_gpu, &
      mp_get_rm_gpu, mp_get_cm_gpu
#endif
  END INTERFACE
  !
  INTERFACE mp_put
    MODULE PROCEDURE mp_put_rv, mp_put_cv, mp_put_i1, mp_put_iv, &
      mp_put_rm
#if defined(__CUDA)
    MODULE PROCEDURE mp_put_rv_gpu, mp_put_cv_gpu, mp_put_i1_gpu, mp_put_iv_gpu, &
      mp_put_rm_gpu
#endif
  END INTERFACE
  !
  INTERFACE mp_max
    MODULE PROCEDURE mp_max_i, mp_max_r, mp_max_rv, mp_max_iv
#if defined(__CUDA)
    MODULE PROCEDURE mp_max_i_gpu, mp_max_r_gpu, mp_max_rv_gpu, mp_max_iv_gpu
#endif
  END INTERFACE
  !
  INTERFACE mp_min
    MODULE PROCEDURE mp_min_i, mp_min_r, mp_min_rv, mp_min_iv
#if defined(__CUDA)
    MODULE PROCEDURE mp_min_i_gpu, mp_min_r_gpu, mp_min_rv_gpu, mp_min_iv_gpu
#endif
  END INTERFACE
  !
  INTERFACE mp_gather
    MODULE PROCEDURE mp_gather_i1, mp_gather_iv, mp_gatherv_rv, mp_gatherv_iv, &
      mp_gatherv_rm, mp_gatherv_im, mp_gatherv_cv, &
      mp_gatherv_inplace_cplx_array
#if defined(__CUDA)
    MODULE PROCEDURE mp_gather_i1_gpu, mp_gather_iv_gpu, mp_gatherv_rv_gpu, mp_gatherv_iv_gpu, &
      mp_gatherv_rm_gpu, mp_gatherv_im_gpu, mp_gatherv_cv_gpu, mp_gatherv_inplace_cplx_array_gpu
#endif
  END INTERFACE
  !
  INTERFACE mp_allgather
    MODULE PROCEDURE mp_allgatherv_inplace_cplx_array
    MODULE PROCEDURE mp_allgatherv_inplace_real_array
#if defined(__CUDA)
    MODULE PROCEDURE mp_allgatherv_inplace_cplx_array_gpu
    MODULE PROCEDURE mp_allgatherv_inplace_real_array_gpu
#endif
  END INTERFACE
  !
  INTERFACE mp_alltoall
    MODULE PROCEDURE mp_alltoall_c3d, mp_alltoall_i3d
#if defined(__CUDA)
    MODULE PROCEDURE mp_alltoall_c3d_gpu, mp_alltoall_i3d_gpu
#endif
  END INTERFACE
  !
  INTERFACE mp_circular_shift_left
    MODULE PROCEDURE mp_circular_shift_left_i0, &
      mp_circular_shift_left_i1, &
      mp_circular_shift_left_i2, &
      mp_circular_shift_left_r2d, &
      mp_circular_shift_left_c2d
#if defined(__CUDA)
    MODULE PROCEDURE mp_circular_shift_left_i0_gpu, &
      mp_circular_shift_left_i1_gpu, &
      mp_circular_shift_left_i2_gpu, &
      mp_circular_shift_left_r2d_gpu, &
      mp_circular_shift_left_c2d_gpu
#endif
  END INTERFACE
  !
  INTERFACE mp_circular_shift_left_start
    MODULE PROCEDURE mp_circular_shift_left_start_i0, &
      mp_circular_shift_left_start_i1, &
      mp_circular_shift_left_start_i2, &
      mp_circular_shift_left_start_r2d, &
      mp_circular_shift_left_start_c2d
  END INTERFACE
  !
  INTERFACE mp_type_create_column_section
    MODULE PROCEDURE mp_type_create_cplx_column_section
    MODULE PROCEDURE mp_type_create_real_column_section
#if defined(__CUDA)
    MODULE PROCEDURE mp_type_create_cplx_column_section_gpu
    MODULE PROCEDURE mp_type_create_real_column_section_gpu
#endif
  END INTERFACE

  INTERFACE mp_type_create_row_section
    MODULE PROCEDURE mp_type_create_cplx_row_section
    MODULE PROCEDURE mp_type_create_real_row_section
#if defined(__CUDA)
    MODULE PROCEDURE mp_type_create_cplx_row_section_gpu
    MODULE PROCEDURE mp_type_create_real_row_section_gpu
#endif
  END INTERFACE
!------------------------------------------------------------------------------!
!
CONTAINS
!
!------------------------------------------------------------------------------!
!
!------------------------------------------------------------------------------!
!..mp_gather_i1
SUBROUTINE mp_gather_i1(mydata, alldata, root, gid)
  USE parallel_include
  IMPLICIT NONE
  INTEGER, INTENT(IN) :: mydata, root
  INTEGER, INTENT(IN) :: gid
  INTEGER :: group
  INTEGER, INTENT(OUT) :: alldata(:)
  INTEGER :: ierr

#if defined(__MPI)
  group = gid
  CALL MPI_GATHER(mydata, 1, MPI_INTEGER, alldata, 1, MPI_INTEGER, root, group, IERR)
  IF (ierr/=0) CALL mp_stop( 8013 )
#else
  alldata(1) = mydata
#endif
  RETURN
END SUBROUTINE mp_gather_i1
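
!
! Example (hypothetical sketch; `comm`, `nproc` and variable names are
! illustrative): gather one integer from every task into an array on
! task 0; `nk_per_task` is only significant on the root.
!
!    INTEGER :: nkl, nk_per_task(nproc)
!    CALL mp_gather( nkl, nk_per_task, 0, comm )
!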

!------------------------------------------------------------------------------!
!..mp_gather_iv
!..Carlo Cavazzoni
SUBROUTINE mp_gather_iv(mydata, alldata, root, gid)
  USE parallel_include
  IMPLICIT NONE
  INTEGER, INTENT(IN) :: mydata(:), root
  INTEGER, INTENT(IN) :: gid
  INTEGER :: group
  INTEGER, INTENT(OUT) :: alldata(:,:)
  INTEGER :: msglen, ierr

#if defined(__MPI)
  msglen = SIZE(mydata)
  IF( msglen .NE. SIZE(alldata, 1) ) CALL mp_stop( 8014 )
  group = gid
  CALL MPI_GATHER(mydata, msglen, MPI_INTEGER, alldata, msglen, MPI_INTEGER, root, group, IERR)
  IF (ierr/=0) CALL mp_stop( 8014 )
#else
  msglen = SIZE(mydata)
  IF( msglen .NE. SIZE(alldata, 1) ) CALL mp_stop( 8014 )
  alldata(:,1) = mydata(:)
#endif
  RETURN
END SUBROUTINE mp_gather_iv

!
!------------------------------------------------------------------------------!
!..mp_start
SUBROUTINE mp_start(numtask, taskid, group)

  ! ...
  USE parallel_include
  IMPLICIT NONE
  INTEGER, INTENT (OUT) :: numtask, taskid
  INTEGER, INTENT (IN) :: group
  INTEGER :: ierr
  ! ...
  ierr = 0
  numtask = 1
  taskid = 0

#if defined(__MPI)
  IF (ierr/=0) CALL mp_stop( 8004 )
  CALL mpi_comm_rank(group,taskid,ierr)
  IF (ierr/=0) CALL mp_stop( 8005 )
  CALL mpi_comm_size(group,numtask,ierr)
  IF (ierr/=0) CALL mp_stop( 8006 )
  ! ...
  CALL allocate_buffers()
#if defined(__CUDA)
  CALL allocate_buffers_gpu()
#endif
#endif
  RETURN
END SUBROUTINE mp_start
!
!------------------------------------------------------------------------------!
!..mp_abort

SUBROUTINE mp_abort(errorcode,gid)
  IMPLICIT NONE
  INTEGER :: ierr
  INTEGER, INTENT(IN):: errorcode, gid
#if defined(__MPI)
  CALL deallocate_buffers()
#if defined(__CUDA)
  CALL deallocate_buffers_gpu()
#endif
  CALL mpi_abort(gid, errorcode, ierr)
#endif
END SUBROUTINE mp_abort
!
!------------------------------------------------------------------------------!
!..mp_end

SUBROUTINE mp_end(groupid, cleanup)
  USE parallel_include
  IMPLICIT NONE
  INTEGER, INTENT(IN) :: groupid
  LOGICAL, OPTIONAL, INTENT(IN) :: cleanup
  INTEGER :: ierr, taskid
  LOGICAL :: cleanup_

  ierr = 0
  taskid = 0

#if defined(__MPI)
  CALL mpi_comm_rank( groupid, taskid, ierr)
  cleanup_ = .FALSE.
  IF (PRESENT(cleanup)) cleanup_ = cleanup
  IF(cleanup_) THEN
    CALL deallocate_buffers()
#if defined(__CUDA)
    CALL deallocate_buffers_gpu()
#endif
  END IF
#endif
  RETURN
END SUBROUTINE mp_end

!------------------------------------------------------------------------------!
!..mp_comm_group

SUBROUTINE mp_comm_group( comm, group )
  USE parallel_include
  IMPLICIT NONE
  INTEGER, INTENT (IN) :: comm
  INTEGER, INTENT (OUT) :: group
  INTEGER :: ierr
  ierr = 0
#if defined(__MPI)
  CALL mpi_comm_group( comm, group, ierr )
  IF (ierr/=0) CALL mp_stop( 8007 )
#else
  group = 0
#endif
END SUBROUTINE mp_comm_group

SUBROUTINE mp_comm_split( old_comm, color, key, new_comm )
  USE parallel_include
  IMPLICIT NONE
  INTEGER, INTENT (IN) :: old_comm
  INTEGER, INTENT (IN) :: color, key
  INTEGER, INTENT (OUT) :: new_comm
  INTEGER :: ierr
  ierr = 0
#if defined(__MPI)
  CALL MPI_COMM_SPLIT( old_comm, color, key, new_comm, ierr )
  IF (ierr/=0) CALL mp_stop( 8008 )
#else
  new_comm = old_comm
#endif
END SUBROUTINE mp_comm_split
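
!
! Example (hypothetical sketch; `world_comm`, `mpime`, `nproc` and
! `nimage` are illustrative): split a communicator into `nimage`
! sub-communicators, grouping tasks by color and ranking them by key.
!
!    INTEGER :: image_comm, color, key
!    color = mpime / ( nproc / nimage )    ! which image this task joins
!    key   = MOD( mpime, nproc / nimage )  ! rank inside the image
!    CALL mp_comm_split( world_comm, color, key, image_comm )
!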

SUBROUTINE mp_group_create( group_list, group_size, old_grp, new_grp )
  USE parallel_include
  IMPLICIT NONE
  INTEGER, INTENT (IN) :: group_list(:), group_size, old_grp
  INTEGER, INTENT (OUT) :: new_grp
  INTEGER :: ierr

  ierr = 0
  new_grp = old_grp
#if defined(__MPI)
  CALL mpi_group_incl( old_grp, group_size, group_list, new_grp, ierr )
  IF (ierr/=0) CALL mp_stop( 8009 )
#endif
END SUBROUTINE mp_group_create

!------------------------------------------------------------------------------!
SUBROUTINE mp_comm_create( old_comm, new_grp, new_comm )
  USE parallel_include
  IMPLICIT NONE
  INTEGER, INTENT (IN) :: old_comm
  INTEGER, INTENT (IN) :: new_grp
  INTEGER, INTENT (OUT) :: new_comm
  INTEGER :: ierr

  ierr = 0
  new_comm = old_comm
#if defined(__MPI)
  CALL mpi_comm_create( old_comm, new_grp, new_comm, ierr )
  IF (ierr/=0) CALL mp_stop( 8010 )
#endif
END SUBROUTINE mp_comm_create

!------------------------------------------------------------------------------!
!..mp_group_free
SUBROUTINE mp_group_free( group )
  USE parallel_include
  IMPLICIT NONE
  INTEGER, INTENT (INOUT) :: group
  INTEGER :: ierr
  ierr = 0
#if defined(__MPI)
  CALL mpi_group_free( group, ierr )
  IF (ierr/=0) CALL mp_stop( 8011 )
#endif
END SUBROUTINE mp_group_free
!------------------------------------------------------------------------------!

SUBROUTINE mp_comm_free( comm )
  USE parallel_include
  IMPLICIT NONE
  INTEGER, INTENT (INOUT) :: comm
  INTEGER :: ierr
  ierr = 0
#if defined(__MPI)
  IF( comm /= MPI_COMM_NULL ) THEN
    CALL mpi_comm_free( comm, ierr )
    IF (ierr/=0) CALL mp_stop( 8012 )
  END IF
#endif
  RETURN
END SUBROUTINE mp_comm_free

!------------------------------------------------------------------------------!
! non-blocking helpers
! waits until all requests have completed
SUBROUTINE mp_waitall(requests)
  ! ...
  USE parallel_include
  IMPLICIT NONE
  INTEGER, INTENT (INOUT) :: requests(:)
  INTEGER :: ierr
#if defined(__MPI)
  INTEGER :: istatus(MPI_STATUS_SIZE, size(requests))
#endif
  ierr = 0
#if defined(__MPI)
  call MPI_Waitall(size(requests), requests, istatus, ierr)
  IF (ierr/=0) CALL mp_stop( 8004 )
#endif
  RETURN
END SUBROUTINE mp_waitall

! tests whether all requests have completed
SUBROUTINE mp_testall(requests, flag)
  ! ...
  USE parallel_include
  IMPLICIT NONE
  INTEGER, INTENT (INOUT) :: requests(:)
  INTEGER :: ierr
#if defined(__MPI)
  INTEGER :: istatus(MPI_STATUS_SIZE, size(requests))
#endif
  LOGICAL, INTENT(OUT):: flag
  !
  ierr = 0
  flag = .FALSE.
#if defined(__MPI)
  call MPI_Testall(size(requests), requests, flag, istatus, ierr)
  IF (ierr/=0) CALL mp_stop( 8004 )
#else
  flag = .TRUE.
#endif
  RETURN
END SUBROUTINE mp_testall
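
!
! Example (hypothetical sketch; names are illustrative): poll a set of
! pending non-blocking requests with mp_testall, doing useful local work
! until all of them have completed.
!
!    LOGICAL :: done
!    INTEGER :: requests(2)
!    ! ... requests filled by a non-blocking start routine ...
!    DO
!      CALL mp_testall( requests, done )
!      IF ( done ) EXIT
!      ! ... overlap computation with communication here ...
!    END DO
!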

!------------------------------------------------------------------------------!
!..mp_bcast
SUBROUTINE mp_bcast_i1(msg,source,gid)
  IMPLICIT NONE
  INTEGER :: msg
  INTEGER :: source
  INTEGER, INTENT(IN) :: gid
  INTEGER :: group
  INTEGER :: msglen

#if defined(__MPI)
  msglen = 1
  group = gid
  CALL bcast_integer( msg, msglen, source, group )
#endif
END SUBROUTINE mp_bcast_i1
!
!------------------------------------------------------------------------------!
SUBROUTINE mp_bcast_iv(msg, source, gid)
!------------------------------------------------------------------------------!
  !!
  !! Bcast an integer vector
  !!
  IMPLICIT NONE
  !
  INTEGER :: msg(:)
  INTEGER, INTENT(in) :: source
  INTEGER, INTENT(in) :: gid
#if defined(__MPI)
  INTEGER :: msglen
  msglen = SIZE(msg)
  CALL bcast_integer(msg, msglen, source, gid)
#endif
!------------------------------------------------------------------------------!
END SUBROUTINE mp_bcast_iv
!------------------------------------------------------------------------------!
!
!------------------------------------------------------------------------------!
SUBROUTINE mp_bcast_i8v(msg, source, gid)
!------------------------------------------------------------------------------!
  !!
  !! Bcast an integer vector of kind i8b.
  !!
  IMPLICIT NONE
  !
  INTEGER(KIND = i8b) :: msg(:)
  INTEGER, INTENT(in) :: source
  INTEGER, INTENT(in) :: gid
#if defined(__MPI)
  INTEGER :: msglen
  msglen = SIZE(msg)
  CALL bcast_integer8(msg, msglen, source, gid)
#endif
!------------------------------------------------------------------------------!
END SUBROUTINE mp_bcast_i8v
!------------------------------------------------------------------------------!
!
!------------------------------------------------------------------------------!
SUBROUTINE mp_bcast_im(msg, source, gid)
!------------------------------------------------------------------------------!
  IMPLICIT NONE
  INTEGER :: msg(:,:)
  INTEGER, INTENT(IN) :: source
  INTEGER, INTENT(IN) :: gid
#if defined(__MPI)
  INTEGER :: msglen
  msglen = size(msg)
  CALL bcast_integer(msg, msglen, source, gid)
#endif
END SUBROUTINE mp_bcast_im
!
!------------------------------------------------------------------------------!
!
! Carlo Cavazzoni
!
SUBROUTINE mp_bcast_it( msg, source, gid )
  IMPLICIT NONE
  INTEGER :: msg(:,:,:)
  INTEGER, INTENT(IN) :: source
  INTEGER, INTENT(IN) :: gid
#if defined(__MPI)
  INTEGER :: msglen
  msglen = size(msg)
  CALL bcast_integer( msg, msglen, source, gid )
#endif
END SUBROUTINE mp_bcast_it
!
!------------------------------------------------------------------------------!
!
! Samuel Ponce
!
SUBROUTINE mp_bcast_i4d(msg, source, gid)
  IMPLICIT NONE
  INTEGER :: msg(:,:,:,:)
  INTEGER, INTENT(IN) :: source
  INTEGER, INTENT(IN) :: gid
#if defined(__MPI)
  INTEGER :: msglen
  msglen = size(msg)
  CALL bcast_integer( msg, msglen, source, gid )
#endif
END SUBROUTINE mp_bcast_i4d
!
!------------------------------------------------------------------------------!
!
SUBROUTINE mp_bcast_r1( msg, source, gid )
  IMPLICIT NONE
  REAL (DP) :: msg
  INTEGER, INTENT(IN) :: source
  INTEGER, INTENT(IN) :: gid
#if defined(__MPI)
  INTEGER :: msglen
  msglen = 1
  CALL bcast_real( msg, msglen, source, gid )
#endif
END SUBROUTINE mp_bcast_r1
!
!------------------------------------------------------------------------------!
!
SUBROUTINE mp_bcast_rv(msg,source,gid)
  IMPLICIT NONE
  REAL (DP) :: msg(:)
  INTEGER, INTENT(IN) :: source
  INTEGER, INTENT(IN) :: gid
#if defined(__MPI)
  INTEGER :: msglen
  msglen = size(msg)
  CALL bcast_real( msg, msglen, source, gid )
#endif
END SUBROUTINE mp_bcast_rv
!
!------------------------------------------------------------------------------!
!
SUBROUTINE mp_bcast_rm(msg,source,gid)
  IMPLICIT NONE
  REAL (DP) :: msg(:,:)
  INTEGER, INTENT(IN) :: source
  INTEGER, INTENT(IN) :: gid
#if defined(__MPI)
  INTEGER :: msglen
  msglen = size(msg)
  CALL bcast_real( msg, msglen, source, gid )
#endif
END SUBROUTINE mp_bcast_rm
!
!------------------------------------------------------------------------------!
!
! Carlo Cavazzoni
!
SUBROUTINE mp_bcast_rt(msg,source,gid)
  IMPLICIT NONE
  REAL (DP) :: msg(:,:,:)
  INTEGER, INTENT(IN) :: source
  INTEGER, INTENT(IN) :: gid
#if defined(__MPI)
  INTEGER :: msglen
  msglen = size(msg)
  CALL bcast_real( msg, msglen, source, gid )
#endif
END SUBROUTINE mp_bcast_rt
!
!------------------------------------------------------------------------------!
!
! Carlo Cavazzoni
!
SUBROUTINE mp_bcast_r4d(msg, source, gid)
  IMPLICIT NONE
  REAL (DP) :: msg(:,:,:,:)
  INTEGER, INTENT(IN) :: source
  INTEGER, INTENT(IN) :: gid
#if defined(__MPI)
  INTEGER :: msglen
  msglen = size(msg)
  CALL bcast_real( msg, msglen, source, gid )
#endif
END SUBROUTINE mp_bcast_r4d

!
!------------------------------------------------------------------------------!
!
! Carlo Cavazzoni
!
SUBROUTINE mp_bcast_r5d(msg, source, gid)
  IMPLICIT NONE
  REAL (DP) :: msg(:,:,:,:,:)
  INTEGER, INTENT(IN) :: source
  INTEGER, INTENT(IN) :: gid
#if defined(__MPI)
  INTEGER :: msglen
  msglen = size(msg)
  CALL bcast_real( msg, msglen, source, gid )
#endif
END SUBROUTINE mp_bcast_r5d

!------------------------------------------------------------------------------!
!
SUBROUTINE mp_bcast_c1(msg,source,gid)
  IMPLICIT NONE
  COMPLEX (DP) :: msg
  INTEGER, INTENT(IN) :: source
  INTEGER, INTENT(IN) :: gid
#if defined(__MPI)
  INTEGER :: msglen
  msglen = 1
  CALL bcast_real( msg, 2 * msglen, source, gid )
#endif
END SUBROUTINE mp_bcast_c1
!
!------------------------------------------------------------------------------!
SUBROUTINE mp_bcast_cv(msg,source,gid)
  IMPLICIT NONE
  COMPLEX (DP) :: msg(:)
  INTEGER, INTENT(IN) :: source
  INTEGER, INTENT(IN) :: gid
#if defined(__MPI)
  INTEGER :: msglen
  msglen = size(msg)
  CALL bcast_real( msg, 2 * msglen, source, gid )
#endif
END SUBROUTINE mp_bcast_cv
!
!------------------------------------------------------------------------------!
SUBROUTINE mp_bcast_cm(msg,source,gid)
  IMPLICIT NONE
  COMPLEX (DP) :: msg(:,:)
  INTEGER, INTENT(IN) :: source
  INTEGER, INTENT(IN) :: gid
#if defined(__MPI)
  INTEGER :: msglen
  msglen = size(msg)
  CALL bcast_real( msg, 2 * msglen, source, gid )
#endif
END SUBROUTINE mp_bcast_cm
!
!------------------------------------------------------------------------------!
SUBROUTINE mp_bcast_ct(msg,source,gid)
  IMPLICIT NONE
  COMPLEX (DP) :: msg(:,:,:)
  INTEGER, INTENT(IN) :: source
  INTEGER, INTENT(IN) :: gid
#if defined(__MPI)
  INTEGER :: msglen
  msglen = size(msg)
  CALL bcast_real( msg, 2 * msglen, source, gid )
#endif
END SUBROUTINE mp_bcast_ct

!
!------------------------------------------------------------------------------!
SUBROUTINE mp_bcast_c4d(msg,source,gid)
  IMPLICIT NONE
  COMPLEX (DP) :: msg(:,:,:,:)
  INTEGER, INTENT(IN) :: source
  INTEGER, INTENT(IN) :: gid
#if defined(__MPI)
  INTEGER :: msglen
  msglen = size(msg)
  CALL bcast_real( msg, 2 * msglen, source, gid )
#endif
END SUBROUTINE mp_bcast_c4d

SUBROUTINE mp_bcast_c5d(msg,source,gid)
  IMPLICIT NONE
  COMPLEX (DP) :: msg(:,:,:,:,:)
  INTEGER, INTENT(IN) :: source
  INTEGER, INTENT(IN) :: gid
#if defined(__MPI)
  INTEGER :: msglen
  msglen = size(msg)
  CALL bcast_real( msg, 2 * msglen, source, gid )
#endif
END SUBROUTINE mp_bcast_c5d

SUBROUTINE mp_bcast_c6d(msg,source,gid)
  IMPLICIT NONE
  COMPLEX (DP) :: msg(:,:,:,:,:,:)
  INTEGER, INTENT(IN) :: source
  INTEGER, INTENT(IN) :: gid
#if defined(__MPI)
  INTEGER :: msglen
  msglen = size(msg)
  CALL bcast_real( msg, 2 * msglen, source, gid )
#endif
END SUBROUTINE mp_bcast_c6d

!
!------------------------------------------------------------------------------!

SUBROUTINE mp_bcast_l(msg,source,gid)
  IMPLICIT NONE
  LOGICAL :: msg
  INTEGER, INTENT(IN) :: source
  INTEGER, INTENT(IN) :: gid
#if defined(__MPI)
  INTEGER :: msglen
  msglen = 1
  CALL bcast_logical( msg, msglen, source, gid )
#endif
END SUBROUTINE mp_bcast_l
!
!------------------------------------------------------------------------------!
!
! Carlo Cavazzoni
!
SUBROUTINE mp_bcast_lv(msg,source,gid)
  IMPLICIT NONE
  LOGICAL :: msg(:)
  INTEGER, INTENT(IN) :: source
  INTEGER, INTENT(IN) :: gid
#if defined(__MPI)
  INTEGER :: msglen
  msglen = size(msg)
  CALL bcast_logical( msg, msglen, source, gid )
#endif
END SUBROUTINE mp_bcast_lv

!------------------------------------------------------------------------------!
!
! Carlo Cavazzoni
!
SUBROUTINE mp_bcast_lm(msg,source,gid)
  IMPLICIT NONE
  LOGICAL :: msg(:,:)
  INTEGER, INTENT(IN) :: source
  INTEGER, INTENT(IN) :: gid
#if defined(__MPI)
  INTEGER :: msglen
  msglen = size(msg)
  CALL bcast_logical( msg, msglen, source, gid )
#endif
END SUBROUTINE mp_bcast_lm

!
!------------------------------------------------------------------------------!
!
SUBROUTINE mp_bcast_z(msg,source,gid)
  IMPLICIT NONE
  CHARACTER (len=*) :: msg
  INTEGER, INTENT(IN) :: source
  INTEGER, INTENT(IN) :: gid
  INTEGER :: group
  INTEGER :: msglen, ierr, i
  INTEGER, ALLOCATABLE :: imsg(:)
#if defined(__MPI)
  ierr = 0
  msglen = len(msg)
  group = gid
  ALLOCATE (imsg(1:msglen), STAT=ierr)
  IF (ierr/=0) CALL mp_stop( 8015 )
  DO i = 1, msglen
    imsg(i) = ichar(msg(i:i))
  END DO
  CALL bcast_integer( imsg, msglen, source, group )
  DO i = 1, msglen
    msg(i:i) = char(imsg(i))
  END DO
  DEALLOCATE (imsg, STAT=ierr)
  IF (ierr/=0) CALL mp_stop( 8016 )
#endif
END SUBROUTINE mp_bcast_z
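
!
! Note: character strings are broadcast by converting each character to
! its integer code with ICHAR, broadcasting the integer buffer, and
! converting back with CHAR, so only the integer broadcast primitive is
! needed. Example (hypothetical sketch; names are illustrative):
!
!    CHARACTER(LEN=80) :: outdir
!    IF ( mpime == root ) outdir = './tmp'
!    CALL mp_bcast( outdir, root, comm )   ! dispatches to mp_bcast_z
!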

!
!------------------------------------------------------------------------------!
!
!------------------------------------------------------------------------------!
!
SUBROUTINE mp_bcast_zv(msg,source,gid)
  IMPLICIT NONE
  CHARACTER (len=*) :: msg(:)
  INTEGER, INTENT(IN) :: source
  INTEGER, INTENT(IN) :: gid
  INTEGER :: group
  INTEGER :: msglen, m1, m2, ierr, i, j
  INTEGER, ALLOCATABLE :: imsg(:,:)
#if defined(__MPI)
  ierr = 0
  m1 = LEN(msg)
  m2 = SIZE(msg)
  msglen = LEN(msg)*SIZE(msg)
  group = gid
  ALLOCATE (imsg(1:m1,1:m2), STAT=ierr)
  IF (ierr/=0) CALL mp_stop( 8017 )
  DO j = 1, m2
    DO i = 1, m1
      imsg(i,j) = ichar(msg(j)(i:i))
    END DO
  END DO
  CALL bcast_integer( imsg, msglen, source, group )
  DO j = 1, m2
    DO i = 1, m1
      msg(j)(i:i) = char(imsg(i,j))
    END DO
  END DO
  DEALLOCATE (imsg, STAT=ierr)
  IF (ierr/=0) CALL mp_stop( 8018 )
#endif
END SUBROUTINE mp_bcast_zv
!
!------------------------------------------------------------------------------!
!
! Carlo Cavazzoni
!
SUBROUTINE mp_get_i1(msg_dest, msg_sour, mpime, dest, sour, ip, gid)
  USE parallel_include
  IMPLICIT NONE
  INTEGER :: msg_dest
  INTEGER, INTENT(IN) :: msg_sour
  INTEGER, INTENT(IN) :: dest, sour, ip, mpime
  INTEGER, INTENT(IN) :: gid
  INTEGER :: group
#if defined(__MPI)
  INTEGER :: istatus(MPI_STATUS_SIZE)
#endif
  INTEGER :: ierr, nrcv
  INTEGER :: msglen

#if defined(__MPI)
  group = gid
#endif

  ! processors not taking part in the communication have 0 length message

  msglen = 0

  IF(dest .NE. sour) THEN
#if defined(__MPI)
    IF(mpime .EQ. sour) THEN
      msglen=1
      CALL MPI_SEND( msg_sour, msglen, MPI_INTEGER, dest, ip, group, ierr)
      IF (ierr/=0) CALL mp_stop( 8019 )
    ELSE IF(mpime .EQ. dest) THEN
      msglen=1
      CALL MPI_RECV( msg_dest, msglen, MPI_INTEGER, sour, ip, group, istatus, IERR )
      IF (ierr/=0) CALL mp_stop( 8020 )
      CALL MPI_GET_COUNT(istatus, MPI_INTEGER, nrcv, ierr)
      IF (ierr/=0) CALL mp_stop( 8021 )
      msglen = nrcv
    END IF
#endif
  ELSEIF(mpime .EQ. sour)THEN
    msg_dest = msg_sour
    msglen = 1
  END IF

#if defined(__MPI)
  CALL MPI_BARRIER(group, IERR)
  IF (ierr/=0) CALL mp_stop( 8022 )
#endif

  RETURN
END SUBROUTINE mp_get_i1
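
!
! Example (hypothetical sketch; names are illustrative): task `sour`
! sends one integer to task `dest`, tagged `ip`; all other tasks only
! take part in the final barrier.
!
!    INTEGER :: nloc_dest, nloc
!    CALL mp_get( nloc_dest, nloc, mpime, dest, sour, ip, comm )
!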

!------------------------------------------------------------------------------!
!
! Carlo Cavazzoni
!
SUBROUTINE mp_get_iv(msg_dest, msg_sour, mpime, dest, sour, ip, gid)
  USE parallel_include
  IMPLICIT NONE
  INTEGER :: msg_dest(:)
  INTEGER, INTENT(IN) :: msg_sour(:)
  INTEGER, INTENT(IN) :: dest, sour, ip, mpime
  INTEGER, INTENT(IN) :: gid
  INTEGER :: group
#if defined(__MPI)
  INTEGER :: istatus(MPI_STATUS_SIZE)
#endif
  INTEGER :: ierr, nrcv
  INTEGER :: msglen

#if defined(__MPI)
  group = gid
#endif

  ! processors not taking part in the communication have 0 length message

  msglen = 0

  IF(sour .NE. dest) THEN
#if defined(__MPI)
    IF(mpime .EQ. sour) THEN
      msglen = SIZE(msg_sour)
      CALL MPI_SEND( msg_sour, SIZE(msg_sour), MPI_INTEGER, dest, ip, group, ierr)
      IF (ierr/=0) CALL mp_stop( 8023 )
    ELSE IF(mpime .EQ. dest) THEN
      CALL MPI_RECV( msg_dest, SIZE(msg_dest), MPI_INTEGER, sour, ip, group, istatus, IERR )
      IF (ierr/=0) CALL mp_stop( 8024 )
      CALL MPI_GET_COUNT(istatus, MPI_INTEGER, nrcv, ierr)
      IF (ierr/=0) CALL mp_stop( 8025 )
      msglen = nrcv
    END IF
#endif
  ELSEIF(mpime .EQ. sour)THEN
    msg_dest(1:SIZE(msg_sour)) = msg_sour(:)
    msglen = SIZE(msg_sour)
  END IF
#if defined(__MPI)
  CALL MPI_BARRIER(group, IERR)
  IF (ierr/=0) CALL mp_stop( 8026 )
#endif
  RETURN
END SUBROUTINE mp_get_iv

!------------------------------------------------------------------------------!

SUBROUTINE mp_get_r1(msg_dest, msg_sour, mpime, dest, sour, ip, gid)
  USE parallel_include
  IMPLICIT NONE
  REAL (DP) :: msg_dest
  REAL (DP), INTENT(IN) :: msg_sour
  INTEGER, INTENT(IN) :: dest, sour, ip, mpime
  INTEGER, INTENT(IN) :: gid
  INTEGER :: group
#if defined(__MPI)
  INTEGER :: istatus(MPI_STATUS_SIZE)
#endif
  INTEGER :: ierr, nrcv
  INTEGER :: msglen

#if defined(__MPI)
  group = gid
#endif

  ! processors not taking part in the communication have 0 length message

  msglen = 0

  IF(sour .NE. dest) THEN
#if defined(__MPI)
    IF(mpime .EQ. sour) THEN
      msglen = 1
      CALL MPI_SEND( msg_sour, msglen, MPI_DOUBLE_PRECISION, dest, ip, group, ierr)
      IF (ierr/=0) CALL mp_stop( 8027 )
    ELSE IF(mpime .EQ. dest) THEN
      CALL MPI_RECV( msg_dest, 1, MPI_DOUBLE_PRECISION, sour, ip, group, istatus, IERR )
      IF (ierr/=0) CALL mp_stop( 8028 )
      CALL MPI_GET_COUNT(istatus, MPI_DOUBLE_PRECISION, nrcv, ierr)
      IF (ierr/=0) CALL mp_stop( 8029 )
      msglen = nrcv
    END IF
#endif
  ELSEIF(mpime .EQ. sour)THEN
    msg_dest = msg_sour
    msglen = 1
  END IF
#if defined(__MPI)
  CALL MPI_BARRIER(group, IERR)
  IF (ierr/=0) CALL mp_stop( 8030 )
#endif
  RETURN
END SUBROUTINE mp_get_r1

!------------------------------------------------------------------------------!
!
! Carlo Cavazzoni
!
SUBROUTINE mp_get_rv(msg_dest, msg_sour, mpime, dest, sour, ip, gid)
  USE parallel_include
  IMPLICIT NONE
  REAL (DP) :: msg_dest(:)
  REAL (DP), INTENT(IN) :: msg_sour(:)
  INTEGER, INTENT(IN) :: dest, sour, ip, mpime
  INTEGER, INTENT(IN) :: gid
  INTEGER :: group
#if defined(__MPI)
  INTEGER :: istatus(MPI_STATUS_SIZE)
#endif
  INTEGER :: ierr, nrcv
  INTEGER :: msglen

#if defined(__MPI)
  group = gid
#endif

  ! processors not taking part in the communication have 0 length message

  msglen = 0

  IF(sour .NE. dest) THEN
#if defined(__MPI)
    IF(mpime .EQ. sour) THEN
      msglen = SIZE(msg_sour)
      CALL MPI_SEND( msg_sour, SIZE(msg_sour), MPI_DOUBLE_PRECISION, dest, ip, group, ierr)
      IF (ierr/=0) CALL mp_stop( 8027 )
    ELSE IF(mpime .EQ. dest) THEN
      CALL MPI_RECV( msg_dest, SIZE(msg_dest), MPI_DOUBLE_PRECISION, sour, ip, group, istatus, IERR )
      IF (ierr/=0) CALL mp_stop( 8028 )
      CALL MPI_GET_COUNT(istatus, MPI_DOUBLE_PRECISION, nrcv, ierr)
      IF (ierr/=0) CALL mp_stop( 8029 )
      msglen = nrcv
    END IF
#endif
  ELSEIF(mpime .EQ. sour)THEN
    msg_dest(1:SIZE(msg_sour)) = msg_sour(:)
    msglen = SIZE(msg_sour)
  END IF
#if defined(__MPI)
  CALL MPI_BARRIER(group, IERR)
  IF (ierr/=0) CALL mp_stop( 8030 )
#endif
  RETURN
END SUBROUTINE mp_get_rv

!------------------------------------------------------------------------------!
!
! Carlo Cavazzoni
!
SUBROUTINE mp_get_rm(msg_dest, msg_sour, mpime, dest, sour, ip, gid)
  USE parallel_include
  IMPLICIT NONE
  REAL (DP) :: msg_dest(:,:)
  REAL (DP), INTENT(IN) :: msg_sour(:,:)
  INTEGER, INTENT(IN) :: dest, sour, ip, mpime
  INTEGER, INTENT(IN) :: gid
  INTEGER :: group
#if defined(__MPI)
  INTEGER :: istatus(MPI_STATUS_SIZE)
#endif
  INTEGER :: ierr, nrcv
  INTEGER :: msglen

#if defined(__MPI)
  group = gid
#endif

  ! processors not taking part in the communication have 0 length message

  msglen = 0

  IF(sour .NE. dest) THEN
#if defined(__MPI)
    IF(mpime .EQ. sour) THEN
      CALL MPI_SEND( msg_sour, SIZE(msg_sour), MPI_DOUBLE_PRECISION, dest, ip, group, ierr)
      IF (ierr/=0) CALL mp_stop( 8031 )
      msglen = SIZE(msg_sour)
    ELSE IF(mpime .EQ. dest) THEN
      CALL MPI_RECV( msg_dest, SIZE(msg_dest), MPI_DOUBLE_PRECISION, sour, ip, group, istatus, IERR )
      IF (ierr/=0) CALL mp_stop( 8032 )
      CALL MPI_GET_COUNT(istatus, MPI_DOUBLE_PRECISION, nrcv, ierr)
      IF (ierr/=0) CALL mp_stop( 8033 )
      msglen = nrcv
    END IF
#endif
  ELSEIF(mpime .EQ. sour)THEN
    msg_dest(1:SIZE(msg_sour,1), 1:SIZE(msg_sour,2)) = msg_sour(:,:)
    msglen = SIZE( msg_sour )
  END IF
#if defined(__MPI)
  CALL MPI_BARRIER(group, IERR)
  IF (ierr/=0) CALL mp_stop( 8034 )
#endif
  RETURN
END SUBROUTINE mp_get_rm

!------------------------------------------------------------------------------!
!
! Carlo Cavazzoni
!
SUBROUTINE mp_get_cv(msg_dest, msg_sour, mpime, dest, sour, ip, gid)
  USE parallel_include
  IMPLICIT NONE
  COMPLEX (DP) :: msg_dest(:)
  COMPLEX (DP), INTENT(IN) :: msg_sour(:)
  INTEGER, INTENT(IN) :: dest, sour, ip, mpime
  INTEGER, INTENT(IN) :: gid
  INTEGER :: group
#if defined(__MPI)
  INTEGER :: istatus(MPI_STATUS_SIZE)
#endif
  INTEGER :: ierr, nrcv
  INTEGER :: msglen

#if defined(__MPI)
  group = gid
#endif

  ! processors not taking part in the communication have 0 length message

  msglen = 0

  IF( dest .NE. sour ) THEN
#if defined(__MPI)
    IF(mpime .EQ. sour) THEN
      CALL MPI_SEND( msg_sour, SIZE(msg_sour), MPI_DOUBLE_COMPLEX, dest, ip, group, ierr)
      IF (ierr/=0) CALL mp_stop( 8035 )
      msglen = SIZE(msg_sour)
    ELSE IF(mpime .EQ. dest) THEN
      CALL MPI_RECV( msg_dest, SIZE(msg_dest), MPI_DOUBLE_COMPLEX, sour, ip, group, istatus, IERR )
      IF (ierr/=0) CALL mp_stop( 8036 )
      CALL MPI_GET_COUNT(istatus, MPI_DOUBLE_COMPLEX, nrcv, ierr)
      IF (ierr/=0) CALL mp_stop( 8037 )
      msglen = nrcv
    END IF
#endif
  ELSEIF(mpime .EQ. sour)THEN
    msg_dest(1:SIZE(msg_sour)) = msg_sour(:)
    msglen = SIZE(msg_sour)
  END IF
#if defined(__MPI)
  CALL MPI_BARRIER(group, IERR)
  IF (ierr/=0) CALL mp_stop( 8038 )
#endif
  RETURN
END SUBROUTINE mp_get_cv

!------------------------------------------------------------------------------!
!
! Marco Govoni
!
SUBROUTINE mp_get_cm(msg_dest, msg_sour, mpime, dest, sour, ip, gid)
  USE parallel_include
  IMPLICIT NONE
  COMPLEX (DP) :: msg_dest(:,:)
  COMPLEX (DP), INTENT(IN) :: msg_sour(:,:)
  INTEGER, INTENT(IN) :: dest, sour, ip, mpime
  INTEGER, INTENT(IN) :: gid
  INTEGER :: group
#if defined(__MPI)
  INTEGER :: istatus(MPI_STATUS_SIZE)
#endif
  INTEGER :: ierr, nrcv
  INTEGER :: msglen

#if defined(__MPI)
  group = gid
#endif

  ! processors not taking part in the communication have 0 length message

  msglen = 0

  IF(sour .NE. dest) THEN
#if defined(__MPI)
    IF(mpime .EQ. sour) THEN
      CALL MPI_SEND( msg_sour, SIZE(msg_sour), MPI_DOUBLE_COMPLEX, dest, ip, group, ierr)
      IF (ierr/=0) CALL mp_stop( 8031 )
      msglen = SIZE(msg_sour)
    ELSE IF(mpime .EQ. dest) THEN
      CALL MPI_RECV( msg_dest, SIZE(msg_dest), MPI_DOUBLE_COMPLEX, sour, ip, group, istatus, IERR )
      IF (ierr/=0) CALL mp_stop( 8032 )
      CALL MPI_GET_COUNT(istatus, MPI_DOUBLE_COMPLEX, nrcv, ierr)
      IF (ierr/=0) CALL mp_stop( 8033 )
      msglen = nrcv
    END IF
#endif
  ELSEIF(mpime .EQ. sour)THEN
    msg_dest(1:SIZE(msg_sour,1), 1:SIZE(msg_sour,2)) = msg_sour(:,:)
    msglen = SIZE( msg_sour )
  END IF
#if defined(__MPI)
  CALL MPI_BARRIER(group, IERR)
  IF (ierr/=0) CALL mp_stop( 8034 )
#endif
  RETURN
END SUBROUTINE mp_get_cm
!------------------------------------------------------------------------------!
!
!
!------------------------------------------------------------------------------!

SUBROUTINE mp_put_i1(msg_dest, msg_sour, mpime, sour, dest, ip, gid)
  USE parallel_include
  IMPLICIT NONE
  INTEGER :: msg_dest
  INTEGER, INTENT(IN) :: msg_sour
  INTEGER, INTENT(IN) :: dest, sour, ip, mpime
  INTEGER, INTENT(IN) :: gid
  INTEGER :: group
#if defined(__MPI)
  INTEGER :: istatus(MPI_STATUS_SIZE)
#endif
  INTEGER :: ierr, nrcv
  INTEGER :: msglen

#if defined(__MPI)
  group = gid
#endif

  ! processors not taking part in the communication have 0 length message

  msglen = 0

  IF(dest .NE. sour) THEN
#if defined(__MPI)
    IF(mpime .EQ. sour) THEN
      CALL MPI_SEND( msg_sour, 1, MPI_INTEGER, dest, ip, group, ierr)
      IF (ierr/=0) CALL mp_stop( 8039 )
      msglen = 1
    ELSE IF(mpime .EQ. dest) THEN
      CALL MPI_RECV( msg_dest, 1, MPI_INTEGER, sour, ip, group, istatus, IERR )
      IF (ierr/=0) CALL mp_stop( 8040 )
      CALL MPI_GET_COUNT(istatus, MPI_INTEGER, nrcv, ierr)
      IF (ierr/=0) CALL mp_stop( 8041 )
      msglen = 1
    END IF
#endif
  ELSEIF(mpime .EQ. sour)THEN
    msg_dest = msg_sour
    msglen = 1
  END IF
#if defined(__MPI)
  CALL MPI_BARRIER(group, IERR)
  IF (ierr/=0) CALL mp_stop( 8042 )
#endif
  RETURN
END SUBROUTINE mp_put_i1

!------------------------------------------------------------------------------!
!
!
SUBROUTINE mp_put_iv(msg_dest, msg_sour, mpime, sour, dest, ip, gid)
  USE parallel_include
  IMPLICIT NONE
  INTEGER :: msg_dest(:)
  INTEGER, INTENT(IN) :: msg_sour(:)
  INTEGER, INTENT(IN) :: dest, sour, ip, mpime
  INTEGER, INTENT(IN) :: gid
  INTEGER :: group
#if defined(__MPI)
  INTEGER :: istatus(MPI_STATUS_SIZE)
#endif
  INTEGER :: ierr, nrcv
  INTEGER :: msglen
#if defined(__MPI)
  group = gid
#endif
  ! processors not taking part in the communication have 0 length message

  msglen = 0

  IF(sour .NE. dest) THEN
#if defined(__MPI)
    IF(mpime .EQ. sour) THEN
      CALL MPI_SEND( msg_sour, SIZE(msg_sour), MPI_INTEGER, dest, ip, group, ierr)
      IF (ierr/=0) CALL mp_stop( 8043 )
      msglen = SIZE(msg_sour)
    ELSE IF(mpime .EQ. dest) THEN
      CALL MPI_RECV( msg_dest, SIZE(msg_dest), MPI_INTEGER, sour, ip, group, istatus, IERR )
      IF (ierr/=0) CALL mp_stop( 8044 )
      CALL MPI_GET_COUNT(istatus, MPI_INTEGER, nrcv, ierr)
      IF (ierr/=0) CALL mp_stop( 8045 )
      msglen = nrcv
    END IF
#endif
  ELSEIF(mpime .EQ. sour)THEN
    msg_dest(1:SIZE(msg_sour)) = msg_sour(:)
    msglen = SIZE(msg_sour)
  END IF
#if defined(__MPI)
  CALL MPI_BARRIER(group, IERR)
  IF (ierr/=0) CALL mp_stop( 8046 )
#endif
  RETURN
END SUBROUTINE mp_put_iv

!------------------------------------------------------------------------------!
!
!
SUBROUTINE mp_put_rv(msg_dest, msg_sour, mpime, sour, dest, ip, gid)
  USE parallel_include
  IMPLICIT NONE
  REAL (DP) :: msg_dest(:)
  REAL (DP), INTENT(IN) :: msg_sour(:)
  INTEGER, INTENT(IN) :: dest, sour, ip, mpime
  INTEGER, INTENT(IN) :: gid
  INTEGER :: group
#if defined(__MPI)
  INTEGER :: istatus(MPI_STATUS_SIZE)
#endif
  INTEGER :: ierr, nrcv
  INTEGER :: msglen
#if defined(__MPI)
  group = gid
#endif
  ! processors not taking part in the communication have 0 length message

  msglen = 0

  IF(sour .NE. dest) THEN
#if defined(__MPI)
    IF(mpime .EQ. sour) THEN
      CALL MPI_SEND( msg_sour, SIZE(msg_sour), MPI_DOUBLE_PRECISION, dest, ip, group, ierr)
      IF (ierr/=0) CALL mp_stop( 8047 )
      msglen = SIZE(msg_sour)
    ELSE IF(mpime .EQ. dest) THEN
      CALL MPI_RECV( msg_dest, SIZE(msg_dest), MPI_DOUBLE_PRECISION, sour, ip, group, istatus, IERR )
      IF (ierr/=0) CALL mp_stop( 8048 )
      CALL MPI_GET_COUNT(istatus, MPI_DOUBLE_PRECISION, nrcv, ierr)
      IF (ierr/=0) CALL mp_stop( 8049 )
      msglen = nrcv
    END IF
#endif
  ELSEIF(mpime .EQ. sour)THEN
    msg_dest(1:SIZE(msg_sour)) = msg_sour(:)
    msglen = SIZE(msg_sour)
  END IF
#if defined(__MPI)
  CALL MPI_BARRIER(group, IERR)
  IF (ierr/=0) CALL mp_stop( 8050 )
#endif
  RETURN
END SUBROUTINE mp_put_rv

!------------------------------------------------------------------------------!
!
!
SUBROUTINE mp_put_rm(msg_dest, msg_sour, mpime, sour, dest, ip, gid)
  USE parallel_include
  IMPLICIT NONE
  REAL (DP) :: msg_dest(:,:)
  REAL (DP), INTENT(IN) :: msg_sour(:,:)
  INTEGER, INTENT(IN) :: dest, sour, ip, mpime
  INTEGER, INTENT(IN) :: gid
  INTEGER :: group
#if defined(__MPI)
  INTEGER :: istatus(MPI_STATUS_SIZE)
#endif
  INTEGER :: ierr, nrcv
  INTEGER :: msglen
#if defined(__MPI)
  group = gid
#endif
  ! processors not taking part in the communication have 0 length message

  msglen = 0

  IF(sour .NE. dest) THEN
#if defined(__MPI)
    IF(mpime .EQ. sour) THEN
      CALL MPI_SEND( msg_sour, SIZE(msg_sour), MPI_DOUBLE_PRECISION, dest, ip, group, ierr)
      IF (ierr/=0) CALL mp_stop( 8051 )
      msglen = SIZE(msg_sour)
    ELSE IF(mpime .EQ. dest) THEN
      CALL MPI_RECV( msg_dest, SIZE(msg_dest), MPI_DOUBLE_PRECISION, sour, ip, group, istatus, IERR )
      IF (ierr/=0) CALL mp_stop( 8052 )
      CALL MPI_GET_COUNT(istatus, MPI_DOUBLE_PRECISION, nrcv, ierr)
      IF (ierr/=0) CALL mp_stop( 8053 )
      msglen = nrcv
    END IF
#endif
  ELSEIF(mpime .EQ. sour)THEN
    msg_dest(1:SIZE(msg_sour,1),1:SIZE(msg_sour,2)) = msg_sour(:,:)
    msglen = SIZE(msg_sour)
  END IF
#if defined(__MPI)
  CALL MPI_BARRIER(group, IERR)
  IF (ierr/=0) CALL mp_stop( 8054 )
#endif
  RETURN
END SUBROUTINE mp_put_rm

!------------------------------------------------------------------------------!
!
!
SUBROUTINE mp_put_cv(msg_dest, msg_sour, mpime, sour, dest, ip, gid)
  USE parallel_include
  IMPLICIT NONE
  COMPLEX (DP) :: msg_dest(:)
  COMPLEX (DP), INTENT(IN) :: msg_sour(:)
  INTEGER, INTENT(IN) :: dest, sour, ip, mpime
  INTEGER, INTENT(IN) :: gid
  INTEGER :: group
#if defined(__MPI)
  INTEGER :: istatus(MPI_STATUS_SIZE)
#endif
  INTEGER :: ierr, nrcv
  INTEGER :: msglen
#if defined(__MPI)
  group = gid
#endif
  ! processors not taking part in the communication have 0 length message

  msglen = 0

  IF( dest .NE. sour ) THEN
#if defined(__MPI)
    IF(mpime .EQ. sour) THEN
      CALL MPI_SEND( msg_sour, SIZE(msg_sour), MPI_DOUBLE_COMPLEX, dest, ip, group, ierr)
      IF (ierr/=0) CALL mp_stop( 8055 )
      msglen = SIZE(msg_sour)
    ELSE IF(mpime .EQ. dest) THEN
      CALL MPI_RECV( msg_dest, SIZE(msg_dest), MPI_DOUBLE_COMPLEX, sour, ip, group, istatus, IERR )
      IF (ierr/=0) CALL mp_stop( 8056 )
      CALL MPI_GET_COUNT(istatus, MPI_DOUBLE_COMPLEX, nrcv, ierr)
      IF (ierr/=0) CALL mp_stop( 8057 )
      msglen = nrcv
    END IF
#endif
  ELSEIF(mpime .EQ. sour)THEN
    msg_dest(1:SIZE(msg_sour)) = msg_sour(:)
    msglen = SIZE(msg_sour)
  END IF
#if defined(__MPI)
  CALL MPI_BARRIER(group, IERR)
  IF (ierr/=0) CALL mp_stop( 8058 )
#endif
  RETURN
END SUBROUTINE mp_put_cv

!
!------------------------------------------------------------------------------!
!
!..mp_stop
!
SUBROUTINE mp_stop(code)
  USE parallel_include
  IMPLICIT NONE
  INTEGER, INTENT (IN) :: code
  INTEGER :: ierr
  WRITE( stdout, fmt='( "*** error in Message Passing (mp) module ***")' )
  WRITE( stdout, fmt='( "*** error code: ",I5)' ) code
#if defined(__MPI)
  ! abort with extreme prejudice across the entire MPI set of tasks
  CALL mpi_abort(MPI_COMM_WORLD,code,ierr)
#endif
  STOP
END SUBROUTINE mp_stop
!------------------------------------------------------------------------------!
!
!..mp_sum
SUBROUTINE mp_sum_i1(msg,gid)
  IMPLICIT NONE
  INTEGER, INTENT (INOUT) :: msg
  INTEGER, INTENT(IN) :: gid
#if defined(__MPI)
  INTEGER :: msglen
  msglen = 1
  CALL reduce_base_integer( msglen, msg, gid, -1 )
#endif
END SUBROUTINE mp_sum_i1
!
!------------------------------------------------------------------------------!
SUBROUTINE mp_sum_iv(msg, gid)
!------------------------------------------------------------------------------!
  !!
  !! MPI sum an integer vector from all cores and bcast the result to all.
  !!
  IMPLICIT NONE
  !
  INTEGER, INTENT(inout) :: msg(:)
  INTEGER, INTENT(in) :: gid
#if defined(__MPI)
  INTEGER :: msglen
  msglen = SIZE(msg)
  CALL reduce_base_integer(msglen, msg, gid, -1)
#endif
!------------------------------------------------------------------------------!
END SUBROUTINE mp_sum_iv
!------------------------------------------------------------------------------!
!
!------------------------------------------------------------------------------!
SUBROUTINE mp_sum_i8v(msg, gid)
!------------------------------------------------------------------------------!
  !!
  !! MPI sum an integer vector from all cores and bcast the result to all.
  !!
  IMPLICIT NONE
  !
  INTEGER(KIND = i8b), INTENT(inout) :: msg(:)
  INTEGER, INTENT(in) :: gid
#if defined(__MPI)
  INTEGER :: msglen
  msglen = SIZE(msg)
  CALL reduce_base_integer8(msglen, msg, gid, -1)
#endif
!------------------------------------------------------------------------------!
END SUBROUTINE mp_sum_i8v
!------------------------------------------------------------------------------!
!
!------------------------------------------------------------------------------!
SUBROUTINE mp_sum_im(msg,gid)
!------------------------------------------------------------------------------!
  IMPLICIT NONE
  INTEGER, INTENT (INOUT) :: msg(:,:)
  INTEGER, INTENT(IN) :: gid
#if defined(__MPI)
  INTEGER :: msglen
  msglen = size(msg)
  CALL reduce_base_integer( msglen, msg, gid, -1 )
#endif
END SUBROUTINE mp_sum_im
!
!------------------------------------------------------------------------------!

SUBROUTINE mp_sum_it(msg,gid)
  IMPLICIT NONE
  INTEGER, INTENT (INOUT) :: msg(:,:,:)
  INTEGER, INTENT (IN) :: gid
#if defined(__MPI)
  INTEGER :: msglen
  msglen = size(msg)
  CALL reduce_base_integer( msglen, msg, gid, -1 )
#endif
END SUBROUTINE mp_sum_it

!------------------------------------------------------------------------------!

SUBROUTINE mp_sum_i4(msg,gid)
  IMPLICIT NONE
  INTEGER, INTENT (INOUT) :: msg(:,:,:,:)
  INTEGER, INTENT (IN) :: gid
#if defined(__MPI)
  INTEGER :: msglen
  msglen = size(msg)
  CALL reduce_base_integer( msglen, msg, gid, -1 )
#endif
END SUBROUTINE mp_sum_i4

!------------------------------------------------------------------------------!

SUBROUTINE mp_sum_i5(msg,gid)
  IMPLICIT NONE
  INTEGER, INTENT (INOUT) :: msg(:,:,:,:,:)
  INTEGER, INTENT (IN) :: gid
#if defined(__MPI)
  INTEGER :: msglen
  msglen = size(msg)
  CALL reduce_base_integer( msglen, msg, gid, -1 )
#endif
END SUBROUTINE mp_sum_i5

!------------------------------------------------------------------------------!

SUBROUTINE mp_sum_r1(msg,gid)
  IMPLICIT NONE
  REAL (DP), INTENT (INOUT) :: msg
  INTEGER, INTENT (IN) :: gid
#if defined(__MPI)
  INTEGER :: msglen
  msglen = 1
  CALL reduce_base_real( msglen, msg, gid, -1 )
#endif
END SUBROUTINE mp_sum_r1

!
!------------------------------------------------------------------------------!

SUBROUTINE mp_sum_rv(msg,gid)
  IMPLICIT NONE
  REAL (DP), INTENT (INOUT) :: msg(:)
  INTEGER, INTENT (IN) :: gid
#if defined(__MPI)
  INTEGER :: msglen
  msglen = size(msg)
  CALL reduce_base_real( msglen, msg, gid, -1 )
#endif
END SUBROUTINE mp_sum_rv
!
!------------------------------------------------------------------------------!

SUBROUTINE mp_sum_rm(msg, gid)
  IMPLICIT NONE
  REAL (DP), INTENT (INOUT) :: msg(:,:)
  INTEGER, INTENT (IN) :: gid
#if defined(__MPI)
  INTEGER :: msglen
  msglen = size(msg)
  CALL reduce_base_real( msglen, msg, gid, -1 )
#endif
END SUBROUTINE mp_sum_rm

SUBROUTINE mp_sum_rm_nc(msg, k1, k2, k3, k4, gid)
  ! for non-contiguous arrays
  IMPLICIT NONE
  REAL (DP), INTENT (INOUT) :: msg(:,:)
  INTEGER, INTENT (IN) :: gid
  INTEGER, INTENT (IN) :: k1, k2, k3, k4
#if defined(__MPI)
  REAL(DP), ALLOCATABLE :: msg_buff(:,:)
  INTEGER :: msglen
  ALLOCATE( msg_buff(k2-k1+1,k4-k3+1) )
  msg_buff(1:k2-k1+1,1:k4-k3+1)=msg(k1:k2,k3:k4)
  msglen = size(msg_buff)
  CALL reduce_base_real( msglen, msg_buff, gid, -1 )
  msg(k1:k2,k3:k4)=msg_buff(1:k2-k1+1,1:k4-k3+1)
  DEALLOCATE( msg_buff )
#endif
END SUBROUTINE mp_sum_rm_nc
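
!
! Example (hypothetical sketch; array name and bounds are illustrative):
! sum only the sub-block msg(k1:k2,k3:k4) across the communicator. The
! sub-block is copied into a contiguous buffer, reduced, and copied back,
! because the reduction primitives expect contiguous data.
!
!    CALL mp_sum( rho, 2, 5, 1, 3, comm )   ! dispatches to mp_sum_rm_nc
!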

SUBROUTINE mp_root_sum_rm( msg, res, root, gid )
  USE parallel_include
  IMPLICIT NONE
  REAL (DP), INTENT (IN) :: msg(:,:)
  REAL (DP), INTENT (OUT) :: res(:,:)
  INTEGER, INTENT (IN) :: root
  INTEGER, INTENT (IN) :: gid
#if defined(__MPI)
  INTEGER :: msglen, ierr, taskid

  msglen = size(msg)

  CALL mpi_comm_rank( gid, taskid, ierr)
  IF( ierr /= 0 ) CALL mp_stop( 8059 )
  !
  IF( taskid == root ) THEN
    IF( msglen > size(res) ) CALL mp_stop( 8060 )
  END IF

  CALL reduce_base_real_to( msglen, msg, res, gid, root )

#else

  res = msg

#endif

END SUBROUTINE mp_root_sum_rm
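
!
! Example (hypothetical sketch; names are illustrative): accumulate the
! distributed contributions of a matrix on the root task; `res` is only
! significant on `root`.
!
!    CALL mp_root_sum( h_local, h_total, 0, comm )
!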
|
|
|
|
|
|
SUBROUTINE mp_root_sum_cm( msg, res, root, gid )
|
|
USE parallel_include
|
|
IMPLICIT NONE
|
|
COMPLEX (DP), INTENT (IN) :: msg(:,:)
|
|
COMPLEX (DP), INTENT (OUT) :: res(:,:)
|
|
INTEGER, INTENT (IN) :: root
|
|
INTEGER, INTENT (IN) :: gid
|
|
#if defined(__MPI)
|
|
INTEGER :: msglen, ierr, taskid
|
|
|
|
msglen = size(msg)
|
|
|
|
CALL mpi_comm_rank( gid, taskid, ierr)
|
|
IF( ierr /= 0 ) CALL mp_stop( 8061 )
|
|
|
|
IF( taskid == root ) THEN
|
|
IF( msglen > size(res) ) CALL mp_stop( 8062 )
|
|
END IF
|
|
|
|
CALL reduce_base_real_to( 2 * msglen, msg, res, gid, root )
|
|
|
|
#else
|
|
|
|
res = msg
|
|
|
|
#endif
|
|
|
|
END SUBROUTINE mp_root_sum_cm
|
|
|
|
!
|
|
!------------------------------------------------------------------------------!
|
|
|
|
|
|
!------------------------------------------------------------------------------!
|
|
!
|
|
|
|
SUBROUTINE mp_sum_rmm( msg, res, root, gid )
|
|
USE parallel_include
|
|
IMPLICIT NONE
|
|
REAL (DP), INTENT (IN) :: msg(:,:)
|
|
REAL (DP), INTENT (OUT) :: res(:,:)
|
|
INTEGER, INTENT (IN) :: root
|
|
INTEGER, INTENT (IN) :: gid
|
|
INTEGER :: group
|
|
INTEGER :: msglen
|
|
INTEGER :: taskid, ierr
|
|
|
|
msglen = size(msg)
|
|
|
|
#if defined(__MPI)
|
|
|
|
group = gid
|
|
!
|
|
CALL mpi_comm_rank( group, taskid, ierr)
|
|
IF( ierr /= 0 ) CALL mp_stop( 8063 )
|
|
|
|
IF( taskid == root ) THEN
|
|
IF( msglen > size(res) ) CALL mp_stop( 8064 )
|
|
END IF
|
|
!
|
|
CALL reduce_base_real_to( msglen, msg, res, group, root )
|
|
!
|
|
|
|
#else
|
|
res = msg
|
|
#endif
|
|
|
|
END SUBROUTINE mp_sum_rmm
|
|
|
|
|
|
!
|
|
!------------------------------------------------------------------------------!
|
|
|
|
|
|
SUBROUTINE mp_sum_rt( msg, gid )
|
|
IMPLICIT NONE
|
|
REAL (DP), INTENT (INOUT) :: msg(:,:,:)
|
|
INTEGER, INTENT(IN) :: gid
|
|
#if defined(__MPI)
|
|
INTEGER :: msglen
|
|
msglen = size(msg)
|
|
CALL reduce_base_real( msglen, msg, gid, -1 )
|
|
#endif
|
|
END SUBROUTINE mp_sum_rt
|
|
|
|
!
|
|
!------------------------------------------------------------------------------!
|
|
!
|
|
! Carlo Cavazzoni
|
|
!
|
|
SUBROUTINE mp_sum_r4d(msg,gid)
|
|
IMPLICIT NONE
|
|
REAL (DP), INTENT (INOUT) :: msg(:,:,:,:)
|
|
INTEGER, INTENT(IN) :: gid
|
|
#if defined(__MPI)
|
|
INTEGER :: msglen
|
|
msglen = size(msg)
|
|
CALL reduce_base_real( msglen, msg, gid, -1 )
|
|
#endif
|
|
END SUBROUTINE mp_sum_r4d
|
|
|
|
|
|
|
|
!------------------------------------------------------------------------------!
|
|
|
|
SUBROUTINE mp_sum_c1(msg,gid)
|
|
IMPLICIT NONE
|
|
COMPLEX (DP), INTENT (INOUT) :: msg
|
|
INTEGER, INTENT(IN) :: gid
|
|
#if defined(__MPI)
|
|
INTEGER :: msglen
|
|
msglen = 1
|
|
CALL reduce_base_real( 2 * msglen, msg, gid, -1 )
|
|
#endif
|
|
END SUBROUTINE mp_sum_c1
|
|
!
|
|
!------------------------------------------------------------------------------!
|
|
|
|
SUBROUTINE mp_sum_cv(msg,gid)
|
|
IMPLICIT NONE
|
|
COMPLEX (DP), INTENT (INOUT) :: msg(:)
|
|
INTEGER, INTENT(IN) :: gid
|
|
#if defined(__MPI)
|
|
INTEGER :: msglen
|
|
msglen = size(msg)
|
|
CALL reduce_base_real( 2 * msglen, msg, gid, -1 )
|
|
#endif
|
|
END SUBROUTINE mp_sum_cv
|
|
!
|
|
!------------------------------------------------------------------------------!
|
|
|
|
SUBROUTINE mp_sum_cm(msg, gid)
|
|
IMPLICIT NONE
|
|
COMPLEX (DP), INTENT (INOUT) :: msg(:,:)
|
|
INTEGER, INTENT (IN) :: gid
|
|
#if defined(__MPI)
|
|
INTEGER :: msglen
|
|
msglen = size(msg)
        CALL reduce_base_real( 2 * msglen, msg, gid, -1 )
#endif
      END SUBROUTINE mp_sum_cm
      !
      !------------------------------------------------------------------------------!

      SUBROUTINE mp_sum_cm_nc(msg, k1, k2, k3, k4, gid)
        ! for non-contiguous arrays
        IMPLICIT NONE
        COMPLEX (DP), INTENT (INOUT) :: msg(:,:)
        INTEGER, INTENT (IN) :: gid
        INTEGER, INTENT (IN) :: k1, k2, k3, k4
#if defined(__MPI)
        COMPLEX(DP), ALLOCATABLE :: msg_buff(:,:)
        INTEGER :: msglen
        ALLOCATE( msg_buff(k2-k1+1,k4-k3+1) )
        msg_buff(1:k2-k1+1,1:k4-k3+1)=msg(k1:k2,k3:k4)
        msglen = size(msg_buff)
        CALL reduce_base_real( 2 * msglen, msg_buff, gid, -1 )
        msg(k1:k2,k3:k4)=msg_buff(1:k2-k1+1,1:k4-k3+1)
        DEALLOCATE( msg_buff )
#endif
      END SUBROUTINE mp_sum_cm_nc
      !
      !------------------------------------------------------------------------------!

      SUBROUTINE mp_sum_cmm(msg, res, gid)
        IMPLICIT NONE
        COMPLEX (DP), INTENT (IN) :: msg(:,:)
        COMPLEX (DP), INTENT (OUT) :: res(:,:)
        INTEGER, INTENT (IN) :: gid
#if defined(__MPI)
        INTEGER :: msglen
        msglen = size(msg)
        CALL reduce_base_real_to( 2 * msglen, msg, res, gid, -1 )
#else
        res = msg
#endif
      END SUBROUTINE mp_sum_cmm

      !
      !------------------------------------------------------------------------------!
      !
      ! Carlo Cavazzoni
      !
      SUBROUTINE mp_sum_ct(msg,gid)
        IMPLICIT NONE
        COMPLEX (DP), INTENT (INOUT) :: msg(:,:,:)
        INTEGER, INTENT(IN) :: gid
#if defined(__MPI)
        INTEGER :: msglen
        msglen = SIZE(msg)
        CALL reduce_base_real( 2 * msglen, msg, gid, -1 )
#endif
      END SUBROUTINE mp_sum_ct

      !
      !------------------------------------------------------------------------------!
      !
      ! Carlo Cavazzoni
      !
      SUBROUTINE mp_sum_c4d(msg,gid)
        IMPLICIT NONE
        COMPLEX (DP), INTENT (INOUT) :: msg(:,:,:,:)
        INTEGER, INTENT(IN) :: gid
#if defined(__MPI)
        INTEGER :: msglen
        msglen = size(msg)
        CALL reduce_base_real( 2 * msglen, msg, gid, -1 )
#endif
      END SUBROUTINE mp_sum_c4d
      !
      !------------------------------------------------------------------------------!
      !
      ! Carlo Cavazzoni
      !
      SUBROUTINE mp_sum_c5d(msg,gid)
        IMPLICIT NONE
        COMPLEX (DP), INTENT (INOUT) :: msg(:,:,:,:,:)
        INTEGER, INTENT(IN) :: gid
#if defined(__MPI)
        INTEGER :: msglen
        msglen = size(msg)
        CALL reduce_base_real( 2 * msglen, msg, gid, -1 )
#endif
      END SUBROUTINE mp_sum_c5d

      !------------------------------------------------------------------------------!
      !
      ! Carlo Cavazzoni
      !
      SUBROUTINE mp_sum_r5d(msg,gid)
        IMPLICIT NONE
        REAL (DP), INTENT (INOUT) :: msg(:,:,:,:,:)
        INTEGER, INTENT(IN) :: gid
#if defined(__MPI)
        INTEGER :: msglen
        msglen = size(msg)
        CALL reduce_base_real( msglen, msg, gid, -1 )
#endif
      END SUBROUTINE mp_sum_r5d

      SUBROUTINE mp_sum_r6d(msg,gid)
        IMPLICIT NONE
        REAL (DP), INTENT (INOUT) :: msg(:,:,:,:,:,:)
        INTEGER, INTENT(IN) :: gid
#if defined(__MPI)
        INTEGER :: msglen
        msglen = size(msg)
        CALL reduce_base_real( msglen, msg, gid, -1 )
#endif
      END SUBROUTINE mp_sum_r6d

      !
      !------------------------------------------------------------------------------!
      !
      ! Carlo Cavazzoni
      !
      SUBROUTINE mp_sum_c6d(msg,gid)
        IMPLICIT NONE
        COMPLEX (DP), INTENT (INOUT) :: msg(:,:,:,:,:,:)
        INTEGER, INTENT(IN) :: gid
#if defined(__MPI)
        INTEGER :: msglen
        msglen = size(msg)
        CALL reduce_base_real( 2 * msglen, msg, gid, -1 )
#endif
      END SUBROUTINE mp_sum_c6d

      !------------------------------------------------------------------------------!
      SUBROUTINE mp_max_i(msg,gid)
        IMPLICIT NONE
        INTEGER, INTENT (INOUT) :: msg
        INTEGER, INTENT(IN) :: gid
#if defined(__MPI)
        INTEGER :: msglen
        msglen = 1
        CALL parallel_max_integer( msglen, msg, gid, -1 )
#endif
      END SUBROUTINE mp_max_i
      !
      !------------------------------------------------------------------------------!
      !
      !..mp_max_iv
      !..Carlo Cavazzoni
      !
      SUBROUTINE mp_max_iv(msg,gid)
        IMPLICIT NONE
        INTEGER, INTENT (INOUT) :: msg(:)
        INTEGER, INTENT(IN) :: gid
#if defined(__MPI)
        INTEGER :: msglen
        msglen = size(msg)
        CALL parallel_max_integer( msglen, msg, gid, -1 )
#endif
      END SUBROUTINE mp_max_iv
      !
      !----------------------------------------------------------------------

      SUBROUTINE mp_max_r(msg,gid)
        IMPLICIT NONE
        REAL (DP), INTENT (INOUT) :: msg
        INTEGER, INTENT(IN) :: gid
#if defined(__MPI)
        INTEGER :: msglen
        msglen = 1
        CALL parallel_max_real( msglen, msg, gid, -1 )
#endif
      END SUBROUTINE mp_max_r
      !
      !------------------------------------------------------------------------------!
      SUBROUTINE mp_max_rv(msg,gid)
        IMPLICIT NONE
        REAL (DP), INTENT (INOUT) :: msg(:)
        INTEGER, INTENT(IN) :: gid
#if defined(__MPI)
        INTEGER :: msglen
        msglen = size(msg)
        CALL parallel_max_real( msglen, msg, gid, -1 )
#endif
      END SUBROUTINE mp_max_rv
      !------------------------------------------------------------------------------!
      SUBROUTINE mp_min_i(msg,gid)
        IMPLICIT NONE
        INTEGER, INTENT (INOUT) :: msg
        INTEGER, INTENT(IN) :: gid
#if defined(__MPI)
        INTEGER :: msglen
        msglen = 1
        CALL parallel_min_integer( msglen, msg, gid, -1 )
#endif
      END SUBROUTINE mp_min_i
      !------------------------------------------------------------------------------!
      SUBROUTINE mp_min_iv(msg,gid)
        IMPLICIT NONE
        INTEGER, INTENT (INOUT) :: msg(:)
        INTEGER, INTENT(IN) :: gid
#if defined(__MPI)
        INTEGER :: msglen
        msglen = SIZE(msg)
        CALL parallel_min_integer( msglen, msg, gid, -1 )
#endif
      END SUBROUTINE mp_min_iv
      !------------------------------------------------------------------------------!
      SUBROUTINE mp_min_r(msg,gid)
        IMPLICIT NONE
        REAL (DP), INTENT (INOUT) :: msg
        INTEGER, INTENT(IN) :: gid
#if defined(__MPI)
        INTEGER :: msglen
        msglen = 1
        CALL parallel_min_real( msglen, msg, gid, -1 )
#endif
      END SUBROUTINE mp_min_r
      !
      !------------------------------------------------------------------------------!
      SUBROUTINE mp_min_rv(msg,gid)
        IMPLICIT NONE
        REAL (DP), INTENT (INOUT) :: msg(:)
        INTEGER, INTENT(IN) :: gid
#if defined(__MPI)
        INTEGER :: msglen
        msglen = size(msg)
        CALL parallel_min_real( msglen, msg, gid, -1 )
#endif
      END SUBROUTINE mp_min_rv

      !------------------------------------------------------------------------------!

      SUBROUTINE mp_barrier(gid)
        USE parallel_include
        IMPLICIT NONE
        INTEGER, INTENT(IN) :: gid
        INTEGER :: ierr
#if defined(__MPI)
        CALL MPI_BARRIER(gid,IERR)
        IF (ierr/=0) CALL mp_stop( 8066 )
#endif
      END SUBROUTINE mp_barrier

      !------------------------------------------------------------------------------!
      !.. Carlo Cavazzoni
      !..mp_rank
      FUNCTION mp_rank( comm )
        USE parallel_include
        IMPLICIT NONE
        INTEGER :: mp_rank
        INTEGER, INTENT(IN) :: comm
        INTEGER :: ierr, taskid

        ierr = 0
        taskid = 0
#if defined(__MPI)
        CALL mpi_comm_rank(comm,taskid,ierr)
        IF (ierr/=0) CALL mp_stop( 8067 )
#endif
        mp_rank = taskid
      END FUNCTION mp_rank

      !------------------------------------------------------------------------------!
      !.. Carlo Cavazzoni
      !..mp_size
      FUNCTION mp_size( comm )
        USE parallel_include
        IMPLICIT NONE
        INTEGER :: mp_size
        INTEGER, INTENT(IN) :: comm
        INTEGER :: ierr, numtask

        ierr = 0
        numtask = 1
#if defined(__MPI)
        CALL mpi_comm_size(comm,numtask,ierr)
        IF (ierr/=0) CALL mp_stop( 8068 )
#endif
        mp_size = numtask
      END FUNCTION mp_size

      SUBROUTINE mp_report
        INTEGER :: i
        WRITE( stdout, *)
#if defined(__MPI)
# if defined (__MP_STAT)
        WRITE( stdout, 20 )
20      FORMAT(3X,'please use an MPI profiler to analyze communications ')
# endif
#else
        WRITE( stdout, *)
#endif
        RETURN
      END SUBROUTINE mp_report

      !------------------------------------------------------------------------------!
      !..mp_gatherv_rv
      !..Carlo Cavazzoni

      SUBROUTINE mp_gatherv_rv( mydata, alldata, recvcount, displs, root, gid)
        USE parallel_include
        IMPLICIT NONE
        REAL(DP) :: mydata(:)
        REAL(DP) :: alldata(:)
        INTEGER, INTENT(IN) :: recvcount(:), displs(:), root
        INTEGER, INTENT(IN) :: gid
        INTEGER :: group
        INTEGER :: ierr, npe, myid

#if defined (__MPI)
        group = gid
        CALL mpi_comm_size( group, npe, ierr )
        IF (ierr/=0) CALL mp_stop( 8069 )
        CALL mpi_comm_rank( group, myid, ierr )
        IF (ierr/=0) CALL mp_stop( 8070 )
        !
        IF ( SIZE( recvcount ) < npe .OR. SIZE( displs ) < npe ) CALL mp_stop( 8071 )
        IF ( myid == root ) THEN
           IF ( SIZE( alldata ) < displs( npe ) + recvcount( npe ) ) CALL mp_stop( 8072 )
        END IF
        IF ( SIZE( mydata ) < recvcount( myid + 1 ) ) CALL mp_stop( 8073 )
        !
        CALL MPI_GATHERV( mydata, recvcount( myid + 1 ), MPI_DOUBLE_PRECISION, &
                          alldata, recvcount, displs, MPI_DOUBLE_PRECISION, root, group, ierr )
        IF (ierr/=0) CALL mp_stop( 8074 )
#else
        IF ( SIZE( alldata ) < recvcount( 1 ) ) CALL mp_stop( 8075 )
        IF ( SIZE( mydata ) < recvcount( 1 ) ) CALL mp_stop( 8076 )
        !
        alldata( 1:recvcount( 1 ) ) = mydata( 1:recvcount( 1 ) )
#endif
        RETURN
      END SUBROUTINE mp_gatherv_rv

      !------------------------------------------------------------------------------!
      !..mp_gatherv_cv
      !..Carlo Cavazzoni

      SUBROUTINE mp_gatherv_cv( mydata, alldata, recvcount, displs, root, gid)
        USE parallel_include
        IMPLICIT NONE
        COMPLEX(DP) :: mydata(:)
        COMPLEX(DP) :: alldata(:)
        INTEGER, INTENT(IN) :: recvcount(:), displs(:), root
        INTEGER, INTENT(IN) :: gid
        INTEGER :: group
        INTEGER :: ierr, npe, myid

#if defined (__MPI)
        group = gid
        CALL mpi_comm_size( group, npe, ierr )
        IF (ierr/=0) CALL mp_stop( 8069 )
        CALL mpi_comm_rank( group, myid, ierr )
        IF (ierr/=0) CALL mp_stop( 8070 )
        !
        IF ( SIZE( recvcount ) < npe .OR. SIZE( displs ) < npe ) CALL mp_stop( 8071 )
        IF ( myid == root ) THEN
           IF ( SIZE( alldata ) < displs( npe ) + recvcount( npe ) ) CALL mp_stop( 8072 )
        END IF
        IF ( SIZE( mydata ) < recvcount( myid + 1 ) ) CALL mp_stop( 8073 )
        !
        CALL MPI_GATHERV( mydata, recvcount( myid + 1 ), MPI_DOUBLE_COMPLEX, &
                          alldata, recvcount, displs, MPI_DOUBLE_COMPLEX, root, group, ierr )
        IF (ierr/=0) CALL mp_stop( 8074 )
#else
        IF ( SIZE( alldata ) < recvcount( 1 ) ) CALL mp_stop( 8075 )
        IF ( SIZE( mydata ) < recvcount( 1 ) ) CALL mp_stop( 8076 )
        !
        alldata( 1:recvcount( 1 ) ) = mydata( 1:recvcount( 1 ) )
#endif
        RETURN
      END SUBROUTINE mp_gatherv_cv

      !------------------------------------------------------------------------------!
      !..mp_gatherv_iv
      !..Carlo Cavazzoni

      SUBROUTINE mp_gatherv_iv( mydata, alldata, recvcount, displs, root, gid)
        USE parallel_include
        IMPLICIT NONE
        INTEGER :: mydata(:)
        INTEGER :: alldata(:)
        INTEGER, INTENT(IN) :: recvcount(:), displs(:), root
        INTEGER, INTENT(IN) :: gid
        INTEGER :: group
        INTEGER :: ierr, npe, myid

#if defined (__MPI)
        group = gid
        CALL mpi_comm_size( group, npe, ierr )
        IF (ierr/=0) CALL mp_stop( 8069 )
        CALL mpi_comm_rank( group, myid, ierr )
        IF (ierr/=0) CALL mp_stop( 8070 )
        !
        IF ( SIZE( recvcount ) < npe .OR. SIZE( displs ) < npe ) CALL mp_stop( 8071 )
        IF ( myid == root ) THEN
           IF ( SIZE( alldata ) < displs( npe ) + recvcount( npe ) ) CALL mp_stop( 8072 )
        END IF
        IF ( SIZE( mydata ) < recvcount( myid + 1 ) ) CALL mp_stop( 8073 )
        !
        CALL MPI_GATHERV( mydata, recvcount( myid + 1 ), MPI_INTEGER, &
                          alldata, recvcount, displs, MPI_INTEGER, root, group, ierr )
        IF (ierr/=0) CALL mp_stop( 8074 )
#else
        IF ( SIZE( alldata ) < recvcount( 1 ) ) CALL mp_stop( 8075 )
        IF ( SIZE( mydata ) < recvcount( 1 ) ) CALL mp_stop( 8076 )
        !
        alldata( 1:recvcount( 1 ) ) = mydata( 1:recvcount( 1 ) )
#endif
        RETURN
      END SUBROUTINE mp_gatherv_iv

      !------------------------------------------------------------------------------!
      !..mp_gatherv_rm
      !..Carlo Cavazzoni

      SUBROUTINE mp_gatherv_rm( mydata, alldata, recvcount, displs, root, gid)
        USE parallel_include
        IMPLICIT NONE
        REAL(DP) :: mydata(:,:) ! Warning: the first dimension is assumed constant across tasks!
        REAL(DP) :: alldata(:,:)
        INTEGER, INTENT(IN) :: recvcount(:), displs(:), root
        INTEGER, INTENT(IN) :: gid
        INTEGER :: group
        INTEGER :: ierr, npe, myid, nsiz
        INTEGER, ALLOCATABLE :: nrecv(:), ndisp(:)

#if defined (__MPI)
        group = gid
        CALL mpi_comm_size( group, npe, ierr )
        IF (ierr/=0) CALL mp_stop( 8069 )
        CALL mpi_comm_rank( group, myid, ierr )
        IF (ierr/=0) CALL mp_stop( 8070 )
        !
        IF ( SIZE( recvcount ) < npe .OR. SIZE( displs ) < npe ) CALL mp_stop( 8071 )
        IF ( myid == root ) THEN
           IF ( SIZE( alldata, 2 ) < displs( npe ) + recvcount( npe ) ) CALL mp_stop( 8072 )
           IF ( SIZE( alldata, 1 ) /= SIZE( mydata, 1 ) ) CALL mp_stop( 8072 )
        END IF
        IF ( SIZE( mydata, 2 ) < recvcount( myid + 1 ) ) CALL mp_stop( 8073 )
        !
        ALLOCATE( nrecv( npe ), ndisp( npe ) )
        !
        ! counts and displacements are given in columns; rescale them to
        ! elements using the (constant) first dimension
        nrecv( 1:npe ) = recvcount( 1:npe ) * SIZE( mydata, 1 )
        ndisp( 1:npe ) = displs( 1:npe ) * SIZE( mydata, 1 )
        !
        CALL MPI_GATHERV( mydata, nrecv( myid + 1 ), MPI_DOUBLE_PRECISION, &
                          alldata, nrecv, ndisp, MPI_DOUBLE_PRECISION, root, group, ierr )
        IF (ierr/=0) CALL mp_stop( 8074 )
        !
        DEALLOCATE( nrecv, ndisp )
        !
#else
        IF ( SIZE( alldata, 1 ) /= SIZE( mydata, 1 ) ) CALL mp_stop( 8075 )
        IF ( SIZE( alldata, 2 ) < recvcount( 1 ) ) CALL mp_stop( 8075 )
        IF ( SIZE( mydata, 2 ) < recvcount( 1 ) ) CALL mp_stop( 8076 )
        !
        alldata( :, 1:recvcount( 1 ) ) = mydata( :, 1:recvcount( 1 ) )
#endif
        RETURN
      END SUBROUTINE mp_gatherv_rm

      !------------------------------------------------------------------------------!
      !..mp_gatherv_im
      !..Carlo Cavazzoni

      SUBROUTINE mp_gatherv_im( mydata, alldata, recvcount, displs, root, gid)
        USE parallel_include
        IMPLICIT NONE
        INTEGER :: mydata(:,:) ! Warning: the first dimension is assumed constant across tasks!
        INTEGER :: alldata(:,:)
        INTEGER, INTENT(IN) :: recvcount(:), displs(:), root
        INTEGER, INTENT(IN) :: gid
        INTEGER :: group
        INTEGER :: ierr, npe, myid, nsiz
        INTEGER, ALLOCATABLE :: nrecv(:), ndisp(:)

#if defined (__MPI)
        group = gid
        CALL mpi_comm_size( group, npe, ierr )
        IF (ierr/=0) CALL mp_stop( 8069 )
        CALL mpi_comm_rank( group, myid, ierr )
        IF (ierr/=0) CALL mp_stop( 8070 )
        !
        IF ( SIZE( recvcount ) < npe .OR. SIZE( displs ) < npe ) CALL mp_stop( 8071 )
        IF ( myid == root ) THEN
           IF ( SIZE( alldata, 2 ) < displs( npe ) + recvcount( npe ) ) CALL mp_stop( 8072 )
           IF ( SIZE( alldata, 1 ) /= SIZE( mydata, 1 ) ) CALL mp_stop( 8072 )
        END IF
        IF ( SIZE( mydata, 2 ) < recvcount( myid + 1 ) ) CALL mp_stop( 8073 )
        !
        ALLOCATE( nrecv( npe ), ndisp( npe ) )
        !
        ! counts and displacements are given in columns; rescale them to
        ! elements using the (constant) first dimension
        nrecv( 1:npe ) = recvcount( 1:npe ) * SIZE( mydata, 1 )
        ndisp( 1:npe ) = displs( 1:npe ) * SIZE( mydata, 1 )
        !
        CALL MPI_GATHERV( mydata, nrecv( myid + 1 ), MPI_INTEGER, &
                          alldata, nrecv, ndisp, MPI_INTEGER, root, group, ierr )
        IF (ierr/=0) CALL mp_stop( 8074 )
        !
        DEALLOCATE( nrecv, ndisp )
        !
#else
        IF ( SIZE( alldata, 1 ) /= SIZE( mydata, 1 ) ) CALL mp_stop( 8075 )
        IF ( SIZE( alldata, 2 ) < recvcount( 1 ) ) CALL mp_stop( 8075 )
        IF ( SIZE( mydata, 2 ) < recvcount( 1 ) ) CALL mp_stop( 8076 )
        !
        alldata( :, 1:recvcount( 1 ) ) = mydata( :, 1:recvcount( 1 ) )
#endif
        RETURN
      END SUBROUTINE mp_gatherv_im

      !------------------------------------------------------------------------------!
      !..mp_gatherv_inplace_cplx_array
      !..Ye Luo

      SUBROUTINE mp_gatherv_inplace_cplx_array(alldata, my_column_type, recvcount, displs, root, gid)
        USE parallel_include
        IMPLICIT NONE
        COMPLEX(DP) :: alldata(:,:)
        INTEGER, INTENT(IN) :: my_column_type
        INTEGER, INTENT(IN) :: recvcount(:), displs(:)
        INTEGER, INTENT(IN) :: root, gid
        INTEGER :: ierr, npe, myid

#if defined (__MPI)
        CALL mpi_comm_size( gid, npe, ierr )
        IF (ierr/=0) CALL mp_stop( 8069 )
        CALL mpi_comm_rank( gid, myid, ierr )
        IF (ierr/=0) CALL mp_stop( 8070 )
        !
        IF ( SIZE( recvcount ) < npe .OR. SIZE( displs ) < npe ) CALL mp_stop( 8071 )
        !
        IF (myid==root) THEN
           CALL MPI_GATHERV( MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, &
                             alldata, recvcount, displs, my_column_type, root, gid, ierr )
        ELSE
           CALL MPI_GATHERV( alldata(1,displs(myid+1)+1), recvcount(myid+1), my_column_type, &
                             MPI_IN_PLACE, recvcount, displs, MPI_DATATYPE_NULL, root, gid, ierr )
        ENDIF
        IF (ierr/=0) CALL mp_stop( 8074 )
#endif
        RETURN
      END SUBROUTINE mp_gatherv_inplace_cplx_array
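
      !
      ! Usage sketch for the in-place gather above. The routine below is
      ! hypothetical (not part of the module's API) and only illustrates how a
      ! full-column datatype built by mp_type_create_cplx_column_section
      ! (defined later in this module) combines with
      ! mp_gatherv_inplace_cplx_array; here recvcount and displs are assumed
      ! to be given in units of columns.
      !
      SUBROUTINE mp_example_gather_columns( alldata, recvcount, displs, root, gid )
        IMPLICIT NONE
        COMPLEX(DP) :: alldata(:,:)
        INTEGER, INTENT(IN) :: recvcount(:), displs(:), root, gid
        INTEGER :: coltype
        ! build a datatype spanning one full column of alldata
        ! (start=0, length=stride=number of rows)
        CALL mp_type_create_cplx_column_section( alldata(1,1), 0, &
             SIZE(alldata,1), SIZE(alldata,1), coltype )
        ! each task sends its own block of columns; the root receives in place
        CALL mp_gatherv_inplace_cplx_array( alldata, coltype, recvcount, &
             displs, root, gid )
        CALL mp_type_free( coltype )
      END SUBROUTINE mp_example_gather_columns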

      !------------------------------------------------------------------------------!
      !..mp_allgatherv_inplace_cplx_array
      !..Ye Luo

      SUBROUTINE mp_allgatherv_inplace_cplx_array(alldata, my_element_type, recvcount, displs, gid)
        USE parallel_include
        IMPLICIT NONE
        COMPLEX(DP) :: alldata(:,:)
        INTEGER, INTENT(IN) :: my_element_type
        INTEGER, INTENT(IN) :: recvcount(:), displs(:)
        INTEGER, INTENT(IN) :: gid
        INTEGER :: ierr, npe, myid

#if defined (__MPI)
        CALL mpi_comm_size( gid, npe, ierr )
        IF (ierr/=0) CALL mp_stop( 8069 )
        CALL mpi_comm_rank( gid, myid, ierr )
        IF (ierr/=0) CALL mp_stop( 8070 )
        !
        IF ( SIZE( recvcount ) < npe .OR. SIZE( displs ) < npe ) CALL mp_stop( 8071 )
        !
        CALL MPI_ALLGATHERV( MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, &
                             alldata, recvcount, displs, my_element_type, gid, ierr )
        IF (ierr/=0) CALL mp_stop( 8074 )
#endif
        RETURN
      END SUBROUTINE mp_allgatherv_inplace_cplx_array

      !.. SdG added 16/08/19
      SUBROUTINE mp_allgatherv_inplace_real_array(alldata, my_element_type, recvcount, displs, gid)
        USE parallel_include
        IMPLICIT NONE
        REAL(DP) :: alldata(:,:)
        INTEGER, INTENT(IN) :: my_element_type
        INTEGER, INTENT(IN) :: recvcount(:), displs(:)
        INTEGER, INTENT(IN) :: gid
        INTEGER :: ierr, npe, myid

#if defined (__MPI)
        CALL mpi_comm_size( gid, npe, ierr )
        IF (ierr/=0) CALL mp_stop( 8069 )
        CALL mpi_comm_rank( gid, myid, ierr )
        IF (ierr/=0) CALL mp_stop( 8070 )
        !
        IF ( SIZE( recvcount ) < npe .OR. SIZE( displs ) < npe ) CALL mp_stop( 8071 )
        !
        CALL MPI_ALLGATHERV( MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, &
                             alldata, recvcount, displs, my_element_type, gid, ierr )
        IF (ierr/=0) CALL mp_stop( 8074 )
#endif
        RETURN
      END SUBROUTINE mp_allgatherv_inplace_real_array

      !------------------------------------------------------------------------------!

      SUBROUTINE mp_set_displs( recvcount, displs, ntot, nproc )
        ! Given the number of elements on each processor (recvcount), this
        ! subroutine sets the correct offsets (displs) to collect them on a
        ! single array with contiguous elements
        IMPLICIT NONE
        INTEGER, INTENT(IN) :: recvcount(:) ! number of elements on each processor
        INTEGER, INTENT(OUT) :: displs(:)   ! offsets/displacements
        INTEGER, INTENT(OUT) :: ntot
        INTEGER, INTENT(IN) :: nproc
        INTEGER :: i

        displs( 1 ) = 0
        !
#if defined (__MPI)
        IF( nproc < 1 ) CALL mp_stop( 8090 )
        DO i = 2, nproc
           displs( i ) = displs( i - 1 ) + recvcount( i - 1 )
        END DO
        ntot = displs( nproc ) + recvcount( nproc )
#else
        ntot = recvcount( 1 )
#endif
        RETURN
      END SUBROUTINE mp_set_displs
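
      !
      ! Usage sketch for mp_set_displs. The routine below is hypothetical,
      ! written only to illustrate the typical pattern: every task publishes
      ! the length of its local chunk, the offsets are derived with
      ! mp_set_displs, and the chunks are collected contiguously on task 0
      ! with mp_gatherv_rv.
      !
      SUBROUTINE mp_example_gatherv( mylen, mydata, gid )
        IMPLICIT NONE
        INTEGER, INTENT(IN) :: mylen       ! number of local elements
        REAL(DP) :: mydata(:)              ! local data, at least mylen elements
        INTEGER, INTENT(IN) :: gid         ! communicator
        INTEGER :: nproc, ntot
        INTEGER, ALLOCATABLE :: recvcount(:), displs(:)
        REAL(DP), ALLOCATABLE :: alldata(:)
        !
        nproc = mp_size( gid )
        ALLOCATE( recvcount( nproc ), displs( nproc ) )
        ! collect the local lengths from all tasks
        recvcount = 0
        recvcount( mp_rank( gid ) + 1 ) = mylen
        CALL mp_sum( recvcount, gid )
        ! offsets for a contiguous receive buffer of ntot elements
        CALL mp_set_displs( recvcount, displs, ntot, nproc )
        ALLOCATE( alldata( ntot ) )
        CALL mp_gatherv_rv( mydata(1:mylen), alldata, recvcount, displs, 0, gid )
        ! ... use alldata on task 0 ...
        DEALLOCATE( alldata, recvcount, displs )
      END SUBROUTINE mp_example_gatherv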

      !------------------------------------------------------------------------------!

      SUBROUTINE mp_alltoall_c3d( sndbuf, rcvbuf, gid )
        USE parallel_include
        IMPLICIT NONE
        COMPLEX(DP) :: sndbuf( :, :, : )
        COMPLEX(DP) :: rcvbuf( :, :, : )
        INTEGER, INTENT(IN) :: gid
        INTEGER :: nsiz, group, ierr, npe

#if defined (__MPI)
        group = gid
        CALL mpi_comm_size( group, npe, ierr )
        IF (ierr/=0) CALL mp_stop( 8069 )

        IF ( SIZE( sndbuf, 3 ) < npe ) CALL mp_stop( 8069 )
        IF ( SIZE( rcvbuf, 3 ) < npe ) CALL mp_stop( 8069 )

        nsiz = SIZE( sndbuf, 1 ) * SIZE( sndbuf, 2 )

        CALL MPI_ALLTOALL( sndbuf, nsiz, MPI_DOUBLE_COMPLEX, &
                           rcvbuf, nsiz, MPI_DOUBLE_COMPLEX, group, ierr )

        IF (ierr/=0) CALL mp_stop( 8074 )
#else
        rcvbuf = sndbuf
#endif
        RETURN
      END SUBROUTINE mp_alltoall_c3d

      !------------------------------------------------------------------------------!

      SUBROUTINE mp_alltoall_i3d( sndbuf, rcvbuf, gid )
        USE parallel_include
        IMPLICIT NONE
        INTEGER :: sndbuf( :, :, : )
        INTEGER :: rcvbuf( :, :, : )
        INTEGER, INTENT(IN) :: gid
        INTEGER :: nsiz, group, ierr, npe

#if defined (__MPI)
        group = gid
        CALL mpi_comm_size( group, npe, ierr )
        IF (ierr/=0) CALL mp_stop( 8069 )

        IF ( SIZE( sndbuf, 3 ) < npe ) CALL mp_stop( 8069 )
        IF ( SIZE( rcvbuf, 3 ) < npe ) CALL mp_stop( 8069 )

        nsiz = SIZE( sndbuf, 1 ) * SIZE( sndbuf, 2 )

        CALL MPI_ALLTOALL( sndbuf, nsiz, MPI_INTEGER, &
                           rcvbuf, nsiz, MPI_INTEGER, group, ierr )

        IF (ierr/=0) CALL mp_stop( 8074 )
#else
        rcvbuf = sndbuf
#endif
        RETURN
      END SUBROUTINE mp_alltoall_i3d

      SUBROUTINE mp_circular_shift_left_i0( buf, itag, gid )
        USE parallel_include
        IMPLICIT NONE
        INTEGER :: buf
        INTEGER, INTENT(IN) :: itag
        INTEGER, INTENT(IN) :: gid
        INTEGER :: nsiz, group, ierr, npe, sour, dest, mype

#if defined (__MPI)
        INTEGER :: istatus( mpi_status_size )
        !
        group = gid
        !
        CALL mpi_comm_size( group, npe, ierr )
        IF (ierr/=0) CALL mp_stop( 8100 )
        CALL mpi_comm_rank( group, mype, ierr )
        IF (ierr/=0) CALL mp_stop( 8101 )
        !
        sour = mype + 1
        IF( sour == npe ) sour = 0
        dest = mype - 1
        IF( dest == -1 ) dest = npe - 1
        !
        CALL MPI_Sendrecv_replace( buf, 1, MPI_INTEGER, &
             dest, itag, sour, itag, group, istatus, ierr)
        !
        IF (ierr/=0) CALL mp_stop( 8102 )
        !
#else
        ! do nothing
#endif
        RETURN
      END SUBROUTINE mp_circular_shift_left_i0

      SUBROUTINE mp_circular_shift_left_i1( buf, itag, gid )
        USE parallel_include
        IMPLICIT NONE
        INTEGER :: buf(:)
        INTEGER, INTENT(IN) :: itag
        INTEGER, INTENT(IN) :: gid
        INTEGER :: nsiz, group, ierr, npe, sour, dest, mype

#if defined (__MPI)
        INTEGER :: istatus( mpi_status_size )
        !
        group = gid
        !
        CALL mpi_comm_size( group, npe, ierr )
        IF (ierr/=0) CALL mp_stop( 8100 )
        CALL mpi_comm_rank( group, mype, ierr )
        IF (ierr/=0) CALL mp_stop( 8101 )
        !
        sour = mype + 1
        IF( sour == npe ) sour = 0
        dest = mype - 1
        IF( dest == -1 ) dest = npe - 1
        !
        CALL MPI_Sendrecv_replace( buf, SIZE(buf), MPI_INTEGER, &
             dest, itag, sour, itag, group, istatus, ierr)
        !
        IF (ierr/=0) CALL mp_stop( 8102 )
        !
#else
        ! do nothing
#endif
        RETURN
      END SUBROUTINE mp_circular_shift_left_i1

      SUBROUTINE mp_circular_shift_left_i2( buf, itag, gid )
        USE parallel_include
        IMPLICIT NONE
        INTEGER :: buf(:,:)
        INTEGER, INTENT(IN) :: itag
        INTEGER, INTENT(IN) :: gid
        INTEGER :: nsiz, group, ierr, npe, sour, dest, mype

#if defined (__MPI)
        INTEGER :: istatus( mpi_status_size )
        !
        group = gid
        !
        CALL mpi_comm_size( group, npe, ierr )
        IF (ierr/=0) CALL mp_stop( 8100 )
        CALL mpi_comm_rank( group, mype, ierr )
        IF (ierr/=0) CALL mp_stop( 8101 )
        !
        sour = mype + 1
        IF( sour == npe ) sour = 0
        dest = mype - 1
        IF( dest == -1 ) dest = npe - 1
        !
        CALL MPI_Sendrecv_replace( buf, SIZE(buf), MPI_INTEGER, &
             dest, itag, sour, itag, group, istatus, ierr)
        !
        IF (ierr/=0) CALL mp_stop( 8102 )
        !
#else
        ! do nothing
#endif
        RETURN
      END SUBROUTINE mp_circular_shift_left_i2

      SUBROUTINE mp_circular_shift_left_r2d( buf, itag, gid )
        USE parallel_include
        IMPLICIT NONE
        REAL(DP) :: buf( :, : )
        INTEGER, INTENT(IN) :: itag
        INTEGER, INTENT(IN) :: gid
        INTEGER :: nsiz, group, ierr, npe, sour, dest, mype

#if defined (__MPI)
        INTEGER :: istatus( mpi_status_size )
        !
        group = gid
        !
        CALL mpi_comm_size( group, npe, ierr )
        IF (ierr/=0) CALL mp_stop( 8100 )
        CALL mpi_comm_rank( group, mype, ierr )
        IF (ierr/=0) CALL mp_stop( 8101 )
        !
        sour = mype + 1
        IF( sour == npe ) sour = 0
        dest = mype - 1
        IF( dest == -1 ) dest = npe - 1
        !
        CALL MPI_Sendrecv_replace( buf, SIZE(buf), MPI_DOUBLE_PRECISION, &
             dest, itag, sour, itag, group, istatus, ierr)
        !
        IF (ierr/=0) CALL mp_stop( 8102 )
        !
#else
        ! do nothing
#endif
        RETURN
      END SUBROUTINE mp_circular_shift_left_r2d

      SUBROUTINE mp_circular_shift_left_c2d( buf, itag, gid )
        USE parallel_include
        IMPLICIT NONE
        COMPLEX(DP) :: buf( :, : )
        INTEGER, INTENT(IN) :: itag
        INTEGER, INTENT(IN) :: gid
        INTEGER :: nsiz, group, ierr, npe, sour, dest, mype

#if defined (__MPI)
        INTEGER :: istatus( mpi_status_size )
        !
        group = gid
        !
        CALL mpi_comm_size( group, npe, ierr )
        IF (ierr/=0) CALL mp_stop( 8100 )
        CALL mpi_comm_rank( group, mype, ierr )
        IF (ierr/=0) CALL mp_stop( 8101 )
        !
        sour = mype + 1
        IF( sour == npe ) sour = 0
        dest = mype - 1
        IF( dest == -1 ) dest = npe - 1
        !
        CALL MPI_Sendrecv_replace( buf, SIZE(buf), MPI_DOUBLE_COMPLEX, &
             dest, itag, sour, itag, group, istatus, ierr)
        !
        IF (ierr/=0) CALL mp_stop( 8102 )
        !
#else
        ! do nothing
#endif
        RETURN
      END SUBROUTINE mp_circular_shift_left_c2d

      !------------------------------------------------------------------------------!
      !..mp_circular_shift_left_start
      SUBROUTINE mp_circular_shift_left_start_i0( sendbuf, recvbuf, itag, gid, requests)
        USE parallel_include
        IMPLICIT NONE
        INTEGER :: sendbuf, recvbuf
        INTEGER, INTENT(IN) :: itag
        INTEGER, INTENT(IN) :: gid
        INTEGER, INTENT(INOUT) :: requests(2)
        INTEGER :: nsiz, group, ierr, npe, sour, dest, mype

#if defined (__MPI)
        !set null requests
        requests = mpi_request_null

        !communicator
        group = gid
        !
        CALL mpi_comm_size( group, npe, ierr )
        IF (ierr/=0) CALL mp_stop( 8100 )
        CALL mpi_comm_rank( group, mype, ierr )
        IF (ierr/=0) CALL mp_stop( 8101 )
        !
        sour = modulo(mype + 1, npe)
        dest = modulo(mype + npe - 1, npe)
        !
        CALL MPI_Irecv( recvbuf, 1, MPI_INTEGER, &
             sour, itag, group, requests(1), ierr)
        !
        IF (ierr/=0) CALL mp_stop( 8102 )
        !
        CALL MPI_Isend( sendbuf, 1, MPI_INTEGER, &
             dest, itag, group, requests(2), ierr)
        !
        IF (ierr/=0) CALL mp_stop( 8103 )
        !
#else
        recvbuf = sendbuf
#endif
        RETURN
      END SUBROUTINE mp_circular_shift_left_start_i0

      SUBROUTINE mp_circular_shift_left_start_i1( sendbuf, recvbuf, itag, gid, requests)
        USE parallel_include
        IMPLICIT NONE
        INTEGER :: sendbuf( : ), recvbuf( : )
        INTEGER, INTENT(IN) :: itag
        INTEGER, INTENT(IN) :: gid
        INTEGER, INTENT(INOUT) :: requests(2)
        INTEGER :: nsiz, group, ierr, npe, sour, dest, mype

#if defined (__MPI)
        !set null requests
        requests = mpi_request_null

        !communicator
        group = gid
        !
        IF( size(sendbuf)/=size(recvbuf) ) CALL mp_stop(8099)
        CALL mpi_comm_size( group, npe, ierr )
        IF (ierr/=0) CALL mp_stop( 8100 )
        CALL mpi_comm_rank( group, mype, ierr )
        IF (ierr/=0) CALL mp_stop( 8101 )
        !
        sour = modulo(mype + 1, npe)
        dest = modulo(mype + npe - 1, npe)
        !
        CALL MPI_Irecv( recvbuf, size(recvbuf), MPI_INTEGER, &
             sour, itag, group, requests(1), ierr)
        !
        IF (ierr/=0) CALL mp_stop( 8102 )
        !
        CALL MPI_Isend( sendbuf, size(sendbuf), MPI_INTEGER, &
             dest, itag, group, requests(2), ierr)
        !
        IF (ierr/=0) CALL mp_stop( 8103 )
        !
#else
        recvbuf = sendbuf
#endif
        RETURN
      END SUBROUTINE mp_circular_shift_left_start_i1

      SUBROUTINE mp_circular_shift_left_start_i2( sendbuf, recvbuf, itag, gid, requests)
        USE parallel_include
        IMPLICIT NONE
        INTEGER :: sendbuf( :, : ), recvbuf( :, : )
        INTEGER, INTENT(IN) :: itag
        INTEGER, INTENT(IN) :: gid
        INTEGER, INTENT(INOUT) :: requests(2)
        INTEGER :: nsiz, group, ierr, npe, sour, dest, mype

#if defined (__MPI)
        !set null requests
        requests = mpi_request_null

        !communicator
        group = gid
        !
        IF( size(sendbuf)/=size(recvbuf) ) CALL mp_stop(8099)
        CALL mpi_comm_size( group, npe, ierr )
        IF (ierr/=0) CALL mp_stop( 8100 )
        CALL mpi_comm_rank( group, mype, ierr )
        IF (ierr/=0) CALL mp_stop( 8101 )
        !
        sour = modulo(mype + 1, npe)
        dest = modulo(mype + npe - 1, npe)
        !
        CALL MPI_Irecv( recvbuf, size(recvbuf), MPI_INTEGER, &
             sour, itag, group, requests(1), ierr)
        !
        IF (ierr/=0) CALL mp_stop( 8102 )
        !
        CALL MPI_Isend( sendbuf, size(sendbuf), MPI_INTEGER, &
             dest, itag, group, requests(2), ierr)
        !
        IF (ierr/=0) CALL mp_stop( 8103 )
        !
#else
        recvbuf = sendbuf
#endif
        RETURN
      END SUBROUTINE mp_circular_shift_left_start_i2

      SUBROUTINE mp_circular_shift_left_start_r2d( sendbuf, recvbuf, itag, gid, requests)
        USE parallel_include
        IMPLICIT NONE
        REAL(DP) :: sendbuf( :, : ), recvbuf( :, : )
        INTEGER, INTENT(IN) :: itag
        INTEGER, INTENT(IN) :: gid
        INTEGER, INTENT(INOUT) :: requests(2)
        INTEGER :: nsiz, group, ierr, npe, sour, dest, mype

#if defined (__MPI)
        !set null requests
        requests = mpi_request_null

        !communicator
        group = gid
        !
        IF( size(sendbuf)/=size(recvbuf) ) CALL mp_stop(8099)
        CALL mpi_comm_size( group, npe, ierr )
        IF (ierr/=0) CALL mp_stop( 8100 )
        CALL mpi_comm_rank( group, mype, ierr )
        IF (ierr/=0) CALL mp_stop( 8101 )
        !
        sour = modulo(mype + 1, npe)
        dest = modulo(mype + npe - 1, npe)
        !
        CALL MPI_Irecv( recvbuf, size(recvbuf), MPI_DOUBLE_PRECISION, &
             sour, itag, group, requests(1), ierr)
        !
        IF (ierr/=0) CALL mp_stop( 8102 )
        !
        CALL MPI_Isend( sendbuf, size(sendbuf), MPI_DOUBLE_PRECISION, &
             dest, itag, group, requests(2), ierr)
        !
        IF (ierr/=0) CALL mp_stop( 8103 )
        !
#else
        recvbuf = sendbuf
#endif
        RETURN
      END SUBROUTINE mp_circular_shift_left_start_r2d

      SUBROUTINE mp_circular_shift_left_start_c2d( sendbuf, recvbuf, itag, gid, requests)
        USE parallel_include
        IMPLICIT NONE
        COMPLEX(DP) :: sendbuf( :, : ), recvbuf( :, : )
        INTEGER, INTENT(IN) :: itag
        INTEGER, INTENT(IN) :: gid
        INTEGER, INTENT(INOUT) :: requests(2)
        INTEGER :: nsiz, group, ierr, npe, sour, dest, mype

#if defined (__MPI)
        !set null requests
        requests = mpi_request_null

        !communicator
        group = gid
        !
        IF( size(sendbuf)/=size(recvbuf) ) CALL mp_stop(8099)
        CALL mpi_comm_size( group, npe, ierr )
        IF (ierr/=0) CALL mp_stop( 8100 )
        CALL mpi_comm_rank( group, mype, ierr )
        IF (ierr/=0) CALL mp_stop( 8101 )
        !
        sour = modulo(mype + 1, npe)
        dest = modulo(mype + npe - 1, npe)
        !
        CALL MPI_Irecv( recvbuf, size(recvbuf), MPI_DOUBLE_COMPLEX, &
             sour, itag, group, requests(1), ierr)
        !
        IF (ierr/=0) CALL mp_stop( 8102 )
        !
        CALL MPI_Isend( sendbuf, size(sendbuf), MPI_DOUBLE_COMPLEX, &
             dest, itag, group, requests(2), ierr)
        !
        IF (ierr/=0) CALL mp_stop( 8103 )
        !
#else
        recvbuf = sendbuf
#endif
        RETURN
      END SUBROUTINE mp_circular_shift_left_start_c2d
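
      !
      ! Usage sketch for the non-blocking shifts above. The routine below is
      ! hypothetical and assumes mp_waitall (listed among this module's public
      ! procedures) accepts the two-element request array: post the exchange,
      ! overlap it with work that does not touch the buffers, then block on
      ! both requests.
      !
      SUBROUTINE mp_example_shift_overlap( sendbuf, recvbuf, gid )
        IMPLICIT NONE
        REAL(DP) :: sendbuf(:,:), recvbuf(:,:)
        INTEGER, INTENT(IN) :: gid
        INTEGER :: requests(2)
        ! start the circular shift (tag 1 chosen arbitrarily)
        CALL mp_circular_shift_left_start( sendbuf, recvbuf, 1, gid, requests )
        ! ... computation independent of sendbuf/recvbuf goes here ...
        ! wait for both the receive and the send to complete
        CALL mp_waitall( requests )
      END SUBROUTINE mp_example_shift_overlap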
      !
      !
      !------------------------------------------------------------------------------!
      !..mp_count_nodes
      SUBROUTINE mp_count_nodes(num_nodes, color, key, group)
        !
        ! ... This routine counts the number of nodes using
        ! ... MPI_GET_PROCESSOR_NAME in the group specified by `group`.
        ! ... It returns colors and keys to be used in MPI_COMM_SPLIT.
        ! ... When running in parallel, the evaluation of color and key
        ! ... is done by all processors.
        ! ...
        ! ... input:
        ! ...    group      Communicator used to count nodes.
        !
        ! ... output:
        ! ...    num_nodes  Number of unique nodes in the communicator
        ! ...    color      Integer (positive), same for all processes residing on a node.
        ! ...    key        Integer, unique number identifying each process on the same node.
        ! ...
        USE parallel_include
        IMPLICIT NONE
        INTEGER, INTENT (OUT) :: num_nodes
        INTEGER, INTENT (OUT) :: color
        INTEGER, INTENT (OUT) :: key
        INTEGER, INTENT (IN)  :: group
#if defined (__MPI)
        CHARACTER(len=MPI_MAX_PROCESSOR_NAME) :: hostname
        CHARACTER(len=MPI_MAX_PROCESSOR_NAME), ALLOCATABLE :: host_list(:)
#endif

        LOGICAL, ALLOCATABLE :: found_list(:)
        INTEGER, ALLOCATABLE :: color_list(:)
        INTEGER, ALLOCATABLE :: key_list(:)
        !
        INTEGER :: hostname_len, max_hostname_len, numtask, me, ierr
        !
        ! Loop variables
        INTEGER :: i, j, e, s, c, k
        ! ...
        ierr      = 0
        num_nodes = 1
        color     = 1
        key       = 0
        !
#if defined(__MPI)
        !
        CALL MPI_GET_PROCESSOR_NAME(hostname, hostname_len, ierr)
        IF (ierr/=0) CALL mp_stop( 8103 )

        ! find total number of ranks and my rank in communicator
        CALL MPI_COMM_SIZE(group, numtask, ierr)
        IF (ierr/=0) CALL mp_stop( 8104 )
        !
        CALL MPI_COMM_RANK(group, me, ierr)
        IF (ierr/=0) CALL mp_stop( 8105 )
        !
        ALLOCATE(host_list(0:numtask-1))
        !
        host_list(me) = hostname(1:hostname_len)
        !
        ! Each process broadcasts its name to the others
        DO i=0,numtask-1
           CALL MPI_BCAST(host_list(i), MPI_MAX_PROCESSOR_NAME, MPI_CHARACTER,&
                          i, group, ierr)
           IF (ierr/=0) CALL mp_stop( 8106 )
        END DO
        !
        ! Simple algorithm to count unique entries.
        !
        ALLOCATE(found_list(0:numtask-1),color_list(0:numtask-1))
        ALLOCATE(key_list(0:numtask-1))
        found_list(:) = .false.
        color_list(:) = -1
        key_list(:)   = -1
        !
        ! c is the counter for colors
        ! k is the counter for keys
        !
        c = 0
        DO i=0,numtask-1
           ! if found_list(i) is .true., this element has already been
           ! assigned to a node, so skip it.
           IF (found_list(i)) CYCLE
           ! else increment color counter and reset key counter
           c = c + 1; k = 0
           color_list(i) = c
           key_list(i)   = k
           !
           DO j=i+1,numtask-1
              !
              IF ( LLE(host_list(i),host_list(j)) .and. &
                   LGE(host_list(i),host_list(j)) ) THEN
                 ! increment the key, key=0 is the one we are comparing to
                 k = k + 1
                 ! the element must not have been found already
                 IF ( found_list(j) ) CALL mp_stop( 8107 )
                 found_list(j) = .true.
                 color_list(j) = c
                 key_list(j)   = k
              END IF
           END DO
        END DO
        ! Sanity checks
        IF ( MINVAL(color_list) < 0 ) CALL mp_stop( 8108 )
        IF ( MINVAL(key_list)   < 0 ) CALL mp_stop( 8109 )
        !
        color     = color_list(me)
        key       = key_list(me)
        num_nodes = MAXVAL(color_list)
        DEALLOCATE(host_list,found_list,color_list,key_list)
        !
#endif
        RETURN
      END SUBROUTINE mp_count_nodes
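
      !
      ! Usage sketch (hypothetical routine, not part of the API): turn the
      ! color/key pair returned by mp_count_nodes into one communicator per
      ! node by calling MPI_COMM_SPLIT directly; tasks with the same color
      ! (i.e. on the same node) end up in the same communicator, ordered by
      ! key. The error code 8110 is chosen for this example only.
      !
      SUBROUTINE mp_example_node_comm( parent_comm, node_comm )
        USE parallel_include
        IMPLICIT NONE
        INTEGER, INTENT(IN)  :: parent_comm
        INTEGER, INTENT(OUT) :: node_comm
        INTEGER :: num_nodes, color, key, ierr
        !
        node_comm = parent_comm
        CALL mp_count_nodes( num_nodes, color, key, parent_comm )
#if defined(__MPI)
        CALL MPI_COMM_SPLIT( parent_comm, color, key, node_comm, ierr )
        IF (ierr/=0) CALL mp_stop( 8110 )
#endif
      END SUBROUTINE mp_example_node_comm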
      !
      FUNCTION mp_get_comm_null( )
        USE parallel_include
        IMPLICIT NONE
        INTEGER :: mp_get_comm_null
        mp_get_comm_null = MPI_COMM_NULL
      END FUNCTION mp_get_comm_null

      FUNCTION mp_get_comm_self( )
        USE parallel_include
        IMPLICIT NONE
        INTEGER :: mp_get_comm_self
        mp_get_comm_self = MPI_COMM_SELF
      END FUNCTION mp_get_comm_self

      SUBROUTINE mp_type_create_cplx_column_section(dummy, start, length, stride, mytype)
        USE parallel_include
        IMPLICIT NONE
        !
        COMPLEX (DP), INTENT(IN) :: dummy
        INTEGER, INTENT(IN) :: start, length, stride
        INTEGER, INTENT(OUT) :: mytype
        !
#if defined(__MPI)
        INTEGER :: ierr
        !
        CALL MPI_TYPE_CREATE_SUBARRAY(1, [stride], [length], [start], &
                                      MPI_ORDER_FORTRAN, MPI_DOUBLE_COMPLEX, mytype, ierr)
        IF (ierr/=0) CALL mp_stop( 8081 )
        CALL MPI_Type_commit(mytype, ierr)
        IF (ierr/=0) CALL mp_stop( 8082 )
#else
        mytype = 0;
#endif
        !
        RETURN
      END SUBROUTINE mp_type_create_cplx_column_section

      SUBROUTINE mp_type_create_real_column_section(dummy, start, length, stride, mytype)
        USE parallel_include
        IMPLICIT NONE
        !
        REAL (DP), INTENT(IN) :: dummy
        INTEGER, INTENT(IN) :: start, length, stride
        INTEGER, INTENT(OUT) :: mytype
        !
#if defined(__MPI)
        INTEGER :: ierr
        !
        CALL MPI_TYPE_CREATE_SUBARRAY(1, [stride], [length], [start], &
                                      MPI_ORDER_FORTRAN, MPI_DOUBLE_PRECISION, mytype, ierr)
        IF (ierr/=0) CALL mp_stop( 8083 )
        CALL MPI_Type_commit(mytype, ierr)
        IF (ierr/=0) CALL mp_stop( 8084 )
#else
        mytype = 0;
#endif
        !
        RETURN
      END SUBROUTINE mp_type_create_real_column_section

      SUBROUTINE mp_type_create_cplx_row_section(dummy, column_start, column_stride, row_length, mytype)
        USE parallel_include
        IMPLICIT NONE
        !
        COMPLEX (DP), INTENT(IN) :: dummy
        INTEGER, INTENT(IN) :: column_start, column_stride, row_length
        INTEGER, INTENT(OUT) :: mytype
        !
#if defined(__MPI)
        INTEGER :: ierr, temporary
        INTEGER :: strides(2), lengths(2), starts(2)
        INTEGER(KIND=MPI_ADDRESS_KIND) :: lb, extent
        !
        strides(1) = column_stride ; strides(2) = row_length
        lengths(1) = 1 ; lengths(2) = row_length
        starts(1)  = column_start ; starts(2) = 0
        CALL MPI_TYPE_CREATE_SUBARRAY(2, strides, lengths, starts, MPI_ORDER_FORTRAN,&
                                      MPI_DOUBLE_COMPLEX, temporary, ierr)
        IF (ierr/=0) CALL mp_stop( 8085 )
        CALL MPI_TYPE_GET_EXTENT(MPI_DOUBLE_COMPLEX, lb, extent, ierr)
        IF (ierr/=0) CALL mp_stop( 8085 )
        CALL MPI_TYPE_COMMIT(temporary, ierr)
        IF (ierr/=0) CALL mp_stop( 8085 )
        CALL MPI_TYPE_CREATE_RESIZED(temporary, lb, extent, mytype, ierr)
        IF (ierr/=0) CALL mp_stop( 8086 )
        CALL MPI_Type_commit(mytype, ierr)
        IF (ierr/=0) CALL mp_stop( 8086 )
#else
        mytype = 0;
#endif
        !
        RETURN
      END SUBROUTINE mp_type_create_cplx_row_section

      SUBROUTINE mp_type_create_real_row_section(dummy, column_start, column_stride, row_length, mytype)
        USE parallel_include
        IMPLICIT NONE
        !
        REAL (DP), INTENT(IN) :: dummy
        INTEGER, INTENT(IN) :: column_start, column_stride, row_length
        INTEGER, INTENT(OUT) :: mytype
        !
#if defined(__MPI)
        INTEGER :: ierr, temporary
        INTEGER :: strides(2), lengths(2), starts(2)
        INTEGER(KIND=MPI_ADDRESS_KIND) :: lb, extent
        !
        strides(1) = column_stride ; strides(2) = row_length
        lengths(1) = 1 ; lengths(2) = row_length
        starts(1)  = column_start ; starts(2) = 0
        CALL MPI_TYPE_CREATE_SUBARRAY(2, strides, lengths, starts, MPI_ORDER_FORTRAN,&
                                      MPI_DOUBLE_PRECISION, temporary, ierr)
        IF (ierr/=0) CALL mp_stop( 8087 )
        CALL MPI_TYPE_GET_EXTENT(MPI_DOUBLE_PRECISION, lb, extent, ierr)
        IF (ierr/=0) CALL mp_stop( 8087 )
        CALL MPI_TYPE_COMMIT(temporary, ierr)
        IF (ierr/=0) CALL mp_stop( 8087 )
        CALL MPI_TYPE_CREATE_RESIZED(temporary, lb, extent, mytype, ierr)
        IF (ierr/=0) CALL mp_stop( 8088 )
        CALL MPI_Type_commit(mytype, ierr)
        IF (ierr/=0) CALL mp_stop( 8088 )
#else
        mytype = 0;
#endif
        !
        RETURN
      END SUBROUTINE mp_type_create_real_row_section

      SUBROUTINE mp_type_free(mytype)
        USE parallel_include
        IMPLICIT NONE
        INTEGER :: mytype, ierr
        !
#if defined(__MPI)
        CALL MPI_TYPE_FREE(mytype, ierr)
        IF (ierr/=0) CALL mp_stop( 8089 )
#endif
        !
        RETURN
      END SUBROUTINE mp_type_free
      !------------------------------------------------------------------------------!
      ! GPU specific subroutines (Pietro Bonfa')
      !------------------------------------------------------------------------------!
      ! Before hacking on the CUDA part remember that:
      !
      ! 1. all mp_* interfaces should be blocking with respect to both MPI and CUDA.
      !    MPI will only wait for completion on the default stream, therefore device
      !    synchronization must be enforced.
      ! 2. Host -> device memory copies of a memory block of 64 KB or less are
      !    asynchronous in the sense that they may return before the data is actually
      !    available on the GPU. However, the user is still free to change the buffer
      !    as soon as those calls return with no ill effects.
      !    (https://devtalk.nvidia.com/default/topic/471866/cuda-programming-and-performance/host-device-memory-copies-up-to-64-kb-are-asynchronous/)
      ! 3. For transfers from device to either pageable or pinned host memory,
      !    the function returns only once the copy has completed.
      ! 4. GPU synchronization is always enforced even if no communication takes place.
      !------------------------------------------------------------------------------!
#ifdef __CUDA
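
      !
      ! Minimal sketch of the staging pattern described in the notes above
      ! (hypothetical routine, for illustration only): when __GPU_MPI is not
      ! available, the device buffer is mirrored on the host, the host
      ! interface performs the reduction, and the result is copied back. The
      ! sourced allocation and the final call to cudaDeviceSynchronize provide
      ! the CUDA-blocking guarantee required by rule 1.
      !
      SUBROUTINE mp_example_sum_rv_gpu( vec_d, gid )
        IMPLICIT NONE
        REAL(DP), DEVICE :: vec_d(:)
        INTEGER, INTENT(IN) :: gid
        REAL(DP), ALLOCATABLE :: vec_h(:)
        INTEGER :: ierr
        ALLOCATE( vec_h, source=vec_d )   ! device -> host copy, syncs the device
        CALL mp_sum( vec_h, gid )         ! host-side collective
        vec_d = vec_h                     ! host -> device copy
        DEALLOCATE( vec_h )
        ierr = cudaDeviceSynchronize()    ! enforce blocking semantics
      END SUBROUTINE mp_example_sum_rv_gpu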

      !------------------------------------------------------------------------------!
      !..mp_bcast

      SUBROUTINE mp_bcast_i1_gpu(msg_d,source,gid)
        IMPLICIT NONE
        INTEGER, DEVICE :: msg_d
        INTEGER :: msg_h
        INTEGER :: source
        INTEGER, INTENT(IN) :: gid
        INTEGER :: group
        INTEGER :: msglen, ierr
        !
#if defined(__MPI)
        msglen = 1
        group = gid
#if defined(__GPU_MPI)
        ierr = cudaDeviceSynchronize()  ! This syncs __GPU_MPI case
        CALL bcast_integer_gpu( msg_d, msglen, source, group )
        RETURN ! Sync done by MPI call (or inside bcast_xxx_gpu)
#else
        msg_h = msg_d                   ! This syncs __MPI case
        CALL bcast_integer( msg_h, msglen, source, group )
        msg_d = msg_h
#endif
#endif
        ierr = cudaDeviceSynchronize()  ! This syncs SERIAL, __MPI
      END SUBROUTINE mp_bcast_i1_gpu
      !
      !------------------------------------------------------------------------------!
      !
      SUBROUTINE mp_bcast_iv_gpu(msg_d,source,gid)
        IMPLICIT NONE
        INTEGER, DEVICE :: msg_d(:)
        INTEGER, ALLOCATABLE :: msg_h(:)
        INTEGER, INTENT(IN) :: source
        INTEGER, INTENT(IN) :: gid
        INTEGER :: msglen, ierr
        !
#if defined(__MPI)
#if defined(__GPU_MPI)
        msglen = size(msg_d)
        ierr = cudaDeviceSynchronize()  ! This syncs __GPU_MPI case
        CALL bcast_integer_gpu( msg_d, msglen, source, gid )
        RETURN ! Sync done by MPI call (or inside bcast_xxx_gpu)
#else
        ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case
        msglen = size(msg_h)
        CALL bcast_integer( msg_h, msglen, source, gid )
        msg_d = msg_h; DEALLOCATE( msg_h )
#endif
#endif
        ierr = cudaDeviceSynchronize()  ! This syncs SERIAL, __MPI
      END SUBROUTINE mp_bcast_iv_gpu
      !
      !------------------------------------------------------------------------------!
      !
      SUBROUTINE mp_bcast_im_gpu( msg_d, source, gid )
        IMPLICIT NONE
        INTEGER, DEVICE :: msg_d(:,:)
        INTEGER, ALLOCATABLE :: msg_h(:,:)
        INTEGER, INTENT(IN) :: source
        INTEGER, INTENT(IN) :: gid
        INTEGER :: msglen, ierr
#if defined(__MPI)
#if defined(__GPU_MPI)
        msglen = size(msg_d)
        ierr = cudaDeviceSynchronize()  ! This syncs __GPU_MPI case
        CALL bcast_integer_gpu( msg_d, msglen, source, gid )
        RETURN ! Sync done by MPI call (or inside bcast_xxx_gpu)
#else
        ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case
        msglen = size(msg_h)
        CALL bcast_integer( msg_h, msglen, source, gid )
        msg_d = msg_h; DEALLOCATE( msg_h )
#endif
#endif
        ierr = cudaDeviceSynchronize()  ! This syncs SERIAL, __MPI
      END SUBROUTINE mp_bcast_im_gpu
      !
      !------------------------------------------------------------------------------!
      !
      SUBROUTINE mp_bcast_it_gpu( msg_d, source, gid )
        IMPLICIT NONE
        INTEGER, DEVICE :: msg_d(:,:,:)

        INTEGER, INTENT(IN) :: source
        INTEGER, INTENT(IN) :: gid
        INTEGER :: msglen, ierr
#if defined(__MPI)
#if defined(__GPU_MPI)
        msglen = size(msg_d)
        ierr = cudaDeviceSynchronize()  ! This syncs __GPU_MPI case
        CALL bcast_integer_gpu( msg_d, msglen, source, gid )
        RETURN ! Sync done by MPI call (or inside bcast_xxx_gpu)
#else
        INTEGER, ALLOCATABLE :: msg_h(:,:,:)
        ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case
        msglen = size(msg_h)
        CALL bcast_integer( msg_h, msglen, source, gid )
        msg_d = msg_h; DEALLOCATE( msg_h )
#endif
#endif
        ierr = cudaDeviceSynchronize()  ! This syncs SERIAL, __MPI
      END SUBROUTINE mp_bcast_it_gpu
      !
      !------------------------------------------------------------------------------!
      !
      SUBROUTINE mp_bcast_i4d_gpu(msg_d, source, gid)
        IMPLICIT NONE
        INTEGER, DEVICE :: msg_d(:,:,:,:)
        INTEGER, ALLOCATABLE :: msg_h(:,:,:,:)
        INTEGER, INTENT(IN) :: source
        INTEGER, INTENT(IN) :: gid
        INTEGER :: msglen, ierr
#if defined(__MPI)
#if defined(__GPU_MPI)
        msglen = size(msg_d)
        ierr = cudaDeviceSynchronize()  ! This syncs __GPU_MPI case
        CALL bcast_integer_gpu( msg_d, msglen, source, gid )
        RETURN ! Sync done by MPI call (or inside bcast_xxx_gpu)
#else
        ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case
        msglen = size(msg_h)
        CALL bcast_integer( msg_h, msglen, source, gid )
        msg_d = msg_h; DEALLOCATE( msg_h )
#endif
#endif
        ierr = cudaDeviceSynchronize()  ! This syncs SERIAL, __MPI
      END SUBROUTINE mp_bcast_i4d_gpu
      !
      !------------------------------------------------------------------------------!
      !
      SUBROUTINE mp_bcast_r1_gpu( msg_d, source, gid )
        IMPLICIT NONE
        REAL (DP), DEVICE :: msg_d
        REAL (DP) :: msg_h
        INTEGER, INTENT(IN) :: source
        INTEGER, INTENT(IN) :: gid
        INTEGER :: msglen, ierr
#if defined(__MPI)
        msglen = 1
#if defined(__GPU_MPI)
        ierr = cudaDeviceSynchronize()  ! This syncs __GPU_MPI case
        CALL bcast_real_gpu( msg_d, msglen, source, gid )
        RETURN ! Sync done by MPI call (or inside bcast_xxx_gpu)
#else
        msg_h = msg_d                   ! This syncs __MPI case
        CALL bcast_real( msg_h, msglen, source, gid )
        msg_d = msg_h
#endif
#endif
        ierr = cudaDeviceSynchronize()  ! This syncs SERIAL, __MPI
      END SUBROUTINE mp_bcast_r1_gpu
      !
      !------------------------------------------------------------------------------!
      !
      SUBROUTINE mp_bcast_rv_gpu(msg_d,source,gid)
        IMPLICIT NONE
        REAL (DP), DEVICE :: msg_d(:)
        REAL (DP), ALLOCATABLE :: msg_h(:)
        INTEGER, INTENT(IN) :: source
        INTEGER, INTENT(IN) :: gid
        INTEGER :: msglen, ierr
#if defined(__MPI)
#if defined(__GPU_MPI)
        msglen = size(msg_d)
        ierr = cudaDeviceSynchronize()  ! This syncs __GPU_MPI case
        CALL bcast_real_gpu( msg_d, msglen, source, gid )
        RETURN ! Sync done by MPI call (or inside bcast_xxx_gpu)
#else
        ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case
        msglen = size(msg_h)
        CALL bcast_real( msg_h, msglen, source, gid )
        msg_d = msg_h ; DEALLOCATE(msg_h)
#endif
#endif
        ierr = cudaDeviceSynchronize()  ! This syncs SERIAL, __MPI
      END SUBROUTINE mp_bcast_rv_gpu
      !
      !------------------------------------------------------------------------------!
      !
      SUBROUTINE mp_bcast_rm_gpu(msg_d,source,gid)
        IMPLICIT NONE
        REAL (DP), DEVICE :: msg_d(:,:)
        INTEGER, INTENT(IN) :: source
        INTEGER, INTENT(IN) :: gid
        INTEGER :: msglen, ierr
#if defined(__MPI)
#if defined(__GPU_MPI)
        msglen = size(msg_d)
        ierr = cudaDeviceSynchronize()  ! This syncs __GPU_MPI case
        CALL bcast_real_gpu( msg_d, msglen, source, gid )
        RETURN ! Sync done by MPI call (or inside bcast_xxx_gpu)
#else
        REAL (DP), ALLOCATABLE :: msg_h(:,:)
        ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case
        msglen = size(msg_h)
        CALL bcast_real( msg_h, msglen, source, gid )
        msg_d = msg_h ; DEALLOCATE(msg_h)
#endif
#endif
        ierr = cudaDeviceSynchronize()  ! This syncs SERIAL, __MPI
      END SUBROUTINE mp_bcast_rm_gpu
      !
      !------------------------------------------------------------------------------!
      !
      SUBROUTINE mp_bcast_rt_gpu(msg_d,source,gid)
        IMPLICIT NONE
        REAL (DP), DEVICE :: msg_d(:,:,:)
        REAL (DP), ALLOCATABLE :: msg_h(:,:,:)
        INTEGER, INTENT(IN) :: source
        INTEGER, INTENT(IN) :: gid
        INTEGER :: msglen, ierr
#if defined(__MPI)
#if defined(__GPU_MPI)
        msglen = size(msg_d)
        ierr = cudaDeviceSynchronize()  ! This syncs __GPU_MPI case
        CALL bcast_real_gpu( msg_d, msglen, source, gid )
        RETURN ! Sync done by MPI call (or inside bcast_xxx_gpu)
#else
        ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case
        msglen = size(msg_h)
        CALL bcast_real( msg_h, msglen, source, gid )
        msg_d = msg_h ; DEALLOCATE(msg_h)
#endif
#endif
        ierr = cudaDeviceSynchronize()  ! This syncs SERIAL, __MPI
      END SUBROUTINE mp_bcast_rt_gpu
      !
      !------------------------------------------------------------------------------!
      !
      SUBROUTINE mp_bcast_r4d_gpu(msg_d, source, gid)
        IMPLICIT NONE
        REAL (DP), DEVICE :: msg_d(:,:,:,:)
        REAL (DP), ALLOCATABLE :: msg_h(:,:,:,:)
        INTEGER, INTENT(IN) :: source
        INTEGER, INTENT(IN) :: gid
        INTEGER :: msglen, ierr
#if defined(__MPI)
#if defined(__GPU_MPI)
        msglen = size(msg_d)
        ierr = cudaDeviceSynchronize()  ! This syncs __GPU_MPI case
        CALL bcast_real_gpu( msg_d, msglen, source, gid )
        RETURN ! Sync done by MPI call (or inside bcast_xxx_gpu)
#else
        ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case
        msglen = size(msg_h)
        CALL bcast_real( msg_h, msglen, source, gid )
        msg_d = msg_h ; DEALLOCATE(msg_h)
#endif
#endif
        ierr = cudaDeviceSynchronize()  ! This syncs SERIAL, __MPI
      END SUBROUTINE mp_bcast_r4d_gpu
      !
      !------------------------------------------------------------------------------!
      !
      SUBROUTINE mp_bcast_r5d_gpu(msg_d, source, gid)
        IMPLICIT NONE
        REAL (DP), DEVICE :: msg_d(:,:,:,:,:)
        REAL (DP), ALLOCATABLE :: msg_h(:,:,:,:,:)
        INTEGER, INTENT(IN) :: source
        INTEGER, INTENT(IN) :: gid
        INTEGER :: msglen, ierr
#if defined(__MPI)
#if defined(__GPU_MPI)
        msglen = size(msg_d)
        ierr = cudaDeviceSynchronize()  ! This syncs __GPU_MPI case
        CALL bcast_real_gpu( msg_d, msglen, source, gid )
        RETURN ! Sync done by MPI call (or inside bcast_xxx_gpu)
#else
        ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case
        msglen = size(msg_h)
        CALL bcast_real( msg_h, msglen, source, gid )
        msg_d = msg_h ; DEALLOCATE(msg_h)
#endif
#endif
        ierr = cudaDeviceSynchronize()  ! This syncs SERIAL, __MPI
      END SUBROUTINE mp_bcast_r5d_gpu
      !
      !------------------------------------------------------------------------------!
      !
      SUBROUTINE mp_bcast_c1_gpu(msg_d,source,gid)
        IMPLICIT NONE
        COMPLEX (DP), DEVICE :: msg_d
        COMPLEX (DP) :: msg_h
        INTEGER, INTENT(IN) :: source
        INTEGER, INTENT(IN) :: gid
        INTEGER :: msglen, ierr
#if defined(__MPI)
        msglen = 1
#if defined(__GPU_MPI)
        ierr = cudaDeviceSynchronize()  ! This syncs __GPU_MPI case
        CALL bcast_real_gpu( msg_d, 2 * msglen, source, gid )
        RETURN ! Sync done by MPI call (or inside bcast_xxx_gpu)
#else
        msg_h = msg_d                   ! This syncs __MPI case
        CALL bcast_real( msg_h, 2 * msglen, source, gid )
        msg_d = msg_h
#endif
#endif
        ierr = cudaDeviceSynchronize()  ! This syncs SERIAL, __MPI
      END SUBROUTINE mp_bcast_c1_gpu
      !
      !------------------------------------------------------------------------------!
      !
      SUBROUTINE mp_bcast_cv_gpu(msg_d,source,gid)
        IMPLICIT NONE
        COMPLEX (DP), DEVICE :: msg_d(:)
        COMPLEX (DP), ALLOCATABLE :: msg_h(:)
        INTEGER, INTENT(IN) :: source
        INTEGER, INTENT(IN) :: gid
        INTEGER :: msglen, ierr
#if defined(__MPI)
#if defined(__GPU_MPI)
        msglen = size(msg_d)
        ierr = cudaDeviceSynchronize()  ! This syncs __GPU_MPI case
        CALL bcast_real_gpu( msg_d, 2 * msglen, source, gid )
        RETURN ! Sync done by MPI call (or inside bcast_xxx_gpu)
#else
        ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case
        msglen = size(msg_h)
        CALL bcast_real( msg_h, 2 * msglen, source, gid )
        msg_d = msg_h ; DEALLOCATE(msg_h)
#endif
#endif
        ierr = cudaDeviceSynchronize()  ! This syncs SERIAL, __MPI
      END SUBROUTINE mp_bcast_cv_gpu
      !
      !------------------------------------------------------------------------------!
      !
      SUBROUTINE mp_bcast_cm_gpu(msg_d,source,gid)
        IMPLICIT NONE
        COMPLEX (DP), DEVICE :: msg_d(:,:)
        COMPLEX (DP), ALLOCATABLE :: msg_h(:,:)
        INTEGER, INTENT(IN) :: source
        INTEGER, INTENT(IN) :: gid
        INTEGER :: msglen, ierr
#if defined(__MPI)
#if defined(__GPU_MPI)
        msglen = size(msg_d)
        ierr = cudaDeviceSynchronize()  ! This syncs __GPU_MPI case
        CALL bcast_real_gpu( msg_d, 2 * msglen, source, gid )
        RETURN ! Sync done by MPI call (or inside bcast_xxx_gpu)
#else
        ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case
        msglen = size(msg_h)
        CALL bcast_real( msg_h, 2 * msglen, source, gid )
        msg_d = msg_h ; DEALLOCATE(msg_h)
#endif
#endif
        ierr = cudaDeviceSynchronize()  ! This syncs SERIAL, __MPI
      END SUBROUTINE mp_bcast_cm_gpu
      !
      !------------------------------------------------------------------------------!
      !
      SUBROUTINE mp_bcast_ct_gpu(msg_d,source,gid)
        IMPLICIT NONE
        COMPLEX (DP), DEVICE :: msg_d(:,:,:)
        COMPLEX (DP), ALLOCATABLE :: msg_h(:,:,:)
        INTEGER, INTENT(IN) :: source
        INTEGER, INTENT(IN) :: gid
        INTEGER :: msglen, ierr
#if defined(__MPI)
#if defined(__GPU_MPI)
        msglen = size(msg_d)
        ierr = cudaDeviceSynchronize()  ! This syncs __GPU_MPI case
        CALL bcast_real_gpu( msg_d, 2 * msglen, source, gid )
        RETURN ! Sync done by MPI call (or inside bcast_xxx_gpu)
#else
        ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case
        msglen = size(msg_h)
        CALL bcast_real( msg_h, 2 * msglen, source, gid )
        msg_d = msg_h ; DEALLOCATE(msg_h)
#endif
#endif
        ierr = cudaDeviceSynchronize()  ! This syncs SERIAL, __MPI
      END SUBROUTINE mp_bcast_ct_gpu
      !
      !------------------------------------------------------------------------------!
      !
      SUBROUTINE mp_bcast_c4d_gpu(msg_d,source,gid)
        IMPLICIT NONE
        COMPLEX (DP), DEVICE :: msg_d(:,:,:,:)
        COMPLEX (DP), ALLOCATABLE :: msg_h(:,:,:,:)
        INTEGER, INTENT(IN) :: source
        INTEGER, INTENT(IN) :: gid
        INTEGER :: msglen, ierr
#if defined(__MPI)
#if defined(__GPU_MPI)
        msglen = size(msg_d)
        ierr = cudaDeviceSynchronize()  ! This syncs __GPU_MPI case
        CALL bcast_real_gpu( msg_d, 2 * msglen, source, gid )
        RETURN ! Sync done by MPI call (or inside bcast_xxx_gpu)
#else
        ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case
        msglen = size(msg_h)
        CALL bcast_real( msg_h, 2 * msglen, source, gid )
        msg_d = msg_h ; DEALLOCATE(msg_h)
#endif
#endif
        ierr = cudaDeviceSynchronize()  ! This syncs SERIAL, __MPI
      END SUBROUTINE mp_bcast_c4d_gpu
      !
      !------------------------------------------------------------------------------!
      !
      SUBROUTINE mp_bcast_c5d_gpu(msg_d,source,gid)
        IMPLICIT NONE
        COMPLEX (DP), DEVICE :: msg_d(:,:,:,:,:)
        COMPLEX (DP), ALLOCATABLE :: msg_h(:,:,:,:,:)
        INTEGER, INTENT(IN) :: source
        INTEGER, INTENT(IN) :: gid
        INTEGER :: msglen, ierr
#if defined(__MPI)
#if defined(__GPU_MPI)
        msglen = size(msg_d)
        ierr = cudaDeviceSynchronize()  ! This syncs __GPU_MPI case
        CALL bcast_real_gpu( msg_d, 2 * msglen, source, gid )
        RETURN ! Sync done by MPI call (or inside bcast_xxx_gpu)
#else
        ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case
        msglen = size(msg_h)
        CALL bcast_real( msg_h, 2 * msglen, source, gid )
        msg_d = msg_h ; DEALLOCATE(msg_h)
#endif
#endif
        ierr = cudaDeviceSynchronize()  ! This syncs SERIAL, __MPI
      END SUBROUTINE mp_bcast_c5d_gpu
      !
      !------------------------------------------------------------------------------!
      !
      SUBROUTINE mp_bcast_c6d_gpu(msg_d,source,gid)
        IMPLICIT NONE
        COMPLEX (DP), DEVICE :: msg_d(:,:,:,:,:,:)
        COMPLEX (DP), ALLOCATABLE :: msg_h(:,:,:,:,:,:)
        INTEGER, INTENT(IN) :: source
        INTEGER, INTENT(IN) :: gid
        INTEGER :: msglen, ierr
#if defined(__MPI)
#if defined(__GPU_MPI)
        msglen = size(msg_d)
        ierr = cudaDeviceSynchronize()  ! This syncs __GPU_MPI case
        CALL bcast_real_gpu( msg_d, 2 * msglen, source, gid )
        RETURN ! Sync done by MPI call (or inside bcast_xxx_gpu)
#else
        ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case
        msglen = size(msg_h)
        CALL bcast_real( msg_h, 2 * msglen, source, gid )
        msg_d = msg_h ; DEALLOCATE(msg_h)
#endif
#endif
        ierr = cudaDeviceSynchronize()  ! This syncs SERIAL, __MPI
      END SUBROUTINE mp_bcast_c6d_gpu
      !
      !------------------------------------------------------------------------------!
      !
      SUBROUTINE mp_bcast_l_gpu(msg_d,source,gid)
        IMPLICIT NONE
        LOGICAL, DEVICE :: msg_d
        LOGICAL :: msg_h
        INTEGER, INTENT(IN) :: source
        INTEGER, INTENT(IN) :: gid
        INTEGER :: msglen, ierr
#if defined(__MPI)
        msglen = 1
#if defined(__GPU_MPI)
        ierr = cudaDeviceSynchronize()  ! This syncs __GPU_MPI case
        CALL bcast_logical_gpu( msg_d, msglen, source, gid )
        RETURN ! Sync done by MPI call (or inside bcast_xxx_gpu)
#else
        msg_h = msg_d                   ! This syncs __MPI case
        CALL bcast_logical( msg_h, msglen, source, gid )
        msg_d = msg_h
#endif
#endif
        ierr = cudaDeviceSynchronize()  ! This syncs SERIAL, __MPI
      END SUBROUTINE mp_bcast_l_gpu
      !
      !------------------------------------------------------------------------------!
      !
      SUBROUTINE mp_bcast_lv_gpu(msg_d,source,gid)
        IMPLICIT NONE
        LOGICAL, DEVICE :: msg_d(:)
        INTEGER, INTENT(IN) :: source
        INTEGER, INTENT(IN) :: gid
        INTEGER :: msglen, ierr
#if defined(__MPI)
#if defined(__GPU_MPI)
        msglen = size(msg_d)
        ierr = cudaDeviceSynchronize()  ! This syncs __GPU_MPI case
        CALL bcast_logical_gpu( msg_d, msglen, source, gid )
        RETURN ! Sync done by MPI call (or inside bcast_xxx_gpu)
#else
        LOGICAL, ALLOCATABLE :: msg_h(:)
        ALLOCATE(msg_h, source=msg_d)   ! This syncs __MPI case
        msglen = size(msg_h)
        CALL bcast_logical( msg_h, msglen, source, gid )
        msg_d = msg_h; DEALLOCATE(msg_h)
#endif
#endif
        ierr = cudaDeviceSynchronize()  ! This syncs SERIAL, __MPI
      END SUBROUTINE mp_bcast_lv_gpu
      !
      !------------------------------------------------------------------------------!
      !
      SUBROUTINE mp_bcast_lm_gpu(msg_d,source,gid)
        IMPLICIT NONE
        LOGICAL, DEVICE :: msg_d(:,:)
        INTEGER, INTENT(IN) :: source
        INTEGER, INTENT(IN) :: gid
        INTEGER :: msglen, ierr
#if defined(__MPI)
#if defined(__GPU_MPI)
        msglen = size(msg_d)
        ierr = cudaDeviceSynchronize()  ! This syncs __GPU_MPI case
        CALL bcast_logical_gpu( msg_d, msglen, source, gid )
        RETURN ! Sync done by MPI call (or inside bcast_xxx_gpu)
#else
        LOGICAL, ALLOCATABLE :: msg_h(:,:)
        ALLOCATE(msg_h, source=msg_d)   ! This syncs __MPI case
        msglen = size(msg_h)
        CALL bcast_logical( msg_h, msglen, source, gid )
        msg_d = msg_h; DEALLOCATE(msg_h)
#endif
#endif
        ierr = cudaDeviceSynchronize()  ! This syncs SERIAL, __MPI
      END SUBROUTINE mp_bcast_lm_gpu
|
|
!
|
|
!------------------------------------------------------------------------------!
|
|
!
|
|
SUBROUTINE mp_get_i1_gpu(msg_dest_d, msg_sour_d, mpime, dest, sour, ip, gid)
|
|
USE parallel_include
|
|
IMPLICIT NONE
|
|
INTEGER, DEVICE :: msg_dest_d
|
|
INTEGER, INTENT(IN), DEVICE :: msg_sour_d
|
|
INTEGER, INTENT(IN) :: dest, sour, ip, mpime
|
|
INTEGER, INTENT(IN) :: gid
|
|
INTEGER :: group, ierr
|
|
#if defined(__MPI)
|
|
INTEGER :: istatus(MPI_STATUS_SIZE)
|
|
#endif
|
|
INTEGER :: nrcv
|
|
INTEGER :: msglen = 1
|
|
|
|
#if ! defined(__GPU_MPI)
|
|
! Call CPU implementation
|
|
INTEGER :: msg_dest_h, msg_sour_h
|
|
!
|
|
msg_dest_h = msg_dest_d; msg_sour_h = msg_sour_d ! This syncs __MPI case
|
|
CALL mp_get_i1(msg_dest_h, msg_sour_h, mpime, dest, sour, ip, gid)
|
|
msg_dest_d = msg_dest_h
|
|
#else
|
|
|
|
#if defined(__MPI)
|
|
group = gid
|
|
#endif
|
|
|
|
! processors not taking part in the communication have 0 length message
|
|
|
|
msglen = 0
|
|
!
|
|
ierr = cudaDeviceSynchronize() ! This syncs SERIAL and __GPU_MPI
|
|
!
|
|
IF(dest .NE. sour) THEN
|
|
#if defined(__MPI)
|
|
IF(mpime .EQ. sour) THEN
|
|
msglen=1
|
|
CALL MPI_SEND( msg_sour_d, msglen, MPI_INTEGER, dest, ip, group, ierr)
|
|
IF (ierr/=0) CALL mp_stop( -8001 )
|
|
ELSE IF(mpime .EQ. dest) THEN
|
|
msglen=1
|
|
CALL MPI_RECV( msg_dest_d, msglen, MPI_INTEGER, sour, ip, group, istatus, IERR )
|
|
IF (ierr/=0) CALL mp_stop( -8002 )
|
|
CALL MPI_GET_COUNT(istatus, MPI_INTEGER, nrcv, ierr)
|
|
IF (ierr/=0) CALL mp_stop( -8003 )
|
|
END IF
|
|
#endif
|
|
ELSEIF(mpime .EQ. sour)THEN
|
|
msg_dest_d = msg_sour_d
|
|
msglen = 1
|
|
END IF
|
|
|
|
#if defined(__MPI)
|
|
CALL MPI_BARRIER(group, IERR)
|
|
IF (ierr/=0) CALL mp_stop( -8004 )
|
|
#endif
|
|
|
|
#endif
|
|
ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI and __GPU_MPI
|
|
RETURN
|
|
END SUBROUTINE mp_get_i1_gpu
|
|
!
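!
! NOTE: in the mp_get_*_gpu family the message travels from rank "sour"
! to rank "dest" of communicator "gid", with "ip" used as the MPI tag;
! when sour == dest the transfer reduces to a device-side copy, and all
! ranks of the communicator meet the final barrier. Illustrative call
! (caller-side names are hypothetical):
!
!    INTEGER, DEVICE :: na_d, nb_d
!    CALL mp_get( na_d, nb_d, mpime, 0, 1, 100, comm )  ! rank 1 -> rank 0, tag 100
!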
!------------------------------------------------------------------------------!
!
SUBROUTINE mp_get_iv_gpu(msg_dest_d, msg_sour_d, mpime, dest, sour, ip, gid)
USE parallel_include
IMPLICIT NONE
INTEGER, DEVICE :: msg_dest_d(:)
INTEGER, INTENT(IN), DEVICE :: msg_sour_d(:)
INTEGER, INTENT(IN) :: dest, sour, ip, mpime
INTEGER, INTENT(IN) :: gid
INTEGER :: group
#if defined(__MPI)
INTEGER :: istatus(MPI_STATUS_SIZE)
#endif
INTEGER :: ierr, nrcv
INTEGER :: msglen
!
#if ! defined(__GPU_MPI)
INTEGER, ALLOCATABLE :: msg_dest_h(:), msg_sour_h(:)
!
ALLOCATE( msg_dest_h, source=msg_dest_d ); ALLOCATE( msg_sour_h, source=msg_sour_d ); ! This syncs __MPI case
CALL mp_get_iv(msg_dest_h, msg_sour_h, mpime, dest, sour, ip, gid)
msg_dest_d = msg_dest_h
DEALLOCATE(msg_dest_h, msg_sour_h)
#else
#if defined(__MPI)
group = gid
#endif
! processors not taking part in the communication have 0 length message
msglen = 0
!
ierr = cudaDeviceSynchronize() ! This syncs SERIAL and __GPU_MPI
!
IF(sour .NE. dest) THEN
#if defined(__MPI)
IF(mpime .EQ. sour) THEN
msglen = SIZE(msg_sour_d)
CALL MPI_SEND( msg_sour_d, SIZE(msg_sour_d), MPI_INTEGER, dest, ip, group, ierr)
IF (ierr/=0) CALL mp_stop( 9001 )
ELSE IF(mpime .EQ. dest) THEN
CALL MPI_RECV( msg_dest_d, SIZE(msg_dest_d), MPI_INTEGER, sour, ip, group, istatus, IERR )
IF (ierr/=0) CALL mp_stop( 9002 )
CALL MPI_GET_COUNT(istatus, MPI_INTEGER, nrcv, ierr)
IF (ierr/=0) CALL mp_stop( 9003 )
msglen = nrcv
END IF
#endif
ELSEIF(mpime .EQ. sour)THEN
!msg_dest_d(1:SIZE(msg_sour_d)) = msg_sour_d(:)
ierr = cudaMemcpy(msg_dest_d(1) , msg_sour_d(1), SIZE(msg_sour_d), cudaMemcpyDeviceToDevice )
msglen = SIZE(msg_sour_d)
END IF
#if defined(__MPI)
CALL MPI_BARRIER(group, IERR)
IF (ierr/=0) CALL mp_stop( 9004 )
#endif
#endif
ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI and __GPU_MPI
RETURN
END SUBROUTINE mp_get_iv_gpu
!
!------------------------------------------------------------------------------!
!
SUBROUTINE mp_get_r1_gpu(msg_dest_d, msg_sour_d, mpime, dest, sour, ip, gid)
USE parallel_include
IMPLICIT NONE
REAL (DP), DEVICE :: msg_dest_d
REAL (DP), INTENT(IN), DEVICE :: msg_sour_d
INTEGER, INTENT(IN) :: dest, sour, ip, mpime
INTEGER, INTENT(IN) :: gid
INTEGER :: group
#if defined(__MPI)
INTEGER :: istatus(MPI_STATUS_SIZE)
#endif
INTEGER :: ierr, nrcv
INTEGER :: msglen
#if ! defined(__GPU_MPI)
REAL(DP) :: msg_dest_h, msg_sour_h
!
msg_dest_h=msg_dest_d; msg_sour_h=msg_sour_d ! This syncs __MPI case
CALL mp_get_r1(msg_dest_h, msg_sour_h, mpime, dest, sour, ip, gid)
msg_dest_d = msg_dest_h
#else
#if defined(__MPI)
group = gid
#endif
! processors not taking part in the communication have 0 length message
msglen = 0
!
ierr = cudaDeviceSynchronize() ! This syncs SERIAL and __GPU_MPI
!
IF(sour .NE. dest) THEN
#if defined(__MPI)
IF(mpime .EQ. sour) THEN
msglen = 1
CALL MPI_SEND( msg_sour_d, msglen, MPI_DOUBLE_PRECISION, dest, ip, group, ierr)
IF (ierr/=0) CALL mp_stop( 9005 )
ELSE IF(mpime .EQ. dest) THEN
msglen = 1
CALL MPI_RECV( msg_dest_d, msglen, MPI_DOUBLE_PRECISION, sour, ip, group, istatus, IERR )
IF (ierr/=0) CALL mp_stop( 9006 )
CALL MPI_GET_COUNT(istatus, MPI_DOUBLE_PRECISION, nrcv, ierr)
IF (ierr/=0) CALL mp_stop( 9007 )
msglen = nrcv
END IF
#endif
ELSEIF(mpime .EQ. sour)THEN
msg_dest_d = msg_sour_d
msglen = 1
END IF
#if defined(__MPI)
CALL MPI_BARRIER(group, IERR)
IF (ierr/=0) CALL mp_stop( 9008 )
#endif
#endif
ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI and __GPU_MPI
RETURN
END SUBROUTINE mp_get_r1_gpu
!
!------------------------------------------------------------------------------!
!
SUBROUTINE mp_get_rv_gpu(msg_dest_d, msg_sour_d, mpime, dest, sour, ip, gid)
USE parallel_include
IMPLICIT NONE
REAL (DP), DEVICE :: msg_dest_d(:)
REAL (DP), INTENT(IN), DEVICE :: msg_sour_d(:)
INTEGER, INTENT(IN) :: dest, sour, ip, mpime
INTEGER, INTENT(IN) :: gid
INTEGER :: group
#if defined(__MPI)
INTEGER :: istatus(MPI_STATUS_SIZE)
#endif
INTEGER :: ierr, nrcv
INTEGER :: msglen
!
#if ! defined(__GPU_MPI)
REAL (DP), ALLOCATABLE :: msg_dest_h(:), msg_sour_h(:)
!
ALLOCATE( msg_dest_h, source=msg_dest_d ); ALLOCATE( msg_sour_h, source=msg_sour_d ); ! This syncs __MPI case
CALL mp_get_rv(msg_dest_h, msg_sour_h, mpime, dest, sour, ip, gid)
msg_dest_d = msg_dest_h
DEALLOCATE(msg_dest_h, msg_sour_h)
#else
!
#if defined(__MPI)
group = gid
#endif
! processors not taking part in the communication have 0 length message
msglen = 0
!
ierr = cudaDeviceSynchronize() ! This syncs SERIAL and __GPU_MPI
!
IF(sour .NE. dest) THEN
#if defined(__MPI)
IF(mpime .EQ. sour) THEN
msglen = SIZE(msg_sour_d)
CALL MPI_SEND( msg_sour_d, SIZE(msg_sour_d), MPI_DOUBLE_PRECISION, dest, ip, group, ierr)
IF (ierr/=0) CALL mp_stop( 9009 )
ELSE IF(mpime .EQ. dest) THEN
CALL MPI_RECV( msg_dest_d, SIZE(msg_dest_d), MPI_DOUBLE_PRECISION, sour, ip, group, istatus, IERR )
IF (ierr/=0) CALL mp_stop( 9010 )
CALL MPI_GET_COUNT(istatus, MPI_DOUBLE_PRECISION, nrcv, ierr)
IF (ierr/=0) CALL mp_stop( 9011 )
msglen = nrcv
END IF
#endif
ELSEIF(mpime .EQ. sour)THEN
!msg_dest_d(1:SIZE(msg_sour_d)) = msg_sour_d(:)
ierr = cudaMemcpy(msg_dest_d(1) , msg_sour_d(1), SIZE(msg_sour_d), cudaMemcpyDeviceToDevice )
msglen = SIZE(msg_sour_d)
END IF
#if defined(__MPI)
CALL MPI_BARRIER(group, IERR)
IF (ierr/=0) CALL mp_stop( 9012 )
#endif
#endif
ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI and __GPU_MPI
RETURN
END SUBROUTINE mp_get_rv_gpu
!
!------------------------------------------------------------------------------!
!
SUBROUTINE mp_get_rm_gpu(msg_dest_d, msg_sour_d, mpime, dest, sour, ip, gid)
USE parallel_include
IMPLICIT NONE
REAL (DP), DEVICE :: msg_dest_d(:,:)
REAL (DP), INTENT(IN), DEVICE :: msg_sour_d(:,:)
INTEGER, INTENT(IN) :: dest, sour, ip, mpime
INTEGER, INTENT(IN) :: gid
INTEGER :: group
#if defined(__MPI)
INTEGER :: istatus(MPI_STATUS_SIZE)
#endif
INTEGER :: ierr, nrcv
INTEGER :: msglen
!
#if ! defined(__GPU_MPI)
REAL (DP), ALLOCATABLE :: msg_dest_h(:,:), msg_sour_h(:,:)
!
ALLOCATE( msg_dest_h, source=msg_dest_d ); ALLOCATE( msg_sour_h, source=msg_sour_d ); ! This syncs __MPI case
CALL mp_get_rm(msg_dest_h, msg_sour_h, mpime, dest, sour, ip, gid)
msg_dest_d = msg_dest_h
DEALLOCATE(msg_dest_h, msg_sour_h)
#else
#if defined(__MPI)
group = gid
#endif
! processors not taking part in the communication have 0 length message
msglen = 0
!
ierr = cudaDeviceSynchronize() ! This syncs SERIAL and __GPU_MPI
!
IF(sour .NE. dest) THEN
#if defined(__MPI)
IF(mpime .EQ. sour) THEN
CALL MPI_SEND( msg_sour_d, SIZE(msg_sour_d), MPI_DOUBLE_PRECISION, dest, ip, group, ierr)
IF (ierr/=0) CALL mp_stop( 9013 )
msglen = SIZE(msg_sour_d)
ELSE IF(mpime .EQ. dest) THEN
CALL MPI_RECV( msg_dest_d, SIZE(msg_dest_d), MPI_DOUBLE_PRECISION, sour, ip, group, istatus, IERR )
IF (ierr/=0) CALL mp_stop( 9014 )
CALL MPI_GET_COUNT(istatus, MPI_DOUBLE_PRECISION, nrcv, ierr)
IF (ierr/=0) CALL mp_stop( 9015 )
msglen = nrcv
END IF
#endif
ELSEIF(mpime .EQ. sour)THEN
!msg_dest_d(1:SIZE(msg_sour_d,1), 1:SIZE(msg_sour_d,2)) = msg_sour_d(:,:)
! function cudaMemcpy2D(dst, dpitch, src, spitch, width, height, kdir)
ierr = cudaMemcpy2D(msg_dest_d, SIZE(msg_dest_d,1),&
msg_sour_d, SIZE(msg_sour_d,1),&
SIZE(msg_sour_d,1), SIZE(msg_sour_d,2), &
cudaMemcpyDeviceToDevice )
msglen = SIZE( msg_sour_d )
END IF
#if defined(__MPI)
CALL MPI_BARRIER(group, IERR)
IF (ierr/=0) CALL mp_stop( 9016 )
#endif
#endif
ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI and __GPU_MPI
RETURN
END SUBROUTINE mp_get_rm_gpu
!
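!
! NOTE: the local (sour == dest) branch of the matrix routines uses
! cudaMemcpy2D so that a sub-matrix can be copied even when source and
! destination differ in leading dimension; in CUDA Fortran the pitch,
! width and height arguments count array elements, not bytes.
! Illustrative sketch with hypothetical arrays:
!
!    REAL(DP), DEVICE :: a_d(100,50), b_d(80,50)
!    ierr = cudaMemcpy2D( a_d, 100, b_d, 80, 80, 50, &
!                         cudaMemcpyDeviceToDevice )  ! b_d -> a_d(1:80,:)
!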
!------------------------------------------------------------------------------!
!
SUBROUTINE mp_get_cv_gpu(msg_dest_d, msg_sour_d, mpime, dest, sour, ip, gid)
USE parallel_include
IMPLICIT NONE
COMPLEX (DP), DEVICE :: msg_dest_d(:)
COMPLEX (DP), INTENT(IN), DEVICE :: msg_sour_d(:)
INTEGER, INTENT(IN) :: dest, sour, ip, mpime
INTEGER, INTENT(IN) :: gid
INTEGER :: group
#if defined(__MPI)
INTEGER :: istatus(MPI_STATUS_SIZE)
#endif
INTEGER :: ierr, nrcv
INTEGER :: msglen
!
#if ! defined(__GPU_MPI)
COMPLEX (DP), ALLOCATABLE :: msg_dest_h(:), msg_sour_h(:)
!
ALLOCATE( msg_dest_h, source=msg_dest_d ); ALLOCATE( msg_sour_h, source=msg_sour_d ); ! This syncs __MPI case
CALL mp_get_cv(msg_dest_h, msg_sour_h, mpime, dest, sour, ip, gid)
msg_dest_d = msg_dest_h
DEALLOCATE(msg_dest_h, msg_sour_h)
#else
!
#if defined(__MPI)
group = gid
#endif
! processors not taking part in the communication have 0 length message
msglen = 0
!
ierr = cudaDeviceSynchronize() ! This syncs SERIAL and __GPU_MPI
!
IF( dest .NE. sour ) THEN
#if defined(__MPI)
IF(mpime .EQ. sour) THEN
CALL MPI_SEND( msg_sour_d, SIZE(msg_sour_d), MPI_DOUBLE_COMPLEX, dest, ip, group, ierr)
IF (ierr/=0) CALL mp_stop( 9017 )
msglen = SIZE(msg_sour_d)
ELSE IF(mpime .EQ. dest) THEN
CALL MPI_RECV( msg_dest_d, SIZE(msg_dest_d), MPI_DOUBLE_COMPLEX, sour, ip, group, istatus, IERR )
IF (ierr/=0) CALL mp_stop( 9018 )
CALL MPI_GET_COUNT(istatus, MPI_DOUBLE_COMPLEX, nrcv, ierr)
IF (ierr/=0) CALL mp_stop( 9019 )
msglen = nrcv
END IF
#endif
ELSEIF(mpime .EQ. sour)THEN
!msg_dest_d(1:SIZE(msg_sour_d)) = msg_sour_d(:)
ierr = cudaMemcpy(msg_dest_d(1) , msg_sour_d(1), SIZE(msg_sour_d), cudaMemcpyDeviceToDevice )
msglen = SIZE(msg_sour_d)
END IF
#if defined(__MPI)
CALL MPI_BARRIER(group, IERR)
IF (ierr/=0) CALL mp_stop( 9020 )
#endif
#endif
ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI and __GPU_MPI
RETURN
END SUBROUTINE mp_get_cv_gpu
!
!------------------------------------------------------------------------------!
!
SUBROUTINE mp_get_cm_gpu(msg_dest_d, msg_sour_d, mpime, dest, sour, ip, gid)
USE parallel_include
IMPLICIT NONE
COMPLEX (DP), INTENT(IN), DEVICE :: msg_sour_d(:,:)
COMPLEX (DP), DEVICE :: msg_dest_d(:,:)
INTEGER, INTENT(IN) :: dest, sour, ip, mpime
INTEGER, INTENT(IN) :: gid
INTEGER :: group
#if defined(__MPI)
INTEGER :: istatus(MPI_STATUS_SIZE)
#endif
INTEGER :: ierr, nrcv
INTEGER :: msglen
!
#if ! defined(__GPU_MPI)
COMPLEX (DP), ALLOCATABLE :: msg_dest_h(:,:), msg_sour_h(:,:)
!
ALLOCATE( msg_dest_h, source=msg_dest_d ); ALLOCATE( msg_sour_h, source=msg_sour_d ); ! This syncs __MPI case
CALL mp_get_cm(msg_dest_h, msg_sour_h, mpime, dest, sour, ip, gid)
msg_dest_d = msg_dest_h
DEALLOCATE(msg_dest_h, msg_sour_h)
#else
!
#if defined(__MPI)
group = gid
#endif
! processors not taking part in the communication have 0 length message
msglen = 0
!
ierr = cudaDeviceSynchronize() ! This syncs SERIAL and __GPU_MPI
!
IF(sour .NE. dest) THEN
#if defined(__MPI)
IF(mpime .EQ. sour) THEN
CALL MPI_SEND( msg_sour_d, SIZE(msg_sour_d), MPI_DOUBLE_COMPLEX, dest, ip, group, ierr)
IF (ierr/=0) CALL mp_stop( 9021 )
msglen = SIZE(msg_sour_d)
ELSE IF(mpime .EQ. dest) THEN
CALL MPI_RECV( msg_dest_d, SIZE(msg_dest_d), MPI_DOUBLE_COMPLEX, sour, ip, group, istatus, IERR )
IF (ierr/=0) CALL mp_stop( 9022 )
CALL MPI_GET_COUNT(istatus, MPI_DOUBLE_COMPLEX, nrcv, ierr)
IF (ierr/=0) CALL mp_stop( 9023 )
msglen = nrcv
END IF
#endif
ELSEIF(mpime .EQ. sour)THEN
!msg_dest_d(1:SIZE(msg_sour_d,1), 1:SIZE(msg_sour_d,2)) = msg_sour_d(:,:)
ierr = cudaMemcpy2D(msg_dest_d, SIZE(msg_dest_d,1),&
msg_sour_d, SIZE(msg_sour_d,1),&
SIZE(msg_sour_d,1), SIZE(msg_sour_d,2), &
cudaMemcpyDeviceToDevice )
msglen = SIZE( msg_sour_d )
END IF
#if defined(__MPI)
CALL MPI_BARRIER(group, IERR)
IF (ierr/=0) CALL mp_stop( 9024 )
#endif
#endif
ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI and __GPU_MPI
RETURN
END SUBROUTINE mp_get_cm_gpu
!
!------------------------------------------------------------------------------!
!
SUBROUTINE mp_put_i1_gpu(msg_dest_d, msg_sour_d, mpime, sour, dest, ip, gid)
USE parallel_include
IMPLICIT NONE
INTEGER, DEVICE :: msg_dest_d
INTEGER, INTENT(IN), DEVICE :: msg_sour_d
INTEGER, INTENT(IN) :: dest, sour, ip, mpime
INTEGER, INTENT(IN) :: gid
INTEGER :: group
#if defined(__MPI)
INTEGER :: istatus(MPI_STATUS_SIZE)
#endif
INTEGER :: ierr, nrcv
INTEGER :: msglen
!
#if ! defined(__GPU_MPI)
INTEGER :: msg_dest_h, msg_sour_h
!
msg_dest_h=msg_dest_d ; msg_sour_h=msg_sour_d ! This syncs __MPI case
CALL mp_put_i1(msg_dest_h, msg_sour_h, mpime, sour, dest, ip, gid)
msg_dest_d = msg_dest_h
#else
#if defined(__MPI)
group = gid
#endif
! processors not taking part in the communication have 0 length message
msglen = 0
!
ierr = cudaDeviceSynchronize() ! This syncs SERIAL and __GPU_MPI
!
IF(dest .NE. sour) THEN
#if defined(__MPI)
IF(mpime .EQ. sour) THEN
CALL MPI_SEND( msg_sour_d, 1, MPI_INTEGER, dest, ip, group, ierr)
IF (ierr/=0) CALL mp_stop( 9025 )
msglen = 1
ELSE IF(mpime .EQ. dest) THEN
CALL MPI_RECV( msg_dest_d, 1, MPI_INTEGER, sour, ip, group, istatus, IERR )
IF (ierr/=0) CALL mp_stop( 9026 )
CALL MPI_GET_COUNT(istatus, MPI_INTEGER, nrcv, ierr)
IF (ierr/=0) CALL mp_stop( 9027 )
msglen = 1
END IF
#endif
ELSEIF(mpime .EQ. sour)THEN
msg_dest_d = msg_sour_d
msglen = 1
END IF
#if defined(__MPI)
CALL MPI_BARRIER(group, IERR)
IF (ierr/=0) CALL mp_stop( 9028 )
#endif
#endif
ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI and __GPU_MPI
RETURN
END SUBROUTINE mp_put_i1_gpu
!
!------------------------------------------------------------------------------!
!
SUBROUTINE mp_put_iv_gpu(msg_dest_d, msg_sour_d, mpime, sour, dest, ip, gid)
USE parallel_include
IMPLICIT NONE
INTEGER, DEVICE :: msg_dest_d(:)
INTEGER, INTENT(IN), DEVICE :: msg_sour_d(:)
INTEGER, INTENT(IN) :: dest, sour, ip, mpime
INTEGER, INTENT(IN) :: gid
INTEGER :: group
#if defined(__MPI)
INTEGER :: istatus(MPI_STATUS_SIZE)
#endif
INTEGER :: ierr, nrcv
INTEGER :: msglen
!
#if ! defined(__GPU_MPI)
INTEGER, ALLOCATABLE :: msg_dest_h(:), msg_sour_h(:)
!
ALLOCATE( msg_dest_h, source=msg_dest_d ); ALLOCATE( msg_sour_h, source=msg_sour_d ); ! This syncs __MPI case
CALL mp_put_iv(msg_dest_h, msg_sour_h, mpime, sour, dest, ip, gid)
msg_dest_d = msg_dest_h
DEALLOCATE(msg_dest_h, msg_sour_h)
#else
!
#if defined(__MPI)
group = gid
#endif
! processors not taking part in the communication have 0 length message
msglen = 0
!
ierr = cudaDeviceSynchronize() ! This syncs SERIAL and __GPU_MPI
!
IF(sour .NE. dest) THEN
#if defined(__MPI)
IF(mpime .EQ. sour) THEN
CALL MPI_SEND( msg_sour_d, SIZE(msg_sour_d), MPI_INTEGER, dest, ip, group, ierr)
IF (ierr/=0) CALL mp_stop( 9029 )
msglen = SIZE(msg_sour_d)
ELSE IF(mpime .EQ. dest) THEN
CALL MPI_RECV( msg_dest_d, SIZE(msg_dest_d), MPI_INTEGER, sour, ip, group, istatus, IERR )
IF (ierr/=0) CALL mp_stop( 9030 )
CALL MPI_GET_COUNT(istatus, MPI_INTEGER, nrcv, ierr)
IF (ierr/=0) CALL mp_stop( 9031 )
msglen = nrcv
END IF
#endif
ELSEIF(mpime .EQ. sour)THEN
!msg_dest_d(1:SIZE(msg_sour_d)) = msg_sour_d(:)
ierr = cudaMemcpy(msg_dest_d(1) , msg_sour_d(1), SIZE(msg_sour_d), cudaMemcpyDeviceToDevice )
msglen = SIZE(msg_sour_d)
END IF
#if defined(__MPI)
CALL MPI_BARRIER(group, IERR)
IF (ierr/=0) CALL mp_stop( 9032 )
#endif
#endif
ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI and __GPU_MPI
RETURN
END SUBROUTINE mp_put_iv_gpu
!
!------------------------------------------------------------------------------!
!
SUBROUTINE mp_put_rv_gpu(msg_dest_d, msg_sour_d, mpime, sour, dest, ip, gid)
USE parallel_include
IMPLICIT NONE
REAL (DP), DEVICE :: msg_dest_d(:)
REAL (DP), INTENT(IN), DEVICE :: msg_sour_d(:)
INTEGER, INTENT(IN) :: dest, sour, ip, mpime
INTEGER, INTENT(IN) :: gid
INTEGER :: group
#if defined(__MPI)
INTEGER :: istatus(MPI_STATUS_SIZE)
#endif
INTEGER :: ierr, nrcv
INTEGER :: msglen
!
#if ! defined(__GPU_MPI)
REAL (DP), ALLOCATABLE :: msg_dest_h(:), msg_sour_h(:)
!
ALLOCATE( msg_dest_h, source=msg_dest_d ); ALLOCATE( msg_sour_h, source=msg_sour_d ); ! This syncs __MPI case
CALL mp_put_rv(msg_dest_h, msg_sour_h, mpime, sour, dest, ip, gid)
msg_dest_d = msg_dest_h
DEALLOCATE(msg_dest_h, msg_sour_h)
#else
!
#if defined(__MPI)
group = gid
#endif
! processors not taking part in the communication have 0 length message
msglen = 0
!
ierr = cudaDeviceSynchronize() ! This syncs SERIAL and __GPU_MPI
!
IF(sour .NE. dest) THEN
#if defined(__MPI)
IF(mpime .EQ. sour) THEN
CALL MPI_SEND( msg_sour_d, SIZE(msg_sour_d), MPI_DOUBLE_PRECISION, dest, ip, group, ierr)
IF (ierr/=0) CALL mp_stop( 9033 )
msglen = SIZE(msg_sour_d)
ELSE IF(mpime .EQ. dest) THEN
CALL MPI_RECV( msg_dest_d, SIZE(msg_dest_d), MPI_DOUBLE_PRECISION, sour, ip, group, istatus, IERR )
IF (ierr/=0) CALL mp_stop( 9034 )
CALL MPI_GET_COUNT(istatus, MPI_DOUBLE_PRECISION, nrcv, ierr)
IF (ierr/=0) CALL mp_stop( 9035 )
msglen = nrcv
END IF
#endif
ELSEIF(mpime .EQ. sour)THEN
!msg_dest_d(1:SIZE(msg_sour_d)) = msg_sour_d(:)
ierr = cudaMemcpy(msg_dest_d(1) , msg_sour_d(1), SIZE(msg_sour_d), cudaMemcpyDeviceToDevice )
msglen = SIZE(msg_sour_d)
END IF
#if defined(__MPI)
CALL MPI_BARRIER(group, IERR)
IF (ierr/=0) CALL mp_stop( 9036 )
#endif
#endif
ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI and __GPU_MPI
RETURN
END SUBROUTINE mp_put_rv_gpu
!
!------------------------------------------------------------------------------!
!
SUBROUTINE mp_put_rm_gpu(msg_dest_d, msg_sour_d, mpime, sour, dest, ip, gid)
USE parallel_include
IMPLICIT NONE
REAL (DP), DEVICE :: msg_dest_d(:,:)
REAL (DP), INTENT(IN), DEVICE :: msg_sour_d(:,:)
INTEGER, INTENT(IN) :: dest, sour, ip, mpime
INTEGER, INTENT(IN) :: gid
INTEGER :: group
#if defined(__MPI)
INTEGER :: istatus(MPI_STATUS_SIZE)
#endif
INTEGER :: ierr, nrcv
INTEGER :: msglen
!
#if ! defined(__GPU_MPI)
REAL (DP), ALLOCATABLE :: msg_dest_h(:,:), msg_sour_h(:,:)
!
ALLOCATE( msg_dest_h, source=msg_dest_d ); ALLOCATE( msg_sour_h, source=msg_sour_d ); ! This syncs __MPI case
CALL mp_put_rm(msg_dest_h, msg_sour_h, mpime, sour, dest, ip, gid)
msg_dest_d = msg_dest_h
DEALLOCATE(msg_dest_h, msg_sour_h)
#else
!
#if defined(__MPI)
group = gid
#endif
! processors not taking part in the communication have 0 length message
msglen = 0
!
ierr = cudaDeviceSynchronize() ! This syncs SERIAL and __GPU_MPI
!
IF(sour .NE. dest) THEN
#if defined(__MPI)
IF(mpime .EQ. sour) THEN
CALL MPI_SEND( msg_sour_d, SIZE(msg_sour_d), MPI_DOUBLE_PRECISION, dest, ip, group, ierr)
IF (ierr/=0) CALL mp_stop( 9037 )
msglen = SIZE(msg_sour_d)
ELSE IF(mpime .EQ. dest) THEN
CALL MPI_RECV( msg_dest_d, SIZE(msg_dest_d), MPI_DOUBLE_PRECISION, sour, ip, group, istatus, IERR )
IF (ierr/=0) CALL mp_stop( 9038 )
CALL MPI_GET_COUNT(istatus, MPI_DOUBLE_PRECISION, nrcv, ierr)
IF (ierr/=0) CALL mp_stop( 9039 )
msglen = nrcv
END IF
#endif
ELSEIF(mpime .EQ. sour)THEN
!msg_dest_d(1:SIZE(msg_sour_d,1),1:SIZE(msg_sour_d,2)) = msg_sour_d(:,:)
ierr = cudaMemcpy2D(msg_dest_d, SIZE(msg_dest_d,1),&
msg_sour_d, SIZE(msg_sour_d,1),&
SIZE(msg_sour_d,1), SIZE(msg_sour_d,2), &
cudaMemcpyDeviceToDevice )
msglen = SIZE(msg_sour_d)
END IF
#if defined(__MPI)
CALL MPI_BARRIER(group, IERR)
IF (ierr/=0) CALL mp_stop( 9040 )
#endif
#endif
ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI and __GPU_MPI
RETURN
END SUBROUTINE mp_put_rm_gpu
!
!------------------------------------------------------------------------------!
!
SUBROUTINE mp_put_cv_gpu(msg_dest_d, msg_sour_d, mpime, sour, dest, ip, gid)
USE parallel_include
IMPLICIT NONE
COMPLEX (DP), DEVICE :: msg_dest_d(:)
COMPLEX (DP), INTENT(IN), DEVICE :: msg_sour_d(:)
INTEGER, INTENT(IN) :: dest, sour, ip, mpime
INTEGER, INTENT(IN) :: gid
INTEGER :: group
#if defined(__MPI)
INTEGER :: istatus(MPI_STATUS_SIZE)
#endif
INTEGER :: ierr, nrcv
INTEGER :: msglen
!
#if ! defined(__GPU_MPI)
COMPLEX (DP), ALLOCATABLE :: msg_dest_h(:), msg_sour_h(:)
!
ALLOCATE( msg_dest_h, source=msg_dest_d ); ALLOCATE( msg_sour_h, source=msg_sour_d ); ! This syncs __MPI case
CALL mp_put_cv(msg_dest_h, msg_sour_h, mpime, sour, dest, ip, gid)
msg_dest_d = msg_dest_h
DEALLOCATE(msg_dest_h, msg_sour_h)
#else
!
#if defined(__MPI)
group = gid
#endif
! processors not taking part in the communication have 0 length message
msglen = 0
!
ierr = cudaDeviceSynchronize() ! This syncs SERIAL and __GPU_MPI
!
IF( dest .NE. sour ) THEN
#if defined(__MPI)
IF(mpime .EQ. sour) THEN
CALL MPI_SEND( msg_sour_d, SIZE(msg_sour_d), MPI_DOUBLE_COMPLEX, dest, ip, group, ierr)
IF (ierr/=0) CALL mp_stop( 9041 )
msglen = SIZE(msg_sour_d)
ELSE IF(mpime .EQ. dest) THEN
CALL MPI_RECV( msg_dest_d, SIZE(msg_dest_d), MPI_DOUBLE_COMPLEX, sour, ip, group, istatus, IERR )
IF (ierr/=0) CALL mp_stop( 9042 )
CALL MPI_GET_COUNT(istatus, MPI_DOUBLE_COMPLEX, nrcv, ierr)
IF (ierr/=0) CALL mp_stop( 9043 )
msglen = nrcv
END IF
#endif
ELSEIF(mpime .EQ. sour)THEN
!msg_dest_d(1:SIZE(msg_sour_d)) = msg_sour_d(:)
ierr = cudaMemcpy(msg_dest_d(1) , msg_sour_d(1), SIZE(msg_sour_d), cudaMemcpyDeviceToDevice )
msglen = SIZE(msg_sour_d)
END IF
#if defined(__MPI)
CALL MPI_BARRIER(group, IERR)
IF (ierr/=0) CALL mp_stop( 9044 )
#endif
#endif
ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI and __GPU_MPI
RETURN
END SUBROUTINE mp_put_cv_gpu
!
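!
! NOTE: mp_put_*_gpu mirrors mp_get_*_gpu with the rank arguments swapped
! in the interface (mpime, sour, dest instead of mpime, dest, sour); the
! data still flows from "sour" to "dest" with "ip" as the MPI tag.
! Illustrative call (hypothetical names):
!
!    REAL(DP), DEVICE :: bufa_d(10), bufb_d(10)
!    CALL mp_put( bufa_d, bufb_d, mpime, 0, 1, 200, comm )  ! rank 0 -> rank 1
!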
!------------------------------------------------------------------------------!
!
!..mp_sum
SUBROUTINE mp_sum_i1_gpu(msg_d,gid)
IMPLICIT NONE
INTEGER, INTENT (INOUT), DEVICE :: msg_d
INTEGER :: msg_h
INTEGER, INTENT(IN) :: gid
INTEGER :: msglen, ierr
! Avoid unnecessary communications on __MPI and syncs SERIAL
IF ( mp_size(gid) == 1 ) THEN
ierr = cudaDeviceSynchronize()
RETURN
END IF
!
#if defined(__MPI)
msglen = 1
#if defined(__GPU_MPI)
ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI
CALL reduce_base_integer_gpu( msglen, msg_d, gid, -1 )
! No need for final synchronization
#else
!
msg_h = msg_d ! This syncs __MPI case
CALL reduce_base_integer( msglen, msg_h, gid, -1 )
msg_d = msg_h
ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies
#endif
#endif
END SUBROUTINE mp_sum_i1_gpu
!
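!
! NOTE: each mp_sum_*_gpu below begins with the same guard: on a
! single-rank communicator the reduction is the identity, so the routine
! only synchronizes the device before returning, keeping the
! synchronization contract identical to the communicating case:
!
!    IF ( mp_size(gid) == 1 ) THEN
!       ierr = cudaDeviceSynchronize()
!       RETURN
!    END IF
!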
!------------------------------------------------------------------------------!
!
SUBROUTINE mp_sum_iv_gpu(msg_d,gid)
IMPLICIT NONE
INTEGER, INTENT (INOUT), DEVICE :: msg_d(:)
INTEGER, ALLOCATABLE :: msg_h(:)
INTEGER, INTENT(IN) :: gid
!
INTEGER :: msglen, ierr
! Avoid unnecessary communications on __MPI and syncs SERIAL
IF ( mp_size(gid) == 1 ) THEN
ierr = cudaDeviceSynchronize()
RETURN
END IF
!
#if defined(__MPI)
#if defined(__GPU_MPI)
msglen = size(msg_d)
ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI
CALL reduce_base_integer_gpu( msglen, msg_d, gid, -1 )
! No need for final synchronization
#else
ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case
msglen = size(msg_h)
CALL reduce_base_integer( msglen, msg_h, gid, -1 )
msg_d = msg_h; DEALLOCATE(msg_h)
ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies
#endif
#endif
END SUBROUTINE mp_sum_iv_gpu
!
!------------------------------------------------------------------------------!
!
SUBROUTINE mp_sum_im_gpu(msg_d,gid)
IMPLICIT NONE
INTEGER, INTENT (INOUT), DEVICE :: msg_d(:,:)
INTEGER, ALLOCATABLE :: msg_h(:,:)
INTEGER, INTENT(IN) :: gid
!
INTEGER :: msglen, ierr
! Avoid unnecessary communications on __MPI and syncs SERIAL
IF ( mp_size(gid) == 1 ) THEN
ierr = cudaDeviceSynchronize()
RETURN
END IF
!
#if defined(__MPI)
#if defined(__GPU_MPI)
msglen = size(msg_d)
ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI
CALL reduce_base_integer_gpu( msglen, msg_d, gid, -1 )
! No need for final synchronization
#else
ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case
msglen = size(msg_h)
CALL reduce_base_integer( msglen, msg_h, gid, -1 )
msg_d = msg_h; DEALLOCATE(msg_h)
ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies
#endif
#endif
END SUBROUTINE mp_sum_im_gpu
!
!------------------------------------------------------------------------------!
!
SUBROUTINE mp_sum_it_gpu(msg_d,gid)
IMPLICIT NONE
INTEGER, INTENT (INOUT), DEVICE :: msg_d(:,:,:)
INTEGER, ALLOCATABLE :: msg_h(:,:,:)
INTEGER, INTENT (IN) :: gid
!
INTEGER :: msglen, ierr
! Avoid unnecessary communications on __MPI and syncs SERIAL
IF ( mp_size(gid) == 1 ) THEN
ierr = cudaDeviceSynchronize()
RETURN
END IF
!
#if defined(__MPI)
#if defined(__GPU_MPI)
msglen = size(msg_d)
ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI
CALL reduce_base_integer_gpu( msglen, msg_d, gid, -1 )
! No need for final synchronization
#else
ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case
msglen = size(msg_h)
CALL reduce_base_integer( msglen, msg_h, gid, -1 )
msg_d = msg_h; DEALLOCATE(msg_h)
ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies
#endif
#endif
END SUBROUTINE mp_sum_it_gpu
!
!------------------------------------------------------------------------------!
!
SUBROUTINE mp_sum_r1_gpu(msg_d,gid)
IMPLICIT NONE
REAL (DP), INTENT (INOUT), DEVICE :: msg_d
REAL(DP) :: msg_h
INTEGER, INTENT (IN) :: gid
!
INTEGER :: msglen, ierr
! Avoid unnecessary communications on __MPI and syncs SERIAL
IF ( mp_size(gid) == 1 ) THEN
ierr = cudaDeviceSynchronize()
RETURN
END IF
!
#if defined(__MPI)
msglen = 1
#if defined(__GPU_MPI)
ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI
CALL reduce_base_real_gpu( msglen, msg_d, gid, -1 )
! No need for final synchronization
#else
msg_h=msg_d ! This syncs __MPI case
CALL reduce_base_real( msglen, msg_h, gid, -1 )
msg_d = msg_h
ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies
#endif
#endif
END SUBROUTINE mp_sum_r1_gpu
!
!------------------------------------------------------------------------------!
!
SUBROUTINE mp_sum_rv_gpu(msg_d,gid)
IMPLICIT NONE
REAL (DP), INTENT (INOUT), DEVICE :: msg_d(:)
REAL(DP), ALLOCATABLE :: msg_h(:)
INTEGER, INTENT (IN) :: gid
!
INTEGER :: msglen, ierr
! Avoid unnecessary communications on __MPI and syncs SERIAL
IF ( mp_size(gid) == 1 ) THEN
ierr = cudaDeviceSynchronize()
RETURN
END IF
!
#if defined(__MPI)
#if defined(__GPU_MPI)
msglen = size(msg_d)
ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI
CALL reduce_base_real_gpu( msglen, msg_d, gid, -1 )
! No need for final synchronization
#else
ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case
msglen = size(msg_h)
CALL reduce_base_real( msglen, msg_h, gid, -1 )
msg_d = msg_h; DEALLOCATE(msg_h)
ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies
#endif
#endif
END SUBROUTINE mp_sum_rv_gpu
!
!------------------------------------------------------------------------------!
!
SUBROUTINE mp_sum_rm_gpu(msg_d, gid)
IMPLICIT NONE
REAL (DP), INTENT (INOUT), DEVICE :: msg_d(:,:)
REAL (DP), ALLOCATABLE :: msg_h(:,:)
INTEGER, INTENT (IN) :: gid
!
INTEGER :: msglen, ierr
! Avoid unnecessary communications on __MPI and syncs SERIAL
IF ( mp_size(gid) == 1 ) THEN
ierr = cudaDeviceSynchronize()
RETURN
END IF
!
#if defined(__MPI)
#if defined(__GPU_MPI)
msglen = size(msg_d)
ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI
CALL reduce_base_real_gpu( msglen, msg_d, gid, -1 )
! No need for final synchronization
#else
ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case
msglen = size(msg_h)
CALL reduce_base_real( msglen, msg_h, gid, -1 )
msg_d = msg_h; DEALLOCATE(msg_h)
ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies
#endif
#endif
END SUBROUTINE mp_sum_rm_gpu
!
!------------------------------------------------------------------------------!
!
SUBROUTINE mp_sum_rm_nc_gpu(msg_d, k1, k2, k3, k4, gid)
! for non-contiguous (nc) arrays
IMPLICIT NONE
REAL (DP), INTENT (INOUT), DEVICE :: msg_d(:,:)
REAL (DP), ALLOCATABLE :: msg_h(:,:)
INTEGER, INTENT (IN) :: k1, k2, k3, k4
INTEGER, INTENT (IN) :: gid
REAL (DP), ALLOCATABLE :: msg_buff(:,:)
#if defined(__GPU_MPI)
ATTRIBUTES(DEVICE) :: msg_buff
#endif
!
INTEGER :: msglen, ierr
! Avoid unnecessary communications on __MPI and syncs SERIAL
IF ( mp_size(gid) == 1 ) THEN
ierr = cudaDeviceSynchronize()
RETURN
END IF
!
ALLOCATE( msg_buff(k2-k1+1,k4-k3+1) )
msg_buff(1:k2-k1+1,1:k4-k3+1) = msg_d(k1:k2,k3:k4)
msglen = size(msg_buff)
!
#if defined(__MPI)
#if defined(__GPU_MPI)
!msglen = size(msg_buff)
ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI
CALL reduce_base_real_gpu( msglen, msg_buff, gid, -1 )
! No need for final synchronization
#else
!ALLOCATE( msg_h, source=msg_buff ) ! This syncs __MPI case
!msglen = size(msg_h)
!msglen = size(msg_buff)
CALL reduce_base_real( msglen, msg_buff, gid, -1 )
!msg_buff = msg_h; DEALLOCATE(msg_h)
ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies
#endif
#endif
!
msg_d(k1:k2,k3:k4) = msg_buff(1:k2-k1+1,1:k4-k3+1)
DEALLOCATE( msg_buff )
!
END SUBROUTINE mp_sum_rm_nc_gpu
!
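!
! NOTE: mp_sum_rm_nc_gpu reduces only the sub-block msg_d(k1:k2,k3:k4):
! the block is packed into a contiguous buffer, reduced in place, and
! scattered back, since the reduction kernels expect contiguous data.
! Illustrative call (hypothetical array; sums rows 1:3, columns 1:2):
!
!    REAL(DP), DEVICE :: m_d(10,10)
!    CALL mp_sum( m_d, 1, 3, 1, 2, comm )  ! resolves to mp_sum_rm_nc_gpu
!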
!------------------------------------------------------------------------------!
!
SUBROUTINE mp_root_sum_rm_gpu( msg_d, res_d, root, gid )
USE parallel_include
IMPLICIT NONE
REAL (DP), INTENT (IN) , DEVICE :: msg_d(:,:)
REAL (DP), INTENT (OUT), DEVICE :: res_d(:,:)
REAL (DP), ALLOCATABLE :: res_h(:,:), msg_h(:,:)
INTEGER, INTENT (IN) :: root
INTEGER, INTENT (IN) :: gid
!
INTEGER :: msglen, ierr, taskid
#if defined(__MPI)
!
CALL mpi_comm_rank( gid, taskid, ierr)
IF( ierr /= 0 ) CALL mp_stop( 9045 )
!
msglen = size(msg_d)
IF( taskid == root ) THEN
IF( msglen > size(res_d) ) CALL mp_stop( 9046 )
END IF
#if defined(__GPU_MPI)
ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI
CALL reduce_base_real_to_gpu( msglen, msg_d, res_d, gid, root )
RETURN ! Sync not needed in this case
#else
ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case
IF( taskid == root ) ALLOCATE( res_h(lbound(res_d,1):ubound(res_d,1), lbound(res_d,2):ubound(res_d,2)));
CALL reduce_base_real_to( msglen, msg_h, res_h, gid, root )
IF( taskid == root ) res_d = res_h;
IF( taskid == root ) DEALLOCATE(res_h)
DEALLOCATE(msg_h)
#endif
#else
res_d = msg_d
#endif
ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI
END SUBROUTINE mp_root_sum_rm_gpu
!
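!
! NOTE: in the host-staged branch of mp_root_sum_*_gpu the host result
! buffer is allocated, with the bounds of res_d, on the root task only,
! because MPI_REDUCE references the receive buffer just on the root.
! Illustrative call (hypothetical names; root = 0):
!
!    REAL(DP), DEVICE :: ovlp_d(8,8), ovlp_sum_d(8,8)
!    CALL mp_root_sum( ovlp_d, ovlp_sum_d, 0, comm )
!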
!------------------------------------------------------------------------------!
!
SUBROUTINE mp_root_sum_cm_gpu( msg_d, res_d, root, gid )
USE parallel_include
IMPLICIT NONE
COMPLEX (DP), INTENT (IN) , DEVICE :: msg_d(:,:)
COMPLEX (DP), INTENT (OUT), DEVICE :: res_d(:,:)
COMPLEX (DP), ALLOCATABLE :: res_h(:,:), msg_h(:,:)
INTEGER, INTENT (IN) :: root
INTEGER, INTENT (IN) :: gid
!
INTEGER :: msglen, ierr, taskid
#if defined(__MPI)
msglen = size(msg_d)
CALL mpi_comm_rank( gid, taskid, ierr)
IF( ierr /= 0 ) CALL mp_stop( 9047 )
IF( taskid == root ) THEN
IF( msglen > size(res_d) ) CALL mp_stop( 9048 )
END IF
#if defined(__GPU_MPI)
ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI
CALL reduce_base_real_to_gpu( 2 * msglen, msg_d, res_d, gid, root )
RETURN ! Sync not needed in this case
#else
ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case
IF( taskid == root ) ALLOCATE( res_h(lbound(res_d,1):ubound(res_d,1), lbound(res_d,2):ubound(res_d,2)));
CALL reduce_base_real_to( 2 * msglen, msg_h, res_h, gid, root )
IF( taskid == root ) res_d = res_h;
IF( taskid == root ) DEALLOCATE(res_h)
DEALLOCATE(msg_h)
#endif
#else
res_d = msg_d
#endif
ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI
END SUBROUTINE mp_root_sum_cm_gpu
!
!------------------------------------------------------------------------------!
!
SUBROUTINE mp_sum_rmm_gpu( msg_d, res_d, root, gid )
USE parallel_include
IMPLICIT NONE
REAL (DP), INTENT (IN), DEVICE :: msg_d(:,:)
REAL (DP), INTENT (OUT),DEVICE :: res_d(:,:)
REAL (DP), ALLOCATABLE :: res_h(:,:), msg_h(:,:)
INTEGER, INTENT (IN) :: root
INTEGER, INTENT (IN) :: gid
INTEGER :: group
INTEGER :: msglen
INTEGER :: taskid, ierr
#if defined(__MPI)
msglen = size(msg_d)
!
group = gid
!
CALL mpi_comm_rank( group, taskid, ierr)
IF( ierr /= 0 ) CALL mp_stop( 9049 )
IF( taskid == root ) THEN
IF( msglen > size(res_d) ) CALL mp_stop( 9050 )
END IF
!
#if defined(__GPU_MPI)
ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI
CALL reduce_base_real_to_gpu( msglen, msg_d, res_d, group, root )
RETURN ! Sync not needed in this case
#else
ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case
IF( taskid == root ) ALLOCATE( res_h(lbound(res_d,1):ubound(res_d,1), lbound(res_d,2):ubound(res_d,2)));
CALL reduce_base_real_to( msglen, msg_h, res_h, gid, root )
IF( taskid == root ) res_d = res_h;
IF( taskid == root ) DEALLOCATE(res_h)
DEALLOCATE(msg_h)
#endif
!
#else
res_d = msg_d
#endif
ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI
END SUBROUTINE mp_sum_rmm_gpu
!
!------------------------------------------------------------------------------!
!
SUBROUTINE mp_sum_rt_gpu( msg_d, gid )
IMPLICIT NONE
REAL (DP), INTENT (INOUT), DEVICE :: msg_d(:,:,:)
REAL (DP), ALLOCATABLE :: msg_h(:,:,:)
INTEGER, INTENT(IN) :: gid
INTEGER :: msglen, ierr
! Avoid unnecessary communications on __MPI and syncs SERIAL
IF ( mp_size(gid) == 1 ) THEN
ierr = cudaDeviceSynchronize()
RETURN
END IF
!
#if defined(__MPI)
#if defined(__GPU_MPI)
msglen = size(msg_d)
ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI
CALL reduce_base_real_gpu( msglen, msg_d, gid, -1 )
! Sync not needed after MPI call
#else
ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case
msglen = size(msg_h)
CALL reduce_base_real( msglen, msg_h, gid, -1 )
msg_d = msg_h; DEALLOCATE(msg_h)
ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies
#endif
#endif
END SUBROUTINE mp_sum_rt_gpu
!
!------------------------------------------------------------------------------!
!
SUBROUTINE mp_sum_r4d_gpu(msg_d,gid)
IMPLICIT NONE
REAL (DP), INTENT (INOUT), DEVICE :: msg_d(:,:,:,:)
REAL (DP), ALLOCATABLE :: msg_h(:,:,:,:)
INTEGER, INTENT(IN) :: gid
!
INTEGER :: msglen, ierr
! Avoid unnecessary communications on __MPI and syncs SERIAL
IF ( mp_size(gid) == 1 ) THEN
ierr = cudaDeviceSynchronize()
RETURN
END IF
!
#if defined(__MPI)
#if defined(__GPU_MPI)
msglen = size(msg_d)
ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI
CALL reduce_base_real_gpu( msglen, msg_d, gid, -1 )
! Sync not needed after MPI call
#else
ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case
msglen = size(msg_h)
CALL reduce_base_real( msglen, msg_h, gid, -1 )
msg_d = msg_h; DEALLOCATE(msg_h)
ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies
#endif
#endif
END SUBROUTINE mp_sum_r4d_gpu
!
!------------------------------------------------------------------------------!
!
SUBROUTINE mp_sum_c1_gpu(msg_d,gid)
IMPLICIT NONE
COMPLEX (DP), INTENT (INOUT), DEVICE :: msg_d
COMPLEX (DP) :: msg_h
INTEGER, INTENT(IN) :: gid
!
INTEGER :: msglen, ierr
! Avoid unnecessary communications on __MPI and syncs SERIAL
IF ( mp_size(gid) == 1 ) THEN
ierr = cudaDeviceSynchronize()
RETURN
END IF
!
#if defined(__MPI)
msglen = 1
#if defined(__GPU_MPI)
ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI
CALL reduce_base_real_gpu( 2 * msglen, msg_d, gid, -1 )
! Sync not needed after MPI call
#else
msg_h=msg_d ! This syncs __MPI case
CALL reduce_base_real( 2 * msglen, msg_h, gid, -1 )
msg_d = msg_h
ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies
#endif
#endif
END SUBROUTINE mp_sum_c1_gpu
!
!------------------------------------------------------------------------------!
!
SUBROUTINE mp_sum_cv_gpu(msg_d,gid)
IMPLICIT NONE
COMPLEX (DP), INTENT (INOUT), DEVICE :: msg_d(:)
COMPLEX (DP), ALLOCATABLE :: msg_h(:)
INTEGER, INTENT(IN) :: gid
!
INTEGER :: msglen, ierr
!
! Avoid unnecessary communications on __MPI and syncs SERIAL
IF ( mp_size(gid) == 1 ) THEN
ierr = cudaDeviceSynchronize()
RETURN
END IF
!
#if defined(__MPI)
#if defined(__GPU_MPI)
msglen = size(msg_d)
ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI
CALL reduce_base_real_gpu( 2 * msglen, msg_d, gid, -1 )
! Sync not needed after MPI call
#else
ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case
msglen = size(msg_h)
CALL reduce_base_real( 2 * msglen, msg_h, gid, -1 )
msg_d = msg_h; DEALLOCATE(msg_h)
ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies
#endif
#endif
END SUBROUTINE mp_sum_cv_gpu
!
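!
! NOTE: the complex reductions reuse the double-precision real kernels
! by passing 2*msglen: a COMPLEX(DP) array is summed as an interleaved
! (re,im) REAL(DP) array of twice the length. Illustrative call
! (hypothetical array):
!
!    COMPLEX(DP), DEVICE :: c_d(100)
!    CALL mp_sum( c_d, comm )  ! internally reduce_base_real*( 2*100, ... )
!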
!------------------------------------------------------------------------------!
!
SUBROUTINE mp_sum_cm_gpu(msg_d, gid)
IMPLICIT NONE
COMPLEX (DP), INTENT (INOUT), DEVICE :: msg_d(:,:)
COMPLEX (DP), ALLOCATABLE :: msg_h(:,:)
INTEGER, INTENT (IN) :: gid
INTEGER :: msglen, ierr
! Avoid unnecessary communications on __MPI and syncs SERIAL
IF ( mp_size(gid) == 1 ) THEN
ierr = cudaDeviceSynchronize()
RETURN
END IF
!
#if defined(__MPI)
#if defined(__GPU_MPI)
msglen = size(msg_d)
ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI
CALL reduce_base_real_gpu( 2 * msglen, msg_d, gid, -1 )
! Sync not needed after MPI call
#else
ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case
msglen = size(msg_h)
CALL reduce_base_real( 2 * msglen, msg_h, gid, -1 )
msg_d = msg_h; DEALLOCATE(msg_h)
ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies
#endif
#endif
END SUBROUTINE mp_sum_cm_gpu
!
!------------------------------------------------------------------------------!
!
SUBROUTINE mp_sum_cm_nc_gpu(msg_d, k1, k2, k3, k4, gid)
! for non-contiguous (nc) arrays
IMPLICIT NONE
COMPLEX (DP), INTENT (INOUT), DEVICE :: msg_d(:,:)
COMPLEX (DP), ALLOCATABLE :: msg_h(:,:)
INTEGER, INTENT (IN) :: k1, k2, k3, k4
INTEGER, INTENT (IN) :: gid
COMPLEX (DP), ALLOCATABLE :: msg_buff(:,:)
#if defined(__GPU_MPI)
ATTRIBUTES(DEVICE) :: msg_buff
#endif
!
INTEGER :: msglen, ierr
! Avoid unnecessary communications on __MPI and syncs SERIAL
IF ( mp_size(gid) == 1 ) THEN
ierr = cudaDeviceSynchronize()
RETURN
END IF
!
ALLOCATE( msg_buff(k2-k1+1,k4-k3+1) )
msg_buff(1:k2-k1+1,1:k4-k3+1) = msg_d(k1:k2,k3:k4)
msglen = size(msg_buff)
!
#if defined(__MPI)
#if defined(__GPU_MPI)
!msglen = size(msg_buff)
ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI
CALL reduce_base_real_gpu( 2 * msglen, msg_buff, gid, -1 )
! No need for final synchronization
#else
!ALLOCATE( msg_h, source=msg_buff ) ! This syncs __MPI case
!msglen = size(msg_h)
!msglen = size(msg_buff)
CALL reduce_base_real( 2 * msglen, msg_buff, gid, -1 )
!msg_buff = msg_h; DEALLOCATE(msg_h)
ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies
#endif
#endif
!
msg_d(k1:k2,k3:k4) = msg_buff(1:k2-k1+1,1:k4-k3+1)
DEALLOCATE( msg_buff )
!
END SUBROUTINE mp_sum_cm_nc_gpu
!
!------------------------------------------------------------------------------!
!
SUBROUTINE mp_sum_cmm_gpu(msg_d, res_d, gid)
IMPLICIT NONE
COMPLEX (DP), INTENT (IN), DEVICE :: msg_d(:,:)
COMPLEX (DP), INTENT (OUT), DEVICE :: res_d(:,:)
COMPLEX (DP), ALLOCATABLE :: msg_h(:,:), res_h(:,:)
INTEGER, INTENT (IN) :: gid
!
INTEGER :: msglen, ierr
#if defined(__MPI)
#if defined(__GPU_MPI)
msglen = size(msg_d)
ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI
CALL reduce_base_real_to_gpu( 2 * msglen, msg_d, res_d, gid, -1 )
RETURN ! Sync not needed after MPI call
#else
ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case
msglen = size(msg_h)
ALLOCATE( res_h(lbound(msg_h,1):ubound(msg_h,1), lbound(msg_h,2):ubound(msg_h,2)));
CALL reduce_base_real_to( 2 * msglen, msg_h, res_h, gid, -1 )
res_d = res_h; DEALLOCATE(msg_h, res_h)
#endif
#else
res_d = msg_d
#endif
ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI
END SUBROUTINE mp_sum_cmm_gpu
!
!------------------------------------------------------------------------------!
!
SUBROUTINE mp_sum_ct_gpu(msg_d,gid)
IMPLICIT NONE
COMPLEX (DP), INTENT (INOUT), DEVICE :: msg_d(:,:,:)
COMPLEX (DP), ALLOCATABLE :: msg_h(:,:,:)
INTEGER, INTENT(IN) :: gid
!
INTEGER :: msglen, ierr
! Avoid unnecessary communications on __MPI and syncs SERIAL
IF ( mp_size(gid) == 1 ) THEN
ierr = cudaDeviceSynchronize()
RETURN
END IF
!
#if defined(__MPI)
#if defined(__GPU_MPI)
msglen = SIZE(msg_d)
ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI
CALL reduce_base_real_gpu( 2 * msglen, msg_d, gid, -1 )
! Sync not needed after MPI call
#else
ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case
msglen = size(msg_h)
CALL reduce_base_real( 2 * msglen, msg_h, gid, -1 )
msg_d = msg_h; DEALLOCATE(msg_h)
ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies
#endif
#endif
END SUBROUTINE mp_sum_ct_gpu
!
!------------------------------------------------------------------------------!
!
SUBROUTINE mp_sum_c4d_gpu(msg_d,gid)
IMPLICIT NONE
COMPLEX (DP), INTENT (INOUT), DEVICE :: msg_d(:,:,:,:)
COMPLEX (DP), ALLOCATABLE :: msg_h(:,:,:,:)
INTEGER, INTENT(IN) :: gid
!
INTEGER :: msglen, ierr
! Avoid unnecessary communications on __MPI and syncs SERIAL
IF ( mp_size(gid) == 1 ) THEN
ierr = cudaDeviceSynchronize()
RETURN
END IF
!
#if defined(__MPI)
#if defined(__GPU_MPI)
msglen = size(msg_d)
ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI
CALL reduce_base_real_gpu( 2 * msglen, msg_d, gid, -1 )
! Sync not needed after MPI call
#else
ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case
msglen = size(msg_h)
CALL reduce_base_real( 2 * msglen, msg_h, gid, -1 )
msg_d = msg_h; DEALLOCATE(msg_h)
ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies
#endif
#endif
END SUBROUTINE mp_sum_c4d_gpu
!
!------------------------------------------------------------------------------!
!
SUBROUTINE mp_sum_c5d_gpu(msg_d,gid)
IMPLICIT NONE
COMPLEX (DP), INTENT (INOUT), DEVICE :: msg_d(:,:,:,:,:)
COMPLEX (DP), ALLOCATABLE :: msg_h(:,:,:,:,:)
INTEGER, INTENT(IN) :: gid
!
INTEGER :: msglen, ierr
! Avoid unnecessary communications on __MPI and syncs SERIAL
IF ( mp_size(gid) == 1 ) THEN
ierr = cudaDeviceSynchronize()
RETURN
END IF
!
#if defined(__MPI)
#if defined(__GPU_MPI)
msglen = size(msg_d)
ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI
CALL reduce_base_real_gpu( 2 * msglen, msg_d, gid, -1 )
! Sync not needed after MPI call
#else
ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case
msglen = size(msg_h)
CALL reduce_base_real( 2 * msglen, msg_h, gid, -1 )
msg_d = msg_h; DEALLOCATE(msg_h)
ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies
#endif
#endif
END SUBROUTINE mp_sum_c5d_gpu
!
!------------------------------------------------------------------------------!
!
SUBROUTINE mp_sum_r5d_gpu(msg_d,gid)
IMPLICIT NONE
REAL (DP), INTENT (INOUT), DEVICE :: msg_d(:,:,:,:,:)
REAL (DP), ALLOCATABLE :: msg_h(:,:,:,:,:)
INTEGER, INTENT(IN) :: gid
!
INTEGER :: msglen, ierr
! Avoid unnecessary communications on __MPI and syncs SERIAL
IF ( mp_size(gid) == 1 ) THEN
ierr = cudaDeviceSynchronize()
RETURN
END IF
!
#if defined(__MPI)
#if defined(__GPU_MPI)
msglen = size(msg_d)
ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI
CALL reduce_base_real_gpu( msglen, msg_d, gid, -1 )
! Sync not needed after MPI call
#else
ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case
msglen = size(msg_h)
CALL reduce_base_real( msglen, msg_h, gid, -1 )
msg_d = msg_h; DEALLOCATE(msg_h)
ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies
#endif
#endif
END SUBROUTINE mp_sum_r5d_gpu
!
!------------------------------------------------------------------------------!
!
SUBROUTINE mp_sum_r6d_gpu(msg_d,gid)
IMPLICIT NONE
REAL (DP), INTENT (INOUT), DEVICE :: msg_d(:,:,:,:,:,:)
REAL (DP), ALLOCATABLE :: msg_h(:,:,:,:,:,:)
INTEGER, INTENT(IN) :: gid
!
INTEGER :: msglen, ierr
! Avoid unnecessary communications on __MPI and syncs SERIAL
IF ( mp_size(gid) == 1 ) THEN
ierr = cudaDeviceSynchronize()
RETURN
END IF
!
#if defined(__MPI)
#if defined(__GPU_MPI)
msglen = size(msg_d)
ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI
CALL reduce_base_real_gpu( msglen, msg_d, gid, -1 )
! Sync not needed after MPI call
#else
ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case
msglen = size(msg_h)
CALL reduce_base_real( msglen, msg_h, gid, -1 )
msg_d = msg_h; DEALLOCATE(msg_h)
ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies
#endif
#endif
END SUBROUTINE mp_sum_r6d_gpu
!
!------------------------------------------------------------------------------!
!
SUBROUTINE mp_sum_c6d_gpu(msg_d,gid)
IMPLICIT NONE
COMPLEX (DP), INTENT (INOUT), DEVICE :: msg_d(:,:,:,:,:,:)
COMPLEX (DP), ALLOCATABLE :: msg_h(:,:,:,:,:,:)
INTEGER, INTENT(IN) :: gid
!
INTEGER :: msglen, ierr
! Avoid unnecessary communications on __MPI and syncs SERIAL
IF ( mp_size(gid) == 1 ) THEN
ierr = cudaDeviceSynchronize()
RETURN
END IF
!
#if defined(__MPI)
#if defined(__GPU_MPI)
msglen = size(msg_d)
ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI
CALL reduce_base_real_gpu( 2 * msglen, msg_d, gid, -1 )
! Sync not needed after MPI call
#else
ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case
msglen = size(msg_h)
CALL reduce_base_real( 2 * msglen, msg_h, gid, -1 )
msg_d = msg_h; DEALLOCATE(msg_h)
ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies
#endif
#endif
END SUBROUTINE mp_sum_c6d_gpu
!
!------------------------------------------------------------------------------!
!
|
|
SUBROUTINE mp_max_i_gpu(msg_d,gid)
|
|
IMPLICIT NONE
|
|
INTEGER, INTENT (INOUT), DEVICE :: msg_d
|
|
INTEGER :: msg_h
|
|
INTEGER, INTENT(IN) :: gid
|
|
!
|
|
INTEGER :: msglen, ierr
|
|
! Avoid unnecessary communications on __MPI and syncs SERIAL
|
|
IF ( mp_size(gid) == 1 ) THEN
|
|
ierr = cudaDeviceSynchronize()
|
|
RETURN
|
|
END IF
|
|
!
|
|
#if defined(__MPI)
|
|
msglen = 1
|
|
#if defined(__GPU_MPI)
|
|
ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI
|
|
CALL parallel_max_integer_gpu( msglen, msg_d, gid, -1 )
|
|
! Sync not needed after MPI call
|
|
#else
|
|
msg_h = msg_d ! This syncs __MPI case
|
|
CALL parallel_max_integer( msglen, msg_h, gid, -1 )
|
|
msg_d = msg_h
|
|
ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies
|
|
#endif
|
|
#endif
|
|
END SUBROUTINE mp_max_i_gpu
|
|
!
|
|
!------------------------------------------------------------------------------!
|
|
!
|
|
SUBROUTINE mp_max_iv_gpu(msg_d,gid)
|
|
IMPLICIT NONE
|
|
INTEGER, INTENT (INOUT), DEVICE :: msg_d(:)
|
|
INTEGER, ALLOCATABLE :: msg_h(:)
|
|
INTEGER, INTENT(IN) :: gid
|
|
!
|
|
INTEGER :: msglen, ierr
|
|
! Avoid unnecessary communications on __MPI and syncs SERIAL
|
|
IF ( mp_size(gid) == 1 ) THEN
|
|
ierr = cudaDeviceSynchronize()
|
|
RETURN
|
|
END IF
|
|
!
|
|
#if defined(__MPI)
|
|
#if defined(__GPU_MPI)
|
|
msglen = size(msg_d)
|
|
ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI
|
|
CALL parallel_max_integer_gpu( msglen, msg_d, gid, -1 )
|
|
! Sync not needed after MPI call
|
|
#else
|
|
ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case
|
|
msglen = size(msg_h)
|
|
CALL parallel_max_integer( msglen, msg_h, gid, -1 )
|
|
msg_d = msg_h; DEALLOCATE(msg_h)
|
|
ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies
|
|
#endif
|
|
#endif
|
|
END SUBROUTINE mp_max_iv_gpu
|
|
!
|
|
!----------------------------------------------------------------------
|
|
!
|
|
SUBROUTINE mp_max_r_gpu(msg_d,gid)
|
|
IMPLICIT NONE
|
|
REAL (DP), INTENT (INOUT), DEVICE :: msg_d
|
|
REAL (DP) :: msg_h
|
|
INTEGER, INTENT(IN) :: gid
|
|
!
|
|
INTEGER :: msglen, ierr
|
|
! Avoid unnecessary communications on __MPI and syncs SERIAL
|
|
IF ( mp_size(gid) == 1 ) THEN
|
|
ierr = cudaDeviceSynchronize()
|
|
RETURN
|
|
END IF
|
|
!
|
|
#if defined(__MPI)
|
|
msglen = 1
|
|
#if defined(__GPU_MPI)
|
|
ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI
|
|
CALL parallel_max_real_gpu( msglen, msg_d, gid, -1 )
|
|
! Sync not needed after MPI call
|
|
#else
|
|
msg_h = msg_d ! This syncs __MPI case
|
|
CALL parallel_max_real( msglen, msg_h, gid, -1 )
|
|
msg_d = msg_h
|
|
ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies
|
|
#endif
|
|
#endif
|
|
END SUBROUTINE mp_max_r_gpu
|
|
!
|
|
!------------------------------------------------------------------------------!
|
|
!
|
|
SUBROUTINE mp_max_rv_gpu(msg_d,gid)
|
|
IMPLICIT NONE
|
|
REAL (DP), INTENT (INOUT), DEVICE :: msg_d(:)
|
|
REAL (DP), ALLOCATABLE :: msg_h(:)
|
|
INTEGER, INTENT(IN) :: gid
|
|
!
|
|
INTEGER :: msglen, ierr
|
|
! Avoid unnecessary communications on __MPI and syncs SERIAL
|
|
IF ( mp_size(gid) == 1 ) THEN
|
|
ierr = cudaDeviceSynchronize()
|
|
RETURN
|
|
END IF
|
|
!
|
|
#if defined(__MPI)
|
|
#if defined(__GPU_MPI)
|
|
msglen = size(msg_d)
|
|
ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI
|
|
CALL parallel_max_real_gpu( msglen, msg_d, gid, -1 )
|
|
! Sync not needed after MPI call
|
|
#else
|
|
ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case
|
|
msglen = size(msg_h)
|
|
CALL parallel_max_real( msglen, msg_h, gid, -1 )
|
|
msg_d = msg_h; DEALLOCATE(msg_h)
|
|
ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies
|
|
#endif
|
|
#endif
|
|
END SUBROUTINE mp_max_rv_gpu
|
|
!
|
|
!------------------------------------------------------------------------------!
|
|
!
|
|
SUBROUTINE mp_min_i_gpu(msg_d,gid)
|
|
IMPLICIT NONE
|
|
INTEGER, INTENT (INOUT), DEVICE :: msg_d
|
|
INTEGER :: msg_h
|
|
INTEGER, INTENT(IN) :: gid
|
|
!
|
|
INTEGER :: msglen, ierr
|
|
! Avoid unnecessary communications on __MPI and syncs SERIAL
|
|
IF ( mp_size(gid) == 1 ) THEN
|
|
ierr = cudaDeviceSynchronize()
|
|
RETURN
|
|
END IF
|
|
!
|
|
#if defined(__MPI)
|
|
msglen = 1
|
|
#if defined(__GPU_MPI)
|
|
ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI
|
|
CALL parallel_min_integer_gpu( msglen, msg_d, gid, -1 )
|
|
! Sync not needed after MPI call
|
|
#else
|
|
msg_h = msg_d ! This syncs __MPI case
|
|
CALL parallel_min_integer( msglen, msg_h, gid, -1 )
|
|
msg_d = msg_h
|
|
ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies
|
|
#endif
|
|
#endif
|
|
END SUBROUTINE mp_min_i_gpu
!
!------------------------------------------------------------------------------!
!
      SUBROUTINE mp_min_iv_gpu(msg_d,gid)
        IMPLICIT NONE
        INTEGER, INTENT (INOUT), DEVICE :: msg_d(:)
        INTEGER, ALLOCATABLE :: msg_h(:)
        INTEGER, INTENT(IN) :: gid
        !
        INTEGER :: msglen, ierr
        ! Avoid unnecessary communications on __MPI and syncs SERIAL
        IF ( mp_size(gid) == 1 ) THEN
          ierr = cudaDeviceSynchronize()
          RETURN
        END IF
        !
#if defined(__MPI)
#if defined(__GPU_MPI)
        msglen = SIZE(msg_d)
        ierr = cudaDeviceSynchronize()   ! This syncs __GPU_MPI
        CALL parallel_min_integer_gpu( msglen, msg_d, gid, -1 )
        ! Sync not needed after MPI call
#else
        ALLOCATE( msg_h, source=msg_d )  ! This syncs __MPI case
        msglen = SIZE(msg_h)
        CALL parallel_min_integer( msglen, msg_h, gid, -1 )
        msg_d = msg_h; DEALLOCATE(msg_h)
        ierr = cudaDeviceSynchronize()   ! This syncs __MPI for small copies
#endif
#endif
      END SUBROUTINE mp_min_iv_gpu
!
!------------------------------------------------------------------------------!
!
      SUBROUTINE mp_min_r_gpu(msg_d,gid)
        IMPLICIT NONE
        REAL (DP), INTENT (INOUT), DEVICE :: msg_d
        REAL (DP) :: msg_h
        INTEGER, INTENT(IN) :: gid
        !
        INTEGER :: msglen, ierr
        ! Avoid unnecessary communications on __MPI and syncs SERIAL
        IF ( mp_size(gid) == 1 ) THEN
          ierr = cudaDeviceSynchronize()
          RETURN
        END IF
        !
#if defined(__MPI)
        msglen = 1
#if defined(__GPU_MPI)
        ierr = cudaDeviceSynchronize()   ! This syncs __GPU_MPI
        CALL parallel_min_real_gpu( msglen, msg_d, gid, -1 )
        ! Sync not needed after MPI call
#else
        msg_h = msg_d                    ! This syncs __MPI case
        CALL parallel_min_real( msglen, msg_h, gid, -1 )
        msg_d = msg_h
        ierr = cudaDeviceSynchronize()   ! This syncs __MPI for small copies
#endif
#endif
      END SUBROUTINE mp_min_r_gpu
!
!------------------------------------------------------------------------------!
!
      SUBROUTINE mp_min_rv_gpu(msg_d,gid)
        IMPLICIT NONE
        REAL (DP), INTENT (INOUT), DEVICE :: msg_d(:)
        REAL (DP), ALLOCATABLE :: msg_h(:)
        INTEGER, INTENT(IN) :: gid
        !
        INTEGER :: msglen, ierr
        ! Avoid unnecessary communications on __MPI and syncs SERIAL
        IF ( mp_size(gid) == 1 ) THEN
          ierr = cudaDeviceSynchronize()
          RETURN
        END IF
        !
#if defined(__MPI)
#if defined(__GPU_MPI)
        msglen = SIZE(msg_d)
        ierr = cudaDeviceSynchronize()   ! This syncs __GPU_MPI
        CALL parallel_min_real_gpu( msglen, msg_d, gid, -1 )
        ! Sync not needed after MPI call
#else
        ALLOCATE( msg_h, source=msg_d )  ! This syncs __MPI case
        msglen = SIZE(msg_h)
        CALL parallel_min_real( msglen, msg_h, gid, -1 )
        msg_d = msg_h; DEALLOCATE(msg_h)
        ierr = cudaDeviceSynchronize()   ! This syncs __MPI for small copies
#endif
#endif
      END SUBROUTINE mp_min_rv_gpu
!
!------------------------------------------------------------------------------!
!..mp_gather
!
      SUBROUTINE mp_gather_i1_gpu(mydata_d, alldata_d, root, gid)
        USE parallel_include
        IMPLICIT NONE
        INTEGER, DEVICE :: mydata_d
        INTEGER, INTENT(IN) :: gid, root
        INTEGER :: group
        INTEGER, INTENT(OUT), DEVICE :: alldata_d(:)
        INTEGER :: ierr
        !
#if defined (__MPI)
#if ! defined(__GPU_MPI)
        INTEGER :: mydata_h
        INTEGER, ALLOCATABLE :: alldata_h(:)
        ALLOCATE( alldata_h, source=alldata_d )  ! This syncs __MPI
        mydata_h = mydata_d
        CALL mp_gather_i1(mydata_h, alldata_h, root, gid)
        mydata_d = mydata_h; alldata_d = alldata_h
        DEALLOCATE(alldata_h)
#else
        group = gid
        ierr = cudaDeviceSynchronize()   ! This syncs __GPU_MPI
        CALL MPI_GATHER(mydata_d, 1, MPI_INTEGER, alldata_d, 1, MPI_INTEGER, root, group, IERR)
        IF (ierr/=0) CALL mp_stop( 9051 )
        RETURN ! Sync not needed after MPI call
#endif
#else
        !alldata_d(1) = mydata_d
        ierr = cudaMemcpy( alldata_d(1), mydata_d, 1, cudaMemcpyDeviceToDevice )
        IF (ierr/=0) CALL mp_stop( 9052 )
#endif
        ierr = cudaDeviceSynchronize()   ! This syncs SERIAL, __MPI
      END SUBROUTINE mp_gather_i1_gpu
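!
! NOTE on the serial branch above: the count argument of CUDA Fortran's
! generic cudaMemcpy is given in array *elements*, not bytes, hence the
! single integer is copied with a count of 1.
!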
!
!------------------------------------------------------------------------------!
!
      SUBROUTINE mp_gather_iv_gpu(mydata_d, alldata_d, root, gid)
        USE parallel_include
        IMPLICIT NONE
        INTEGER, DEVICE :: mydata_d(:)
        INTEGER, INTENT(IN) :: gid, root
        INTEGER :: group
        INTEGER, INTENT(OUT), DEVICE :: alldata_d(:,:)
        INTEGER :: msglen, ierr
        !
#if defined (__MPI)
#if ! defined(__GPU_MPI)
        INTEGER, ALLOCATABLE :: mydata_h(:)
        INTEGER, ALLOCATABLE :: alldata_h(:,:)
        ALLOCATE( mydata_h, source=mydata_d )    ! This syncs __MPI
        ALLOCATE( alldata_h, source=alldata_d )
        !
        CALL mp_gather_iv(mydata_h, alldata_h, root, gid)
        mydata_d = mydata_h; alldata_d = alldata_h
        DEALLOCATE(alldata_h, mydata_h)
#else
        msglen = SIZE(mydata_d)
        IF( msglen .NE. SIZE(alldata_d, 1) ) CALL mp_stop( 9053 )
        group = gid
        ierr = cudaDeviceSynchronize()   ! This syncs __GPU_MPI
        CALL MPI_GATHER(mydata_d, msglen, MPI_INTEGER, alldata_d, msglen, MPI_INTEGER, root, group, IERR)
        IF (ierr/=0) CALL mp_stop( 9054 )
        RETURN ! Sync not needed after MPI call
#endif
#else
        msglen = SIZE(mydata_d)
        IF( msglen .NE. SIZE(alldata_d, 1) ) CALL mp_stop( 9055 )
        !alldata_d(:,1) = mydata_d(:)
        ierr = cudaMemcpy( alldata_d(:,1), mydata_d(1), msglen, cudaMemcpyDeviceToDevice )
        IF (ierr/=0) CALL mp_stop( 9056 )
#endif
        ierr = cudaDeviceSynchronize()   ! This syncs SERIAL, __MPI
      END SUBROUTINE mp_gather_iv_gpu
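!
! Illustrative gather of one integer per rank onto the root (a sketch;
! "comm" and "nproc" are hypothetical names; with __CUDA the device
! arguments are expected to select mp_gather_i1_gpu):
!
!   INTEGER, DEVICE :: val_d, all_d(nproc)
!   CALL mp_gather( val_d, all_d, 0, comm )   ! all_d significant on rank 0 only
!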
!
!------------------------------------------------------------------------------!
!..mp_gatherv_rv
!
      SUBROUTINE mp_gatherv_rv_gpu( mydata_d, alldata_d, recvcount, displs, root, gid)
        USE parallel_include
        IMPLICIT NONE
        REAL(DP), DEVICE :: mydata_d(:)
        REAL(DP), DEVICE :: alldata_d(:)
        INTEGER, INTENT(IN) :: recvcount(:), displs(:), root
        INTEGER, INTENT(IN) :: gid
        INTEGER :: group
        INTEGER :: ierr, npe, myid
#if defined (__MPI)
#if ! defined(__GPU_MPI)
        REAL(DP), ALLOCATABLE :: mydata_h(:)
        REAL(DP), ALLOCATABLE :: alldata_h(:)
        !
        ALLOCATE(mydata_h, source=mydata_d)      ! This syncs __MPI
        ALLOCATE(alldata_h, source=alldata_d)
        CALL mp_gatherv_rv( mydata_h, alldata_h, recvcount, displs, root, gid)
        alldata_d = alldata_h ; mydata_d = mydata_h
        DEALLOCATE(alldata_h, mydata_h)
#else
        group = gid
        CALL mpi_comm_size( group, npe, ierr )
        IF (ierr/=0) CALL mp_stop( 9057 )
        CALL mpi_comm_rank( group, myid, ierr )
        IF (ierr/=0) CALL mp_stop( 9058 )
        !
        IF ( SIZE( recvcount ) < npe .OR. SIZE( displs ) < npe ) CALL mp_stop( 9059 )
        IF ( myid == root ) THEN
          IF ( SIZE( alldata_d ) < displs( npe ) + recvcount( npe ) ) CALL mp_stop( 9060 )
        END IF
        IF ( SIZE( mydata_d ) < recvcount( myid + 1 ) ) CALL mp_stop( 9061 )
        !
        ierr = cudaDeviceSynchronize()   ! This syncs __GPU_MPI
        CALL MPI_GATHERV( mydata_d, recvcount( myid + 1 ), MPI_DOUBLE_PRECISION, &
                          alldata_d, recvcount, displs, MPI_DOUBLE_PRECISION, root, group, ierr )
        IF (ierr/=0) CALL mp_stop( 9062 )
        RETURN ! Sync not needed after MPI call
#endif
#else
        IF ( SIZE( alldata_d ) < recvcount( 1 ) ) CALL mp_stop( 9063 )
        IF ( SIZE( mydata_d ) < recvcount( 1 ) ) CALL mp_stop( 9064 )
        !
        !alldata_d( 1:recvcount( 1 ) ) = mydata_d( 1:recvcount( 1 ) )
        ierr = cudaMemcpy( alldata_d(1), mydata_d(1), recvcount( 1 ), cudaMemcpyDeviceToDevice )
        IF (ierr/=0) CALL mp_stop( 9065 )
#endif
        ierr = cudaDeviceSynchronize()   ! This syncs SERIAL, __MPI
      END SUBROUTINE mp_gatherv_rv_gpu
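!
! Illustrative variable-count gather (a sketch; "nloc", "ntot", "nproc" and
! "comm" are hypothetical names; mp_set_displs builds the displacements from
! the per-rank counts):
!
!   REAL(DP), DEVICE :: mine_d(nloc), all_d(ntot)
!   CALL mp_set_displs( recvcount, displs, ntot, nproc )
!   CALL mp_gatherv_rv_gpu( mine_d, all_d, recvcount, displs, 0, comm )
!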
!
!------------------------------------------------------------------------------!
!..mp_gatherv_cv
!
      SUBROUTINE mp_gatherv_cv_gpu( mydata_d, alldata_d, recvcount, displs, root, gid)
        USE parallel_include
        IMPLICIT NONE
        COMPLEX(DP), DEVICE :: mydata_d(:)
        COMPLEX(DP), DEVICE :: alldata_d(:)
        INTEGER, INTENT(IN) :: recvcount(:), displs(:), root
        INTEGER, INTENT(IN) :: gid
        INTEGER :: group
        INTEGER :: ierr, npe, myid
        !
#if defined (__MPI)
#if ! defined(__GPU_MPI)
        COMPLEX(DP), ALLOCATABLE :: mydata_h(:)
        COMPLEX(DP), ALLOCATABLE :: alldata_h(:)
        !
        ALLOCATE(mydata_h, source=mydata_d)      ! This syncs __MPI
        ALLOCATE(alldata_h, source=alldata_d)
        CALL mp_gatherv_cv( mydata_h, alldata_h, recvcount, displs, root, gid)
        alldata_d = alldata_h ; mydata_d = mydata_h
        DEALLOCATE(alldata_h, mydata_h)
#else
        group = gid
        CALL mpi_comm_size( group, npe, ierr )
        IF (ierr/=0) CALL mp_stop( 9066 )
        CALL mpi_comm_rank( group, myid, ierr )
        IF (ierr/=0) CALL mp_stop( 9067 )
        !
        IF ( SIZE( recvcount ) < npe .OR. SIZE( displs ) < npe ) CALL mp_stop( 9068 )
        IF ( myid == root ) THEN
          IF ( SIZE( alldata_d ) < displs( npe ) + recvcount( npe ) ) CALL mp_stop( 9069 )
        END IF
        IF ( SIZE( mydata_d ) < recvcount( myid + 1 ) ) CALL mp_stop( 9070 )
        !
        ierr = cudaDeviceSynchronize()   ! This syncs __GPU_MPI
        CALL MPI_GATHERV( mydata_d, recvcount( myid + 1 ), MPI_DOUBLE_COMPLEX, &
                          alldata_d, recvcount, displs, MPI_DOUBLE_COMPLEX, root, group, ierr )
        IF (ierr/=0) CALL mp_stop( 9071 )
        RETURN ! Sync not needed after MPI call
#endif
#else
        IF ( SIZE( alldata_d ) < recvcount( 1 ) ) CALL mp_stop( 9072 )
        IF ( SIZE( mydata_d ) < recvcount( 1 ) ) CALL mp_stop( 9073 )
        !
        !alldata_d( 1:recvcount( 1 ) ) = mydata_d( 1:recvcount( 1 ) )
        ierr = cudaMemcpy( alldata_d(1), mydata_d(1), recvcount( 1 ), cudaMemcpyDeviceToDevice )
        IF (ierr/=0) CALL mp_stop( 9135 )
#endif
        ierr = cudaDeviceSynchronize()   ! This syncs SERIAL, __MPI
      END SUBROUTINE mp_gatherv_cv_gpu
!
!------------------------------------------------------------------------------!
!..mp_gatherv_iv
!
      SUBROUTINE mp_gatherv_iv_gpu( mydata_d, alldata_d, recvcount, displs, root, gid)
        USE parallel_include
        IMPLICIT NONE
        INTEGER, DEVICE :: mydata_d(:)
        INTEGER, DEVICE :: alldata_d(:)
        INTEGER, INTENT(IN) :: recvcount(:), displs(:), root
        INTEGER, INTENT(IN) :: gid
        INTEGER :: group
        INTEGER :: ierr, npe, myid
        !
#if defined (__MPI)
#if ! defined(__GPU_MPI)
        INTEGER, ALLOCATABLE :: mydata_h(:)
        INTEGER, ALLOCATABLE :: alldata_h(:)
        !
        ALLOCATE(mydata_h, source=mydata_d)      ! This syncs __MPI
        ALLOCATE(alldata_h, source=alldata_d)
        CALL mp_gatherv_iv( mydata_h, alldata_h, recvcount, displs, root, gid)
        alldata_d = alldata_h ; mydata_d = mydata_h
        DEALLOCATE(alldata_h, mydata_h)
#else
        group = gid
        CALL mpi_comm_size( group, npe, ierr )
        IF (ierr/=0) CALL mp_stop( 9074 )
        CALL mpi_comm_rank( group, myid, ierr )
        IF (ierr/=0) CALL mp_stop( 9075 )
        !
        IF ( SIZE( recvcount ) < npe .OR. SIZE( displs ) < npe ) CALL mp_stop( 9076 )
        IF ( myid == root ) THEN
          IF ( SIZE( alldata_d ) < displs( npe ) + recvcount( npe ) ) CALL mp_stop( 9077 )
        END IF
        IF ( SIZE( mydata_d ) < recvcount( myid + 1 ) ) CALL mp_stop( 9078 )
        !
        ierr = cudaDeviceSynchronize()   ! This syncs __GPU_MPI
        CALL MPI_GATHERV( mydata_d, recvcount( myid + 1 ), MPI_INTEGER, &
                          alldata_d, recvcount, displs, MPI_INTEGER, root, group, ierr )
        IF (ierr/=0) CALL mp_stop( 9079 )
        RETURN ! Sync not needed after MPI call
#endif
#else
        IF ( SIZE( alldata_d ) < recvcount( 1 ) ) CALL mp_stop( 9080 )
        IF ( SIZE( mydata_d ) < recvcount( 1 ) ) CALL mp_stop( 9081 )
        !
        !alldata_d( 1:recvcount( 1 ) ) = mydata_d( 1:recvcount( 1 ) )
        ierr = cudaMemcpy( alldata_d(1), mydata_d(1), recvcount( 1 ), cudaMemcpyDeviceToDevice )
        IF (ierr/=0) CALL mp_stop( 9136 )
#endif
        ierr = cudaDeviceSynchronize()   ! This syncs SERIAL, __MPI
      END SUBROUTINE mp_gatherv_iv_gpu
!
!------------------------------------------------------------------------------!
!..mp_gatherv_rm
!
      SUBROUTINE mp_gatherv_rm_gpu( mydata_d, alldata_d, recvcount, displs, root, gid)
        USE parallel_include
        IMPLICIT NONE
        REAL(DP), DEVICE :: mydata_d(:,:)  ! Warning: the first dimension is assumed to be constant!
        REAL(DP), DEVICE :: alldata_d(:,:)
        INTEGER, INTENT(IN) :: recvcount(:), displs(:), root
        INTEGER, INTENT(IN) :: gid
        INTEGER :: group
        INTEGER :: ierr, npe, myid
        INTEGER, ALLOCATABLE :: nrecv(:), ndisp(:)
        !
#if defined (__MPI)
#if ! defined(__GPU_MPI)
        REAL(DP), ALLOCATABLE :: mydata_h(:,:)
        REAL(DP), ALLOCATABLE :: alldata_h(:,:)
        !
        ALLOCATE(mydata_h, source=mydata_d)      ! This syncs __MPI
        ALLOCATE(alldata_h, source=alldata_d)
        CALL mp_gatherv_rm( mydata_h, alldata_h, recvcount, displs, root, gid)
        alldata_d = alldata_h ; mydata_d = mydata_h
        DEALLOCATE(alldata_h, mydata_h)
#else
        group = gid
        CALL mpi_comm_size( group, npe, ierr )
        IF (ierr/=0) CALL mp_stop( 9082 )
        CALL mpi_comm_rank( group, myid, ierr )
        IF (ierr/=0) CALL mp_stop( 9083 )
        !
        IF ( SIZE( recvcount ) < npe .OR. SIZE( displs ) < npe ) CALL mp_stop( 9084 )
        IF ( myid == root ) THEN
          IF ( SIZE( alldata_d, 2 ) < displs( npe ) + recvcount( npe ) ) CALL mp_stop( 9085 )
          IF ( SIZE( alldata_d, 1 ) /= SIZE( mydata_d, 1 ) ) CALL mp_stop( 9086 )
        END IF
        IF ( SIZE( mydata_d, 2 ) < recvcount( myid + 1 ) ) CALL mp_stop( 9087 )
        !
        ALLOCATE( nrecv( npe ), ndisp( npe ) )
        !
        nrecv( 1:npe ) = recvcount( 1:npe ) * SIZE( mydata_d, 1 )
        ndisp( 1:npe ) = displs( 1:npe ) * SIZE( mydata_d, 1 )
        !
        ierr = cudaDeviceSynchronize()   ! This syncs __GPU_MPI
        CALL MPI_GATHERV( mydata_d, nrecv( myid + 1 ), MPI_DOUBLE_PRECISION, &
                          alldata_d, nrecv, ndisp, MPI_DOUBLE_PRECISION, root, group, ierr )
        IF (ierr/=0) CALL mp_stop( 9088 )
        !
        DEALLOCATE( nrecv, ndisp )
        !
        RETURN ! Sync not needed after MPI call
#endif
#else
        IF ( SIZE( alldata_d, 1 ) /= SIZE( mydata_d, 1 ) ) CALL mp_stop( 9089 )
        IF ( SIZE( alldata_d, 2 ) < recvcount( 1 ) ) CALL mp_stop( 9090 )
        IF ( SIZE( mydata_d, 2 ) < recvcount( 1 ) ) CALL mp_stop( 9091 )
        !
        !alldata_d( :, 1:recvcount( 1 ) ) = mydata_d( :, 1:recvcount( 1 ) )
        ierr = cudaMemcpy2D( alldata_d, SIZE(alldata_d,1), &
                             mydata_d,  SIZE(mydata_d,1),  &
                             SIZE(mydata_d,1), recvcount( 1 ), &
                             cudaMemcpyDeviceToDevice )
        IF (ierr/=0) CALL mp_stop( 9092 )
#endif
        ierr = cudaDeviceSynchronize()   ! This syncs SERIAL, __MPI
      END SUBROUTINE mp_gatherv_rm_gpu
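!
! NOTE: for the matrix variants the recvcount and displs arguments count
! *columns*; nrecv and ndisp above rescale them by the (constant) first
! dimension to obtain the element counts MPI_GATHERV expects.
!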
!
!------------------------------------------------------------------------------!
!..mp_gatherv_im
!
      SUBROUTINE mp_gatherv_im_gpu( mydata_d, alldata_d, recvcount, displs, root, gid)
        USE parallel_include
        IMPLICIT NONE
        INTEGER, DEVICE :: mydata_d(:,:)  ! Warning: the first dimension is assumed to be constant!
        INTEGER, DEVICE :: alldata_d(:,:)
        INTEGER, INTENT(IN) :: recvcount(:), displs(:), root
        INTEGER, INTENT(IN) :: gid
        INTEGER :: group
        INTEGER :: ierr, npe, myid
        INTEGER, ALLOCATABLE :: nrecv(:), ndisp(:)
        !
#if defined (__MPI)
#if ! defined(__GPU_MPI)
        INTEGER, ALLOCATABLE :: mydata_h(:,:)
        INTEGER, ALLOCATABLE :: alldata_h(:,:)
        !
        ALLOCATE(mydata_h, source=mydata_d)      ! This syncs __MPI
        ALLOCATE(alldata_h, source=alldata_d)
        CALL mp_gatherv_im( mydata_h, alldata_h, recvcount, displs, root, gid)
        alldata_d = alldata_h ; mydata_d = mydata_h
        DEALLOCATE(alldata_h, mydata_h)
#else
        group = gid
        CALL mpi_comm_size( group, npe, ierr )
        IF (ierr/=0) CALL mp_stop( 9093 )
        CALL mpi_comm_rank( group, myid, ierr )
        IF (ierr/=0) CALL mp_stop( 9094 )
        !
        IF ( SIZE( recvcount ) < npe .OR. SIZE( displs ) < npe ) CALL mp_stop( 9095 )
        IF ( myid == root ) THEN
          IF ( SIZE( alldata_d, 2 ) < displs( npe ) + recvcount( npe ) ) CALL mp_stop( 9096 )
          IF ( SIZE( alldata_d, 1 ) /= SIZE( mydata_d, 1 ) ) CALL mp_stop( 9097 )
        END IF
        IF ( SIZE( mydata_d, 2 ) < recvcount( myid + 1 ) ) CALL mp_stop( 9098 )
        !
        ALLOCATE( nrecv( npe ), ndisp( npe ) )
        !
        nrecv( 1:npe ) = recvcount( 1:npe ) * SIZE( mydata_d, 1 )
        ndisp( 1:npe ) = displs( 1:npe ) * SIZE( mydata_d, 1 )
        !
        ierr = cudaDeviceSynchronize()   ! This syncs __GPU_MPI
        CALL MPI_GATHERV( mydata_d, nrecv( myid + 1 ), MPI_INTEGER, &
                          alldata_d, nrecv, ndisp, MPI_INTEGER, root, group, ierr )
        IF (ierr/=0) CALL mp_stop( 9099 )
        !
        DEALLOCATE( nrecv, ndisp )
        !
        RETURN ! Sync not needed after MPI call
#endif
#else
        IF ( SIZE( alldata_d, 1 ) /= SIZE( mydata_d, 1 ) ) CALL mp_stop( 9100 )
        IF ( SIZE( alldata_d, 2 ) < recvcount( 1 ) ) CALL mp_stop( 9101 )
        IF ( SIZE( mydata_d, 2 ) < recvcount( 1 ) ) CALL mp_stop( 9102 )
        !
        !alldata_d( :, 1:recvcount( 1 ) ) = mydata_d( :, 1:recvcount( 1 ) )
        ierr = cudaMemcpy2D( alldata_d, SIZE(alldata_d,1), &
                             mydata_d,  SIZE(mydata_d,1),  &
                             SIZE(mydata_d,1), recvcount( 1 ), &
                             cudaMemcpyDeviceToDevice )
        IF (ierr/=0) CALL mp_stop( 9103 )
#endif
        ierr = cudaDeviceSynchronize()   ! This syncs SERIAL, __MPI
      END SUBROUTINE mp_gatherv_im_gpu
!
!------------------------------------------------------------------------------!
!
      SUBROUTINE mp_alltoall_c3d_gpu( sndbuf_d, rcvbuf_d, gid )
        USE parallel_include
        IMPLICIT NONE
        COMPLEX(DP), DEVICE :: sndbuf_d( :, :, : )
        COMPLEX(DP), DEVICE :: rcvbuf_d( :, :, : )
        INTEGER, INTENT(IN) :: gid
        INTEGER :: nsiz, group, ierr, npe
        !
#if defined (__MPI)
#if ! defined(__GPU_MPI)
        COMPLEX(DP), ALLOCATABLE :: sndbuf_h(:,:,:)
        COMPLEX(DP), ALLOCATABLE :: rcvbuf_h(:,:,:)
        !
        ALLOCATE(sndbuf_h, source=sndbuf_d)      ! This syncs __MPI
        ALLOCATE(rcvbuf_h, source=rcvbuf_d)
        CALL mp_alltoall_c3d( sndbuf_h, rcvbuf_h, gid )
        sndbuf_d = sndbuf_h ; rcvbuf_d = rcvbuf_h
        DEALLOCATE(sndbuf_h, rcvbuf_h)
#else
        group = gid
        !
        CALL mpi_comm_size( group, npe, ierr )
        IF (ierr/=0) CALL mp_stop( 9104 )
        !
        IF ( SIZE( sndbuf_d, 3 ) < npe ) CALL mp_stop( 9105 )
        IF ( SIZE( rcvbuf_d, 3 ) < npe ) CALL mp_stop( 9106 )
        !
        nsiz = SIZE( sndbuf_d, 1 ) * SIZE( sndbuf_d, 2 )
        !
        ierr = cudaDeviceSynchronize()   ! This syncs __GPU_MPI
        !
        CALL MPI_ALLTOALL( sndbuf_d, nsiz, MPI_DOUBLE_COMPLEX, &
                           rcvbuf_d, nsiz, MPI_DOUBLE_COMPLEX, group, ierr )
        IF (ierr/=0) CALL mp_stop( 9107 )
        RETURN ! Sync not needed after MPI call
#endif
#else
        rcvbuf_d = sndbuf_d
#endif
        ierr = cudaDeviceSynchronize()   ! This syncs SERIAL, __MPI
      END SUBROUTINE mp_alltoall_c3d_gpu
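!
! Illustrative all-to-all exchange (a sketch; names are hypothetical): each
! rank sends slice sndbuf_d(:,:,ip) to rank ip-1 and receives slice
! rcvbuf_d(:,:,ip) from rank ip-1, with nsiz = n1*n2 elements per slice:
!
!   COMPLEX(DP), DEVICE :: sndbuf_d(n1,n2,npe), rcvbuf_d(n1,n2,npe)
!   CALL mp_alltoall( sndbuf_d, rcvbuf_d, comm )
!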
!
!------------------------------------------------------------------------------!
!
      SUBROUTINE mp_alltoall_i3d_gpu( sndbuf_d, rcvbuf_d, gid )
        USE parallel_include
        IMPLICIT NONE
        INTEGER, DEVICE :: sndbuf_d( :, :, : )
        INTEGER, DEVICE :: rcvbuf_d( :, :, : )
        INTEGER, INTENT(IN) :: gid
        INTEGER :: nsiz, group, ierr, npe
        !
#if defined (__MPI)
#if ! defined(__GPU_MPI)
        INTEGER, ALLOCATABLE :: sndbuf_h(:,:,:)
        INTEGER, ALLOCATABLE :: rcvbuf_h(:,:,:)
        !
        ALLOCATE(sndbuf_h, source=sndbuf_d)      ! This syncs __MPI
        ALLOCATE(rcvbuf_h, source=rcvbuf_d)
        CALL mp_alltoall_i3d( sndbuf_h, rcvbuf_h, gid )
        sndbuf_d = sndbuf_h ; rcvbuf_d = rcvbuf_h
        DEALLOCATE(sndbuf_h, rcvbuf_h)
#else
        group = gid
        !
        CALL mpi_comm_size( group, npe, ierr )
        IF (ierr/=0) CALL mp_stop( 9108 )
        !
        IF ( SIZE( sndbuf_d, 3 ) < npe ) CALL mp_stop( 9109 )
        IF ( SIZE( rcvbuf_d, 3 ) < npe ) CALL mp_stop( 9110 )
        !
        nsiz = SIZE( sndbuf_d, 1 ) * SIZE( sndbuf_d, 2 )
        !
        ierr = cudaDeviceSynchronize()   ! This syncs __GPU_MPI
        !
        CALL MPI_ALLTOALL( sndbuf_d, nsiz, MPI_INTEGER, &
                           rcvbuf_d, nsiz, MPI_INTEGER, group, ierr )
        IF (ierr/=0) CALL mp_stop( 9111 )
        RETURN ! Sync not needed after MPI call
#endif
#else
        rcvbuf_d = sndbuf_d
#endif
        ierr = cudaDeviceSynchronize()   ! This syncs SERIAL, __MPI
      END SUBROUTINE mp_alltoall_i3d_gpu
!
!------------------------------------------------------------------------------!
!
      SUBROUTINE mp_circular_shift_left_i0_gpu( buf_d, itag, gid )
        USE parallel_include
        IMPLICIT NONE
        INTEGER, DEVICE :: buf_d
        INTEGER, INTENT(IN) :: itag
        INTEGER, INTENT(IN) :: gid
        INTEGER :: group, ierr, npe, sour, dest, mype
        !
#if defined (__MPI)
#if ! defined(__GPU_MPI)
        INTEGER :: buf_h
        buf_h = buf_d                    ! This syncs __MPI
        CALL mp_circular_shift_left_i0( buf_h, itag, gid )
        buf_d = buf_h
#else
        INTEGER :: istatus( mpi_status_size )
        !
        group = gid
        !
        CALL mpi_comm_size( group, npe, ierr )
        IF (ierr/=0) CALL mp_stop( 9112 )
        CALL mpi_comm_rank( group, mype, ierr )
        IF (ierr/=0) CALL mp_stop( 9113 )
        !
        sour = mype + 1
        IF( sour == npe ) sour = 0
        dest = mype - 1
        IF( dest == -1 ) dest = npe - 1
        !
        ierr = cudaDeviceSynchronize()   ! This syncs __GPU_MPI
        CALL MPI_Sendrecv_replace( buf_d, 1, MPI_INTEGER, &
                                   dest, itag, sour, itag, group, istatus, ierr)
        !
        IF (ierr/=0) CALL mp_stop( 9114 )
        !
        RETURN ! Sync not needed after MPI call
#endif
#else
        ! do nothing
#endif
        ierr = cudaDeviceSynchronize()   ! This syncs SERIAL, __MPI
      END SUBROUTINE mp_circular_shift_left_i0_gpu
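!
! NOTE: the circular shifts form a ring: every rank sends its buffer to
! mype-1 (wrapping to npe-1) and receives from mype+1 (wrapping to 0), so
! after npe shifts each buffer is back where it started.
!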
!
!------------------------------------------------------------------------------!
!
      SUBROUTINE mp_circular_shift_left_i1_gpu( buf_d, itag, gid )
        USE parallel_include
        IMPLICIT NONE
        INTEGER, DEVICE :: buf_d(:)
        INTEGER, INTENT(IN) :: itag
        INTEGER, INTENT(IN) :: gid
        INTEGER :: group, ierr, npe, sour, dest, mype
        !
#if defined (__MPI)
#if ! defined(__GPU_MPI)
        INTEGER, ALLOCATABLE :: buf_h(:)
        ALLOCATE(buf_h, source=buf_d)    ! This syncs __MPI
        CALL mp_circular_shift_left_i1( buf_h, itag, gid )
        buf_d = buf_h; DEALLOCATE(buf_h)
#else
        INTEGER :: istatus( mpi_status_size )
        !
        group = gid
        !
        CALL mpi_comm_size( group, npe, ierr )
        IF (ierr/=0) CALL mp_stop( 9115 )
        CALL mpi_comm_rank( group, mype, ierr )
        IF (ierr/=0) CALL mp_stop( 9116 )
        !
        sour = mype + 1
        IF( sour == npe ) sour = 0
        dest = mype - 1
        IF( dest == -1 ) dest = npe - 1
        !
        ierr = cudaDeviceSynchronize()   ! This syncs __GPU_MPI
        CALL MPI_Sendrecv_replace( buf_d, SIZE(buf_d), MPI_INTEGER, &
                                   dest, itag, sour, itag, group, istatus, ierr)
        !
        IF (ierr/=0) CALL mp_stop( 9117 )
        !
        RETURN ! Sync not needed after MPI call
#endif
#else
        ! do nothing
#endif
        ierr = cudaDeviceSynchronize()   ! This syncs SERIAL, __MPI
      END SUBROUTINE mp_circular_shift_left_i1_gpu
!
!------------------------------------------------------------------------------!
!
      SUBROUTINE mp_circular_shift_left_i2_gpu( buf_d, itag, gid )
        USE parallel_include
        IMPLICIT NONE
        INTEGER, DEVICE :: buf_d(:,:)
        INTEGER, INTENT(IN) :: itag
        INTEGER, INTENT(IN) :: gid
        INTEGER :: group, ierr, npe, sour, dest, mype
        !
#if defined (__MPI)
#if ! defined(__GPU_MPI)
        INTEGER, ALLOCATABLE :: buf_h(:,:)
        ALLOCATE(buf_h, source=buf_d)    ! This syncs __MPI
        CALL mp_circular_shift_left_i2( buf_h, itag, gid )
        buf_d = buf_h; DEALLOCATE(buf_h)
#else
        INTEGER :: istatus( mpi_status_size )
        !
        group = gid
        !
        CALL mpi_comm_size( group, npe, ierr )
        IF (ierr/=0) CALL mp_stop( 9118 )
        CALL mpi_comm_rank( group, mype, ierr )
        IF (ierr/=0) CALL mp_stop( 9119 )
        !
        sour = mype + 1
        IF( sour == npe ) sour = 0
        dest = mype - 1
        IF( dest == -1 ) dest = npe - 1
        !
        ierr = cudaDeviceSynchronize()   ! This syncs __GPU_MPI
        CALL MPI_Sendrecv_replace( buf_d, SIZE(buf_d), MPI_INTEGER, &
                                   dest, itag, sour, itag, group, istatus, ierr)
        !
        IF (ierr/=0) CALL mp_stop( 9120 )
        !
        RETURN ! Sync not needed after MPI call
#endif
#else
        ! do nothing
#endif
        ierr = cudaDeviceSynchronize()   ! This syncs SERIAL, __MPI
      END SUBROUTINE mp_circular_shift_left_i2_gpu
!
!------------------------------------------------------------------------------!
!
      SUBROUTINE mp_circular_shift_left_r2d_gpu( buf_d, itag, gid )
        USE parallel_include
        IMPLICIT NONE
        REAL(DP), DEVICE :: buf_d( :, : )
        INTEGER, INTENT(IN) :: itag
        INTEGER, INTENT(IN) :: gid
        INTEGER :: group, ierr, npe, sour, dest, mype
        !
#if defined (__MPI)
#if ! defined(__GPU_MPI)
        REAL(DP), ALLOCATABLE :: buf_h(:, :)
        ALLOCATE(buf_h, source=buf_d)    ! This syncs __MPI
        CALL mp_circular_shift_left_r2d( buf_h, itag, gid )
        buf_d = buf_h; DEALLOCATE(buf_h)
#else
        INTEGER :: istatus( mpi_status_size )
        !
        group = gid
        !
        CALL mpi_comm_size( group, npe, ierr )
        IF (ierr/=0) CALL mp_stop( 9121 )
        CALL mpi_comm_rank( group, mype, ierr )
        IF (ierr/=0) CALL mp_stop( 9122 )
        !
        sour = mype + 1
        IF( sour == npe ) sour = 0
        dest = mype - 1
        IF( dest == -1 ) dest = npe - 1
        !
        ierr = cudaDeviceSynchronize()   ! This syncs __GPU_MPI
        CALL MPI_Sendrecv_replace( buf_d, SIZE(buf_d), MPI_DOUBLE_PRECISION, &
                                   dest, itag, sour, itag, group, istatus, ierr)
        !
        IF (ierr/=0) CALL mp_stop( 9123 )
        !
        RETURN ! Sync not needed after MPI call
#endif
#else
        ! do nothing
#endif
        ierr = cudaDeviceSynchronize()   ! This syncs SERIAL, __MPI
      END SUBROUTINE mp_circular_shift_left_r2d_gpu
!
!------------------------------------------------------------------------------!
!
      SUBROUTINE mp_circular_shift_left_c2d_gpu( buf_d, itag, gid )
        USE parallel_include
        IMPLICIT NONE
        COMPLEX(DP), DEVICE :: buf_d( :, : )
        INTEGER, INTENT(IN) :: itag
        INTEGER, INTENT(IN) :: gid
        INTEGER :: group, ierr, npe, sour, dest, mype
        !
#if defined (__MPI)
#if ! defined(__GPU_MPI)
        COMPLEX(DP), ALLOCATABLE :: buf_h(:, :)
        ALLOCATE(buf_h, source=buf_d)    ! This syncs __MPI
        CALL mp_circular_shift_left_c2d( buf_h, itag, gid )
        buf_d = buf_h; DEALLOCATE(buf_h)
#else
        INTEGER :: istatus( mpi_status_size )
        !
        group = gid
        !
        CALL mpi_comm_size( group, npe, ierr )
        IF (ierr/=0) CALL mp_stop( 9124 )
        CALL mpi_comm_rank( group, mype, ierr )
        IF (ierr/=0) CALL mp_stop( 9125 )
        !
        sour = mype + 1
        IF( sour == npe ) sour = 0
        dest = mype - 1
        IF( dest == -1 ) dest = npe - 1
        !
        ierr = cudaDeviceSynchronize()   ! This syncs __GPU_MPI
        CALL MPI_Sendrecv_replace( buf_d, SIZE(buf_d), MPI_DOUBLE_COMPLEX, &
                                   dest, itag, sour, itag, group, istatus, ierr)
        !
        IF (ierr/=0) CALL mp_stop( 9126 )
        !
        RETURN ! Sync not needed after MPI call
#endif
#else
        ! do nothing
#endif
        ierr = cudaDeviceSynchronize()   ! This syncs SERIAL, __MPI
      END SUBROUTINE mp_circular_shift_left_c2d_gpu
!
!------------------------------------------------------------------------------!
!..mp_gatherv_inplace_cplx_array
!
      SUBROUTINE mp_gatherv_inplace_cplx_array_gpu(alldata_d, my_column_type, recvcount, displs, root, gid)
        USE parallel_include
        IMPLICIT NONE
        COMPLEX(DP), DEVICE :: alldata_d(:,:)
        INTEGER, INTENT(IN) :: my_column_type
        INTEGER, INTENT(IN) :: recvcount(:), displs(:)
        INTEGER, INTENT(IN) :: root, gid
        INTEGER :: ierr, npe, myid
        !
#if defined (__MPI)
#if ! defined(__GPU_MPI)
        COMPLEX(DP), ALLOCATABLE :: alldata_h(:, :)
        !
        ! Avoid unnecessary communications on __MPI
        IF ( mp_size(gid) == 1 ) THEN
          ierr = cudaDeviceSynchronize()
          RETURN
        END IF
        !
        ALLOCATE(alldata_h, source=alldata_d)    ! This syncs __MPI
        CALL mp_gatherv_inplace_cplx_array(alldata_h, my_column_type, recvcount, displs, root, gid)
        alldata_d = alldata_h; DEALLOCATE(alldata_h)
        ierr = cudaDeviceSynchronize()   ! This syncs in case of small data chunks
        RETURN
#else
        CALL mpi_comm_size( gid, npe, ierr )
        IF (ierr/=0) CALL mp_stop( 9127 )
        CALL mpi_comm_rank( gid, myid, ierr )
        IF (ierr/=0) CALL mp_stop( 9128 )
        !
        IF ( SIZE( recvcount ) < npe .OR. SIZE( displs ) < npe ) CALL mp_stop( 9129 )
        !
        ierr = cudaDeviceSynchronize()   ! This syncs __GPU_MPI
        IF (myid==root) THEN
          CALL MPI_GATHERV( MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, &
                            alldata_d, recvcount, displs, my_column_type, root, gid, ierr )
        ELSE
          CALL MPI_GATHERV( alldata_d(1,displs(myid+1)+1), recvcount(myid+1), my_column_type, &
                            MPI_IN_PLACE, recvcount, displs, MPI_DATATYPE_NULL, root, gid, ierr )
        ENDIF
        !
        IF (ierr/=0) CALL mp_stop( 9130 )
        !
        RETURN ! Sync not needed after MPI call
#endif
#endif
        ierr = cudaDeviceSynchronize()   ! This syncs SERIAL
      END SUBROUTINE mp_gatherv_inplace_cplx_array_gpu
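!
! NOTE on the __GPU_MPI branch above: the gather is done in place. The root
! passes MPI_IN_PLACE as its send buffer (its own block already sits at the
! right offset of alldata_d), while every other rank sends the column block
! starting at displs(myid+1)+1 and its receive arguments are ignored. The
! allgatherv variants below use the same MPI_IN_PLACE idiom.
!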
!
!------------------------------------------------------------------------------!
!..mp_allgatherv_inplace_cplx_array
!
      SUBROUTINE mp_allgatherv_inplace_cplx_array_gpu(alldata_d, my_element_type, recvcount, displs, gid)
        USE parallel_include
        IMPLICIT NONE
        COMPLEX(DP), DEVICE :: alldata_d(:,:)
        INTEGER, INTENT(IN) :: my_element_type
        INTEGER, INTENT(IN) :: recvcount(:), displs(:)
        INTEGER, INTENT(IN) :: gid
        INTEGER :: ierr, npe, myid
        !
#if defined (__MPI)
#if ! defined(__GPU_MPI)
        COMPLEX(DP), ALLOCATABLE :: alldata_h(:, :)
        !
        ! Avoid unnecessary communications on __MPI
        IF ( mp_size(gid) == 1 ) THEN
          ierr = cudaDeviceSynchronize()
          RETURN
        END IF
        !
        ALLOCATE(alldata_h, source=alldata_d)    ! This syncs __MPI
        CALL mp_allgatherv_inplace_cplx_array(alldata_h, my_element_type, recvcount, displs, gid)
        alldata_d = alldata_h; DEALLOCATE(alldata_h)
        ierr = cudaDeviceSynchronize()   ! This syncs in case of small data chunks
        RETURN
#else
        CALL mpi_comm_size( gid, npe, ierr )
        IF (ierr/=0) CALL mp_stop( 9131 )
        CALL mpi_comm_rank( gid, myid, ierr )
        IF (ierr/=0) CALL mp_stop( 9132 )
        !
        IF ( SIZE( recvcount ) < npe .OR. SIZE( displs ) < npe ) CALL mp_stop( 9133 )
        !
        ierr = cudaDeviceSynchronize()   ! This syncs __GPU_MPI
        CALL MPI_ALLGATHERV( MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, &
                             alldata_d, recvcount, displs, my_element_type, gid, ierr )
        IF (ierr/=0) CALL mp_stop( 9134 )
        RETURN ! Sync not needed after MPI call
#endif
#endif
        ierr = cudaDeviceSynchronize()   ! This syncs SERIAL, __MPI
      END SUBROUTINE mp_allgatherv_inplace_cplx_array_gpu
!
!------------------------------------------------------------------------------!
!..mp_allgatherv_inplace_real_array
!
      SUBROUTINE mp_allgatherv_inplace_real_array_gpu(alldata_d, my_element_type, recvcount, displs, gid)
        USE parallel_include
        IMPLICIT NONE
        REAL(DP), DEVICE :: alldata_d(:,:)
        INTEGER, INTENT(IN) :: my_element_type
        INTEGER, INTENT(IN) :: recvcount(:), displs(:)
        INTEGER, INTENT(IN) :: gid
        INTEGER :: ierr, npe, myid
        !
#if defined (__MPI)
#if ! defined(__GPU_MPI)
        REAL(DP), ALLOCATABLE :: alldata_h(:, :)
        !
        ! Avoid unnecessary communications on __MPI
        IF ( mp_size(gid) == 1 ) THEN
          ierr = cudaDeviceSynchronize()
          RETURN
        END IF
        !
        ALLOCATE(alldata_h, source=alldata_d)    ! This syncs __MPI
        CALL mp_allgatherv_inplace_real_array(alldata_h, my_element_type, recvcount, displs, gid)
        alldata_d = alldata_h; DEALLOCATE(alldata_h)
        ierr = cudaDeviceSynchronize()   ! This syncs in case of small data chunks
        RETURN
#else
        CALL mpi_comm_size( gid, npe, ierr )
        IF (ierr/=0) CALL mp_stop( 9137 )
        CALL mpi_comm_rank( gid, myid, ierr )
        IF (ierr/=0) CALL mp_stop( 9138 )
        !
        IF ( SIZE( recvcount ) < npe .OR. SIZE( displs ) < npe ) CALL mp_stop( 9139 )
        !
        ierr = cudaDeviceSynchronize()   ! This syncs __GPU_MPI
        CALL MPI_ALLGATHERV( MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, &
                             alldata_d, recvcount, displs, my_element_type, gid, ierr )
        IF (ierr/=0) CALL mp_stop( 9140 )
        RETURN ! Sync not needed after MPI call
#endif
#endif
        ierr = cudaDeviceSynchronize()   ! This syncs SERIAL, __MPI
      END SUBROUTINE mp_allgatherv_inplace_real_array_gpu
!
      SUBROUTINE mp_type_create_cplx_column_section_gpu(dummy, start, length, stride, mytype)
        USE parallel_include
        IMPLICIT NONE
        !
        COMPLEX (DP), DEVICE, INTENT(IN) :: dummy
        INTEGER, INTENT(IN) :: start, length, stride
        INTEGER, INTENT(OUT) :: mytype
        !
#if defined(__MPI)
        INTEGER :: ierr
        !
        CALL MPI_TYPE_CREATE_SUBARRAY(1, [stride], [length], [start], &
                                      MPI_ORDER_FORTRAN, MPI_DOUBLE_COMPLEX, mytype, ierr)
        IF (ierr/=0) CALL mp_stop( 8081 )
        CALL MPI_Type_commit(mytype, ierr)
        IF (ierr/=0) CALL mp_stop( 8082 )
#else
        mytype = 0
#endif
        !
        RETURN
      END SUBROUTINE mp_type_create_cplx_column_section_gpu
!
      SUBROUTINE mp_type_create_real_column_section_gpu(dummy, start, length, stride, mytype)
        USE parallel_include
        IMPLICIT NONE
        !
        REAL (DP), DEVICE, INTENT(IN) :: dummy
        INTEGER, INTENT(IN) :: start, length, stride
        INTEGER, INTENT(OUT) :: mytype
        !
#if defined(__MPI)
        INTEGER :: ierr
        !
        CALL MPI_TYPE_CREATE_SUBARRAY(1, [stride], [length], [start], &
                                      MPI_ORDER_FORTRAN, MPI_DOUBLE_PRECISION, mytype, ierr)
        IF (ierr/=0) CALL mp_stop( 8083 )
        CALL MPI_Type_commit(mytype, ierr)
        IF (ierr/=0) CALL mp_stop( 8084 )
#else
        mytype = 0
#endif
        !
        RETURN
      END SUBROUTINE mp_type_create_real_column_section_gpu
!
      SUBROUTINE mp_type_create_cplx_row_section_gpu(dummy, column_start, column_stride, row_length, mytype)
        USE parallel_include
        IMPLICIT NONE
        !
        COMPLEX (DP), DEVICE, INTENT(IN) :: dummy
        INTEGER, INTENT(IN) :: column_start, column_stride, row_length
        INTEGER, INTENT(OUT) :: mytype
        !
#if defined(__MPI)
        INTEGER :: ierr, temporary
        INTEGER :: strides(2), lengths(2), starts(2)
        INTEGER(KIND=MPI_ADDRESS_KIND) :: lb, extent
        !
        strides(1) = column_stride ; strides(2) = row_length
        lengths(1) = 1             ; lengths(2) = row_length
        starts(1)  = column_start  ; starts(2)  = 0
        CALL MPI_TYPE_CREATE_SUBARRAY(2, strides, lengths, starts, MPI_ORDER_FORTRAN, &
                                      MPI_DOUBLE_COMPLEX, temporary, ierr)
        IF (ierr/=0) CALL mp_stop( 8085 )
        CALL MPI_TYPE_GET_EXTENT(MPI_DOUBLE_COMPLEX, lb, extent, ierr)
        IF (ierr/=0) CALL mp_stop( 8085 )
        CALL MPI_TYPE_COMMIT(temporary, ierr)
        IF (ierr/=0) CALL mp_stop( 8085 )
        CALL MPI_TYPE_CREATE_RESIZED(temporary, lb, extent, mytype, ierr)
        IF (ierr/=0) CALL mp_stop( 8086 )
        CALL MPI_Type_commit(mytype, ierr)
        IF (ierr/=0) CALL mp_stop( 8086 )
#else
        mytype = 0
#endif
        !
        RETURN
      END SUBROUTINE mp_type_create_cplx_row_section_gpu
!
      SUBROUTINE mp_type_create_real_row_section_gpu(dummy, column_start, column_stride, row_length, mytype)
        USE parallel_include
        IMPLICIT NONE
        !
        REAL (DP), DEVICE, INTENT(IN) :: dummy
        INTEGER, INTENT(IN) :: column_start, column_stride, row_length
        INTEGER, INTENT(OUT) :: mytype
        !
#if defined(__MPI)
        INTEGER :: ierr, temporary
        INTEGER :: strides(2), lengths(2), starts(2)
        INTEGER(KIND=MPI_ADDRESS_KIND) :: lb, extent
        !
        strides(1) = column_stride ; strides(2) = row_length
        lengths(1) = 1             ; lengths(2) = row_length
        starts(1)  = column_start  ; starts(2)  = 0
        CALL MPI_TYPE_CREATE_SUBARRAY(2, strides, lengths, starts, MPI_ORDER_FORTRAN, &
                                      MPI_DOUBLE_PRECISION, temporary, ierr)
        IF (ierr/=0) CALL mp_stop( 8087 )
        CALL MPI_TYPE_GET_EXTENT(MPI_DOUBLE_PRECISION, lb, extent, ierr)
        IF (ierr/=0) CALL mp_stop( 8087 )
        CALL MPI_TYPE_COMMIT(temporary, ierr)
        IF (ierr/=0) CALL mp_stop( 8087 )
        CALL MPI_TYPE_CREATE_RESIZED(temporary, lb, extent, mytype, ierr)
        IF (ierr/=0) CALL mp_stop( 8088 )
        CALL MPI_Type_commit(mytype, ierr)
        IF (ierr/=0) CALL mp_stop( 8088 )
#else
        mytype = 0
#endif
        !
        RETURN
      END SUBROUTINE mp_type_create_real_row_section_gpu
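!
! Illustrative pairing of a column-section type with the in-place gathers
! above (a sketch; "nrow", "col_t", "root" and "comm" are hypothetical
! names; with __CUDA the device dummy argument is expected to select the
! *_gpu specific of mp_type_create_column_section):
!
!   CALL mp_type_create_column_section( alldata_d(1,1), 0, nrow, nrow, col_t )
!   CALL mp_gatherv_inplace_cplx_array_gpu( alldata_d, col_t, recvcount, &
!                                           displs, root, comm )
!   CALL mp_type_free( col_t )
!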
!------------------------------------------------------------------------------!
#endif
!------------------------------------------------------------------------------!
END MODULE mp
!------------------------------------------------------------------------------!
!
! Script to generate stop messages:
! # coding: utf-8
! import re
! import sys
! i = 8000
! def replace(match):
!     global i
!     i += 1
!     return 'mp_stop( {0} )'.format(i)
!
! with open(sys.argv[1],'r') as f:
!     data = re.sub(r"mp_stop\(\s?\d+\s?\)", replace, f.read())
! with open(sys.argv[1]+'.new','w') as fo:
!     fo.write(data)