mirror of https://gitlab.com/QEF/q-e.git
!
! Copyright (C) 2002-2013 Quantum ESPRESSO group
! This file is distributed under the terms of the
! GNU General Public License. See the file `License'
! in the root directory of the present distribution,
! or http://www.gnu.org/copyleft/gpl.txt .
!
! This module contains interfaces to most low-level MPI operations:
! initialization and stopping, broadcast, parallel sum, etc.
!
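! A minimal usage sketch (hypothetical driver code, not part of this
! module; the communicator handle `world_comm` and all variable names
! are illustrative only):
!
!    USE mp, ONLY : mp_start, mp_bcast, mp_sum, mp_end
!    INTEGER :: nproc, mpime, ndata
!    CALL mp_start( nproc, mpime, world_comm )  ! query size and rank
!    IF ( mpime == 0 ) ndata = 10
!    CALL mp_bcast( ndata, 0, world_comm )      ! broadcast from task 0
!    CALL mp_sum( ndata, world_comm )           ! sum over all tasks, in place
!    CALL mp_end( world_comm )
!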
!------------------------------------------------------------------------------!
MODULE mp
!------------------------------------------------------------------------------!
  USE util_param, ONLY : DP, stdout, i8b
#if defined(__CUDA)
  USE cudafor, ONLY : cudamemcpy, cudamemcpy2d, &
                    & cudaMemcpyDeviceToDevice, &
                    & cudaDeviceSynchronize
#endif
  !
  IMPLICIT NONE
  PRIVATE
  !
  PUBLIC :: mp_start, mp_abort, mp_stop, mp_end, &
    mp_bcast, mp_sum, mp_max, mp_min, mp_rank, mp_size, &
    mp_gather, mp_alltoall, mp_get, mp_put, &
    mp_barrier, mp_report, mp_group_free, &
    mp_root_sum, mp_comm_free, mp_comm_create, mp_comm_group, &
    mp_group_create, mp_comm_split, mp_set_displs, &
    mp_circular_shift_left, mp_circular_shift_left_start, &
    mp_get_comm_null, mp_get_comm_self, mp_count_nodes, &
    mp_type_create_column_section, mp_type_create_row_section, mp_type_free, &
    mp_allgather, mp_waitall, mp_testall
  !
  INTERFACE mp_bcast
    MODULE PROCEDURE mp_bcast_i1, mp_bcast_r1, mp_bcast_c1, &
      mp_bcast_z, mp_bcast_zv, &
      mp_bcast_iv, mp_bcast_i8v, mp_bcast_rv, mp_bcast_cv, mp_bcast_l, mp_bcast_rm, &
      mp_bcast_cm, mp_bcast_im, mp_bcast_it, mp_bcast_i4d, mp_bcast_rt, mp_bcast_lv, &
      mp_bcast_lm, mp_bcast_r4d, mp_bcast_r5d, mp_bcast_ct, mp_bcast_c4d, &
      mp_bcast_c5d, mp_bcast_c6d
#if defined(__CUDA)
    MODULE PROCEDURE mp_bcast_i1_gpu, mp_bcast_r1_gpu, mp_bcast_c1_gpu, &
      !mp_bcast_z_gpu, mp_bcast_zv_gpu, &
      mp_bcast_iv_gpu, mp_bcast_rv_gpu, mp_bcast_cv_gpu, mp_bcast_l_gpu, mp_bcast_rm_gpu, &
      mp_bcast_cm_gpu, mp_bcast_im_gpu, mp_bcast_it_gpu, mp_bcast_i4d_gpu, mp_bcast_rt_gpu, mp_bcast_lv_gpu, &
      mp_bcast_lm_gpu, mp_bcast_r4d_gpu, mp_bcast_r5d_gpu, mp_bcast_ct_gpu, mp_bcast_c4d_gpu, &
      mp_bcast_c5d_gpu, mp_bcast_c6d_gpu
#endif
  END INTERFACE
  !
  INTERFACE mp_sum
    MODULE PROCEDURE mp_sum_i1, mp_sum_iv, mp_sum_i8v, mp_sum_im, mp_sum_it, mp_sum_i4, mp_sum_i5, &
      mp_sum_r1, mp_sum_rv, mp_sum_rm, mp_sum_rm_nc, mp_sum_rt, mp_sum_r4d, &
      mp_sum_c1, mp_sum_cv, mp_sum_cm, mp_sum_cm_nc, mp_sum_ct, mp_sum_c4d, &
      mp_sum_c5d, mp_sum_c6d, mp_sum_rmm, mp_sum_cmm, mp_sum_r5d, &
      mp_sum_r6d
#if defined(__CUDA)
    MODULE PROCEDURE mp_sum_i1_gpu, mp_sum_iv_gpu, mp_sum_im_gpu, mp_sum_it_gpu, &
      mp_sum_r1_gpu, mp_sum_rv_gpu, mp_sum_rm_gpu, mp_sum_rm_nc_gpu, mp_sum_rt_gpu, mp_sum_r4d_gpu, &
      mp_sum_c1_gpu, mp_sum_cv_gpu, mp_sum_cm_gpu, mp_sum_cm_nc_gpu, mp_sum_ct_gpu, mp_sum_c4d_gpu, &
      mp_sum_c5d_gpu, mp_sum_c6d_gpu, mp_sum_rmm_gpu, mp_sum_cmm_gpu, mp_sum_r5d_gpu, &
      mp_sum_r6d_gpu
#endif
  END INTERFACE
  !
  INTERFACE mp_root_sum
    MODULE PROCEDURE mp_root_sum_rm, mp_root_sum_cm
#if defined(__CUDA)
    MODULE PROCEDURE mp_root_sum_rm_gpu, mp_root_sum_cm_gpu
#endif
  END INTERFACE
  !
  INTERFACE mp_get
    MODULE PROCEDURE mp_get_r1, mp_get_rv, mp_get_cv, mp_get_i1, mp_get_iv, mp_get_rm, mp_get_cm
#if defined(__CUDA)
    MODULE PROCEDURE mp_get_r1_gpu, mp_get_rv_gpu, mp_get_cv_gpu, mp_get_i1_gpu, mp_get_iv_gpu, &
      mp_get_rm_gpu, mp_get_cm_gpu
#endif
  END INTERFACE
  !
  INTERFACE mp_put
    MODULE PROCEDURE mp_put_rv, mp_put_cv, mp_put_i1, mp_put_iv, &
      mp_put_rm
#if defined(__CUDA)
    MODULE PROCEDURE mp_put_rv_gpu, mp_put_cv_gpu, mp_put_i1_gpu, mp_put_iv_gpu, &
      mp_put_rm_gpu
#endif
  END INTERFACE
  !
  INTERFACE mp_max
    MODULE PROCEDURE mp_max_i, mp_max_r, mp_max_rv, mp_max_iv
#if defined(__CUDA)
    MODULE PROCEDURE mp_max_i_gpu, mp_max_r_gpu, mp_max_rv_gpu, mp_max_iv_gpu
#endif
  END INTERFACE
  !
  INTERFACE mp_min
    MODULE PROCEDURE mp_min_i, mp_min_r, mp_min_rv, mp_min_iv
#if defined(__CUDA)
    MODULE PROCEDURE mp_min_i_gpu, mp_min_r_gpu, mp_min_rv_gpu, mp_min_iv_gpu
#endif
  END INTERFACE
  !
  INTERFACE mp_gather
    MODULE PROCEDURE mp_gather_i1, mp_gather_iv, mp_gatherv_rv, mp_gatherv_iv, &
      mp_gatherv_rm, mp_gatherv_im, mp_gatherv_cv, &
      mp_gatherv_inplace_cplx_array
#if defined(__CUDA)
    MODULE PROCEDURE mp_gather_i1_gpu, mp_gather_iv_gpu, mp_gatherv_rv_gpu, mp_gatherv_iv_gpu, &
      mp_gatherv_rm_gpu, mp_gatherv_im_gpu, mp_gatherv_cv_gpu, mp_gatherv_inplace_cplx_array_gpu
#endif
  END INTERFACE
  !
  INTERFACE mp_allgather
    MODULE PROCEDURE mp_allgatherv_inplace_cplx_array
    MODULE PROCEDURE mp_allgatherv_inplace_real_array
#if defined(__CUDA)
    MODULE PROCEDURE mp_allgatherv_inplace_cplx_array_gpu
    MODULE PROCEDURE mp_allgatherv_inplace_real_array_gpu
#endif
  END INTERFACE
  !
  INTERFACE mp_alltoall
    MODULE PROCEDURE mp_alltoall_c3d, mp_alltoall_i3d
#if defined(__CUDA)
    MODULE PROCEDURE mp_alltoall_c3d_gpu, mp_alltoall_i3d_gpu
#endif
  END INTERFACE
  !
  INTERFACE mp_circular_shift_left
    MODULE PROCEDURE mp_circular_shift_left_i0, &
      mp_circular_shift_left_i1, &
      mp_circular_shift_left_i2, &
      mp_circular_shift_left_r2d, &
      mp_circular_shift_left_c2d
#if defined(__CUDA)
    MODULE PROCEDURE mp_circular_shift_left_i0_gpu, &
      mp_circular_shift_left_i1_gpu, &
      mp_circular_shift_left_i2_gpu, &
      mp_circular_shift_left_r2d_gpu, &
      mp_circular_shift_left_c2d_gpu
#endif
  END INTERFACE
  !
  INTERFACE mp_circular_shift_left_start
    MODULE PROCEDURE mp_circular_shift_left_start_i0, &
      mp_circular_shift_left_start_i1, &
      mp_circular_shift_left_start_i2, &
      mp_circular_shift_left_start_r2d, &
      mp_circular_shift_left_start_c2d
  END INTERFACE
  !
  INTERFACE mp_type_create_column_section
    MODULE PROCEDURE mp_type_create_cplx_column_section
    MODULE PROCEDURE mp_type_create_real_column_section
#if defined(__CUDA)
    MODULE PROCEDURE mp_type_create_cplx_column_section_gpu
    MODULE PROCEDURE mp_type_create_real_column_section_gpu
#endif
  END INTERFACE

  INTERFACE mp_type_create_row_section
    MODULE PROCEDURE mp_type_create_cplx_row_section
    MODULE PROCEDURE mp_type_create_real_row_section
#if defined(__CUDA)
    MODULE PROCEDURE mp_type_create_cplx_row_section_gpu
    MODULE PROCEDURE mp_type_create_real_row_section_gpu
#endif
  END INTERFACE
!------------------------------------------------------------------------------!
!
CONTAINS
!
!------------------------------------------------------------------------------!
!
!------------------------------------------------------------------------------!
!..mp_gather_i1
SUBROUTINE mp_gather_i1(mydata, alldata, root, gid)
  USE parallel_include
  IMPLICIT NONE
  INTEGER, INTENT(IN) :: mydata, root
  INTEGER, INTENT(IN) :: gid
  INTEGER :: group
  INTEGER, INTENT(OUT) :: alldata(:)
  INTEGER :: ierr

#if defined(__MPI)
  group = gid
  CALL MPI_GATHER(mydata, 1, MPI_INTEGER, alldata, 1, MPI_INTEGER, root, group, IERR)
  IF (ierr/=0) CALL mp_stop( 8013 )
#else
  alldata(1) = mydata
#endif
  RETURN
END SUBROUTINE mp_gather_i1
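
!
! Example (hypothetical sketch; `comm`, `nproc` and variable names are
! illustrative): gather one integer from every task into an array on
! task 0; `nk_per_task` is only significant on the root.
!
!    INTEGER :: nkl, nk_per_task(nproc)
!    CALL mp_gather( nkl, nk_per_task, 0, comm )
!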

!------------------------------------------------------------------------------!
!..mp_gather_iv
!..Carlo Cavazzoni
SUBROUTINE mp_gather_iv(mydata, alldata, root, gid)
  USE parallel_include
  IMPLICIT NONE
  INTEGER, INTENT(IN) :: mydata(:), root
  INTEGER, INTENT(IN) :: gid
  INTEGER :: group
  INTEGER, INTENT(OUT) :: alldata(:,:)
  INTEGER :: msglen, ierr

#if defined(__MPI)
  msglen = SIZE(mydata)
  IF( msglen .NE. SIZE(alldata, 1) ) CALL mp_stop( 8014 )
  group = gid
  CALL MPI_GATHER(mydata, msglen, MPI_INTEGER, alldata, msglen, MPI_INTEGER, root, group, IERR)
  IF (ierr/=0) CALL mp_stop( 8014 )
#else
  msglen = SIZE(mydata)
  IF( msglen .NE. SIZE(alldata, 1) ) CALL mp_stop( 8014 )
  alldata(:,1) = mydata(:)
#endif
  RETURN
END SUBROUTINE mp_gather_iv

!
!------------------------------------------------------------------------------!
!..mp_start
SUBROUTINE mp_start(numtask, taskid, group)

  ! ...
  USE parallel_include
  IMPLICIT NONE
  INTEGER, INTENT (OUT) :: numtask, taskid
  INTEGER, INTENT (IN) :: group
  INTEGER :: ierr
  ! ...
  ierr = 0
  numtask = 1
  taskid = 0

#if defined(__MPI)
  IF (ierr/=0) CALL mp_stop( 8004 )
  CALL mpi_comm_rank(group,taskid,ierr)
  IF (ierr/=0) CALL mp_stop( 8005 )
  CALL mpi_comm_size(group,numtask,ierr)
  IF (ierr/=0) CALL mp_stop( 8006 )
  ! ...
  CALL allocate_buffers()
#if defined(__CUDA)
  CALL allocate_buffers_gpu()
#endif
#endif
  RETURN
END SUBROUTINE mp_start
!
!------------------------------------------------------------------------------!
!..mp_abort

SUBROUTINE mp_abort(errorcode,gid)
  IMPLICIT NONE
  INTEGER :: ierr
  INTEGER, INTENT(IN):: errorcode, gid
#if defined(__MPI)
  CALL deallocate_buffers()
#if defined(__CUDA)
  CALL deallocate_buffers_gpu()
#endif
  CALL mpi_abort(gid, errorcode, ierr)
#endif
END SUBROUTINE mp_abort
!
!------------------------------------------------------------------------------!
!..mp_end

SUBROUTINE mp_end(groupid, cleanup)
  USE parallel_include
  IMPLICIT NONE
  INTEGER, INTENT(IN) :: groupid
  LOGICAL, OPTIONAL, INTENT(IN) :: cleanup
  INTEGER :: ierr, taskid
  LOGICAL :: cleanup_

  ierr = 0
  taskid = 0

#if defined(__MPI)
  CALL mpi_comm_rank( groupid, taskid, ierr)
  cleanup_ = .FALSE.
  IF (PRESENT(cleanup)) cleanup_ = cleanup
  IF(cleanup_) THEN
    CALL deallocate_buffers()
#if defined(__CUDA)
    CALL deallocate_buffers_gpu()
#endif
  END IF
#endif
  RETURN
END SUBROUTINE mp_end

!------------------------------------------------------------------------------!
!..mp_comm_group

SUBROUTINE mp_comm_group( comm, group )
  USE parallel_include
  IMPLICIT NONE
  INTEGER, INTENT (IN) :: comm
  INTEGER, INTENT (OUT) :: group
  INTEGER :: ierr
  ierr = 0
#if defined(__MPI)
  CALL mpi_comm_group( comm, group, ierr )
  IF (ierr/=0) CALL mp_stop( 8007 )
#else
  group = 0
#endif
END SUBROUTINE mp_comm_group

SUBROUTINE mp_comm_split( old_comm, color, key, new_comm )
  USE parallel_include
  IMPLICIT NONE
  INTEGER, INTENT (IN) :: old_comm
  INTEGER, INTENT (IN) :: color, key
  INTEGER, INTENT (OUT) :: new_comm
  INTEGER :: ierr
  ierr = 0
#if defined(__MPI)
  CALL MPI_COMM_SPLIT( old_comm, color, key, new_comm, ierr )
  IF (ierr/=0) CALL mp_stop( 8008 )
#else
  new_comm = old_comm
#endif
END SUBROUTINE mp_comm_split
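
!
! Example (hypothetical sketch; `world_comm`, `mpime`, `nproc` and
! `nimage` are illustrative): split a communicator into `nimage`
! sub-communicators, grouping tasks by color and ranking them by key.
!
!    INTEGER :: image_comm, color, key
!    color = mpime / ( nproc / nimage )    ! which image this task joins
!    key   = MOD( mpime, nproc / nimage )  ! rank inside the image
!    CALL mp_comm_split( world_comm, color, key, image_comm )
!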

SUBROUTINE mp_group_create( group_list, group_size, old_grp, new_grp )
  USE parallel_include
  IMPLICIT NONE
  INTEGER, INTENT (IN) :: group_list(:), group_size, old_grp
  INTEGER, INTENT (OUT) :: new_grp
  INTEGER :: ierr

  ierr = 0
  new_grp = old_grp
#if defined(__MPI)
  CALL mpi_group_incl( old_grp, group_size, group_list, new_grp, ierr )
  IF (ierr/=0) CALL mp_stop( 8009 )
#endif
END SUBROUTINE mp_group_create

!------------------------------------------------------------------------------!
SUBROUTINE mp_comm_create( old_comm, new_grp, new_comm )
  USE parallel_include
  IMPLICIT NONE
  INTEGER, INTENT (IN) :: old_comm
  INTEGER, INTENT (IN) :: new_grp
  INTEGER, INTENT (OUT) :: new_comm
  INTEGER :: ierr

  ierr = 0
  new_comm = old_comm
#if defined(__MPI)
  CALL mpi_comm_create( old_comm, new_grp, new_comm, ierr )
  IF (ierr/=0) CALL mp_stop( 8010 )
#endif
END SUBROUTINE mp_comm_create

!------------------------------------------------------------------------------!
!..mp_group_free
SUBROUTINE mp_group_free( group )
  USE parallel_include
  IMPLICIT NONE
  INTEGER, INTENT (INOUT) :: group
  INTEGER :: ierr
  ierr = 0
#if defined(__MPI)
  CALL mpi_group_free( group, ierr )
  IF (ierr/=0) CALL mp_stop( 8011 )
#endif
END SUBROUTINE mp_group_free
!------------------------------------------------------------------------------!

SUBROUTINE mp_comm_free( comm )
  USE parallel_include
  IMPLICIT NONE
  INTEGER, INTENT (INOUT) :: comm
  INTEGER :: ierr
  ierr = 0
#if defined(__MPI)
  IF( comm /= MPI_COMM_NULL ) THEN
    CALL mpi_comm_free( comm, ierr )
    IF (ierr/=0) CALL mp_stop( 8012 )
  END IF
#endif
  RETURN
END SUBROUTINE mp_comm_free

!------------------------------------------------------------------------------!
! non-blocking helpers
! waits until all requests have completed
SUBROUTINE mp_waitall(requests)
  ! ...
  USE parallel_include
  IMPLICIT NONE
  INTEGER, INTENT (INOUT) :: requests(:)
  INTEGER :: ierr
#if defined(__MPI)
  INTEGER :: istatus(MPI_STATUS_SIZE, size(requests))
#endif
  ierr = 0
#if defined(__MPI)
  call MPI_Waitall(size(requests), requests, istatus, ierr)
  IF (ierr/=0) CALL mp_stop( 8004 )
#endif
  RETURN
END SUBROUTINE mp_waitall

! tests whether all requests have completed
SUBROUTINE mp_testall(requests, flag)
  ! ...
  USE parallel_include
  IMPLICIT NONE
  INTEGER, INTENT (INOUT) :: requests(:)
  INTEGER :: ierr
#if defined(__MPI)
  INTEGER :: istatus(MPI_STATUS_SIZE, size(requests))
#endif
  LOGICAL, INTENT(OUT):: flag
  !
  ierr = 0
  flag = .FALSE.
#if defined(__MPI)
  call MPI_Testall(size(requests), requests, flag, istatus, ierr)
  IF (ierr/=0) CALL mp_stop( 8004 )
#else
  flag = .TRUE.
#endif
  RETURN
END SUBROUTINE mp_testall
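
!
! Example (hypothetical sketch; names are illustrative): poll a set of
! pending non-blocking requests with mp_testall, doing useful local work
! until all of them have completed.
!
!    LOGICAL :: done
!    INTEGER :: requests(2)
!    ! ... requests filled by a non-blocking start routine ...
!    DO
!      CALL mp_testall( requests, done )
!      IF ( done ) EXIT
!      ! ... overlap computation with communication here ...
!    END DO
!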

!------------------------------------------------------------------------------!
!..mp_bcast
SUBROUTINE mp_bcast_i1(msg,source,gid)
  IMPLICIT NONE
  INTEGER :: msg
  INTEGER :: source
  INTEGER, INTENT(IN) :: gid
  INTEGER :: group
  INTEGER :: msglen

#if defined(__MPI)
  msglen = 1
  group = gid
  CALL bcast_integer( msg, msglen, source, group )
#endif
END SUBROUTINE mp_bcast_i1
!
!------------------------------------------------------------------------------!
SUBROUTINE mp_bcast_iv(msg, source, gid)
!------------------------------------------------------------------------------!
  !!
  !! Bcast an integer vector
  !!
  IMPLICIT NONE
  !
  INTEGER :: msg(:)
  INTEGER, INTENT(in) :: source
  INTEGER, INTENT(in) :: gid
#if defined(__MPI)
  INTEGER :: msglen
  msglen = SIZE(msg)
  CALL bcast_integer(msg, msglen, source, gid)
#endif
!------------------------------------------------------------------------------!
END SUBROUTINE mp_bcast_iv
!------------------------------------------------------------------------------!
!
!------------------------------------------------------------------------------!
SUBROUTINE mp_bcast_i8v(msg, source, gid)
!------------------------------------------------------------------------------!
  !!
  !! Bcast an integer vector of kind i8b.
  !!
  IMPLICIT NONE
  !
  INTEGER(KIND = i8b) :: msg(:)
  INTEGER, INTENT(in) :: source
  INTEGER, INTENT(in) :: gid
#if defined(__MPI)
  INTEGER :: msglen
  msglen = SIZE(msg)
  CALL bcast_integer8(msg, msglen, source, gid)
#endif
!------------------------------------------------------------------------------!
END SUBROUTINE mp_bcast_i8v
!------------------------------------------------------------------------------!
!
!------------------------------------------------------------------------------!
SUBROUTINE mp_bcast_im(msg, source, gid)
!------------------------------------------------------------------------------!
  IMPLICIT NONE
  INTEGER :: msg(:,:)
  INTEGER, INTENT(IN) :: source
  INTEGER, INTENT(IN) :: gid
#if defined(__MPI)
  INTEGER :: msglen
  msglen = size(msg)
  CALL bcast_integer(msg, msglen, source, gid)
#endif
END SUBROUTINE mp_bcast_im
!
!------------------------------------------------------------------------------!
!
! Carlo Cavazzoni
!
SUBROUTINE mp_bcast_it( msg, source, gid )
  IMPLICIT NONE
  INTEGER :: msg(:,:,:)
  INTEGER, INTENT(IN) :: source
  INTEGER, INTENT(IN) :: gid
#if defined(__MPI)
  INTEGER :: msglen
  msglen = size(msg)
  CALL bcast_integer( msg, msglen, source, gid )
#endif
END SUBROUTINE mp_bcast_it
!
!------------------------------------------------------------------------------!
!
! Samuel Ponce
!
SUBROUTINE mp_bcast_i4d(msg, source, gid)
  IMPLICIT NONE
  INTEGER :: msg(:,:,:,:)
  INTEGER, INTENT(IN) :: source
  INTEGER, INTENT(IN) :: gid
#if defined(__MPI)
  INTEGER :: msglen
  msglen = size(msg)
  CALL bcast_integer( msg, msglen, source, gid )
#endif
END SUBROUTINE mp_bcast_i4d
!
!------------------------------------------------------------------------------!
!
SUBROUTINE mp_bcast_r1( msg, source, gid )
  IMPLICIT NONE
  REAL (DP) :: msg
  INTEGER, INTENT(IN) :: source
  INTEGER, INTENT(IN) :: gid
#if defined(__MPI)
  INTEGER :: msglen
  msglen = 1
  CALL bcast_real( msg, msglen, source, gid )
#endif
END SUBROUTINE mp_bcast_r1
!
!------------------------------------------------------------------------------!
!
SUBROUTINE mp_bcast_rv(msg,source,gid)
  IMPLICIT NONE
  REAL (DP) :: msg(:)
  INTEGER, INTENT(IN) :: source
  INTEGER, INTENT(IN) :: gid
#if defined(__MPI)
  INTEGER :: msglen
  msglen = size(msg)
  CALL bcast_real( msg, msglen, source, gid )
#endif
END SUBROUTINE mp_bcast_rv
!
!------------------------------------------------------------------------------!
!
SUBROUTINE mp_bcast_rm(msg,source,gid)
  IMPLICIT NONE
  REAL (DP) :: msg(:,:)
  INTEGER, INTENT(IN) :: source
  INTEGER, INTENT(IN) :: gid
#if defined(__MPI)
  INTEGER :: msglen
  msglen = size(msg)
  CALL bcast_real( msg, msglen, source, gid )
#endif
END SUBROUTINE mp_bcast_rm
!
!------------------------------------------------------------------------------!
!
! Carlo Cavazzoni
!
SUBROUTINE mp_bcast_rt(msg,source,gid)
  IMPLICIT NONE
  REAL (DP) :: msg(:,:,:)
  INTEGER, INTENT(IN) :: source
  INTEGER, INTENT(IN) :: gid
#if defined(__MPI)
  INTEGER :: msglen
  msglen = size(msg)
  CALL bcast_real( msg, msglen, source, gid )
#endif
END SUBROUTINE mp_bcast_rt
!
!------------------------------------------------------------------------------!
!
! Carlo Cavazzoni
!
SUBROUTINE mp_bcast_r4d(msg, source, gid)
  IMPLICIT NONE
  REAL (DP) :: msg(:,:,:,:)
  INTEGER, INTENT(IN) :: source
  INTEGER, INTENT(IN) :: gid
#if defined(__MPI)
  INTEGER :: msglen
  msglen = size(msg)
  CALL bcast_real( msg, msglen, source, gid )
#endif
END SUBROUTINE mp_bcast_r4d

!
!------------------------------------------------------------------------------!
!
! Carlo Cavazzoni
!
SUBROUTINE mp_bcast_r5d(msg, source, gid)
  IMPLICIT NONE
  REAL (DP) :: msg(:,:,:,:,:)
  INTEGER, INTENT(IN) :: source
  INTEGER, INTENT(IN) :: gid
#if defined(__MPI)
  INTEGER :: msglen
  msglen = size(msg)
  CALL bcast_real( msg, msglen, source, gid )
#endif
END SUBROUTINE mp_bcast_r5d

!------------------------------------------------------------------------------!
!
SUBROUTINE mp_bcast_c1(msg,source,gid)
  IMPLICIT NONE
  COMPLEX (DP) :: msg
  INTEGER, INTENT(IN) :: source
  INTEGER, INTENT(IN) :: gid
#if defined(__MPI)
  INTEGER :: msglen
  msglen = 1
  CALL bcast_real( msg, 2 * msglen, source, gid )
#endif
END SUBROUTINE mp_bcast_c1
!
!------------------------------------------------------------------------------!
SUBROUTINE mp_bcast_cv(msg,source,gid)
  IMPLICIT NONE
  COMPLEX (DP) :: msg(:)
  INTEGER, INTENT(IN) :: source
  INTEGER, INTENT(IN) :: gid
#if defined(__MPI)
  INTEGER :: msglen
  msglen = size(msg)
  CALL bcast_real( msg, 2 * msglen, source, gid )
#endif
END SUBROUTINE mp_bcast_cv
!
!------------------------------------------------------------------------------!
SUBROUTINE mp_bcast_cm(msg,source,gid)
  IMPLICIT NONE
  COMPLEX (DP) :: msg(:,:)
  INTEGER, INTENT(IN) :: source
  INTEGER, INTENT(IN) :: gid
#if defined(__MPI)
  INTEGER :: msglen
  msglen = size(msg)
  CALL bcast_real( msg, 2 * msglen, source, gid )
#endif
END SUBROUTINE mp_bcast_cm
!
!------------------------------------------------------------------------------!
SUBROUTINE mp_bcast_ct(msg,source,gid)
  IMPLICIT NONE
  COMPLEX (DP) :: msg(:,:,:)
  INTEGER, INTENT(IN) :: source
  INTEGER, INTENT(IN) :: gid
#if defined(__MPI)
  INTEGER :: msglen
  msglen = size(msg)
  CALL bcast_real( msg, 2 * msglen, source, gid )
#endif
END SUBROUTINE mp_bcast_ct

!
!------------------------------------------------------------------------------!
SUBROUTINE mp_bcast_c4d(msg,source,gid)
  IMPLICIT NONE
  COMPLEX (DP) :: msg(:,:,:,:)
  INTEGER, INTENT(IN) :: source
  INTEGER, INTENT(IN) :: gid
#if defined(__MPI)
  INTEGER :: msglen
  msglen = size(msg)
  CALL bcast_real( msg, 2 * msglen, source, gid )
#endif
END SUBROUTINE mp_bcast_c4d

SUBROUTINE mp_bcast_c5d(msg,source,gid)
  IMPLICIT NONE
  COMPLEX (DP) :: msg(:,:,:,:,:)
  INTEGER, INTENT(IN) :: source
  INTEGER, INTENT(IN) :: gid
#if defined(__MPI)
  INTEGER :: msglen
  msglen = size(msg)
  CALL bcast_real( msg, 2 * msglen, source, gid )
#endif
END SUBROUTINE mp_bcast_c5d

SUBROUTINE mp_bcast_c6d(msg,source,gid)
  IMPLICIT NONE
  COMPLEX (DP) :: msg(:,:,:,:,:,:)
  INTEGER, INTENT(IN) :: source
  INTEGER, INTENT(IN) :: gid
#if defined(__MPI)
  INTEGER :: msglen
  msglen = size(msg)
  CALL bcast_real( msg, 2 * msglen, source, gid )
#endif
END SUBROUTINE mp_bcast_c6d

!
!------------------------------------------------------------------------------!

SUBROUTINE mp_bcast_l(msg,source,gid)
  IMPLICIT NONE
  LOGICAL :: msg
  INTEGER, INTENT(IN) :: source
  INTEGER, INTENT(IN) :: gid
#if defined(__MPI)
  INTEGER :: msglen
  msglen = 1
  CALL bcast_logical( msg, msglen, source, gid )
#endif
END SUBROUTINE mp_bcast_l
!
!------------------------------------------------------------------------------!
!
! Carlo Cavazzoni
!
SUBROUTINE mp_bcast_lv(msg,source,gid)
  IMPLICIT NONE
  LOGICAL :: msg(:)
  INTEGER, INTENT(IN) :: source
  INTEGER, INTENT(IN) :: gid
#if defined(__MPI)
  INTEGER :: msglen
  msglen = size(msg)
  CALL bcast_logical( msg, msglen, source, gid )
#endif
END SUBROUTINE mp_bcast_lv

!------------------------------------------------------------------------------!
!
! Carlo Cavazzoni
!
SUBROUTINE mp_bcast_lm(msg,source,gid)
  IMPLICIT NONE
  LOGICAL :: msg(:,:)
  INTEGER, INTENT(IN) :: source
  INTEGER, INTENT(IN) :: gid
#if defined(__MPI)
  INTEGER :: msglen
  msglen = size(msg)
  CALL bcast_logical( msg, msglen, source, gid )
#endif
END SUBROUTINE mp_bcast_lm

!
!------------------------------------------------------------------------------!
!
SUBROUTINE mp_bcast_z(msg,source,gid)
  IMPLICIT NONE
  CHARACTER (len=*) :: msg
  INTEGER, INTENT(IN) :: source
  INTEGER, INTENT(IN) :: gid
  INTEGER :: group
  INTEGER :: msglen, ierr, i
  INTEGER, ALLOCATABLE :: imsg(:)
#if defined(__MPI)
  ierr = 0
  msglen = len(msg)
  group = gid
  ALLOCATE (imsg(1:msglen), STAT=ierr)
  IF (ierr/=0) CALL mp_stop( 8015 )
  DO i = 1, msglen
    imsg(i) = ichar(msg(i:i))
  END DO
  CALL bcast_integer( imsg, msglen, source, group )
  DO i = 1, msglen
    msg(i:i) = char(imsg(i))
  END DO
  DEALLOCATE (imsg, STAT=ierr)
  IF (ierr/=0) CALL mp_stop( 8016 )
#endif
END SUBROUTINE mp_bcast_z
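
!
! Note: character strings are broadcast by converting each character to
! its integer code with ICHAR, broadcasting the integer buffer, and
! converting back with CHAR, so only the integer broadcast primitive is
! needed. Example (hypothetical sketch; names are illustrative):
!
!    CHARACTER(LEN=80) :: outdir
!    IF ( mpime == root ) outdir = './tmp'
!    CALL mp_bcast( outdir, root, comm )   ! dispatches to mp_bcast_z
!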

!
!------------------------------------------------------------------------------!
!
!------------------------------------------------------------------------------!
!
SUBROUTINE mp_bcast_zv(msg,source,gid)
  IMPLICIT NONE
  CHARACTER (len=*) :: msg(:)
  INTEGER, INTENT(IN) :: source
  INTEGER, INTENT(IN) :: gid
  INTEGER :: group
  INTEGER :: msglen, m1, m2, ierr, i, j
  INTEGER, ALLOCATABLE :: imsg(:,:)
#if defined(__MPI)
  ierr = 0
  m1 = LEN(msg)
  m2 = SIZE(msg)
  msglen = LEN(msg)*SIZE(msg)
  group = gid
  ALLOCATE (imsg(1:m1,1:m2), STAT=ierr)
  IF (ierr/=0) CALL mp_stop( 8017 )
  DO j = 1, m2
    DO i = 1, m1
      imsg(i,j) = ichar(msg(j)(i:i))
    END DO
  END DO
  CALL bcast_integer( imsg, msglen, source, group )
  DO j = 1, m2
    DO i = 1, m1
      msg(j)(i:i) = char(imsg(i,j))
    END DO
  END DO
  DEALLOCATE (imsg, STAT=ierr)
  IF (ierr/=0) CALL mp_stop( 8018 )
#endif
END SUBROUTINE mp_bcast_zv
!
!------------------------------------------------------------------------------!
!
! Carlo Cavazzoni
!
SUBROUTINE mp_get_i1(msg_dest, msg_sour, mpime, dest, sour, ip, gid)
  USE parallel_include
  IMPLICIT NONE
  INTEGER :: msg_dest
  INTEGER, INTENT(IN) :: msg_sour
  INTEGER, INTENT(IN) :: dest, sour, ip, mpime
  INTEGER, INTENT(IN) :: gid
  INTEGER :: group
#if defined(__MPI)
  INTEGER :: istatus(MPI_STATUS_SIZE)
#endif
  INTEGER :: ierr, nrcv
  INTEGER :: msglen

#if defined(__MPI)
  group = gid
#endif

  ! processors not taking part in the communication have 0 length message

  msglen = 0

  IF(dest .NE. sour) THEN
#if defined(__MPI)
    IF(mpime .EQ. sour) THEN
      msglen=1
      CALL MPI_SEND( msg_sour, msglen, MPI_INTEGER, dest, ip, group, ierr)
      IF (ierr/=0) CALL mp_stop( 8019 )
    ELSE IF(mpime .EQ. dest) THEN
      msglen=1
      CALL MPI_RECV( msg_dest, msglen, MPI_INTEGER, sour, ip, group, istatus, IERR )
      IF (ierr/=0) CALL mp_stop( 8020 )
      CALL MPI_GET_COUNT(istatus, MPI_INTEGER, nrcv, ierr)
      IF (ierr/=0) CALL mp_stop( 8021 )
      msglen = nrcv
    END IF
#endif
  ELSEIF(mpime .EQ. sour)THEN
    msg_dest = msg_sour
    msglen = 1
  END IF

#if defined(__MPI)
  CALL MPI_BARRIER(group, IERR)
  IF (ierr/=0) CALL mp_stop( 8022 )
#endif

  RETURN
END SUBROUTINE mp_get_i1
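
!
! Example (hypothetical sketch; names are illustrative): task `sour`
! sends one integer to task `dest`, tagged `ip`; all other tasks only
! take part in the final barrier.
!
!    INTEGER :: nloc_dest, nloc
!    CALL mp_get( nloc_dest, nloc, mpime, dest, sour, ip, comm )
!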

!------------------------------------------------------------------------------!
!
! Carlo Cavazzoni
!
SUBROUTINE mp_get_iv(msg_dest, msg_sour, mpime, dest, sour, ip, gid)
  USE parallel_include
  IMPLICIT NONE
  INTEGER :: msg_dest(:)
  INTEGER, INTENT(IN) :: msg_sour(:)
  INTEGER, INTENT(IN) :: dest, sour, ip, mpime
  INTEGER, INTENT(IN) :: gid
  INTEGER :: group
#if defined(__MPI)
  INTEGER :: istatus(MPI_STATUS_SIZE)
#endif
  INTEGER :: ierr, nrcv
  INTEGER :: msglen

#if defined(__MPI)
  group = gid
#endif

  ! processors not taking part in the communication have 0 length message

  msglen = 0

  IF(sour .NE. dest) THEN
#if defined(__MPI)
    IF(mpime .EQ. sour) THEN
      msglen = SIZE(msg_sour)
      CALL MPI_SEND( msg_sour, SIZE(msg_sour), MPI_INTEGER, dest, ip, group, ierr)
      IF (ierr/=0) CALL mp_stop( 8023 )
    ELSE IF(mpime .EQ. dest) THEN
      CALL MPI_RECV( msg_dest, SIZE(msg_dest), MPI_INTEGER, sour, ip, group, istatus, IERR )
      IF (ierr/=0) CALL mp_stop( 8024 )
      CALL MPI_GET_COUNT(istatus, MPI_INTEGER, nrcv, ierr)
      IF (ierr/=0) CALL mp_stop( 8025 )
      msglen = nrcv
    END IF
#endif
  ELSEIF(mpime .EQ. sour)THEN
    msg_dest(1:SIZE(msg_sour)) = msg_sour(:)
    msglen = SIZE(msg_sour)
  END IF
#if defined(__MPI)
  CALL MPI_BARRIER(group, IERR)
  IF (ierr/=0) CALL mp_stop( 8026 )
#endif
  RETURN
END SUBROUTINE mp_get_iv

!------------------------------------------------------------------------------!

SUBROUTINE mp_get_r1(msg_dest, msg_sour, mpime, dest, sour, ip, gid)
  USE parallel_include
  IMPLICIT NONE
  REAL (DP) :: msg_dest
  REAL (DP), INTENT(IN) :: msg_sour
  INTEGER, INTENT(IN) :: dest, sour, ip, mpime
  INTEGER, INTENT(IN) :: gid
  INTEGER :: group
#if defined(__MPI)
  INTEGER :: istatus(MPI_STATUS_SIZE)
#endif
  INTEGER :: ierr, nrcv
  INTEGER :: msglen

#if defined(__MPI)
  group = gid
#endif

  ! processors not taking part in the communication have 0 length message

  msglen = 0

  IF(sour .NE. dest) THEN
#if defined(__MPI)
    IF(mpime .EQ. sour) THEN
      msglen = 1
      CALL MPI_SEND( msg_sour, msglen, MPI_DOUBLE_PRECISION, dest, ip, group, ierr)
      IF (ierr/=0) CALL mp_stop( 8027 )
    ELSE IF(mpime .EQ. dest) THEN
      CALL MPI_RECV( msg_dest, 1, MPI_DOUBLE_PRECISION, sour, ip, group, istatus, IERR )
      IF (ierr/=0) CALL mp_stop( 8028 )
      CALL MPI_GET_COUNT(istatus, MPI_DOUBLE_PRECISION, nrcv, ierr)
      IF (ierr/=0) CALL mp_stop( 8029 )
      msglen = nrcv
    END IF
#endif
  ELSEIF(mpime .EQ. sour)THEN
    msg_dest = msg_sour
    msglen = 1
  END IF
#if defined(__MPI)
  CALL MPI_BARRIER(group, IERR)
  IF (ierr/=0) CALL mp_stop( 8030 )
#endif
  RETURN
END SUBROUTINE mp_get_r1

!------------------------------------------------------------------------------!
!
! Carlo Cavazzoni
!
SUBROUTINE mp_get_rv(msg_dest, msg_sour, mpime, dest, sour, ip, gid)
  USE parallel_include
  IMPLICIT NONE
  REAL (DP) :: msg_dest(:)
  REAL (DP), INTENT(IN) :: msg_sour(:)
  INTEGER, INTENT(IN) :: dest, sour, ip, mpime
  INTEGER, INTENT(IN) :: gid
  INTEGER :: group
#if defined(__MPI)
  INTEGER :: istatus(MPI_STATUS_SIZE)
#endif
  INTEGER :: ierr, nrcv
  INTEGER :: msglen

#if defined(__MPI)
  group = gid
#endif

  ! processors not taking part in the communication have 0 length message

  msglen = 0

  IF(sour .NE. dest) THEN
#if defined(__MPI)
    IF(mpime .EQ. sour) THEN
      msglen = SIZE(msg_sour)
      CALL MPI_SEND( msg_sour, SIZE(msg_sour), MPI_DOUBLE_PRECISION, dest, ip, group, ierr)
      IF (ierr/=0) CALL mp_stop( 8027 )
    ELSE IF(mpime .EQ. dest) THEN
      CALL MPI_RECV( msg_dest, SIZE(msg_dest), MPI_DOUBLE_PRECISION, sour, ip, group, istatus, IERR )
      IF (ierr/=0) CALL mp_stop( 8028 )
      CALL MPI_GET_COUNT(istatus, MPI_DOUBLE_PRECISION, nrcv, ierr)
      IF (ierr/=0) CALL mp_stop( 8029 )
      msglen = nrcv
    END IF
#endif
  ELSEIF(mpime .EQ. sour)THEN
    msg_dest(1:SIZE(msg_sour)) = msg_sour(:)
    msglen = SIZE(msg_sour)
  END IF
#if defined(__MPI)
  CALL MPI_BARRIER(group, IERR)
  IF (ierr/=0) CALL mp_stop( 8030 )
#endif
  RETURN
END SUBROUTINE mp_get_rv

!------------------------------------------------------------------------------!
!
! Carlo Cavazzoni
!
SUBROUTINE mp_get_rm(msg_dest, msg_sour, mpime, dest, sour, ip, gid)
  USE parallel_include
  IMPLICIT NONE
  REAL (DP) :: msg_dest(:,:)
  REAL (DP), INTENT(IN) :: msg_sour(:,:)
  INTEGER, INTENT(IN) :: dest, sour, ip, mpime
  INTEGER, INTENT(IN) :: gid
  INTEGER :: group
#if defined(__MPI)
  INTEGER :: istatus(MPI_STATUS_SIZE)
#endif
  INTEGER :: ierr, nrcv
  INTEGER :: msglen

#if defined(__MPI)
  group = gid
#endif

  ! processors not taking part in the communication have 0 length message

  msglen = 0

  IF(sour .NE. dest) THEN
#if defined(__MPI)
    IF(mpime .EQ. sour) THEN
      CALL MPI_SEND( msg_sour, SIZE(msg_sour), MPI_DOUBLE_PRECISION, dest, ip, group, ierr)
      IF (ierr/=0) CALL mp_stop( 8031 )
      msglen = SIZE(msg_sour)
    ELSE IF(mpime .EQ. dest) THEN
      CALL MPI_RECV( msg_dest, SIZE(msg_dest), MPI_DOUBLE_PRECISION, sour, ip, group, istatus, IERR )
      IF (ierr/=0) CALL mp_stop( 8032 )
      CALL MPI_GET_COUNT(istatus, MPI_DOUBLE_PRECISION, nrcv, ierr)
      IF (ierr/=0) CALL mp_stop( 8033 )
      msglen = nrcv
    END IF
#endif
  ELSEIF(mpime .EQ. sour)THEN
    msg_dest(1:SIZE(msg_sour,1), 1:SIZE(msg_sour,2)) = msg_sour(:,:)
    msglen = SIZE( msg_sour )
  END IF
#if defined(__MPI)
  CALL MPI_BARRIER(group, IERR)
  IF (ierr/=0) CALL mp_stop( 8034 )
#endif
  RETURN
END SUBROUTINE mp_get_rm

!------------------------------------------------------------------------------!
!
! Carlo Cavazzoni
!
SUBROUTINE mp_get_cv(msg_dest, msg_sour, mpime, dest, sour, ip, gid)
  USE parallel_include
  IMPLICIT NONE
  COMPLEX (DP) :: msg_dest(:)
  COMPLEX (DP), INTENT(IN) :: msg_sour(:)
  INTEGER, INTENT(IN) :: dest, sour, ip, mpime
  INTEGER, INTENT(IN) :: gid
  INTEGER :: group
#if defined(__MPI)
  INTEGER :: istatus(MPI_STATUS_SIZE)
#endif
  INTEGER :: ierr, nrcv
  INTEGER :: msglen

#if defined(__MPI)
  group = gid
#endif

  ! processors not taking part in the communication have 0 length message

  msglen = 0

  IF( dest .NE. sour ) THEN
#if defined(__MPI)
    IF(mpime .EQ. sour) THEN
      CALL MPI_SEND( msg_sour, SIZE(msg_sour), MPI_DOUBLE_COMPLEX, dest, ip, group, ierr)
      IF (ierr/=0) CALL mp_stop( 8035 )
      msglen = SIZE(msg_sour)
    ELSE IF(mpime .EQ. dest) THEN
      CALL MPI_RECV( msg_dest, SIZE(msg_dest), MPI_DOUBLE_COMPLEX, sour, ip, group, istatus, IERR )
      IF (ierr/=0) CALL mp_stop( 8036 )
      CALL MPI_GET_COUNT(istatus, MPI_DOUBLE_COMPLEX, nrcv, ierr)
      IF (ierr/=0) CALL mp_stop( 8037 )
      msglen = nrcv
    END IF
#endif
  ELSEIF(mpime .EQ. sour)THEN
    msg_dest(1:SIZE(msg_sour)) = msg_sour(:)
    msglen = SIZE(msg_sour)
  END IF
#if defined(__MPI)
  CALL MPI_BARRIER(group, IERR)
  IF (ierr/=0) CALL mp_stop( 8038 )
#endif
  RETURN
END SUBROUTINE mp_get_cv

!------------------------------------------------------------------------------!
!
! Marco Govoni
!
SUBROUTINE mp_get_cm(msg_dest, msg_sour, mpime, dest, sour, ip, gid)
  USE parallel_include
  IMPLICIT NONE
  COMPLEX (DP) :: msg_dest(:,:)
  COMPLEX (DP), INTENT(IN) :: msg_sour(:,:)
  INTEGER, INTENT(IN) :: dest, sour, ip, mpime
  INTEGER, INTENT(IN) :: gid
  INTEGER :: group
#if defined(__MPI)
  INTEGER :: istatus(MPI_STATUS_SIZE)
#endif
  INTEGER :: ierr, nrcv
  INTEGER :: msglen

#if defined(__MPI)
  group = gid
#endif

  ! processors not taking part in the communication have 0 length message

  msglen = 0

  IF(sour .NE. dest) THEN
#if defined(__MPI)
    IF(mpime .EQ. sour) THEN
      CALL MPI_SEND( msg_sour, SIZE(msg_sour), MPI_DOUBLE_COMPLEX, dest, ip, group, ierr)
      IF (ierr/=0) CALL mp_stop( 8031 )
      msglen = SIZE(msg_sour)
    ELSE IF(mpime .EQ. dest) THEN
      CALL MPI_RECV( msg_dest, SIZE(msg_dest), MPI_DOUBLE_COMPLEX, sour, ip, group, istatus, IERR )
      IF (ierr/=0) CALL mp_stop( 8032 )
      CALL MPI_GET_COUNT(istatus, MPI_DOUBLE_COMPLEX, nrcv, ierr)
      IF (ierr/=0) CALL mp_stop( 8033 )
      msglen = nrcv
    END IF
#endif
  ELSEIF(mpime .EQ. sour)THEN
    msg_dest(1:SIZE(msg_sour,1), 1:SIZE(msg_sour,2)) = msg_sour(:,:)
    msglen = SIZE( msg_sour )
  END IF
#if defined(__MPI)
  CALL MPI_BARRIER(group, IERR)
  IF (ierr/=0) CALL mp_stop( 8034 )
#endif
  RETURN
END SUBROUTINE mp_get_cm
!------------------------------------------------------------------------------!
!
!
!------------------------------------------------------------------------------!

SUBROUTINE mp_put_i1(msg_dest, msg_sour, mpime, sour, dest, ip, gid)
  USE parallel_include
  IMPLICIT NONE
  INTEGER :: msg_dest
  INTEGER, INTENT(IN) :: msg_sour
  INTEGER, INTENT(IN) :: dest, sour, ip, mpime
  INTEGER, INTENT(IN) :: gid
  INTEGER :: group
#if defined(__MPI)
  INTEGER :: istatus(MPI_STATUS_SIZE)
#endif
  INTEGER :: ierr, nrcv
  INTEGER :: msglen

#if defined(__MPI)
  group = gid
#endif

  ! processors not taking part in the communication have 0 length message

  msglen = 0

  IF(dest .NE. sour) THEN
#if defined(__MPI)
    IF(mpime .EQ. sour) THEN
      CALL MPI_SEND( msg_sour, 1, MPI_INTEGER, dest, ip, group, ierr)
      IF (ierr/=0) CALL mp_stop( 8039 )
      msglen = 1
    ELSE IF(mpime .EQ. dest) THEN
      CALL MPI_RECV( msg_dest, 1, MPI_INTEGER, sour, ip, group, istatus, IERR )
      IF (ierr/=0) CALL mp_stop( 8040 )
      CALL MPI_GET_COUNT(istatus, MPI_INTEGER, nrcv, ierr)
      IF (ierr/=0) CALL mp_stop( 8041 )
      msglen = 1
    END IF
#endif
  ELSEIF(mpime .EQ. sour)THEN
    msg_dest = msg_sour
    msglen = 1
  END IF
#if defined(__MPI)
  CALL MPI_BARRIER(group, IERR)
  IF (ierr/=0) CALL mp_stop( 8042 )
#endif
  RETURN
END SUBROUTINE mp_put_i1

!------------------------------------------------------------------------------!
!
!
SUBROUTINE mp_put_iv(msg_dest, msg_sour, mpime, sour, dest, ip, gid)
  USE parallel_include
  IMPLICIT NONE
  INTEGER :: msg_dest(:)
  INTEGER, INTENT(IN) :: msg_sour(:)
  INTEGER, INTENT(IN) :: dest, sour, ip, mpime
  INTEGER, INTENT(IN) :: gid
  INTEGER :: group
#if defined(__MPI)
  INTEGER :: istatus(MPI_STATUS_SIZE)
#endif
  INTEGER :: ierr, nrcv
  INTEGER :: msglen
#if defined(__MPI)
  group = gid
#endif
  ! processors not taking part in the communication have 0 length message

  msglen = 0

  IF(sour .NE. dest) THEN
#if defined(__MPI)
    IF(mpime .EQ. sour) THEN
      CALL MPI_SEND( msg_sour, SIZE(msg_sour), MPI_INTEGER, dest, ip, group, ierr)
      IF (ierr/=0) CALL mp_stop( 8043 )
      msglen = SIZE(msg_sour)
    ELSE IF(mpime .EQ. dest) THEN
      CALL MPI_RECV( msg_dest, SIZE(msg_dest), MPI_INTEGER, sour, ip, group, istatus, IERR )
      IF (ierr/=0) CALL mp_stop( 8044 )
      CALL MPI_GET_COUNT(istatus, MPI_INTEGER, nrcv, ierr)
      IF (ierr/=0) CALL mp_stop( 8045 )
      msglen = nrcv
    END IF
#endif
  ELSEIF(mpime .EQ. sour)THEN
    msg_dest(1:SIZE(msg_sour)) = msg_sour(:)
    msglen = SIZE(msg_sour)
  END IF
#if defined(__MPI)
  CALL MPI_BARRIER(group, IERR)
  IF (ierr/=0) CALL mp_stop( 8046 )
#endif
  RETURN
END SUBROUTINE mp_put_iv

!------------------------------------------------------------------------------!
!
!
SUBROUTINE mp_put_rv(msg_dest, msg_sour, mpime, sour, dest, ip, gid)
  USE parallel_include
  IMPLICIT NONE
  REAL (DP) :: msg_dest(:)
  REAL (DP), INTENT(IN) :: msg_sour(:)
  INTEGER, INTENT(IN) :: dest, sour, ip, mpime
  INTEGER, INTENT(IN) :: gid
  INTEGER :: group
#if defined(__MPI)
  INTEGER :: istatus(MPI_STATUS_SIZE)
#endif
  INTEGER :: ierr, nrcv
  INTEGER :: msglen
#if defined(__MPI)
  group = gid
#endif
  ! processors not taking part in the communication have 0 length message

  msglen = 0

  IF(sour .NE. dest) THEN
#if defined(__MPI)
    IF(mpime .EQ. sour) THEN
      CALL MPI_SEND( msg_sour, SIZE(msg_sour), MPI_DOUBLE_PRECISION, dest, ip, group, ierr)
      IF (ierr/=0) CALL mp_stop( 8047 )
      msglen = SIZE(msg_sour)
    ELSE IF(mpime .EQ. dest) THEN
      CALL MPI_RECV( msg_dest, SIZE(msg_dest), MPI_DOUBLE_PRECISION, sour, ip, group, istatus, IERR )
      IF (ierr/=0) CALL mp_stop( 8048 )
      CALL MPI_GET_COUNT(istatus, MPI_DOUBLE_PRECISION, nrcv, ierr)
      IF (ierr/=0) CALL mp_stop( 8049 )
      msglen = nrcv
    END IF
#endif
  ELSEIF(mpime .EQ. sour)THEN
    msg_dest(1:SIZE(msg_sour)) = msg_sour(:)
    msglen = SIZE(msg_sour)
  END IF
#if defined(__MPI)
  CALL MPI_BARRIER(group, IERR)
  IF (ierr/=0) CALL mp_stop( 8050 )
#endif
  RETURN
END SUBROUTINE mp_put_rv

!------------------------------------------------------------------------------!
!
!
SUBROUTINE mp_put_rm(msg_dest, msg_sour, mpime, sour, dest, ip, gid)
  USE parallel_include
  IMPLICIT NONE
  REAL (DP) :: msg_dest(:,:)
  REAL (DP), INTENT(IN) :: msg_sour(:,:)
  INTEGER, INTENT(IN) :: dest, sour, ip, mpime
  INTEGER, INTENT(IN) :: gid
  INTEGER :: group
#if defined(__MPI)
  INTEGER :: istatus(MPI_STATUS_SIZE)
#endif
  INTEGER :: ierr, nrcv
  INTEGER :: msglen
#if defined(__MPI)
  group = gid
#endif
  ! processors not taking part in the communication have 0 length message

  msglen = 0

  IF(sour .NE. dest) THEN
#if defined(__MPI)
    IF(mpime .EQ. sour) THEN
      CALL MPI_SEND( msg_sour, SIZE(msg_sour), MPI_DOUBLE_PRECISION, dest, ip, group, ierr)
      IF (ierr/=0) CALL mp_stop( 8051 )
      msglen = SIZE(msg_sour)
    ELSE IF(mpime .EQ. dest) THEN
      CALL MPI_RECV( msg_dest, SIZE(msg_dest), MPI_DOUBLE_PRECISION, sour, ip, group, istatus, IERR )
      IF (ierr/=0) CALL mp_stop( 8052 )
      CALL MPI_GET_COUNT(istatus, MPI_DOUBLE_PRECISION, nrcv, ierr)
      IF (ierr/=0) CALL mp_stop( 8053 )
      msglen = nrcv
    END IF
#endif
  ELSEIF(mpime .EQ. sour)THEN
    msg_dest(1:SIZE(msg_sour,1),1:SIZE(msg_sour,2)) = msg_sour(:,:)
    msglen = SIZE(msg_sour)
  END IF
#if defined(__MPI)
  CALL MPI_BARRIER(group, IERR)
  IF (ierr/=0) CALL mp_stop( 8054 )
#endif
  RETURN
END SUBROUTINE mp_put_rm

!------------------------------------------------------------------------------!
!
!
SUBROUTINE mp_put_cv(msg_dest, msg_sour, mpime, sour, dest, ip, gid)
  USE parallel_include
  IMPLICIT NONE
  COMPLEX (DP) :: msg_dest(:)
  COMPLEX (DP), INTENT(IN) :: msg_sour(:)
  INTEGER, INTENT(IN) :: dest, sour, ip, mpime
  INTEGER, INTENT(IN) :: gid
  INTEGER :: group
#if defined(__MPI)
  INTEGER :: istatus(MPI_STATUS_SIZE)
#endif
  INTEGER :: ierr, nrcv
  INTEGER :: msglen
#if defined(__MPI)
  group = gid
#endif
  ! processors not taking part in the communication have 0 length message

  msglen = 0

  IF( dest .NE. sour ) THEN
#if defined(__MPI)
    IF(mpime .EQ. sour) THEN
      CALL MPI_SEND( msg_sour, SIZE(msg_sour), MPI_DOUBLE_COMPLEX, dest, ip, group, ierr)
      IF (ierr/=0) CALL mp_stop( 8055 )
      msglen = SIZE(msg_sour)
    ELSE IF(mpime .EQ. dest) THEN
      CALL MPI_RECV( msg_dest, SIZE(msg_dest), MPI_DOUBLE_COMPLEX, sour, ip, group, istatus, IERR )
      IF (ierr/=0) CALL mp_stop( 8056 )
      CALL MPI_GET_COUNT(istatus, MPI_DOUBLE_COMPLEX, nrcv, ierr)
      IF (ierr/=0) CALL mp_stop( 8057 )
      msglen = nrcv
    END IF
#endif
  ELSEIF(mpime .EQ. sour)THEN
    msg_dest(1:SIZE(msg_sour)) = msg_sour(:)
    msglen = SIZE(msg_sour)
  END IF
#if defined(__MPI)
  CALL MPI_BARRIER(group, IERR)
  IF (ierr/=0) CALL mp_stop( 8058 )
#endif
  RETURN
END SUBROUTINE mp_put_cv

!
!------------------------------------------------------------------------------!
!
!..mp_stop
!
SUBROUTINE mp_stop(code)
  USE parallel_include
  IMPLICIT NONE
  INTEGER, INTENT (IN) :: code
  INTEGER :: ierr
  WRITE( stdout, fmt='( "*** error in Message Passing (mp) module ***")' )
  WRITE( stdout, fmt='( "*** error code: ",I5)' ) code
#if defined(__MPI)
  ! abort with extreme prejudice across the entire MPI set of tasks
  CALL mpi_abort(MPI_COMM_WORLD,code,ierr)
#endif
  STOP
END SUBROUTINE mp_stop
!------------------------------------------------------------------------------!
!
!..mp_sum
SUBROUTINE mp_sum_i1(msg,gid)
  IMPLICIT NONE
  INTEGER, INTENT (INOUT) :: msg
  INTEGER, INTENT(IN) :: gid
#if defined(__MPI)
  INTEGER :: msglen
  msglen = 1
  CALL reduce_base_integer( msglen, msg, gid, -1 )
#endif
END SUBROUTINE mp_sum_i1
!
!------------------------------------------------------------------------------!
SUBROUTINE mp_sum_iv(msg, gid)
!------------------------------------------------------------------------------!
  !!
  !! MPI sum an integer vector from all cores and bcast the result to all.
  !!
  IMPLICIT NONE
  !
  INTEGER, INTENT(inout) :: msg(:)
  INTEGER, INTENT(in) :: gid
#if defined(__MPI)
  INTEGER :: msglen
  msglen = SIZE(msg)
  CALL reduce_base_integer(msglen, msg, gid, -1)
#endif
!------------------------------------------------------------------------------!
END SUBROUTINE mp_sum_iv
!------------------------------------------------------------------------------!
!
!------------------------------------------------------------------------------!
SUBROUTINE mp_sum_i8v(msg, gid)
!------------------------------------------------------------------------------!
  !!
  !! MPI sum an integer vector from all cores and bcast the result to all.
  !!
  IMPLICIT NONE
  !
  INTEGER(KIND = i8b), INTENT(inout) :: msg(:)
  INTEGER, INTENT(in) :: gid
#if defined(__MPI)
  INTEGER :: msglen
  msglen = SIZE(msg)
  CALL reduce_base_integer8(msglen, msg, gid, -1)
#endif
!------------------------------------------------------------------------------!
END SUBROUTINE mp_sum_i8v
!------------------------------------------------------------------------------!
!
!------------------------------------------------------------------------------!
SUBROUTINE mp_sum_im(msg,gid)
!------------------------------------------------------------------------------!
  IMPLICIT NONE
  INTEGER, INTENT (INOUT) :: msg(:,:)
  INTEGER, INTENT(IN) :: gid
#if defined(__MPI)
  INTEGER :: msglen
  msglen = size(msg)
  CALL reduce_base_integer( msglen, msg, gid, -1 )
#endif
END SUBROUTINE mp_sum_im
!
!------------------------------------------------------------------------------!

SUBROUTINE mp_sum_it(msg,gid)
  IMPLICIT NONE
  INTEGER, INTENT (INOUT) :: msg(:,:,:)
  INTEGER, INTENT (IN) :: gid
#if defined(__MPI)
  INTEGER :: msglen
  msglen = size(msg)
  CALL reduce_base_integer( msglen, msg, gid, -1 )
#endif
END SUBROUTINE mp_sum_it

!------------------------------------------------------------------------------!

SUBROUTINE mp_sum_i4(msg,gid)
  IMPLICIT NONE
  INTEGER, INTENT (INOUT) :: msg(:,:,:,:)
  INTEGER, INTENT (IN) :: gid
#if defined(__MPI)
  INTEGER :: msglen
  msglen = size(msg)
  CALL reduce_base_integer( msglen, msg, gid, -1 )
#endif
END SUBROUTINE mp_sum_i4

!------------------------------------------------------------------------------!

SUBROUTINE mp_sum_i5(msg,gid)
  IMPLICIT NONE
  INTEGER, INTENT (INOUT) :: msg(:,:,:,:,:)
  INTEGER, INTENT (IN) :: gid
#if defined(__MPI)
  INTEGER :: msglen
  msglen = size(msg)
  CALL reduce_base_integer( msglen, msg, gid, -1 )
#endif
END SUBROUTINE mp_sum_i5

!------------------------------------------------------------------------------!

SUBROUTINE mp_sum_r1(msg,gid)
  IMPLICIT NONE
  REAL (DP), INTENT (INOUT) :: msg
  INTEGER, INTENT (IN) :: gid
#if defined(__MPI)
  INTEGER :: msglen
  msglen = 1
  CALL reduce_base_real( msglen, msg, gid, -1 )
#endif
END SUBROUTINE mp_sum_r1

!
!------------------------------------------------------------------------------!

SUBROUTINE mp_sum_rv(msg,gid)
  IMPLICIT NONE
  REAL (DP), INTENT (INOUT) :: msg(:)
  INTEGER, INTENT (IN) :: gid
#if defined(__MPI)
  INTEGER :: msglen
  msglen = size(msg)
  CALL reduce_base_real( msglen, msg, gid, -1 )
#endif
END SUBROUTINE mp_sum_rv
!
!------------------------------------------------------------------------------!

SUBROUTINE mp_sum_rm(msg, gid)
  IMPLICIT NONE
  REAL (DP), INTENT (INOUT) :: msg(:,:)
  INTEGER, INTENT (IN) :: gid
#if defined(__MPI)
  INTEGER :: msglen
  msglen = size(msg)
  CALL reduce_base_real( msglen, msg, gid, -1 )
#endif
END SUBROUTINE mp_sum_rm

SUBROUTINE mp_sum_rm_nc(msg, k1, k2, k3, k4, gid)
  ! for non-contiguous arrays
  IMPLICIT NONE
  REAL (DP), INTENT (INOUT) :: msg(:,:)
  INTEGER, INTENT (IN) :: gid
  INTEGER, INTENT (IN) :: k1, k2, k3, k4
#if defined(__MPI)
  REAL(DP), ALLOCATABLE :: msg_buff(:,:)
  INTEGER :: msglen
  ALLOCATE( msg_buff(k2-k1+1,k4-k3+1) )
  msg_buff(1:k2-k1+1,1:k4-k3+1)=msg(k1:k2,k3:k4)
  msglen = size(msg_buff)
  CALL reduce_base_real( msglen, msg_buff, gid, -1 )
  msg(k1:k2,k3:k4)=msg_buff(1:k2-k1+1,1:k4-k3+1)
  DEALLOCATE( msg_buff )
#endif
END SUBROUTINE mp_sum_rm_nc
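
!
! Example (hypothetical sketch; array name and bounds are illustrative):
! sum only the sub-block msg(k1:k2,k3:k4) across the communicator. The
! sub-block is copied into a contiguous buffer, reduced, and copied back,
! because the reduction primitives expect contiguous data.
!
!    CALL mp_sum( rho, 2, 5, 1, 3, comm )   ! dispatches to mp_sum_rm_nc
!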

SUBROUTINE mp_root_sum_rm( msg, res, root, gid )
  USE parallel_include
  IMPLICIT NONE
  REAL (DP), INTENT (IN) :: msg(:,:)
  REAL (DP), INTENT (OUT) :: res(:,:)
  INTEGER, INTENT (IN) :: root
  INTEGER, INTENT (IN) :: gid
#if defined(__MPI)
  INTEGER :: msglen, ierr, taskid

  msglen = size(msg)

  CALL mpi_comm_rank( gid, taskid, ierr)
  IF( ierr /= 0 ) CALL mp_stop( 8059 )
  !
  IF( taskid == root ) THEN
    IF( msglen > size(res) ) CALL mp_stop( 8060 )
  END IF

  CALL reduce_base_real_to( msglen, msg, res, gid, root )

#else

  res = msg

#endif

END SUBROUTINE mp_root_sum_rm
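
!
! Example (hypothetical sketch; names are illustrative): accumulate the
! distributed contributions of a matrix on the root task; `res` is only
! significant on `root`.
!
!    CALL mp_root_sum( h_local, h_total, 0, comm )
!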
|
|
|
|
|
|
SUBROUTINE mp_root_sum_cm( msg, res, root, gid )
|
|
USE parallel_include
|
|
IMPLICIT NONE
|
|
COMPLEX (DP), INTENT (IN) :: msg(:,:)
|
|
COMPLEX (DP), INTENT (OUT) :: res(:,:)
|
|
INTEGER, INTENT (IN) :: root
|
|
INTEGER, INTENT (IN) :: gid
|
|
#if defined(__MPI)
|
|
INTEGER :: msglen, ierr, taskid
|
|
|
|
msglen = size(msg)
|
|
|
|
CALL mpi_comm_rank( gid, taskid, ierr)
|
|
IF( ierr /= 0 ) CALL mp_stop( 8061 )
|
|
|
|
IF( taskid == root ) THEN
|
|
IF( msglen > size(res) ) CALL mp_stop( 8062 )
|
|
END IF
|
|
|
|
CALL reduce_base_real_to( 2 * msglen, msg, res, gid, root )
|
|
|
|
#else
|
|
|
|
res = msg
|
|
|
|
#endif
|
|
|
|
END SUBROUTINE mp_root_sum_cm
|
|
|
|
!
|
|
!------------------------------------------------------------------------------!
|
|
|
|
|
|
!------------------------------------------------------------------------------!
|
|
!
|
|
|
|
SUBROUTINE mp_sum_rmm( msg, res, root, gid )
|
|
USE parallel_include
|
|
IMPLICIT NONE
|
|
REAL (DP), INTENT (IN) :: msg(:,:)
|
|
REAL (DP), INTENT (OUT) :: res(:,:)
|
|
INTEGER, INTENT (IN) :: root
|
|
INTEGER, INTENT (IN) :: gid
|
|
INTEGER :: group
|
|
INTEGER :: msglen
|
|
INTEGER :: taskid, ierr
|
|
|
|
msglen = size(msg)
|
|
|
|
#if defined(__MPI)
|
|
|
|
group = gid
|
|
!
|
|
CALL mpi_comm_rank( group, taskid, ierr)
|
|
IF( ierr /= 0 ) CALL mp_stop( 8063 )
|
|
|
|
IF( taskid == root ) THEN
|
|
IF( msglen > size(res) ) CALL mp_stop( 8064 )
|
|
END IF
|
|
!
|
|
CALL reduce_base_real_to( msglen, msg, res, group, root )
|
|
!
|
|
|
|
#else
|
|
res = msg
|
|
#endif
|
|
|
|
END SUBROUTINE mp_sum_rmm
|
|
|
|
|
|
!
|
|
!------------------------------------------------------------------------------!
|
|
|
|
|
|
SUBROUTINE mp_sum_rt( msg, gid )
|
|
IMPLICIT NONE
|
|
REAL (DP), INTENT (INOUT) :: msg(:,:,:)
|
|
INTEGER, INTENT(IN) :: gid
|
|
#if defined(__MPI)
|
|
INTEGER :: msglen
|
|
msglen = size(msg)
|
|
CALL reduce_base_real( msglen, msg, gid, -1 )
|
|
#endif
|
|
END SUBROUTINE mp_sum_rt
|
|
|
|
!
|
|
!------------------------------------------------------------------------------!
|
|
!
|
|
! Carlo Cavazzoni
|
|
!
|
|
SUBROUTINE mp_sum_r4d(msg,gid)
|
|
IMPLICIT NONE
|
|
REAL (DP), INTENT (INOUT) :: msg(:,:,:,:)
|
|
INTEGER, INTENT(IN) :: gid
|
|
#if defined(__MPI)
|
|
INTEGER :: msglen
|
|
msglen = size(msg)
|
|
CALL reduce_base_real( msglen, msg, gid, -1 )
|
|
#endif
|
|
END SUBROUTINE mp_sum_r4d
|
|
|
|
|
|
|
|
!------------------------------------------------------------------------------!
|
|
|
|
SUBROUTINE mp_sum_c1(msg,gid)
|
|
IMPLICIT NONE
|
|
COMPLEX (DP), INTENT (INOUT) :: msg
|
|
INTEGER, INTENT(IN) :: gid
|
|
#if defined(__MPI)
|
|
INTEGER :: msglen
|
|
msglen = 1
|
|
CALL reduce_base_real( 2 * msglen, msg, gid, -1 )
|
|
#endif
|
|
END SUBROUTINE mp_sum_c1
|
|
!
|
|
!------------------------------------------------------------------------------!
|
|
|
|
SUBROUTINE mp_sum_cv(msg,gid)
|
|
IMPLICIT NONE
|
|
COMPLEX (DP), INTENT (INOUT) :: msg(:)
|
|
INTEGER, INTENT(IN) :: gid
|
|
#if defined(__MPI)
|
|
INTEGER :: msglen
|
|
msglen = size(msg)
|
|
CALL reduce_base_real( 2 * msglen, msg, gid, -1 )
|
|
#endif
|
|
END SUBROUTINE mp_sum_cv
|
|
!
|
|
!------------------------------------------------------------------------------!
|
|
|
|
SUBROUTINE mp_sum_cm(msg, gid)
|
|
IMPLICIT NONE
|
|
COMPLEX (DP), INTENT (INOUT) :: msg(:,:)
|
|
INTEGER, INTENT (IN) :: gid
|
|
#if defined(__MPI)
|
|
INTEGER :: msglen
|
|
msglen = size(msg)
        CALL reduce_base_real( 2 * msglen, msg, gid, -1 )
#endif
      END SUBROUTINE mp_sum_cm
      !
      !------------------------------------------------------------------------------!

      SUBROUTINE mp_sum_cm_nc(msg, k1, k2, k3, k4, gid)
        ! for non-contiguous arrays
        IMPLICIT NONE
        COMPLEX (DP), INTENT (INOUT) :: msg(:,:)
        INTEGER, INTENT (IN) :: gid
        INTEGER, INTENT (IN) :: k1, k2, k3, k4
#if defined(__MPI)
        COMPLEX(DP), ALLOCATABLE :: msg_buff(:,:)
        INTEGER :: msglen
        ALLOCATE( msg_buff(k2-k1+1,k4-k3+1) )
        msg_buff(1:k2-k1+1,1:k4-k3+1)=msg(k1:k2,k3:k4)
        msglen = size(msg_buff)
        CALL reduce_base_real( 2 * msglen, msg_buff, gid, -1 )
        msg(k1:k2,k3:k4)=msg_buff(1:k2-k1+1,1:k4-k3+1)
        DEALLOCATE( msg_buff )
#endif
      END SUBROUTINE mp_sum_cm_nc
      !
      !------------------------------------------------------------------------------!

      SUBROUTINE mp_sum_cmm(msg, res, gid)
        IMPLICIT NONE
        COMPLEX (DP), INTENT (IN) :: msg(:,:)
        COMPLEX (DP), INTENT (OUT) :: res(:,:)
        INTEGER, INTENT (IN) :: gid
#if defined(__MPI)
        INTEGER :: msglen
        msglen = size(msg)
        CALL reduce_base_real_to( 2 * msglen, msg, res, gid, -1 )
#else
        res = msg
#endif
      END SUBROUTINE mp_sum_cmm

      !
      !------------------------------------------------------------------------------!
      !
      ! Carlo Cavazzoni
      !
      SUBROUTINE mp_sum_ct(msg,gid)
        IMPLICIT NONE
        COMPLEX (DP), INTENT (INOUT) :: msg(:,:,:)
        INTEGER, INTENT(IN) :: gid
#if defined(__MPI)
        INTEGER :: msglen
        msglen = SIZE(msg)
        CALL reduce_base_real( 2 * msglen, msg, gid, -1 )
#endif
      END SUBROUTINE mp_sum_ct

      !
      !------------------------------------------------------------------------------!
      !
      ! Carlo Cavazzoni
      !
      SUBROUTINE mp_sum_c4d(msg,gid)
        IMPLICIT NONE
        COMPLEX (DP), INTENT (INOUT) :: msg(:,:,:,:)
        INTEGER, INTENT(IN) :: gid
#if defined(__MPI)
        INTEGER :: msglen
        msglen = size(msg)
        CALL reduce_base_real( 2 * msglen, msg, gid, -1 )
#endif
      END SUBROUTINE mp_sum_c4d
      !
      !------------------------------------------------------------------------------!
      !
      ! Carlo Cavazzoni
      !
      SUBROUTINE mp_sum_c5d(msg,gid)
        IMPLICIT NONE
        COMPLEX (DP), INTENT (INOUT) :: msg(:,:,:,:,:)
        INTEGER, INTENT(IN) :: gid
#if defined(__MPI)
        INTEGER :: msglen
        msglen = size(msg)
        CALL reduce_base_real( 2 * msglen, msg, gid, -1 )
#endif
      END SUBROUTINE mp_sum_c5d

      !------------------------------------------------------------------------------!
      !
      ! Carlo Cavazzoni
      !
      SUBROUTINE mp_sum_r5d(msg,gid)
        IMPLICIT NONE
        REAL (DP), INTENT (INOUT) :: msg(:,:,:,:,:)
        INTEGER, INTENT(IN) :: gid
#if defined(__MPI)
        INTEGER :: msglen
        msglen = size(msg)
        CALL reduce_base_real( msglen, msg, gid, -1 )
#endif
      END SUBROUTINE mp_sum_r5d

      SUBROUTINE mp_sum_r6d(msg,gid)
        IMPLICIT NONE
        REAL (DP), INTENT (INOUT) :: msg(:,:,:,:,:,:)
        INTEGER, INTENT(IN) :: gid
#if defined(__MPI)
        INTEGER :: msglen
        msglen = size(msg)
        CALL reduce_base_real( msglen, msg, gid, -1 )
#endif
      END SUBROUTINE mp_sum_r6d

      !
      !------------------------------------------------------------------------------!
      !
      ! Carlo Cavazzoni
      !
      SUBROUTINE mp_sum_c6d(msg,gid)
        IMPLICIT NONE
        COMPLEX (DP), INTENT (INOUT) :: msg(:,:,:,:,:,:)
        INTEGER, INTENT(IN) :: gid
#if defined(__MPI)
        INTEGER :: msglen
        msglen = size(msg)
        CALL reduce_base_real( 2 * msglen, msg, gid, -1 )
#endif
      END SUBROUTINE mp_sum_c6d

      !------------------------------------------------------------------------------!
      SUBROUTINE mp_max_i(msg,gid)
        IMPLICIT NONE
        INTEGER, INTENT (INOUT) :: msg
        INTEGER, INTENT(IN) :: gid
#if defined(__MPI)
        INTEGER :: msglen
        msglen = 1
        CALL parallel_max_integer( msglen, msg, gid, -1 )
#endif
      END SUBROUTINE mp_max_i
      !
      !------------------------------------------------------------------------------!
      !
      !..mp_max_iv
      !..Carlo Cavazzoni
      !
      SUBROUTINE mp_max_iv(msg,gid)
        IMPLICIT NONE
        INTEGER, INTENT (INOUT) :: msg(:)
        INTEGER, INTENT(IN) :: gid
#if defined(__MPI)
        INTEGER :: msglen
        msglen = size(msg)
        CALL parallel_max_integer( msglen, msg, gid, -1 )
#endif
      END SUBROUTINE mp_max_iv
      !
      !----------------------------------------------------------------------

      SUBROUTINE mp_max_r(msg,gid)
        IMPLICIT NONE
        REAL (DP), INTENT (INOUT) :: msg
        INTEGER, INTENT(IN) :: gid
#if defined(__MPI)
        INTEGER :: msglen
        msglen = 1
        CALL parallel_max_real( msglen, msg, gid, -1 )
#endif
      END SUBROUTINE mp_max_r
      !
      !------------------------------------------------------------------------------!
      SUBROUTINE mp_max_rv(msg,gid)
        IMPLICIT NONE
        REAL (DP), INTENT (INOUT) :: msg(:)
        INTEGER, INTENT(IN) :: gid
#if defined(__MPI)
        INTEGER :: msglen
        msglen = size(msg)
        CALL parallel_max_real( msglen, msg, gid, -1 )
#endif
      END SUBROUTINE mp_max_rv
      !------------------------------------------------------------------------------!
      SUBROUTINE mp_min_i(msg,gid)
        IMPLICIT NONE
        INTEGER, INTENT (INOUT) :: msg
        INTEGER, INTENT(IN) :: gid
#if defined(__MPI)
        INTEGER :: msglen
        msglen = 1
        CALL parallel_min_integer( msglen, msg, gid, -1 )
#endif
      END SUBROUTINE mp_min_i
      !------------------------------------------------------------------------------!
      SUBROUTINE mp_min_iv(msg,gid)
        IMPLICIT NONE
        INTEGER, INTENT (INOUT) :: msg(:)
        INTEGER, INTENT(IN) :: gid
#if defined(__MPI)
        INTEGER :: msglen
        msglen = SIZE(msg)
        CALL parallel_min_integer( msglen, msg, gid, -1 )
#endif
      END SUBROUTINE mp_min_iv
      !------------------------------------------------------------------------------!
      SUBROUTINE mp_min_r(msg,gid)
        IMPLICIT NONE
        REAL (DP), INTENT (INOUT) :: msg
        INTEGER, INTENT(IN) :: gid
#if defined(__MPI)
        INTEGER :: msglen
        msglen = 1
        CALL parallel_min_real( msglen, msg, gid, -1 )
#endif
      END SUBROUTINE mp_min_r
      !
      !------------------------------------------------------------------------------!
      SUBROUTINE mp_min_rv(msg,gid)
        IMPLICIT NONE
        REAL (DP), INTENT (INOUT) :: msg(:)
        INTEGER, INTENT(IN) :: gid
#if defined(__MPI)
        INTEGER :: msglen
        msglen = size(msg)
        CALL parallel_min_real( msglen, msg, gid, -1 )
#endif
      END SUBROUTINE mp_min_rv

      !------------------------------------------------------------------------------!

      SUBROUTINE mp_barrier(gid)
        USE parallel_include
        IMPLICIT NONE
        INTEGER, INTENT(IN) :: gid
        INTEGER :: ierr
#if defined(__MPI)
        CALL MPI_BARRIER(gid,IERR)
        IF (ierr/=0) CALL mp_stop( 8066 )
#endif
      END SUBROUTINE mp_barrier

      !------------------------------------------------------------------------------!
      !.. Carlo Cavazzoni
      !..mp_rank
      FUNCTION mp_rank( comm )
        USE parallel_include
        IMPLICIT NONE
        INTEGER :: mp_rank
        INTEGER, INTENT(IN) :: comm
        INTEGER :: ierr, taskid

        ierr = 0
        taskid = 0
#if defined(__MPI)
        CALL mpi_comm_rank(comm,taskid,ierr)
        IF (ierr/=0) CALL mp_stop( 8067 )
#endif
        mp_rank = taskid
      END FUNCTION mp_rank

      !------------------------------------------------------------------------------!
      !.. Carlo Cavazzoni
      !..mp_size
      FUNCTION mp_size( comm )
        USE parallel_include
        IMPLICIT NONE
        INTEGER :: mp_size
        INTEGER, INTENT(IN) :: comm
        INTEGER :: ierr, numtask

        ierr = 0
        numtask = 1
#if defined(__MPI)
        CALL mpi_comm_size(comm,numtask,ierr)
        IF (ierr/=0) CALL mp_stop( 8068 )
#endif
        mp_size = numtask
      END FUNCTION mp_size

      SUBROUTINE mp_report
        INTEGER :: i
        WRITE( stdout, *)
#if defined(__MPI)
# if defined (__MP_STAT)
        WRITE( stdout, 20 )
20      FORMAT(3X,'please use an MPI profiler to analyze communications ')
# endif
#else
        WRITE( stdout, *)
#endif
        RETURN
      END SUBROUTINE mp_report

      !------------------------------------------------------------------------------!
      !..mp_gatherv_rv
      !..Carlo Cavazzoni

      SUBROUTINE mp_gatherv_rv( mydata, alldata, recvcount, displs, root, gid)
        USE parallel_include
        IMPLICIT NONE
        REAL(DP) :: mydata(:)
        REAL(DP) :: alldata(:)
        INTEGER, INTENT(IN) :: recvcount(:), displs(:), root
        INTEGER, INTENT(IN) :: gid
        INTEGER :: group
        INTEGER :: ierr, npe, myid

#if defined (__MPI)
        group = gid
        CALL mpi_comm_size( group, npe, ierr )
        IF (ierr/=0) CALL mp_stop( 8069 )
        CALL mpi_comm_rank( group, myid, ierr )
        IF (ierr/=0) CALL mp_stop( 8070 )
        !
        IF ( SIZE( recvcount ) < npe .OR. SIZE( displs ) < npe ) CALL mp_stop( 8071 )
        IF ( myid == root ) THEN
           IF ( SIZE( alldata ) < displs( npe ) + recvcount( npe ) ) CALL mp_stop( 8072 )
        END IF
        IF ( SIZE( mydata ) < recvcount( myid + 1 ) ) CALL mp_stop( 8073 )
        !
        CALL MPI_GATHERV( mydata, recvcount( myid + 1 ), MPI_DOUBLE_PRECISION, &
                          alldata, recvcount, displs, MPI_DOUBLE_PRECISION, root, group, ierr )
        IF (ierr/=0) CALL mp_stop( 8074 )
#else
        IF ( SIZE( alldata ) < recvcount( 1 ) ) CALL mp_stop( 8075 )
        IF ( SIZE( mydata ) < recvcount( 1 ) ) CALL mp_stop( 8076 )
        !
        alldata( 1:recvcount( 1 ) ) = mydata( 1:recvcount( 1 ) )
#endif
        RETURN
      END SUBROUTINE mp_gatherv_rv

      !------------------------------------------------------------------------------!
      !..mp_gatherv_cv
      !..Carlo Cavazzoni

      SUBROUTINE mp_gatherv_cv( mydata, alldata, recvcount, displs, root, gid)
        USE parallel_include
        IMPLICIT NONE
        COMPLEX(DP) :: mydata(:)
        COMPLEX(DP) :: alldata(:)
        INTEGER, INTENT(IN) :: recvcount(:), displs(:), root
        INTEGER, INTENT(IN) :: gid
        INTEGER :: group
        INTEGER :: ierr, npe, myid

#if defined (__MPI)
        group = gid
        CALL mpi_comm_size( group, npe, ierr )
        IF (ierr/=0) CALL mp_stop( 8069 )
        CALL mpi_comm_rank( group, myid, ierr )
        IF (ierr/=0) CALL mp_stop( 8070 )
        !
        IF ( SIZE( recvcount ) < npe .OR. SIZE( displs ) < npe ) CALL mp_stop( 8071 )
        IF ( myid == root ) THEN
           IF ( SIZE( alldata ) < displs( npe ) + recvcount( npe ) ) CALL mp_stop( 8072 )
        END IF
        IF ( SIZE( mydata ) < recvcount( myid + 1 ) ) CALL mp_stop( 8073 )
        !
        CALL MPI_GATHERV( mydata, recvcount( myid + 1 ), MPI_DOUBLE_COMPLEX, &
                          alldata, recvcount, displs, MPI_DOUBLE_COMPLEX, root, group, ierr )
        IF (ierr/=0) CALL mp_stop( 8074 )
#else
        IF ( SIZE( alldata ) < recvcount( 1 ) ) CALL mp_stop( 8075 )
        IF ( SIZE( mydata ) < recvcount( 1 ) ) CALL mp_stop( 8076 )
        !
        alldata( 1:recvcount( 1 ) ) = mydata( 1:recvcount( 1 ) )
#endif
        RETURN
      END SUBROUTINE mp_gatherv_cv

      !------------------------------------------------------------------------------!
      !..mp_gatherv_iv
      !..Carlo Cavazzoni

      SUBROUTINE mp_gatherv_iv( mydata, alldata, recvcount, displs, root, gid)
        USE parallel_include
        IMPLICIT NONE
        INTEGER :: mydata(:)
        INTEGER :: alldata(:)
        INTEGER, INTENT(IN) :: recvcount(:), displs(:), root
        INTEGER, INTENT(IN) :: gid
        INTEGER :: group
        INTEGER :: ierr, npe, myid

#if defined (__MPI)
        group = gid
        CALL mpi_comm_size( group, npe, ierr )
        IF (ierr/=0) CALL mp_stop( 8069 )
        CALL mpi_comm_rank( group, myid, ierr )
        IF (ierr/=0) CALL mp_stop( 8070 )
        !
        IF ( SIZE( recvcount ) < npe .OR. SIZE( displs ) < npe ) CALL mp_stop( 8071 )
        IF ( myid == root ) THEN
           IF ( SIZE( alldata ) < displs( npe ) + recvcount( npe ) ) CALL mp_stop( 8072 )
        END IF
        IF ( SIZE( mydata ) < recvcount( myid + 1 ) ) CALL mp_stop( 8073 )
        !
        CALL MPI_GATHERV( mydata, recvcount( myid + 1 ), MPI_INTEGER, &
                          alldata, recvcount, displs, MPI_INTEGER, root, group, ierr )
        IF (ierr/=0) CALL mp_stop( 8074 )
#else
        IF ( SIZE( alldata ) < recvcount( 1 ) ) CALL mp_stop( 8075 )
        IF ( SIZE( mydata ) < recvcount( 1 ) ) CALL mp_stop( 8076 )
        !
        alldata( 1:recvcount( 1 ) ) = mydata( 1:recvcount( 1 ) )
#endif
        RETURN
      END SUBROUTINE mp_gatherv_iv

      !------------------------------------------------------------------------------!
      !..mp_gatherv_rm
      !..Carlo Cavazzoni

      SUBROUTINE mp_gatherv_rm( mydata, alldata, recvcount, displs, root, gid)
        USE parallel_include
        IMPLICIT NONE
        REAL(DP) :: mydata(:,:) ! Warning: the first dimension is assumed constant across tasks!
        REAL(DP) :: alldata(:,:)
        INTEGER, INTENT(IN) :: recvcount(:), displs(:), root
        INTEGER, INTENT(IN) :: gid
        INTEGER :: group
        INTEGER :: ierr, npe, myid, nsiz
        INTEGER, ALLOCATABLE :: nrecv(:), ndisp(:)

#if defined (__MPI)
        group = gid
        CALL mpi_comm_size( group, npe, ierr )
        IF (ierr/=0) CALL mp_stop( 8069 )
        CALL mpi_comm_rank( group, myid, ierr )
        IF (ierr/=0) CALL mp_stop( 8070 )
        !
        IF ( SIZE( recvcount ) < npe .OR. SIZE( displs ) < npe ) CALL mp_stop( 8071 )
        IF ( myid == root ) THEN
           IF ( SIZE( alldata, 2 ) < displs( npe ) + recvcount( npe ) ) CALL mp_stop( 8072 )
           IF ( SIZE( alldata, 1 ) /= SIZE( mydata, 1 ) ) CALL mp_stop( 8072 )
        END IF
        IF ( SIZE( mydata, 2 ) < recvcount( myid + 1 ) ) CALL mp_stop( 8073 )
        !
        ALLOCATE( nrecv( npe ), ndisp( npe ) )
        !
        ! counts and displacements are given in columns; rescale them to
        ! elements using the (constant) first dimension
        nrecv( 1:npe ) = recvcount( 1:npe ) * SIZE( mydata, 1 )
        ndisp( 1:npe ) = displs( 1:npe ) * SIZE( mydata, 1 )
        !
        CALL MPI_GATHERV( mydata, nrecv( myid + 1 ), MPI_DOUBLE_PRECISION, &
                          alldata, nrecv, ndisp, MPI_DOUBLE_PRECISION, root, group, ierr )
        IF (ierr/=0) CALL mp_stop( 8074 )
        !
        DEALLOCATE( nrecv, ndisp )
        !
#else
        IF ( SIZE( alldata, 1 ) /= SIZE( mydata, 1 ) ) CALL mp_stop( 8075 )
        IF ( SIZE( alldata, 2 ) < recvcount( 1 ) ) CALL mp_stop( 8075 )
        IF ( SIZE( mydata, 2 ) < recvcount( 1 ) ) CALL mp_stop( 8076 )
        !
        alldata( :, 1:recvcount( 1 ) ) = mydata( :, 1:recvcount( 1 ) )
#endif
        RETURN
      END SUBROUTINE mp_gatherv_rm

      !------------------------------------------------------------------------------!
      !..mp_gatherv_im
      !..Carlo Cavazzoni

      SUBROUTINE mp_gatherv_im( mydata, alldata, recvcount, displs, root, gid)
        USE parallel_include
        IMPLICIT NONE
        INTEGER :: mydata(:,:) ! Warning: the first dimension is assumed constant across tasks!
        INTEGER :: alldata(:,:)
        INTEGER, INTENT(IN) :: recvcount(:), displs(:), root
        INTEGER, INTENT(IN) :: gid
        INTEGER :: group
        INTEGER :: ierr, npe, myid, nsiz
        INTEGER, ALLOCATABLE :: nrecv(:), ndisp(:)

#if defined (__MPI)
        group = gid
        CALL mpi_comm_size( group, npe, ierr )
        IF (ierr/=0) CALL mp_stop( 8069 )
        CALL mpi_comm_rank( group, myid, ierr )
        IF (ierr/=0) CALL mp_stop( 8070 )
        !
        IF ( SIZE( recvcount ) < npe .OR. SIZE( displs ) < npe ) CALL mp_stop( 8071 )
        IF ( myid == root ) THEN
           IF ( SIZE( alldata, 2 ) < displs( npe ) + recvcount( npe ) ) CALL mp_stop( 8072 )
           IF ( SIZE( alldata, 1 ) /= SIZE( mydata, 1 ) ) CALL mp_stop( 8072 )
        END IF
        IF ( SIZE( mydata, 2 ) < recvcount( myid + 1 ) ) CALL mp_stop( 8073 )
        !
        ALLOCATE( nrecv( npe ), ndisp( npe ) )
        !
        ! counts and displacements are given in columns; rescale them to
        ! elements using the (constant) first dimension
        nrecv( 1:npe ) = recvcount( 1:npe ) * SIZE( mydata, 1 )
        ndisp( 1:npe ) = displs( 1:npe ) * SIZE( mydata, 1 )
        !
        CALL MPI_GATHERV( mydata, nrecv( myid + 1 ), MPI_INTEGER, &
                          alldata, nrecv, ndisp, MPI_INTEGER, root, group, ierr )
        IF (ierr/=0) CALL mp_stop( 8074 )
        !
        DEALLOCATE( nrecv, ndisp )
        !
#else
        IF ( SIZE( alldata, 1 ) /= SIZE( mydata, 1 ) ) CALL mp_stop( 8075 )
        IF ( SIZE( alldata, 2 ) < recvcount( 1 ) ) CALL mp_stop( 8075 )
        IF ( SIZE( mydata, 2 ) < recvcount( 1 ) ) CALL mp_stop( 8076 )
        !
        alldata( :, 1:recvcount( 1 ) ) = mydata( :, 1:recvcount( 1 ) )
#endif
        RETURN
      END SUBROUTINE mp_gatherv_im

      !------------------------------------------------------------------------------!
      !..mp_gatherv_inplace_cplx_array
      !..Ye Luo

      SUBROUTINE mp_gatherv_inplace_cplx_array(alldata, my_column_type, recvcount, displs, root, gid)
        USE parallel_include
        IMPLICIT NONE
        COMPLEX(DP) :: alldata(:,:)
        INTEGER, INTENT(IN) :: my_column_type
        INTEGER, INTENT(IN) :: recvcount(:), displs(:)
        INTEGER, INTENT(IN) :: root, gid
        INTEGER :: ierr, npe, myid

#if defined (__MPI)
        CALL mpi_comm_size( gid, npe, ierr )
        IF (ierr/=0) CALL mp_stop( 8069 )
        CALL mpi_comm_rank( gid, myid, ierr )
        IF (ierr/=0) CALL mp_stop( 8070 )
        !
        IF ( SIZE( recvcount ) < npe .OR. SIZE( displs ) < npe ) CALL mp_stop( 8071 )
        !
        IF (myid==root) THEN
           CALL MPI_GATHERV( MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, &
                             alldata, recvcount, displs, my_column_type, root, gid, ierr )
        ELSE
           CALL MPI_GATHERV( alldata(1,displs(myid+1)+1), recvcount(myid+1), my_column_type, &
                             MPI_IN_PLACE, recvcount, displs, MPI_DATATYPE_NULL, root, gid, ierr )
        ENDIF
        IF (ierr/=0) CALL mp_stop( 8074 )
#endif
        RETURN
      END SUBROUTINE mp_gatherv_inplace_cplx_array
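
      !
      ! Usage sketch for the in-place gather above. The routine below is
      ! hypothetical (not part of the module's API) and only illustrates how a
      ! full-column datatype built by mp_type_create_cplx_column_section
      ! (defined later in this module) combines with
      ! mp_gatherv_inplace_cplx_array; here recvcount and displs are assumed
      ! to be given in units of columns.
      !
      SUBROUTINE mp_example_gather_columns( alldata, recvcount, displs, root, gid )
        IMPLICIT NONE
        COMPLEX(DP) :: alldata(:,:)
        INTEGER, INTENT(IN) :: recvcount(:), displs(:), root, gid
        INTEGER :: coltype
        ! build a datatype spanning one full column of alldata
        ! (start=0, length=stride=number of rows)
        CALL mp_type_create_cplx_column_section( alldata(1,1), 0, &
             SIZE(alldata,1), SIZE(alldata,1), coltype )
        ! each task sends its own block of columns; the root receives in place
        CALL mp_gatherv_inplace_cplx_array( alldata, coltype, recvcount, &
             displs, root, gid )
        CALL mp_type_free( coltype )
      END SUBROUTINE mp_example_gather_columns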

      !------------------------------------------------------------------------------!
      !..mp_allgatherv_inplace_cplx_array
      !..Ye Luo

      SUBROUTINE mp_allgatherv_inplace_cplx_array(alldata, my_element_type, recvcount, displs, gid)
        USE parallel_include
        IMPLICIT NONE
        COMPLEX(DP) :: alldata(:,:)
        INTEGER, INTENT(IN) :: my_element_type
        INTEGER, INTENT(IN) :: recvcount(:), displs(:)
        INTEGER, INTENT(IN) :: gid
        INTEGER :: ierr, npe, myid

#if defined (__MPI)
        CALL mpi_comm_size( gid, npe, ierr )
        IF (ierr/=0) CALL mp_stop( 8069 )
        CALL mpi_comm_rank( gid, myid, ierr )
        IF (ierr/=0) CALL mp_stop( 8070 )
        !
        IF ( SIZE( recvcount ) < npe .OR. SIZE( displs ) < npe ) CALL mp_stop( 8071 )
        !
        CALL MPI_ALLGATHERV( MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, &
                             alldata, recvcount, displs, my_element_type, gid, ierr )
        IF (ierr/=0) CALL mp_stop( 8074 )
#endif
        RETURN
      END SUBROUTINE mp_allgatherv_inplace_cplx_array

      !.. SdG added 16/08/19
      SUBROUTINE mp_allgatherv_inplace_real_array(alldata, my_element_type, recvcount, displs, gid)
        USE parallel_include
        IMPLICIT NONE
        REAL(DP) :: alldata(:,:)
        INTEGER, INTENT(IN) :: my_element_type
        INTEGER, INTENT(IN) :: recvcount(:), displs(:)
        INTEGER, INTENT(IN) :: gid
        INTEGER :: ierr, npe, myid

#if defined (__MPI)
        CALL mpi_comm_size( gid, npe, ierr )
        IF (ierr/=0) CALL mp_stop( 8069 )
        CALL mpi_comm_rank( gid, myid, ierr )
        IF (ierr/=0) CALL mp_stop( 8070 )
        !
        IF ( SIZE( recvcount ) < npe .OR. SIZE( displs ) < npe ) CALL mp_stop( 8071 )
        !
        CALL MPI_ALLGATHERV( MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, &
                             alldata, recvcount, displs, my_element_type, gid, ierr )
        IF (ierr/=0) CALL mp_stop( 8074 )
#endif
        RETURN
      END SUBROUTINE mp_allgatherv_inplace_real_array

      !------------------------------------------------------------------------------!

      SUBROUTINE mp_set_displs( recvcount, displs, ntot, nproc )
        ! Given the number of elements on each processor (recvcount), this
        ! subroutine sets the correct offsets (displs) to collect them on a
        ! single array with contiguous elements
        IMPLICIT NONE
        INTEGER, INTENT(IN) :: recvcount(:) ! number of elements on each processor
        INTEGER, INTENT(OUT) :: displs(:)   ! offsets/displacements
        INTEGER, INTENT(OUT) :: ntot
        INTEGER, INTENT(IN) :: nproc
        INTEGER :: i

        displs( 1 ) = 0
        !
#if defined (__MPI)
        IF( nproc < 1 ) CALL mp_stop( 8090 )
        DO i = 2, nproc
           displs( i ) = displs( i - 1 ) + recvcount( i - 1 )
        END DO
        ntot = displs( nproc ) + recvcount( nproc )
#else
        ntot = recvcount( 1 )
#endif
        RETURN
      END SUBROUTINE mp_set_displs
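
      !
      ! Usage sketch for mp_set_displs. The routine below is hypothetical,
      ! written only to illustrate the typical pattern: every task publishes
      ! the length of its local chunk, the offsets are derived with
      ! mp_set_displs, and the chunks are collected contiguously on task 0
      ! with mp_gatherv_rv.
      !
      SUBROUTINE mp_example_gatherv( mylen, mydata, gid )
        IMPLICIT NONE
        INTEGER, INTENT(IN) :: mylen       ! number of local elements
        REAL(DP) :: mydata(:)              ! local data, at least mylen elements
        INTEGER, INTENT(IN) :: gid         ! communicator
        INTEGER :: nproc, ntot
        INTEGER, ALLOCATABLE :: recvcount(:), displs(:)
        REAL(DP), ALLOCATABLE :: alldata(:)
        !
        nproc = mp_size( gid )
        ALLOCATE( recvcount( nproc ), displs( nproc ) )
        ! collect the local lengths from all tasks
        recvcount = 0
        recvcount( mp_rank( gid ) + 1 ) = mylen
        CALL mp_sum( recvcount, gid )
        ! offsets for a contiguous receive buffer of ntot elements
        CALL mp_set_displs( recvcount, displs, ntot, nproc )
        ALLOCATE( alldata( ntot ) )
        CALL mp_gatherv_rv( mydata(1:mylen), alldata, recvcount, displs, 0, gid )
        ! ... use alldata on task 0 ...
        DEALLOCATE( alldata, recvcount, displs )
      END SUBROUTINE mp_example_gatherv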

      !------------------------------------------------------------------------------!

      SUBROUTINE mp_alltoall_c3d( sndbuf, rcvbuf, gid )
        USE parallel_include
        IMPLICIT NONE
        COMPLEX(DP) :: sndbuf( :, :, : )
        COMPLEX(DP) :: rcvbuf( :, :, : )
        INTEGER, INTENT(IN) :: gid
        INTEGER :: nsiz, group, ierr, npe

#if defined (__MPI)
        group = gid
        CALL mpi_comm_size( group, npe, ierr )
        IF (ierr/=0) CALL mp_stop( 8069 )

        IF ( SIZE( sndbuf, 3 ) < npe ) CALL mp_stop( 8069 )
        IF ( SIZE( rcvbuf, 3 ) < npe ) CALL mp_stop( 8069 )

        nsiz = SIZE( sndbuf, 1 ) * SIZE( sndbuf, 2 )

        CALL MPI_ALLTOALL( sndbuf, nsiz, MPI_DOUBLE_COMPLEX, &
                           rcvbuf, nsiz, MPI_DOUBLE_COMPLEX, group, ierr )

        IF (ierr/=0) CALL mp_stop( 8074 )
#else
        rcvbuf = sndbuf
#endif
        RETURN
      END SUBROUTINE mp_alltoall_c3d

      !------------------------------------------------------------------------------!

      SUBROUTINE mp_alltoall_i3d( sndbuf, rcvbuf, gid )
        USE parallel_include
        IMPLICIT NONE
        INTEGER :: sndbuf( :, :, : )
        INTEGER :: rcvbuf( :, :, : )
        INTEGER, INTENT(IN) :: gid
        INTEGER :: nsiz, group, ierr, npe

#if defined (__MPI)
        group = gid
        CALL mpi_comm_size( group, npe, ierr )
        IF (ierr/=0) CALL mp_stop( 8069 )

        IF ( SIZE( sndbuf, 3 ) < npe ) CALL mp_stop( 8069 )
        IF ( SIZE( rcvbuf, 3 ) < npe ) CALL mp_stop( 8069 )

        nsiz = SIZE( sndbuf, 1 ) * SIZE( sndbuf, 2 )

        CALL MPI_ALLTOALL( sndbuf, nsiz, MPI_INTEGER, &
                           rcvbuf, nsiz, MPI_INTEGER, group, ierr )

        IF (ierr/=0) CALL mp_stop( 8074 )
#else
        rcvbuf = sndbuf
#endif
        RETURN
      END SUBROUTINE mp_alltoall_i3d

      SUBROUTINE mp_circular_shift_left_i0( buf, itag, gid )
        USE parallel_include
        IMPLICIT NONE
        INTEGER :: buf
        INTEGER, INTENT(IN) :: itag
        INTEGER, INTENT(IN) :: gid
        INTEGER :: nsiz, group, ierr, npe, sour, dest, mype

#if defined (__MPI)
        INTEGER :: istatus( mpi_status_size )
        !
        group = gid
        !
        CALL mpi_comm_size( group, npe, ierr )
        IF (ierr/=0) CALL mp_stop( 8100 )
        CALL mpi_comm_rank( group, mype, ierr )
        IF (ierr/=0) CALL mp_stop( 8101 )
        !
        sour = mype + 1
        IF( sour == npe ) sour = 0
        dest = mype - 1
        IF( dest == -1 ) dest = npe - 1
        !
        CALL MPI_Sendrecv_replace( buf, 1, MPI_INTEGER, &
             dest, itag, sour, itag, group, istatus, ierr)
        !
        IF (ierr/=0) CALL mp_stop( 8102 )
        !
#else
        ! do nothing
#endif
        RETURN
      END SUBROUTINE mp_circular_shift_left_i0

      SUBROUTINE mp_circular_shift_left_i1( buf, itag, gid )
        USE parallel_include
        IMPLICIT NONE
        INTEGER :: buf(:)
        INTEGER, INTENT(IN) :: itag
        INTEGER, INTENT(IN) :: gid
        INTEGER :: nsiz, group, ierr, npe, sour, dest, mype

#if defined (__MPI)
        INTEGER :: istatus( mpi_status_size )
        !
        group = gid
        !
        CALL mpi_comm_size( group, npe, ierr )
        IF (ierr/=0) CALL mp_stop( 8100 )
        CALL mpi_comm_rank( group, mype, ierr )
        IF (ierr/=0) CALL mp_stop( 8101 )
        !
        sour = mype + 1
        IF( sour == npe ) sour = 0
        dest = mype - 1
        IF( dest == -1 ) dest = npe - 1
        !
        CALL MPI_Sendrecv_replace( buf, SIZE(buf), MPI_INTEGER, &
             dest, itag, sour, itag, group, istatus, ierr)
        !
        IF (ierr/=0) CALL mp_stop( 8102 )
        !
#else
        ! do nothing
#endif
        RETURN
      END SUBROUTINE mp_circular_shift_left_i1

      SUBROUTINE mp_circular_shift_left_i2( buf, itag, gid )
        USE parallel_include
        IMPLICIT NONE
        INTEGER :: buf(:,:)
        INTEGER, INTENT(IN) :: itag
        INTEGER, INTENT(IN) :: gid
        INTEGER :: nsiz, group, ierr, npe, sour, dest, mype

#if defined (__MPI)
        INTEGER :: istatus( mpi_status_size )
        !
        group = gid
        !
        CALL mpi_comm_size( group, npe, ierr )
        IF (ierr/=0) CALL mp_stop( 8100 )
        CALL mpi_comm_rank( group, mype, ierr )
        IF (ierr/=0) CALL mp_stop( 8101 )
        !
        sour = mype + 1
        IF( sour == npe ) sour = 0
        dest = mype - 1
        IF( dest == -1 ) dest = npe - 1
        !
        CALL MPI_Sendrecv_replace( buf, SIZE(buf), MPI_INTEGER, &
             dest, itag, sour, itag, group, istatus, ierr)
        !
        IF (ierr/=0) CALL mp_stop( 8102 )
        !
#else
        ! do nothing
#endif
        RETURN
      END SUBROUTINE mp_circular_shift_left_i2

      SUBROUTINE mp_circular_shift_left_r2d( buf, itag, gid )
        USE parallel_include
        IMPLICIT NONE
        REAL(DP) :: buf( :, : )
        INTEGER, INTENT(IN) :: itag
        INTEGER, INTENT(IN) :: gid
        INTEGER :: nsiz, group, ierr, npe, sour, dest, mype

#if defined (__MPI)
        INTEGER :: istatus( mpi_status_size )
        !
        group = gid
        !
        CALL mpi_comm_size( group, npe, ierr )
        IF (ierr/=0) CALL mp_stop( 8100 )
        CALL mpi_comm_rank( group, mype, ierr )
        IF (ierr/=0) CALL mp_stop( 8101 )
        !
        sour = mype + 1
        IF( sour == npe ) sour = 0
        dest = mype - 1
        IF( dest == -1 ) dest = npe - 1
        !
        CALL MPI_Sendrecv_replace( buf, SIZE(buf), MPI_DOUBLE_PRECISION, &
             dest, itag, sour, itag, group, istatus, ierr)
        !
        IF (ierr/=0) CALL mp_stop( 8102 )
        !
#else
        ! do nothing
#endif
        RETURN
      END SUBROUTINE mp_circular_shift_left_r2d

      SUBROUTINE mp_circular_shift_left_c2d( buf, itag, gid )
        USE parallel_include
        IMPLICIT NONE
        COMPLEX(DP) :: buf( :, : )
        INTEGER, INTENT(IN) :: itag
        INTEGER, INTENT(IN) :: gid
        INTEGER :: nsiz, group, ierr, npe, sour, dest, mype

#if defined (__MPI)
        INTEGER :: istatus( mpi_status_size )
        !
        group = gid
        !
        CALL mpi_comm_size( group, npe, ierr )
        IF (ierr/=0) CALL mp_stop( 8100 )
        CALL mpi_comm_rank( group, mype, ierr )
        IF (ierr/=0) CALL mp_stop( 8101 )
        !
        sour = mype + 1
        IF( sour == npe ) sour = 0
        dest = mype - 1
        IF( dest == -1 ) dest = npe - 1
        !
        CALL MPI_Sendrecv_replace( buf, SIZE(buf), MPI_DOUBLE_COMPLEX, &
             dest, itag, sour, itag, group, istatus, ierr)
        !
        IF (ierr/=0) CALL mp_stop( 8102 )
        !
#else
        ! do nothing
#endif
        RETURN
      END SUBROUTINE mp_circular_shift_left_c2d

      !------------------------------------------------------------------------------!
      !..mp_circular_shift_left_start
      SUBROUTINE mp_circular_shift_left_start_i0( sendbuf, recvbuf, itag, gid, requests)
        USE parallel_include
        IMPLICIT NONE
        INTEGER :: sendbuf, recvbuf
        INTEGER, INTENT(IN) :: itag
        INTEGER, INTENT(IN) :: gid
        INTEGER, INTENT(INOUT) :: requests(2)
        INTEGER :: nsiz, group, ierr, npe, sour, dest, mype

#if defined (__MPI)
        !set null requests
        requests = mpi_request_null

        !communicator
        group = gid
        !
        CALL mpi_comm_size( group, npe, ierr )
        IF (ierr/=0) CALL mp_stop( 8100 )
        CALL mpi_comm_rank( group, mype, ierr )
        IF (ierr/=0) CALL mp_stop( 8101 )
        !
        sour = modulo(mype + 1, npe)
        dest = modulo(mype + npe - 1, npe)
        !
        CALL MPI_Irecv( recvbuf, 1, MPI_INTEGER, &
             sour, itag, group, requests(1), ierr)
        !
        IF (ierr/=0) CALL mp_stop( 8102 )
        !
        CALL MPI_Isend( sendbuf, 1, MPI_INTEGER, &
             dest, itag, group, requests(2), ierr)
        !
        IF (ierr/=0) CALL mp_stop( 8103 )
        !
#else
        recvbuf = sendbuf
#endif
        RETURN
      END SUBROUTINE mp_circular_shift_left_start_i0

      SUBROUTINE mp_circular_shift_left_start_i1( sendbuf, recvbuf, itag, gid, requests)
        USE parallel_include
        IMPLICIT NONE
        INTEGER :: sendbuf( : ), recvbuf( : )
        INTEGER, INTENT(IN) :: itag
        INTEGER, INTENT(IN) :: gid
        INTEGER, INTENT(INOUT) :: requests(2)
        INTEGER :: nsiz, group, ierr, npe, sour, dest, mype

#if defined (__MPI)
        !set null requests
        requests = mpi_request_null

        !communicator
        group = gid
        !
        IF( size(sendbuf)/=size(recvbuf) ) CALL mp_stop(8099)
        CALL mpi_comm_size( group, npe, ierr )
        IF (ierr/=0) CALL mp_stop( 8100 )
        CALL mpi_comm_rank( group, mype, ierr )
        IF (ierr/=0) CALL mp_stop( 8101 )
        !
        sour = modulo(mype + 1, npe)
        dest = modulo(mype + npe - 1, npe)
        !
        CALL MPI_Irecv( recvbuf, size(recvbuf), MPI_INTEGER, &
             sour, itag, group, requests(1), ierr)
        !
        IF (ierr/=0) CALL mp_stop( 8102 )
        !
        CALL MPI_Isend( sendbuf, size(sendbuf), MPI_INTEGER, &
             dest, itag, group, requests(2), ierr)
        !
        IF (ierr/=0) CALL mp_stop( 8103 )
        !
#else
        recvbuf = sendbuf
#endif
        RETURN
      END SUBROUTINE mp_circular_shift_left_start_i1

      SUBROUTINE mp_circular_shift_left_start_i2( sendbuf, recvbuf, itag, gid, requests)
        USE parallel_include
        IMPLICIT NONE
        INTEGER :: sendbuf( :, : ), recvbuf( :, : )
        INTEGER, INTENT(IN) :: itag
        INTEGER, INTENT(IN) :: gid
        INTEGER, INTENT(INOUT) :: requests(2)
        INTEGER :: nsiz, group, ierr, npe, sour, dest, mype

#if defined (__MPI)
        !set null requests
        requests = mpi_request_null

        !communicator
        group = gid
        !
        IF( size(sendbuf)/=size(recvbuf) ) CALL mp_stop(8099)
        CALL mpi_comm_size( group, npe, ierr )
        IF (ierr/=0) CALL mp_stop( 8100 )
        CALL mpi_comm_rank( group, mype, ierr )
        IF (ierr/=0) CALL mp_stop( 8101 )
        !
        sour = modulo(mype + 1, npe)
        dest = modulo(mype + npe - 1, npe)
        !
        CALL MPI_Irecv( recvbuf, size(recvbuf), MPI_INTEGER, &
             sour, itag, group, requests(1), ierr)
        !
        IF (ierr/=0) CALL mp_stop( 8102 )
        !
        CALL MPI_Isend( sendbuf, size(sendbuf), MPI_INTEGER, &
             dest, itag, group, requests(2), ierr)
        !
        IF (ierr/=0) CALL mp_stop( 8103 )
        !
#else
        recvbuf = sendbuf
#endif
        RETURN
      END SUBROUTINE mp_circular_shift_left_start_i2

      SUBROUTINE mp_circular_shift_left_start_r2d( sendbuf, recvbuf, itag, gid, requests)
        USE parallel_include
        IMPLICIT NONE
        REAL(DP) :: sendbuf( :, : ), recvbuf( :, : )
        INTEGER, INTENT(IN) :: itag
        INTEGER, INTENT(IN) :: gid
        INTEGER, INTENT(INOUT) :: requests(2)
        INTEGER :: nsiz, group, ierr, npe, sour, dest, mype

#if defined (__MPI)
        !set null requests
        requests = mpi_request_null

        !communicator
        group = gid
        !
        IF( size(sendbuf)/=size(recvbuf) ) CALL mp_stop(8099)
        CALL mpi_comm_size( group, npe, ierr )
        IF (ierr/=0) CALL mp_stop( 8100 )
        CALL mpi_comm_rank( group, mype, ierr )
        IF (ierr/=0) CALL mp_stop( 8101 )
        !
        sour = modulo(mype + 1, npe)
        dest = modulo(mype + npe - 1, npe)
        !
        CALL MPI_Irecv( recvbuf, size(recvbuf), MPI_DOUBLE_PRECISION, &
             sour, itag, group, requests(1), ierr)
        !
        IF (ierr/=0) CALL mp_stop( 8102 )
        !
        CALL MPI_Isend( sendbuf, size(sendbuf), MPI_DOUBLE_PRECISION, &
             dest, itag, group, requests(2), ierr)
        !
        IF (ierr/=0) CALL mp_stop( 8103 )
        !
#else
        recvbuf = sendbuf
#endif
        RETURN
      END SUBROUTINE mp_circular_shift_left_start_r2d

      SUBROUTINE mp_circular_shift_left_start_c2d( sendbuf, recvbuf, itag, gid, requests)
        USE parallel_include
        IMPLICIT NONE
        COMPLEX(DP) :: sendbuf( :, : ), recvbuf( :, : )
        INTEGER, INTENT(IN) :: itag
        INTEGER, INTENT(IN) :: gid
        INTEGER, INTENT(INOUT) :: requests(2)
        INTEGER :: nsiz, group, ierr, npe, sour, dest, mype

#if defined (__MPI)
        !set null requests
        requests = mpi_request_null

        !communicator
        group = gid
        !
        IF( size(sendbuf)/=size(recvbuf) ) CALL mp_stop(8099)
        CALL mpi_comm_size( group, npe, ierr )
        IF (ierr/=0) CALL mp_stop( 8100 )
        CALL mpi_comm_rank( group, mype, ierr )
        IF (ierr/=0) CALL mp_stop( 8101 )
        !
        sour = modulo(mype + 1, npe)
        dest = modulo(mype + npe - 1, npe)
        !
        CALL MPI_Irecv( recvbuf, size(recvbuf), MPI_DOUBLE_COMPLEX, &
             sour, itag, group, requests(1), ierr)
        !
        IF (ierr/=0) CALL mp_stop( 8102 )
        !
        CALL MPI_Isend( sendbuf, size(sendbuf), MPI_DOUBLE_COMPLEX, &
             dest, itag, group, requests(2), ierr)
        !
        IF (ierr/=0) CALL mp_stop( 8103 )
        !
#else
        recvbuf = sendbuf
#endif
        RETURN
      END SUBROUTINE mp_circular_shift_left_start_c2d
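
      !
      ! Usage sketch for the non-blocking shifts above. The routine below is
      ! hypothetical and assumes mp_waitall (listed among this module's public
      ! procedures) accepts the two-element request array: post the exchange,
      ! overlap it with work that does not touch the buffers, then block on
      ! both requests.
      !
      SUBROUTINE mp_example_shift_overlap( sendbuf, recvbuf, gid )
        IMPLICIT NONE
        REAL(DP) :: sendbuf(:,:), recvbuf(:,:)
        INTEGER, INTENT(IN) :: gid
        INTEGER :: requests(2)
        ! start the circular shift (tag 1 chosen arbitrarily)
        CALL mp_circular_shift_left_start( sendbuf, recvbuf, 1, gid, requests )
        ! ... computation independent of sendbuf/recvbuf goes here ...
        ! wait for both the receive and the send to complete
        CALL mp_waitall( requests )
      END SUBROUTINE mp_example_shift_overlap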
      !
      !
      !------------------------------------------------------------------------------!
      !..mp_count_nodes
      SUBROUTINE mp_count_nodes(num_nodes, color, key, group)
        !
        ! ... This routine counts the number of nodes using
        ! ... MPI_GET_PROCESSOR_NAME in the group specified by `group`.
        ! ... It returns colors and keys to be used in MPI_COMM_SPLIT.
        ! ... When running in parallel, the evaluation of color and key
        ! ... is done by all processors.
        ! ...
        ! ... input:
        ! ...    group      Communicator used to count nodes.
        !
        ! ... output:
        ! ...    num_nodes  Number of unique nodes in the communicator
        ! ...    color      Integer (positive), same for all processes residing on a node.
        ! ...    key        Integer, unique number identifying each process on the same node.
        ! ...
        USE parallel_include
        IMPLICIT NONE
        INTEGER, INTENT (OUT) :: num_nodes
        INTEGER, INTENT (OUT) :: color
        INTEGER, INTENT (OUT) :: key
        INTEGER, INTENT (IN)  :: group
#if defined (__MPI)
        CHARACTER(len=MPI_MAX_PROCESSOR_NAME) :: hostname
        CHARACTER(len=MPI_MAX_PROCESSOR_NAME), ALLOCATABLE :: host_list(:)
#endif

        LOGICAL, ALLOCATABLE :: found_list(:)
        INTEGER, ALLOCATABLE :: color_list(:)
        INTEGER, ALLOCATABLE :: key_list(:)
        !
        INTEGER :: hostname_len, max_hostname_len, numtask, me, ierr
        !
        ! Loop variables
        INTEGER :: i, j, e, s, c, k
        ! ...
        ierr      = 0
        num_nodes = 1
        color     = 1
        key       = 0
        !
#if defined(__MPI)
        !
        CALL MPI_GET_PROCESSOR_NAME(hostname, hostname_len, ierr)
        IF (ierr/=0) CALL mp_stop( 8103 )

        ! find total number of ranks and my rank in communicator
        CALL MPI_COMM_SIZE(group, numtask, ierr)
        IF (ierr/=0) CALL mp_stop( 8104 )
        !
        CALL MPI_COMM_RANK(group, me, ierr)
        IF (ierr/=0) CALL mp_stop( 8105 )
        !
        ALLOCATE(host_list(0:numtask-1))
        !
        host_list(me) = hostname(1:hostname_len)
        !
        ! Each process broadcasts its name to the others
        DO i=0,numtask-1
           CALL MPI_BCAST(host_list(i), MPI_MAX_PROCESSOR_NAME, MPI_CHARACTER,&
                          i, group, ierr)
           IF (ierr/=0) CALL mp_stop( 8106 )
        END DO
        !
        ! Simple algorithm to count unique entries.
        !
        ALLOCATE(found_list(0:numtask-1),color_list(0:numtask-1))
        ALLOCATE(key_list(0:numtask-1))
        found_list(:) = .false.
        color_list(:) = -1
        key_list(:)   = -1
        !
        ! c is the counter for colors
        ! k is the counter for keys
        !
        c = 0
        DO i=0,numtask-1
           ! if found_list(i) is .true., this element has already been
           ! assigned to a node, so skip it.
           IF (found_list(i)) CYCLE
           ! else increment color counter and reset key counter
           c = c + 1; k = 0
           color_list(i) = c
           key_list(i)   = k
           !
           DO j=i+1,numtask-1
              !
              IF ( LLE(host_list(i),host_list(j)) .and. &
                   LGE(host_list(i),host_list(j)) ) THEN
                 ! increment the key, key=0 is the one we are comparing to
                 k = k + 1
                 ! the element must not have been found already
                 IF ( found_list(j) ) CALL mp_stop( 8107 )
                 found_list(j) = .true.
                 color_list(j) = c
                 key_list(j)   = k
              END IF
           END DO
        END DO
        ! Sanity checks
        IF ( MINVAL(color_list) < 0 ) CALL mp_stop( 8108 )
        IF ( MINVAL(key_list)   < 0 ) CALL mp_stop( 8109 )
        !
        color     = color_list(me)
        key       = key_list(me)
        num_nodes = MAXVAL(color_list)
        DEALLOCATE(host_list,found_list,color_list,key_list)
        !
#endif
        RETURN
      END SUBROUTINE mp_count_nodes
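
      !
      ! Usage sketch (hypothetical routine, not part of the API): turn the
      ! color/key pair returned by mp_count_nodes into one communicator per
      ! node by calling MPI_COMM_SPLIT directly; tasks with the same color
      ! (i.e. on the same node) end up in the same communicator, ordered by
      ! key. The error code 8110 is chosen for this example only.
      !
      SUBROUTINE mp_example_node_comm( parent_comm, node_comm )
        USE parallel_include
        IMPLICIT NONE
        INTEGER, INTENT(IN)  :: parent_comm
        INTEGER, INTENT(OUT) :: node_comm
        INTEGER :: num_nodes, color, key, ierr
        !
        node_comm = parent_comm
        CALL mp_count_nodes( num_nodes, color, key, parent_comm )
#if defined(__MPI)
        CALL MPI_COMM_SPLIT( parent_comm, color, key, node_comm, ierr )
        IF (ierr/=0) CALL mp_stop( 8110 )
#endif
      END SUBROUTINE mp_example_node_comm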
      !
      FUNCTION mp_get_comm_null( )
        USE parallel_include
        IMPLICIT NONE
        INTEGER :: mp_get_comm_null
        mp_get_comm_null = MPI_COMM_NULL
      END FUNCTION mp_get_comm_null

      FUNCTION mp_get_comm_self( )
        USE parallel_include
        IMPLICIT NONE
        INTEGER :: mp_get_comm_self
        mp_get_comm_self = MPI_COMM_SELF
      END FUNCTION mp_get_comm_self

      SUBROUTINE mp_type_create_cplx_column_section(dummy, start, length, stride, mytype)
        USE parallel_include
        IMPLICIT NONE
        !
        COMPLEX (DP), INTENT(IN) :: dummy
        INTEGER, INTENT(IN) :: start, length, stride
        INTEGER, INTENT(OUT) :: mytype
        !
#if defined(__MPI)
        INTEGER :: ierr
        !
        CALL MPI_TYPE_CREATE_SUBARRAY(1, [stride], [length], [start], &
                                      MPI_ORDER_FORTRAN, MPI_DOUBLE_COMPLEX, mytype, ierr)
        IF (ierr/=0) CALL mp_stop( 8081 )
        CALL MPI_Type_commit(mytype, ierr)
        IF (ierr/=0) CALL mp_stop( 8082 )
#else
        mytype = 0;
#endif
        !
        RETURN
      END SUBROUTINE mp_type_create_cplx_column_section

      SUBROUTINE mp_type_create_real_column_section(dummy, start, length, stride, mytype)
        USE parallel_include
        IMPLICIT NONE
        !
        REAL (DP), INTENT(IN) :: dummy
        INTEGER, INTENT(IN) :: start, length, stride
        INTEGER, INTENT(OUT) :: mytype
        !
#if defined(__MPI)
        INTEGER :: ierr
        !
        CALL MPI_TYPE_CREATE_SUBARRAY(1, [stride], [length], [start], &
                                      MPI_ORDER_FORTRAN, MPI_DOUBLE_PRECISION, mytype, ierr)
        IF (ierr/=0) CALL mp_stop( 8083 )
        CALL MPI_Type_commit(mytype, ierr)
        IF (ierr/=0) CALL mp_stop( 8084 )
#else
        mytype = 0;
#endif
        !
        RETURN
      END SUBROUTINE mp_type_create_real_column_section

      SUBROUTINE mp_type_create_cplx_row_section(dummy, column_start, column_stride, row_length, mytype)
        USE parallel_include
        IMPLICIT NONE
        !
        COMPLEX (DP), INTENT(IN) :: dummy
        INTEGER, INTENT(IN) :: column_start, column_stride, row_length
        INTEGER, INTENT(OUT) :: mytype
        !
#if defined(__MPI)
        INTEGER :: ierr, temporary
        INTEGER :: strides(2), lengths(2), starts(2)
        INTEGER(KIND=MPI_ADDRESS_KIND) :: lb, extent
        !
        strides(1) = column_stride ; strides(2) = row_length
        lengths(1) = 1 ; lengths(2) = row_length
        starts(1)  = column_start ; starts(2) = 0
        CALL MPI_TYPE_CREATE_SUBARRAY(2, strides, lengths, starts, MPI_ORDER_FORTRAN,&
                                      MPI_DOUBLE_COMPLEX, temporary, ierr)
        IF (ierr/=0) CALL mp_stop( 8085 )
        CALL MPI_TYPE_GET_EXTENT(MPI_DOUBLE_COMPLEX, lb, extent, ierr)
        IF (ierr/=0) CALL mp_stop( 8085 )
        CALL MPI_TYPE_COMMIT(temporary, ierr)
        IF (ierr/=0) CALL mp_stop( 8085 )
        CALL MPI_TYPE_CREATE_RESIZED(temporary, lb, extent, mytype, ierr)
        IF (ierr/=0) CALL mp_stop( 8086 )
        CALL MPI_Type_commit(mytype, ierr)
        IF (ierr/=0) CALL mp_stop( 8086 )
#else
        mytype = 0;
#endif
        !
        RETURN
      END SUBROUTINE mp_type_create_cplx_row_section

      SUBROUTINE mp_type_create_real_row_section(dummy, column_start, column_stride, row_length, mytype)
        USE parallel_include
        IMPLICIT NONE
        !
        REAL (DP), INTENT(IN) :: dummy
        INTEGER, INTENT(IN) :: column_start, column_stride, row_length
        INTEGER, INTENT(OUT) :: mytype
        !
#if defined(__MPI)
        INTEGER :: ierr, temporary
        INTEGER :: strides(2), lengths(2), starts(2)
        INTEGER(KIND=MPI_ADDRESS_KIND) :: lb, extent
        !
        strides(1) = column_stride ; strides(2) = row_length
        lengths(1) = 1 ; lengths(2) = row_length
        starts(1)  = column_start ; starts(2) = 0
        CALL MPI_TYPE_CREATE_SUBARRAY(2, strides, lengths, starts, MPI_ORDER_FORTRAN,&
                                      MPI_DOUBLE_PRECISION, temporary, ierr)
        IF (ierr/=0) CALL mp_stop( 8087 )
        CALL MPI_TYPE_GET_EXTENT(MPI_DOUBLE_PRECISION, lb, extent, ierr)
        IF (ierr/=0) CALL mp_stop( 8087 )
        CALL MPI_TYPE_COMMIT(temporary, ierr)
        IF (ierr/=0) CALL mp_stop( 8087 )
        CALL MPI_TYPE_CREATE_RESIZED(temporary, lb, extent, mytype, ierr)
        IF (ierr/=0) CALL mp_stop( 8088 )
        CALL MPI_Type_commit(mytype, ierr)
        IF (ierr/=0) CALL mp_stop( 8088 )
#else
        mytype = 0;
#endif
        !
        RETURN
      END SUBROUTINE mp_type_create_real_row_section

      SUBROUTINE mp_type_free(mytype)
        USE parallel_include
        IMPLICIT NONE
        INTEGER :: mytype, ierr
        !
#if defined(__MPI)
        CALL MPI_TYPE_FREE(mytype, ierr)
        IF (ierr/=0) CALL mp_stop( 8089 )
#endif
        !
        RETURN
      END SUBROUTINE mp_type_free
      !------------------------------------------------------------------------------!
      ! GPU specific subroutines (Pietro Bonfa')
      !------------------------------------------------------------------------------!
      ! Before hacking on the CUDA part remember that:
      !
      ! 1. all mp_* interfaces should be blocking with respect to both MPI and CUDA.
      !    MPI will only wait for completion on the default stream, therefore device
      !    synchronization must be enforced.
      ! 2. Host -> device memory copies of a memory block of 64 KB or less are
      !    asynchronous in the sense that they may return before the data is actually
      !    available on the GPU. However, the user is still free to change the buffer
      !    as soon as those calls return with no ill effects.
      !    (https://devtalk.nvidia.com/default/topic/471866/cuda-programming-and-performance/host-device-memory-copies-up-to-64-kb-are-asynchronous/)
      ! 3. For transfers from device to either pageable or pinned host memory,
      !    the function returns only once the copy has completed.
      ! 4. GPU synchronization is always enforced even if no communication takes place.
      !------------------------------------------------------------------------------!
#ifdef __CUDA
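
      !
      ! Minimal sketch of the staging pattern described in the notes above
      ! (hypothetical routine, for illustration only): when __GPU_MPI is not
      ! available, the device buffer is mirrored on the host, the host
      ! interface performs the reduction, and the result is copied back. The
      ! sourced allocation and the final call to cudaDeviceSynchronize provide
      ! the CUDA-blocking guarantee required by rule 1.
      !
      SUBROUTINE mp_example_sum_rv_gpu( vec_d, gid )
        IMPLICIT NONE
        REAL(DP), DEVICE :: vec_d(:)
        INTEGER, INTENT(IN) :: gid
        REAL(DP), ALLOCATABLE :: vec_h(:)
        INTEGER :: ierr
        ALLOCATE( vec_h, source=vec_d )   ! device -> host copy, syncs the device
        CALL mp_sum( vec_h, gid )         ! host-side collective
        vec_d = vec_h                     ! host -> device copy
        DEALLOCATE( vec_h )
        ierr = cudaDeviceSynchronize()    ! enforce blocking semantics
      END SUBROUTINE mp_example_sum_rv_gpu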

      !------------------------------------------------------------------------------!
      !..mp_bcast

      SUBROUTINE mp_bcast_i1_gpu(msg_d,source,gid)
        IMPLICIT NONE
        INTEGER, DEVICE :: msg_d
        INTEGER :: msg_h
        INTEGER :: source
        INTEGER, INTENT(IN) :: gid
        INTEGER :: group
        INTEGER :: msglen, ierr
        !
#if defined(__MPI)
        msglen = 1
        group = gid
#if defined(__GPU_MPI)
        ierr = cudaDeviceSynchronize()  ! This syncs __GPU_MPI case
        CALL bcast_integer_gpu( msg_d, msglen, source, group )
        RETURN ! Sync done by MPI call (or inside bcast_xxx_gpu)
#else
        msg_h = msg_d                   ! This syncs __MPI case
        CALL bcast_integer( msg_h, msglen, source, group )
        msg_d = msg_h
#endif
#endif
        ierr = cudaDeviceSynchronize()  ! This syncs SERIAL, __MPI
      END SUBROUTINE mp_bcast_i1_gpu
      !
      !------------------------------------------------------------------------------!
      !
      SUBROUTINE mp_bcast_iv_gpu(msg_d,source,gid)
        IMPLICIT NONE
        INTEGER, DEVICE :: msg_d(:)
        INTEGER, ALLOCATABLE :: msg_h(:)
        INTEGER, INTENT(IN) :: source
        INTEGER, INTENT(IN) :: gid
        INTEGER :: msglen, ierr
        !
#if defined(__MPI)
#if defined(__GPU_MPI)
        msglen = size(msg_d)
        ierr = cudaDeviceSynchronize()  ! This syncs __GPU_MPI case
        CALL bcast_integer_gpu( msg_d, msglen, source, gid )
        RETURN ! Sync done by MPI call (or inside bcast_xxx_gpu)
#else
        ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case
        msglen = size(msg_h)
        CALL bcast_integer( msg_h, msglen, source, gid )
        msg_d = msg_h; DEALLOCATE( msg_h )
#endif
#endif
        ierr = cudaDeviceSynchronize()  ! This syncs SERIAL, __MPI
      END SUBROUTINE mp_bcast_iv_gpu
      !
      !------------------------------------------------------------------------------!
      !
      SUBROUTINE mp_bcast_im_gpu( msg_d, source, gid )
        IMPLICIT NONE
        INTEGER, DEVICE :: msg_d(:,:)
        INTEGER, ALLOCATABLE :: msg_h(:,:)
        INTEGER, INTENT(IN) :: source
        INTEGER, INTENT(IN) :: gid
        INTEGER :: msglen, ierr
#if defined(__MPI)
#if defined(__GPU_MPI)
        msglen = size(msg_d)
        ierr = cudaDeviceSynchronize()  ! This syncs __GPU_MPI case
        CALL bcast_integer_gpu( msg_d, msglen, source, gid )
        RETURN ! Sync done by MPI call (or inside bcast_xxx_gpu)
#else
        ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case
        msglen = size(msg_h)
        CALL bcast_integer( msg_h, msglen, source, gid )
        msg_d = msg_h; DEALLOCATE( msg_h )
#endif
#endif
        ierr = cudaDeviceSynchronize()  ! This syncs SERIAL, __MPI
      END SUBROUTINE mp_bcast_im_gpu
      !
      !------------------------------------------------------------------------------!
      !
      SUBROUTINE mp_bcast_it_gpu( msg_d, source, gid )
        IMPLICIT NONE
        INTEGER, DEVICE :: msg_d(:,:,:)

        INTEGER, INTENT(IN) :: source
        INTEGER, INTENT(IN) :: gid
        INTEGER :: msglen, ierr
#if defined(__MPI)
#if defined(__GPU_MPI)
        msglen = size(msg_d)
        ierr = cudaDeviceSynchronize()  ! This syncs __GPU_MPI case
        CALL bcast_integer_gpu( msg_d, msglen, source, gid )
        RETURN ! Sync done by MPI call (or inside bcast_xxx_gpu)
#else
        INTEGER, ALLOCATABLE :: msg_h(:,:,:)
        ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case
        msglen = size(msg_h)
        CALL bcast_integer( msg_h, msglen, source, gid )
        msg_d = msg_h; DEALLOCATE( msg_h )
#endif
#endif
        ierr = cudaDeviceSynchronize()  ! This syncs SERIAL, __MPI
      END SUBROUTINE mp_bcast_it_gpu
      !
      !------------------------------------------------------------------------------!
      !
      SUBROUTINE mp_bcast_i4d_gpu(msg_d, source, gid)
        IMPLICIT NONE
        INTEGER, DEVICE :: msg_d(:,:,:,:)
        INTEGER, ALLOCATABLE :: msg_h(:,:,:,:)
        INTEGER, INTENT(IN) :: source
        INTEGER, INTENT(IN) :: gid
        INTEGER :: msglen, ierr
#if defined(__MPI)
#if defined(__GPU_MPI)
        msglen = size(msg_d)
        ierr = cudaDeviceSynchronize()  ! This syncs __GPU_MPI case
        CALL bcast_integer_gpu( msg_d, msglen, source, gid )
        RETURN ! Sync done by MPI call (or inside bcast_xxx_gpu)
#else
        ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case
        msglen = size(msg_h)
        CALL bcast_integer( msg_h, msglen, source, gid )
        msg_d = msg_h; DEALLOCATE( msg_h )
#endif
#endif
        ierr = cudaDeviceSynchronize()  ! This syncs SERIAL, __MPI
      END SUBROUTINE mp_bcast_i4d_gpu
      !
      !------------------------------------------------------------------------------!
      !
      SUBROUTINE mp_bcast_r1_gpu( msg_d, source, gid )
        IMPLICIT NONE
        REAL (DP), DEVICE :: msg_d
        REAL (DP) :: msg_h
        INTEGER, INTENT(IN) :: source
        INTEGER, INTENT(IN) :: gid
        INTEGER :: msglen, ierr
#if defined(__MPI)
        msglen = 1
#if defined(__GPU_MPI)
        ierr = cudaDeviceSynchronize()  ! This syncs __GPU_MPI case
        CALL bcast_real_gpu( msg_d, msglen, source, gid )
        RETURN ! Sync done by MPI call (or inside bcast_xxx_gpu)
#else
        msg_h = msg_d                   ! This syncs __MPI case
        CALL bcast_real( msg_h, msglen, source, gid )
        msg_d = msg_h
#endif
#endif
        ierr = cudaDeviceSynchronize()  ! This syncs SERIAL, __MPI
      END SUBROUTINE mp_bcast_r1_gpu
      !
      !------------------------------------------------------------------------------!
      !
      SUBROUTINE mp_bcast_rv_gpu(msg_d,source,gid)
        IMPLICIT NONE
        REAL (DP), DEVICE :: msg_d(:)
        REAL (DP), ALLOCATABLE :: msg_h(:)
        INTEGER, INTENT(IN) :: source
        INTEGER, INTENT(IN) :: gid
        INTEGER :: msglen, ierr
#if defined(__MPI)
#if defined(__GPU_MPI)
        msglen = size(msg_d)
        ierr = cudaDeviceSynchronize()  ! This syncs __GPU_MPI case
        CALL bcast_real_gpu( msg_d, msglen, source, gid )
        RETURN ! Sync done by MPI call (or inside bcast_xxx_gpu)
#else
        ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case
        msglen = size(msg_h)
        CALL bcast_real( msg_h, msglen, source, gid )
        msg_d = msg_h ; DEALLOCATE(msg_h)
#endif
#endif
        ierr = cudaDeviceSynchronize()  ! This syncs SERIAL, __MPI
      END SUBROUTINE mp_bcast_rv_gpu
      !
      !------------------------------------------------------------------------------!
      !
      SUBROUTINE mp_bcast_rm_gpu(msg_d,source,gid)
        IMPLICIT NONE
        REAL (DP), DEVICE :: msg_d(:,:)
        INTEGER, INTENT(IN) :: source
        INTEGER, INTENT(IN) :: gid
        INTEGER :: msglen, ierr
#if defined(__MPI)
#if defined(__GPU_MPI)
        msglen = size(msg_d)
        ierr = cudaDeviceSynchronize()  ! This syncs __GPU_MPI case
        CALL bcast_real_gpu( msg_d, msglen, source, gid )
        RETURN ! Sync done by MPI call (or inside bcast_xxx_gpu)
#else
        REAL (DP), ALLOCATABLE :: msg_h(:,:)
        ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case
        msglen = size(msg_h)
        CALL bcast_real( msg_h, msglen, source, gid )
        msg_d = msg_h ; DEALLOCATE(msg_h)
#endif
#endif
        ierr = cudaDeviceSynchronize()  ! This syncs SERIAL, __MPI
      END SUBROUTINE mp_bcast_rm_gpu
      !
      !------------------------------------------------------------------------------!
      !
      SUBROUTINE mp_bcast_rt_gpu(msg_d,source,gid)
        IMPLICIT NONE
        REAL (DP), DEVICE :: msg_d(:,:,:)
        REAL (DP), ALLOCATABLE :: msg_h(:,:,:)
        INTEGER, INTENT(IN) :: source
        INTEGER, INTENT(IN) :: gid
        INTEGER :: msglen, ierr
#if defined(__MPI)
#if defined(__GPU_MPI)
        msglen = size(msg_d)
        ierr = cudaDeviceSynchronize()  ! This syncs __GPU_MPI case
        CALL bcast_real_gpu( msg_d, msglen, source, gid )
        RETURN ! Sync done by MPI call (or inside bcast_xxx_gpu)
#else
        ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case
        msglen = size(msg_h)
        CALL bcast_real( msg_h, msglen, source, gid )
        msg_d = msg_h ; DEALLOCATE(msg_h)
#endif
#endif
        ierr = cudaDeviceSynchronize()  ! This syncs SERIAL, __MPI
      END SUBROUTINE mp_bcast_rt_gpu
      !
      !------------------------------------------------------------------------------!
      !
      SUBROUTINE mp_bcast_r4d_gpu(msg_d, source, gid)
        IMPLICIT NONE
        REAL (DP), DEVICE :: msg_d(:,:,:,:)
        REAL (DP), ALLOCATABLE :: msg_h(:,:,:,:)
        INTEGER, INTENT(IN) :: source
        INTEGER, INTENT(IN) :: gid
        INTEGER :: msglen, ierr
#if defined(__MPI)
#if defined(__GPU_MPI)
        msglen = size(msg_d)
        ierr = cudaDeviceSynchronize()  ! This syncs __GPU_MPI case
        CALL bcast_real_gpu( msg_d, msglen, source, gid )
        RETURN ! Sync done by MPI call (or inside bcast_xxx_gpu)
#else
        ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case
        msglen = size(msg_h)
        CALL bcast_real( msg_h, msglen, source, gid )
        msg_d = msg_h ; DEALLOCATE(msg_h)
#endif
#endif
        ierr = cudaDeviceSynchronize()  ! This syncs SERIAL, __MPI
      END SUBROUTINE mp_bcast_r4d_gpu
      !
      !------------------------------------------------------------------------------!
      !
      SUBROUTINE mp_bcast_r5d_gpu(msg_d, source, gid)
        IMPLICIT NONE
        REAL (DP), DEVICE :: msg_d(:,:,:,:,:)
        REAL (DP), ALLOCATABLE :: msg_h(:,:,:,:,:)
        INTEGER, INTENT(IN) :: source
        INTEGER, INTENT(IN) :: gid
        INTEGER :: msglen, ierr
#if defined(__MPI)
#if defined(__GPU_MPI)
        msglen = size(msg_d)
        ierr = cudaDeviceSynchronize()  ! This syncs __GPU_MPI case
        CALL bcast_real_gpu( msg_d, msglen, source, gid )
        RETURN ! Sync done by MPI call (or inside bcast_xxx_gpu)
#else
        ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case
        msglen = size(msg_h)
        CALL bcast_real( msg_h, msglen, source, gid )
        msg_d = msg_h ; DEALLOCATE(msg_h)
#endif
#endif
        ierr = cudaDeviceSynchronize()  ! This syncs SERIAL, __MPI
      END SUBROUTINE mp_bcast_r5d_gpu
      !
      !------------------------------------------------------------------------------!
      !
      SUBROUTINE mp_bcast_c1_gpu(msg_d,source,gid)
        IMPLICIT NONE
        COMPLEX (DP), DEVICE :: msg_d
        COMPLEX (DP) :: msg_h
        INTEGER, INTENT(IN) :: source
        INTEGER, INTENT(IN) :: gid
        INTEGER :: msglen, ierr
#if defined(__MPI)
        msglen = 1
#if defined(__GPU_MPI)
        ierr = cudaDeviceSynchronize()  ! This syncs __GPU_MPI case
        CALL bcast_real_gpu( msg_d, 2 * msglen, source, gid )
        RETURN ! Sync done by MPI call (or inside bcast_xxx_gpu)
#else
        msg_h = msg_d                   ! This syncs __MPI case
        CALL bcast_real( msg_h, 2 * msglen, source, gid )
        msg_d = msg_h
#endif
#endif
        ierr = cudaDeviceSynchronize()  ! This syncs SERIAL, __MPI
      END SUBROUTINE mp_bcast_c1_gpu
      !
      !------------------------------------------------------------------------------!
      !
      SUBROUTINE mp_bcast_cv_gpu(msg_d,source,gid)
        IMPLICIT NONE
        COMPLEX (DP), DEVICE :: msg_d(:)
        COMPLEX (DP), ALLOCATABLE :: msg_h(:)
        INTEGER, INTENT(IN) :: source
        INTEGER, INTENT(IN) :: gid
        INTEGER :: msglen, ierr
#if defined(__MPI)
#if defined(__GPU_MPI)
        msglen = size(msg_d)
        ierr = cudaDeviceSynchronize()  ! This syncs __GPU_MPI case
        CALL bcast_real_gpu( msg_d, 2 * msglen, source, gid )
        RETURN ! Sync done by MPI call (or inside bcast_xxx_gpu)
#else
        ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case
        msglen = size(msg_h)
        CALL bcast_real( msg_h, 2 * msglen, source, gid )
        msg_d = msg_h ; DEALLOCATE(msg_h)
#endif
#endif
        ierr = cudaDeviceSynchronize()  ! This syncs SERIAL, __MPI
      END SUBROUTINE mp_bcast_cv_gpu
      !
      !------------------------------------------------------------------------------!
      !
      SUBROUTINE mp_bcast_cm_gpu(msg_d,source,gid)
        IMPLICIT NONE
        COMPLEX (DP), DEVICE :: msg_d(:,:)
        COMPLEX (DP), ALLOCATABLE :: msg_h(:,:)
        INTEGER, INTENT(IN) :: source
        INTEGER, INTENT(IN) :: gid
        INTEGER :: msglen, ierr
#if defined(__MPI)
#if defined(__GPU_MPI)
        msglen = size(msg_d)
        ierr = cudaDeviceSynchronize()  ! This syncs __GPU_MPI case
        CALL bcast_real_gpu( msg_d, 2 * msglen, source, gid )
        RETURN ! Sync done by MPI call (or inside bcast_xxx_gpu)
#else
        ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case
        msglen = size(msg_h)
        CALL bcast_real( msg_h, 2 * msglen, source, gid )
        msg_d = msg_h ; DEALLOCATE(msg_h)
#endif
#endif
        ierr = cudaDeviceSynchronize()  ! This syncs SERIAL, __MPI
      END SUBROUTINE mp_bcast_cm_gpu
      !
      !------------------------------------------------------------------------------!
      !
      SUBROUTINE mp_bcast_ct_gpu(msg_d,source,gid)
        IMPLICIT NONE
        COMPLEX (DP), DEVICE :: msg_d(:,:,:)
        COMPLEX (DP), ALLOCATABLE :: msg_h(:,:,:)
        INTEGER, INTENT(IN) :: source
        INTEGER, INTENT(IN) :: gid
        INTEGER :: msglen, ierr
#if defined(__MPI)
#if defined(__GPU_MPI)
        msglen = size(msg_d)
        ierr = cudaDeviceSynchronize()  ! This syncs __GPU_MPI case
        CALL bcast_real_gpu( msg_d, 2 * msglen, source, gid )
        RETURN ! Sync done by MPI call (or inside bcast_xxx_gpu)
#else
        ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case
        msglen = size(msg_h)
        CALL bcast_real( msg_h, 2 * msglen, source, gid )
        msg_d = msg_h ; DEALLOCATE(msg_h)
#endif
#endif
        ierr = cudaDeviceSynchronize()  ! This syncs SERIAL, __MPI
      END SUBROUTINE mp_bcast_ct_gpu
      !
      !------------------------------------------------------------------------------!
      !
      SUBROUTINE mp_bcast_c4d_gpu(msg_d,source,gid)
        IMPLICIT NONE
        COMPLEX (DP), DEVICE :: msg_d(:,:,:,:)
        COMPLEX (DP), ALLOCATABLE :: msg_h(:,:,:,:)
        INTEGER, INTENT(IN) :: source
        INTEGER, INTENT(IN) :: gid
        INTEGER :: msglen, ierr
#if defined(__MPI)
#if defined(__GPU_MPI)
        msglen = size(msg_d)
        ierr = cudaDeviceSynchronize()  ! This syncs __GPU_MPI case
        CALL bcast_real_gpu( msg_d, 2 * msglen, source, gid )
        RETURN ! Sync done by MPI call (or inside bcast_xxx_gpu)
#else
        ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case
        msglen = size(msg_h)
        CALL bcast_real( msg_h, 2 * msglen, source, gid )
        msg_d = msg_h ; DEALLOCATE(msg_h)
#endif
#endif
        ierr = cudaDeviceSynchronize()  ! This syncs SERIAL, __MPI
      END SUBROUTINE mp_bcast_c4d_gpu
      !
      !------------------------------------------------------------------------------!
      !
      SUBROUTINE mp_bcast_c5d_gpu(msg_d,source,gid)
        IMPLICIT NONE
        COMPLEX (DP), DEVICE :: msg_d(:,:,:,:,:)
        COMPLEX (DP), ALLOCATABLE :: msg_h(:,:,:,:,:)
        INTEGER, INTENT(IN) :: source
        INTEGER, INTENT(IN) :: gid
        INTEGER :: msglen, ierr
#if defined(__MPI)
#if defined(__GPU_MPI)
        msglen = size(msg_d)
        ierr = cudaDeviceSynchronize()  ! This syncs __GPU_MPI case
        CALL bcast_real_gpu( msg_d, 2 * msglen, source, gid )
        RETURN ! Sync done by MPI call (or inside bcast_xxx_gpu)
#else
        ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case
        msglen = size(msg_h)
        CALL bcast_real( msg_h, 2 * msglen, source, gid )
        msg_d = msg_h ; DEALLOCATE(msg_h)
#endif
#endif
        ierr = cudaDeviceSynchronize()  ! This syncs SERIAL, __MPI
      END SUBROUTINE mp_bcast_c5d_gpu
      !
      !------------------------------------------------------------------------------!
      !
      SUBROUTINE mp_bcast_c6d_gpu(msg_d,source,gid)
        IMPLICIT NONE
        COMPLEX (DP), DEVICE :: msg_d(:,:,:,:,:,:)
        COMPLEX (DP), ALLOCATABLE :: msg_h(:,:,:,:,:,:)
        INTEGER, INTENT(IN) :: source
        INTEGER, INTENT(IN) :: gid
        INTEGER :: msglen, ierr
#if defined(__MPI)
#if defined(__GPU_MPI)
        msglen = size(msg_d)
        ierr = cudaDeviceSynchronize()  ! This syncs __GPU_MPI case
        CALL bcast_real_gpu( msg_d, 2 * msglen, source, gid )
        RETURN ! Sync done by MPI call (or inside bcast_xxx_gpu)
#else
        ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case
        msglen = size(msg_h)
        CALL bcast_real( msg_h, 2 * msglen, source, gid )
        msg_d = msg_h ; DEALLOCATE(msg_h)
#endif
#endif
        ierr = cudaDeviceSynchronize()  ! This syncs SERIAL, __MPI
      END SUBROUTINE mp_bcast_c6d_gpu
      !
      !------------------------------------------------------------------------------!
      !
      SUBROUTINE mp_bcast_l_gpu(msg_d,source,gid)
        IMPLICIT NONE
        LOGICAL, DEVICE :: msg_d
        LOGICAL :: msg_h
        INTEGER, INTENT(IN) :: source
        INTEGER, INTENT(IN) :: gid
        INTEGER :: msglen, ierr
#if defined(__MPI)
        msglen = 1
#if defined(__GPU_MPI)
        ierr = cudaDeviceSynchronize()  ! This syncs __GPU_MPI case
        CALL bcast_logical_gpu( msg_d, msglen, source, gid )
        RETURN ! Sync done by MPI call (or inside bcast_xxx_gpu)
#else
        msg_h = msg_d                   ! This syncs __MPI case
        CALL bcast_logical( msg_h, msglen, source, gid )
        msg_d = msg_h
#endif
#endif
        ierr = cudaDeviceSynchronize()  ! This syncs SERIAL, __MPI
      END SUBROUTINE mp_bcast_l_gpu
      !
      !------------------------------------------------------------------------------!
      !
      SUBROUTINE mp_bcast_lv_gpu(msg_d,source,gid)
        IMPLICIT NONE
        LOGICAL, DEVICE :: msg_d(:)
        INTEGER, INTENT(IN) :: source
        INTEGER, INTENT(IN) :: gid
        INTEGER :: msglen, ierr
#if defined(__MPI)
#if defined(__GPU_MPI)
        msglen = size(msg_d)
        ierr = cudaDeviceSynchronize()  ! This syncs __GPU_MPI case
        CALL bcast_logical_gpu( msg_d, msglen, source, gid )
        RETURN ! Sync done by MPI call (or inside bcast_xxx_gpu)
#else
        LOGICAL, ALLOCATABLE :: msg_h(:)
        ALLOCATE(msg_h, source=msg_d)   ! This syncs __MPI case
        msglen = size(msg_h)
        CALL bcast_logical( msg_h, msglen, source, gid )
        msg_d = msg_h; DEALLOCATE(msg_h)
#endif
#endif
        ierr = cudaDeviceSynchronize()  ! This syncs SERIAL, __MPI
      END SUBROUTINE mp_bcast_lv_gpu
      !
      !------------------------------------------------------------------------------!
      !
      SUBROUTINE mp_bcast_lm_gpu(msg_d,source,gid)
        IMPLICIT NONE
        LOGICAL, DEVICE :: msg_d(:,:)
        INTEGER, INTENT(IN) :: source
        INTEGER, INTENT(IN) :: gid
        INTEGER :: msglen, ierr
#if defined(__MPI)
#if defined(__GPU_MPI)
        msglen = size(msg_d)
        ierr = cudaDeviceSynchronize()  ! This syncs __GPU_MPI case
        CALL bcast_logical_gpu( msg_d, msglen, source, gid )
        RETURN ! Sync done by MPI call (or inside bcast_xxx_gpu)
#else
        LOGICAL, ALLOCATABLE :: msg_h(:,:)
        ALLOCATE(msg_h, source=msg_d)   ! This syncs __MPI case
        msglen = size(msg_h)
        CALL bcast_logical( msg_h, msglen, source, gid )
        msg_d = msg_h; DEALLOCATE(msg_h)
#endif
#endif
        ierr = cudaDeviceSynchronize()  ! This syncs SERIAL, __MPI
      END SUBROUTINE mp_bcast_lm_gpu
|
|
!
|
|
!------------------------------------------------------------------------------!
|
|
!
|
|
SUBROUTINE mp_get_i1_gpu(msg_dest_d, msg_sour_d, mpime, dest, sour, ip, gid)
|
|
USE parallel_include
|
|
IMPLICIT NONE
|
|
INTEGER, DEVICE :: msg_dest_d
|
|
INTEGER, INTENT(IN), DEVICE :: msg_sour_d
|
|
INTEGER, INTENT(IN) :: dest, sour, ip, mpime
|
|
INTEGER, INTENT(IN) :: gid
|
|
INTEGER :: group, ierr
|
|
#if defined(__MPI)
|
|
INTEGER :: istatus(MPI_STATUS_SIZE)
|
|
#endif
|
|
INTEGER :: nrcv
|
|
INTEGER :: msglen = 1
|
|
|
|
#if ! defined(__GPU_MPI)
|
|
! Call CPU implementation
|
|
INTEGER :: msg_dest_h, msg_sour_h
|
|
!
|
|
msg_dest_h = msg_dest_d; msg_sour_h = msg_sour_d ! This syncs __MPI case
|
|
CALL mp_get_i1(msg_dest_h, msg_sour_h, mpime, dest, sour, ip, gid)
|
|
msg_dest_d = msg_dest_h
|
|
#else
|
|
|
|
#if defined(__MPI)
|
|
group = gid
|
|
#endif
|
|
|
|
! processors not taking part in the communication have 0 length message
|
|
|
|
msglen = 0
|
|
!
|
|
ierr = cudaDeviceSynchronize() ! This syncs SERIAL and __GPU_MPI
|
|
!
|
|
IF(dest .NE. sour) THEN
|
|
#if defined(__MPI)
|
|
IF(mpime .EQ. sour) THEN
|
|
msglen=1
|
|
CALL MPI_SEND( msg_sour_d, msglen, MPI_INTEGER, dest, ip, group, ierr)
|
|
IF (ierr/=0) CALL mp_stop( -8001 )
|
|
ELSE IF(mpime .EQ. dest) THEN
|
|
msglen=1
|
|
CALL MPI_RECV( msg_dest_d, msglen, MPI_INTEGER, sour, ip, group, istatus, IERR )
|
|
IF (ierr/=0) CALL mp_stop( -8002 )
|
|
CALL MPI_GET_COUNT(istatus, MPI_INTEGER, nrcv, ierr)
|
|
IF (ierr/=0) CALL mp_stop( -8003 )
|
|
END IF
|
|
#endif
|
|
ELSEIF(mpime .EQ. sour)THEN
|
|
msg_dest_d = msg_sour_d
|
|
msglen = 1
|
|
END IF
|
|
|
|
#if defined(__MPI)
|
|
CALL MPI_BARRIER(group, IERR)
|
|
IF (ierr/=0) CALL mp_stop( -8004 )
|
|
#endif
|
|
|
|
#endif
|
|
ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI and __GPU_MPI
|
|
RETURN
|
|
END SUBROUTINE mp_get_i1_gpu
|
|
!
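!
! NOTE: in the mp_get_*_gpu family the message travels from rank "sour"
! to rank "dest" of communicator "gid", with "ip" used as the MPI tag;
! when sour == dest the transfer reduces to a device-side copy, and all
! ranks of the communicator meet the final barrier. Illustrative call
! (caller-side names are hypothetical):
!
!    INTEGER, DEVICE :: na_d, nb_d
!    CALL mp_get( na_d, nb_d, mpime, 0, 1, 100, comm )  ! rank 1 -> rank 0, tag 100
!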
!------------------------------------------------------------------------------!
!
SUBROUTINE mp_get_iv_gpu(msg_dest_d, msg_sour_d, mpime, dest, sour, ip, gid)
USE parallel_include
IMPLICIT NONE
INTEGER, DEVICE :: msg_dest_d(:)
INTEGER, INTENT(IN), DEVICE :: msg_sour_d(:)
INTEGER, INTENT(IN) :: dest, sour, ip, mpime
INTEGER, INTENT(IN) :: gid
INTEGER :: group
#if defined(__MPI)
INTEGER :: istatus(MPI_STATUS_SIZE)
#endif
INTEGER :: ierr, nrcv
INTEGER :: msglen
!
#if ! defined(__GPU_MPI)
INTEGER, ALLOCATABLE :: msg_dest_h(:), msg_sour_h(:)
!
ALLOCATE( msg_dest_h, source=msg_dest_d ); ALLOCATE( msg_sour_h, source=msg_sour_d ); ! This syncs __MPI case
CALL mp_get_iv(msg_dest_h, msg_sour_h, mpime, dest, sour, ip, gid)
msg_dest_d = msg_dest_h
DEALLOCATE(msg_dest_h, msg_sour_h)
#else
#if defined(__MPI)
group = gid
#endif
! processors not taking part in the communication have 0 length message
msglen = 0
!
ierr = cudaDeviceSynchronize() ! This syncs SERIAL and __GPU_MPI
!
IF(sour .NE. dest) THEN
#if defined(__MPI)
IF(mpime .EQ. sour) THEN
msglen = SIZE(msg_sour_d)
CALL MPI_SEND( msg_sour_d, SIZE(msg_sour_d), MPI_INTEGER, dest, ip, group, ierr)
IF (ierr/=0) CALL mp_stop( 9001 )
ELSE IF(mpime .EQ. dest) THEN
CALL MPI_RECV( msg_dest_d, SIZE(msg_dest_d), MPI_INTEGER, sour, ip, group, istatus, IERR )
IF (ierr/=0) CALL mp_stop( 9002 )
CALL MPI_GET_COUNT(istatus, MPI_INTEGER, nrcv, ierr)
IF (ierr/=0) CALL mp_stop( 9003 )
msglen = nrcv
END IF
#endif
ELSEIF(mpime .EQ. sour)THEN
!msg_dest_d(1:SIZE(msg_sour_d)) = msg_sour_d(:)
ierr = cudaMemcpy(msg_dest_d(1) , msg_sour_d(1), SIZE(msg_sour_d), cudaMemcpyDeviceToDevice )
msglen = SIZE(msg_sour_d)
END IF
#if defined(__MPI)
CALL MPI_BARRIER(group, IERR)
IF (ierr/=0) CALL mp_stop( 9004 )
#endif
#endif
ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI and __GPU_MPI
RETURN
END SUBROUTINE mp_get_iv_gpu
!
!------------------------------------------------------------------------------!
!
SUBROUTINE mp_get_r1_gpu(msg_dest_d, msg_sour_d, mpime, dest, sour, ip, gid)
USE parallel_include
IMPLICIT NONE
REAL (DP), DEVICE :: msg_dest_d
REAL (DP), INTENT(IN), DEVICE :: msg_sour_d
INTEGER, INTENT(IN) :: dest, sour, ip, mpime
INTEGER, INTENT(IN) :: gid
INTEGER :: group
#if defined(__MPI)
INTEGER :: istatus(MPI_STATUS_SIZE)
#endif
INTEGER :: ierr, nrcv
INTEGER :: msglen
#if ! defined(__GPU_MPI)
REAL(DP) :: msg_dest_h, msg_sour_h
!
msg_dest_h=msg_dest_d; msg_sour_h=msg_sour_d ! This syncs __MPI case
CALL mp_get_r1(msg_dest_h, msg_sour_h, mpime, dest, sour, ip, gid)
msg_dest_d = msg_dest_h
#else
#if defined(__MPI)
group = gid
#endif
! processors not taking part in the communication have 0 length message
msglen = 0
!
ierr = cudaDeviceSynchronize() ! This syncs SERIAL and __GPU_MPI
!
IF(sour .NE. dest) THEN
#if defined(__MPI)
IF(mpime .EQ. sour) THEN
msglen = 1
CALL MPI_SEND( msg_sour_d, msglen, MPI_DOUBLE_PRECISION, dest, ip, group, ierr)
IF (ierr/=0) CALL mp_stop( 9005 )
ELSE IF(mpime .EQ. dest) THEN
msglen = 1
CALL MPI_RECV( msg_dest_d, msglen, MPI_DOUBLE_PRECISION, sour, ip, group, istatus, IERR )
IF (ierr/=0) CALL mp_stop( 9006 )
CALL MPI_GET_COUNT(istatus, MPI_DOUBLE_PRECISION, nrcv, ierr)
IF (ierr/=0) CALL mp_stop( 9007 )
msglen = nrcv
END IF
#endif
ELSEIF(mpime .EQ. sour)THEN
msg_dest_d = msg_sour_d
msglen = 1
END IF
#if defined(__MPI)
CALL MPI_BARRIER(group, IERR)
IF (ierr/=0) CALL mp_stop( 9008 )
#endif
#endif
ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI and __GPU_MPI
RETURN
END SUBROUTINE mp_get_r1_gpu
!
!------------------------------------------------------------------------------!
!
SUBROUTINE mp_get_rv_gpu(msg_dest_d, msg_sour_d, mpime, dest, sour, ip, gid)
USE parallel_include
IMPLICIT NONE
REAL (DP), DEVICE :: msg_dest_d(:)
REAL (DP), INTENT(IN), DEVICE :: msg_sour_d(:)
INTEGER, INTENT(IN) :: dest, sour, ip, mpime
INTEGER, INTENT(IN) :: gid
INTEGER :: group
#if defined(__MPI)
INTEGER :: istatus(MPI_STATUS_SIZE)
#endif
INTEGER :: ierr, nrcv
INTEGER :: msglen
!
#if ! defined(__GPU_MPI)
REAL (DP), ALLOCATABLE :: msg_dest_h(:), msg_sour_h(:)
!
ALLOCATE( msg_dest_h, source=msg_dest_d ); ALLOCATE( msg_sour_h, source=msg_sour_d ); ! This syncs __MPI case
CALL mp_get_rv(msg_dest_h, msg_sour_h, mpime, dest, sour, ip, gid)
msg_dest_d = msg_dest_h
DEALLOCATE(msg_dest_h, msg_sour_h)
#else
!
#if defined(__MPI)
group = gid
#endif
! processors not taking part in the communication have 0 length message
msglen = 0
!
ierr = cudaDeviceSynchronize() ! This syncs SERIAL and __GPU_MPI
!
IF(sour .NE. dest) THEN
#if defined(__MPI)
IF(mpime .EQ. sour) THEN
msglen = SIZE(msg_sour_d)
CALL MPI_SEND( msg_sour_d, SIZE(msg_sour_d), MPI_DOUBLE_PRECISION, dest, ip, group, ierr)
IF (ierr/=0) CALL mp_stop( 9009 )
ELSE IF(mpime .EQ. dest) THEN
CALL MPI_RECV( msg_dest_d, SIZE(msg_dest_d), MPI_DOUBLE_PRECISION, sour, ip, group, istatus, IERR )
IF (ierr/=0) CALL mp_stop( 9010 )
CALL MPI_GET_COUNT(istatus, MPI_DOUBLE_PRECISION, nrcv, ierr)
IF (ierr/=0) CALL mp_stop( 9011 )
msglen = nrcv
END IF
#endif
ELSEIF(mpime .EQ. sour)THEN
!msg_dest_d(1:SIZE(msg_sour_d)) = msg_sour_d(:)
ierr = cudaMemcpy(msg_dest_d(1) , msg_sour_d(1), SIZE(msg_sour_d), cudaMemcpyDeviceToDevice )
msglen = SIZE(msg_sour_d)
END IF
#if defined(__MPI)
CALL MPI_BARRIER(group, IERR)
IF (ierr/=0) CALL mp_stop( 9012 )
#endif
#endif
ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI and __GPU_MPI
RETURN
END SUBROUTINE mp_get_rv_gpu
!
!------------------------------------------------------------------------------!
!
SUBROUTINE mp_get_rm_gpu(msg_dest_d, msg_sour_d, mpime, dest, sour, ip, gid)
USE parallel_include
IMPLICIT NONE
REAL (DP), DEVICE :: msg_dest_d(:,:)
REAL (DP), INTENT(IN), DEVICE :: msg_sour_d(:,:)
INTEGER, INTENT(IN) :: dest, sour, ip, mpime
INTEGER, INTENT(IN) :: gid
INTEGER :: group
#if defined(__MPI)
INTEGER :: istatus(MPI_STATUS_SIZE)
#endif
INTEGER :: ierr, nrcv
INTEGER :: msglen
!
#if ! defined(__GPU_MPI)
REAL (DP), ALLOCATABLE :: msg_dest_h(:,:), msg_sour_h(:,:)
!
ALLOCATE( msg_dest_h, source=msg_dest_d ); ALLOCATE( msg_sour_h, source=msg_sour_d ); ! This syncs __MPI case
CALL mp_get_rm(msg_dest_h, msg_sour_h, mpime, dest, sour, ip, gid)
msg_dest_d = msg_dest_h
DEALLOCATE(msg_dest_h, msg_sour_h)
#else
#if defined(__MPI)
group = gid
#endif
! processors not taking part in the communication have 0 length message
msglen = 0
!
ierr = cudaDeviceSynchronize() ! This syncs SERIAL and __GPU_MPI
!
IF(sour .NE. dest) THEN
#if defined(__MPI)
IF(mpime .EQ. sour) THEN
CALL MPI_SEND( msg_sour_d, SIZE(msg_sour_d), MPI_DOUBLE_PRECISION, dest, ip, group, ierr)
IF (ierr/=0) CALL mp_stop( 9013 )
msglen = SIZE(msg_sour_d)
ELSE IF(mpime .EQ. dest) THEN
CALL MPI_RECV( msg_dest_d, SIZE(msg_dest_d), MPI_DOUBLE_PRECISION, sour, ip, group, istatus, IERR )
IF (ierr/=0) CALL mp_stop( 9014 )
CALL MPI_GET_COUNT(istatus, MPI_DOUBLE_PRECISION, nrcv, ierr)
IF (ierr/=0) CALL mp_stop( 9015 )
msglen = nrcv
END IF
#endif
ELSEIF(mpime .EQ. sour)THEN
!msg_dest_d(1:SIZE(msg_sour_d,1), 1:SIZE(msg_sour_d,2)) = msg_sour_d(:,:)
! function cudaMemcpy2D(dst, dpitch, src, spitch, width, height, kdir)
ierr = cudaMemcpy2D(msg_dest_d, SIZE(msg_dest_d,1),&
msg_sour_d, SIZE(msg_sour_d,1),&
SIZE(msg_sour_d,1), SIZE(msg_sour_d,2), &
cudaMemcpyDeviceToDevice )
msglen = SIZE( msg_sour_d )
END IF
#if defined(__MPI)
CALL MPI_BARRIER(group, IERR)
IF (ierr/=0) CALL mp_stop( 9016 )
#endif
#endif
ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI and __GPU_MPI
RETURN
END SUBROUTINE mp_get_rm_gpu
!
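!
! NOTE: the local (sour == dest) branch of the matrix routines uses
! cudaMemcpy2D so that a sub-matrix can be copied even when source and
! destination differ in leading dimension; in CUDA Fortran the pitch,
! width and height arguments count array elements, not bytes.
! Illustrative sketch with hypothetical arrays:
!
!    REAL(DP), DEVICE :: a_d(100,50), b_d(80,50)
!    ierr = cudaMemcpy2D( a_d, 100, b_d, 80, 80, 50, &
!                         cudaMemcpyDeviceToDevice )  ! b_d -> a_d(1:80,:)
!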
!------------------------------------------------------------------------------!
!
SUBROUTINE mp_get_cv_gpu(msg_dest_d, msg_sour_d, mpime, dest, sour, ip, gid)
USE parallel_include
IMPLICIT NONE
COMPLEX (DP), DEVICE :: msg_dest_d(:)
COMPLEX (DP), INTENT(IN), DEVICE :: msg_sour_d(:)
INTEGER, INTENT(IN) :: dest, sour, ip, mpime
INTEGER, INTENT(IN) :: gid
INTEGER :: group
#if defined(__MPI)
INTEGER :: istatus(MPI_STATUS_SIZE)
#endif
INTEGER :: ierr, nrcv
INTEGER :: msglen
!
#if ! defined(__GPU_MPI)
COMPLEX (DP), ALLOCATABLE :: msg_dest_h(:), msg_sour_h(:)
!
ALLOCATE( msg_dest_h, source=msg_dest_d ); ALLOCATE( msg_sour_h, source=msg_sour_d ); ! This syncs __MPI case
CALL mp_get_cv(msg_dest_h, msg_sour_h, mpime, dest, sour, ip, gid)
msg_dest_d = msg_dest_h
DEALLOCATE(msg_dest_h, msg_sour_h)
#else
!
#if defined(__MPI)
group = gid
#endif
! processors not taking part in the communication have 0 length message
msglen = 0
!
ierr = cudaDeviceSynchronize() ! This syncs SERIAL and __GPU_MPI
!
IF( dest .NE. sour ) THEN
#if defined(__MPI)
IF(mpime .EQ. sour) THEN
CALL MPI_SEND( msg_sour_d, SIZE(msg_sour_d), MPI_DOUBLE_COMPLEX, dest, ip, group, ierr)
IF (ierr/=0) CALL mp_stop( 9017 )
msglen = SIZE(msg_sour_d)
ELSE IF(mpime .EQ. dest) THEN
CALL MPI_RECV( msg_dest_d, SIZE(msg_dest_d), MPI_DOUBLE_COMPLEX, sour, ip, group, istatus, IERR )
IF (ierr/=0) CALL mp_stop( 9018 )
CALL MPI_GET_COUNT(istatus, MPI_DOUBLE_COMPLEX, nrcv, ierr)
IF (ierr/=0) CALL mp_stop( 9019 )
msglen = nrcv
END IF
#endif
ELSEIF(mpime .EQ. sour)THEN
!msg_dest_d(1:SIZE(msg_sour_d)) = msg_sour_d(:)
ierr = cudaMemcpy(msg_dest_d(1) , msg_sour_d(1), SIZE(msg_sour_d), cudaMemcpyDeviceToDevice )
msglen = SIZE(msg_sour_d)
END IF
#if defined(__MPI)
CALL MPI_BARRIER(group, IERR)
IF (ierr/=0) CALL mp_stop( 9020 )
#endif
#endif
ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI and __GPU_MPI
RETURN
END SUBROUTINE mp_get_cv_gpu
!
!------------------------------------------------------------------------------!
!
SUBROUTINE mp_get_cm_gpu(msg_dest_d, msg_sour_d, mpime, dest, sour, ip, gid)
USE parallel_include
IMPLICIT NONE
COMPLEX (DP), INTENT(IN), DEVICE :: msg_sour_d(:,:)
COMPLEX (DP), DEVICE :: msg_dest_d(:,:)
INTEGER, INTENT(IN) :: dest, sour, ip, mpime
INTEGER, INTENT(IN) :: gid
INTEGER :: group
#if defined(__MPI)
INTEGER :: istatus(MPI_STATUS_SIZE)
#endif
INTEGER :: ierr, nrcv
INTEGER :: msglen
!
#if ! defined(__GPU_MPI)
COMPLEX (DP), ALLOCATABLE :: msg_dest_h(:,:), msg_sour_h(:,:)
!
ALLOCATE( msg_dest_h, source=msg_dest_d ); ALLOCATE( msg_sour_h, source=msg_sour_d ); ! This syncs __MPI case
CALL mp_get_cm(msg_dest_h, msg_sour_h, mpime, dest, sour, ip, gid)
msg_dest_d = msg_dest_h
DEALLOCATE(msg_dest_h, msg_sour_h)
#else
!
#if defined(__MPI)
group = gid
#endif
! processors not taking part in the communication have 0 length message
msglen = 0
!
ierr = cudaDeviceSynchronize() ! This syncs SERIAL and __GPU_MPI
!
IF(sour .NE. dest) THEN
#if defined(__MPI)
IF(mpime .EQ. sour) THEN
CALL MPI_SEND( msg_sour_d, SIZE(msg_sour_d), MPI_DOUBLE_COMPLEX, dest, ip, group, ierr)
IF (ierr/=0) CALL mp_stop( 9021 )
msglen = SIZE(msg_sour_d)
ELSE IF(mpime .EQ. dest) THEN
CALL MPI_RECV( msg_dest_d, SIZE(msg_dest_d), MPI_DOUBLE_COMPLEX, sour, ip, group, istatus, IERR )
IF (ierr/=0) CALL mp_stop( 9022 )
CALL MPI_GET_COUNT(istatus, MPI_DOUBLE_COMPLEX, nrcv, ierr)
IF (ierr/=0) CALL mp_stop( 9023 )
msglen = nrcv
END IF
#endif
ELSEIF(mpime .EQ. sour)THEN
!msg_dest_d(1:SIZE(msg_sour_d,1), 1:SIZE(msg_sour_d,2)) = msg_sour_d(:,:)
ierr = cudaMemcpy2D(msg_dest_d, SIZE(msg_dest_d,1),&
msg_sour_d, SIZE(msg_sour_d,1),&
SIZE(msg_sour_d,1), SIZE(msg_sour_d,2), &
cudaMemcpyDeviceToDevice )
msglen = SIZE( msg_sour_d )
END IF
#if defined(__MPI)
CALL MPI_BARRIER(group, IERR)
IF (ierr/=0) CALL mp_stop( 9024 )
#endif
#endif
ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI and __GPU_MPI
RETURN
END SUBROUTINE mp_get_cm_gpu
!
!------------------------------------------------------------------------------!
!
SUBROUTINE mp_put_i1_gpu(msg_dest_d, msg_sour_d, mpime, sour, dest, ip, gid)
USE parallel_include
IMPLICIT NONE
INTEGER, DEVICE :: msg_dest_d
INTEGER, INTENT(IN), DEVICE :: msg_sour_d
INTEGER, INTENT(IN) :: dest, sour, ip, mpime
INTEGER, INTENT(IN) :: gid
INTEGER :: group
#if defined(__MPI)
INTEGER :: istatus(MPI_STATUS_SIZE)
#endif
INTEGER :: ierr, nrcv
INTEGER :: msglen
!
#if ! defined(__GPU_MPI)
INTEGER :: msg_dest_h, msg_sour_h
!
msg_dest_h=msg_dest_d ; msg_sour_h=msg_sour_d ! This syncs __MPI case
CALL mp_put_i1(msg_dest_h, msg_sour_h, mpime, sour, dest, ip, gid)
msg_dest_d = msg_dest_h
#else
#if defined(__MPI)
group = gid
#endif
! processors not taking part in the communication have 0 length message
msglen = 0
!
ierr = cudaDeviceSynchronize() ! This syncs SERIAL and __GPU_MPI
!
IF(dest .NE. sour) THEN
#if defined(__MPI)
IF(mpime .EQ. sour) THEN
CALL MPI_SEND( msg_sour_d, 1, MPI_INTEGER, dest, ip, group, ierr)
IF (ierr/=0) CALL mp_stop( 9025 )
msglen = 1
ELSE IF(mpime .EQ. dest) THEN
CALL MPI_RECV( msg_dest_d, 1, MPI_INTEGER, sour, ip, group, istatus, IERR )
IF (ierr/=0) CALL mp_stop( 9026 )
CALL MPI_GET_COUNT(istatus, MPI_INTEGER, nrcv, ierr)
IF (ierr/=0) CALL mp_stop( 9027 )
msglen = 1
END IF
#endif
ELSEIF(mpime .EQ. sour)THEN
msg_dest_d = msg_sour_d
msglen = 1
END IF
#if defined(__MPI)
CALL MPI_BARRIER(group, IERR)
IF (ierr/=0) CALL mp_stop( 9028 )
#endif
#endif
ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI and __GPU_MPI
RETURN
END SUBROUTINE mp_put_i1_gpu
!
!------------------------------------------------------------------------------!
!
SUBROUTINE mp_put_iv_gpu(msg_dest_d, msg_sour_d, mpime, sour, dest, ip, gid)
USE parallel_include
IMPLICIT NONE
INTEGER, DEVICE :: msg_dest_d(:)
INTEGER, INTENT(IN), DEVICE :: msg_sour_d(:)
INTEGER, INTENT(IN) :: dest, sour, ip, mpime
INTEGER, INTENT(IN) :: gid
INTEGER :: group
#if defined(__MPI)
INTEGER :: istatus(MPI_STATUS_SIZE)
#endif
INTEGER :: ierr, nrcv
INTEGER :: msglen
!
#if ! defined(__GPU_MPI)
INTEGER, ALLOCATABLE :: msg_dest_h(:), msg_sour_h(:)
!
ALLOCATE( msg_dest_h, source=msg_dest_d ); ALLOCATE( msg_sour_h, source=msg_sour_d ); ! This syncs __MPI case
CALL mp_put_iv(msg_dest_h, msg_sour_h, mpime, sour, dest, ip, gid)
msg_dest_d = msg_dest_h
DEALLOCATE(msg_dest_h, msg_sour_h)
#else
!
#if defined(__MPI)
group = gid
#endif
! processors not taking part in the communication have 0 length message
msglen = 0
!
ierr = cudaDeviceSynchronize() ! This syncs SERIAL and __GPU_MPI
!
IF(sour .NE. dest) THEN
#if defined(__MPI)
IF(mpime .EQ. sour) THEN
CALL MPI_SEND( msg_sour_d, SIZE(msg_sour_d), MPI_INTEGER, dest, ip, group, ierr)
IF (ierr/=0) CALL mp_stop( 9029 )
msglen = SIZE(msg_sour_d)
ELSE IF(mpime .EQ. dest) THEN
CALL MPI_RECV( msg_dest_d, SIZE(msg_dest_d), MPI_INTEGER, sour, ip, group, istatus, IERR )
IF (ierr/=0) CALL mp_stop( 9030 )
CALL MPI_GET_COUNT(istatus, MPI_INTEGER, nrcv, ierr)
IF (ierr/=0) CALL mp_stop( 9031 )
msglen = nrcv
END IF
#endif
ELSEIF(mpime .EQ. sour)THEN
!msg_dest_d(1:SIZE(msg_sour_d)) = msg_sour_d(:)
ierr = cudaMemcpy(msg_dest_d(1) , msg_sour_d(1), SIZE(msg_sour_d), cudaMemcpyDeviceToDevice )
msglen = SIZE(msg_sour_d)
END IF
#if defined(__MPI)
CALL MPI_BARRIER(group, IERR)
IF (ierr/=0) CALL mp_stop( 9032 )
#endif
#endif
ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI and __GPU_MPI
RETURN
END SUBROUTINE mp_put_iv_gpu
!
!------------------------------------------------------------------------------!
!
SUBROUTINE mp_put_rv_gpu(msg_dest_d, msg_sour_d, mpime, sour, dest, ip, gid)
USE parallel_include
IMPLICIT NONE
REAL (DP), DEVICE :: msg_dest_d(:)
REAL (DP), INTENT(IN), DEVICE :: msg_sour_d(:)
INTEGER, INTENT(IN) :: dest, sour, ip, mpime
INTEGER, INTENT(IN) :: gid
INTEGER :: group
#if defined(__MPI)
INTEGER :: istatus(MPI_STATUS_SIZE)
#endif
INTEGER :: ierr, nrcv
INTEGER :: msglen
!
#if ! defined(__GPU_MPI)
REAL (DP), ALLOCATABLE :: msg_dest_h(:), msg_sour_h(:)
!
ALLOCATE( msg_dest_h, source=msg_dest_d ); ALLOCATE( msg_sour_h, source=msg_sour_d ); ! This syncs __MPI case
CALL mp_put_rv(msg_dest_h, msg_sour_h, mpime, sour, dest, ip, gid)
msg_dest_d = msg_dest_h
DEALLOCATE(msg_dest_h, msg_sour_h)
#else
!
#if defined(__MPI)
group = gid
#endif
! processors not taking part in the communication have 0 length message
msglen = 0
!
ierr = cudaDeviceSynchronize() ! This syncs SERIAL and __GPU_MPI
!
IF(sour .NE. dest) THEN
#if defined(__MPI)
IF(mpime .EQ. sour) THEN
CALL MPI_SEND( msg_sour_d, SIZE(msg_sour_d), MPI_DOUBLE_PRECISION, dest, ip, group, ierr)
IF (ierr/=0) CALL mp_stop( 9033 )
msglen = SIZE(msg_sour_d)
ELSE IF(mpime .EQ. dest) THEN
CALL MPI_RECV( msg_dest_d, SIZE(msg_dest_d), MPI_DOUBLE_PRECISION, sour, ip, group, istatus, IERR )
IF (ierr/=0) CALL mp_stop( 9034 )
CALL MPI_GET_COUNT(istatus, MPI_DOUBLE_PRECISION, nrcv, ierr)
IF (ierr/=0) CALL mp_stop( 9035 )
msglen = nrcv
END IF
#endif
ELSEIF(mpime .EQ. sour)THEN
!msg_dest_d(1:SIZE(msg_sour_d)) = msg_sour_d(:)
ierr = cudaMemcpy(msg_dest_d(1) , msg_sour_d(1), SIZE(msg_sour_d), cudaMemcpyDeviceToDevice )
msglen = SIZE(msg_sour_d)
END IF
#if defined(__MPI)
CALL MPI_BARRIER(group, IERR)
IF (ierr/=0) CALL mp_stop( 9036 )
#endif
#endif
ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI and __GPU_MPI
RETURN
END SUBROUTINE mp_put_rv_gpu
!
!------------------------------------------------------------------------------!
!
SUBROUTINE mp_put_rm_gpu(msg_dest_d, msg_sour_d, mpime, sour, dest, ip, gid)
USE parallel_include
IMPLICIT NONE
REAL (DP), DEVICE :: msg_dest_d(:,:)
REAL (DP), INTENT(IN), DEVICE :: msg_sour_d(:,:)
INTEGER, INTENT(IN) :: dest, sour, ip, mpime
INTEGER, INTENT(IN) :: gid
INTEGER :: group
#if defined(__MPI)
INTEGER :: istatus(MPI_STATUS_SIZE)
#endif
INTEGER :: ierr, nrcv
INTEGER :: msglen
!
#if ! defined(__GPU_MPI)
REAL (DP), ALLOCATABLE :: msg_dest_h(:,:), msg_sour_h(:,:)
!
ALLOCATE( msg_dest_h, source=msg_dest_d ); ALLOCATE( msg_sour_h, source=msg_sour_d ); ! This syncs __MPI case
CALL mp_put_rm(msg_dest_h, msg_sour_h, mpime, sour, dest, ip, gid)
msg_dest_d = msg_dest_h
DEALLOCATE(msg_dest_h, msg_sour_h)
#else
!
#if defined(__MPI)
group = gid
#endif
! processors not taking part in the communication have 0 length message
msglen = 0
!
ierr = cudaDeviceSynchronize() ! This syncs SERIAL and __GPU_MPI
!
IF(sour .NE. dest) THEN
#if defined(__MPI)
IF(mpime .EQ. sour) THEN
CALL MPI_SEND( msg_sour_d, SIZE(msg_sour_d), MPI_DOUBLE_PRECISION, dest, ip, group, ierr)
IF (ierr/=0) CALL mp_stop( 9037 )
msglen = SIZE(msg_sour_d)
ELSE IF(mpime .EQ. dest) THEN
CALL MPI_RECV( msg_dest_d, SIZE(msg_dest_d), MPI_DOUBLE_PRECISION, sour, ip, group, istatus, IERR )
IF (ierr/=0) CALL mp_stop( 9038 )
CALL MPI_GET_COUNT(istatus, MPI_DOUBLE_PRECISION, nrcv, ierr)
IF (ierr/=0) CALL mp_stop( 9039 )
msglen = nrcv
END IF
#endif
ELSEIF(mpime .EQ. sour)THEN
!msg_dest_d(1:SIZE(msg_sour_d,1),1:SIZE(msg_sour_d,2)) = msg_sour_d(:,:)
ierr = cudaMemcpy2D(msg_dest_d, SIZE(msg_dest_d,1),&
msg_sour_d, SIZE(msg_sour_d,1),&
SIZE(msg_sour_d,1), SIZE(msg_sour_d,2), &
cudaMemcpyDeviceToDevice )
msglen = SIZE(msg_sour_d)
END IF
#if defined(__MPI)
CALL MPI_BARRIER(group, IERR)
IF (ierr/=0) CALL mp_stop( 9040 )
#endif
#endif
ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI and __GPU_MPI
RETURN
END SUBROUTINE mp_put_rm_gpu
!
!------------------------------------------------------------------------------!
!
SUBROUTINE mp_put_cv_gpu(msg_dest_d, msg_sour_d, mpime, sour, dest, ip, gid)
USE parallel_include
IMPLICIT NONE
COMPLEX (DP), DEVICE :: msg_dest_d(:)
COMPLEX (DP), INTENT(IN), DEVICE :: msg_sour_d(:)
INTEGER, INTENT(IN) :: dest, sour, ip, mpime
INTEGER, INTENT(IN) :: gid
INTEGER :: group
#if defined(__MPI)
INTEGER :: istatus(MPI_STATUS_SIZE)
#endif
INTEGER :: ierr, nrcv
INTEGER :: msglen
!
#if ! defined(__GPU_MPI)
COMPLEX (DP), ALLOCATABLE :: msg_dest_h(:), msg_sour_h(:)
!
ALLOCATE( msg_dest_h, source=msg_dest_d ); ALLOCATE( msg_sour_h, source=msg_sour_d ); ! This syncs __MPI case
CALL mp_put_cv(msg_dest_h, msg_sour_h, mpime, sour, dest, ip, gid)
msg_dest_d = msg_dest_h
DEALLOCATE(msg_dest_h, msg_sour_h)
#else
!
#if defined(__MPI)
group = gid
#endif
! processors not taking part in the communication have 0 length message
msglen = 0
!
ierr = cudaDeviceSynchronize() ! This syncs SERIAL and __GPU_MPI
!
IF( dest .NE. sour ) THEN
#if defined(__MPI)
IF(mpime .EQ. sour) THEN
CALL MPI_SEND( msg_sour_d, SIZE(msg_sour_d), MPI_DOUBLE_COMPLEX, dest, ip, group, ierr)
IF (ierr/=0) CALL mp_stop( 9041 )
msglen = SIZE(msg_sour_d)
ELSE IF(mpime .EQ. dest) THEN
CALL MPI_RECV( msg_dest_d, SIZE(msg_dest_d), MPI_DOUBLE_COMPLEX, sour, ip, group, istatus, IERR )
IF (ierr/=0) CALL mp_stop( 9042 )
CALL MPI_GET_COUNT(istatus, MPI_DOUBLE_COMPLEX, nrcv, ierr)
IF (ierr/=0) CALL mp_stop( 9043 )
msglen = nrcv
END IF
#endif
ELSEIF(mpime .EQ. sour)THEN
!msg_dest_d(1:SIZE(msg_sour_d)) = msg_sour_d(:)
ierr = cudaMemcpy(msg_dest_d(1) , msg_sour_d(1), SIZE(msg_sour_d), cudaMemcpyDeviceToDevice )
msglen = SIZE(msg_sour_d)
END IF
#if defined(__MPI)
CALL MPI_BARRIER(group, IERR)
IF (ierr/=0) CALL mp_stop( 9044 )
#endif
#endif
ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI and __GPU_MPI
RETURN
END SUBROUTINE mp_put_cv_gpu
!
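!
! NOTE: mp_put_*_gpu mirrors mp_get_*_gpu with the rank arguments swapped
! in the interface (mpime, sour, dest instead of mpime, dest, sour); the
! data still flows from "sour" to "dest" with "ip" as the MPI tag.
! Illustrative call (hypothetical names):
!
!    REAL(DP), DEVICE :: bufa_d(10), bufb_d(10)
!    CALL mp_put( bufa_d, bufb_d, mpime, 0, 1, 200, comm )  ! rank 0 -> rank 1
!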
!------------------------------------------------------------------------------!
!
!..mp_sum
SUBROUTINE mp_sum_i1_gpu(msg_d,gid)
IMPLICIT NONE
INTEGER, INTENT (INOUT), DEVICE :: msg_d
INTEGER :: msg_h
INTEGER, INTENT(IN) :: gid
INTEGER :: msglen, ierr
! Avoid unnecessary communications on __MPI and syncs SERIAL
IF ( mp_size(gid) == 1 ) THEN
ierr = cudaDeviceSynchronize()
RETURN
END IF
!
#if defined(__MPI)
msglen = 1
#if defined(__GPU_MPI)
ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI
CALL reduce_base_integer_gpu( msglen, msg_d, gid, -1 )
! No need for final synchronization
#else
!
msg_h = msg_d ! This syncs __MPI case
CALL reduce_base_integer( msglen, msg_h, gid, -1 )
msg_d = msg_h
ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies
#endif
#endif
END SUBROUTINE mp_sum_i1_gpu
!
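!
! NOTE: each mp_sum_*_gpu below begins with the same guard: on a
! single-rank communicator the reduction is the identity, so the routine
! only synchronizes the device before returning, keeping the
! synchronization contract identical to the communicating case:
!
!    IF ( mp_size(gid) == 1 ) THEN
!       ierr = cudaDeviceSynchronize()
!       RETURN
!    END IF
!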
!------------------------------------------------------------------------------!
!
SUBROUTINE mp_sum_iv_gpu(msg_d,gid)
IMPLICIT NONE
INTEGER, INTENT (INOUT), DEVICE :: msg_d(:)
INTEGER, ALLOCATABLE :: msg_h(:)
INTEGER, INTENT(IN) :: gid
!
INTEGER :: msglen, ierr
! Avoid unnecessary communications on __MPI and syncs SERIAL
IF ( mp_size(gid) == 1 ) THEN
ierr = cudaDeviceSynchronize()
RETURN
END IF
!
#if defined(__MPI)
#if defined(__GPU_MPI)
msglen = size(msg_d)
ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI
CALL reduce_base_integer_gpu( msglen, msg_d, gid, -1 )
! No need for final synchronization
#else
ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case
msglen = size(msg_h)
CALL reduce_base_integer( msglen, msg_h, gid, -1 )
msg_d = msg_h; DEALLOCATE(msg_h)
ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies
#endif
#endif
END SUBROUTINE mp_sum_iv_gpu
!
!------------------------------------------------------------------------------!
!
SUBROUTINE mp_sum_im_gpu(msg_d,gid)
IMPLICIT NONE
INTEGER, INTENT (INOUT), DEVICE :: msg_d(:,:)
INTEGER, ALLOCATABLE :: msg_h(:,:)
INTEGER, INTENT(IN) :: gid
!
INTEGER :: msglen, ierr
! Avoid unnecessary communications on __MPI and syncs SERIAL
IF ( mp_size(gid) == 1 ) THEN
ierr = cudaDeviceSynchronize()
RETURN
END IF
!
#if defined(__MPI)
#if defined(__GPU_MPI)
msglen = size(msg_d)
ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI
CALL reduce_base_integer_gpu( msglen, msg_d, gid, -1 )
! No need for final synchronization
#else
ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case
msglen = size(msg_h)
CALL reduce_base_integer( msglen, msg_h, gid, -1 )
msg_d = msg_h; DEALLOCATE(msg_h)
ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies
#endif
#endif
END SUBROUTINE mp_sum_im_gpu
!
!------------------------------------------------------------------------------!
!
SUBROUTINE mp_sum_it_gpu(msg_d,gid)
IMPLICIT NONE
INTEGER, INTENT (INOUT), DEVICE :: msg_d(:,:,:)
INTEGER, ALLOCATABLE :: msg_h(:,:,:)
INTEGER, INTENT (IN) :: gid
!
INTEGER :: msglen, ierr
! Avoid unnecessary communications on __MPI and syncs SERIAL
IF ( mp_size(gid) == 1 ) THEN
ierr = cudaDeviceSynchronize()
RETURN
END IF
!
#if defined(__MPI)
#if defined(__GPU_MPI)
msglen = size(msg_d)
ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI
CALL reduce_base_integer_gpu( msglen, msg_d, gid, -1 )
! No need for final synchronization
#else
ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case
msglen = size(msg_h)
CALL reduce_base_integer( msglen, msg_h, gid, -1 )
msg_d = msg_h; DEALLOCATE(msg_h)
ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies
#endif
#endif
END SUBROUTINE mp_sum_it_gpu
!
!------------------------------------------------------------------------------!
!
SUBROUTINE mp_sum_r1_gpu(msg_d,gid)
IMPLICIT NONE
REAL (DP), INTENT (INOUT), DEVICE :: msg_d
REAL(DP) :: msg_h
INTEGER, INTENT (IN) :: gid
!
INTEGER :: msglen, ierr
! Avoid unnecessary communications on __MPI and syncs SERIAL
IF ( mp_size(gid) == 1 ) THEN
ierr = cudaDeviceSynchronize()
RETURN
END IF
!
#if defined(__MPI)
msglen = 1
#if defined(__GPU_MPI)
ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI
CALL reduce_base_real_gpu( msglen, msg_d, gid, -1 )
! No need for final synchronization
#else
msg_h=msg_d ! This syncs __MPI case
CALL reduce_base_real( msglen, msg_h, gid, -1 )
msg_d = msg_h
ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies
#endif
#endif
END SUBROUTINE mp_sum_r1_gpu
!
!------------------------------------------------------------------------------!
!
SUBROUTINE mp_sum_rv_gpu(msg_d,gid)
IMPLICIT NONE
REAL (DP), INTENT (INOUT), DEVICE :: msg_d(:)
REAL(DP), ALLOCATABLE :: msg_h(:)
INTEGER, INTENT (IN) :: gid
!
INTEGER :: msglen, ierr
! Avoid unnecessary communications on __MPI and syncs SERIAL
IF ( mp_size(gid) == 1 ) THEN
ierr = cudaDeviceSynchronize()
RETURN
END IF
!
#if defined(__MPI)
#if defined(__GPU_MPI)
msglen = size(msg_d)
ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI
CALL reduce_base_real_gpu( msglen, msg_d, gid, -1 )
! No need for final synchronization
#else
ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case
msglen = size(msg_h)
CALL reduce_base_real( msglen, msg_h, gid, -1 )
msg_d = msg_h; DEALLOCATE(msg_h)
ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies
#endif
#endif
END SUBROUTINE mp_sum_rv_gpu
!
!------------------------------------------------------------------------------!
!
SUBROUTINE mp_sum_rm_gpu(msg_d, gid)
IMPLICIT NONE
REAL (DP), INTENT (INOUT), DEVICE :: msg_d(:,:)
REAL (DP), ALLOCATABLE :: msg_h(:,:)
INTEGER, INTENT (IN) :: gid
!
INTEGER :: msglen, ierr
! Avoid unnecessary communications on __MPI and syncs SERIAL
IF ( mp_size(gid) == 1 ) THEN
ierr = cudaDeviceSynchronize()
RETURN
END IF
!
#if defined(__MPI)
#if defined(__GPU_MPI)
msglen = size(msg_d)
ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI
CALL reduce_base_real_gpu( msglen, msg_d, gid, -1 )
! No need for final synchronization
#else
ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case
msglen = size(msg_h)
CALL reduce_base_real( msglen, msg_h, gid, -1 )
msg_d = msg_h; DEALLOCATE(msg_h)
ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies
#endif
#endif
END SUBROUTINE mp_sum_rm_gpu
!
!------------------------------------------------------------------------------!
!
SUBROUTINE mp_sum_rm_nc_gpu(msg_d, k1, k2, k3, k4, gid)
! for non-contiguous (nc) arrays
IMPLICIT NONE
REAL (DP), INTENT (INOUT), DEVICE :: msg_d(:,:)
REAL (DP), ALLOCATABLE :: msg_h(:,:)
INTEGER, INTENT (IN) :: k1, k2, k3, k4
INTEGER, INTENT (IN) :: gid
REAL (DP), ALLOCATABLE :: msg_buff(:,:)
#if defined(__GPU_MPI)
ATTRIBUTES(DEVICE) :: msg_buff
#endif
!
INTEGER :: msglen, ierr
! Avoid unnecessary communications on __MPI and syncs SERIAL
IF ( mp_size(gid) == 1 ) THEN
ierr = cudaDeviceSynchronize()
RETURN
END IF
!
ALLOCATE( msg_buff(k2-k1+1,k4-k3+1) )
msg_buff(1:k2-k1+1,1:k4-k3+1) = msg_d(k1:k2,k3:k4)
msglen = size(msg_buff)
!
#if defined(__MPI)
#if defined(__GPU_MPI)
!msglen = size(msg_buff)
ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI
CALL reduce_base_real_gpu( msglen, msg_buff, gid, -1 )
! No need for final synchronization
#else
!ALLOCATE( msg_h, source=msg_buff ) ! This syncs __MPI case
!msglen = size(msg_h)
!msglen = size(msg_buff)
CALL reduce_base_real( msglen, msg_buff, gid, -1 )
!msg_buff = msg_h; DEALLOCATE(msg_h)
ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies
#endif
#endif
!
msg_d(k1:k2,k3:k4) = msg_buff(1:k2-k1+1,1:k4-k3+1)
DEALLOCATE( msg_buff )
!
END SUBROUTINE mp_sum_rm_nc_gpu
!
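!
! NOTE: mp_sum_rm_nc_gpu reduces only the sub-block msg_d(k1:k2,k3:k4):
! the block is packed into a contiguous buffer, reduced in place, and
! scattered back, since the reduction kernels expect contiguous data.
! Illustrative call (hypothetical array; sums rows 1:3, columns 1:2):
!
!    REAL(DP), DEVICE :: m_d(10,10)
!    CALL mp_sum( m_d, 1, 3, 1, 2, comm )  ! resolves to mp_sum_rm_nc_gpu
!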
!------------------------------------------------------------------------------!
!
SUBROUTINE mp_root_sum_rm_gpu( msg_d, res_d, root, gid )
USE parallel_include
IMPLICIT NONE
REAL (DP), INTENT (IN) , DEVICE :: msg_d(:,:)
REAL (DP), INTENT (OUT), DEVICE :: res_d(:,:)
REAL (DP), ALLOCATABLE :: res_h(:,:), msg_h(:,:)
INTEGER, INTENT (IN) :: root
INTEGER, INTENT (IN) :: gid
!
INTEGER :: msglen, ierr, taskid
#if defined(__MPI)
!
CALL mpi_comm_rank( gid, taskid, ierr)
IF( ierr /= 0 ) CALL mp_stop( 9045 )
!
msglen = size(msg_d)
IF( taskid == root ) THEN
IF( msglen > size(res_d) ) CALL mp_stop( 9046 )
END IF
#if defined(__GPU_MPI)
ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI
CALL reduce_base_real_to_gpu( msglen, msg_d, res_d, gid, root )
RETURN ! Sync not needed in this case
#else
ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case
IF( taskid == root ) ALLOCATE( res_h(lbound(res_d,1):ubound(res_d,1), lbound(res_d,2):ubound(res_d,2)));
CALL reduce_base_real_to( msglen, msg_h, res_h, gid, root )
IF( taskid == root ) res_d = res_h;
IF( taskid == root ) DEALLOCATE(res_h)
DEALLOCATE(msg_h)
#endif
#else
res_d = msg_d
#endif
ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI
END SUBROUTINE mp_root_sum_rm_gpu
!
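!
! NOTE: in the host-staged branch of mp_root_sum_*_gpu the host result
! buffer is allocated, with the bounds of res_d, on the root task only,
! because MPI_REDUCE references the receive buffer just on the root.
! Illustrative call (hypothetical names; root = 0):
!
!    REAL(DP), DEVICE :: ovlp_d(8,8), ovlp_sum_d(8,8)
!    CALL mp_root_sum( ovlp_d, ovlp_sum_d, 0, comm )
!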
!------------------------------------------------------------------------------!
!
SUBROUTINE mp_root_sum_cm_gpu( msg_d, res_d, root, gid )
USE parallel_include
IMPLICIT NONE
COMPLEX (DP), INTENT (IN) , DEVICE :: msg_d(:,:)
COMPLEX (DP), INTENT (OUT), DEVICE :: res_d(:,:)
COMPLEX (DP), ALLOCATABLE :: res_h(:,:), msg_h(:,:)
INTEGER, INTENT (IN) :: root
INTEGER, INTENT (IN) :: gid
!
INTEGER :: msglen, ierr, taskid
#if defined(__MPI)
msglen = size(msg_d)
CALL mpi_comm_rank( gid, taskid, ierr)
IF( ierr /= 0 ) CALL mp_stop( 9047 )
IF( taskid == root ) THEN
IF( msglen > size(res_d) ) CALL mp_stop( 9048 )
END IF
#if defined(__GPU_MPI)
ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI
CALL reduce_base_real_to_gpu( 2 * msglen, msg_d, res_d, gid, root )
RETURN ! Sync not needed in this case
#else
ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case
IF( taskid == root ) ALLOCATE( res_h(lbound(res_d,1):ubound(res_d,1), lbound(res_d,2):ubound(res_d,2)));
CALL reduce_base_real_to( 2 * msglen, msg_h, res_h, gid, root )
IF( taskid == root ) res_d = res_h;
IF( taskid == root ) DEALLOCATE(res_h)
DEALLOCATE(msg_h)
#endif
#else
res_d = msg_d
#endif
ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI
END SUBROUTINE mp_root_sum_cm_gpu
!
!------------------------------------------------------------------------------!
!
SUBROUTINE mp_sum_rmm_gpu( msg_d, res_d, root, gid )
USE parallel_include
IMPLICIT NONE
REAL (DP), INTENT (IN), DEVICE :: msg_d(:,:)
REAL (DP), INTENT (OUT),DEVICE :: res_d(:,:)
REAL (DP), ALLOCATABLE :: res_h(:,:), msg_h(:,:)
INTEGER, INTENT (IN) :: root
INTEGER, INTENT (IN) :: gid
INTEGER :: group
INTEGER :: msglen
INTEGER :: taskid, ierr
#if defined(__MPI)
msglen = size(msg_d)
!
group = gid
!
CALL mpi_comm_rank( group, taskid, ierr)
IF( ierr /= 0 ) CALL mp_stop( 9049 )
IF( taskid == root ) THEN
IF( msglen > size(res_d) ) CALL mp_stop( 9050 )
END IF
!
#if defined(__GPU_MPI)
ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI
CALL reduce_base_real_to_gpu( msglen, msg_d, res_d, group, root )
RETURN ! Sync not needed in this case
#else
ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case
IF( taskid == root ) ALLOCATE( res_h(lbound(res_d,1):ubound(res_d,1), lbound(res_d,2):ubound(res_d,2)));
CALL reduce_base_real_to( msglen, msg_h, res_h, gid, root )
IF( taskid == root ) res_d = res_h;
IF( taskid == root ) DEALLOCATE(res_h)
DEALLOCATE(msg_h)
#endif
!
#else
res_d = msg_d
#endif
ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI
END SUBROUTINE mp_sum_rmm_gpu
!
!------------------------------------------------------------------------------!
!
SUBROUTINE mp_sum_rt_gpu( msg_d, gid )
IMPLICIT NONE
REAL (DP), INTENT (INOUT), DEVICE :: msg_d(:,:,:)
REAL (DP), ALLOCATABLE :: msg_h(:,:,:)
INTEGER, INTENT(IN) :: gid
INTEGER :: msglen, ierr
! Avoid unnecessary communications on __MPI and syncs SERIAL
IF ( mp_size(gid) == 1 ) THEN
ierr = cudaDeviceSynchronize()
RETURN
END IF
!
#if defined(__MPI)
#if defined(__GPU_MPI)
msglen = size(msg_d)
ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI
CALL reduce_base_real_gpu( msglen, msg_d, gid, -1 )
! Sync not needed after MPI call
#else
ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case
msglen = size(msg_h)
CALL reduce_base_real( msglen, msg_h, gid, -1 )
msg_d = msg_h; DEALLOCATE(msg_h)
ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies
#endif
#endif
END SUBROUTINE mp_sum_rt_gpu
!
!------------------------------------------------------------------------------!
!
SUBROUTINE mp_sum_r4d_gpu(msg_d,gid)
IMPLICIT NONE
REAL (DP), INTENT (INOUT), DEVICE :: msg_d(:,:,:,:)
REAL (DP), ALLOCATABLE :: msg_h(:,:,:,:)
INTEGER, INTENT(IN) :: gid
!
INTEGER :: msglen, ierr
! Avoid unnecessary communications on __MPI and syncs SERIAL
IF ( mp_size(gid) == 1 ) THEN
ierr = cudaDeviceSynchronize()
RETURN
END IF
!
#if defined(__MPI)
#if defined(__GPU_MPI)
msglen = size(msg_d)
ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI
CALL reduce_base_real_gpu( msglen, msg_d, gid, -1 )
! Sync not needed after MPI call
#else
ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case
msglen = size(msg_h)
CALL reduce_base_real( msglen, msg_h, gid, -1 )
msg_d = msg_h; DEALLOCATE(msg_h)
ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies
#endif
#endif
END SUBROUTINE mp_sum_r4d_gpu
!
!------------------------------------------------------------------------------!
!
SUBROUTINE mp_sum_c1_gpu(msg_d,gid)
IMPLICIT NONE
COMPLEX (DP), INTENT (INOUT), DEVICE :: msg_d
COMPLEX (DP) :: msg_h
INTEGER, INTENT(IN) :: gid
!
INTEGER :: msglen, ierr
! Avoid unnecessary communications on __MPI and syncs SERIAL
IF ( mp_size(gid) == 1 ) THEN
ierr = cudaDeviceSynchronize()
RETURN
END IF
!
#if defined(__MPI)
msglen = 1
#if defined(__GPU_MPI)
ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI
CALL reduce_base_real_gpu( 2 * msglen, msg_d, gid, -1 )
! Sync not needed after MPI call
#else
msg_h=msg_d ! This syncs __MPI case
CALL reduce_base_real( 2 * msglen, msg_h, gid, -1 )
msg_d = msg_h
ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies
#endif
#endif
END SUBROUTINE mp_sum_c1_gpu
!
!------------------------------------------------------------------------------!
!
SUBROUTINE mp_sum_cv_gpu(msg_d,gid)
IMPLICIT NONE
COMPLEX (DP), INTENT (INOUT), DEVICE :: msg_d(:)
COMPLEX (DP), ALLOCATABLE :: msg_h(:)
INTEGER, INTENT(IN) :: gid
!
INTEGER :: msglen, ierr
!
! Avoid unnecessary communications on __MPI and syncs SERIAL
IF ( mp_size(gid) == 1 ) THEN
ierr = cudaDeviceSynchronize()
RETURN
END IF
!
#if defined(__MPI)
#if defined(__GPU_MPI)
msglen = size(msg_d)
ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI
CALL reduce_base_real_gpu( 2 * msglen, msg_d, gid, -1 )
! Sync not needed after MPI call
#else
ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case
msglen = size(msg_h)
CALL reduce_base_real( 2 * msglen, msg_h, gid, -1 )
msg_d = msg_h; DEALLOCATE(msg_h)
ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies
#endif
#endif
END SUBROUTINE mp_sum_cv_gpu
!
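!
! NOTE: the complex reductions reuse the double-precision real kernels
! by passing 2*msglen: a COMPLEX(DP) array is summed as an interleaved
! (re,im) REAL(DP) array of twice the length. Illustrative call
! (hypothetical array):
!
!    COMPLEX(DP), DEVICE :: c_d(100)
!    CALL mp_sum( c_d, comm )  ! internally reduce_base_real*( 2*100, ... )
!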
!------------------------------------------------------------------------------!
!
SUBROUTINE mp_sum_cm_gpu(msg_d, gid)
IMPLICIT NONE
COMPLEX (DP), INTENT (INOUT), DEVICE :: msg_d(:,:)
COMPLEX (DP), ALLOCATABLE :: msg_h(:,:)
INTEGER, INTENT (IN) :: gid
INTEGER :: msglen, ierr
! Avoid unnecessary communications on __MPI and syncs SERIAL
IF ( mp_size(gid) == 1 ) THEN
ierr = cudaDeviceSynchronize()
RETURN
END IF
!
#if defined(__MPI)
#if defined(__GPU_MPI)
msglen = size(msg_d)
ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI
CALL reduce_base_real_gpu( 2 * msglen, msg_d, gid, -1 )
! Sync not needed after MPI call
#else
ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case
msglen = size(msg_h)
CALL reduce_base_real( 2 * msglen, msg_h, gid, -1 )
msg_d = msg_h; DEALLOCATE(msg_h)
ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies
#endif
#endif
END SUBROUTINE mp_sum_cm_gpu
!
!------------------------------------------------------------------------------!
!
SUBROUTINE mp_sum_cm_nc_gpu(msg_d, k1, k2, k3, k4, gid)
! for non-contiguous (nc) arrays
IMPLICIT NONE
COMPLEX (DP), INTENT (INOUT), DEVICE :: msg_d(:,:)
COMPLEX (DP), ALLOCATABLE :: msg_h(:,:)
INTEGER, INTENT (IN) :: k1, k2, k3, k4
INTEGER, INTENT (IN) :: gid
COMPLEX (DP), ALLOCATABLE :: msg_buff(:,:)
#if defined(__GPU_MPI)
ATTRIBUTES(DEVICE) :: msg_buff
#endif
!
INTEGER :: msglen, ierr
! Avoid unnecessary communications on __MPI and syncs SERIAL
IF ( mp_size(gid) == 1 ) THEN
ierr = cudaDeviceSynchronize()
RETURN
END IF
!
ALLOCATE( msg_buff(k2-k1+1,k4-k3+1) )
msg_buff(1:k2-k1+1,1:k4-k3+1) = msg_d(k1:k2,k3:k4)
msglen = size(msg_buff)
!
#if defined(__MPI)
#if defined(__GPU_MPI)
!msglen = size(msg_buff)
ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI
CALL reduce_base_real_gpu( 2 * msglen, msg_buff, gid, -1 )
! No need for final synchronization
#else
!ALLOCATE( msg_h, source=msg_buff ) ! This syncs __MPI case
!msglen = size(msg_h)
!msglen = size(msg_buff)
CALL reduce_base_real( 2 * msglen, msg_buff, gid, -1 )
!msg_buff = msg_h; DEALLOCATE(msg_h)
ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies
#endif
#endif
!
msg_d(k1:k2,k3:k4) = msg_buff(1:k2-k1+1,1:k4-k3+1)
DEALLOCATE( msg_buff )
!
END SUBROUTINE mp_sum_cm_nc_gpu
!
!------------------------------------------------------------------------------!
!
SUBROUTINE mp_sum_cmm_gpu(msg_d, res_d, gid)
IMPLICIT NONE
COMPLEX (DP), INTENT (IN), DEVICE :: msg_d(:,:)
COMPLEX (DP), INTENT (OUT), DEVICE :: res_d(:,:)
COMPLEX (DP), ALLOCATABLE :: msg_h(:,:), res_h(:,:)
INTEGER, INTENT (IN) :: gid
!
INTEGER :: msglen, ierr
#if defined(__MPI)
#if defined(__GPU_MPI)
msglen = size(msg_d)
ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI
CALL reduce_base_real_to_gpu( 2 * msglen, msg_d, res_d, gid, -1 )
RETURN ! Sync not needed after MPI call
#else
ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case
msglen = size(msg_h)
ALLOCATE( res_h(lbound(msg_h,1):ubound(msg_h,1), lbound(msg_h,2):ubound(msg_h,2)));
CALL reduce_base_real_to( 2 * msglen, msg_h, res_h, gid, -1 )
res_d = res_h; DEALLOCATE(msg_h, res_h)
#endif
#else
res_d = msg_d
#endif
ierr = cudaDeviceSynchronize() ! This syncs SERIAL, __MPI
END SUBROUTINE mp_sum_cmm_gpu
!
!------------------------------------------------------------------------------!
!
SUBROUTINE mp_sum_ct_gpu(msg_d,gid)
IMPLICIT NONE
COMPLEX (DP), INTENT (INOUT), DEVICE :: msg_d(:,:,:)
COMPLEX (DP), ALLOCATABLE :: msg_h(:,:,:)
INTEGER, INTENT(IN) :: gid
!
INTEGER :: msglen, ierr
! Avoid unnecessary communications on __MPI and syncs SERIAL
IF ( mp_size(gid) == 1 ) THEN
ierr = cudaDeviceSynchronize()
RETURN
END IF
!
#if defined(__MPI)
#if defined(__GPU_MPI)
msglen = SIZE(msg_d)
ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI
CALL reduce_base_real_gpu( 2 * msglen, msg_d, gid, -1 )
! Sync not needed after MPI call
#else
ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case
msglen = size(msg_h)
CALL reduce_base_real( 2 * msglen, msg_h, gid, -1 )
msg_d = msg_h; DEALLOCATE(msg_h)
ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies
#endif
#endif
END SUBROUTINE mp_sum_ct_gpu
!
!------------------------------------------------------------------------------!
!
SUBROUTINE mp_sum_c4d_gpu(msg_d,gid)
IMPLICIT NONE
COMPLEX (DP), INTENT (INOUT), DEVICE :: msg_d(:,:,:,:)
COMPLEX (DP), ALLOCATABLE :: msg_h(:,:,:,:)
INTEGER, INTENT(IN) :: gid
!
INTEGER :: msglen, ierr
! Avoid unnecessary communications on __MPI and syncs SERIAL
IF ( mp_size(gid) == 1 ) THEN
ierr = cudaDeviceSynchronize()
RETURN
END IF
!
#if defined(__MPI)
#if defined(__GPU_MPI)
msglen = size(msg_d)
ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI
CALL reduce_base_real_gpu( 2 * msglen, msg_d, gid, -1 )
! Sync not needed after MPI call
#else
ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case
msglen = size(msg_h)
CALL reduce_base_real( 2 * msglen, msg_h, gid, -1 )
msg_d = msg_h; DEALLOCATE(msg_h)
ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies
#endif
#endif
END SUBROUTINE mp_sum_c4d_gpu
!
!------------------------------------------------------------------------------!
!
SUBROUTINE mp_sum_c5d_gpu(msg_d,gid)
IMPLICIT NONE
COMPLEX (DP), INTENT (INOUT), DEVICE :: msg_d(:,:,:,:,:)
COMPLEX (DP), ALLOCATABLE :: msg_h(:,:,:,:,:)
INTEGER, INTENT(IN) :: gid
!
INTEGER :: msglen, ierr
! Avoid unnecessary communications on __MPI and syncs SERIAL
IF ( mp_size(gid) == 1 ) THEN
ierr = cudaDeviceSynchronize()
RETURN
END IF
!
#if defined(__MPI)
#if defined(__GPU_MPI)
msglen = size(msg_d)
ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI
CALL reduce_base_real_gpu( 2 * msglen, msg_d, gid, -1 )
! Sync not needed after MPI call
#else
ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case
msglen = size(msg_h)
CALL reduce_base_real( 2 * msglen, msg_h, gid, -1 )
msg_d = msg_h; DEALLOCATE(msg_h)
ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies
#endif
#endif
END SUBROUTINE mp_sum_c5d_gpu
!
!------------------------------------------------------------------------------!
!
SUBROUTINE mp_sum_r5d_gpu(msg_d,gid)
IMPLICIT NONE
REAL (DP), INTENT (INOUT), DEVICE :: msg_d(:,:,:,:,:)
REAL (DP), ALLOCATABLE :: msg_h(:,:,:,:,:)
INTEGER, INTENT(IN) :: gid
!
INTEGER :: msglen, ierr
! Avoid unnecessary communications on __MPI and syncs SERIAL
IF ( mp_size(gid) == 1 ) THEN
ierr = cudaDeviceSynchronize()
RETURN
END IF
!
#if defined(__MPI)
#if defined(__GPU_MPI)
msglen = size(msg_d)
ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI
CALL reduce_base_real_gpu( msglen, msg_d, gid, -1 )
! Sync not needed after MPI call
#else
ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case
msglen = size(msg_h)
CALL reduce_base_real( msglen, msg_h, gid, -1 )
msg_d = msg_h; DEALLOCATE(msg_h)
ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies
#endif
#endif
END SUBROUTINE mp_sum_r5d_gpu
!
!------------------------------------------------------------------------------!
!
SUBROUTINE mp_sum_r6d_gpu(msg_d,gid)
IMPLICIT NONE
REAL (DP), INTENT (INOUT), DEVICE :: msg_d(:,:,:,:,:,:)
REAL (DP), ALLOCATABLE :: msg_h(:,:,:,:,:,:)
INTEGER, INTENT(IN) :: gid
!
INTEGER :: msglen, ierr
! Avoid unnecessary communications on __MPI and syncs SERIAL
IF ( mp_size(gid) == 1 ) THEN
ierr = cudaDeviceSynchronize()
RETURN
END IF
!
#if defined(__MPI)
#if defined(__GPU_MPI)
msglen = size(msg_d)
ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI
CALL reduce_base_real_gpu( msglen, msg_d, gid, -1 )
! Sync not needed after MPI call
#else
ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case
msglen = size(msg_h)
CALL reduce_base_real( msglen, msg_h, gid, -1 )
msg_d = msg_h; DEALLOCATE(msg_h)
ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies
#endif
#endif
END SUBROUTINE mp_sum_r6d_gpu
!
!------------------------------------------------------------------------------!
!
SUBROUTINE mp_sum_c6d_gpu(msg_d,gid)
IMPLICIT NONE
COMPLEX (DP), INTENT (INOUT), DEVICE :: msg_d(:,:,:,:,:,:)
COMPLEX (DP), ALLOCATABLE :: msg_h(:,:,:,:,:,:)
INTEGER, INTENT(IN) :: gid
!
INTEGER :: msglen, ierr
! Avoid unnecessary communications on __MPI and syncs SERIAL
IF ( mp_size(gid) == 1 ) THEN
ierr = cudaDeviceSynchronize()
RETURN
END IF
!
#if defined(__MPI)
#if defined(__GPU_MPI)
msglen = size(msg_d)
ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI
CALL reduce_base_real_gpu( 2 * msglen, msg_d, gid, -1 )
! Sync not needed after MPI call
#else
ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case
msglen = size(msg_h)
CALL reduce_base_real( 2 * msglen, msg_h, gid, -1 )
msg_d = msg_h; DEALLOCATE(msg_h)
ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies
#endif
#endif
END SUBROUTINE mp_sum_c6d_gpu
!
!------------------------------------------------------------------------------!
!
|
|
SUBROUTINE mp_max_i_gpu(msg_d,gid)
|
|
IMPLICIT NONE
|
|
INTEGER, INTENT (INOUT), DEVICE :: msg_d
|
|
INTEGER :: msg_h
|
|
INTEGER, INTENT(IN) :: gid
|
|
!
|
|
INTEGER :: msglen, ierr
|
|
! Avoid unnecessary communications on __MPI and syncs SERIAL
|
|
IF ( mp_size(gid) == 1 ) THEN
|
|
ierr = cudaDeviceSynchronize()
|
|
RETURN
|
|
END IF
|
|
!
|
|
#if defined(__MPI)
|
|
msglen = 1
|
|
#if defined(__GPU_MPI)
|
|
ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI
|
|
CALL parallel_max_integer_gpu( msglen, msg_d, gid, -1 )
|
|
! Sync not needed after MPI call
|
|
#else
|
|
msg_h = msg_d ! This syncs __MPI case
|
|
CALL parallel_max_integer( msglen, msg_h, gid, -1 )
|
|
msg_d = msg_h
|
|
ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies
|
|
#endif
|
|
#endif
|
|
END SUBROUTINE mp_max_i_gpu
|
|
!
|
|
!------------------------------------------------------------------------------!
|
|
!
|
|
SUBROUTINE mp_max_iv_gpu(msg_d,gid)
|
|
IMPLICIT NONE
|
|
INTEGER, INTENT (INOUT), DEVICE :: msg_d(:)
|
|
INTEGER, ALLOCATABLE :: msg_h(:)
|
|
INTEGER, INTENT(IN) :: gid
|
|
!
|
|
INTEGER :: msglen, ierr
|
|
! Avoid unnecessary communications on __MPI and syncs SERIAL
|
|
IF ( mp_size(gid) == 1 ) THEN
|
|
ierr = cudaDeviceSynchronize()
|
|
RETURN
|
|
END IF
|
|
!
|
|
#if defined(__MPI)
|
|
#if defined(__GPU_MPI)
|
|
msglen = size(msg_d)
|
|
ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI
|
|
CALL parallel_max_integer_gpu( msglen, msg_d, gid, -1 )
|
|
! Sync not needed after MPI call
|
|
#else
|
|
ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case
|
|
msglen = size(msg_h)
|
|
CALL parallel_max_integer( msglen, msg_h, gid, -1 )
|
|
msg_d = msg_h; DEALLOCATE(msg_h)
|
|
ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies
|
|
#endif
|
|
#endif
|
|
END SUBROUTINE mp_max_iv_gpu
|
|
!
|
|
!----------------------------------------------------------------------
|
|
!
|
|
SUBROUTINE mp_max_r_gpu(msg_d,gid)
|
|
IMPLICIT NONE
|
|
REAL (DP), INTENT (INOUT), DEVICE :: msg_d
|
|
REAL (DP) :: msg_h
|
|
INTEGER, INTENT(IN) :: gid
|
|
!
|
|
INTEGER :: msglen, ierr
|
|
! Avoid unnecessary communications on __MPI and syncs SERIAL
|
|
IF ( mp_size(gid) == 1 ) THEN
|
|
ierr = cudaDeviceSynchronize()
|
|
RETURN
|
|
END IF
|
|
!
|
|
#if defined(__MPI)
|
|
msglen = 1
|
|
#if defined(__GPU_MPI)
|
|
ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI
|
|
CALL parallel_max_real_gpu( msglen, msg_d, gid, -1 )
|
|
! Sync not needed after MPI call
|
|
#else
|
|
msg_h = msg_d ! This syncs __MPI case
|
|
CALL parallel_max_real( msglen, msg_h, gid, -1 )
|
|
msg_d = msg_h
|
|
ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies
|
|
#endif
|
|
#endif
|
|
END SUBROUTINE mp_max_r_gpu
|
|
!
|
|
!------------------------------------------------------------------------------!
|
|
!
|
|
SUBROUTINE mp_max_rv_gpu(msg_d,gid)
|
|
IMPLICIT NONE
|
|
REAL (DP), INTENT (INOUT), DEVICE :: msg_d(:)
|
|
REAL (DP), ALLOCATABLE :: msg_h(:)
|
|
INTEGER, INTENT(IN) :: gid
|
|
!
|
|
INTEGER :: msglen, ierr
|
|
! Avoid unnecessary communications on __MPI and syncs SERIAL
|
|
IF ( mp_size(gid) == 1 ) THEN
|
|
ierr = cudaDeviceSynchronize()
|
|
RETURN
|
|
END IF
|
|
!
|
|
#if defined(__MPI)
|
|
#if defined(__GPU_MPI)
|
|
msglen = size(msg_d)
|
|
ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI
|
|
CALL parallel_max_real_gpu( msglen, msg_d, gid, -1 )
|
|
! Sync not needed after MPI call
|
|
#else
|
|
ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case
|
|
msglen = size(msg_h)
|
|
CALL parallel_max_real( msglen, msg_h, gid, -1 )
|
|
msg_d = msg_h; DEALLOCATE(msg_h)
|
|
ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies
|
|
#endif
|
|
#endif
|
|
END SUBROUTINE mp_max_rv_gpu
|
|
!
|
|
!------------------------------------------------------------------------------!
|
|
!
|
|
SUBROUTINE mp_min_i_gpu(msg_d,gid)
|
|
IMPLICIT NONE
|
|
INTEGER, INTENT (INOUT), DEVICE :: msg_d
|
|
INTEGER :: msg_h
|
|
INTEGER, INTENT(IN) :: gid
|
|
!
|
|
INTEGER :: msglen, ierr
|
|
! Avoid unnecessary communications on __MPI and syncs SERIAL
|
|
IF ( mp_size(gid) == 1 ) THEN
|
|
ierr = cudaDeviceSynchronize()
|
|
RETURN
|
|
END IF
|
|
!
|
|
#if defined(__MPI)
|
|
msglen = 1
|
|
#if defined(__GPU_MPI)
|
|
ierr = cudaDeviceSynchronize() ! This syncs __GPU_MPI
|
|
CALL parallel_min_integer_gpu( msglen, msg_d, gid, -1 )
|
|
! Sync not needed after MPI call
|
|
#else
|
|
msg_h = msg_d ! This syncs __MPI case
|
|
CALL parallel_min_integer( msglen, msg_h, gid, -1 )
|
|
msg_d = msg_h
|
|
ierr = cudaDeviceSynchronize() ! This syncs __MPI for small copies
|
|
#endif
|
|
#endif
|
|
END SUBROUTINE mp_min_i_gpu
!
!------------------------------------------------------------------------------!
!
      SUBROUTINE mp_min_iv_gpu(msg_d,gid)
        IMPLICIT NONE
        INTEGER, INTENT (INOUT), DEVICE :: msg_d(:)
        INTEGER, ALLOCATABLE :: msg_h(:)
        INTEGER, INTENT(IN) :: gid
        !
        INTEGER :: msglen, ierr
        ! Avoid unnecessary communications on __MPI and syncs SERIAL
        IF ( mp_size(gid) == 1 ) THEN
          ierr = cudaDeviceSynchronize()
          RETURN
        END IF
        !
#if defined(__MPI)
#if defined(__GPU_MPI)
        msglen = SIZE(msg_d)
        ierr = cudaDeviceSynchronize()   ! This syncs __GPU_MPI
        CALL parallel_min_integer_gpu( msglen, msg_d, gid, -1 )
        ! Sync not needed after MPI call
#else
        ALLOCATE( msg_h, source=msg_d )  ! This syncs __MPI case
        msglen = SIZE(msg_h)
        CALL parallel_min_integer( msglen, msg_h, gid, -1 )
        msg_d = msg_h; DEALLOCATE(msg_h)
        ierr = cudaDeviceSynchronize()   ! This syncs __MPI for small copies
#endif
#endif
      END SUBROUTINE mp_min_iv_gpu
!
!------------------------------------------------------------------------------!
!
      SUBROUTINE mp_min_r_gpu(msg_d,gid)
        IMPLICIT NONE
        REAL (DP), INTENT (INOUT), DEVICE :: msg_d
        REAL (DP) :: msg_h
        INTEGER, INTENT(IN) :: gid
        !
        INTEGER :: msglen, ierr
        ! Avoid unnecessary communications on __MPI and syncs SERIAL
        IF ( mp_size(gid) == 1 ) THEN
          ierr = cudaDeviceSynchronize()
          RETURN
        END IF
        !
#if defined(__MPI)
        msglen = 1
#if defined(__GPU_MPI)
        ierr = cudaDeviceSynchronize()   ! This syncs __GPU_MPI
        CALL parallel_min_real_gpu( msglen, msg_d, gid, -1 )
        ! Sync not needed after MPI call
#else
        msg_h = msg_d                    ! This syncs __MPI case
        CALL parallel_min_real( msglen, msg_h, gid, -1 )
        msg_d = msg_h
        ierr = cudaDeviceSynchronize()   ! This syncs __MPI for small copies
#endif
#endif
      END SUBROUTINE mp_min_r_gpu
!
!------------------------------------------------------------------------------!
!
      SUBROUTINE mp_min_rv_gpu(msg_d,gid)
        IMPLICIT NONE
        REAL (DP), INTENT (INOUT), DEVICE :: msg_d(:)
        REAL (DP), ALLOCATABLE :: msg_h(:)
        INTEGER, INTENT(IN) :: gid
        !
        INTEGER :: msglen, ierr
        ! Avoid unnecessary communications on __MPI and syncs SERIAL
        IF ( mp_size(gid) == 1 ) THEN
          ierr = cudaDeviceSynchronize()
          RETURN
        END IF
        !
#if defined(__MPI)
#if defined(__GPU_MPI)
        msglen = SIZE(msg_d)
        ierr = cudaDeviceSynchronize()   ! This syncs __GPU_MPI
        CALL parallel_min_real_gpu( msglen, msg_d, gid, -1 )
        ! Sync not needed after MPI call
#else
        ALLOCATE( msg_h, source=msg_d )  ! This syncs __MPI case
        msglen = SIZE(msg_h)
        CALL parallel_min_real( msglen, msg_h, gid, -1 )
        msg_d = msg_h; DEALLOCATE(msg_h)
        ierr = cudaDeviceSynchronize()   ! This syncs __MPI for small copies
#endif
#endif
      END SUBROUTINE mp_min_rv_gpu
!
!------------------------------------------------------------------------------!
!..mp_gather
!
      SUBROUTINE mp_gather_i1_gpu(mydata_d, alldata_d, root, gid)
        USE parallel_include
        IMPLICIT NONE
        INTEGER, DEVICE :: mydata_d
        INTEGER, INTENT(IN) :: gid, root
        INTEGER :: group
        INTEGER, INTENT(OUT), DEVICE :: alldata_d(:)
        INTEGER :: ierr
        !
#if defined (__MPI)
#if ! defined(__GPU_MPI)
        INTEGER :: mydata_h
        INTEGER, ALLOCATABLE :: alldata_h(:)
        ALLOCATE( alldata_h, source=alldata_d )  ! This syncs __MPI
        mydata_h = mydata_d
        CALL mp_gather_i1(mydata_h, alldata_h, root, gid)
        mydata_d = mydata_h; alldata_d = alldata_h
        DEALLOCATE(alldata_h)
#else
        group = gid
        ierr = cudaDeviceSynchronize()   ! This syncs __GPU_MPI
        CALL MPI_GATHER(mydata_d, 1, MPI_INTEGER, alldata_d, 1, MPI_INTEGER, root, group, IERR)
        IF (ierr/=0) CALL mp_stop( 9051 )
        RETURN ! Sync not needed after MPI call
#endif
#else
        !alldata_d(1) = mydata_d
        ierr = cudaMemcpy( alldata_d(1), mydata_d, 1, cudaMemcpyDeviceToDevice )
        IF (ierr/=0) CALL mp_stop( 9052 )
#endif
        ierr = cudaDeviceSynchronize()   ! This syncs SERIAL, __MPI
      END SUBROUTINE mp_gather_i1_gpu
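!
! NOTE on the serial branch above: the count argument of CUDA Fortran's
! generic cudaMemcpy is given in array *elements*, not bytes, hence the
! single integer is copied with a count of 1.
!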
!
!------------------------------------------------------------------------------!
!
      SUBROUTINE mp_gather_iv_gpu(mydata_d, alldata_d, root, gid)
        USE parallel_include
        IMPLICIT NONE
        INTEGER, DEVICE :: mydata_d(:)
        INTEGER, INTENT(IN) :: gid, root
        INTEGER :: group
        INTEGER, INTENT(OUT), DEVICE :: alldata_d(:,:)
        INTEGER :: msglen, ierr
        !
#if defined (__MPI)
#if ! defined(__GPU_MPI)
        INTEGER, ALLOCATABLE :: mydata_h(:)
        INTEGER, ALLOCATABLE :: alldata_h(:,:)
        ALLOCATE( mydata_h, source=mydata_d )    ! This syncs __MPI
        ALLOCATE( alldata_h, source=alldata_d )
        !
        CALL mp_gather_iv(mydata_h, alldata_h, root, gid)
        mydata_d = mydata_h; alldata_d = alldata_h
        DEALLOCATE(alldata_h, mydata_h)
#else
        msglen = SIZE(mydata_d)
        IF( msglen .NE. SIZE(alldata_d, 1) ) CALL mp_stop( 9053 )
        group = gid
        ierr = cudaDeviceSynchronize()   ! This syncs __GPU_MPI
        CALL MPI_GATHER(mydata_d, msglen, MPI_INTEGER, alldata_d, msglen, MPI_INTEGER, root, group, IERR)
        IF (ierr/=0) CALL mp_stop( 9054 )
        RETURN ! Sync not needed after MPI call
#endif
#else
        msglen = SIZE(mydata_d)
        IF( msglen .NE. SIZE(alldata_d, 1) ) CALL mp_stop( 9055 )
        !alldata_d(:,1) = mydata_d(:)
        ierr = cudaMemcpy( alldata_d(:,1), mydata_d(1), msglen, cudaMemcpyDeviceToDevice )
        IF (ierr/=0) CALL mp_stop( 9056 )
#endif
        ierr = cudaDeviceSynchronize()   ! This syncs SERIAL, __MPI
      END SUBROUTINE mp_gather_iv_gpu
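!
! Illustrative gather of one integer per rank onto the root (a sketch;
! "comm" and "nproc" are hypothetical names; with __CUDA the device
! arguments are expected to select mp_gather_i1_gpu):
!
!   INTEGER, DEVICE :: val_d, all_d(nproc)
!   CALL mp_gather( val_d, all_d, 0, comm )   ! all_d significant on rank 0 only
!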
!
!------------------------------------------------------------------------------!
!..mp_gatherv_rv
!
      SUBROUTINE mp_gatherv_rv_gpu( mydata_d, alldata_d, recvcount, displs, root, gid)
        USE parallel_include
        IMPLICIT NONE
        REAL(DP), DEVICE :: mydata_d(:)
        REAL(DP), DEVICE :: alldata_d(:)
        INTEGER, INTENT(IN) :: recvcount(:), displs(:), root
        INTEGER, INTENT(IN) :: gid
        INTEGER :: group
        INTEGER :: ierr, npe, myid
#if defined (__MPI)
#if ! defined(__GPU_MPI)
        REAL(DP), ALLOCATABLE :: mydata_h(:)
        REAL(DP), ALLOCATABLE :: alldata_h(:)
        !
        ALLOCATE(mydata_h, source=mydata_d)      ! This syncs __MPI
        ALLOCATE(alldata_h, source=alldata_d)
        CALL mp_gatherv_rv( mydata_h, alldata_h, recvcount, displs, root, gid)
        alldata_d = alldata_h ; mydata_d = mydata_h
        DEALLOCATE(alldata_h, mydata_h)
#else
        group = gid
        CALL mpi_comm_size( group, npe, ierr )
        IF (ierr/=0) CALL mp_stop( 9057 )
        CALL mpi_comm_rank( group, myid, ierr )
        IF (ierr/=0) CALL mp_stop( 9058 )
        !
        IF ( SIZE( recvcount ) < npe .OR. SIZE( displs ) < npe ) CALL mp_stop( 9059 )
        IF ( myid == root ) THEN
          IF ( SIZE( alldata_d ) < displs( npe ) + recvcount( npe ) ) CALL mp_stop( 9060 )
        END IF
        IF ( SIZE( mydata_d ) < recvcount( myid + 1 ) ) CALL mp_stop( 9061 )
        !
        ierr = cudaDeviceSynchronize()   ! This syncs __GPU_MPI
        CALL MPI_GATHERV( mydata_d, recvcount( myid + 1 ), MPI_DOUBLE_PRECISION, &
                          alldata_d, recvcount, displs, MPI_DOUBLE_PRECISION, root, group, ierr )
        IF (ierr/=0) CALL mp_stop( 9062 )
        RETURN ! Sync not needed after MPI call
#endif
#else
        IF ( SIZE( alldata_d ) < recvcount( 1 ) ) CALL mp_stop( 9063 )
        IF ( SIZE( mydata_d ) < recvcount( 1 ) ) CALL mp_stop( 9064 )
        !
        !alldata_d( 1:recvcount( 1 ) ) = mydata_d( 1:recvcount( 1 ) )
        ierr = cudaMemcpy( alldata_d(1), mydata_d(1), recvcount( 1 ), cudaMemcpyDeviceToDevice )
        IF (ierr/=0) CALL mp_stop( 9065 )
#endif
        ierr = cudaDeviceSynchronize()   ! This syncs SERIAL, __MPI
      END SUBROUTINE mp_gatherv_rv_gpu
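!
! Illustrative variable-count gather (a sketch; "nloc", "ntot", "nproc" and
! "comm" are hypothetical names; mp_set_displs builds the displacements from
! the per-rank counts):
!
!   REAL(DP), DEVICE :: mine_d(nloc), all_d(ntot)
!   CALL mp_set_displs( recvcount, displs, ntot, nproc )
!   CALL mp_gatherv_rv_gpu( mine_d, all_d, recvcount, displs, 0, comm )
!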
!
!------------------------------------------------------------------------------!
!..mp_gatherv_cv
!
      SUBROUTINE mp_gatherv_cv_gpu( mydata_d, alldata_d, recvcount, displs, root, gid)
        USE parallel_include
        IMPLICIT NONE
        COMPLEX(DP), DEVICE :: mydata_d(:)
        COMPLEX(DP), DEVICE :: alldata_d(:)
        INTEGER, INTENT(IN) :: recvcount(:), displs(:), root
        INTEGER, INTENT(IN) :: gid
        INTEGER :: group
        INTEGER :: ierr, npe, myid
        !
#if defined (__MPI)
#if ! defined(__GPU_MPI)
        COMPLEX(DP), ALLOCATABLE :: mydata_h(:)
        COMPLEX(DP), ALLOCATABLE :: alldata_h(:)
        !
        ALLOCATE(mydata_h, source=mydata_d)      ! This syncs __MPI
        ALLOCATE(alldata_h, source=alldata_d)
        CALL mp_gatherv_cv( mydata_h, alldata_h, recvcount, displs, root, gid)
        alldata_d = alldata_h ; mydata_d = mydata_h
        DEALLOCATE(alldata_h, mydata_h)
#else
        group = gid
        CALL mpi_comm_size( group, npe, ierr )
        IF (ierr/=0) CALL mp_stop( 9066 )
        CALL mpi_comm_rank( group, myid, ierr )
        IF (ierr/=0) CALL mp_stop( 9067 )
        !
        IF ( SIZE( recvcount ) < npe .OR. SIZE( displs ) < npe ) CALL mp_stop( 9068 )
        IF ( myid == root ) THEN
          IF ( SIZE( alldata_d ) < displs( npe ) + recvcount( npe ) ) CALL mp_stop( 9069 )
        END IF
        IF ( SIZE( mydata_d ) < recvcount( myid + 1 ) ) CALL mp_stop( 9070 )
        !
        ierr = cudaDeviceSynchronize()   ! This syncs __GPU_MPI
        CALL MPI_GATHERV( mydata_d, recvcount( myid + 1 ), MPI_DOUBLE_COMPLEX, &
                          alldata_d, recvcount, displs, MPI_DOUBLE_COMPLEX, root, group, ierr )
        IF (ierr/=0) CALL mp_stop( 9071 )
        RETURN ! Sync not needed after MPI call
#endif
#else
        IF ( SIZE( alldata_d ) < recvcount( 1 ) ) CALL mp_stop( 9072 )
        IF ( SIZE( mydata_d ) < recvcount( 1 ) ) CALL mp_stop( 9073 )
        !
        !alldata_d( 1:recvcount( 1 ) ) = mydata_d( 1:recvcount( 1 ) )
        ierr = cudaMemcpy( alldata_d(1), mydata_d(1), recvcount( 1 ), cudaMemcpyDeviceToDevice )
        IF (ierr/=0) CALL mp_stop( 9135 )
#endif
        ierr = cudaDeviceSynchronize()   ! This syncs SERIAL, __MPI
      END SUBROUTINE mp_gatherv_cv_gpu
!
!------------------------------------------------------------------------------!
!..mp_gatherv_iv
!
      SUBROUTINE mp_gatherv_iv_gpu( mydata_d, alldata_d, recvcount, displs, root, gid)
        USE parallel_include
        IMPLICIT NONE
        INTEGER, DEVICE :: mydata_d(:)
        INTEGER, DEVICE :: alldata_d(:)
        INTEGER, INTENT(IN) :: recvcount(:), displs(:), root
        INTEGER, INTENT(IN) :: gid
        INTEGER :: group
        INTEGER :: ierr, npe, myid
        !
#if defined (__MPI)
#if ! defined(__GPU_MPI)
        INTEGER, ALLOCATABLE :: mydata_h(:)
        INTEGER, ALLOCATABLE :: alldata_h(:)
        !
        ALLOCATE(mydata_h, source=mydata_d)      ! This syncs __MPI
        ALLOCATE(alldata_h, source=alldata_d)
        CALL mp_gatherv_iv( mydata_h, alldata_h, recvcount, displs, root, gid)
        alldata_d = alldata_h ; mydata_d = mydata_h
        DEALLOCATE(alldata_h, mydata_h)
#else
        group = gid
        CALL mpi_comm_size( group, npe, ierr )
        IF (ierr/=0) CALL mp_stop( 9074 )
        CALL mpi_comm_rank( group, myid, ierr )
        IF (ierr/=0) CALL mp_stop( 9075 )
        !
        IF ( SIZE( recvcount ) < npe .OR. SIZE( displs ) < npe ) CALL mp_stop( 9076 )
        IF ( myid == root ) THEN
          IF ( SIZE( alldata_d ) < displs( npe ) + recvcount( npe ) ) CALL mp_stop( 9077 )
        END IF
        IF ( SIZE( mydata_d ) < recvcount( myid + 1 ) ) CALL mp_stop( 9078 )
        !
        ierr = cudaDeviceSynchronize()   ! This syncs __GPU_MPI
        CALL MPI_GATHERV( mydata_d, recvcount( myid + 1 ), MPI_INTEGER, &
                          alldata_d, recvcount, displs, MPI_INTEGER, root, group, ierr )
        IF (ierr/=0) CALL mp_stop( 9079 )
        RETURN ! Sync not needed after MPI call
#endif
#else
        IF ( SIZE( alldata_d ) < recvcount( 1 ) ) CALL mp_stop( 9080 )
        IF ( SIZE( mydata_d ) < recvcount( 1 ) ) CALL mp_stop( 9081 )
        !
        !alldata_d( 1:recvcount( 1 ) ) = mydata_d( 1:recvcount( 1 ) )
        ierr = cudaMemcpy( alldata_d(1), mydata_d(1), recvcount( 1 ), cudaMemcpyDeviceToDevice )
        IF (ierr/=0) CALL mp_stop( 9136 )
#endif
        ierr = cudaDeviceSynchronize()   ! This syncs SERIAL, __MPI
      END SUBROUTINE mp_gatherv_iv_gpu
!
!------------------------------------------------------------------------------!
!..mp_gatherv_rm
!
      SUBROUTINE mp_gatherv_rm_gpu( mydata_d, alldata_d, recvcount, displs, root, gid)
        USE parallel_include
        IMPLICIT NONE
        REAL(DP), DEVICE :: mydata_d(:,:)  ! Warning: the first dimension is assumed to be constant!
        REAL(DP), DEVICE :: alldata_d(:,:)
        INTEGER, INTENT(IN) :: recvcount(:), displs(:), root
        INTEGER, INTENT(IN) :: gid
        INTEGER :: group
        INTEGER :: ierr, npe, myid
        INTEGER, ALLOCATABLE :: nrecv(:), ndisp(:)
        !
#if defined (__MPI)
#if ! defined(__GPU_MPI)
        REAL(DP), ALLOCATABLE :: mydata_h(:,:)
        REAL(DP), ALLOCATABLE :: alldata_h(:,:)
        !
        ALLOCATE(mydata_h, source=mydata_d)      ! This syncs __MPI
        ALLOCATE(alldata_h, source=alldata_d)
        CALL mp_gatherv_rm( mydata_h, alldata_h, recvcount, displs, root, gid)
        alldata_d = alldata_h ; mydata_d = mydata_h
        DEALLOCATE(alldata_h, mydata_h)
#else
        group = gid
        CALL mpi_comm_size( group, npe, ierr )
        IF (ierr/=0) CALL mp_stop( 9082 )
        CALL mpi_comm_rank( group, myid, ierr )
        IF (ierr/=0) CALL mp_stop( 9083 )
        !
        IF ( SIZE( recvcount ) < npe .OR. SIZE( displs ) < npe ) CALL mp_stop( 9084 )
        IF ( myid == root ) THEN
          IF ( SIZE( alldata_d, 2 ) < displs( npe ) + recvcount( npe ) ) CALL mp_stop( 9085 )
          IF ( SIZE( alldata_d, 1 ) /= SIZE( mydata_d, 1 ) ) CALL mp_stop( 9086 )
        END IF
        IF ( SIZE( mydata_d, 2 ) < recvcount( myid + 1 ) ) CALL mp_stop( 9087 )
        !
        ALLOCATE( nrecv( npe ), ndisp( npe ) )
        !
        nrecv( 1:npe ) = recvcount( 1:npe ) * SIZE( mydata_d, 1 )
        ndisp( 1:npe ) = displs( 1:npe ) * SIZE( mydata_d, 1 )
        !
        ierr = cudaDeviceSynchronize()   ! This syncs __GPU_MPI
        CALL MPI_GATHERV( mydata_d, nrecv( myid + 1 ), MPI_DOUBLE_PRECISION, &
                          alldata_d, nrecv, ndisp, MPI_DOUBLE_PRECISION, root, group, ierr )
        IF (ierr/=0) CALL mp_stop( 9088 )
        !
        DEALLOCATE( nrecv, ndisp )
        !
        RETURN ! Sync not needed after MPI call
#endif
#else
        IF ( SIZE( alldata_d, 1 ) /= SIZE( mydata_d, 1 ) ) CALL mp_stop( 9089 )
        IF ( SIZE( alldata_d, 2 ) < recvcount( 1 ) ) CALL mp_stop( 9090 )
        IF ( SIZE( mydata_d, 2 ) < recvcount( 1 ) ) CALL mp_stop( 9091 )
        !
        !alldata_d( :, 1:recvcount( 1 ) ) = mydata_d( :, 1:recvcount( 1 ) )
        ierr = cudaMemcpy2D( alldata_d, SIZE(alldata_d,1), &
                             mydata_d,  SIZE(mydata_d,1),  &
                             SIZE(mydata_d,1), recvcount( 1 ), &
                             cudaMemcpyDeviceToDevice )
        IF (ierr/=0) CALL mp_stop( 9092 )
#endif
        ierr = cudaDeviceSynchronize()   ! This syncs SERIAL, __MPI
      END SUBROUTINE mp_gatherv_rm_gpu
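!
! NOTE: for the matrix variants the recvcount and displs arguments count
! *columns*; nrecv and ndisp above rescale them by the (constant) first
! dimension to obtain the element counts MPI_GATHERV expects.
!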
!
!------------------------------------------------------------------------------!
!..mp_gatherv_im
!
      SUBROUTINE mp_gatherv_im_gpu( mydata_d, alldata_d, recvcount, displs, root, gid)
        USE parallel_include
        IMPLICIT NONE
        INTEGER, DEVICE :: mydata_d(:,:)  ! Warning: the first dimension is assumed to be constant!
        INTEGER, DEVICE :: alldata_d(:,:)
        INTEGER, INTENT(IN) :: recvcount(:), displs(:), root
        INTEGER, INTENT(IN) :: gid
        INTEGER :: group
        INTEGER :: ierr, npe, myid
        INTEGER, ALLOCATABLE :: nrecv(:), ndisp(:)
        !
#if defined (__MPI)
#if ! defined(__GPU_MPI)
        INTEGER, ALLOCATABLE :: mydata_h(:,:)
        INTEGER, ALLOCATABLE :: alldata_h(:,:)
        !
        ALLOCATE(mydata_h, source=mydata_d)      ! This syncs __MPI
        ALLOCATE(alldata_h, source=alldata_d)
        CALL mp_gatherv_im( mydata_h, alldata_h, recvcount, displs, root, gid)
        alldata_d = alldata_h ; mydata_d = mydata_h
        DEALLOCATE(alldata_h, mydata_h)
#else
        group = gid
        CALL mpi_comm_size( group, npe, ierr )
        IF (ierr/=0) CALL mp_stop( 9093 )
        CALL mpi_comm_rank( group, myid, ierr )
        IF (ierr/=0) CALL mp_stop( 9094 )
        !
        IF ( SIZE( recvcount ) < npe .OR. SIZE( displs ) < npe ) CALL mp_stop( 9095 )
        IF ( myid == root ) THEN
          IF ( SIZE( alldata_d, 2 ) < displs( npe ) + recvcount( npe ) ) CALL mp_stop( 9096 )
          IF ( SIZE( alldata_d, 1 ) /= SIZE( mydata_d, 1 ) ) CALL mp_stop( 9097 )
        END IF
        IF ( SIZE( mydata_d, 2 ) < recvcount( myid + 1 ) ) CALL mp_stop( 9098 )
        !
        ALLOCATE( nrecv( npe ), ndisp( npe ) )
        !
        nrecv( 1:npe ) = recvcount( 1:npe ) * SIZE( mydata_d, 1 )
        ndisp( 1:npe ) = displs( 1:npe ) * SIZE( mydata_d, 1 )
        !
        ierr = cudaDeviceSynchronize()   ! This syncs __GPU_MPI
        CALL MPI_GATHERV( mydata_d, nrecv( myid + 1 ), MPI_INTEGER, &
                          alldata_d, nrecv, ndisp, MPI_INTEGER, root, group, ierr )
        IF (ierr/=0) CALL mp_stop( 9099 )
        !
        DEALLOCATE( nrecv, ndisp )
        !
        RETURN ! Sync not needed after MPI call
#endif
#else
        IF ( SIZE( alldata_d, 1 ) /= SIZE( mydata_d, 1 ) ) CALL mp_stop( 9100 )
        IF ( SIZE( alldata_d, 2 ) < recvcount( 1 ) ) CALL mp_stop( 9101 )
        IF ( SIZE( mydata_d, 2 ) < recvcount( 1 ) ) CALL mp_stop( 9102 )
        !
        !alldata_d( :, 1:recvcount( 1 ) ) = mydata_d( :, 1:recvcount( 1 ) )
        ierr = cudaMemcpy2D( alldata_d, SIZE(alldata_d,1), &
                             mydata_d,  SIZE(mydata_d,1),  &
                             SIZE(mydata_d,1), recvcount( 1 ), &
                             cudaMemcpyDeviceToDevice )
        IF (ierr/=0) CALL mp_stop( 9103 )
#endif
        ierr = cudaDeviceSynchronize()   ! This syncs SERIAL, __MPI
      END SUBROUTINE mp_gatherv_im_gpu
!
!------------------------------------------------------------------------------!
!
      SUBROUTINE mp_alltoall_c3d_gpu( sndbuf_d, rcvbuf_d, gid )
        USE parallel_include
        IMPLICIT NONE
        COMPLEX(DP), DEVICE :: sndbuf_d( :, :, : )
        COMPLEX(DP), DEVICE :: rcvbuf_d( :, :, : )
        INTEGER, INTENT(IN) :: gid
        INTEGER :: nsiz, group, ierr, npe
        !
#if defined (__MPI)
#if ! defined(__GPU_MPI)
        COMPLEX(DP), ALLOCATABLE :: sndbuf_h(:,:,:)
        COMPLEX(DP), ALLOCATABLE :: rcvbuf_h(:,:,:)
        !
        ALLOCATE(sndbuf_h, source=sndbuf_d)      ! This syncs __MPI
        ALLOCATE(rcvbuf_h, source=rcvbuf_d)
        CALL mp_alltoall_c3d( sndbuf_h, rcvbuf_h, gid )
        sndbuf_d = sndbuf_h ; rcvbuf_d = rcvbuf_h
        DEALLOCATE(sndbuf_h, rcvbuf_h)
#else
        group = gid
        !
        CALL mpi_comm_size( group, npe, ierr )
        IF (ierr/=0) CALL mp_stop( 9104 )
        !
        IF ( SIZE( sndbuf_d, 3 ) < npe ) CALL mp_stop( 9105 )
        IF ( SIZE( rcvbuf_d, 3 ) < npe ) CALL mp_stop( 9106 )
        !
        nsiz = SIZE( sndbuf_d, 1 ) * SIZE( sndbuf_d, 2 )
        !
        ierr = cudaDeviceSynchronize()   ! This syncs __GPU_MPI
        !
        CALL MPI_ALLTOALL( sndbuf_d, nsiz, MPI_DOUBLE_COMPLEX, &
                           rcvbuf_d, nsiz, MPI_DOUBLE_COMPLEX, group, ierr )
        IF (ierr/=0) CALL mp_stop( 9107 )
        RETURN ! Sync not needed after MPI call
#endif
#else
        rcvbuf_d = sndbuf_d
#endif
        ierr = cudaDeviceSynchronize()   ! This syncs SERIAL, __MPI
      END SUBROUTINE mp_alltoall_c3d_gpu
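!
! Illustrative all-to-all exchange (a sketch; names are hypothetical): each
! rank sends slice sndbuf_d(:,:,ip) to rank ip-1 and receives slice
! rcvbuf_d(:,:,ip) from rank ip-1, with nsiz = n1*n2 elements per slice:
!
!   COMPLEX(DP), DEVICE :: sndbuf_d(n1,n2,npe), rcvbuf_d(n1,n2,npe)
!   CALL mp_alltoall( sndbuf_d, rcvbuf_d, comm )
!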
!
!------------------------------------------------------------------------------!
!
      SUBROUTINE mp_alltoall_i3d_gpu( sndbuf_d, rcvbuf_d, gid )
        USE parallel_include
        IMPLICIT NONE
        INTEGER, DEVICE :: sndbuf_d( :, :, : )
        INTEGER, DEVICE :: rcvbuf_d( :, :, : )
        INTEGER, INTENT(IN) :: gid
        INTEGER :: nsiz, group, ierr, npe
        !
#if defined (__MPI)
#if ! defined(__GPU_MPI)
        INTEGER, ALLOCATABLE :: sndbuf_h(:,:,:)
        INTEGER, ALLOCATABLE :: rcvbuf_h(:,:,:)
        !
        ALLOCATE(sndbuf_h, source=sndbuf_d)      ! This syncs __MPI
        ALLOCATE(rcvbuf_h, source=rcvbuf_d)
        CALL mp_alltoall_i3d( sndbuf_h, rcvbuf_h, gid )
        sndbuf_d = sndbuf_h ; rcvbuf_d = rcvbuf_h
        DEALLOCATE(sndbuf_h, rcvbuf_h)
#else
        group = gid
        !
        CALL mpi_comm_size( group, npe, ierr )
        IF (ierr/=0) CALL mp_stop( 9108 )
        !
        IF ( SIZE( sndbuf_d, 3 ) < npe ) CALL mp_stop( 9109 )
        IF ( SIZE( rcvbuf_d, 3 ) < npe ) CALL mp_stop( 9110 )
        !
        nsiz = SIZE( sndbuf_d, 1 ) * SIZE( sndbuf_d, 2 )
        !
        ierr = cudaDeviceSynchronize()   ! This syncs __GPU_MPI
        !
        CALL MPI_ALLTOALL( sndbuf_d, nsiz, MPI_INTEGER, &
                           rcvbuf_d, nsiz, MPI_INTEGER, group, ierr )
        IF (ierr/=0) CALL mp_stop( 9111 )
        RETURN ! Sync not needed after MPI call
#endif
#else
        rcvbuf_d = sndbuf_d
#endif
        ierr = cudaDeviceSynchronize()   ! This syncs SERIAL, __MPI
      END SUBROUTINE mp_alltoall_i3d_gpu
!
!------------------------------------------------------------------------------!
!
      SUBROUTINE mp_circular_shift_left_i0_gpu( buf_d, itag, gid )
        USE parallel_include
        IMPLICIT NONE
        INTEGER, DEVICE :: buf_d
        INTEGER, INTENT(IN) :: itag
        INTEGER, INTENT(IN) :: gid
        INTEGER :: group, ierr, npe, sour, dest, mype
        !
#if defined (__MPI)
#if ! defined(__GPU_MPI)
        INTEGER :: buf_h
        buf_h = buf_d                    ! This syncs __MPI
        CALL mp_circular_shift_left_i0( buf_h, itag, gid )
        buf_d = buf_h
#else
        INTEGER :: istatus( mpi_status_size )
        !
        group = gid
        !
        CALL mpi_comm_size( group, npe, ierr )
        IF (ierr/=0) CALL mp_stop( 9112 )
        CALL mpi_comm_rank( group, mype, ierr )
        IF (ierr/=0) CALL mp_stop( 9113 )
        !
        sour = mype + 1
        IF( sour == npe ) sour = 0
        dest = mype - 1
        IF( dest == -1 ) dest = npe - 1
        !
        ierr = cudaDeviceSynchronize()   ! This syncs __GPU_MPI
        CALL MPI_Sendrecv_replace( buf_d, 1, MPI_INTEGER, &
                                   dest, itag, sour, itag, group, istatus, ierr)
        !
        IF (ierr/=0) CALL mp_stop( 9114 )
        !
        RETURN ! Sync not needed after MPI call
#endif
#else
        ! do nothing
#endif
        ierr = cudaDeviceSynchronize()   ! This syncs SERIAL, __MPI
      END SUBROUTINE mp_circular_shift_left_i0_gpu
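!
! NOTE: the circular shifts form a ring: every rank sends its buffer to
! mype-1 (wrapping to npe-1) and receives from mype+1 (wrapping to 0), so
! after npe shifts each buffer is back where it started.
!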
!
!------------------------------------------------------------------------------!
!
      SUBROUTINE mp_circular_shift_left_i1_gpu( buf_d, itag, gid )
        USE parallel_include
        IMPLICIT NONE
        INTEGER, DEVICE :: buf_d(:)
        INTEGER, INTENT(IN) :: itag
        INTEGER, INTENT(IN) :: gid
        INTEGER :: group, ierr, npe, sour, dest, mype
        !
#if defined (__MPI)
#if ! defined(__GPU_MPI)
        INTEGER, ALLOCATABLE :: buf_h(:)
        ALLOCATE(buf_h, source=buf_d)    ! This syncs __MPI
        CALL mp_circular_shift_left_i1( buf_h, itag, gid )
        buf_d = buf_h; DEALLOCATE(buf_h)
#else
        INTEGER :: istatus( mpi_status_size )
        !
        group = gid
        !
        CALL mpi_comm_size( group, npe, ierr )
        IF (ierr/=0) CALL mp_stop( 9115 )
        CALL mpi_comm_rank( group, mype, ierr )
        IF (ierr/=0) CALL mp_stop( 9116 )
        !
        sour = mype + 1
        IF( sour == npe ) sour = 0
        dest = mype - 1
        IF( dest == -1 ) dest = npe - 1
        !
        ierr = cudaDeviceSynchronize()   ! This syncs __GPU_MPI
        CALL MPI_Sendrecv_replace( buf_d, SIZE(buf_d), MPI_INTEGER, &
                                   dest, itag, sour, itag, group, istatus, ierr)
        !
        IF (ierr/=0) CALL mp_stop( 9117 )
        !
        RETURN ! Sync not needed after MPI call
#endif
#else
        ! do nothing
#endif
        ierr = cudaDeviceSynchronize()   ! This syncs SERIAL, __MPI
      END SUBROUTINE mp_circular_shift_left_i1_gpu
!
!------------------------------------------------------------------------------!
!
      SUBROUTINE mp_circular_shift_left_i2_gpu( buf_d, itag, gid )
        USE parallel_include
        IMPLICIT NONE
        INTEGER, DEVICE :: buf_d(:,:)
        INTEGER, INTENT(IN) :: itag
        INTEGER, INTENT(IN) :: gid
        INTEGER :: group, ierr, npe, sour, dest, mype
        !
#if defined (__MPI)
#if ! defined(__GPU_MPI)
        INTEGER, ALLOCATABLE :: buf_h(:,:)
        ALLOCATE(buf_h, source=buf_d)    ! This syncs __MPI
        CALL mp_circular_shift_left_i2( buf_h, itag, gid )
        buf_d = buf_h; DEALLOCATE(buf_h)
#else
        INTEGER :: istatus( mpi_status_size )
        !
        group = gid
        !
        CALL mpi_comm_size( group, npe, ierr )
        IF (ierr/=0) CALL mp_stop( 9118 )
        CALL mpi_comm_rank( group, mype, ierr )
        IF (ierr/=0) CALL mp_stop( 9119 )
        !
        sour = mype + 1
        IF( sour == npe ) sour = 0
        dest = mype - 1
        IF( dest == -1 ) dest = npe - 1
        !
        ierr = cudaDeviceSynchronize()   ! This syncs __GPU_MPI
        CALL MPI_Sendrecv_replace( buf_d, SIZE(buf_d), MPI_INTEGER, &
                                   dest, itag, sour, itag, group, istatus, ierr)
        !
        IF (ierr/=0) CALL mp_stop( 9120 )
        !
        RETURN ! Sync not needed after MPI call
#endif
#else
        ! do nothing
#endif
        ierr = cudaDeviceSynchronize()   ! This syncs SERIAL, __MPI
      END SUBROUTINE mp_circular_shift_left_i2_gpu
!
!------------------------------------------------------------------------------!
!
      SUBROUTINE mp_circular_shift_left_r2d_gpu( buf_d, itag, gid )
        USE parallel_include
        IMPLICIT NONE
        REAL(DP), DEVICE :: buf_d( :, : )
        INTEGER, INTENT(IN) :: itag
        INTEGER, INTENT(IN) :: gid
        INTEGER :: group, ierr, npe, sour, dest, mype
        !
#if defined (__MPI)
#if ! defined(__GPU_MPI)
        REAL(DP), ALLOCATABLE :: buf_h(:, :)
        ALLOCATE(buf_h, source=buf_d)    ! This syncs __MPI
        CALL mp_circular_shift_left_r2d( buf_h, itag, gid )
        buf_d = buf_h; DEALLOCATE(buf_h)
#else
        INTEGER :: istatus( mpi_status_size )
        !
        group = gid
        !
        CALL mpi_comm_size( group, npe, ierr )
        IF (ierr/=0) CALL mp_stop( 9121 )
        CALL mpi_comm_rank( group, mype, ierr )
        IF (ierr/=0) CALL mp_stop( 9122 )
        !
        sour = mype + 1
        IF( sour == npe ) sour = 0
        dest = mype - 1
        IF( dest == -1 ) dest = npe - 1
        !
        ierr = cudaDeviceSynchronize()   ! This syncs __GPU_MPI
        CALL MPI_Sendrecv_replace( buf_d, SIZE(buf_d), MPI_DOUBLE_PRECISION, &
                                   dest, itag, sour, itag, group, istatus, ierr)
        !
        IF (ierr/=0) CALL mp_stop( 9123 )
        !
        RETURN ! Sync not needed after MPI call
#endif
#else
        ! do nothing
#endif
        ierr = cudaDeviceSynchronize()   ! This syncs SERIAL, __MPI
      END SUBROUTINE mp_circular_shift_left_r2d_gpu
!
!------------------------------------------------------------------------------!
!
      SUBROUTINE mp_circular_shift_left_c2d_gpu( buf_d, itag, gid )
        USE parallel_include
        IMPLICIT NONE
        COMPLEX(DP), DEVICE :: buf_d( :, : )
        INTEGER, INTENT(IN) :: itag
        INTEGER, INTENT(IN) :: gid
        INTEGER :: group, ierr, npe, sour, dest, mype
        !
#if defined (__MPI)
#if ! defined(__GPU_MPI)
        COMPLEX(DP), ALLOCATABLE :: buf_h(:, :)
        ALLOCATE(buf_h, source=buf_d)    ! This syncs __MPI
        CALL mp_circular_shift_left_c2d( buf_h, itag, gid )
        buf_d = buf_h; DEALLOCATE(buf_h)
#else
        INTEGER :: istatus( mpi_status_size )
        !
        group = gid
        !
        CALL mpi_comm_size( group, npe, ierr )
        IF (ierr/=0) CALL mp_stop( 9124 )
        CALL mpi_comm_rank( group, mype, ierr )
        IF (ierr/=0) CALL mp_stop( 9125 )
        !
        sour = mype + 1
        IF( sour == npe ) sour = 0
        dest = mype - 1
        IF( dest == -1 ) dest = npe - 1
        !
        ierr = cudaDeviceSynchronize()   ! This syncs __GPU_MPI
        CALL MPI_Sendrecv_replace( buf_d, SIZE(buf_d), MPI_DOUBLE_COMPLEX, &
                                   dest, itag, sour, itag, group, istatus, ierr)
        !
        IF (ierr/=0) CALL mp_stop( 9126 )
        !
        RETURN ! Sync not needed after MPI call
#endif
#else
        ! do nothing
#endif
        ierr = cudaDeviceSynchronize()   ! This syncs SERIAL, __MPI
      END SUBROUTINE mp_circular_shift_left_c2d_gpu
!
!------------------------------------------------------------------------------!
!..mp_gatherv_inplace_cplx_array
!
      SUBROUTINE mp_gatherv_inplace_cplx_array_gpu(alldata_d, my_column_type, recvcount, displs, root, gid)
        USE parallel_include
        IMPLICIT NONE
        COMPLEX(DP), DEVICE :: alldata_d(:,:)
        INTEGER, INTENT(IN) :: my_column_type
        INTEGER, INTENT(IN) :: recvcount(:), displs(:)
        INTEGER, INTENT(IN) :: root, gid
        INTEGER :: ierr, npe, myid
        !
#if defined (__MPI)
#if ! defined(__GPU_MPI)
        COMPLEX(DP), ALLOCATABLE :: alldata_h(:, :)
        !
        ! Avoid unnecessary communications on __MPI
        IF ( mp_size(gid) == 1 ) THEN
          ierr = cudaDeviceSynchronize()
          RETURN
        END IF
        !
        ALLOCATE(alldata_h, source=alldata_d)    ! This syncs __MPI
        CALL mp_gatherv_inplace_cplx_array(alldata_h, my_column_type, recvcount, displs, root, gid)
        alldata_d = alldata_h; DEALLOCATE(alldata_h)
        ierr = cudaDeviceSynchronize()   ! This syncs in case of small data chunks
        RETURN
#else
        CALL mpi_comm_size( gid, npe, ierr )
        IF (ierr/=0) CALL mp_stop( 9127 )
        CALL mpi_comm_rank( gid, myid, ierr )
        IF (ierr/=0) CALL mp_stop( 9128 )
        !
        IF ( SIZE( recvcount ) < npe .OR. SIZE( displs ) < npe ) CALL mp_stop( 9129 )
        !
        ierr = cudaDeviceSynchronize()   ! This syncs __GPU_MPI
        IF (myid==root) THEN
          CALL MPI_GATHERV( MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, &
                            alldata_d, recvcount, displs, my_column_type, root, gid, ierr )
        ELSE
          CALL MPI_GATHERV( alldata_d(1,displs(myid+1)+1), recvcount(myid+1), my_column_type, &
                            MPI_IN_PLACE, recvcount, displs, MPI_DATATYPE_NULL, root, gid, ierr )
        ENDIF
        !
        IF (ierr/=0) CALL mp_stop( 9130 )
        !
        RETURN ! Sync not needed after MPI call
#endif
#endif
        ierr = cudaDeviceSynchronize()   ! This syncs SERIAL
      END SUBROUTINE mp_gatherv_inplace_cplx_array_gpu
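!
! NOTE on the __GPU_MPI branch above: the gather is done in place. The root
! passes MPI_IN_PLACE as its send buffer (its own block already sits at the
! right offset of alldata_d), while every other rank sends the column block
! starting at displs(myid+1)+1 and its receive arguments are ignored. The
! allgatherv variants below use the same MPI_IN_PLACE idiom.
!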
!
!------------------------------------------------------------------------------!
!..mp_allgatherv_inplace_cplx_array
!
      SUBROUTINE mp_allgatherv_inplace_cplx_array_gpu(alldata_d, my_element_type, recvcount, displs, gid)
        USE parallel_include
        IMPLICIT NONE
        COMPLEX(DP), DEVICE :: alldata_d(:,:)
        INTEGER, INTENT(IN) :: my_element_type
        INTEGER, INTENT(IN) :: recvcount(:), displs(:)
        INTEGER, INTENT(IN) :: gid
        INTEGER :: ierr, npe, myid
        !
#if defined (__MPI)
#if ! defined(__GPU_MPI)
        COMPLEX(DP), ALLOCATABLE :: alldata_h(:, :)
        !
        ! Avoid unnecessary communications on __MPI
        IF ( mp_size(gid) == 1 ) THEN
          ierr = cudaDeviceSynchronize()
          RETURN
        END IF
        !
        ALLOCATE(alldata_h, source=alldata_d)    ! This syncs __MPI
        CALL mp_allgatherv_inplace_cplx_array(alldata_h, my_element_type, recvcount, displs, gid)
        alldata_d = alldata_h; DEALLOCATE(alldata_h)
        ierr = cudaDeviceSynchronize()   ! This syncs in case of small data chunks
        RETURN
#else
        CALL mpi_comm_size( gid, npe, ierr )
        IF (ierr/=0) CALL mp_stop( 9131 )
        CALL mpi_comm_rank( gid, myid, ierr )
        IF (ierr/=0) CALL mp_stop( 9132 )
        !
        IF ( SIZE( recvcount ) < npe .OR. SIZE( displs ) < npe ) CALL mp_stop( 9133 )
        !
        ierr = cudaDeviceSynchronize()   ! This syncs __GPU_MPI
        CALL MPI_ALLGATHERV( MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, &
                             alldata_d, recvcount, displs, my_element_type, gid, ierr )
        IF (ierr/=0) CALL mp_stop( 9134 )
        RETURN ! Sync not needed after MPI call
#endif
#endif
        ierr = cudaDeviceSynchronize()   ! This syncs SERIAL, __MPI
      END SUBROUTINE mp_allgatherv_inplace_cplx_array_gpu
!
!------------------------------------------------------------------------------!
!..mp_allgatherv_inplace_real_array
!
      SUBROUTINE mp_allgatherv_inplace_real_array_gpu(alldata_d, my_element_type, recvcount, displs, gid)
        USE parallel_include
        IMPLICIT NONE
        REAL(DP), DEVICE :: alldata_d(:,:)
        INTEGER, INTENT(IN) :: my_element_type
        INTEGER, INTENT(IN) :: recvcount(:), displs(:)
        INTEGER, INTENT(IN) :: gid
        INTEGER :: ierr, npe, myid
        !
#if defined (__MPI)
#if ! defined(__GPU_MPI)
        REAL(DP), ALLOCATABLE :: alldata_h(:, :)
        !
        ! Avoid unnecessary communications on __MPI
        IF ( mp_size(gid) == 1 ) THEN
          ierr = cudaDeviceSynchronize()
          RETURN
        END IF
        !
        ALLOCATE(alldata_h, source=alldata_d)    ! This syncs __MPI
        CALL mp_allgatherv_inplace_real_array(alldata_h, my_element_type, recvcount, displs, gid)
        alldata_d = alldata_h; DEALLOCATE(alldata_h)
        ierr = cudaDeviceSynchronize()   ! This syncs in case of small data chunks
        RETURN
#else
        CALL mpi_comm_size( gid, npe, ierr )
        IF (ierr/=0) CALL mp_stop( 9137 )
        CALL mpi_comm_rank( gid, myid, ierr )
        IF (ierr/=0) CALL mp_stop( 9138 )
        !
        IF ( SIZE( recvcount ) < npe .OR. SIZE( displs ) < npe ) CALL mp_stop( 9139 )
        !
        ierr = cudaDeviceSynchronize()   ! This syncs __GPU_MPI
        CALL MPI_ALLGATHERV( MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, &
                             alldata_d, recvcount, displs, my_element_type, gid, ierr )
        IF (ierr/=0) CALL mp_stop( 9140 )
        RETURN ! Sync not needed after MPI call
#endif
#endif
        ierr = cudaDeviceSynchronize()   ! This syncs SERIAL, __MPI
      END SUBROUTINE mp_allgatherv_inplace_real_array_gpu
!
      SUBROUTINE mp_type_create_cplx_column_section_gpu(dummy, start, length, stride, mytype)
        USE parallel_include
        IMPLICIT NONE
        !
        COMPLEX (DP), DEVICE, INTENT(IN) :: dummy
        INTEGER, INTENT(IN) :: start, length, stride
        INTEGER, INTENT(OUT) :: mytype
        !
#if defined(__MPI)
        INTEGER :: ierr
        !
        CALL MPI_TYPE_CREATE_SUBARRAY(1, [stride], [length], [start], &
                                      MPI_ORDER_FORTRAN, MPI_DOUBLE_COMPLEX, mytype, ierr)
        IF (ierr/=0) CALL mp_stop( 8081 )
        CALL MPI_Type_commit(mytype, ierr)
        IF (ierr/=0) CALL mp_stop( 8082 )
#else
        mytype = 0
#endif
        !
        RETURN
      END SUBROUTINE mp_type_create_cplx_column_section_gpu
!
      SUBROUTINE mp_type_create_real_column_section_gpu(dummy, start, length, stride, mytype)
        USE parallel_include
        IMPLICIT NONE
        !
        REAL (DP), DEVICE, INTENT(IN) :: dummy
        INTEGER, INTENT(IN) :: start, length, stride
        INTEGER, INTENT(OUT) :: mytype
        !
#if defined(__MPI)
        INTEGER :: ierr
        !
        CALL MPI_TYPE_CREATE_SUBARRAY(1, [stride], [length], [start], &
                                      MPI_ORDER_FORTRAN, MPI_DOUBLE_PRECISION, mytype, ierr)
        IF (ierr/=0) CALL mp_stop( 8083 )
        CALL MPI_Type_commit(mytype, ierr)
        IF (ierr/=0) CALL mp_stop( 8084 )
#else
        mytype = 0
#endif
        !
        RETURN
      END SUBROUTINE mp_type_create_real_column_section_gpu
!
      SUBROUTINE mp_type_create_cplx_row_section_gpu(dummy, column_start, column_stride, row_length, mytype)
        USE parallel_include
        IMPLICIT NONE
        !
        COMPLEX (DP), DEVICE, INTENT(IN) :: dummy
        INTEGER, INTENT(IN) :: column_start, column_stride, row_length
        INTEGER, INTENT(OUT) :: mytype
        !
#if defined(__MPI)
        INTEGER :: ierr, temporary
        INTEGER :: strides(2), lengths(2), starts(2)
        INTEGER(KIND=MPI_ADDRESS_KIND) :: lb, extent
        !
        strides(1) = column_stride ; strides(2) = row_length
        lengths(1) = 1             ; lengths(2) = row_length
        starts(1)  = column_start  ; starts(2)  = 0
        CALL MPI_TYPE_CREATE_SUBARRAY(2, strides, lengths, starts, MPI_ORDER_FORTRAN, &
                                      MPI_DOUBLE_COMPLEX, temporary, ierr)
        IF (ierr/=0) CALL mp_stop( 8085 )
        CALL MPI_TYPE_GET_EXTENT(MPI_DOUBLE_COMPLEX, lb, extent, ierr)
        IF (ierr/=0) CALL mp_stop( 8085 )
        CALL MPI_TYPE_COMMIT(temporary, ierr)
        IF (ierr/=0) CALL mp_stop( 8085 )
        CALL MPI_TYPE_CREATE_RESIZED(temporary, lb, extent, mytype, ierr)
        IF (ierr/=0) CALL mp_stop( 8086 )
        CALL MPI_Type_commit(mytype, ierr)
        IF (ierr/=0) CALL mp_stop( 8086 )
#else
        mytype = 0
#endif
        !
        RETURN
      END SUBROUTINE mp_type_create_cplx_row_section_gpu
!
      SUBROUTINE mp_type_create_real_row_section_gpu(dummy, column_start, column_stride, row_length, mytype)
        USE parallel_include
        IMPLICIT NONE
        !
        REAL (DP), DEVICE, INTENT(IN) :: dummy
        INTEGER, INTENT(IN) :: column_start, column_stride, row_length
        INTEGER, INTENT(OUT) :: mytype
        !
#if defined(__MPI)
        INTEGER :: ierr, temporary
        INTEGER :: strides(2), lengths(2), starts(2)
        INTEGER(KIND=MPI_ADDRESS_KIND) :: lb, extent
        !
        strides(1) = column_stride ; strides(2) = row_length
        lengths(1) = 1             ; lengths(2) = row_length
        starts(1)  = column_start  ; starts(2)  = 0
        CALL MPI_TYPE_CREATE_SUBARRAY(2, strides, lengths, starts, MPI_ORDER_FORTRAN, &
                                      MPI_DOUBLE_PRECISION, temporary, ierr)
        IF (ierr/=0) CALL mp_stop( 8087 )
        CALL MPI_TYPE_GET_EXTENT(MPI_DOUBLE_PRECISION, lb, extent, ierr)
        IF (ierr/=0) CALL mp_stop( 8087 )
        CALL MPI_TYPE_COMMIT(temporary, ierr)
        IF (ierr/=0) CALL mp_stop( 8087 )
        CALL MPI_TYPE_CREATE_RESIZED(temporary, lb, extent, mytype, ierr)
        IF (ierr/=0) CALL mp_stop( 8088 )
        CALL MPI_Type_commit(mytype, ierr)
        IF (ierr/=0) CALL mp_stop( 8088 )
#else
        mytype = 0
#endif
        !
        RETURN
      END SUBROUTINE mp_type_create_real_row_section_gpu
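!
! Illustrative pairing of a column-section type with the in-place gathers
! above (a sketch; "nrow", "col_t", "root" and "comm" are hypothetical
! names; with __CUDA the device dummy argument is expected to select the
! *_gpu specific of mp_type_create_column_section):
!
!   CALL mp_type_create_column_section( alldata_d(1,1), 0, nrow, nrow, col_t )
!   CALL mp_gatherv_inplace_cplx_array_gpu( alldata_d, col_t, recvcount, &
!                                           displs, root, comm )
!   CALL mp_type_free( col_t )
!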
!------------------------------------------------------------------------------!
#endif
!------------------------------------------------------------------------------!
END MODULE mp
!------------------------------------------------------------------------------!
!
! Script to generate stop messages:
! # coding: utf-8
! import re
! import sys
! i = 8000
! def replace(match):
!     global i
!     i += 1
!     return 'mp_stop( {0} )'.format(i)
!
! with open(sys.argv[1],'r') as f:
!     data = re.sub(r"mp_stop\(\s?\d+\s?\)", replace, f.read())
! with open(sys.argv[1]+'.new','w') as fo:
!     fo.write(data)