mirror of https://gitlab.com/QEF/q-e.git
- eliminated module task groups (subroutines moved to fft modules)
- added variables to fft type to store info about data distribution git-svn-id: http://qeforge.qe-forge.org/svn/q-e/trunk/espresso@7433 c92efa57-630b-4861-b058-cf58834340f0
This commit is contained in:
parent
aa3843a34a
commit
8fb1ad0998
|
@ -70,7 +70,6 @@ sic.o \
|
|||
splinelib.o \
|
||||
stick_base.o \
|
||||
stick_set.o \
|
||||
task_groups.o \
|
||||
timestep.o \
|
||||
version.o \
|
||||
upf.o \
|
||||
|
|
|
@ -39,6 +39,7 @@
|
|||
PUBLIC :: dfftp, dffts, dfftb, fft_dlay_descriptor
|
||||
PUBLIC :: cgather_sym, cgather_smooth, cscatter_sym, cscatter_smooth
|
||||
PUBLIC :: gather_smooth, scatter_smooth
|
||||
PUBLIC :: tg_gather
|
||||
|
||||
|
||||
|
||||
|
@ -1164,6 +1165,67 @@ SUBROUTINE scatter_smooth( f_in, f_out )
|
|||
!
|
||||
END SUBROUTINE scatter_smooth
|
||||
|
||||
|
||||
!
|
||||
SUBROUTINE tg_gather( dffts, v, tg_v )
|
||||
!
|
||||
USE parallel_include
|
||||
!
|
||||
USE fft_types, ONLY : fft_dlay_descriptor
|
||||
|
||||
! T.G.
|
||||
! NOGRP: Number of processors per orbital task group
|
||||
|
||||
IMPLICIT NONE
|
||||
|
||||
TYPE(fft_dlay_descriptor), INTENT(in) :: dffts
|
||||
|
||||
REAL(DP) :: v(:)
|
||||
REAL(DP) :: tg_v(:)
|
||||
|
||||
INTEGER :: nsiz, i, ierr, nsiz_tg
|
||||
INTEGER :: recv_cnt( dffts%nogrp ), recv_displ( dffts%nogrp )
|
||||
|
||||
nsiz_tg = dffts%tg_nnr * dffts%nogrp
|
||||
|
||||
IF( size( tg_v ) < nsiz_tg ) &
|
||||
CALL errore( ' tg_gather ', ' tg_v too small ', ( nsiz_tg - size( tg_v ) ) )
|
||||
|
||||
nsiz = dffts%npp( dffts%myid+1 ) * dffts%nr1x * dffts%nr2x
|
||||
|
||||
IF( size( v ) < nsiz ) &
|
||||
CALL errore( ' tg_gather ', ' v too small ', ( nsiz - size( v ) ) )
|
||||
|
||||
!
|
||||
! The potential in v is distributed accros all processors
|
||||
! We need to redistribute it so that it is completely contained in the
|
||||
! processors of an orbital TASK-GROUP
|
||||
!
|
||||
recv_cnt(1) = dffts%npp( dffts%nolist(1) + 1 ) * dffts%nr1x * dffts%nr2x
|
||||
recv_displ(1) = 0
|
||||
DO i = 2, dffts%nogrp
|
||||
recv_cnt(i) = dffts%npp( dffts%nolist(i) + 1 ) * dffts%nr1x * dffts%nr2x
|
||||
recv_displ(i) = recv_displ(i-1) + recv_cnt(i-1)
|
||||
ENDDO
|
||||
|
||||
! clean only elements that will not be overwritten
|
||||
!
|
||||
DO i = recv_displ(dffts%nogrp) + recv_cnt( dffts%nogrp ) + 1, size( tg_v )
|
||||
tg_v( i ) = 0.0d0
|
||||
ENDDO
|
||||
|
||||
#if defined (__PARA) && defined (__MPI)
|
||||
|
||||
CALL MPI_Allgatherv( v(1), nsiz, MPI_DOUBLE_PRECISION, &
|
||||
tg_v(1), recv_cnt, recv_displ, MPI_DOUBLE_PRECISION, dffts%ogrp_comm, IERR)
|
||||
|
||||
IF( ierr /= 0 ) &
|
||||
CALL errore( ' tg_gather ', ' MPI_Allgatherv ', abs( ierr ) )
|
||||
|
||||
#endif
|
||||
|
||||
END SUBROUTINE tg_gather
|
||||
|
||||
!=----------------------------------------------------------------------=!
|
||||
END MODULE fft_base
|
||||
!=----------------------------------------------------------------------=!
|
||||
|
|
|
@ -52,8 +52,20 @@ MODULE fft_types
|
|||
INTEGER, POINTER :: imax3(:) ! the last local plane
|
||||
INTEGER, POINTER :: np3(:) ! number of local plane for the box fft
|
||||
!
|
||||
! fft parallelization
|
||||
!
|
||||
INTEGER :: myid ! my processor id
|
||||
INTEGER :: comm ! communicator of the fft gruop
|
||||
INTEGER :: nproc ! number of processor in the fft group
|
||||
!
|
||||
! task groups
|
||||
!
|
||||
INTEGER :: NOGRP
|
||||
INTEGER :: NPGRP
|
||||
INTEGER :: ogrp_comm
|
||||
INTEGER :: pgrp_comm
|
||||
INTEGER, POINTER :: nolist(:) ! number of sticks per task group ( wave func )
|
||||
!
|
||||
LOGICAL :: have_task_groups
|
||||
INTEGER :: tg_nnr ! maximum among nnr
|
||||
INTEGER, POINTER :: tg_nsw(:) ! number of sticks per task group ( wave func )
|
||||
|
@ -72,9 +84,9 @@ MODULE fft_types
|
|||
|
||||
CONTAINS
|
||||
|
||||
SUBROUTINE fft_dlay_allocate( desc, nproc, nx, ny )
|
||||
SUBROUTINE fft_dlay_allocate( desc, myid, nproc, comm, nx, ny )
|
||||
TYPE (fft_dlay_descriptor) :: desc
|
||||
INTEGER, INTENT(in) :: nproc, nx, ny
|
||||
INTEGER, INTENT(in) :: myid, nproc, comm, nx, ny ! myid starting from 0
|
||||
ALLOCATE( desc%nsp( nproc ) )
|
||||
ALLOCATE( desc%nsw( nproc ) )
|
||||
ALLOCATE( desc%ngl( nproc ) )
|
||||
|
@ -101,7 +113,15 @@ CONTAINS
|
|||
|
||||
desc%id = 0
|
||||
|
||||
desc%myid = myid
|
||||
desc%comm = comm
|
||||
desc%nproc = nproc
|
||||
desc%have_task_groups = .false.
|
||||
desc%NOGRP = 0
|
||||
desc%NPGRP = 0
|
||||
desc%ogrp_comm = 0
|
||||
desc%pgrp_comm = 0
|
||||
NULLIFY( desc%nolist )
|
||||
NULLIFY( desc%tg_nsw )
|
||||
NULLIFY( desc%tg_npp )
|
||||
NULLIFY( desc%tg_snd )
|
||||
|
@ -128,6 +148,7 @@ CONTAINS
|
|||
IF ( associated( desc%iplw ) ) DEALLOCATE( desc%iplw )
|
||||
desc%id = 0
|
||||
IF( desc%have_task_groups ) THEN
|
||||
IF ( associated( desc%nolist ) ) DEALLOCATE( desc%nolist )
|
||||
IF ( associated( desc%tg_nsw ) ) DEALLOCATE( desc%tg_nsw )
|
||||
IF ( associated( desc%tg_npp ) ) DEALLOCATE( desc%tg_npp )
|
||||
IF ( associated( desc%tg_snd ) ) DEALLOCATE( desc%tg_snd )
|
||||
|
@ -141,9 +162,9 @@ CONTAINS
|
|||
|
||||
!=----------------------------------------------------------------------------=!
|
||||
|
||||
SUBROUTINE fft_box_allocate( desc, nproc, nat )
|
||||
SUBROUTINE fft_box_allocate( desc, myid, nproc, comm, nat )
|
||||
TYPE (fft_dlay_descriptor) :: desc
|
||||
INTEGER, INTENT(in) :: nat, nproc
|
||||
INTEGER, INTENT(in) :: nat, nproc, myid, comm ! myid starting from 0
|
||||
ALLOCATE( desc%irb( 3, nat ) )
|
||||
ALLOCATE( desc%imin3( nat ) )
|
||||
ALLOCATE( desc%imax3( nat ) )
|
||||
|
@ -156,6 +177,9 @@ CONTAINS
|
|||
desc%npp = 0
|
||||
desc%ipp = 0
|
||||
desc%np3 = 0
|
||||
desc%myid = myid
|
||||
desc%nproc = nproc
|
||||
desc%comm = comm
|
||||
desc%have_task_groups = .false.
|
||||
END SUBROUTINE fft_box_allocate
|
||||
|
||||
|
@ -174,7 +198,7 @@ CONTAINS
|
|||
!=----------------------------------------------------------------------------=!
|
||||
|
||||
SUBROUTINE fft_dlay_set( desc, tk, nst, nr1, nr2, nr3, nr1x, nr2x, nr3x, me, &
|
||||
nproc, nogrp, ub, lb, idx, in1, in2, ncp, ncpw, ngp, ngpw, st, stw )
|
||||
nproc, comm, nogrp, ub, lb, idx, in1, in2, ncp, ncpw, ngp, ngpw, st, stw )
|
||||
|
||||
TYPE (fft_dlay_descriptor) :: desc
|
||||
|
||||
|
@ -184,6 +208,7 @@ CONTAINS
|
|||
INTEGER, INTENT(in) :: nr1x, nr2x, nr3x ! padded size of real space grid
|
||||
INTEGER, INTENT(in) :: me ! processor index (starting from 1)
|
||||
INTEGER, INTENT(in) :: nproc ! number of processors
|
||||
INTEGER, INTENT(in) :: comm ! communicator
|
||||
INTEGER, INTENT(in) :: nogrp ! number of processors in task-group
|
||||
INTEGER, INTENT(in) :: ub(3), lb(3) ! upper and lower bound of real space indices
|
||||
INTEGER, INTENT(in) :: idx(:)
|
||||
|
@ -216,6 +241,15 @@ CONTAINS
|
|||
IF( ( size( ncp ) < nproc ) .or. ( size( ngp ) < nproc ) ) &
|
||||
CALL errore( ' fft_dlay_set ', ' wrong stick dimensions ', 4 )
|
||||
|
||||
IF( desc%nproc /= nproc ) &
|
||||
CALL errore( ' fft_dlay_set ', ' wrong number of processor ', 4 )
|
||||
|
||||
IF( desc%myid /= (me - 1) ) &
|
||||
CALL errore( ' fft_dlay_set ', ' wrong processor index ', 4 )
|
||||
|
||||
IF( desc%comm /= comm ) &
|
||||
CALL errore( ' fft_dlay_set ', ' wrong communicator ', 4 )
|
||||
|
||||
desc%have_task_groups = .false.
|
||||
|
||||
! Set the number of "xy" planes for each processor
|
||||
|
@ -430,13 +464,13 @@ CONTAINS
|
|||
!=----------------------------------------------------------------------------=!
|
||||
|
||||
SUBROUTINE fft_box_set( desc, nr1b, nr2b, nr3b, nr1bx, nr2bx, nr3bx, nat, &
|
||||
irb, me, nproc, npp, ipp )
|
||||
irb, me, nproc, comm, npp, ipp )
|
||||
|
||||
IMPLICIT NONE
|
||||
|
||||
TYPE (fft_dlay_descriptor) :: desc
|
||||
|
||||
INTEGER, INTENT(in) :: nat, me, nproc
|
||||
INTEGER, INTENT(in) :: nat, me, nproc, comm
|
||||
INTEGER, INTENT(in) :: irb( :, : )
|
||||
INTEGER, INTENT(in) :: npp( : )
|
||||
INTEGER, INTENT(in) :: ipp( : )
|
||||
|
@ -452,6 +486,16 @@ CONTAINS
|
|||
IF( nproc > size( desc%npp ) ) &
|
||||
CALL errore(" fft_box_set ", " inconsistent dimensions ", 2 )
|
||||
|
||||
IF( desc%nproc /= nproc ) &
|
||||
CALL errore( ' fft_dlay_set ', ' wrong number of processor ', 4 )
|
||||
|
||||
IF( desc%myid /= (me - 1) ) &
|
||||
CALL errore( ' fft_dlay_set ', ' wrong processor index ', 4 )
|
||||
|
||||
IF( desc%comm /= comm ) &
|
||||
CALL errore( ' fft_dlay_set ', ' wrong communicator ', 4 )
|
||||
|
||||
|
||||
desc%nr1 = nr1b
|
||||
desc%nr2 = nr2b
|
||||
desc%nr3 = nr3b
|
||||
|
|
|
@ -215,18 +215,13 @@ splinelib.o : kind.o
|
|||
stick_base.o : io_global.o
|
||||
stick_base.o : kind.o
|
||||
stick_base.o : mp.o
|
||||
stick_base.o : mp_global.o
|
||||
stick_set.o : fft_types.o
|
||||
stick_set.o : griddim.o
|
||||
stick_set.o : io_global.o
|
||||
stick_set.o : kind.o
|
||||
stick_set.o : mp_global.o
|
||||
stick_set.o : parallel_include.o
|
||||
stick_set.o : stick_base.o
|
||||
task_groups.o : fft_types.o
|
||||
task_groups.o : io_global.o
|
||||
task_groups.o : kind.o
|
||||
task_groups.o : mp_global.o
|
||||
task_groups.o : parallel_include.o
|
||||
timestep.o : kind.o
|
||||
upf.o : ../iotk/src/iotk_module.o
|
||||
upf.o : kind.o
|
||||
|
|
|
@ -22,7 +22,10 @@
|
|||
USE io_global, ONLY: ionode, stdout
|
||||
USE fft_types, ONLY: fft_dlay_descriptor, fft_dlay_allocate, &
|
||||
fft_dlay_set, fft_dlay_scalar
|
||||
USE mp_global, ONLY: me_pool, nproc_pool, intra_pool_comm, nogrp
|
||||
USE mp_global, ONLY: me_pool, nproc_pool, intra_pool_comm, nogrp, use_task_groups
|
||||
USE mp_global, ONLY : me_pool, nproc_pool, intra_pool_comm
|
||||
USE mp_global, ONLY : NOGRP, NPGRP, ogrp_comm, pgrp_comm
|
||||
USE mp_global, ONLY : nolist
|
||||
|
||||
PRIVATE
|
||||
SAVE
|
||||
|
@ -196,13 +199,13 @@
|
|||
|
||||
#if defined __PARA
|
||||
|
||||
CALL fft_dlay_allocate( dfftp, nproc_pool, nr1x, nr2x )
|
||||
CALL fft_dlay_allocate( dffts, nproc_pool, nr1sx, nr2sx )
|
||||
CALL fft_dlay_allocate( dfftp, me_pool, nproc_pool, intra_pool_comm, nr1x, nr2x )
|
||||
CALL fft_dlay_allocate( dffts, me_pool, nproc_pool, intra_pool_comm, nr1sx, nr2sx )
|
||||
|
||||
CALL fft_dlay_set( dfftp, tk, nst, nr1, nr2, nr3, nr1x, nr2x, nr3x, (me_pool+1), &
|
||||
nproc_pool, nogrp, ub, lb, idx, ist(:,1), ist(:,2), nstp, nstpw, sstp, sstpw, st, stw )
|
||||
nproc_pool, intra_pool_comm, nogrp, ub, lb, idx, ist(:,1), ist(:,2), nstp, nstpw, sstp, sstpw, st, stw )
|
||||
CALL fft_dlay_set( dffts, tk, nsts, nr1s, nr2s, nr3s, nr1sx, nr2sx, nr3sx, (me_pool+1), &
|
||||
nproc_pool, nogrp, ub, lb, idx, ist(:,1), ist(:,2), nstps, nstpw, sstps, sstpw, sts, stw )
|
||||
nproc_pool, intra_pool_comm, nogrp, ub, lb, idx, ist(:,1), ist(:,2), nstps, nstpw, sstps, sstpw, sts, stw )
|
||||
|
||||
#else
|
||||
|
||||
|
@ -215,8 +218,8 @@
|
|||
IF( ngm_ /= ngm ) CALL errore( ' pstickset ', ' inconsistent ngm ', abs( ngm - ngm_ ) )
|
||||
IF( ngs_ /= ngs ) CALL errore( ' pstickset ', ' inconsistent ngs ', abs( ngs - ngs_ ) )
|
||||
|
||||
CALL fft_dlay_allocate( dfftp, nproc_pool, max(nr1x, nr3x), nr2x )
|
||||
CALL fft_dlay_allocate( dffts, nproc_pool, max(nr1sx, nr3sx), nr2sx )
|
||||
CALL fft_dlay_allocate( dfftp, me_pool, nproc_pool, intra_pool_comm, max(nr1x, nr3x), nr2x )
|
||||
CALL fft_dlay_allocate( dffts, me_pool, nproc_pool, intra_pool_comm, max(nr1sx, nr3sx), nr2sx )
|
||||
|
||||
CALL fft_dlay_scalar( dfftp, ub, lb, nr1, nr2, nr3, nr1x, nr2x, nr3x, stw )
|
||||
CALL fft_dlay_scalar( dffts, ub, lb, nr1s, nr2s, nr3s, nr1sx, nr2sx, nr3sx, stw )
|
||||
|
@ -231,6 +234,15 @@
|
|||
! ... Maximum number of sticks (wave func.)
|
||||
nstpwx = maxval( nstpw )
|
||||
|
||||
IF( use_task_groups ) THEN
|
||||
!
|
||||
! Initialize task groups.
|
||||
! Note that this call modify dffts adding task group data.
|
||||
!
|
||||
CALL task_groups_init( dffts )
|
||||
!
|
||||
END IF
|
||||
|
||||
IF (ionode) WRITE( stdout,118)
|
||||
118 FORMAT(3X,' n.st n.stw n.sts n.g n.gw n.gs')
|
||||
WRITE( stdout,121) minval(nstp), minval(nstpw), minval(nstps), minval(sstp), minval(sstpw), minval(sstps)
|
||||
|
@ -264,6 +276,118 @@
|
|||
RETURN
|
||||
END SUBROUTINE pstickset
|
||||
|
||||
|
||||
!-----------------------------------------
|
||||
! Task groups Contributed by C. Bekas, October 2005
|
||||
! Revised by C. Cavazzoni
|
||||
!--------------------------------------------
|
||||
|
||||
SUBROUTINE task_groups_init( dffts )
|
||||
|
||||
USE parallel_include
|
||||
!
|
||||
USE io_global, ONLY : stdout
|
||||
USE fft_types, ONLY : fft_dlay_descriptor
|
||||
|
||||
! T.G.
|
||||
! NPGRP: Number of processors per group
|
||||
! NOGRP: Number of processors per orbital task group
|
||||
|
||||
IMPLICIT NONE
|
||||
|
||||
TYPE(fft_dlay_descriptor), INTENT(inout) :: dffts
|
||||
|
||||
!----------------------------------
|
||||
!Local Variables declaration
|
||||
!----------------------------------
|
||||
|
||||
INTEGER :: I
|
||||
INTEGER :: IERR
|
||||
INTEGER :: num_planes, num_sticks
|
||||
INTEGER :: nnrsx_vec ( nproc_pool )
|
||||
INTEGER :: pgroup( nproc_pool )
|
||||
INTEGER :: strd
|
||||
|
||||
!
|
||||
IF ( nogrp > 1 ) WRITE( stdout, 100 ) nogrp, npgrp
|
||||
|
||||
100 FORMAT( /,3X,'Task Groups are in USE',/,3X,'groups and procs/group : ',I5,I5 )
|
||||
|
||||
!Find maximum chunk of local data concerning coefficients of eigenfunctions in g-space
|
||||
|
||||
#if defined __MPI
|
||||
CALL MPI_Allgather( dffts%nnr, 1, MPI_INTEGER, nnrsx_vec, 1, MPI_INTEGER, intra_pool_comm, IERR)
|
||||
strd = maxval( nnrsx_vec( 1:nproc_pool ) )
|
||||
#else
|
||||
strd = dffts%nnr
|
||||
#endif
|
||||
|
||||
IF( strd /= dffts%tg_nnr ) CALL errore( ' task_groups_init ', ' inconsistent nnr ', 1 )
|
||||
|
||||
!-------------------------------------------------------------------------------------
|
||||
!C. Bekas...TASK GROUP RELATED. FFT DATA STRUCTURES ARE ALREADY DEFINED ABOVE
|
||||
!-------------------------------------------------------------------------------------
|
||||
!dfft%nsw(me) holds the number of z-sticks for the current processor per wave-function
|
||||
!We can either send these in the group with an mpi_allgather...or put the
|
||||
!in the PSIS vector (in special positions) and send them with them.
|
||||
!Otherwise we can do this once at the beginning, before the loop.
|
||||
!we choose to do the latter one.
|
||||
!-------------------------------------------------------------------------------------
|
||||
!
|
||||
dffts%nogrp = nogrp
|
||||
dffts%npgrp = npgrp
|
||||
dffts%ogrp_comm = ogrp_comm
|
||||
dffts%pgrp_comm = pgrp_comm
|
||||
!
|
||||
ALLOCATE( dffts%tg_nsw(nproc_pool))
|
||||
ALLOCATE( dffts%tg_npp(nproc_pool))
|
||||
ALLOCATE( dffts%nolist(nogrp))
|
||||
|
||||
num_sticks = 0
|
||||
num_planes = 0
|
||||
DO i = 1, nogrp
|
||||
dffts%nolist( i ) = nolist( i )
|
||||
num_sticks = num_sticks + dffts%nsw( nolist(i) + 1 )
|
||||
num_planes = num_planes + dffts%npp( nolist(i) + 1 )
|
||||
ENDDO
|
||||
|
||||
#if defined __MPI
|
||||
CALL MPI_ALLGATHER(num_sticks, 1, MPI_INTEGER, dffts%tg_nsw(1), 1, MPI_INTEGER, intra_pool_comm, IERR)
|
||||
CALL MPI_ALLGATHER(num_planes, 1, MPI_INTEGER, dffts%tg_npp(1), 1, MPI_INTEGER, intra_pool_comm, IERR)
|
||||
#else
|
||||
dffts%tg_nsw(1) = num_sticks
|
||||
dffts%tg_npp(1) = num_planes
|
||||
#endif
|
||||
|
||||
ALLOCATE( dffts%tg_snd( nogrp ) )
|
||||
ALLOCATE( dffts%tg_rcv( nogrp ) )
|
||||
ALLOCATE( dffts%tg_psdsp( nogrp ) )
|
||||
ALLOCATE( dffts%tg_usdsp( nogrp ) )
|
||||
ALLOCATE( dffts%tg_rdsp( nogrp ) )
|
||||
|
||||
dffts%tg_snd(1) = dffts%nr3x * dffts%nsw( me_pool + 1 )
|
||||
IF( dffts%nr3x * dffts%nsw( me_pool + 1 ) > dffts%tg_nnr ) THEN
|
||||
CALL errore( ' task_groups_init ', ' inconsistent dffts%tg_nnr ', 1 )
|
||||
ENDIF
|
||||
dffts%tg_psdsp(1) = 0
|
||||
dffts%tg_usdsp(1) = 0
|
||||
dffts%tg_rcv(1) = dffts%nr3x * dffts%nsw( nolist(1) + 1 )
|
||||
dffts%tg_rdsp(1) = 0
|
||||
DO i = 2, nogrp
|
||||
dffts%tg_snd(i) = dffts%nr3x * dffts%nsw( me_pool + 1 )
|
||||
dffts%tg_psdsp(i) = dffts%tg_psdsp(i-1) + dffts%tg_nnr
|
||||
dffts%tg_usdsp(i) = dffts%tg_usdsp(i-1) + dffts%tg_snd(i-1)
|
||||
dffts%tg_rcv(i) = dffts%nr3x * dffts%nsw( nolist(i) + 1 )
|
||||
dffts%tg_rdsp(i) = dffts%tg_rdsp(i-1) + dffts%tg_rcv(i-1)
|
||||
ENDDO
|
||||
|
||||
dffts%have_task_groups = .true.
|
||||
|
||||
RETURN
|
||||
|
||||
END SUBROUTINE task_groups_init
|
||||
|
||||
|
||||
!=----------------------------------------------------------------------=
|
||||
END MODULE stick_set
|
||||
!=----------------------------------------------------------------------=
|
||||
|
|
|
@ -1,222 +0,0 @@
|
|||
!
|
||||
! Copyright (C) 2002-2004 PWSCF-FPMD-CP90 group
|
||||
! This file is distributed under the terms of the
|
||||
! GNU General Public License. See the file `License'
|
||||
! in the root directory of the present distribution,
|
||||
! or http://www.gnu.org/copyleft/gpl.txt .
|
||||
!
|
||||
!-----------------------------------------
|
||||
! Contributed by C. Bekas, October 2005
|
||||
! Revised by C. Cavazzoni
|
||||
!--------------------------------------------
|
||||
|
||||
MODULE task_groups
|
||||
|
||||
USE kinds, ONLY: DP
|
||||
|
||||
IMPLICIT NONE
|
||||
SAVE
|
||||
|
||||
|
||||
CONTAINS
|
||||
|
||||
|
||||
!========================================================================================
|
||||
! ADDED SUBROUTINEs FOR TASK GROUP PARALLIZATION
|
||||
! C. Bekas, IBM Research, Zurich
|
||||
! - GROUPS: Define and initialize Task Groups
|
||||
! - tg_ivfftw: Inverse FFT driver for Task Groups
|
||||
!=======================================================================================
|
||||
|
||||
|
||||
!-----------------------------------------------------------------------
|
||||
! SUBROUTINE GROUPS (added by C. Bekas)
|
||||
! Define groups for task group parallilization
|
||||
!-----------------------------------------------------------------------
|
||||
|
||||
SUBROUTINE task_groups_init( dffts )
|
||||
|
||||
USE parallel_include
|
||||
!
|
||||
USE mp_global, ONLY : me_pool, nproc_pool, intra_pool_comm
|
||||
USE mp_global, ONLY : NOGRP, NPGRP, ogrp_comm, pgrp_comm
|
||||
USE mp_global, ONLY : nolist, nplist
|
||||
USE io_global, ONLY : stdout
|
||||
USE fft_types, ONLY : fft_dlay_descriptor
|
||||
|
||||
! T.G.
|
||||
! NPGRP: Number of processors per group
|
||||
! NOGRP: Number of processors per orbital task group
|
||||
|
||||
IMPLICIT NONE
|
||||
|
||||
TYPE(fft_dlay_descriptor), INTENT(inout) :: dffts
|
||||
|
||||
!----------------------------------
|
||||
!Local Variables declaration
|
||||
!----------------------------------
|
||||
|
||||
INTEGER :: I
|
||||
INTEGER :: IERR
|
||||
INTEGER :: num_planes, num_sticks
|
||||
INTEGER :: nnrsx_vec ( nproc_pool )
|
||||
INTEGER :: pgroup( nproc_pool )
|
||||
INTEGER :: strd
|
||||
|
||||
!
|
||||
IF ( nogrp > 1 ) WRITE( stdout, 100 ) nogrp, npgrp
|
||||
|
||||
100 FORMAT( /,3X,'Task Groups are in USE',/,3X,'groups and procs/group : ',I5,I5 )
|
||||
|
||||
!Find maximum chunk of local data concerning coefficients of eigenfunctions in g-space
|
||||
|
||||
#if defined __MPI
|
||||
CALL MPI_Allgather( dffts%nnr, 1, MPI_INTEGER, nnrsx_vec, 1, MPI_INTEGER, intra_pool_comm, IERR)
|
||||
strd = maxval( nnrsx_vec( 1:nproc_pool ) )
|
||||
#else
|
||||
strd = dffts%nnr
|
||||
#endif
|
||||
|
||||
IF( strd /= dffts%tg_nnr ) CALL errore( ' task_groups_init ', ' inconsistent nnr ', 1 )
|
||||
|
||||
!-------------------------------------------------------------------------------------
|
||||
!C. Bekas...TASK GROUP RELATED. FFT DATA STRUCTURES ARE ALREADY DEFINED ABOVE
|
||||
!-------------------------------------------------------------------------------------
|
||||
!dfft%nsw(me) holds the number of z-sticks for the current processor per wave-function
|
||||
!We can either send these in the group with an mpi_allgather...or put the
|
||||
!in the PSIS vector (in special positions) and send them with them.
|
||||
!Otherwise we can do this once at the beginning, before the loop.
|
||||
!we choose to do the latter one.
|
||||
!-------------------------------------------------------------------------------------
|
||||
!
|
||||
ALLOCATE( dffts%tg_nsw(nproc_pool))
|
||||
ALLOCATE( dffts%tg_npp(nproc_pool))
|
||||
|
||||
num_sticks = 0
|
||||
num_planes = 0
|
||||
DO i = 1, nogrp
|
||||
num_sticks = num_sticks + dffts%nsw( nolist(i) + 1 )
|
||||
num_planes = num_planes + dffts%npp( nolist(i) + 1 )
|
||||
ENDDO
|
||||
|
||||
#if defined __MPI
|
||||
CALL MPI_ALLGATHER(num_sticks, 1, MPI_INTEGER, dffts%tg_nsw(1), 1, MPI_INTEGER, intra_pool_comm, IERR)
|
||||
CALL MPI_ALLGATHER(num_planes, 1, MPI_INTEGER, dffts%tg_npp(1), 1, MPI_INTEGER, intra_pool_comm, IERR)
|
||||
#else
|
||||
dffts%tg_nsw(1) = num_sticks
|
||||
dffts%tg_npp(1) = num_planes
|
||||
#endif
|
||||
|
||||
ALLOCATE( dffts%tg_snd( nogrp ) )
|
||||
ALLOCATE( dffts%tg_rcv( nogrp ) )
|
||||
ALLOCATE( dffts%tg_psdsp( nogrp ) )
|
||||
ALLOCATE( dffts%tg_usdsp( nogrp ) )
|
||||
ALLOCATE( dffts%tg_rdsp( nogrp ) )
|
||||
|
||||
dffts%tg_snd(1) = dffts%nr3x * dffts%nsw( me_pool + 1 )
|
||||
IF( dffts%nr3x * dffts%nsw( me_pool + 1 ) > dffts%tg_nnr ) THEN
|
||||
CALL errore( ' task_groups_init ', ' inconsistent dffts%tg_nnr ', 1 )
|
||||
ENDIF
|
||||
dffts%tg_psdsp(1) = 0
|
||||
dffts%tg_usdsp(1) = 0
|
||||
dffts%tg_rcv(1) = dffts%nr3x * dffts%nsw( nolist(1) + 1 )
|
||||
dffts%tg_rdsp(1) = 0
|
||||
DO i = 2, nogrp
|
||||
dffts%tg_snd(i) = dffts%nr3x * dffts%nsw( me_pool + 1 )
|
||||
dffts%tg_psdsp(i) = dffts%tg_psdsp(i-1) + dffts%tg_nnr
|
||||
dffts%tg_usdsp(i) = dffts%tg_usdsp(i-1) + dffts%tg_snd(i-1)
|
||||
dffts%tg_rcv(i) = dffts%nr3x * dffts%nsw( nolist(i) + 1 )
|
||||
dffts%tg_rdsp(i) = dffts%tg_rdsp(i-1) + dffts%tg_rcv(i-1)
|
||||
ENDDO
|
||||
|
||||
! ALLOCATE( dffts%tg_sca_snd( nproc_pool / nogrp ) )
|
||||
! ALLOCATE( dffts%tg_sca_rcv( nproc_pool / nogrp ) )
|
||||
! ALLOCATE( dffts%tg_sca_sdsp( nproc_pool / nogrp ) )
|
||||
! ALLOCATE( dffts%tg_sca_rdsp( nproc_pool / nogrp ) )
|
||||
! ALLOCATE( dffts%tg_sca_off( nproc_pool / nogrp ) )
|
||||
|
||||
! do i = 1, nproc_pool / nogrp
|
||||
! dffts%tg_sca_snd (i) = dffts%tg_npp ( nplist( i ) + 1 ) * dffts%tg_nsw ( me_pool + 1 )
|
||||
! dffts%tg_sca_rcv (i) = dffts%tg_npp ( me_pool + 1 ) * dffts%tg_nsw ( nplist( i ) + 1 )
|
||||
! end do
|
||||
! dffts%tg_sca_off(1) = 0
|
||||
! do i = 2, nproc_pool / nogrp
|
||||
! dffts%tg_sca_off(i) = dffts%tg_sca_off(i - 1) + dffts%tg_npp ( nplist( i - 1 ) + 1 )
|
||||
! end do
|
||||
! dffts%tg_sca_sdsp (1) = 0
|
||||
! dffts%tg_sca_rdsp (1) = 0
|
||||
! do i = 2, nproc_pool / nogrp
|
||||
! dffts%tg_sca_sdsp (i) = dffts%tg_sca_sdsp (i - 1) + dffts%tg_sca_snd (i - 1)
|
||||
! dffts%tg_sca_rdsp (i) = dffts%tg_sca_rdsp (i - 1) + dffts%tg_sca_rcv (i - 1)
|
||||
! enddo
|
||||
|
||||
dffts%have_task_groups = .true.
|
||||
|
||||
RETURN
|
||||
|
||||
END SUBROUTINE task_groups_init
|
||||
|
||||
!
|
||||
|
||||
SUBROUTINE tg_gather( dffts, v, tg_v )
|
||||
!
|
||||
USE parallel_include
|
||||
!
|
||||
USE mp_global, ONLY : me_pool, nogrp, ogrp_comm, nolist
|
||||
USE fft_types, ONLY : fft_dlay_descriptor
|
||||
|
||||
! T.G.
|
||||
! NOGRP: Number of processors per orbital task group
|
||||
|
||||
IMPLICIT NONE
|
||||
|
||||
TYPE(fft_dlay_descriptor), INTENT(in) :: dffts
|
||||
|
||||
REAL(DP) :: v(:)
|
||||
REAL(DP) :: tg_v(:)
|
||||
|
||||
INTEGER :: nsiz, i, ierr, nsiz_tg
|
||||
INTEGER :: recv_cnt( nogrp ), recv_displ( nogrp )
|
||||
|
||||
nsiz_tg = dffts%tg_nnr * nogrp
|
||||
|
||||
IF( size( tg_v ) < nsiz_tg ) &
|
||||
CALL errore( ' tg_gather ', ' tg_v too small ', ( nsiz_tg - size( tg_v ) ) )
|
||||
|
||||
nsiz = dffts%npp( me_pool+1 ) * dffts%nr1x * dffts%nr2x
|
||||
|
||||
IF( size( v ) < nsiz ) &
|
||||
CALL errore( ' tg_gather ', ' v too small ', ( nsiz - size( v ) ) )
|
||||
|
||||
!
|
||||
! The potential in v is distributed accros all processors
|
||||
! We need to redistribute it so that it is completely contained in the
|
||||
! processors of an orbital TASK-GROUP
|
||||
!
|
||||
recv_cnt(1) = dffts%npp( nolist(1) + 1 ) * dffts%nr1x * dffts%nr2x
|
||||
recv_displ(1) = 0
|
||||
DO i = 2, nogrp
|
||||
recv_cnt(i) = dffts%npp( nolist(i) + 1 ) * dffts%nr1x * dffts%nr2x
|
||||
recv_displ(i) = recv_displ(i-1) + recv_cnt(i-1)
|
||||
ENDDO
|
||||
|
||||
! clean only elements that will not be overwritten
|
||||
!
|
||||
DO i = recv_displ(nogrp) + recv_cnt( nogrp ) + 1, size( tg_v )
|
||||
tg_v( i ) = 0.0d0
|
||||
ENDDO
|
||||
|
||||
#if defined (__PARA) && defined (__MPI)
|
||||
|
||||
CALL MPI_Allgatherv( v(1), nsiz, MPI_DOUBLE_PRECISION, &
|
||||
tg_v(1), recv_cnt, recv_displ, MPI_DOUBLE_PRECISION, ogrp_comm, IERR)
|
||||
|
||||
IF( ierr /= 0 ) &
|
||||
CALL errore( ' tg_gather ', ' MPI_Allgatherv ', abs( ierr ) )
|
||||
|
||||
#endif
|
||||
|
||||
END SUBROUTINE
|
||||
|
||||
|
||||
END MODULE task_groups
|
Loading…
Reference in New Issue