- eliminated module task_groups (subroutines moved to the fft modules)

- added variables to the fft type to store info about data distribution


git-svn-id: http://qeforge.qe-forge.org/svn/q-e/trunk/espresso@7433 c92efa57-630b-4861-b058-cf58834340f0
ccavazzoni 2011-01-22 18:57:51 +00:00
parent aa3843a34a
commit 8fb1ad0998
6 changed files with 245 additions and 243 deletions

View File

@@ -70,7 +70,6 @@ sic.o \
splinelib.o \
stick_base.o \
stick_set.o \
task_groups.o \
timestep.o \
version.o \
upf.o \

View File

@@ -39,6 +39,7 @@
PUBLIC :: dfftp, dffts, dfftb, fft_dlay_descriptor
PUBLIC :: cgather_sym, cgather_smooth, cscatter_sym, cscatter_smooth
PUBLIC :: gather_smooth, scatter_smooth
PUBLIC :: tg_gather
@@ -1164,6 +1165,67 @@ SUBROUTINE scatter_smooth( f_in, f_out )
!
END SUBROUTINE scatter_smooth
!
SUBROUTINE tg_gather( dffts, v, tg_v )
!
USE parallel_include
!
USE fft_types, ONLY : fft_dlay_descriptor
! T.G.
! NOGRP: Number of processors per orbital task group
IMPLICIT NONE
TYPE(fft_dlay_descriptor), INTENT(in) :: dffts
REAL(DP) :: v(:)
REAL(DP) :: tg_v(:)
INTEGER :: nsiz, i, ierr, nsiz_tg
INTEGER :: recv_cnt( dffts%nogrp ), recv_displ( dffts%nogrp )
nsiz_tg = dffts%tg_nnr * dffts%nogrp
IF( size( tg_v ) < nsiz_tg ) &
CALL errore( ' tg_gather ', ' tg_v too small ', ( nsiz_tg - size( tg_v ) ) )
nsiz = dffts%npp( dffts%myid+1 ) * dffts%nr1x * dffts%nr2x
IF( size( v ) < nsiz ) &
CALL errore( ' tg_gather ', ' v too small ', ( nsiz - size( v ) ) )
!
! The potential in v is distributed across all processors
! We need to redistribute it so that it is completely contained in the
! processors of an orbital TASK-GROUP
!
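! As an illustration (values are hypothetical): with nogrp = 2 and
! local plane counts npp = (/ 3, 4 /) on the two group members, the
! loop below produces
!    recv_cnt   = (/ 3, 4 /) * nr1x * nr2x
!    recv_displ = (/ 0, 3 * nr1x * nr2x /)
!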
recv_cnt(1) = dffts%npp( dffts%nolist(1) + 1 ) * dffts%nr1x * dffts%nr2x
recv_displ(1) = 0
DO i = 2, dffts%nogrp
recv_cnt(i) = dffts%npp( dffts%nolist(i) + 1 ) * dffts%nr1x * dffts%nr2x
recv_displ(i) = recv_displ(i-1) + recv_cnt(i-1)
ENDDO
! clean only elements that will not be overwritten
!
DO i = recv_displ(dffts%nogrp) + recv_cnt( dffts%nogrp ) + 1, size( tg_v )
tg_v( i ) = 0.0d0
ENDDO
#if defined (__PARA) && defined (__MPI)
CALL MPI_Allgatherv( v(1), nsiz, MPI_DOUBLE_PRECISION, &
tg_v(1), recv_cnt, recv_displ, MPI_DOUBLE_PRECISION, dffts%ogrp_comm, IERR)
IF( ierr /= 0 ) &
CALL errore( ' tg_gather ', ' MPI_Allgatherv ', abs( ierr ) )
#endif
END SUBROUTINE tg_gather
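!
! Usage sketch (illustrative only; "vloc" and "tg_vloc" are hypothetical
! names): once the descriptor carries the task group data, a potential
! distributed over all processors is expanded onto an orbital group by
!
!    ALLOCATE( tg_vloc( dffts%tg_nnr * dffts%nogrp ) )
!    CALL tg_gather( dffts, vloc, tg_vloc )
!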
!=----------------------------------------------------------------------=!
END MODULE fft_base
!=----------------------------------------------------------------------=!

View File

@@ -52,8 +52,20 @@ MODULE fft_types
INTEGER, POINTER :: imax3(:) ! the last local plane
INTEGER, POINTER :: np3(:) ! number of local plane for the box fft
!
! fft parallelization
!
INTEGER :: myid ! my processor id
INTEGER :: comm ! communicator of the fft group
INTEGER :: nproc ! number of processors in the fft group
!
! task groups
!
INTEGER :: NOGRP
INTEGER :: NPGRP
INTEGER :: ogrp_comm
INTEGER :: pgrp_comm
INTEGER, POINTER :: nolist(:) ! list of processors in the orbital task group
!
LOGICAL :: have_task_groups
INTEGER :: tg_nnr ! maximum among nnr
INTEGER, POINTER :: tg_nsw(:) ! number of sticks per task group ( wave func )
@@ -72,9 +84,9 @@ MODULE fft_types
CONTAINS
SUBROUTINE fft_dlay_allocate( desc, nproc, nx, ny )
SUBROUTINE fft_dlay_allocate( desc, myid, nproc, comm, nx, ny )
TYPE (fft_dlay_descriptor) :: desc
INTEGER, INTENT(in) :: nproc, nx, ny
INTEGER, INTENT(in) :: myid, nproc, comm, nx, ny ! myid starting from 0
ALLOCATE( desc%nsp( nproc ) )
ALLOCATE( desc%nsw( nproc ) )
ALLOCATE( desc%ngl( nproc ) )
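! With the new interface a typical call (cf. the stick_set diff below) reads:
!    CALL fft_dlay_allocate( dfftp, me_pool, nproc_pool, intra_pool_comm, nr1x, nr2x )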
@@ -101,7 +113,15 @@ CONTAINS
desc%id = 0
desc%myid = myid
desc%comm = comm
desc%nproc = nproc
desc%have_task_groups = .false.
desc%NOGRP = 0
desc%NPGRP = 0
desc%ogrp_comm = 0
desc%pgrp_comm = 0
NULLIFY( desc%nolist )
NULLIFY( desc%tg_nsw )
NULLIFY( desc%tg_npp )
NULLIFY( desc%tg_snd )
@@ -128,6 +148,7 @@ CONTAINS
IF ( associated( desc%iplw ) ) DEALLOCATE( desc%iplw )
desc%id = 0
IF( desc%have_task_groups ) THEN
IF ( associated( desc%nolist ) ) DEALLOCATE( desc%nolist )
IF ( associated( desc%tg_nsw ) ) DEALLOCATE( desc%tg_nsw )
IF ( associated( desc%tg_npp ) ) DEALLOCATE( desc%tg_npp )
IF ( associated( desc%tg_snd ) ) DEALLOCATE( desc%tg_snd )
@@ -141,9 +162,9 @@ CONTAINS
!=----------------------------------------------------------------------------=!
SUBROUTINE fft_box_allocate( desc, nproc, nat )
SUBROUTINE fft_box_allocate( desc, myid, nproc, comm, nat )
TYPE (fft_dlay_descriptor) :: desc
INTEGER, INTENT(in) :: nat, nproc
INTEGER, INTENT(in) :: nat, nproc, myid, comm ! myid starting from 0
ALLOCATE( desc%irb( 3, nat ) )
ALLOCATE( desc%imin3( nat ) )
ALLOCATE( desc%imax3( nat ) )
@@ -156,6 +177,9 @@ CONTAINS
desc%npp = 0
desc%ipp = 0
desc%np3 = 0
desc%myid = myid
desc%nproc = nproc
desc%comm = comm
desc%have_task_groups = .false.
END SUBROUTINE fft_box_allocate
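! Illustrative call with the new interface (the actual call sites are
! outside this diff; arguments mirror fft_dlay_allocate):
!    CALL fft_box_allocate( dfftb, me_pool, nproc_pool, intra_pool_comm, nat )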
@@ -174,7 +198,7 @@ CONTAINS
!=----------------------------------------------------------------------------=!
SUBROUTINE fft_dlay_set( desc, tk, nst, nr1, nr2, nr3, nr1x, nr2x, nr3x, me, &
nproc, nogrp, ub, lb, idx, in1, in2, ncp, ncpw, ngp, ngpw, st, stw )
nproc, comm, nogrp, ub, lb, idx, in1, in2, ncp, ncpw, ngp, ngpw, st, stw )
TYPE (fft_dlay_descriptor) :: desc
@@ -184,6 +208,7 @@ CONTAINS
INTEGER, INTENT(in) :: nr1x, nr2x, nr3x ! padded size of real space grid
INTEGER, INTENT(in) :: me ! processor index (starting from 1)
INTEGER, INTENT(in) :: nproc ! number of processors
INTEGER, INTENT(in) :: comm ! communicator
INTEGER, INTENT(in) :: nogrp ! number of processors in task-group
INTEGER, INTENT(in) :: ub(3), lb(3) ! upper and lower bound of real space indices
INTEGER, INTENT(in) :: idx(:)
@@ -216,6 +241,15 @@ CONTAINS
IF( ( size( ncp ) < nproc ) .or. ( size( ngp ) < nproc ) ) &
CALL errore( ' fft_dlay_set ', ' wrong stick dimensions ', 4 )
IF( desc%nproc /= nproc ) &
CALL errore( ' fft_dlay_set ', ' wrong number of processors ', 4 )
IF( desc%myid /= (me - 1) ) &
CALL errore( ' fft_dlay_set ', ' wrong processor index ', 4 )
IF( desc%comm /= comm ) &
CALL errore( ' fft_dlay_set ', ' wrong communicator ', 4 )
desc%have_task_groups = .false.
! Set the number of "xy" planes for each processor
@@ -430,13 +464,13 @@ CONTAINS
!=----------------------------------------------------------------------------=!
SUBROUTINE fft_box_set( desc, nr1b, nr2b, nr3b, nr1bx, nr2bx, nr3bx, nat, &
irb, me, nproc, npp, ipp )
irb, me, nproc, comm, npp, ipp )
IMPLICIT NONE
TYPE (fft_dlay_descriptor) :: desc
INTEGER, INTENT(in) :: nat, me, nproc
INTEGER, INTENT(in) :: nat, me, nproc, comm
INTEGER, INTENT(in) :: irb( :, : )
INTEGER, INTENT(in) :: npp( : )
INTEGER, INTENT(in) :: ipp( : )
@@ -452,6 +486,16 @@ CONTAINS
IF( nproc > size( desc%npp ) ) &
CALL errore(" fft_box_set ", " inconsistent dimensions ", 2 )
IF( desc%nproc /= nproc ) &
CALL errore( ' fft_box_set ', ' wrong number of processors ', 4 )
IF( desc%myid /= (me - 1) ) &
CALL errore( ' fft_box_set ', ' wrong processor index ', 4 )
IF( desc%comm /= comm ) &
CALL errore( ' fft_box_set ', ' wrong communicator ', 4 )
desc%nr1 = nr1b
desc%nr2 = nr2b
desc%nr3 = nr3b

View File

@@ -215,18 +215,13 @@ splinelib.o : kind.o
stick_base.o : io_global.o
stick_base.o : kind.o
stick_base.o : mp.o
stick_base.o : mp_global.o
stick_set.o : fft_types.o
stick_set.o : griddim.o
stick_set.o : io_global.o
stick_set.o : kind.o
stick_set.o : mp_global.o
stick_set.o : parallel_include.o
stick_set.o : stick_base.o
task_groups.o : fft_types.o
task_groups.o : io_global.o
task_groups.o : kind.o
task_groups.o : mp_global.o
task_groups.o : parallel_include.o
timestep.o : kind.o
upf.o : ../iotk/src/iotk_module.o
upf.o : kind.o

View File

@@ -22,7 +22,10 @@
USE io_global, ONLY: ionode, stdout
USE fft_types, ONLY: fft_dlay_descriptor, fft_dlay_allocate, &
fft_dlay_set, fft_dlay_scalar
USE mp_global, ONLY: me_pool, nproc_pool, intra_pool_comm, nogrp
USE mp_global, ONLY: me_pool, nproc_pool, intra_pool_comm, nogrp, use_task_groups
USE mp_global, ONLY : me_pool, nproc_pool, intra_pool_comm
USE mp_global, ONLY : NOGRP, NPGRP, ogrp_comm, pgrp_comm
USE mp_global, ONLY : nolist
PRIVATE
SAVE
@@ -196,13 +199,13 @@
#if defined __PARA
CALL fft_dlay_allocate( dfftp, nproc_pool, nr1x, nr2x )
CALL fft_dlay_allocate( dffts, nproc_pool, nr1sx, nr2sx )
CALL fft_dlay_allocate( dfftp, me_pool, nproc_pool, intra_pool_comm, nr1x, nr2x )
CALL fft_dlay_allocate( dffts, me_pool, nproc_pool, intra_pool_comm, nr1sx, nr2sx )
CALL fft_dlay_set( dfftp, tk, nst, nr1, nr2, nr3, nr1x, nr2x, nr3x, (me_pool+1), &
nproc_pool, nogrp, ub, lb, idx, ist(:,1), ist(:,2), nstp, nstpw, sstp, sstpw, st, stw )
nproc_pool, intra_pool_comm, nogrp, ub, lb, idx, ist(:,1), ist(:,2), nstp, nstpw, sstp, sstpw, st, stw )
CALL fft_dlay_set( dffts, tk, nsts, nr1s, nr2s, nr3s, nr1sx, nr2sx, nr3sx, (me_pool+1), &
nproc_pool, nogrp, ub, lb, idx, ist(:,1), ist(:,2), nstps, nstpw, sstps, sstpw, sts, stw )
nproc_pool, intra_pool_comm, nogrp, ub, lb, idx, ist(:,1), ist(:,2), nstps, nstpw, sstps, sstpw, sts, stw )
#else
@@ -215,8 +218,8 @@
IF( ngm_ /= ngm ) CALL errore( ' pstickset ', ' inconsistent ngm ', abs( ngm - ngm_ ) )
IF( ngs_ /= ngs ) CALL errore( ' pstickset ', ' inconsistent ngs ', abs( ngs - ngs_ ) )
CALL fft_dlay_allocate( dfftp, nproc_pool, max(nr1x, nr3x), nr2x )
CALL fft_dlay_allocate( dffts, nproc_pool, max(nr1sx, nr3sx), nr2sx )
CALL fft_dlay_allocate( dfftp, me_pool, nproc_pool, intra_pool_comm, max(nr1x, nr3x), nr2x )
CALL fft_dlay_allocate( dffts, me_pool, nproc_pool, intra_pool_comm, max(nr1sx, nr3sx), nr2sx )
CALL fft_dlay_scalar( dfftp, ub, lb, nr1, nr2, nr3, nr1x, nr2x, nr3x, stw )
CALL fft_dlay_scalar( dffts, ub, lb, nr1s, nr2s, nr3s, nr1sx, nr2sx, nr3sx, stw )
@@ -231,6 +234,15 @@
! ... Maximum number of sticks (wave func.)
nstpwx = maxval( nstpw )
IF( use_task_groups ) THEN
!
! Initialize task groups.
! Note that this call modifies dffts, adding task group data.
!
CALL task_groups_init( dffts )
!
END IF
IF (ionode) WRITE( stdout,118)
118 FORMAT(3X,' n.st n.stw n.sts n.g n.gw n.gs')
WRITE( stdout,121) minval(nstp), minval(nstpw), minval(nstps), minval(sstp), minval(sstpw), minval(sstps)
@@ -264,6 +276,118 @@
RETURN
END SUBROUTINE pstickset
!-----------------------------------------
! Task groups contributed by C. Bekas, October 2005
! Revised by C. Cavazzoni
!--------------------------------------------
SUBROUTINE task_groups_init( dffts )
USE parallel_include
!
USE io_global, ONLY : stdout
USE fft_types, ONLY : fft_dlay_descriptor
! T.G.
! NPGRP: Number of processors per group
! NOGRP: Number of processors per orbital task group
IMPLICIT NONE
TYPE(fft_dlay_descriptor), INTENT(inout) :: dffts
!----------------------------------
!Local Variables declaration
!----------------------------------
INTEGER :: I
INTEGER :: IERR
INTEGER :: num_planes, num_sticks
INTEGER :: nnrsx_vec ( nproc_pool )
INTEGER :: pgroup( nproc_pool )
INTEGER :: strd
!
IF ( nogrp > 1 ) WRITE( stdout, 100 ) nogrp, npgrp
100 FORMAT( /,3X,'Task Groups are in USE',/,3X,'groups and procs/group : ',I5,I5 )
!Find maximum chunk of local data concerning coefficients of eigenfunctions in g-space
#if defined __MPI
CALL MPI_Allgather( dffts%nnr, 1, MPI_INTEGER, nnrsx_vec, 1, MPI_INTEGER, intra_pool_comm, IERR)
strd = maxval( nnrsx_vec( 1:nproc_pool ) )
#else
strd = dffts%nnr
#endif
IF( strd /= dffts%tg_nnr ) CALL errore( ' task_groups_init ', ' inconsistent nnr ', 1 )
!-------------------------------------------------------------------------------------
!C. Bekas...TASK GROUP RELATED. FFT DATA STRUCTURES ARE ALREADY DEFINED ABOVE
!-------------------------------------------------------------------------------------
!dfft%nsw(me) holds the number of z-sticks for the current processor per wave-function
!We can either send these within the group with an mpi_allgather...or put them
!in the PSIS vector (in special positions) and send them along with it.
!Alternatively we can do this once at the beginning, before the loop;
!we choose the latter.
!-------------------------------------------------------------------------------------
!
dffts%nogrp = nogrp
dffts%npgrp = npgrp
dffts%ogrp_comm = ogrp_comm
dffts%pgrp_comm = pgrp_comm
!
ALLOCATE( dffts%tg_nsw(nproc_pool))
ALLOCATE( dffts%tg_npp(nproc_pool))
ALLOCATE( dffts%nolist(nogrp))
num_sticks = 0
num_planes = 0
DO i = 1, nogrp
dffts%nolist( i ) = nolist( i )
num_sticks = num_sticks + dffts%nsw( nolist(i) + 1 )
num_planes = num_planes + dffts%npp( nolist(i) + 1 )
ENDDO
#if defined __MPI
CALL MPI_ALLGATHER(num_sticks, 1, MPI_INTEGER, dffts%tg_nsw(1), 1, MPI_INTEGER, intra_pool_comm, IERR)
CALL MPI_ALLGATHER(num_planes, 1, MPI_INTEGER, dffts%tg_npp(1), 1, MPI_INTEGER, intra_pool_comm, IERR)
#else
dffts%tg_nsw(1) = num_sticks
dffts%tg_npp(1) = num_planes
#endif
ALLOCATE( dffts%tg_snd( nogrp ) )
ALLOCATE( dffts%tg_rcv( nogrp ) )
ALLOCATE( dffts%tg_psdsp( nogrp ) )
ALLOCATE( dffts%tg_usdsp( nogrp ) )
ALLOCATE( dffts%tg_rdsp( nogrp ) )
dffts%tg_snd(1) = dffts%nr3x * dffts%nsw( me_pool + 1 )
IF( dffts%nr3x * dffts%nsw( me_pool + 1 ) > dffts%tg_nnr ) THEN
CALL errore( ' task_groups_init ', ' inconsistent dffts%tg_nnr ', 1 )
ENDIF
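!
! Layout of the exchange arrays filled here and in the loop below:
!   tg_snd(i)   : elements sent to group member i (the same for every i)
!   tg_rcv(i)   : elements received from group member i
!   tg_psdsp(i) : send displacements in the padded, tg_nnr-strided buffer
!   tg_usdsp(i) : send displacements in the unpadded buffer
!   tg_rdsp(i)  : receive displacements
!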
dffts%tg_psdsp(1) = 0
dffts%tg_usdsp(1) = 0
dffts%tg_rcv(1) = dffts%nr3x * dffts%nsw( nolist(1) + 1 )
dffts%tg_rdsp(1) = 0
DO i = 2, nogrp
dffts%tg_snd(i) = dffts%nr3x * dffts%nsw( me_pool + 1 )
dffts%tg_psdsp(i) = dffts%tg_psdsp(i-1) + dffts%tg_nnr
dffts%tg_usdsp(i) = dffts%tg_usdsp(i-1) + dffts%tg_snd(i-1)
dffts%tg_rcv(i) = dffts%nr3x * dffts%nsw( nolist(i) + 1 )
dffts%tg_rdsp(i) = dffts%tg_rdsp(i-1) + dffts%tg_rcv(i-1)
ENDDO
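! For example (hypothetical values): with nogrp = 2, nr3x = 10,
! nsw = (/ 3, 4 /) and tg_nnr = 40, the first group member gets
!    tg_snd   = (/ 30, 30 /), tg_rcv   = (/ 30, 40 /),
!    tg_psdsp = (/ 0, 40 /),  tg_usdsp = (/ 0, 30 /), tg_rdsp = (/ 0, 30 /)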
dffts%have_task_groups = .true.
RETURN
END SUBROUTINE task_groups_init
!=----------------------------------------------------------------------=
END MODULE stick_set
!=----------------------------------------------------------------------=

View File

@@ -1,222 +0,0 @@
!
! Copyright (C) 2002-2004 PWSCF-FPMD-CP90 group
! This file is distributed under the terms of the
! GNU General Public License. See the file `License'
! in the root directory of the present distribution,
! or http://www.gnu.org/copyleft/gpl.txt .
!
!-----------------------------------------
! Contributed by C. Bekas, October 2005
! Revised by C. Cavazzoni
!--------------------------------------------
MODULE task_groups
USE kinds, ONLY: DP
IMPLICIT NONE
SAVE
CONTAINS
!========================================================================================
! ADDED SUBROUTINES FOR TASK GROUP PARALLELIZATION
! C. Bekas, IBM Research, Zurich
! - GROUPS: Define and initialize Task Groups
! - tg_ivfftw: Inverse FFT driver for Task Groups
!=======================================================================================
!-----------------------------------------------------------------------
! SUBROUTINE GROUPS (added by C. Bekas)
! Define groups for task group parallelization
!-----------------------------------------------------------------------
SUBROUTINE task_groups_init( dffts )
USE parallel_include
!
USE mp_global, ONLY : me_pool, nproc_pool, intra_pool_comm
USE mp_global, ONLY : NOGRP, NPGRP, ogrp_comm, pgrp_comm
USE mp_global, ONLY : nolist, nplist
USE io_global, ONLY : stdout
USE fft_types, ONLY : fft_dlay_descriptor
! T.G.
! NPGRP: Number of processors per group
! NOGRP: Number of processors per orbital task group
IMPLICIT NONE
TYPE(fft_dlay_descriptor), INTENT(inout) :: dffts
!----------------------------------
!Local Variables declaration
!----------------------------------
INTEGER :: I
INTEGER :: IERR
INTEGER :: num_planes, num_sticks
INTEGER :: nnrsx_vec ( nproc_pool )
INTEGER :: pgroup( nproc_pool )
INTEGER :: strd
!
IF ( nogrp > 1 ) WRITE( stdout, 100 ) nogrp, npgrp
100 FORMAT( /,3X,'Task Groups are in USE',/,3X,'groups and procs/group : ',I5,I5 )
!Find maximum chunk of local data concerning coefficients of eigenfunctions in g-space
#if defined __MPI
CALL MPI_Allgather( dffts%nnr, 1, MPI_INTEGER, nnrsx_vec, 1, MPI_INTEGER, intra_pool_comm, IERR)
strd = maxval( nnrsx_vec( 1:nproc_pool ) )
#else
strd = dffts%nnr
#endif
IF( strd /= dffts%tg_nnr ) CALL errore( ' task_groups_init ', ' inconsistent nnr ', 1 )
!-------------------------------------------------------------------------------------
!C. Bekas...TASK GROUP RELATED. FFT DATA STRUCTURES ARE ALREADY DEFINED ABOVE
!-------------------------------------------------------------------------------------
!dfft%nsw(me) holds the number of z-sticks for the current processor per wave-function
!We can either send these within the group with an mpi_allgather...or put them
!in the PSIS vector (in special positions) and send them along with it.
!Alternatively we can do this once at the beginning, before the loop;
!we choose the latter.
!-------------------------------------------------------------------------------------
!
ALLOCATE( dffts%tg_nsw(nproc_pool))
ALLOCATE( dffts%tg_npp(nproc_pool))
num_sticks = 0
num_planes = 0
DO i = 1, nogrp
num_sticks = num_sticks + dffts%nsw( nolist(i) + 1 )
num_planes = num_planes + dffts%npp( nolist(i) + 1 )
ENDDO
#if defined __MPI
CALL MPI_ALLGATHER(num_sticks, 1, MPI_INTEGER, dffts%tg_nsw(1), 1, MPI_INTEGER, intra_pool_comm, IERR)
CALL MPI_ALLGATHER(num_planes, 1, MPI_INTEGER, dffts%tg_npp(1), 1, MPI_INTEGER, intra_pool_comm, IERR)
#else
dffts%tg_nsw(1) = num_sticks
dffts%tg_npp(1) = num_planes
#endif
ALLOCATE( dffts%tg_snd( nogrp ) )
ALLOCATE( dffts%tg_rcv( nogrp ) )
ALLOCATE( dffts%tg_psdsp( nogrp ) )
ALLOCATE( dffts%tg_usdsp( nogrp ) )
ALLOCATE( dffts%tg_rdsp( nogrp ) )
dffts%tg_snd(1) = dffts%nr3x * dffts%nsw( me_pool + 1 )
IF( dffts%nr3x * dffts%nsw( me_pool + 1 ) > dffts%tg_nnr ) THEN
CALL errore( ' task_groups_init ', ' inconsistent dffts%tg_nnr ', 1 )
ENDIF
dffts%tg_psdsp(1) = 0
dffts%tg_usdsp(1) = 0
dffts%tg_rcv(1) = dffts%nr3x * dffts%nsw( nolist(1) + 1 )
dffts%tg_rdsp(1) = 0
DO i = 2, nogrp
dffts%tg_snd(i) = dffts%nr3x * dffts%nsw( me_pool + 1 )
dffts%tg_psdsp(i) = dffts%tg_psdsp(i-1) + dffts%tg_nnr
dffts%tg_usdsp(i) = dffts%tg_usdsp(i-1) + dffts%tg_snd(i-1)
dffts%tg_rcv(i) = dffts%nr3x * dffts%nsw( nolist(i) + 1 )
dffts%tg_rdsp(i) = dffts%tg_rdsp(i-1) + dffts%tg_rcv(i-1)
ENDDO
! ALLOCATE( dffts%tg_sca_snd( nproc_pool / nogrp ) )
! ALLOCATE( dffts%tg_sca_rcv( nproc_pool / nogrp ) )
! ALLOCATE( dffts%tg_sca_sdsp( nproc_pool / nogrp ) )
! ALLOCATE( dffts%tg_sca_rdsp( nproc_pool / nogrp ) )
! ALLOCATE( dffts%tg_sca_off( nproc_pool / nogrp ) )
! do i = 1, nproc_pool / nogrp
! dffts%tg_sca_snd (i) = dffts%tg_npp ( nplist( i ) + 1 ) * dffts%tg_nsw ( me_pool + 1 )
! dffts%tg_sca_rcv (i) = dffts%tg_npp ( me_pool + 1 ) * dffts%tg_nsw ( nplist( i ) + 1 )
! end do
! dffts%tg_sca_off(1) = 0
! do i = 2, nproc_pool / nogrp
! dffts%tg_sca_off(i) = dffts%tg_sca_off(i - 1) + dffts%tg_npp ( nplist( i - 1 ) + 1 )
! end do
! dffts%tg_sca_sdsp (1) = 0
! dffts%tg_sca_rdsp (1) = 0
! do i = 2, nproc_pool / nogrp
! dffts%tg_sca_sdsp (i) = dffts%tg_sca_sdsp (i - 1) + dffts%tg_sca_snd (i - 1)
! dffts%tg_sca_rdsp (i) = dffts%tg_sca_rdsp (i - 1) + dffts%tg_sca_rcv (i - 1)
! enddo
dffts%have_task_groups = .true.
RETURN
END SUBROUTINE task_groups_init
!
SUBROUTINE tg_gather( dffts, v, tg_v )
!
USE parallel_include
!
USE mp_global, ONLY : me_pool, nogrp, ogrp_comm, nolist
USE fft_types, ONLY : fft_dlay_descriptor
! T.G.
! NOGRP: Number of processors per orbital task group
IMPLICIT NONE
TYPE(fft_dlay_descriptor), INTENT(in) :: dffts
REAL(DP) :: v(:)
REAL(DP) :: tg_v(:)
INTEGER :: nsiz, i, ierr, nsiz_tg
INTEGER :: recv_cnt( nogrp ), recv_displ( nogrp )
nsiz_tg = dffts%tg_nnr * nogrp
IF( size( tg_v ) < nsiz_tg ) &
CALL errore( ' tg_gather ', ' tg_v too small ', ( nsiz_tg - size( tg_v ) ) )
nsiz = dffts%npp( me_pool+1 ) * dffts%nr1x * dffts%nr2x
IF( size( v ) < nsiz ) &
CALL errore( ' tg_gather ', ' v too small ', ( nsiz - size( v ) ) )
!
! The potential in v is distributed across all processors
! We need to redistribute it so that it is completely contained in the
! processors of an orbital TASK-GROUP
!
recv_cnt(1) = dffts%npp( nolist(1) + 1 ) * dffts%nr1x * dffts%nr2x
recv_displ(1) = 0
DO i = 2, nogrp
recv_cnt(i) = dffts%npp( nolist(i) + 1 ) * dffts%nr1x * dffts%nr2x
recv_displ(i) = recv_displ(i-1) + recv_cnt(i-1)
ENDDO
! clean only elements that will not be overwritten
!
DO i = recv_displ(nogrp) + recv_cnt( nogrp ) + 1, size( tg_v )
tg_v( i ) = 0.0d0
ENDDO
#if defined (__PARA) && defined (__MPI)
CALL MPI_Allgatherv( v(1), nsiz, MPI_DOUBLE_PRECISION, &
tg_v(1), recv_cnt, recv_displ, MPI_DOUBLE_PRECISION, ogrp_comm, IERR)
IF( ierr /= 0 ) &
CALL errore( ' tg_gather ', ' MPI_Allgatherv ', abs( ierr ) )
#endif
END SUBROUTINE
END MODULE task_groups