diff --git a/CPV/chargedensity.f90 b/CPV/chargedensity.f90 index 36ee5c940..6f48a503b 100644 --- a/CPV/chargedensity.f90 +++ b/CPV/chargedensity.f90 @@ -115,8 +115,7 @@ USE constants, ONLY: pi, fpi USE mp, ONLY: mp_sum USE io_global, ONLY: stdout, ionode - USE mp_global, ONLY: intra_bgrp_comm, nogrp, me_bgrp, & - use_task_groups, ogrp_comm, nolist, nbgrp, inter_bgrp_comm + USE mp_global, ONLY: intra_bgrp_comm, nbgrp, inter_bgrp_comm, me_bgrp USE funct, ONLY: dft_is_meta USE cg_module, ONLY: tcg USE cp_interfaces, ONLY: stress_kin, enkin @@ -295,7 +294,7 @@ rhos(ir,iss1)=rhos(ir,iss1) + sa1*( DBLE(psis(ir)))**2 END DO ! - ELSE IF( use_task_groups ) THEN + ELSE IF( dffts%have_task_groups ) THEN ! CALL loop_over_states_tg() ! @@ -500,13 +499,13 @@ INTEGER :: from, ii, eig_index, eig_offset REAL(DP), ALLOCATABLE :: tmp_rhos(:,:) - ALLOCATE( psis( dffts%tg_nnr * nogrp ) ) + ALLOCATE( psis( dffts%tg_nnr * dffts%nogrp ) ) ! ALLOCATE( tmp_rhos ( dffts%nr1x*dffts%nr2x*dffts%tg_npp( me_bgrp + 1 ), nspin ) ) ! tmp_rhos = 0_DP - do i = 1, nbsp_bgrp, 2*nogrp + do i = 1, nbsp_bgrp, 2*dffts%nogrp ! ! Initialize wave-functions in Fourier space (to be FFTed) ! The size of psis is nnr: which is equal to the total number @@ -530,7 +529,7 @@ ! eig_offset = 0 - do eig_index = 1, 2*nogrp, 2 + do eig_index = 1, 2*dffts%nogrp, 2 ! ! here we pack 2*nogrp electronic states in the psis array ! @@ -571,8 +570,8 @@ ! ! Compute the proper factor for each band ! - DO ii = 1, nogrp - IF( nolist( ii ) == me_bgrp ) EXIT + DO ii = 1, dffts%nogrp + IF( dffts%nolist( ii ) == me_bgrp ) EXIT END DO ! ! Remember two bands are packed in a single array : @@ -629,8 +628,8 @@ CALL mp_sum( tmp_rhos, inter_bgrp_comm ) END IF - IF ( nogrp > 1 ) THEN - CALL mp_sum( tmp_rhos, gid = ogrp_comm ) + IF ( dffts%nogrp > 1 ) THEN + CALL mp_sum( tmp_rhos, gid = dffts%ogrp_comm ) ENDIF ! !BRING CHARGE DENSITY BACK TO ITS ORIGINAL POSITION @@ -639,9 +638,9 @@ !orbital group then does a local copy (reshuffling) of its data ! from = 1 - DO ii = 1, nogrp - IF ( nolist( ii ) == me_bgrp ) EXIT !Exit the loop - from = from + dffts%nr1x*dffts%nr2x*dffts%npp( nolist( ii ) + 1 )! From where to copy initially + DO ii = 1, dffts%nogrp + IF ( dffts%nolist( ii ) == me_bgrp ) EXIT !Exit the loop + from = from + dffts%nr1x*dffts%nr2x*dffts%npp( dffts%nolist( ii ) + 1 )! From where to copy initially ENDDO ! DO ir = 1, nspin diff --git a/CPV/cplib.f90 b/CPV/cplib.f90 index ff306f0bc..1437acd50 100644 --- a/CPV/cplib.f90 +++ b/CPV/cplib.f90 @@ -2188,7 +2188,7 @@ END FUNCTION ! initialize FFT descriptor CALL fft_box_set( dfftb, nr1b, nr2b, nr3b, nr1bx, nr2bx, nr3bx, & - nat, irb, me_bgrp+1, nproc_bgrp, intra_bgrp_comm, dfftp%npp, dfftp%ipp ) + nat, irb, dfftp%npp, dfftp%ipp ) IF( iprsta > 2 ) THEN isa = 1 diff --git a/CPV/cpr.f90 b/CPV/cpr.f90 index fe46214f3..3df4619db 100644 --- a/CPV/cpr.f90 +++ b/CPV/cpr.f90 @@ -910,7 +910,6 @@ SUBROUTINE terminate_run() USE cp_main_variables, ONLY : acc USE cg_module, ONLY : tcg, print_clock_tcg USE mp, ONLY : mp_report - USE mp_global, ONLY : use_task_groups ! IMPLICIT NONE ! @@ -951,11 +950,7 @@ SUBROUTINE terminate_run() ! IF (tcg) call print_clock_tcg() ! - IF( use_task_groups ) THEN - ! - CALL print_clock( 'ALLTOALL' ) - ! - END IF + CALL print_clock( 'ALLTOALL' ) ! CALL mp_report() ! diff --git a/CPV/forces.f90 b/CPV/forces.f90 index 4c9d1d1fc..3654663d9 100644 --- a/CPV/forces.f90 +++ b/CPV/forces.f90 @@ -39,8 +39,7 @@ USE funct, ONLY: dft_is_meta USE fft_base, ONLY: dffts USE fft_interfaces, ONLY: fwfft, invfft - USE mp_global, ONLY: nogrp, me_bgrp, ogrp_comm, & - use_task_groups + USE mp_global, ONLY: me_bgrp ! IMPLICIT NONE ! @@ -69,9 +68,9 @@ ! CALL start_clock( 'dforce' ) ! - IF( use_task_groups ) THEN - nogrp_ = nogrp - ALLOCATE( psi( dffts%tg_nnr * nogrp ) ) + IF( dffts%have_task_groups ) THEN + nogrp_ = dffts%nogrp + ALLOCATE( psi( dffts%tg_nnr * dffts%nogrp ) ) ELSE nogrp_ = 1 ALLOCATE( psi( nrxxs ) ) @@ -120,7 +119,7 @@ iss2 = iss1 END IF ! - IF( use_task_groups ) THEN + IF( dffts%have_task_groups ) THEN ! !$omp parallel do DO ir = 1, dffts%nr1x*dffts%nr2x*dffts%tg_npp( me_bgrp + 1 ) @@ -161,7 +160,7 @@ !$omp parallel default(none) & !$omp private( eig_offset, igno, fi, fip, idx, fp, fm, ig ) & !$omp shared( nogrp_ , f, ngw, psi, df, da, c, tpiba2, tens, dffts, me_bgrp, & -!$omp i, n, ggp, use_task_groups, nls, nlsm ) +!$omp i, n, ggp, nls, nlsm ) eig_offset = 0 igno = 1 @@ -176,7 +175,7 @@ fi = -0.5d0*f(i+idx-1) fip = -0.5d0*f(i+idx) endif - IF( use_task_groups ) THEN + IF( dffts%have_task_groups ) THEN !$omp do DO ig=1,ngw fp= psi(nls(ig)+eig_offset) + psi(nlsm(ig)+eig_offset) diff --git a/CPV/init.f90 b/CPV/init.f90 index 40e6aeec2..d1674af23 100644 --- a/CPV/init.f90 +++ b/CPV/init.f90 @@ -47,7 +47,8 @@ USE electrons_module, ONLY: bmeshset USE electrons_base, ONLY: distribute_bands USE problem_size, ONLY: cpsizes - USE mp_global, ONLY: nproc_bgrp, nbgrp, my_bgrp_id, intra_bgrp_comm + USE mp_global, ONLY: me_bgrp, nproc_bgrp, nbgrp, my_bgrp_id, intra_bgrp_comm + USE mp_global, ONLY: get_ntask_groups USE core, ONLY: nlcc_any USE uspp, ONLY: okvan @@ -56,7 +57,7 @@ integer :: i real(dp) :: rat1, rat2, rat3 real(dp) :: at(3,3), bg(3,3), tpiba2 - integer :: ng_, ngs_, ngm_ , ngw_ + integer :: ng_, ngs_, ngm_ , ngw_ , nogrp_ tpiba2 = ( tpi / alat ) ** 2 @@ -117,9 +118,10 @@ ! ... set the sticks mesh and distribute g vectors among processors ! ... pstickset lso sets the local real-space grid dimensions ! + nogrp_ = get_ntask_groups() CALL pstickset( gamma_only, bg, gcutm, gkcut, gcutms, & - dfftp, dffts, ngw_ , ngm_ , ngs_ ) + dfftp, dffts, ngw_ , ngm_ , ngs_ , me_bgrp, nproc_bgrp, intra_bgrp_comm, nogrp_ ) ! ! ! ... Initialize reciprocal space local and global dimensions diff --git a/CPV/runcp.f90 b/CPV/runcp.f90 index 172294c24..372797681 100644 --- a/CPV/runcp.f90 +++ b/CPV/runcp.f90 @@ -26,8 +26,8 @@ ! USE parallel_include USE kinds, ONLY : DP - USE mp_global, ONLY : nogrp, ogrp_comm, me_bgrp, nolist,& - use_task_groups, my_bgrp_id, nbgrp, inter_bgrp_comm + USE mp_global, ONLY : me_bgrp, & + my_bgrp_id, nbgrp, inter_bgrp_comm USE mp, ONLY : mp_sum USE fft_base, ONLY : dffts, tg_gather use wave_base, only : wave_steepest, wave_verlet @@ -74,9 +74,9 @@ IF( restart ) iflag = 2 END IF - IF( use_task_groups ) THEN - tg_rhos_siz = nogrp * dffts%tg_nnr - c2_siz = nogrp * ngwx + IF( dffts%have_task_groups ) THEN + tg_rhos_siz = dffts%nogrp * dffts%tg_nnr + c2_siz = dffts%nogrp * ngwx ELSE tg_rhos_siz = 1 c2_siz = ngw @@ -116,7 +116,7 @@ c2 = 0D0 c3 = 0D0 - IF( use_task_groups ) THEN + IF( dffts%have_task_groups ) THEN ! ! The potential in rhos is distributed accros all processors ! We need to redistribute it so that it is completely contained in the @@ -126,7 +126,7 @@ CALL tg_gather( dffts, rhos(:,i), tg_rhos(:,i) ) END DO - incr = 2 * nogrp + incr = 2 * dffts%nogrp ELSE @@ -137,7 +137,7 @@ DO i = 1, nbsp_bgrp, incr - IF( use_task_groups ) THEN + IF( dffts%have_task_groups ) THEN ! !The input coefficients to dforce cover eigenstates i:i+2*NOGRP-1 !Thus, in dforce the dummy arguments for c0_bgrp(1,i) and @@ -241,11 +241,8 @@ USE efield_module, ONLY : dforce_efield, tefield USE electrons_base, ONLY : ispin, nspin, f, n=>nbsp USE cp_interfaces, ONLY : dforce - USE mp_global, ONLY : use_task_groups - ! USE gvecw, ONLY: ngw - ! - ! + USE fft_base, ONLY: dffts USE electrons_base, ONLY: nx=>nbnd, nupdwn, iupdwn, nbspx, nbsp USE mp, ONLY: mp_sum USE mp_global, ONLY: intra_bgrp_comm @@ -286,7 +283,7 @@ 'Electric field and sic are not implemented',2) IF( nspin == 1 ) CALL errore(' runcp_force_pairing ',' inconsistent nspin ', 1) - IF( use_task_groups ) CALL errore(' runcp_force_pairing ',' task_groups not implemented ', 1) + IF( dffts%have_task_groups ) CALL errore(' runcp_force_pairing ',' task_groups not implemented ', 1) ! ALLOCATE( emadt2( ngw ) ) ALLOCATE( emaver( ngw ) ) diff --git a/Modules/environment.f90 b/Modules/environment.f90 index 11199470c..62a494aa1 100644 --- a/Modules/environment.f90 +++ b/Modules/environment.f90 @@ -13,7 +13,7 @@ MODULE environment USE io_files, ONLY: crash_file, crashunit, nd_nmbr USE io_global, ONLY: stdout, meta_ionode USE mp_global, ONLY: me_image, my_image_id, root_image, nimage, & - nproc_image, nproc, nogrp, npool, nproc_pool, nbgrp + nproc_image, nproc, npool, nproc_pool, nbgrp, get_ntask_groups USE global_version, ONLY: version_number IMPLICIT NONE @@ -194,8 +194,8 @@ CONTAINS '(5X,"K-points division: npool = ",I4)' ) npool IF ( nproc_pool > 1 ) WRITE( stdout, & '(5X,"R & G space division: proc/pool = ",I4)' ) nproc_pool - IF ( nogrp > 1 ) WRITE( stdout, & - '(5X,"wavefunctions fft division: fft/group = ",I4)' ) nogrp + IF ( get_ntask_groups() > 1 ) WRITE( stdout, & + '(5X,"wavefunctions fft division: fft/group = ",I4)' ) get_ntask_groups() ! END SUBROUTINE parallel_info !==-----------------------------------------------------------------------==! diff --git a/Modules/fft_base.f90 b/Modules/fft_base.f90 index ac9ef0bf7..9d8be53bd 100644 --- a/Modules/fft_base.f90 +++ b/Modules/fft_base.f90 @@ -55,7 +55,8 @@ ! like infiniband, ethernet, myrinet ! !----------------------------------------------------------------------- -SUBROUTINE fft_scatter ( f_in, nr3x, nxx_, f_aux, ncp_, npp_, sign, use_tg ) +SUBROUTINE fft_scatter ( f_in, nr3x, nxx_, f_aux, ncp_, npp_, sign, use_tg, & + me_pgrp, npgrp, pgrp_comm, nplist ) !----------------------------------------------------------------------- ! ! transpose the fft grid across nodes @@ -89,7 +90,7 @@ SUBROUTINE fft_scatter ( f_in, nr3x, nxx_, f_aux, ncp_, npp_, sign, use_tg ) USE parallel_include #endif USE mp_global, ONLY : nproc_pool, me_pool, intra_pool_comm, nproc, & - my_image_id, nogrp, pgrp_comm, nplist, me_pgrp, npgrp + my_image_id USE kinds, ONLY : DP IMPLICIT NONE @@ -97,6 +98,7 @@ SUBROUTINE fft_scatter ( f_in, nr3x, nxx_, f_aux, ncp_, npp_, sign, use_tg ) INTEGER, INTENT(in) :: nr3x, nxx_, sign, ncp_ (:), npp_ (:) COMPLEX (DP), INTENT(inout) :: f_in (nxx_), f_aux (nxx_) LOGICAL, OPTIONAL, INTENT(in) :: use_tg + INTEGER, OPTIONAL, INTENT(in) :: pgrp_comm, nplist(:), npgrp, me_pgrp #ifdef __PARA @@ -426,7 +428,8 @@ END SUBROUTINE fft_scatter ! with a defined topology, like on bluegene and cray machine ! !----------------------------------------------------------------------- -SUBROUTINE fft_scatter ( f_in, nr3x, nxx_, f_aux, ncp_, npp_, sign, use_tg ) +SUBROUTINE fft_scatter ( f_in, nr3x, nxx_, f_aux, ncp_, npp_, sign, use_tg, & + me_pgrp, npgrp, pgrp_comm, nplist ) !----------------------------------------------------------------------- ! ! transpose the fft grid across nodes @@ -459,8 +462,7 @@ SUBROUTINE fft_scatter ( f_in, nr3x, nxx_, f_aux, ncp_, npp_, sign, use_tg ) #ifdef __PARA USE parallel_include #endif - USE mp_global, ONLY : nproc_pool, me_pool, intra_pool_comm, nproc, & - my_image_id, nogrp, pgrp_comm, nplist + USE mp_global, ONLY : nproc_pool, me_pool, intra_pool_comm, nproc USE kinds, ONLY : DP IMPLICIT NONE @@ -468,6 +470,7 @@ SUBROUTINE fft_scatter ( f_in, nr3x, nxx_, f_aux, ncp_, npp_, sign, use_tg ) INTEGER, INTENT(in) :: nr3x, nxx_, sign, ncp_ (:), npp_ (:) COMPLEX (DP), INTENT(inout) :: f_in (nxx_), f_aux (nxx_) LOGICAL, OPTIONAL, INTENT(in) :: use_tg + INTEGER, OPTIONAL, INTENT(in) :: pgrp_comm, nplist(:), npgrp, me_pgrp #ifdef __PARA @@ -491,7 +494,7 @@ SUBROUTINE fft_scatter ( f_in, nr3x, nxx_, f_aux, ncp_, npp_, sign, use_tg ) ! IF( use_tg_ ) THEN ! This is the number of procs. in the plane-wave group - nprocp = nproc_pool / nogrp + nprocp = npgrp ELSE nprocp = nproc_pool ENDIF @@ -1191,7 +1194,7 @@ SUBROUTINE tg_gather( dffts, v, tg_v ) IF( size( tg_v ) < nsiz_tg ) & CALL errore( ' tg_gather ', ' tg_v too small ', ( nsiz_tg - size( tg_v ) ) ) - nsiz = dffts%npp( dffts%myid+1 ) * dffts%nr1x * dffts%nr2x + nsiz = dffts%npp( dffts%mype+1 ) * dffts%nr1x * dffts%nr2x IF( size( v ) < nsiz ) & CALL errore( ' tg_gather ', ' v too small ', ( nsiz - size( v ) ) ) diff --git a/Modules/fft_interfaces.f90 b/Modules/fft_interfaces.f90 index 3aa4d60a9..7c2ce05fa 100644 --- a/Modules/fft_interfaces.f90 +++ b/Modules/fft_interfaces.f90 @@ -83,7 +83,6 @@ use fft_scalar, only: cfft3d, cfft3ds, cft_b, cft_b_omp use fft_parallel, only: tg_cft3s USE fft_types, only: fft_dlay_descriptor - use mp_global, only: use_task_groups IMPLICIT none @@ -135,7 +134,7 @@ ELSE IF( grid_type == 'Smooth' ) THEN call tg_cft3s( f, dffts, 1 ) ELSE IF( grid_type == 'Wave' ) THEN - call tg_cft3s( f, dffts, 2, use_task_groups ) + call tg_cft3s( f, dffts, 2, dffts%have_task_groups ) ELSE IF( grid_type == 'Box' .AND. dfftb%np3( ia ) > 0 ) THEN #if defined __OPENMP && defined __FFTW call cft_b_omp( f, dfftb%nr1, dfftb%nr2, dfftb%nr3, & @@ -200,7 +199,6 @@ use fft_scalar, only: cfft3d, cfft3ds use fft_parallel, only: tg_cft3s USE fft_types, only: fft_dlay_descriptor - use mp_global, only: use_task_groups implicit none @@ -237,7 +235,7 @@ ELSE IF( grid_type == 'Smooth' ) THEN call tg_cft3s(f,dffts,-1) ELSE IF( grid_type == 'Wave' ) THEN - call tg_cft3s(f,dffts,-2, use_task_groups ) + call tg_cft3s(f,dffts,-2, dffts%have_task_groups ) END IF #else diff --git a/Modules/fft_parallel.f90 b/Modules/fft_parallel.f90 index 8e0be7b1a..9f6db6f07 100644 --- a/Modules/fft_parallel.f90 +++ b/Modules/fft_parallel.f90 @@ -60,8 +60,8 @@ SUBROUTINE tg_cft3s( f, dfft, isgn, use_task_groups ) USE fft_scalar, ONLY : cft_1z, cft_2xy USE fft_base, ONLY : fft_scatter USE kinds, ONLY : DP - USE mp_global, ONLY : me_pool, nproc_pool, ogrp_comm, npgrp, nogrp, & - intra_pool_comm, nolist, nplist + USE mp_global, ONLY : me_pool, nproc_pool, & + intra_pool_comm USE fft_types, ONLY : fft_dlay_descriptor USE parallel_include @@ -99,8 +99,8 @@ SUBROUTINE tg_cft3s( f, dfft, isgn, use_task_groups ) nx3 = dfft%nr3x ! IF( use_tg ) THEN - ALLOCATE( aux( nogrp * dfft%tg_nnr ) ) - ALLOCATE( YF ( nogrp * dfft%tg_nnr ) ) + ALLOCATE( aux( dfft%nogrp * dfft%tg_nnr ) ) + ALLOCATE( YF ( dfft%nogrp * dfft%tg_nnr ) ) ELSE ALLOCATE( aux( dfft%tg_nnr ) ) ENDIF @@ -198,10 +198,10 @@ CONTAINS ! IF( .not. use_tg ) RETURN ! - IF( dfft%tg_rdsp(nogrp) + dfft%tg_rcv(nogrp) > size( yf ) ) THEN + IF( dfft%tg_rdsp(dfft%nogrp) + dfft%tg_rcv(dfft%nogrp) > size( yf ) ) THEN CALL errore( ' tg_cfft ', ' inconsistent size ', 1 ) ENDIF - IF( dfft%tg_psdsp(nogrp) + dfft%tg_snd(nogrp) > size( f ) ) THEN + IF( dfft%tg_psdsp(dfft%nogrp) + dfft%tg_snd(dfft%nogrp) > size( f ) ) THEN CALL errore( ' tg_cfft ', ' inconsistent size ', 2 ) ENDIF @@ -213,7 +213,7 @@ CONTAINS #if defined __MPI CALL MPI_ALLTOALLV( f(1), dfft%tg_snd, dfft%tg_psdsp, MPI_DOUBLE_COMPLEX, yf(1), dfft%tg_rcv, & - & dfft%tg_rdsp, MPI_DOUBLE_COMPLEX, ogrp_comm, IERR) + & dfft%tg_rdsp, MPI_DOUBLE_COMPLEX, dfft%ogrp_comm, IERR) IF( ierr /= 0 ) THEN CALL errore( ' tg_cfft ', ' alltoall error 1 ', abs(ierr) ) ENDIF @@ -237,10 +237,10 @@ CONTAINS ! IF( .not. use_tg ) RETURN ! - IF( dfft%tg_usdsp(nogrp) + dfft%tg_snd(nogrp) > size( f ) ) THEN + IF( dfft%tg_usdsp(dfft%nogrp) + dfft%tg_snd(dfft%nogrp) > size( f ) ) THEN CALL errore( ' tg_cfft ', ' inconsistent size ', 3 ) ENDIF - IF( dfft%tg_rdsp(nogrp) + dfft%tg_rcv(nogrp) > size( yf ) ) THEN + IF( dfft%tg_rdsp(dfft%nogrp) + dfft%tg_rcv(dfft%nogrp) > size( yf ) ) THEN CALL errore( ' tg_cfft ', ' inconsistent size ', 4 ) ENDIF @@ -249,7 +249,7 @@ CONTAINS #if defined __MPI CALL MPI_Alltoallv( yf(1), & dfft%tg_rcv, dfft%tg_rdsp, MPI_DOUBLE_COMPLEX, f(1), & - dfft%tg_snd, dfft%tg_usdsp, MPI_DOUBLE_COMPLEX, ogrp_comm, IERR) + dfft%tg_snd, dfft%tg_usdsp, MPI_DOUBLE_COMPLEX, dfft%ogrp_comm, IERR) IF( ierr /= 0 ) THEN CALL errore( ' tg_cfft ', ' alltoall error 2 ', abs(ierr) ) ENDIF @@ -288,7 +288,8 @@ CONTAINS npp = dfft%tg_npp( me_p ) nnp = nx1*nx2 ! - CALL fft_scatter( aux, nx3, nogrp*dfft%tg_nnr, f, dfft%tg_nsw, dfft%tg_npp, iopt, use_tg ) + CALL fft_scatter( aux, nx3, dfft%nogrp*dfft%tg_nnr, f, dfft%tg_nsw, dfft%tg_npp, iopt, use_tg, & + dfft%me_pgrp, dfft%npgrp, dfft%pgrp_comm, dfft%nplist ) ! ELSE ! @@ -407,7 +408,8 @@ CONTAINS ! IF( use_tg ) THEN ! - CALL fft_scatter( aux, nx3, nogrp*dfft%tg_nnr, f, dfft%tg_nsw, dfft%tg_npp, iopt, use_tg ) + CALL fft_scatter( aux, nx3, dfft%nogrp*dfft%tg_nnr, f, dfft%tg_nsw, dfft%tg_npp, iopt, use_tg, & + dfft%me_pgrp, dfft%npgrp, dfft%pgrp_comm, dfft%nplist ) ! ELSE ! diff --git a/Modules/fft_types.f90 b/Modules/fft_types.f90 index f2b564dc5..7ebe614ab 100644 --- a/Modules/fft_types.f90 +++ b/Modules/fft_types.f90 @@ -54,19 +54,22 @@ MODULE fft_types ! ! fft parallelization ! - INTEGER :: myid ! my processor id + INTEGER :: mype ! my processor id (starting from 0) in the fft group INTEGER :: comm ! communicator of the fft gruop INTEGER :: nproc ! number of processor in the fft group ! ! task groups ! - INTEGER :: NOGRP - INTEGER :: NPGRP - INTEGER :: ogrp_comm - INTEGER :: pgrp_comm - INTEGER, POINTER :: nolist(:) ! number of sticks per task group ( wave func ) - ! LOGICAL :: have_task_groups + ! + INTEGER :: me_pgrp ! task id for plane wave task group + INTEGER :: nogrp ! number of proc. in an orbital "task group" + INTEGER :: npgrp ! number of proc. in a plane-wave "task group" + INTEGER :: ogrp_comm ! orbital group communicator + INTEGER :: pgrp_comm ! plane-wave group communicator + INTEGER, POINTER :: nolist(:) ! list of pes in orbital group + INTEGER, POINTER :: nplist(:) ! list of pes in pw group + ! INTEGER :: tg_nnr ! maximum among nnr INTEGER, POINTER :: tg_nsw(:) ! number of sticks per task group ( wave func ) INTEGER, POINTER :: tg_npp(:) ! number of "Z" planes per task group @@ -84,9 +87,10 @@ MODULE fft_types CONTAINS - SUBROUTINE fft_dlay_allocate( desc, myid, nproc, comm, nx, ny ) + SUBROUTINE fft_dlay_allocate( desc, mype, nproc, comm, nogrp, nx, ny ) TYPE (fft_dlay_descriptor) :: desc - INTEGER, INTENT(in) :: myid, nproc, comm, nx, ny ! myid starting from 0 + INTEGER, INTENT(in) :: mype, nproc, comm, nx, ny ! mype starting from 0 + INTEGER, INTENT(in) :: nogrp ! number of task groups ALLOCATE( desc%nsp( nproc ) ) ALLOCATE( desc%nsw( nproc ) ) ALLOCATE( desc%ngl( nproc ) ) @@ -113,15 +117,25 @@ CONTAINS desc%id = 0 - desc%myid = myid + desc%mype = mype desc%comm = comm desc%nproc = nproc desc%have_task_groups = .false. - desc%NOGRP = 0 - desc%NPGRP = 0 + IF( nogrp > 1 ) & + desc%have_task_groups = .true. + desc%me_pgrp = 0 + ! + IF( MOD( nproc, MAX( 1, nogrp ) ) /= 0 ) & + CALL errore( " fft_dlay_allocate ", "the number of task groups should be a divisor of nproc ", 1 ) + + desc%nogrp = MAX( 1, nogrp ) + desc%npgrp = nproc / MAX( 1, nogrp ) desc%ogrp_comm = 0 desc%pgrp_comm = 0 - NULLIFY( desc%nolist ) + ALLOCATE( desc%nolist( desc%nogrp ) ) + ALLOCATE( desc%nplist( desc%npgrp ) ) + desc%nolist = 0 + desc%nplist = 0 NULLIFY( desc%tg_nsw ) NULLIFY( desc%tg_npp ) NULLIFY( desc%tg_snd ) @@ -146,9 +160,10 @@ CONTAINS IF ( associated( desc%ismap ) ) DEALLOCATE( desc%ismap ) IF ( associated( desc%iplp ) ) DEALLOCATE( desc%iplp ) IF ( associated( desc%iplw ) ) DEALLOCATE( desc%iplw ) + IF ( associated( desc%nolist ) ) DEALLOCATE( desc%nolist ) + IF ( associated( desc%nplist ) ) DEALLOCATE( desc%nplist ) desc%id = 0 IF( desc%have_task_groups ) THEN - IF ( associated( desc%nolist ) ) DEALLOCATE( desc%nolist ) IF ( associated( desc%tg_nsw ) ) DEALLOCATE( desc%tg_nsw ) IF ( associated( desc%tg_npp ) ) DEALLOCATE( desc%tg_npp ) IF ( associated( desc%tg_snd ) ) DEALLOCATE( desc%tg_snd ) @@ -162,9 +177,9 @@ CONTAINS !=----------------------------------------------------------------------------=! - SUBROUTINE fft_box_allocate( desc, myid, nproc, comm, nat ) + SUBROUTINE fft_box_allocate( desc, mype, nproc, comm, nat ) TYPE (fft_dlay_descriptor) :: desc - INTEGER, INTENT(in) :: nat, nproc, myid, comm ! myid starting from 0 + INTEGER, INTENT(in) :: nat, nproc, mype, comm ! mype starting from 0 ALLOCATE( desc%irb( 3, nat ) ) ALLOCATE( desc%imin3( nat ) ) ALLOCATE( desc%imax3( nat ) ) @@ -177,7 +192,7 @@ CONTAINS desc%npp = 0 desc%ipp = 0 desc%np3 = 0 - desc%myid = myid + desc%mype = mype desc%nproc = nproc desc%comm = comm desc%have_task_groups = .false. @@ -197,8 +212,8 @@ CONTAINS !=----------------------------------------------------------------------------=! - SUBROUTINE fft_dlay_set( desc, tk, nst, nr1, nr2, nr3, nr1x, nr2x, nr3x, me, & - nproc, comm, nogrp, ub, lb, idx, in1, in2, ncp, ncpw, ngp, ngpw, st, stw ) + SUBROUTINE fft_dlay_set( desc, tk, nst, nr1, nr2, nr3, nr1x, nr2x, nr3x, & + ub, lb, idx, in1, in2, ncp, ncpw, ngp, ngpw, st, stw ) TYPE (fft_dlay_descriptor) :: desc @@ -206,10 +221,6 @@ CONTAINS INTEGER, INTENT(in) :: nst INTEGER, INTENT(in) :: nr1, nr2, nr3 ! size of real space grid INTEGER, INTENT(in) :: nr1x, nr2x, nr3x ! padded size of real space grid - INTEGER, INTENT(in) :: me ! processor index (starting from 1) - INTEGER, INTENT(in) :: nproc ! number of processors - INTEGER, INTENT(in) :: comm ! communicator - INTEGER, INTENT(in) :: nogrp ! number of processors in task-group INTEGER, INTENT(in) :: ub(3), lb(3) ! upper and lower bound of real space indices INTEGER, INTENT(in) :: idx(:) INTEGER, INTENT(in) :: in1(:) @@ -221,15 +232,15 @@ CONTAINS INTEGER, INTENT(in) :: st( lb(1) : ub(1), lb(2) : ub(2) ) INTEGER, INTENT(in) :: stw( lb(1) : ub(1), lb(2) : ub(2) ) - INTEGER :: npp( nproc ), n3( nproc ), nsp( nproc ) + INTEGER :: npp( desc%nproc ), n3( desc%nproc ), nsp( desc%nproc ) INTEGER :: np, nq, i, is, iss, i1, i2, m1, m2, n1, n2, ip ! Task-grouping C. Bekas ! INTEGER :: sm - IF( ( size( desc%ngl ) < nproc ) .or. ( size( desc%npp ) < nproc ) .or. & - ( size( desc%ipp ) < nproc ) .or. ( size( desc%iss ) < nproc ) ) & + IF( ( size( desc%ngl ) < desc%nproc ) .or. ( size( desc%npp ) < desc%nproc ) .or. & + ( size( desc%ipp ) < desc%nproc ) .or. ( size( desc%iss ) < desc%nproc ) ) & CALL errore( ' fft_dlay_set ', ' wrong descriptor dimensions ', 1 ) IF( ( nr1 > nr1x ) .or. ( nr2 > nr2x ) .or. ( nr3 > nr3x ) ) & @@ -238,37 +249,26 @@ CONTAINS IF( ( size( idx ) < nst ) .or. ( size( in1 ) < nst ) .or. ( size( in2 ) < nst ) ) & CALL errore( ' fft_dlay_set ', ' wrong number of stick dimensions ', 3 ) - IF( ( size( ncp ) < nproc ) .or. ( size( ngp ) < nproc ) ) & + IF( ( size( ncp ) < desc%nproc ) .or. ( size( ngp ) < desc%nproc ) ) & CALL errore( ' fft_dlay_set ', ' wrong stick dimensions ', 4 ) - IF( desc%nproc /= nproc ) & - CALL errore( ' fft_dlay_set ', ' wrong number of processor ', 4 ) - - IF( desc%myid /= (me - 1) ) & - CALL errore( ' fft_dlay_set ', ' wrong processor index ', 4 ) - - IF( desc%comm /= comm ) & - CALL errore( ' fft_dlay_set ', ' wrong communicator ', 4 ) - - desc%have_task_groups = .false. - ! Set the number of "xy" planes for each processor ! in other word do a slab partition along the z axis sm = 0 npp = 0 - IF ( nproc == 1 ) THEN + IF ( desc%nproc == 1 ) THEN npp(1) = nr3 - ELSEIF( nproc <= nr3 ) THEN - np = nr3 / nproc - nq = nr3 - np * nproc - DO i = 1, nproc + ELSEIF( desc%nproc <= nr3 ) THEN + np = nr3 / desc%nproc + nq = nr3 - np * desc%nproc + DO i = 1, desc%nproc npp(i) = np IF ( i <= nq ) npp(i) = np + 1 ENDDO ELSE DO ip = 1, nr3 ! some compiler complains for empty DO loops - DO i = 1, nproc, nogrp + DO i = 1, desc%nproc, desc%nogrp npp(i) = npp(i) + 1 sm = sm + 1 IF ( sm == nr3 ) exit @@ -277,17 +277,17 @@ CONTAINS ENDDO ENDIF - desc%npp( 1:nproc ) = npp - desc%npl = npp( me ) + desc%npp( 1:desc%nproc ) = npp + desc%npl = npp( desc%mype + 1 ) ! Find out the index of the starting plane on each proc n3 = 0 - DO i = 2, nproc + DO i = 2, desc%nproc n3(i) = n3(i-1) + npp(i-1) ENDDO - desc%ipp( 1:nproc ) = n3 + desc%ipp( 1:desc%nproc ) = n3 ! Set the proper number of sticks @@ -309,14 +309,14 @@ CONTAINS ! Set fft local workspace dimension - IF ( nproc == 1 ) THEN + IF ( desc%nproc == 1 ) THEN desc%nnr = nr1x * nr2x * nr3x desc%tg_nnr = desc%nnr ELSE - desc%nnr = max( nr3x * ncp(me), nr1x * nr2x * npp(me) ) + desc%nnr = max( nr3x * ncp( desc%mype + 1 ), nr1x * nr2x * npp( desc%mype + 1 ) ) desc%nnr = max( 1, desc%nnr ) ! ensure that desc%nrr > 0 ( for extreme parallelism ) desc%tg_nnr = desc%nnr - DO i = 1, nproc + DO i = 1, desc%nproc desc%tg_nnr = max( desc%tg_nnr, nr3x * ncp( i ) ) desc%tg_nnr = max( desc%tg_nnr, nr1x * nr2x * npp( i ) ) ENDDO @@ -325,8 +325,8 @@ CONTAINS - desc%ngl( 1:nproc ) = ngp( 1:nproc ) - desc%nwl( 1:nproc ) = ngpw( 1:nproc ) + desc%ngl( 1:desc%nproc ) = ngp( 1:desc%nproc ) + desc%nwl( 1:desc%nproc ) = ngpw( 1:desc%nproc ) IF( size( desc%isind ) < ( nr1x * nr2x ) ) & CALL errore( ' fft_dlay_set ', ' wrong descriptor dimensions, isind ', 5 ) @@ -379,7 +379,7 @@ CONTAINS ! local stick ( desc%iss ) ! - DO i = 1, nproc + DO i = 1, desc%nproc IF( i == 1 ) THEN desc%iss( i ) = 0 ELSE @@ -406,7 +406,7 @@ CONTAINS IF( ip > 0 ) THEN nsp( ip ) = nsp( ip ) + 1 desc%ismap( nsp( ip ) + desc%iss( ip ) ) = iss - IF( ip == me ) THEN + IF( ip == ( desc%mype + 1 ) ) THEN desc%isind( iss ) = nsp( ip ) ELSE desc%isind( iss ) = 0 @@ -416,14 +416,14 @@ CONTAINS ! chack number of stick against the input value - IF( any( nsp( 1:nproc ) /= ncpw( 1:nproc ) ) ) THEN - DO ip = 1, nproc + IF( any( nsp( 1:desc%nproc ) /= ncpw( 1:desc%nproc ) ) ) THEN + DO ip = 1, desc%nproc WRITE( stdout,*) ' * ', ip, ' * ', nsp( ip ), ' /= ', ncpw( ip ) ENDDO CALL errore( ' fft_dlay_set ', ' inconsistent number of sticks ', 7 ) ENDIF - desc%nsw( 1:nproc ) = nsp( 1:nproc ) + desc%nsw( 1:desc%nproc ) = nsp( 1:desc%nproc ) ! then add pseudopotential stick @@ -432,7 +432,7 @@ CONTAINS IF( ip < 0 ) THEN nsp( -ip ) = nsp( -ip ) + 1 desc%ismap( nsp( -ip ) + desc%iss( -ip ) ) = iss - IF( -ip == me ) THEN + IF( -ip == ( desc%mype + 1 ) ) THEN desc%isind( iss ) = nsp( -ip ) ELSE desc%isind( iss ) = 0 @@ -442,14 +442,14 @@ CONTAINS ! chack number of stick against the input value - IF( any( nsp( 1:nproc ) /= ncp( 1:nproc ) ) ) THEN - DO ip = 1, nproc + IF( any( nsp( 1:desc%nproc ) /= ncp( 1:desc%nproc ) ) ) THEN + DO ip = 1, desc%nproc WRITE( stdout,*) ' * ', ip, ' * ', nsp( ip ), ' /= ', ncp( ip ) ENDDO CALL errore( ' fft_dlay_set ', ' inconsistent number of sticks ', 8 ) ENDIF - desc%nsp( 1:nproc ) = nsp( 1:nproc ) + desc%nsp( 1:desc%nproc ) = nsp( 1:desc%nproc ) icount = icount + 1 desc%id = icount @@ -464,13 +464,13 @@ CONTAINS !=----------------------------------------------------------------------------=! SUBROUTINE fft_box_set( desc, nr1b, nr2b, nr3b, nr1bx, nr2bx, nr3bx, nat, & - irb, me, nproc, comm, npp, ipp ) + irb, npp, ipp ) IMPLICIT NONE TYPE (fft_dlay_descriptor) :: desc - INTEGER, INTENT(in) :: nat, me, nproc, comm + INTEGER, INTENT(in) :: nat INTEGER, INTENT(in) :: irb( :, : ) INTEGER, INTENT(in) :: npp( : ) INTEGER, INTENT(in) :: ipp( : ) @@ -483,19 +483,9 @@ CONTAINS CALL errore(" fft_box_set ", " inconsistent dimensions ", 1 ) ENDIF - IF( nproc > size( desc%npp ) ) & + IF( desc%nproc > size( desc%npp ) ) & CALL errore(" fft_box_set ", " inconsistent dimensions ", 2 ) - IF( desc%nproc /= nproc ) & - CALL errore( ' fft_dlay_set ', ' wrong number of processor ', 4 ) - - IF( desc%myid /= (me - 1) ) & - CALL errore( ' fft_dlay_set ', ' wrong processor index ', 4 ) - - IF( desc%comm /= comm ) & - CALL errore( ' fft_dlay_set ', ' wrong communicator ', 4 ) - - desc%nr1 = nr1b desc%nr2 = nr2b desc%nr3 = nr3b @@ -504,10 +494,10 @@ CONTAINS desc%nr3x = nr3bx desc%irb( 1:3, 1:nat ) = irb( 1:3, 1:nat ) - desc%npp( 1:nproc ) = npp( 1:nproc ) - desc%ipp( 1:nproc ) = ipp( 1:nproc ) + desc%npp( 1:desc%nproc ) = npp( 1:desc%nproc ) + desc%ipp( 1:desc%nproc ) = ipp( 1:desc%nproc ) - nr3 = sum( npp( 1:nproc ) ) + nr3 = sum( npp( 1:desc%nproc ) ) DO isa = 1, nat @@ -519,8 +509,8 @@ CONTAINS ibig3 = 1 + mod( irb3 + ir3 - 2, nr3 ) IF( ibig3 < 1 .or. ibig3 > nr3 ) & & CALL errore(' fft_box_set ',' ibig3 wrong ', ibig3 ) - ibig3 = ibig3 - ipp( me ) - IF ( ibig3 > 0 .and. ibig3 <= npp(me) ) THEN + ibig3 = ibig3 - ipp( desc%mype + 1 ) + IF ( ibig3 > 0 .and. ibig3 <= npp(desc%mype + 1) ) THEN imin3 = min( imin3, ir3 ) imax3 = max( imax3, ir3 ) ENDIF @@ -583,8 +573,9 @@ CONTAINS desc%nnp = nr1x * nr2x desc%npp = nr3 desc%ipp = 0 - desc%have_task_groups = .false. desc%tg_nnr = desc%nnr + ! + desc%have_task_groups = .false. RETURN END SUBROUTINE fft_dlay_scalar diff --git a/Modules/mp_global.f90 b/Modules/mp_global.f90 index 585e59480..0bd6c0908 100644 --- a/Modules/mp_global.f90 +++ b/Modules/mp_global.f90 @@ -74,15 +74,7 @@ MODULE mp_global ! ! ... "task" groups (for band parallelization of FFT) ! - INTEGER :: nogrp = 1 ! number of proc. in an orbital "task group" - INTEGER :: npgrp = 1 ! number of proc. in a plane-wave "task group" - INTEGER :: me_pgrp = 0 ! task id for plane wave task group - INTEGER, ALLOCATABLE :: nolist(:) ! list of procs in my orbital task group - INTEGER, ALLOCATABLE :: nplist(:) ! list of procs in my plane wave task group - INTEGER :: pgrp_comm = 0 ! plane-wave group communicator - INTEGER :: ogrp_comm = 0 ! orbital group communicator - LOGICAL :: & - use_task_groups = .FALSE. ! if TRUE task groups parallelization is used + INTEGER :: ntask_groups = 1 ! number of proc. in an orbital "task group" ! ! ... Misc parallelization info ! @@ -93,6 +85,7 @@ MODULE mp_global INTEGER :: nproc_pool_file = 1 ! in a pool ! PRIVATE :: init_images, init_pools, init_bands, init_ortho + PRIVATE :: ntask_groups ! CONTAINS ! @@ -112,7 +105,7 @@ CONTAINS ! ... NPOOL must be a whole divisor of NPROC ! IMPLICIT NONE - INTEGER :: world, ntask_groups, nproc_ortho_in, meta_ionode_id + INTEGER :: world, nproc_ortho_in, meta_ionode_id INTEGER :: root = 0 LOGICAL :: meta_ionode ! @@ -193,12 +186,6 @@ CONTAINS ! CALL init_ortho( nproc_ortho_in ) ! - use_task_groups = ( ntask_groups > 1 ) - IF( use_task_groups ) THEN - nogrp = ntask_groups - CALL init_task_groups( ) - END IF - ! ! RETURN ! @@ -225,7 +212,6 @@ CONTAINS me_pool = mpime me_image = mpime me_bgrp = mpime - me_pgrp = me_pool root_pool = root root_image = root root_bgrp = root @@ -236,10 +222,6 @@ CONTAINS inter_bgrp_comm = group_i intra_bgrp_comm = group_i ortho_comm = group_i - ALLOCATE( nolist( nproc_i ) ) - ALLOCATE( nplist( nproc_i ) ) - nolist = 0 - nplist = 0 ! RETURN ! @@ -251,8 +233,6 @@ CONTAINS ! CALL mp_barrier() CALL mp_end () - IF (ALLOCATED (nolist) ) DEALLOCATE ( nolist ) - IF (ALLOCATED (nplist) ) DEALLOCATE ( nplist ) ! END SUBROUTINE mp_global_end ! @@ -468,87 +448,6 @@ CONTAINS END SUBROUTINE init_ortho ! ! - SUBROUTINE init_task_groups( ) - ! - INTEGER :: i, n1, ipos, color, key, ierr, itsk, ntsk - INTEGER :: pgroup( nproc_pool ) - ! - !SUBDIVIDE THE PROCESSORS IN GROUPS - ! - !THE NUMBER OF GROUPS HAS TO BE A DIVISOR OF THE NUMBER - !OF PROCESSORS - ! - IF( MOD( nproc_pool, nogrp ) /= 0 ) & - CALL errore( " init_task_groups ", "the number of task groups should be a divisor of nproc_pool ", 1 ) - ! - npgrp = nproc_pool / nogrp - - DO i = 1, nproc_pool - pgroup( i ) = i - 1 - ENDDO - ! - !LIST OF PROCESSORS IN MY ORBITAL GROUP - ! - ! processors in these group have contiguous indexes - ! - N1 = ( me_pool / NOGRP ) * NOGRP - 1 - DO i = 1, nogrp - nolist( I ) = pgroup( N1 + I + 1 ) - IF( me_pool == nolist( I ) ) ipos = i - 1 - ENDDO - ! - !LIST OF PROCESSORS IN MY PLANE WAVE GROUP - ! - DO I = 1, npgrp - nplist( I ) = pgroup( ipos + ( i - 1 ) * nogrp + 1 ) - ENDDO - - ! - !SET UP THE GROUPS - ! - ! - !CREATE ORBITAL GROUPS - ! -#if defined __MPI - color = me_pool / nogrp - key = MOD( me_pool , nogrp ) - CALL MPI_COMM_SPLIT( intra_pool_comm, color, key, ogrp_comm, ierr ) - if( ierr /= 0 ) & - CALL errore( ' init_task_groups ', ' creating ogrp_comm ', ABS(ierr) ) - CALL MPI_COMM_RANK( ogrp_comm, itsk, IERR ) - CALL MPI_COMM_SIZE( ogrp_comm, ntsk, IERR ) - IF( nogrp /= ntsk ) CALL errore( ' init_task_groups ', ' ogrp_comm size ', ntsk ) - DO i = 1, nogrp - IF( me_pool == nolist( i ) ) THEN - IF( (i-1) /= itsk ) CALL errore( ' init_task_groups ', ' ogrp_comm rank ', itsk ) - END IF - END DO -#endif - ! - !CREATE PLANEWAVE GROUPS - ! -#if defined __MPI - color = MOD( me_pool , nogrp ) - key = me_pool / nogrp - CALL MPI_COMM_SPLIT( intra_pool_comm, color, key, pgrp_comm, ierr ) - if( ierr /= 0 ) & - CALL errore( ' init_task_groups ', ' creating pgrp_comm ', ABS(ierr) ) - CALL MPI_COMM_RANK( pgrp_comm, itsk, IERR ) - CALL MPI_COMM_SIZE( pgrp_comm, ntsk, IERR ) - IF( npgrp /= ntsk ) CALL errore( ' init_task_groups ', ' pgrp_comm size ', ntsk ) - DO i = 1, npgrp - IF( me_pool == nplist( i ) ) THEN - IF( (i-1) /= itsk ) CALL errore( ' init_task_groups ', ' pgrp_comm rank ', itsk ) - END IF - END DO - me_pgrp = itsk -#endif - - - RETURN - END SUBROUTINE init_task_groups - ! - ! SUBROUTINE init_ortho_group( nproc_try_in, comm_all ) ! IMPLICIT NONE @@ -748,4 +647,11 @@ CONTAINS END SUBROUTINE distribute_over_bgrp ! ! + FUNCTION get_ntask_groups() + IMPLICIT NONE + INTEGER :: get_ntask_groups + get_ntask_groups = ntask_groups + RETURN + END FUNCTION get_ntask_groups + ! END MODULE mp_global diff --git a/Modules/stick_set.f90 b/Modules/stick_set.f90 index c8074aff8..1a1505ba3 100644 --- a/Modules/stick_set.f90 +++ b/Modules/stick_set.f90 @@ -22,10 +22,6 @@ USE io_global, ONLY: ionode, stdout USE fft_types, ONLY: fft_dlay_descriptor, fft_dlay_allocate, & fft_dlay_set, fft_dlay_scalar - USE mp_global, ONLY: me_pool, nproc_pool, intra_pool_comm, nogrp, use_task_groups - USE mp_global, ONLY : me_pool, nproc_pool, intra_pool_comm - USE mp_global, ONLY : NOGRP, NPGRP, ogrp_comm, pgrp_comm - USE mp_global, ONLY : nolist PRIVATE SAVE @@ -37,7 +33,7 @@ !=----------------------------------------------------------------------= SUBROUTINE pstickset( gamma_only, bg, gcut, gkcut, gcuts, & - dfftp, dffts, ngw, ngm, ngs ) + dfftp, dffts, ngw, ngm, ngs, mype, nproc, comm, nogrp_ ) LOGICAL, INTENT(in) :: gamma_only ! ... bg(:,1), bg(:,2), bg(:,3) reciprocal space base vectors. @@ -46,6 +42,10 @@ TYPE(fft_dlay_descriptor), INTENT(inout) :: dfftp, dffts INTEGER, INTENT(out) :: ngw, ngm, ngs + INTEGER, INTENT(IN) :: mype, nproc, comm + INTEGER, INTENT(IN) :: nogrp_ + + LOGICAL :: tk INTEGER :: ub(3), lb(3) @@ -136,8 +136,8 @@ ! ... Fill in the stick maps, for given g-space base and cut-off CALL sticks_maps( tk, ub, lb, bg(:,1), bg(:,2), bg(:,3), & - gcut, gkcut, gcuts, st, stw, sts, me_pool, & - nproc_pool, intra_pool_comm ) + gcut, gkcut, gcuts, st, stw, sts, mype, & + nproc, comm ) ! ... Now count the number of stick nst and nstw @@ -156,14 +156,14 @@ ALLOCATE(ist(nst,5)) - ALLOCATE(nstp(nproc_pool)) - ALLOCATE(sstp(nproc_pool)) + ALLOCATE(nstp(nproc)) + ALLOCATE(sstp(nproc)) - ALLOCATE(nstpw(nproc_pool)) - ALLOCATE(sstpw(nproc_pool)) + ALLOCATE(nstpw(nproc)) + ALLOCATE(sstpw(nproc)) - ALLOCATE(nstps(nproc_pool)) - ALLOCATE(sstps(nproc_pool)) + ALLOCATE(nstps(nproc)) + ALLOCATE(sstps(nproc)) ! ... initialize the sticks indexes array ist @@ -174,7 +174,7 @@ ALLOCATE( idx( nst ) ) - CALL sticks_sort( ist(:,4), ist(:,3), ist(:,5), nst, idx, nproc_pool ) + CALL sticks_sort( ist(:,4), ist(:,3), ist(:,5), nst, idx, nproc ) ! ... Set as first stick the stick containing the G=0 ! @@ -186,26 +186,26 @@ ! idx( iss ) = itmp CALL sticks_dist( tk, ub, lb, idx, ist(:,1), ist(:,2), ist(:,4), ist(:,3), ist(:,5), & - nst, nstp, nstpw, nstps, sstp, sstpw, sstps, st, stw, sts, nproc_pool ) + nst, nstp, nstpw, nstps, sstp, sstpw, sstps, st, stw, sts, nproc ) - ngw = sstpw( me_pool + 1 ) - ngm = sstp( me_pool + 1 ) - ngs = sstps( me_pool + 1 ) + ngw = sstpw( mype + 1 ) + ngm = sstp( mype + 1 ) + ngs = sstps( mype + 1 ) CALL sticks_pairup( tk, ub, lb, idx, ist(:,1), ist(:,2), ist(:,4), ist(:,3), ist(:,5), & - nst, nstp, nstpw, nstps, sstp, sstpw, sstps, st, stw, sts, nproc_pool ) + nst, nstp, nstpw, nstps, sstp, sstpw, sstps, st, stw, sts, nproc ) ! ... Allocate and Set fft data layout descriptors #if defined __PARA - CALL fft_dlay_allocate( dfftp, me_pool, nproc_pool, intra_pool_comm, nr1x, nr2x ) - CALL fft_dlay_allocate( dffts, me_pool, nproc_pool, intra_pool_comm, nr1sx, nr2sx ) + CALL fft_dlay_allocate( dfftp, mype, nproc, comm, nogrp_ , nr1x, nr2x ) + CALL fft_dlay_allocate( dffts, mype, nproc, comm, nogrp_ , nr1sx, nr2sx ) - CALL fft_dlay_set( dfftp, tk, nst, nr1, nr2, nr3, nr1x, nr2x, nr3x, (me_pool+1), & - nproc_pool, intra_pool_comm, nogrp, ub, lb, idx, ist(:,1), ist(:,2), nstp, nstpw, sstp, sstpw, st, stw ) - CALL fft_dlay_set( dffts, tk, nsts, nr1s, nr2s, nr3s, nr1sx, nr2sx, nr3sx, (me_pool+1), & - nproc_pool, intra_pool_comm, nogrp, ub, lb, idx, ist(:,1), ist(:,2), nstps, nstpw, sstps, sstpw, sts, stw ) + CALL fft_dlay_set( dfftp, tk, nst, nr1, nr2, nr3, nr1x, nr2x, nr3x, & + ub, lb, idx, ist(:,1), ist(:,2), nstp, nstpw, sstp, sstpw, st, stw ) + CALL fft_dlay_set( dffts, tk, nsts, nr1s, nr2s, nr3s, nr1sx, nr2sx, nr3sx, & + ub, lb, idx, ist(:,1), ist(:,2), nstps, nstpw, sstps, sstpw, sts, stw ) #else @@ -218,8 +218,8 @@ IF( ngm_ /= ngm ) CALL errore( ' pstickset ', ' inconsistent ngm ', abs( ngm - ngm_ ) ) IF( ngs_ /= ngs ) CALL errore( ' pstickset ', ' inconsistent ngs ', abs( ngs - ngs_ ) ) - CALL fft_dlay_allocate( dfftp, me_pool, nproc_pool, intra_pool_comm, max(nr1x, nr3x), nr2x ) - CALL fft_dlay_allocate( dffts, me_pool, nproc_pool, intra_pool_comm, max(nr1sx, nr3sx), nr2sx ) + CALL fft_dlay_allocate( dfftp, mype, nproc, comm, max(nr1x, nr3x), nr2x ) + CALL fft_dlay_allocate( dffts, mype, nproc, comm, max(nr1sx, nr3sx), nr2sx ) CALL fft_dlay_scalar( dfftp, ub, lb, nr1, nr2, nr3, nr1x, nr2x, nr3x, stw ) CALL fft_dlay_scalar( dffts, ub, lb, nr1s, nr2s, nr3s, nr1sx, nr2sx, nr3sx, stw ) @@ -234,7 +234,7 @@ ! ... Maximum number of sticks (wave func.) nstpwx = maxval( nstpw ) - IF( use_task_groups ) THEN + IF( dffts%have_task_groups ) THEN ! ! Initialize task groups. ! Note that this call modify dffts adding task group data. @@ -247,7 +247,7 @@ 118 FORMAT(3X,' n.st n.stw n.sts n.g n.gw n.gs') WRITE( stdout,121) minval(nstp), minval(nstpw), minval(nstps), minval(sstp), minval(sstpw), minval(sstps) WRITE( stdout,122) maxval(nstp), maxval(nstpw), maxval(nstps), maxval(sstp), maxval(sstpw), maxval(sstps) -! DO ip = 1, nproc_pool +! DO ip = 1, nproc ! IF (ionode) THEN ! WRITE( stdout,120) ip, nstp(ip), nstpw(ip), nstps(ip), sstp(ip), sstpw(ip), sstps(ip) ! END IF @@ -304,20 +304,21 @@ SUBROUTINE task_groups_init( dffts ) INTEGER :: I INTEGER :: IERR INTEGER :: num_planes, num_sticks - INTEGER :: nnrsx_vec ( nproc_pool ) - INTEGER :: pgroup( nproc_pool ) + INTEGER :: nnrsx_vec ( dffts%nproc ) + INTEGER :: pgroup( dffts%nproc ) INTEGER :: strd + CALL task_groups_init_first( dffts ) ! - IF ( nogrp > 1 ) WRITE( stdout, 100 ) nogrp, npgrp + IF ( dffts%nogrp > 1 ) WRITE( stdout, 100 ) dffts%nogrp, dffts%npgrp 100 FORMAT( /,3X,'Task Groups are in USE',/,3X,'groups and procs/group : ',I5,I5 ) !Find maximum chunk of local data concerning coefficients of eigenfunctions in g-space #if defined __MPI - CALL MPI_Allgather( dffts%nnr, 1, MPI_INTEGER, nnrsx_vec, 1, MPI_INTEGER, intra_pool_comm, IERR) - strd = maxval( nnrsx_vec( 1:nproc_pool ) ) + CALL MPI_Allgather( dffts%nnr, 1, MPI_INTEGER, nnrsx_vec, 1, MPI_INTEGER, dffts%comm, IERR) + strd = maxval( nnrsx_vec( 1:dffts%nproc ) ) #else strd = dffts%nnr #endif @@ -334,60 +335,131 @@ SUBROUTINE task_groups_init( dffts ) !we choose to do the latter one. !------------------------------------------------------------------------------------- ! - dffts%nogrp = nogrp - dffts%npgrp = npgrp - dffts%ogrp_comm = ogrp_comm - dffts%pgrp_comm = pgrp_comm ! - ALLOCATE( dffts%tg_nsw(nproc_pool)) - ALLOCATE( dffts%tg_npp(nproc_pool)) - ALLOCATE( dffts%nolist(nogrp)) + ALLOCATE( dffts%tg_nsw(dffts%nproc)) + ALLOCATE( dffts%tg_npp(dffts%nproc)) num_sticks = 0 num_planes = 0 - DO i = 1, nogrp - dffts%nolist( i ) = nolist( i ) - num_sticks = num_sticks + dffts%nsw( nolist(i) + 1 ) - num_planes = num_planes + dffts%npp( nolist(i) + 1 ) + DO i = 1, dffts%nogrp + num_sticks = num_sticks + dffts%nsw( dffts%nolist(i) + 1 ) + num_planes = num_planes + dffts%npp( dffts%nolist(i) + 1 ) ENDDO #if defined __MPI - CALL MPI_ALLGATHER(num_sticks, 1, MPI_INTEGER, dffts%tg_nsw(1), 1, MPI_INTEGER, intra_pool_comm, IERR) - CALL MPI_ALLGATHER(num_planes, 1, MPI_INTEGER, dffts%tg_npp(1), 1, MPI_INTEGER, intra_pool_comm, IERR) + CALL MPI_ALLGATHER(num_sticks, 1, MPI_INTEGER, dffts%tg_nsw(1), 1, MPI_INTEGER, dffts%comm, IERR) + CALL MPI_ALLGATHER(num_planes, 1, MPI_INTEGER, dffts%tg_npp(1), 1, MPI_INTEGER, dffts%comm, IERR) #else dffts%tg_nsw(1) = num_sticks dffts%tg_npp(1) = num_planes #endif - ALLOCATE( dffts%tg_snd( nogrp ) ) - ALLOCATE( dffts%tg_rcv( nogrp ) ) - ALLOCATE( dffts%tg_psdsp( nogrp ) ) - ALLOCATE( dffts%tg_usdsp( nogrp ) ) - ALLOCATE( dffts%tg_rdsp( nogrp ) ) + ALLOCATE( dffts%tg_snd( dffts%nogrp ) ) + ALLOCATE( dffts%tg_rcv( dffts%nogrp ) ) + ALLOCATE( dffts%tg_psdsp( dffts%nogrp ) ) + ALLOCATE( dffts%tg_usdsp( dffts%nogrp ) ) + ALLOCATE( dffts%tg_rdsp( dffts%nogrp ) ) - dffts%tg_snd(1) = dffts%nr3x * dffts%nsw( me_pool + 1 ) - IF( dffts%nr3x * dffts%nsw( me_pool + 1 ) > dffts%tg_nnr ) THEN + dffts%tg_snd(1) = dffts%nr3x * dffts%nsw( dffts%mype + 1 ) + IF( dffts%nr3x * dffts%nsw( dffts%mype + 1 ) > dffts%tg_nnr ) THEN CALL errore( ' task_groups_init ', ' inconsistent dffts%tg_nnr ', 1 ) ENDIF dffts%tg_psdsp(1) = 0 dffts%tg_usdsp(1) = 0 - dffts%tg_rcv(1) = dffts%nr3x * dffts%nsw( nolist(1) + 1 ) + dffts%tg_rcv(1) = dffts%nr3x * dffts%nsw( dffts%nolist(1) + 1 ) dffts%tg_rdsp(1) = 0 - DO i = 2, nogrp - dffts%tg_snd(i) = dffts%nr3x * dffts%nsw( me_pool + 1 ) + DO i = 2, dffts%nogrp + dffts%tg_snd(i) = dffts%nr3x * dffts%nsw( dffts%mype + 1 ) dffts%tg_psdsp(i) = dffts%tg_psdsp(i-1) + dffts%tg_nnr dffts%tg_usdsp(i) = dffts%tg_usdsp(i-1) + dffts%tg_snd(i-1) - dffts%tg_rcv(i) = dffts%nr3x * dffts%nsw( nolist(i) + 1 ) + dffts%tg_rcv(i) = dffts%nr3x * dffts%nsw( dffts%nolist(i) + 1 ) dffts%tg_rdsp(i) = dffts%tg_rdsp(i-1) + dffts%tg_rcv(i-1) ENDDO - dffts%have_task_groups = .true. - RETURN END SUBROUTINE task_groups_init + ! +SUBROUTINE task_groups_init_first( dffts ) + + USE parallel_include + ! + USE fft_types, ONLY : fft_dlay_descriptor + ! + IMPLICIT NONE + ! + TYPE(fft_dlay_descriptor), INTENT(inout) :: dffts + ! + INTEGER :: i, n1, ipos, color, key, ierr, itsk, ntsk + INTEGER :: pgroup( dffts%nproc ) + ! + !SUBDIVIDE THE PROCESSORS IN GROUPS + ! + DO i = 1, dffts%nproc + pgroup( i ) = i - 1 + ENDDO + ! + !LIST OF PROCESSORS IN MY ORBITAL GROUP + ! + ! processors in these group have contiguous indexes + ! + n1 = ( dffts%mype / dffts%nogrp ) * dffts%nogrp - 1 + DO i = 1, dffts%nogrp + dffts%nolist( i ) = pgroup( n1 + i + 1 ) + IF( dffts%mype == dffts%nolist( i ) ) ipos = i - 1 + ENDDO + ! + !LIST OF PROCESSORS IN MY PLANE WAVE GROUP + ! + DO I = 1, dffts%npgrp + dffts%nplist( i ) = pgroup( ipos + ( i - 1 ) * dffts%nogrp + 1 ) + ENDDO + ! + !SET UP THE GROUPS + ! + ! + !CREATE ORBITAL GROUPS + ! +#if defined __MPI + color = dffts%mype / dffts%nogrp + key = MOD( dffts%mype , dffts%nogrp ) + CALL MPI_COMM_SPLIT( dffts%comm, color, key, dffts%ogrp_comm, ierr ) + if( ierr /= 0 ) & + CALL errore( ' init_task_groups ', ' creating ogrp_comm ', ABS(ierr) ) + CALL MPI_COMM_RANK( dffts%ogrp_comm, itsk, IERR ) + CALL MPI_COMM_SIZE( dffts%ogrp_comm, ntsk, IERR ) + IF( dffts%nogrp /= ntsk ) CALL errore( ' init_task_groups ', ' ogrp_comm size ', ntsk ) + DO i = 1, dffts%nogrp + IF( dffts%mype == dffts%nolist( i ) ) THEN + IF( (i-1) /= itsk ) CALL errore( ' init_task_groups ', ' ogrp_comm rank ', itsk ) + END IF + END DO +#endif + ! + !CREATE PLANEWAVE GROUPS + ! +#if defined __MPI + color = MOD( dffts%mype , dffts%nogrp ) + key = dffts%mype / dffts%nogrp + CALL MPI_COMM_SPLIT( dffts%comm, color, key, dffts%pgrp_comm, ierr ) + if( ierr /= 0 ) & + CALL errore( ' init_task_groups ', ' creating pgrp_comm ', ABS(ierr) ) + CALL MPI_COMM_RANK( dffts%pgrp_comm, itsk, IERR ) + CALL MPI_COMM_SIZE( dffts%pgrp_comm, ntsk, IERR ) + IF( dffts%npgrp /= ntsk ) CALL errore( ' init_task_groups ', ' pgrp_comm size ', ntsk ) + DO i = 1, dffts%npgrp + IF( dffts%mype == dffts%nplist( i ) ) THEN + IF( (i-1) /= itsk ) CALL errore( ' init_task_groups ', ' pgrp_comm rank ', itsk ) + END IF + END DO + dffts%me_pgrp = itsk +#endif + + RETURN + END SUBROUTINE task_groups_init_first + ! !=----------------------------------------------------------------------= END MODULE stick_set !=----------------------------------------------------------------------= diff --git a/PW/data_structure.f90 b/PW/data_structure.f90 index e65cf5c5f..13f2e8254 100644 --- a/PW/data_structure.f90 +++ b/PW/data_structure.f90 @@ -16,7 +16,8 @@ SUBROUTINE data_structure( gamma_only ) USE kinds, ONLY : DP USE io_global, ONLY : stdout USE mp, ONLY : mp_max - USE mp_global, ONLY : inter_pool_comm, intra_pool_comm + USE mp_global, ONLY : me_pool, nproc_pool, inter_pool_comm, intra_pool_comm + USE mp_global, ONLY : get_ntask_groups USE fft_base, ONLY : dfftp, dffts USE cell_base, ONLY : bg, tpiba USE klist, ONLY : xk, nks @@ -29,7 +30,7 @@ SUBROUTINE data_structure( gamma_only ) IMPLICIT NONE LOGICAL, INTENT(in) :: gamma_only REAL (DP) :: gkcut - INTEGER :: ik, ngm_, ngs_, ngw_ + INTEGER :: ik, ngm_, ngs_, ngw_ , nogrp ! ! ... calculate gkcut = max |k+G|^2, in (2pi/a)^2 units ! @@ -56,8 +57,11 @@ SUBROUTINE data_structure( gamma_only ) ! ! ... set up fft descriptors, including parallel stuff: sticks, planes, etc. ! + nogrp = get_ntask_groups() + ! CALL pstickset( gamma_only, bg, gcutm, gkcut, gcutms, & - dfftp, dffts, ngw_ , ngm_ , ngs_ ) + dfftp, dffts, ngw_ , ngm_ , ngs_ , me_pool, nproc_pool, intra_pool_comm, & + nogrp ) ! ! on output, ngm_ and ngs_ contain the local number of G-vectors ! for the two grids. Initialize local and global number of G-vectors diff --git a/PW/h_psi.f90 b/PW/h_psi.f90 index c55980583..9387dd45e 100644 --- a/PW/h_psi.f90 +++ b/PW/h_psi.f90 @@ -36,7 +36,7 @@ SUBROUTINE h_psi( lda, n, m, psi, hpsi ) USE realus, ONLY : real_space, fft_orbital_gamma, initialisation_level, & bfft_orbital_gamma, calbec_rs_gamma, & add_vuspsir_gamma, v_loc_psir - USE mp_global,ONLY : nogrp, use_task_groups + USE fft_base, ONLY : dffts #ifdef EXX USE exx, ONLY : vexx USE funct, ONLY : exx_is_active @@ -74,8 +74,8 @@ SUBROUTINE h_psi( lda, n, m, psi, hpsi ) CALL start_clock( 'h_psi:vloc' ) IF ( gamma_only ) THEN ! - IF (( use_task_groups ) .AND. ( m >= nogrp )) then - incr = 2 * nogrp + IF ( dffts%have_task_groups .AND. ( m >= dffts%nogrp )) then + incr = 2 * dffts%nogrp else incr = 2 endif diff --git a/PW/make.depend b/PW/make.depend index acd894db0..0f9c82314 100644 --- a/PW/make.depend +++ b/PW/make.depend @@ -614,9 +614,9 @@ h_epsi_her_set.o : buffers.o h_epsi_her_set.o : pwcom.o h_epsi_her_set.o : scf_mod.o h_psi.o : ../Modules/control_flags.o +h_psi.o : ../Modules/fft_base.o h_psi.o : ../Modules/funct.o h_psi.o : ../Modules/kind.o -h_psi.o : ../Modules/mp_global.o h_psi.o : ../Modules/recvec.o h_psi.o : ../Modules/uspp.o h_psi.o : becmod.o diff --git a/PW/print_clock_pw.f90 b/PW/print_clock_pw.f90 index a11f71057..5dacba4ea 100644 --- a/PW/print_clock_pw.f90 +++ b/PW/print_clock_pw.f90 @@ -14,7 +14,7 @@ SUBROUTINE print_clock_pw() ! USE io_global, ONLY : stdout USE control_flags, ONLY : isolve, iverbosity, gamma_only - USE mp_global, ONLY : mpime, root, use_task_groups + USE mp_global, ONLY : mpime, root USE paw_variables, ONLY : okpaw USE realus, ONLY : real_space ! @@ -161,7 +161,7 @@ SUBROUTINE print_clock_pw() ! CALL print_clock( 'reduce' ) CALL print_clock( 'fft_scatter' ) - IF( use_task_groups ) CALL print_clock( 'ALLTOALL' ) + CALL print_clock( 'ALLTOALL' ) #endif ! #ifdef EXX diff --git a/PW/realus.f90 b/PW/realus.f90 index eab3baef5..97ed39cdf 100644 --- a/PW/realus.f90 +++ b/PW/realus.f90 @@ -115,7 +115,6 @@ MODULE realus USE cell_base, ONLY : tpiba2 USE control_flags, ONLY : tqr USE fft_base, ONLY : dffts - USE mp_global, ONLY : nogrp, use_task_groups USE wavefunctions_module, ONLY : psic USE io_global, ONLY : stdout @@ -133,13 +132,13 @@ MODULE realus ALLOCATE(npw_k(nks)) !allocate (psic_temp(size(psic))) !real space, allocation for task group fft work arrays - IF( use_task_groups ) THEN + IF( dffts%have_task_groups ) THEN ! IF (allocated( tg_psic ) ) DEALLOCATE( tg_psic ) ! - ALLOCATE( tg_psic( dffts%tg_nnr * nogrp ) ) - !ALLOCATE( tg_psic_temp( dffts%tg_nnr * nogrp ) ) - ALLOCATE( tg_vrs( dffts%tg_nnr * nogrp ) ) + ALLOCATE( tg_psic( dffts%tg_nnr * dffts%nogrp ) ) + !ALLOCATE( tg_psic_temp( dffts%tg_nnr * dffts%nogrp ) ) + ALLOCATE( tg_vrs( dffts%tg_nnr * dffts%nogrp ) ) ! ENDIF @@ -1556,9 +1555,8 @@ MODULE realus USE ions_base, ONLY : nat, ntyp => nsp, ityp USE smooth_grid_dimensions,ONLY : nr1s, nr2s, nr3s USE uspp_param, ONLY : nh, nhm - USE fft_base, ONLY : tg_gather - USE mp_global, ONLY : nogrp, ogrp_comm, me_pool, nolist, & - use_task_groups,intra_pool_comm + USE fft_base, ONLY : tg_gather, dffts + USE mp_global, ONLY : me_pool, intra_pool_comm USE mp, ONLY : mp_sum ! IMPLICIT NONE @@ -1577,7 +1575,7 @@ MODULE realus ! CALL start_clock( 'calbec_rs' ) ! - IF( ( use_task_groups ) .and. ( m >= nogrp ) ) THEN + IF( ( dffts%have_task_groups ) .and. ( m >= dffts%nogrp ) ) THEN CALL errore( 'calbec_rs_gamma', 'task_groups not implemented', 1 ) @@ -1666,9 +1664,8 @@ MODULE realus USE smooth_grid_dimensions,ONLY : nr1s, nr2s, nr3s USE uspp_param, ONLY : nh, nhm USE becmod, ONLY : bec_type, becp - USE fft_base, ONLY : tg_gather - USE mp_global, ONLY : nogrp, ogrp_comm, me_pool, nolist, & - use_task_groups + USE fft_base, ONLY : tg_gather, dffts + USE mp_global, ONLY : me_pool ! IMPLICIT NONE ! @@ -1685,7 +1682,7 @@ MODULE realus ! CALL start_clock( 'calbec_rs' ) ! - IF( ( use_task_groups ) .and. ( m >= nogrp ) ) THEN + IF( ( dffts%have_task_groups ) .and. ( m >= dffts%nogrp ) ) THEN CALL errore( 'calbec_rs_k', 'task_groups not implemented', 1 ) @@ -1752,9 +1749,8 @@ MODULE realus USE lsda_mod, ONLY : current_spin USE uspp, ONLY : qq USE becmod, ONLY : bec_type, becp - USE fft_base, ONLY : tg_gather - USE mp_global, ONLY : nogrp, ogrp_comm, me_pool, nolist, & - use_task_groups + USE fft_base, ONLY : tg_gather, dffts + USE mp_global, ONLY : me_pool ! IMPLICIT NONE ! @@ -1770,7 +1766,7 @@ MODULE realus CALL start_clock( 's_psir' ) - IF( ( use_task_groups ) .and. ( m >= nogrp ) ) THEN + IF( ( dffts%have_task_groups ) .and. ( m >= dffts%nogrp ) ) THEN CALL errore( 's_psir_gamma', 'task_groups not implemented', 1 ) @@ -1853,9 +1849,8 @@ MODULE realus USE lsda_mod, ONLY : current_spin USE uspp, ONLY : qq USE becmod, ONLY : bec_type, becp - USE fft_base, ONLY : tg_gather - USE mp_global, ONLY : nogrp, ogrp_comm, me_pool, nolist, & - use_task_groups + USE fft_base, ONLY : tg_gather, dffts + USE mp_global, ONLY : me_pool ! IMPLICIT NONE ! @@ -1872,7 +1867,7 @@ MODULE realus CALL start_clock( 's_psir' ) - IF( ( use_task_groups ) .and. ( m >= nogrp ) ) THEN + IF( ( dffts%have_task_groups ) .and. ( m >= dffts%nogrp ) ) THEN CALL errore( 's_psir_k', 'task_groups not implemented', 1 ) @@ -1956,9 +1951,8 @@ MODULE realus USE lsda_mod, ONLY : current_spin USE uspp, ONLY : deeq USE becmod, ONLY : bec_type, becp - USE fft_base, ONLY : tg_gather - USE mp_global, ONLY : nogrp, ogrp_comm, me_pool, nolist, & - use_task_groups + USE fft_base, ONLY : tg_gather, dffts + USE mp_global, ONLY : me_pool ! IMPLICIT NONE ! @@ -1972,7 +1966,7 @@ MODULE realus ! CALL start_clock( 'add_vuspsir' ) - IF( ( use_task_groups ) .and. ( m >= nogrp ) ) THEN + IF( ( dffts%have_task_groups ) .and. ( m >= dffts%nogrp ) ) THEN CALL errore( 'add_vuspsir_gamma', 'task_groups not implemented', 1 ) @@ -2060,9 +2054,8 @@ MODULE realus USE lsda_mod, ONLY : current_spin USE uspp, ONLY : deeq USE becmod, ONLY : bec_type, becp - USE fft_base, ONLY : tg_gather - USE mp_global, ONLY : nogrp, ogrp_comm, me_pool, nolist, & - use_task_groups + USE fft_base, ONLY : tg_gather, dffts + USE mp_global, ONLY : me_pool ! IMPLICIT NONE ! @@ -2078,7 +2071,7 @@ MODULE realus ! CALL start_clock( 'add_vuspsir' ) - IF( ( use_task_groups ) .and. ( m >= nogrp ) ) THEN + IF( ( dffts%have_task_groups ) .and. ( m >= dffts%nogrp ) ) THEN CALL errore( 'add_vuspsir_k', 'task_groups not implemented', 1 ) @@ -2157,8 +2150,7 @@ MODULE realus USE kinds, ONLY : DP USE fft_base, ONLY : dffts, tg_gather USE fft_interfaces,ONLY : invfft - USE mp_global, ONLY : nogrp, ogrp_comm, me_pool, nolist, & - use_task_groups + USE mp_global, ONLY : me_pool IMPLICIT NONE @@ -2180,7 +2172,7 @@ MODULE realus !Task groups !COMPLEX(DP), ALLOCATABLE :: tg_psic(:) - INTEGER :: recv_cnt( nogrp ), recv_displ( nogrp ) + INTEGER :: recv_cnt( dffts%nogrp ), recv_displ( dffts%nogrp ) INTEGER :: v_siz !The new task group version based on vloc_psi @@ -2190,16 +2182,16 @@ MODULE realus ! The following is dirty trick to prevent usage of task groups if ! the number of bands is smaller than the number of task groups ! - use_tg = use_task_groups - use_task_groups = ( use_task_groups ) .and. ( nbnd >= nogrp ) + use_tg = dffts%have_task_groups + dffts%have_task_groups = ( dffts%have_task_groups ) .and. ( nbnd >= dffts%nogrp ) - IF( use_task_groups ) THEN + IF( dffts%have_task_groups ) THEN ! tg_psic = (0.d0, 0.d0) ioff = 0 ! - DO idx = 1, 2*nogrp, 2 + DO idx = 1, 2*dffts%nogrp, 2 IF( idx + ibnd - 1 < nbnd ) THEN DO j = 1, npw_k(1) @@ -2225,7 +2217,7 @@ MODULE realus ! IF (present(conserved)) THEN IF (conserved .eqv. .true.) THEN - IF (.not. allocated(tg_psic_temp)) ALLOCATE( tg_psic_temp( dffts%tg_nnr * nogrp ) ) + IF (.not. allocated(tg_psic_temp)) ALLOCATE( tg_psic_temp( dffts%tg_nnr * dffts%nogrp ) ) tg_psic_temp=tg_psic ENDIF ENDIF @@ -2275,7 +2267,7 @@ MODULE realus ENDIF - use_task_groups = use_tg + dffts%have_task_groups = use_tg !if (.not. allocated(psic)) CALL errore( 'fft_orbital_gamma', 'psic not allocated', 2 ) ! OLD VERSION ! Based on an algorithm found somewhere in the TDDFT codes, generalised to k points @@ -2322,8 +2314,7 @@ MODULE realus USE kinds, ONLY : DP USE fft_base, ONLY : dffts, tg_gather USE fft_interfaces,ONLY : fwfft - USE mp_global, ONLY : nogrp, ogrp_comm, me_pool, nolist, & - use_task_groups + USE mp_global, ONLY : me_pool IMPLICIT NONE @@ -2342,20 +2333,20 @@ MODULE realus LOGICAL :: use_tg !Task groups - INTEGER :: recv_cnt( nogrp ), recv_displ( nogrp ) + INTEGER :: recv_cnt( dffts%nogrp ), recv_displ( dffts%nogrp ) INTEGER :: v_siz !print *, "->fourier space" CALL start_clock( 'bfft_orbital' ) !New task_groups versions - use_tg = use_task_groups - use_task_groups = ( use_task_groups ) .and. ( nbnd >= nogrp ) - IF( use_task_groups ) THEN + use_tg = dffts%have_task_groups + dffts%have_task_groups = ( dffts%have_task_groups ) .and. ( nbnd >= dffts%nogrp ) + IF( dffts%have_task_groups ) THEN ! CALL fwfft ('Wave', tg_psic, dffts ) ! ioff = 0 ! - DO idx = 1, 2*nogrp, 2 + DO idx = 1, 2*dffts%nogrp, 2 ! IF( idx + ibnd - 1 < nbnd ) THEN DO j = 1, npw_k(1) @@ -2407,7 +2398,7 @@ MODULE realus ENDIF ENDIF ENDIF - use_task_groups = use_tg + dffts%have_task_groups = use_tg !! OLD VERSION Based on the algorithm found in lr_apply_liovillian !!print * ,"a" !CALL fwfft ('Wave', psic, dffts) @@ -2460,8 +2451,7 @@ MODULE realus USE kinds, ONLY : DP USE fft_base, ONLY : dffts USE fft_interfaces,ONLY : invfft - USE mp_global, ONLY : nogrp, ogrp_comm, me_pool, nolist, & - use_task_groups + USE mp_global, ONLY : me_pool USE wvfct, ONLY : igk IMPLICIT NONE @@ -2477,15 +2467,15 @@ MODULE realus LOGICAL :: use_tg CALL start_clock( 'fft_orbital' ) - use_tg = use_task_groups - use_task_groups = ( use_task_groups ) .and. ( nbnd >= nogrp ) + use_tg = dffts%have_task_groups + dffts%have_task_groups = ( dffts%have_task_groups ) .and. ( nbnd >= dffts%nogrp ) - IF( use_task_groups ) THEN + IF( dffts%have_task_groups ) THEN ! tg_psic = ( 0.D0, 0.D0 ) ioff = 0 ! - DO idx = 1, nogrp + DO idx = 1, dffts%nogrp ! IF( idx + ibnd - 1 <= nbnd ) THEN !DO j = 1, size(orbital,1) @@ -2500,7 +2490,7 @@ MODULE realus CALL invfft ('Wave', tg_psic, dffts) IF (present(conserved)) THEN IF (conserved .eqv. .true.) THEN - IF (.not. allocated(tg_psic_temp)) ALLOCATE( tg_psic_temp( dffts%tg_nnr * nogrp ) ) + IF (.not. allocated(tg_psic_temp)) ALLOCATE( tg_psic_temp( dffts%tg_nnr * dffts%nogrp ) ) tg_psic_temp=tg_psic ENDIF ENDIF @@ -2520,7 +2510,7 @@ MODULE realus ENDIF ! ENDIF - use_task_groups = use_tg + dffts%have_task_groups = use_tg CALL stop_clock( 'fft_orbital' ) END SUBROUTINE fft_orbital_k !-------------------------------------------------------------------------- @@ -2538,8 +2528,7 @@ MODULE realus USE kinds, ONLY : DP USE fft_base, ONLY : dffts USE fft_interfaces,ONLY : fwfft - USE mp_global, ONLY : nogrp, ogrp_comm, me_pool, nolist, & - use_task_groups + USE mp_global, ONLY : me_pool USE wvfct, ONLY : igk IMPLICIT NONE @@ -2555,16 +2544,16 @@ MODULE realus LOGICAL :: use_tg CALL start_clock( 'bfft_orbital' ) - use_tg = use_task_groups - use_task_groups = ( use_task_groups ) .and. ( nbnd >= nogrp ) + use_tg = dffts%have_task_groups + dffts%have_task_groups = ( dffts%have_task_groups ) .and. ( nbnd >= dffts%nogrp ) - IF( use_task_groups ) THEN + IF( dffts%have_task_groups ) THEN ! CALL fwfft ('Wave', tg_psic, dffts) ! ioff = 0 ! - DO idx = 1, nogrp + DO idx = 1, dffts%nogrp ! IF( idx + ibnd - 1 <= nbnd ) THEN orbital (:, ibnd+idx-1) = tg_psic( nls(igk(:)) + ioff ) @@ -2591,7 +2580,7 @@ MODULE realus ENDIF ENDIF ENDIF - use_task_groups = use_tg + dffts%have_task_groups = use_tg CALL stop_clock( 'bfft_orbital' ) END SUBROUTINE bfft_orbital_k !-------------------------------------------------------------------------- @@ -2605,8 +2594,7 @@ MODULE realus USE gvecs, ONLY : nls,nlsm,doublegrid USE kinds, ONLY : DP USE fft_base, ONLY : dffts, tg_gather - USE mp_global, ONLY : nogrp, ogrp_comm, me_pool, nolist, & - use_task_groups + USE mp_global, ONLY : me_pool USE scf, ONLY : vrs USE lsda_mod, ONLY : current_spin @@ -2623,11 +2611,11 @@ MODULE realus !Task groups REAL(DP), ALLOCATABLE :: tg_v(:) - INTEGER :: recv_cnt( nogrp ), recv_displ( nogrp ) + INTEGER :: recv_cnt( dffts%nogrp ), recv_displ( dffts%nogrp ) INTEGER :: v_siz CALL start_clock( 'v_loc_psir' ) - IF( use_task_groups .and. nbnd >= nogrp ) THEN + IF( dffts%have_task_groups .and. nbnd >= dffts%nogrp ) THEN IF (ibnd == 1 ) THEN CALL tg_gather( dffts, vrs(:,current_spin), tg_v ) !if ibnd==1 this is a new calculation, and tg_v should be distributed. ENDIF diff --git a/PW/sum_band.f90 b/PW/sum_band.f90 index 12a927d86..44a9b3b62 100644 --- a/PW/sum_band.f90 +++ b/PW/sum_band.f90 @@ -228,8 +228,7 @@ SUBROUTINE sum_band() ! ... gamma version ! USE becmod, ONLY : bec_type, becp, calbec - USE mp_global, ONLY : nogrp, nolist, ogrp_comm, me_pool, & - use_task_groups + USE mp_global, ONLY : me_pool USE mp, ONLY : mp_sum ! IMPLICIT NONE @@ -249,28 +248,28 @@ SUBROUTINE sum_band() ! IF ( nks > 1 ) REWIND( iunigk ) ! - use_tg = use_task_groups - use_task_groups = ( use_task_groups ) .AND. ( nbnd >= nogrp ) + use_tg = dffts%have_task_groups + dffts%have_task_groups = ( dffts%have_task_groups ) .AND. ( nbnd >= dffts%nogrp ) ! incr = 2 ! - IF( use_task_groups ) THEN + IF( dffts%have_task_groups ) THEN ! IF( dft_is_meta() ) & CALL errore( ' sum_band ', ' task groups with meta dft, not yet implemented ', 1 ) ! - v_siz = dffts%tg_nnr * nogrp + v_siz = dffts%tg_nnr * dffts%nogrp ! ALLOCATE( tg_psi( v_siz ) ) ALLOCATE( tg_rho( v_siz ) ) ! - incr = 2 * nogrp + incr = 2 * dffts%nogrp ! END IF ! k_loop: DO ik = 1, nks ! - IF( use_task_groups ) tg_rho = 0.0_DP + IF( dffts%have_task_groups ) tg_rho = 0.0_DP IF ( lsda ) current_spin = isk(ik) ! npw = ngk(ik) @@ -299,14 +298,14 @@ SUBROUTINE sum_band() ! DO ibnd = 1, nbnd, incr ! - IF( use_task_groups ) THEN + IF( dffts%have_task_groups ) THEN ! tg_psi(:) = ( 0.D0, 0.D0 ) ioff = 0 ! - DO idx = 1, 2*nogrp, 2 + DO idx = 1, 2*dffts%nogrp, 2 ! - ! ... 2*nogrp ffts at the same time + ! ... 2*dffts%nogrp ffts at the same time ! IF( idx + ibnd - 1 < nbnd ) THEN DO j = 1, npw @@ -327,14 +326,14 @@ SUBROUTINE sum_band() CALL invfft ('Wave', tg_psi, dffts) ! ! Now the first proc of the group holds the first two bands - ! of the 2*nogrp bands that we are processing at the same time, + ! of the 2*dffts%nogrp bands that we are processing at the same time, ! the second proc. holds the third and fourth band ! and so on ! ! Compute the proper factor for each band ! - DO idx = 1, nogrp - IF( nolist( idx ) == me_pool ) EXIT + DO idx = 1, dffts%nogrp + IF( dffts%nolist( idx ) == me_pool ) EXIT END DO ! ! Remember two bands are packed in a single array : @@ -437,16 +436,16 @@ SUBROUTINE sum_band() ! END DO ! - IF( use_task_groups ) THEN + IF( dffts%have_task_groups ) THEN ! ! reduce the group charge ! - CALL mp_sum( tg_rho, gid = ogrp_comm ) + CALL mp_sum( tg_rho, gid = dffts%ogrp_comm ) ! ioff = 0 - DO idx = 1, nogrp - IF( me_pool == nolist( idx ) ) EXIT - ioff = ioff + dffts%nr1x * dffts%nr2x * dffts%npp( nolist( idx ) + 1 ) + DO idx = 1, dffts%nogrp + IF( me_pool == dffts%nolist( idx ) ) EXIT + ioff = ioff + dffts%nr1x * dffts%nr2x * dffts%npp( dffts%nolist( idx ) + 1 ) END DO ! ! copy the charge back to the processor location @@ -538,11 +537,11 @@ SUBROUTINE sum_band() ! END DO k_loop ! - IF( use_task_groups ) THEN + IF( dffts%have_task_groups ) THEN DEALLOCATE( tg_psi ) DEALLOCATE( tg_rho ) END IF - use_task_groups = use_tg + dffts%have_task_groups = use_tg ! RETURN ! @@ -556,8 +555,7 @@ SUBROUTINE sum_band() ! ... k-points version ! USE becmod, ONLY : bec_type, becp, calbec - USE mp_global, ONLY : nogrp, nolist, ogrp_comm, me_pool, & - use_task_groups + USE mp_global, ONLY : me_pool USE mp, ONLY : mp_sum ! IMPLICIT NONE @@ -588,26 +586,26 @@ SUBROUTINE sum_band() ! IF ( nks > 1 ) REWIND( iunigk ) ! - use_tg = use_task_groups - use_task_groups = ( use_task_groups ) .AND. ( nbnd >= nogrp ) & + use_tg = dffts%have_task_groups + dffts%have_task_groups = ( dffts%have_task_groups ) .AND. ( nbnd >= dffts%nogrp ) & .AND. ( .NOT. noncolin ) .AND. ( .NOT. dft_is_meta() ) ! incr = 1 ! - IF( use_task_groups ) THEN + IF( dffts%have_task_groups ) THEN ! - v_siz = dffts%tg_nnr * nogrp + v_siz = dffts%tg_nnr * dffts%nogrp ! ALLOCATE( tg_psi( v_siz ) ) ALLOCATE( tg_rho( v_siz ) ) ! - incr = nogrp + incr = dffts%nogrp ! END IF ! k_loop: DO ik = 1, nks ! - IF( use_task_groups ) tg_rho = 0.0_DP + IF( dffts%have_task_groups ) tg_rho = 0.0_DP IF ( lsda ) current_spin = isk(ik) npw = ngk (ik) @@ -626,8 +624,8 @@ SUBROUTINE sum_band() ! DO ibnd = 1, nbnd, incr ! - IF( use_task_groups ) THEN - DO idx = 1, nogrp + IF( dffts%have_task_groups ) THEN + DO idx = 1, dffts%nogrp IF( idx + ibnd - 1 <= nbnd ) eband = eband + et( idx + ibnd - 1, ik ) * wg( idx + ibnd - 1, ik ) END DO ELSE @@ -665,7 +663,7 @@ SUBROUTINE sum_band() ! ELSE ! - IF( use_task_groups ) THEN + IF( dffts%have_task_groups ) THEN ! !$omp parallel default(shared), private(j,ioff,idx) !$omp do @@ -676,9 +674,9 @@ SUBROUTINE sum_band() ! ioff = 0 ! - DO idx = 1, nogrp + DO idx = 1, dffts%nogrp ! - ! ... nogrp ffts at the same time + ! ... dffts%nogrp ffts at the same time ! IF( idx + ibnd - 1 <= nbnd ) THEN !$omp do @@ -696,13 +694,13 @@ SUBROUTINE sum_band() CALL invfft ('Wave', tg_psi, dffts) ! ! Now the first proc of the group holds the first band - ! of the nogrp bands that we are processing at the same time, + ! of the dffts%nogrp bands that we are processing at the same time, ! the second proc. holds the second and so on ! ! Compute the proper factor for each band ! - DO idx = 1, nogrp - IF( nolist( idx ) == me_pool ) EXIT + DO idx = 1, dffts%nogrp + IF( dffts%nolist( idx ) == me_pool ) EXIT END DO ! ! Remember @@ -752,16 +750,16 @@ SUBROUTINE sum_band() ! END DO ! - IF( use_task_groups ) THEN + IF( dffts%have_task_groups ) THEN ! ! reduce the group charge ! - CALL mp_sum( tg_rho, gid = ogrp_comm ) + CALL mp_sum( tg_rho, gid = dffts%ogrp_comm ) ! ioff = 0 - DO idx = 1, nogrp - IF( me_pool == nolist( idx ) ) EXIT - ioff = ioff + dffts%nr1x * dffts%nr2x * dffts%npp( nolist( idx ) + 1 ) + DO idx = 1, dffts%nogrp + IF( me_pool == dffts%nolist( idx ) ) EXIT + ioff = ioff + dffts%nr1x * dffts%nr2x * dffts%npp( dffts%nolist( idx ) + 1 ) END DO ! ! copy the charge back to the proper processor location @@ -902,11 +900,11 @@ SUBROUTINE sum_band() ! END DO k_loop - IF( use_task_groups ) THEN + IF( dffts%have_task_groups ) THEN DEALLOCATE( tg_psi ) DEALLOCATE( tg_rho ) END IF - use_task_groups = use_tg + dffts%have_task_groups = use_tg IF (noncolin.and.okvan) THEN DO np = 1, ntyp diff --git a/PW/vloc_psi.f90 b/PW/vloc_psi.f90 index 06d6e1a15..f76a6f8ec 100644 --- a/PW/vloc_psi.f90 +++ b/PW/vloc_psi.f90 @@ -16,7 +16,7 @@ SUBROUTINE vloc_psi_gamma(lda, n, m, psi, v, hpsi) USE kinds, ONLY : DP USE gvecs, ONLY : nls, nlsm USE wvfct, ONLY : igk - USE mp_global, ONLY : nogrp, ogrp_comm, me_pool, nolist, use_task_groups + USE mp_global, ONLY : me_pool USE fft_base, ONLY : dffts, tg_gather USE fft_interfaces,ONLY : fwfft, invfft USE wavefunctions_module, ONLY: psic @@ -43,19 +43,19 @@ SUBROUTINE vloc_psi_gamma(lda, n, m, psi, v, hpsi) ! The following is dirty trick to prevent usage of task groups if ! the number of bands is smaller than the number of task groups ! - use_tg = use_task_groups - use_task_groups = use_task_groups .and. ( m >= nogrp ) + use_tg = dffts%have_task_groups + dffts%have_task_groups = dffts%have_task_groups .and. ( m >= dffts%nogrp ) ! - IF( use_task_groups ) THEN + IF( dffts%have_task_groups ) THEN ! - v_siz = dffts%tg_nnr * nogrp + v_siz = dffts%tg_nnr * dffts%nogrp ! ALLOCATE( tg_v ( v_siz ) ) ALLOCATE( tg_psic( v_siz ) ) ! CALL tg_gather( dffts, v, tg_v ) ! - incr = 2 * nogrp + incr = 2 * dffts%nogrp ! ENDIF ! @@ -63,12 +63,12 @@ SUBROUTINE vloc_psi_gamma(lda, n, m, psi, v, hpsi) ! DO ibnd = 1, m, incr ! - IF( use_task_groups ) THEN + IF( dffts%have_task_groups ) THEN ! tg_psic = (0.d0, 0.d0) ioff = 0 ! - DO idx = 1, 2*nogrp, 2 + DO idx = 1, 2*dffts%nogrp, 2 IF( idx + ibnd - 1 < m ) THEN DO j = 1, n tg_psic(nls (igk(j))+ioff) = psi(j,idx+ibnd-1) + & @@ -109,7 +109,7 @@ SUBROUTINE vloc_psi_gamma(lda, n, m, psi, v, hpsi) ! product with the potential v on the smooth grid ! back to reciprocal space ! - IF( use_task_groups ) THEN + IF( dffts%have_task_groups ) THEN ! CALL invfft ('Wave', tg_psic, dffts) ! @@ -133,11 +133,11 @@ SUBROUTINE vloc_psi_gamma(lda, n, m, psi, v, hpsi) ! ! addition to the total product ! - IF( use_task_groups ) THEN + IF( dffts%have_task_groups ) THEN ! ioff = 0 ! - DO idx = 1, 2*nogrp, 2 + DO idx = 1, 2*dffts%nogrp, 2 ! IF( idx + ibnd - 1 < m ) THEN DO j = 1, n @@ -181,13 +181,13 @@ SUBROUTINE vloc_psi_gamma(lda, n, m, psi, v, hpsi) ! ENDDO ! - IF( use_task_groups ) THEN + IF( dffts%have_task_groups ) THEN ! DEALLOCATE( tg_psic ) DEALLOCATE( tg_v ) ! ENDIF - use_task_groups = use_tg + dffts%have_task_groups = use_tg ! RETURN END SUBROUTINE vloc_psi_gamma @@ -202,7 +202,7 @@ SUBROUTINE vloc_psi_k(lda, n, m, psi, v, hpsi) USE kinds, ONLY : DP USE gvecs, ONLY : nls, nlsm USE wvfct, ONLY : igk - USE mp_global, ONLY : nogrp, ogrp_comm, me_pool, nolist, use_task_groups + USE mp_global, ONLY : me_pool USE fft_base, ONLY : dffts, tg_gather USE fft_interfaces,ONLY : fwfft, invfft USE wavefunctions_module, ONLY: psic @@ -226,20 +226,20 @@ SUBROUTINE vloc_psi_k(lda, n, m, psi, v, hpsi) ! The following is dirty trick to prevent usage of task groups if ! the number of bands is smaller than the number of task groups ! - use_tg = use_task_groups - use_task_groups = use_task_groups .and. ( m >= nogrp ) + use_tg = dffts%have_task_groups + dffts%have_task_groups = dffts%have_task_groups .and. ( m >= dffts%nogrp ) ! incr = 1 ! - IF( use_task_groups ) THEN + IF( dffts%have_task_groups ) THEN ! - v_siz = dffts%tg_nnr * nogrp + v_siz = dffts%tg_nnr * dffts%nogrp ! ALLOCATE( tg_v ( v_siz ) ) ALLOCATE( tg_psic( v_siz ) ) ! CALL tg_gather( dffts, v, tg_v ) - incr = nogrp + incr = dffts%nogrp ! ENDIF ! @@ -247,12 +247,12 @@ SUBROUTINE vloc_psi_k(lda, n, m, psi, v, hpsi) ! DO ibnd = 1, m, incr ! - IF( use_task_groups ) THEN + IF( dffts%have_task_groups ) THEN ! tg_psic = (0.d0, 0.d0) ioff = 0 ! - DO idx = 1, nogrp + DO idx = 1, dffts%nogrp IF( idx + ibnd - 1 <= m ) THEN !$omp parallel do @@ -281,7 +281,7 @@ SUBROUTINE vloc_psi_k(lda, n, m, psi, v, hpsi) ! product with the potential v on the smooth grid ! back to reciprocal space ! - IF( use_task_groups ) THEN + IF( dffts%have_task_groups ) THEN ! !$omp parallel do DO j = 1, dffts%nr1x*dffts%nr2x*dffts%tg_npp( me_pool + 1 ) @@ -305,11 +305,11 @@ SUBROUTINE vloc_psi_k(lda, n, m, psi, v, hpsi) ! ! addition to the total product ! - IF( use_task_groups ) THEN + IF( dffts%have_task_groups ) THEN ! ioff = 0 ! - DO idx = 1, nogrp + DO idx = 1, dffts%nogrp ! IF( idx + ibnd - 1 <= m ) THEN !$omp parallel do @@ -333,13 +333,13 @@ SUBROUTINE vloc_psi_k(lda, n, m, psi, v, hpsi) ! ENDDO ! - IF( use_task_groups ) THEN + IF( dffts%have_task_groups ) THEN ! DEALLOCATE( tg_psic ) DEALLOCATE( tg_v ) ! ENDIF - use_task_groups = use_tg + dffts%have_task_groups = use_tg ! RETURN END SUBROUTINE vloc_psi_k @@ -354,7 +354,7 @@ SUBROUTINE vloc_psi_nc (lda, n, m, psi, v, hpsi) USE kinds, ONLY : DP USE gvecs, ONLY : nls, nlsm USE wvfct, ONLY : igk - USE mp_global, ONLY : nogrp, ogrp_comm, me_pool, nolist, use_task_groups + USE mp_global, ONLY : me_pool USE fft_base, ONLY : dffts, dfftp, tg_gather USE fft_interfaces,ONLY : fwfft, invfft USE noncollin_module, ONLY: npol @@ -382,32 +382,32 @@ SUBROUTINE vloc_psi_nc (lda, n, m, psi, v, hpsi) ! The following is dirty trick to prevent usage of task groups if ! the number of bands is smaller than the number of task groups ! - use_tg = use_task_groups - use_task_groups = use_task_groups .and. ( m >= nogrp ) + use_tg = dffts%have_task_groups + dffts%have_task_groups = dffts%have_task_groups .and. ( m >= dffts%nogrp ) ! - IF( use_task_groups ) THEN - v_siz = dffts%tg_nnr * nogrp + IF( dffts%have_task_groups ) THEN + v_siz = dffts%tg_nnr * dffts%nogrp ALLOCATE( tg_v( v_siz, 4 ) ) CALL tg_gather( dffts, v(:,1), tg_v(:,1) ) CALL tg_gather( dffts, v(:,2), tg_v(:,2) ) CALL tg_gather( dffts, v(:,3), tg_v(:,3) ) CALL tg_gather( dffts, v(:,4), tg_v(:,4) ) ALLOCATE( tg_psic( v_siz, npol ) ) - incr = nogrp + incr = dffts%nogrp ENDIF ! ! the local potential V_Loc psi. First the psi in real space ! DO ibnd = 1, m, incr - IF( use_task_groups ) THEN + IF( dffts%have_task_groups ) THEN ! DO ipol = 1, npol ! tg_psic(:,ipol) = ( 0.D0, 0.D0 ) ioff = 0 ! - DO idx = 1, nogrp + DO idx = 1, dffts%nogrp ! IF( idx + ibnd - 1 <= m ) THEN DO j = 1, n @@ -436,7 +436,7 @@ SUBROUTINE vloc_psi_nc (lda, n, m, psi, v, hpsi) ! ! product with the potential v = (vltot+vr) on the smooth grid ! - IF( use_task_groups ) THEN + IF( dffts%have_task_groups ) THEN DO j=1, dffts%nr1x*dffts%nr2x*dffts%tg_npp( me_pool + 1 ) sup = tg_psic(j,1) * (tg_v(j,1)+tg_v(j,4)) + & tg_psic(j,2) * (tg_v(j,2)-(0.d0,1.d0)*tg_v(j,3)) @@ -458,7 +458,7 @@ SUBROUTINE vloc_psi_nc (lda, n, m, psi, v, hpsi) ! ! back to reciprocal space ! - IF( use_task_groups ) THEN + IF( dffts%have_task_groups ) THEN ! DO ipol = 1, npol @@ -466,7 +466,7 @@ SUBROUTINE vloc_psi_nc (lda, n, m, psi, v, hpsi) ! ioff = 0 ! - DO idx = 1, nogrp + DO idx = 1, dffts%nogrp ! IF( idx + ibnd - 1 <= m ) THEN DO j = 1, n @@ -499,13 +499,13 @@ SUBROUTINE vloc_psi_nc (lda, n, m, psi, v, hpsi) ENDDO - IF( use_task_groups ) THEN + IF( dffts%have_task_groups ) THEN ! DEALLOCATE( tg_v ) DEALLOCATE( tg_psic ) ! ENDIF - use_task_groups = use_tg + dffts%have_task_groups = use_tg ! RETURN END SUBROUTINE vloc_psi_nc