- small help functions for data distribution collected in a single file

git-svn-id: http://qeforge.qe-forge.org/svn/q-e/trunk/espresso@5748 c92efa57-630b-4861-b058-cf58834340f0
2009-07-24 12:59:54 +00:00 · 2009-07-24 12:59:54 +00:00 · af3e133a36
parent de0d9c842b
commit af3e133a36
5 changed files with 641 additions and 600 deletions
--- a/flib/Makefile
+++ b/flib/Makefile
@ -38,9 +38,7 @@ ylmr2.o

 POBJS = \
 flush_unit.o \
-gridsetup.o \
-localdim.o \
-localindex.o
+distools.o 

 all: blas.a flib.a lapack_atlas.a lapack.a ptools.a

--- a/flib/distools.f90
+++ b/flib/distools.f90
@ -0,0 +1,640 @@
+!
+! Copyright (C) 2001-2009 Quantum ESPRESSO group
+! This file is distributed under the terms of the
+! GNU General Public License. See the file `License'
+! in the root directory of the present distribution,
+! or http://www.gnu.org/copyleft/gpl.txt .
+!-----------------------------------------------------------------------
+!
+SUBROUTINE block_distribute( nat, me_image, nproc_image, ia_s, ia_e, mykey )
+    INTEGER, INTENT(IN)  :: nat, me_image, nproc_image  
+    INTEGER, INTENT(OUT) :: ia_s, ia_e, mykey
+    INTEGER :: na_loc, r, nproc_ia
+
+    INTEGER, EXTERNAL :: ldim_block, gind_block
+
+    ! Parallel: divide among processors for the same image
+    !
+    ! compute how many processors we have for a given atom
+    !
+    nproc_ia = nproc_image / nat
+    !
+    IF( nproc_ia == 0 ) THEN
+       !       
+       ! here we have less than one processor per atom
+       !       
+       mykey  = 0
+       na_loc = ldim_block( nat, nproc_image, me_image)
+       ia_s   = gind_block( 1, nat, nproc_image, me_image )
+       ia_e   = ia_s + na_loc - 1
+       !       
+    ELSE 
+       !       
+       ! here we have more than one proc per atom
+       !
+       r = MOD( nproc_image, nat )
+       !
+       IF( me_image < (nproc_ia + 1)*r ) THEN
+          ! processors that do the work, more procs work on a single atom
+          ia_s  = me_image/(nproc_ia + 1) + 1
+          mykey = MOD( me_image, nproc_ia + 1 )
+       ELSE
+          ia_s  = ( me_image - (nproc_ia + 1)*r ) / nproc_ia + 1 + r
+          mykey = MOD( me_image - (nproc_ia + 1)*r , nproc_ia )
+       END IF
+       !
+       ia_e = ia_s
+       !
+    END IF
+
+    RETURN
+
+END SUBROUTINE
+!
+
+!
+SUBROUTINE GRID2D_DIMS( grid_shape, nproc, nprow, npcol )
+   !
+   ! This subroutine factorizes the number of processors (NPROC)
+   ! into NPROW and NPCOL according to the shape
+   !
+   !    Written by Carlo Cavazzoni
+   !
+   IMPLICIT NONE
+   CHARACTER, INTENT(IN) :: grid_shape
+   INTEGER, INTENT(IN)  :: nproc
+   INTEGER, INTENT(OUT) :: nprow, npcol
+   INTEGER :: sqrtnp, i
+   !
+   sqrtnp = INT( SQRT( REAL( nproc ) + 0.1 ) )
+   !
+   IF( grid_shape == 'S' ) THEN
+      ! Square grid
+      nprow = sqrtnp
+      npcol = sqrtnp
+   ELSE
+      ! Rectangular grid
+      DO i = 1, sqrtnp + 1
+         IF( MOD( nproc, i ) == 0 ) nprow = i
+      end do
+      npcol = nproc / nprow
+   END IF
+   RETURN
+END SUBROUTINE
+
+SUBROUTINE GRID2D_COORDS( order, rank, nprow, npcol, row, col )
+   !
+   !  this subroutine compute the cartesian coordinetes "row" and "col"
+   !  of the processor whose MPI task id is "rank". 
+   !  Note that if the rank is larger that the grid size
+   !  all processors whose MPI task id is greather or equal 
+   !  than nprow * npcol are placed on the diagonal extension of the grid itself
+   !
+   IMPLICIT NONE
+   CHARACTER, INTENT(IN) :: order
+   INTEGER, INTENT(IN)  ::  rank          ! process index starting from 0
+   INTEGER, INTENT(IN)  ::  nprow, npcol  ! dimensions of the processor grid
+   INTEGER, INTENT(OUT) ::  row, col
+   IF( rank >= 0 .AND. rank < nprow * npcol ) THEN
+      IF( order == 'C' .OR. order == 'c' ) THEN
+         !  grid in COLUMN MAJOR ORDER
+         row = MOD( rank, nprow )
+         col = rank / nprow
+      ELSE
+         !  grid in ROW MAJOR ORDER
+         row = rank / npcol
+         col = MOD( rank, npcol )
+      END IF
+   ELSE
+      row = rank
+      col = rank
+   END IF
+   RETURN
+END SUBROUTINE
+
+SUBROUTINE GRID2D_RANK( order, nprow, npcol, row, col, rank )
+   !
+   !  this subroutine compute the processor MPI task id "rank" of the processor  
+   !  whose cartesian coordinate are "row" and "col".
+   !  Note that the subroutine assume cyclic indexing ( row = nprow = 0 )
+   !
+   IMPLICIT NONE
+   CHARACTER, INTENT(IN) :: order
+   INTEGER, INTENT(OUT) ::  rank         ! process index starting from 0
+   INTEGER, INTENT(IN)  ::  nprow, npcol  ! dimensions of the processor grid
+   INTEGER, INTENT(IN)  ::  row, col
+   
+   IF( order == 'C' .OR. order == 'c' ) THEN
+     !  grid in COLUMN MAJOR ORDER
+     rank = MOD( row + nprow, nprow ) + MOD( col + npcol, npcol ) * nprow
+   ELSE
+     !  grid in ROW MAJOR ORDER
+     rank = MOD( col + npcol, npcol ) + MOD( row + nprow, nprow ) * npcol
+   END IF
+   !
+   RETURN
+END SUBROUTINE
+!
+! Copyright (C) 2002 FPMD group
+! This file is distributed under the terms of the
+! GNU General Public License. See the file `License'
+! in the root directory of the present distribution,
+! or http://www.gnu.org/copyleft/gpl.txt .
+!
+
+
+       INTEGER FUNCTION ldim_cyclic(gdim, np, me)
+
+!   gdim = global dimension of distributed array
+!   np   = number of processors
+!   me   = index of the calling processor (starting from 0)
+!  
+!   this function return the number of elements of the distributed array
+!   stored in the local memory of the processor "me" for a cyclic 
+!   data distribution.
+!   Example of the cyclic distribution of a 10 elements array on 4 processors
+!   array elements  |  PEs
+!    a(1)           |   0
+!    a(2)           |   1
+!    a(3)           |   2
+!    a(4)           |   3
+!    a(5)           |   0
+!    a(6)           |   1
+!    a(7)           |   2
+!    a(8)           |   3
+!    a(9)           |   0
+!    a(10)          |   1
+
+       IMPLICIT NONE
+       INTEGER :: gdim, np, me, r, q
+
+       IF( me >= np .OR. me < 0 ) THEN
+         WRITE(6,*) ' ** ldim_cyclic: arg no. 3 out of range '
+         STOP
+       END IF
+
+       q = INT(gdim / np)
+       r = MOD(gdim, np)
+
+       IF( me .LT. r ) THEN
+         ldim_cyclic = q+1
+       ELSE
+         ldim_cyclic = q
+       END IF
+ 
+       RETURN
+       END FUNCTION ldim_cyclic
+
+!=----------------------------------------------------------------------------=!
+
+       INTEGER FUNCTION ldim_block(gdim, np, me)
+
+!   gdim = global dimension of distributed array
+!   np   = number of processors
+!   me   = index of the calling processor (starting from 0)
+!  
+!   this function return the number of elements of the distributed array
+!   stored in the local memory of the processor "me" for a balanced block 
+!   data distribution, with the larger block on the lower index processors.
+!   Example of the block distribution of 10 elements array a on 4 processors
+!   array elements  |  PEs
+!    a(1)           |   0
+!    a(2)           |   0
+!    a(3)           |   0
+!    a(4)           |   1
+!    a(5)           |   1
+!    a(6)           |   1
+!    a(7)           |   2
+!    a(8)           |   2
+!    a(9)           |   3
+!    a(10)          |   3
+
+       IMPLICIT NONE
+       INTEGER :: gdim, np, me, r, q
+
+       IF( me >= np .OR. me < 0 ) THEN
+         WRITE(6,*) ' ** ldim_block: arg no. 3 out of range '
+         STOP
+       END IF
+
+       q = INT(gdim / np)
+       r = MOD(gdim, np)
+
+       IF( me .LT. r ) THEN
+! ...    if my index is less than the reminder I got an extra element
+         ldim_block = q+1
+       ELSE
+         ldim_block = q
+       END IF
+ 
+       RETURN
+       END FUNCTION ldim_block
+
+!=----------------------------------------------------------------------------=!
+
+       INTEGER FUNCTION ldim_block_sca( gdim, np, me )
+
+!   gdim = global dimension of distributed array
+!   np   = number of processors
+!   me   = index of the calling processor (starting from 0)
+!  
+!   this function return the number of elements of the distributed array
+!   stored in the local memory of the processor "me" for equal block
+!   data distribution, all block have the same size but the last one.
+!   Example of the block distribution of 10 elements array a on 4 processors
+!   array elements  |  PEs
+!    a(1)           |   0
+!    a(2)           |   0
+!    a(3)           |   0
+!    a(4)           |   1
+!    a(5)           |   1
+!    a(6)           |   1
+!    a(7)           |   2
+!    a(8)           |   2
+!    a(9)           |   2
+!    a(10)          |   3
+
+       IMPLICIT NONE
+       INTEGER :: gdim, np, me, nb
+
+       IF( me >= np .OR. me < 0 ) THEN
+         WRITE(6,*) ' ** ldim_block: arg no. 3 out of range '
+         STOP
+       END IF
+
+       nb = INT( gdim / np )
+       IF( MOD( gdim,  np ) /= 0 ) THEN
+         nb = nb+1
+         ! ... last processor take the rest
+         IF( me == ( np - 1 ) ) nb = gdim - (np-1)*nb
+       END IF
+
+       ldim_block_sca = nb
+
+
+       RETURN
+       END FUNCTION ldim_block_sca
+
+!=----------------------------------------------------------------------------=!
+
+
+
+      INTEGER FUNCTION ldim_block_cyclic( N, NB, NPROCS, IPROC )
+
+!  -- Derived from:  NUMROC( N, NB, IPROC, ISRCPROC, NPROCS )
+!  -- ScaLAPACK tools routine (version 1.5) --
+!     University of Tennessee, Knoxville, Oak Ridge National Laboratory,
+!     and University of California, Berkeley.
+!     May 1, 1997
+!
+!     .. Scalar Arguments ..
+      IMPLICIT NONE
+      INTEGER              IPROC, ISRCPROC, N, NB, NPROCS, NUMROC
+!     ..
+!
+!  Purpose
+!  =======
+!
+!  NUMROC computes the NUMber of Rows Or Columns of a distributed
+!  matrix owned by the process indicated by IPROC.
+!
+!  Arguments
+!  =========
+!
+!  N         (global input) INTEGER
+!            The number of rows/columns in distributed matrix.
+!
+!  NB        (global input) INTEGER
+!            Block size, size of the blocks the distributed matrix is
+!            split into.
+!
+!  IPROC     (local input) INTEGER
+!            The coordinate of the process whose local array row or
+!            column is to be determined.
+!
+!  ISRCPROC  (global input) INTEGER
+!            The coordinate of the process that possesses the first
+!            row or column of the distributed matrix.
+!
+!  NPROCS    (global input) INTEGER
+!            The total number processes over which the matrix is
+!            distributed.
+!
+!  =====================================================================
+!
+!     .. Local Scalars ..
+      INTEGER              EXTRABLKS, MYDIST, NBLOCKS
+!     ..
+!     .. Intrinsic Functions ..
+      INTRINSIC            MOD
+!     ..
+!     .. Executable Statements ..
+!
+!     Figure PROC's distance from source process
+!
+      ISRCPROC = 0
+      MYDIST = MOD( NPROCS+IPROC-ISRCPROC, NPROCS )
+!
+!     Figure the total number of whole NB blocks N is split up into
+!
+      NBLOCKS = N / NB
+!
+!     Figure the minimum number of rows/cols a process can have
+!
+      NUMROC = (NBLOCKS/NPROCS) * NB
+!
+!     See if there are any extra blocks
+!
+      EXTRABLKS = MOD( NBLOCKS, NPROCS )
+!
+!     If I have an extra block
+!
+      IF( MYDIST.LT.EXTRABLKS ) THEN
+          NUMROC = NUMROC + NB
+!
+!         If I have last block, it may be a partial block
+!
+      ELSE IF( MYDIST.EQ.EXTRABLKS ) THEN
+          NUMROC = NUMROC + MOD( N, NB )
+      END IF
+!
+
+      ldim_block_cyclic = numroc
+      RETURN
+!
+!     End of NUMROC
+!
+      END FUNCTION ldim_block_cyclic
+!
+! Copyright (C) 2002 FPMD group
+! This file is distributed under the terms of the
+! GNU General Public License. See the file `License'
+! in the root directory of the present distribution,
+! or http://www.gnu.org/copyleft/gpl.txt .
+!
+
+
+       INTEGER FUNCTION lind_block(ig, nx, np, me)
+!
+!   INPUT :
+!      ig  global index of the x dimension of array element
+!      nx  dimension of the global array
+!      np  number of processor in the x dimension of the processors grid
+!      me  index of the local processor in the processor grid
+!                (starting from zero)
+!
+!   OUTPUT :
+!
+!      lind_block return the local index corresponding to the
+!      global index "ig" for a balanced block distribution
+!   
+
+       IMPLICIT NONE
+
+       INTEGER :: ig, nx, np, me, r, q
+
+       q = INT(nx/np)
+       r = MOD(nx,np) 
+
+       IF( me < r ) THEN
+         lind_block = ig - (q+1) * me
+       ELSE
+         lind_block = ig - (q+1) * r - q * (me - r)
+       END IF
+
+       RETURN 
+       END FUNCTION lind_block
+
+
+!=----------------------------------------------------------------------------=!
+
+
+       INTEGER FUNCTION lind_block_sca(ig, nx, np, me)
+!
+!   INPUT :
+!      ig  global index of the x dimension of array element
+!      nx  dimension of the global array
+!      np  number of processor in the x dimension of the processors grid
+!      me  index of the local processor in the processor grid
+!                (starting from zero)
+!
+!   OUTPUT :
+!
+!      lind_block_sca return the local index corresponding to the
+!      global index "ig" for an equal block distribution
+!   
+
+       IMPLICIT NONE
+
+       INTEGER :: ig, nx, np, me, nb
+
+       nb = INT( nx / np )
+       IF( MOD( nx,  np ) /= 0 ) nb = nb+1
+
+       lind_block_sca = ig - me * nb
+
+       RETURN
+       END FUNCTION lind_block_sca
+
+
+!=----------------------------------------------------------------------------=!
+
+
+
+       INTEGER FUNCTION lind_cyclic(ig, nx, np, me)
+!
+!   INPUT :
+!      ig  global index of the x dimension of array element
+!      nx  dimension of the global array
+!      np  number of processor in the x dimension of the processors grid
+!      me  index of the local processor in the processor grid
+!                (starting from zero)
+!
+!   OUTPUT :
+!
+!      lind_cyclic return the local index corresponding to the
+!      global index "ig" for a cyclic distribution
+!   
+
+       IMPLICIT NONE
+
+       INTEGER :: ig, nx, np, me
+
+       lind_cyclic = (ig-1)/np + 1
+
+       RETURN 
+       END FUNCTION lind_cyclic
+
+
+!=----------------------------------------------------------------------------=!
+
+
+       INTEGER FUNCTION lind_block_cyclic( INDXGLOB, NB, NPROCS, IPROC )      
+
+!     Derived from:  INDXG2L( INDXGLOB, NB, IPROC, ISRCPROC, NPROCS )
+!  -- ScaLAPACK tools routine (version 1.5) --
+!     University of Tennessee, Knoxville, Oak Ridge National Laboratory,
+!     and University of California, Berkeley.
+!     May 1, 1997
+!
+!     .. Scalar Arguments ..
+      IMPLICIT NONE
+      INTEGER  INDXGLOB, IPROC, ISRCPROC, NB, NPROCS, INDXG2L
+!     ..
+!
+!  Purpose
+!  =======
+!
+!  INDXG2L computes the local index of a distributed matrix entry
+!  pointed to by the global index INDXGLOB.
+!
+!  Arguments
+!  =========
+!
+!  INDXGLOB  (global input) INTEGER
+!            The global index of the distributed matrix entry.
+!
+!  NB        (global input) INTEGER
+!            Block size, size of the blocks the distributed matrix is
+!            split into.
+!
+!  IPROC     (local dummy) INTEGER
+!            Dummy argument in this case in order to unify the calling
+!            sequence of the tool-routines.
+!
+!  ISRCPROC  (local dummy) INTEGER
+!            Dummy argument in this case in order to unify the calling
+!            sequence of the tool-routines.
+!
+!  NPROCS    (global input) INTEGER
+!            The total number processes over which the distributed
+!            matrix is distributed.
+!
+!  =====================================================================
+!
+!     .. Intrinsic Functions ..
+      INTRINSIC          MOD
+!     ..
+!     .. Executable Statements ..
+!
+      ISRCPROC = 0      
+      INDXG2L = NB*((INDXGLOB-1)/(NB*NPROCS))+MOD(INDXGLOB-1,NB)+1
+      lind_block_cyclic = INDXG2L
+!
+      RETURN
+!
+!     End of INDXG2L
+!
+      END FUNCTION lind_block_cyclic
+
+
+!=----------------------------------------------------------------------------=!
+
+
+      INTEGER FUNCTION gind_cyclic( lind, n, np, me )
+
+!  This function computes the global index of a distributed array entry
+!  pointed to by the local index lind of the process indicated by me.
+!  lind      local index of the distributed matrix entry.
+!  N         is the size of the global array.
+!  me        The coordinate of the process whose local array row or
+!            column is to be determined.
+!  np        The total number processes over which the distributed
+!            matrix is distributed.
+!
+
+            INTEGER, INTENT(IN) :: lind, n, me, np
+            INTEGER r, q
+
+            gind_cyclic = (lind-1) * np + me + 1
+
+            RETURN
+      END FUNCTION gind_cyclic
+
+
+!=----------------------------------------------------------------------------=!
+
+
+      INTEGER FUNCTION gind_block( lind, n, np, me )
+
+!  This function computes the global index of a distributed array entry
+!  pointed to by the local index lind of the process indicated by me.
+!  lind      local index of the distributed matrix entry.
+!  N         is the size of the global array.
+!  me        The coordinate of the process whose local array row or
+!            column is to be determined.
+!  np        The total number processes over which the distributed
+!            matrix is distributed.
+
+
+            INTEGER, INTENT(IN) :: lind, n, me, np
+            INTEGER r, q
+
+              q = INT(n/np)
+              r = MOD(n,np)
+              IF( me < r ) THEN
+                gind_block = (Q+1)*me + lind
+              ELSE
+                gind_block = Q*me + R + lind
+              END IF
+
+         RETURN
+      END FUNCTION gind_block
+
+!=----------------------------------------------------------------------------=!
+
+      INTEGER FUNCTION gind_block_sca( lind, n, np, me )
+
+!  This function computes the global index of a distributed array entry
+!  pointed to by the local index lind of the process indicated by me.
+!  lind      local index of the distributed matrix entry.
+!  N         is the size of the global array.
+!  me        The coordinate of the process whose local array row or
+!            column is to be determined.
+!  np        The total number processes over which the distributed
+!            matrix is distributed.
+
+
+       INTEGER, INTENT(IN) :: lind, n, me, np
+       INTEGER nb
+
+       IF( me >= np .OR. me < 0 ) THEN
+         WRITE(6,*) ' ** ldim_block: arg no. 3 out of range '
+         STOP
+       END IF
+
+       nb = INT( n / np )
+       IF( MOD( n,  np ) /= 0 ) nb = nb+1
+
+       gind_block_sca = lind + me * nb
+
+       RETURN
+
+    END FUNCTION gind_block_sca
+
+
+!=----------------------------------------------------------------------------=!
+
+      INTEGER FUNCTION gind_block_cyclic( lind, n, nb, np, me )
+
+!  This function computes the global index of a distributed array entry
+!  pointed to by the local index lind of the process indicated by me.
+!  lind      local index of the distributed matrix entry.
+!  N         is the size of the global array.
+!  NB        size of the blocks the distributed matrix is split into.
+!  me        The coordinate of the process whose local array row or
+!            column is to be determined.
+!  np        The total number processes over which the distributed
+!            matrix is distributed.
+
+
+            INTEGER, INTENT(IN) :: lind, n, nb, me, np
+            INTEGER r, q, isrc
+
+            isrc = 0
+            gind_block_cyclic = np*NB*((lind-1)/NB) + &
+                MOD(lind-1,NB) + MOD(np+me-isrc, np)*NB + 1
+
+         RETURN
+      END FUNCTION gind_block_cyclic
+
--- a/flib/gridsetup.f90
+++ b/flib/gridsetup.f90
@ -1,93 +0,0 @@
-!
-! Copyright (C) 2002 FPMD group
-! This file is distributed under the terms of the
-! GNU General Public License. See the file `License'
-! in the root directory of the present distribution,
-! or http://www.gnu.org/copyleft/gpl.txt .
-!
-
-!
-!-----------------------------------------------------------------------
-!
-
-SUBROUTINE GRID2D_DIMS( grid_shape, nproc, nprow, npcol )
-   !
-   ! This subroutine factorizes the number of processors (NPROC)
-   ! into NPROW and NPCOL according to the shape
-   !
-   !    Written by Carlo Cavazzoni
-   !
-   IMPLICIT NONE
-   CHARACTER, INTENT(IN) :: grid_shape
-   INTEGER, INTENT(IN)  :: nproc
-   INTEGER, INTENT(OUT) :: nprow, npcol
-   INTEGER :: sqrtnp, i
-   !
-   sqrtnp = INT( SQRT( REAL( nproc ) + 0.1 ) )
-   !
-   IF( grid_shape == 'S' ) THEN
-      ! Square grid
-      nprow = sqrtnp
-      npcol = sqrtnp
-   ELSE
-      ! Rectangular grid
-      DO i = 1, sqrtnp + 1
-         IF( MOD( nproc, i ) == 0 ) nprow = i
-      end do
-      npcol = nproc / nprow
-   END IF
-   RETURN
-END SUBROUTINE
-
-SUBROUTINE GRID2D_COORDS( order, rank, nprow, npcol, row, col )
-   !
-   !  this subroutine compute the cartesian coordinetes "row" and "col"
-   !  of the processor whose MPI task id is "rank". 
-   !  Note that if the rank is larger that the grid size
-   !  all processors whose MPI task id is greather or equal 
-   !  than nprow * npcol are placed on the diagonal extension of the grid itself
-   !
-   IMPLICIT NONE
-   CHARACTER, INTENT(IN) :: order
-   INTEGER, INTENT(IN)  ::  rank          ! process index starting from 0
-   INTEGER, INTENT(IN)  ::  nprow, npcol  ! dimensions of the processor grid
-   INTEGER, INTENT(OUT) ::  row, col
-   IF( rank >= 0 .AND. rank < nprow * npcol ) THEN
-      IF( order == 'C' .OR. order == 'c' ) THEN
-         !  grid in COLUMN MAJOR ORDER
-         row = MOD( rank, nprow )
-         col = rank / nprow
-      ELSE
-         !  grid in ROW MAJOR ORDER
-         row = rank / npcol
-         col = MOD( rank, npcol )
-      END IF
-   ELSE
-      row = rank
-      col = rank
-   END IF
-   RETURN
-END SUBROUTINE
-
-SUBROUTINE GRID2D_RANK( order, nprow, npcol, row, col, rank )
-   !
-   !  this subroutine compute the processor MPI task id "rank" of the processor  
-   !  whose cartesian coordinate are "row" and "col".
-   !  Note that the subroutine assume cyclic indexing ( row = nprow = 0 )
-   !
-   IMPLICIT NONE
-   CHARACTER, INTENT(IN) :: order
-   INTEGER, INTENT(OUT) ::  rank         ! process index starting from 0
-   INTEGER, INTENT(IN)  ::  nprow, npcol  ! dimensions of the processor grid
-   INTEGER, INTENT(IN)  ::  row, col
-   
-   IF( order == 'C' .OR. order == 'c' ) THEN
-     !  grid in COLUMN MAJOR ORDER
-     rank = MOD( row + nprow, nprow ) + MOD( col + npcol, npcol ) * nprow
-   ELSE
-     !  grid in ROW MAJOR ORDER
-     rank = MOD( col + npcol, npcol ) + MOD( row + nprow, nprow ) * npcol
-   END IF
-   !
-   RETURN
-END SUBROUTINE
--- a/flib/localdim.f90
+++ b/flib/localdim.f90
@ -1,231 +0,0 @@
-!
-! Copyright (C) 2002 FPMD group
-! This file is distributed under the terms of the
-! GNU General Public License. See the file `License'
-! in the root directory of the present distribution,
-! or http://www.gnu.org/copyleft/gpl.txt .
-!
-
-
-       INTEGER FUNCTION ldim_cyclic(gdim, np, me)
-
-!   gdim = global dimension of distributed array
-!   np   = number of processors
-!   me   = index of the calling processor (starting from 0)
-!  
-!   this function return the number of elements of the distributed array
-!   stored in the local memory of the processor "me" for a cyclic 
-!   data distribution.
-!   Example of the cyclic distribution of a 10 elements array on 4 processors
-!   array elements  |  PEs
-!    a(1)           |   0
-!    a(2)           |   1
-!    a(3)           |   2
-!    a(4)           |   3
-!    a(5)           |   0
-!    a(6)           |   1
-!    a(7)           |   2
-!    a(8)           |   3
-!    a(9)           |   0
-!    a(10)          |   1
-
-       IMPLICIT NONE
-       INTEGER :: gdim, np, me, r, q
-
-       IF( me >= np .OR. me < 0 ) THEN
-         WRITE(6,*) ' ** ldim_cyclic: arg no. 3 out of range '
-         STOP
-       END IF
-
-       q = INT(gdim / np)
-       r = MOD(gdim, np)
-
-       IF( me .LT. r ) THEN
-         ldim_cyclic = q+1
-       ELSE
-         ldim_cyclic = q
-       END IF
- 
-       RETURN
-       END FUNCTION ldim_cyclic
-
-!=----------------------------------------------------------------------------=!
-
-       INTEGER FUNCTION ldim_block(gdim, np, me)
-
-!   gdim = global dimension of distributed array
-!   np   = number of processors
-!   me   = index of the calling processor (starting from 0)
-!  
-!   this function return the number of elements of the distributed array
-!   stored in the local memory of the processor "me" for a balanced block 
-!   data distribution, with the larger block on the lower index processors.
-!   Example of the block distribution of 10 elements array a on 4 processors
-!   array elements  |  PEs
-!    a(1)           |   0
-!    a(2)           |   0
-!    a(3)           |   0
-!    a(4)           |   1
-!    a(5)           |   1
-!    a(6)           |   1
-!    a(7)           |   2
-!    a(8)           |   2
-!    a(9)           |   3
-!    a(10)          |   3
-
-       IMPLICIT NONE
-       INTEGER :: gdim, np, me, r, q
-
-       IF( me >= np .OR. me < 0 ) THEN
-         WRITE(6,*) ' ** ldim_block: arg no. 3 out of range '
-         STOP
-       END IF
-
-       q = INT(gdim / np)
-       r = MOD(gdim, np)
-
-       IF( me .LT. r ) THEN
-! ...    if my index is less than the reminder I got an extra element
-         ldim_block = q+1
-       ELSE
-         ldim_block = q
-       END IF
- 
-       RETURN
-       END FUNCTION ldim_block
-
-!=----------------------------------------------------------------------------=!
-
-       INTEGER FUNCTION ldim_block_sca( gdim, np, me )
-
-!   gdim = global dimension of distributed array
-!   np   = number of processors
-!   me   = index of the calling processor (starting from 0)
-!  
-!   this function return the number of elements of the distributed array
-!   stored in the local memory of the processor "me" for equal block
-!   data distribution, all block have the same size but the last one.
-!   Example of the block distribution of 10 elements array a on 4 processors
-!   array elements  |  PEs
-!    a(1)           |   0
-!    a(2)           |   0
-!    a(3)           |   0
-!    a(4)           |   1
-!    a(5)           |   1
-!    a(6)           |   1
-!    a(7)           |   2
-!    a(8)           |   2
-!    a(9)           |   2
-!    a(10)          |   3
-
-       IMPLICIT NONE
-       INTEGER :: gdim, np, me, nb
-
-       IF( me >= np .OR. me < 0 ) THEN
-         WRITE(6,*) ' ** ldim_block: arg no. 3 out of range '
-         STOP
-       END IF
-
-       nb = INT( gdim / np )
-       IF( MOD( gdim,  np ) /= 0 ) THEN
-         nb = nb+1
-         ! ... last processor take the rest
-         IF( me == ( np - 1 ) ) nb = gdim - (np-1)*nb
-       END IF
-
-       ldim_block_sca = nb
-
-
-       RETURN
-       END FUNCTION ldim_block_sca
-
-!=----------------------------------------------------------------------------=!
-
-
-
-      INTEGER FUNCTION ldim_block_cyclic( N, NB, NPROCS, IPROC )
-
-!  -- Derived from:  NUMROC( N, NB, IPROC, ISRCPROC, NPROCS )
-!  -- ScaLAPACK tools routine (version 1.5) --
-!     University of Tennessee, Knoxville, Oak Ridge National Laboratory,
-!     and University of California, Berkeley.
-!     May 1, 1997
-!
-!     .. Scalar Arguments ..
-      IMPLICIT NONE
-      INTEGER              IPROC, ISRCPROC, N, NB, NPROCS, NUMROC
-!     ..
-!
-!  Purpose
-!  =======
-!
-!  NUMROC computes the NUMber of Rows Or Columns of a distributed
-!  matrix owned by the process indicated by IPROC.
-!
-!  Arguments
-!  =========
-!
-!  N         (global input) INTEGER
-!            The number of rows/columns in distributed matrix.
-!
-!  NB        (global input) INTEGER
-!            Block size, size of the blocks the distributed matrix is
-!            split into.
-!
-!  IPROC     (local input) INTEGER
-!            The coordinate of the process whose local array row or
-!            column is to be determined.
-!
-!  ISRCPROC  (global input) INTEGER
-!            The coordinate of the process that possesses the first
-!            row or column of the distributed matrix.
-!
-!  NPROCS    (global input) INTEGER
-!            The total number processes over which the matrix is
-!            distributed.
-!
-!  =====================================================================
-!
-!     .. Local Scalars ..
-      INTEGER              EXTRABLKS, MYDIST, NBLOCKS
-!     ..
-!     .. Intrinsic Functions ..
-      INTRINSIC            MOD
-!     ..
-!     .. Executable Statements ..
-!
-!     Figure PROC's distance from source process
-!
-      ISRCPROC = 0
-      MYDIST = MOD( NPROCS+IPROC-ISRCPROC, NPROCS )
-!
-!     Figure the total number of whole NB blocks N is split up into
-!
-      NBLOCKS = N / NB
-!
-!     Figure the minimum number of rows/cols a process can have
-!
-      NUMROC = (NBLOCKS/NPROCS) * NB
-!
-!     See if there are any extra blocks
-!
-      EXTRABLKS = MOD( NBLOCKS, NPROCS )
-!
-!     If I have an extra block
-!
-      IF( MYDIST.LT.EXTRABLKS ) THEN
-          NUMROC = NUMROC + NB
-!
-!         If I have last block, it may be a partial block
-!
-      ELSE IF( MYDIST.EQ.EXTRABLKS ) THEN
-          NUMROC = NUMROC + MOD( N, NB )
-      END IF
-!
-
-      ldim_block_cyclic = numroc
-      RETURN
-!
-!     End of NUMROC
-!
-      END FUNCTION ldim_block_cyclic
--- a/flib/localindex.f90
+++ b/flib/localindex.f90
@ -1,273 +0,0 @@
-!
-! Copyright (C) 2002 FPMD group
-! This file is distributed under the terms of the
-! GNU General Public License. See the file `License'
-! in the root directory of the present distribution,
-! or http://www.gnu.org/copyleft/gpl.txt .
-!
-
-
-       INTEGER FUNCTION lind_block(ig, nx, np, me)
-!
-!   INPUT :
-!      ig  global index of the x dimension of array element
-!      nx  dimension of the global array
-!      np  number of processor in the x dimension of the processors grid
-!      me  index of the local processor in the processor grid
-!                (starting from zero)
-!
-!   OUTPUT :
-!
-!      lind_block return the local index corresponding to the
-!      global index "ig" for a balanced block distribution
-!   
-
-       IMPLICIT NONE
-
-       INTEGER :: ig, nx, np, me, r, q
-
-       q = INT(nx/np)
-       r = MOD(nx,np) 
-
-       IF( me < r ) THEN
-         lind_block = ig - (q+1) * me
-       ELSE
-         lind_block = ig - (q+1) * r - q * (me - r)
-       END IF
-
-       RETURN 
-       END FUNCTION lind_block
-
-
-!=----------------------------------------------------------------------------=!
-
-
-       INTEGER FUNCTION lind_block_sca(ig, nx, np, me)
-!
-!   INPUT :
-!      ig  global index of the x dimension of array element
-!      nx  dimension of the global array
-!      np  number of processor in the x dimension of the processors grid
-!      me  index of the local processor in the processor grid
-!                (starting from zero)
-!
-!   OUTPUT :
-!
-!      lind_block_sca return the local index corresponding to the
-!      global index "ig" for an equal block distribution
-!   
-
-       IMPLICIT NONE
-
-       INTEGER :: ig, nx, np, me, nb
-
-       nb = INT( nx / np )
-       IF( MOD( nx,  np ) /= 0 ) nb = nb+1
-
-       lind_block_sca = ig - me * nb
-
-       RETURN
-       END FUNCTION lind_block_sca
-
-
-!=----------------------------------------------------------------------------=!
-
-
-
-       INTEGER FUNCTION lind_cyclic(ig, nx, np, me)
-!
-!   INPUT :
-!      ig  global index of the x dimension of array element
-!      nx  dimension of the global array
-!      np  number of processor in the x dimension of the processors grid
-!      me  index of the local processor in the processor grid
-!                (starting from zero)
-!
-!   OUTPUT :
-!
-!      lind_cyclic return the local index corresponding to the
-!      global index "ig" for a cyclic distribution
-!   
-
-       IMPLICIT NONE
-
-       INTEGER :: ig, nx, np, me
-
-       lind_cyclic = (ig-1)/np + 1
-
-       RETURN 
-       END FUNCTION lind_cyclic
-
-
-!=----------------------------------------------------------------------------=!
-
-
-       INTEGER FUNCTION lind_block_cyclic( INDXGLOB, NB, NPROCS, IPROC )      
-
-!     Derived from:  INDXG2L( INDXGLOB, NB, IPROC, ISRCPROC, NPROCS )
-!  -- ScaLAPACK tools routine (version 1.5) --
-!     University of Tennessee, Knoxville, Oak Ridge National Laboratory,
-!     and University of California, Berkeley.
-!     May 1, 1997
-!
-!     .. Scalar Arguments ..
-      IMPLICIT NONE
-      INTEGER  INDXGLOB, IPROC, ISRCPROC, NB, NPROCS, INDXG2L
-!     ..
-!
-!  Purpose
-!  =======
-!
-!  INDXG2L computes the local index of a distributed matrix entry
-!  pointed to by the global index INDXGLOB.
-!
-!  Arguments
-!  =========
-!
-!  INDXGLOB  (global input) INTEGER
-!            The global index of the distributed matrix entry.
-!
-!  NB        (global input) INTEGER
-!            Block size, size of the blocks the distributed matrix is
-!            split into.
-!
-!  IPROC     (local dummy) INTEGER
-!            Dummy argument in this case in order to unify the calling
-!            sequence of the tool-routines.
-!
-!  ISRCPROC  (local dummy) INTEGER
-!            Dummy argument in this case in order to unify the calling
-!            sequence of the tool-routines.
-!
-!  NPROCS    (global input) INTEGER
-!            The total number processes over which the distributed
-!            matrix is distributed.
-!
-!  =====================================================================
-!
-!     .. Intrinsic Functions ..
-      INTRINSIC          MOD
-!     ..
-!     .. Executable Statements ..
-!
-      ISRCPROC = 0      
-      INDXG2L = NB*((INDXGLOB-1)/(NB*NPROCS))+MOD(INDXGLOB-1,NB)+1
-      lind_block_cyclic = INDXG2L
-!
-      RETURN
-!
-!     End of INDXG2L
-!
-      END FUNCTION lind_block_cyclic
-
-
-!=----------------------------------------------------------------------------=!
-
-
-      INTEGER FUNCTION gind_cyclic( lind, n, np, me )
-
-!  This function computes the global index of a distributed array entry
-!  pointed to by the local index lind of the process indicated by me.
-!  lind      local index of the distributed matrix entry.
-!  N         is the size of the global array.
-!  me        The coordinate of the process whose local array row or
-!            column is to be determined.
-!  np        The total number processes over which the distributed
-!            matrix is distributed.
-!
-
-            INTEGER, INTENT(IN) :: lind, n, me, np
-            INTEGER r, q
-
-            gind_cyclic = (lind-1) * np + me + 1
-
-            RETURN
-      END FUNCTION gind_cyclic
-
-
-!=----------------------------------------------------------------------------=!
-
-
-      INTEGER FUNCTION gind_block( lind, n, np, me )
-
-!  This function computes the global index of a distributed array entry
-!  pointed to by the local index lind of the process indicated by me.
-!  lind      local index of the distributed matrix entry.
-!  N         is the size of the global array.
-!  me        The coordinate of the process whose local array row or
-!            column is to be determined.
-!  np        The total number processes over which the distributed
-!            matrix is distributed.
-
-
-            INTEGER, INTENT(IN) :: lind, n, me, np
-            INTEGER r, q
-
-              q = INT(n/np)
-              r = MOD(n,np)
-              IF( me < r ) THEN
-                gind_block = (Q+1)*me + lind
-              ELSE
-                gind_block = Q*me + R + lind
-              END IF
-
-         RETURN
-      END FUNCTION gind_block
-
-!=----------------------------------------------------------------------------=!
-
-      INTEGER FUNCTION gind_block_sca( lind, n, np, me )
-
-!  This function computes the global index of a distributed array entry
-!  pointed to by the local index lind of the process indicated by me.
-!  lind      local index of the distributed matrix entry.
-!  N         is the size of the global array.
-!  me        The coordinate of the process whose local array row or
-!            column is to be determined.
-!  np        The total number processes over which the distributed
-!            matrix is distributed.
-
-
-       INTEGER, INTENT(IN) :: lind, n, me, np
-       INTEGER nb
-
-       IF( me >= np .OR. me < 0 ) THEN
-         WRITE(6,*) ' ** ldim_block: arg no. 3 out of range '
-         STOP
-       END IF
-
-       nb = INT( n / np )
-       IF( MOD( n,  np ) /= 0 ) nb = nb+1
-
-       gind_block_sca = lind + me * nb
-
-       RETURN
-
-    END FUNCTION gind_block_sca
-
-
-!=----------------------------------------------------------------------------=!
-
-      INTEGER FUNCTION gind_block_cyclic( lind, n, nb, np, me )
-
-!  This function computes the global index of a distributed array entry
-!  pointed to by the local index lind of the process indicated by me.
-!  lind      local index of the distributed matrix entry.
-!  N         is the size of the global array.
-!  NB        size of the blocks the distributed matrix is split into.
-!  me        The coordinate of the process whose local array row or
-!            column is to be determined.
-!  np        The total number processes over which the distributed
-!            matrix is distributed.
-
-
-            INTEGER, INTENT(IN) :: lind, n, nb, me, np
-            INTEGER r, q, isrc
-
-            isrc = 0
-            gind_block_cyclic = np*NB*((lind-1)/NB) + &
-                MOD(lind-1,NB) + MOD(np+me-isrc, np)*NB + 1
-
-         RETURN
-      END FUNCTION gind_block_cyclic
-