! quantum-espresso/Modules/ptoolkit.f90
!
! Copyright (C) 2001-2006 Quantum-ESPRESSO group
! This file is distributed under the terms of the
! GNU General Public License. See the file `License'
! in the root directory of the present distribution,
! or http://www.gnu.org/copyleft/gpl.txt .
!
#include "f_defs.h"
!
!==----------------------------------------------==!
MODULE parallel_toolkit
!==----------------------------------------------==!
USE kinds, ONLY : DP
USE io_global, ONLY : stdout
USE parallel_include
IMPLICIT NONE
SAVE
PRIVATE
PUBLIC :: rep_matmul_drv
PUBLIC :: zrep_matmul_drv
PUBLIC :: dsqmdst, dsqmcll, dsqmred, dsqmsym
CONTAINS
! ---------------------------------------------------------------------------------
SUBROUTINE dsqmdst( n, ar, ldar, a, lda, desc )
!
! Double precision SQuare Matrix DiSTribution
! This subroutine takes a replicated square matrix "ar" and distributes it
! across the processors as described by the descriptor "desc"
!
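! Each process copies the nlar_ x nlac_ tile of "ar" whose global upper-left
! corner is ( ilar_ , ilac_ ) into "a", padding the rest of the nlax_ x nlax_
! local buffer with zeros. A minimal usage sketch (assuming a descriptor
! "desc" initialized elsewhere and a replicated n x n matrix "h"; both are
! hypothetical names used only for illustration):
!
!    REAL(DP), ALLOCATABLE :: hl(:,:)
!    ALLOCATE( hl( desc( nlax_ ), desc( nlax_ ) ) )
!    CALL dsqmdst( n, h, SIZE( h, 1 ), hl, desc( nlax_ ), desc )
!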
USE kinds
USE descriptors
!
implicit none
!
INTEGER, INTENT(IN) :: n
INTEGER, INTENT(IN) :: ldar
REAL(DP) :: ar(ldar,*) ! matrix to be split, replicated on all procs
INTEGER, INTENT(IN) :: lda
REAL(DP) :: a(lda,*)
INTEGER, INTENT(IN) :: desc( descla_siz_ )
!
REAL(DP), PARAMETER :: zero = 0.0_DP
!
INTEGER :: i, j, nr, nc, ic, ir, nx
!
IF( desc( lambda_node_ ) <= 0 ) THEN
RETURN
END IF
nx = desc( nlax_ )
ir = desc( ilar_ )
ic = desc( ilac_ )
nr = desc( nlar_ )
nc = desc( nlac_ )
IF( lda < nx ) &
CALL errore( " dsqmdst ", " inconsistent dimension lda ", lda )
IF( n /= desc( la_n_ ) ) &
CALL errore( " dsqmdst ", " inconsistent dimension n ", n )
DO j = 1, nc
DO i = 1, nr
a( i, j ) = ar( i + ir - 1, j + ic - 1 )
END DO
DO i = nr+1, nx
a( i, j ) = zero
END DO
END DO
DO j = nc + 1, nx
DO i = 1, nx
a( i, j ) = zero
END DO
END DO
RETURN
END SUBROUTINE dsqmdst
! ---------------------------------------------------------------------------------
SUBROUTINE dsqmcll( n, a, lda, ar, ldar, desc, comm )
!
! Double precision SQuare Matrix CoLLect
! This subroutine takes a distributed square matrix "a" and collects
! the blocks assigned to the processors into a replicated matrix "ar";
! the matrix is distributed as described by the descriptor "desc"
!
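! Communication pattern: every process contributes its nlax_ x nlax_ block
! through an mpi_gather rooted on process 0 of la_comm_ (the (0,0) process
! of the grid), which reassembles the blocks into "ar" at their global
! offsets; "ar" is then broadcast with root 0 over "comm", which may be
! larger than the descriptor communicator itself.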
USE kinds
USE descriptors
!
implicit none
!
INTEGER, INTENT(IN) :: n
INTEGER, INTENT(IN) :: ldar
REAL(DP) :: ar(ldar,*) ! merged matrix, replicated on all procs
INTEGER, INTENT(IN) :: lda
REAL(DP) :: a(lda,*)
INTEGER, INTENT(IN) :: desc( descla_siz_ )
INTEGER, INTENT(IN) :: comm
!
INTEGER :: i, j
#if defined __MPI
!
INTEGER :: np, nx, ipc, ipr, npr, npc, noff
INTEGER :: ierr, ir, ic, nr, nc
REAL(DP), ALLOCATABLE :: buf(:,:)
!
IF( desc( lambda_node_ ) > 0 ) THEN
!
np = desc( la_npr_ ) * desc( la_npc_ )
nx = desc( nlax_ )
npr = desc( la_npr_ )
npc = desc( la_npc_ )
!
IF( desc( la_myr_ ) == 0 .AND. desc( la_myc_ ) == 0 ) THEN
ALLOCATE( buf( nx, nx * np ) )
ELSE
ALLOCATE( buf( 1, 1 ) )
END IF
!
IF( lda /= nx ) &
CALL errore( " dsqmcll ", " inconsistent dimension lda ", lda )
!
IF( desc( la_n_ ) /= n ) &
CALL errore( " dsqmcll ", " inconsistent dimension n ", n )
!
CALL mpi_gather( a, nx*nx, mpi_double_precision, &
buf, nx*nx, mpi_double_precision, 0, desc( la_comm_ ) , ierr )
!
IF( desc( la_myr_ ) == 0 .AND. desc( la_myc_ ) == 0 ) THEN
DO ipc = 1, npc
CALL descla_local_dims( ic, nc, n, desc( la_nx_ ), npc, ipc-1 )
DO ipr = 1, npr
CALL descla_local_dims( ir, nr, n, desc( la_nx_ ), npr, ipr-1 )
noff = ( ipc - 1 + npc * ( ipr - 1 ) ) * nx
DO j = 1, nc
DO i = 1, nr
ar( i + ir - 1, j + ic - 1 ) = buf( i, j + noff )
END DO
END DO
END DO
END DO
END IF
!
DEALLOCATE( buf )
!
END IF
!
CALL mpi_bcast( ar, ldar * n, mpi_double_precision, 0, comm, ierr )
#else
DO j = 1, n
DO i = 1, n
ar( i, j ) = a( i, j )
END DO
END DO
#endif
RETURN
END SUBROUTINE dsqmcll
! ---------------------------------------------------------------------------------
SUBROUTINE dsqmwpb( n, a, lda, desc )
!
! Double precision SQuare Matrix WiPe Border subroutine
!
USE kinds
USE descriptors
!
IMPLICIT NONE
!
INTEGER, INTENT(IN) :: n
INTEGER, INTENT(IN) :: lda
REAL(DP) :: a(lda,*) ! matrix whose border (outside the local block) is wiped
INTEGER, INTENT(IN) :: desc( descla_siz_ )
!
INTEGER :: i, j
!
DO j = 1, desc( nlac_ )
DO i = desc( nlar_ ) + 1, desc( nlax_ )
a( i, j ) = 0.0_DP
END DO
END DO
DO j = desc( nlac_ ) + 1, desc( nlax_ )
DO i = 1, desc( nlax_ )
a( i, j ) = 0.0_DP
END DO
END DO
!
RETURN
END SUBROUTINE dsqmwpb
! ---------------------------------------------------------------------------------
SUBROUTINE dsqmsym( n, a, lda, desc )
!
! Double precision SQuare Matrix SYMmetrization
!
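! Strategy: grid-diagonal processes copy the upper triangle of their block
! onto the lower one locally; processes above the grid diagonal send their
! whole block to the process at transposed grid coordinates; processes
! below the diagonal receive it and transpose it in place. On exit the
! global matrix is symmetric, with the upper triangle as the reference copy.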
USE kinds
USE descriptors
!
IMPLICIT NONE
!
INTEGER, INTENT(IN) :: n
INTEGER, INTENT(IN) :: lda
REAL(DP) :: a(lda,*)
INTEGER, INTENT(IN) :: desc( descla_siz_ )
#if defined __MPI
INTEGER :: istatus( MPI_STATUS_SIZE )
#endif
INTEGER :: i, j
INTEGER :: comm
INTEGER :: nr, nc, dest, sreq, ierr, sour
REAL(DP) :: atmp
#if defined __MPI
IF( desc( lambda_node_ ) <= 0 ) THEN
RETURN
END IF
IF( n /= desc( la_n_ ) ) &
CALL errore( " dsqmsym ", " wrong global dim n ", n )
IF( lda /= desc( nlax_ ) ) &
CALL errore( " dsqmsym ", " wrong leading dim lda ", lda )
comm = desc( la_comm_ )
nr = desc( nlar_ )
nc = desc( nlac_ )
IF( desc( la_myc_ ) == desc( la_myr_ ) ) THEN
!
! diagonal block, procs work locally
!
DO j = 1, nc
DO i = j + 1, nr
a(i,j) = a(j,i)
END DO
END DO
!
ELSE IF( desc( la_myc_ ) > desc( la_myr_ ) ) THEN
!
! super diagonal block, procs send the block to sub diag.
!
CALL GRID2D_RANK( 'R', desc( la_npr_ ), desc( la_npc_ ), &
desc( la_myc_ ), desc( la_myr_ ), dest )
CALL mpi_isend( a, lda*nr, MPI_DOUBLE_PRECISION, dest, 1, comm, sreq, ierr )
!
ELSE IF( desc( la_myc_ ) < desc( la_myr_ ) ) THEN
!
! sub diagonal block, procs receive the block from super diag,
! then transpose locally
!
CALL GRID2D_RANK( 'R', desc( la_npr_ ), desc( la_npc_ ), &
desc( la_myc_ ), desc( la_myr_ ), sour )
CALL mpi_recv( a, lda*nc, MPI_DOUBLE_PRECISION, sour, 1, comm, istatus, ierr )
!
DO j = 1, nr
DO i = j + 1, nc
atmp = a(i,j)
a(i,j) = a(j,i)
a(j,i) = atmp
END DO
END DO
!
END IF
IF( desc( la_myc_ ) > desc( la_myr_ ) ) THEN
CALL MPI_Wait( sreq, istatus, ierr )
END IF
#else
DO j = 1, n
!
DO i = j + 1, n
!
a(i,j) = a(j,i)
!
END DO
!
END DO
#endif
RETURN
END SUBROUTINE dsqmsym
! ---------------------------------------------------------------------------------
SUBROUTINE dsqmred( na, a, lda, desca, nb, b, ldb, descb )
!
! Double precision SQuare Matrix REDistribution
!
! Copy a global "na * na" matrix locally stored in "a",
! and distributed as described by "desca", into a larger
! global "nb * nb" matrix stored in "b" and distributed
! as described in "descb".
!
! If you want to read it, get prepared for a headache!
! Written struggling by Carlo Cavazzoni.
!
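! Outline of the algorithm:
!   1) the descriptor communicator is split into column communicators
!      (processes sharing a grid column) and row communicators;
!   2) rows are redistributed first: within each column group every process
!      works out which of its old rows fall inside the new row block of each
!      destination process, packs them, and sends/copies them into the
!      intermediate buffer "ab" (new row layout, old column layout);
!   3) the columns of "ab" are then redistributed in the same way within
!      each row group, producing "b" in the new layout.
! Since nb >= na, the extra rows and columns of "b" are simply left zero.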
USE kinds
USE descriptors
!
IMPLICIT NONE
!
INTEGER, INTENT(IN) :: na
INTEGER, INTENT(IN) :: lda
REAL(DP) :: a(lda,*) ! matrix to be redistributed into b
INTEGER, INTENT(IN) :: desca( descla_siz_ )
INTEGER, INTENT(IN) :: nb
INTEGER, INTENT(IN) :: ldb
REAL(DP) :: b(ldb,*)
INTEGER, INTENT(IN) :: descb( descla_siz_ )
INTEGER :: ipc, ipr, npc, npr
INTEGER :: ipr_old, ir_old, nr_old, irx_old
INTEGER :: ipc_old, ic_old, nc_old, icx_old
INTEGER :: myrow, mycol, ierr, rank
INTEGER :: col_comm, row_comm, comm, sreq
INTEGER :: nr_new, ir_new, irx_new, ir, nr, nrtot, irb, ire
INTEGER :: nc_new, ic_new, icx_new, ic, nc, nctot, icb, ice
INTEGER :: ib, i, j, myid
INTEGER :: nrsnd( desca( la_npr_ ) )
INTEGER :: ncsnd( desca( la_npr_ ) )
INTEGER :: displ( desca( la_npr_ ) )
INTEGER :: irb_new( desca( la_npr_ ) )
INTEGER :: ire_new( desca( la_npr_ ) )
INTEGER :: icb_new( desca( la_npr_ ) )
INTEGER :: ice_new( desca( la_npr_ ) )
REAL(DP), ALLOCATABLE :: buf(:)
REAL(DP), ALLOCATABLE :: ab(:,:)
REAL(DP), ALLOCATABLE :: tst1(:,:)
REAL(DP), ALLOCATABLE :: tst2(:,:)
#if defined __MPI
INTEGER :: istatus( MPI_STATUS_SIZE )
#endif
IF( desca( lambda_node_ ) <= 0 ) THEN
RETURN
END IF
! preliminary consistency checks
IF( nb < na ) &
CALL errore( " dsqmred ", " nb < na, this sub. work only with nb >= na ", nb )
IF( nb /= descb( la_n_ ) ) &
CALL errore( " dsqmred ", " wrong global dim nb ", nb )
IF( na /= desca( la_n_ ) ) &
CALL errore( " dsqmred ", " wrong global dim na ", na )
IF( ldb /= descb( nlax_ ) ) &
CALL errore( " dsqmred ", " wrong leading dim ldb ", ldb )
IF( lda /= desca( nlax_ ) ) &
CALL errore( " dsqmred ", " wrong leading dim lda ", lda )
npr = desca( la_npr_ )
myrow = desca( la_myr_ )
npc = desca( la_npc_ )
mycol = desca( la_myc_ )
comm = desca( la_comm_ )
#if defined __MPI
! split communicator into row and col communicators
CALL MPI_Comm_rank( comm, myid, ierr )
CALL MPI_Comm_split( comm, mycol, myrow, col_comm, ierr )
CALL MPI_Comm_split( comm, myrow, mycol, row_comm, ierr )
CALL MPI_Comm_rank( col_comm, rank, ierr )
IF( rank /= myrow ) &
CALL errore( " dsqmred ", " building col_comm ", rank )
CALL MPI_Comm_rank( row_comm, rank, ierr )
IF( rank /= mycol ) &
CALL errore( " dsqmred ", " building row_comm ", rank )
ALLOCATE( buf( descb( nlax_ ) * descb( nlax_ ) ) )
ALLOCATE( ab( descb( nlax_ ), desca( nlax_ ) ) )
! write( 3000 + myid, * ) 'na, nb = ', na, nb
DO j = 1, descb( nlac_ )
DO i = 1, descb( nlar_ )
b( i, j ) = 0.0d0
END DO
END DO
ab = 0.0d0
! first redistribute rows, column groups work in parallel
DO ipr = 1, npr
!
CALL descla_local_dims( ir_new, nr_new, nb, descb( la_nx_ ), npr, ipr-1 )
!
irx_new = ir_new + nr_new - 1
! write( 3000 + myid, * ) 'ir_new, nr_new, irx_new = ', ir_new, nr_new, irx_new
!
DO ipr_old = 1, npr
!
CALL descla_local_dims( ir_old, nr_old, na, desca( la_nx_ ), npr, ipr_old-1 )
!
irx_old = ir_old + nr_old - 1
!
! write( 3000 + myid, * ) 'ir_old, nr_old, irx_old = ', ir_old, nr_old, irx_old
!
IF( ir_old >= ir_new .AND. ir_old <= irx_new ) THEN
!
nrsnd( ipr_old ) = MIN( nr_old, irx_new - ir_old + 1 )
irb = 1
ire = nrsnd( ipr_old )
irb_new( ipr_old ) = ir_old - ir_new + 1
ire_new( ipr_old ) = irb_new( ipr_old ) + nrsnd( ipr_old ) - 1
!
ELSE IF( ir_new >= ir_old .AND. ir_new <= irx_old ) THEN
!
nrsnd( ipr_old ) = irx_old - ir_new + 1
irb = ir_new - ir_old + 1
ire = nr_old
irb_new( ipr_old ) = 1
ire_new( ipr_old ) = nrsnd( ipr_old )
!
ELSE
nrsnd( ipr_old ) = 0
irb = 0
ire = 0
irb_new( ipr_old ) = 0
ire_new( ipr_old ) = 0
END IF
!
! write( 3000 + myid, * ) 'ipr_old, nrsnd = ', ipr_old, nrsnd( ipr_old )
! write( 3000 + myid, * ) 'ipr_old, irb, ire = ', ipr_old, irb, ire
! write( 3000 + myid, * ) 'ipr_old, irb_new, ire_new = ', ipr_old, irb_new( ipr_old ), ire_new( ipr_old )
!
IF( ( myrow == ipr_old - 1 ) .AND. ( nrsnd( ipr_old ) > 0 ) ) THEN
IF( myrow /= ipr - 1 ) THEN
ib = 0
DO j = 1, desca( nlac_ )
DO i = irb, ire
ib = ib + 1
buf( ib ) = a( i, j )
END DO
END DO
CALL mpi_isend( buf, ib, MPI_DOUBLE_PRECISION, ipr-1, ipr, col_comm, sreq, ierr )
ELSE
DO j = 1, desca( nlac_ )
ib = irb
DO i = irb_new( ipr_old ), ire_new( ipr_old )
ab( i, j ) = a( ib, j )
ib = ib + 1
END DO
END DO
END IF
END IF
!
IF( nrsnd( ipr_old ) /= ire - irb + 1 ) &
CALL errore( " dsqmred ", " something wrong with row 1 ", nrsnd( ipr_old ) )
IF( nrsnd( ipr_old ) /= ire_new( ipr_old ) - irb_new( ipr_old ) + 1 ) &
CALL errore( " dsqmred ", " something wrong with row 2 ", nrsnd( ipr_old ) )
!
nrsnd( ipr_old ) = nrsnd( ipr_old ) * desca( nlac_ )
!
END DO
!
IF( myrow == ipr - 1 ) THEN
DO ipr_old = 1, npr
IF( nrsnd( ipr_old ) > 0 ) THEN
IF( myrow /= ipr_old - 1 ) THEN
CALL mpi_recv( buf, nrsnd(ipr_old), MPI_DOUBLE_PRECISION, ipr_old-1, ipr, col_comm, istatus, ierr )
CALL MPI_GET_COUNT( istatus, MPI_DOUBLE_PRECISION, ib, ierr )
IF( ib /= nrsnd(ipr_old) ) &
CALL errore( " dsqmred ", " something wrong with row 3 ", ib )
ib = 0
DO j = 1, desca( nlac_ )
DO i = irb_new( ipr_old ), ire_new( ipr_old )
ib = ib + 1
ab( i, j ) = buf( ib )
END DO
END DO
END IF
END IF
END DO
ELSE
DO ipr_old = 1, npr
IF( myrow == ipr_old - 1 .AND. nrsnd( ipr_old ) > 0 ) THEN
CALL MPI_Wait( sreq, istatus, ierr )
END IF
END DO
END IF
!
END DO
! then redistribute cols, row groups work in parallel
DO ipc = 1, npc
!
CALL descla_local_dims( ic_new, nc_new, nb, descb( la_nx_ ), npc, ipc-1 )
!
icx_new = ic_new + nc_new - 1
!
! write( 3000 + myid, * ) 'ic_new, nc_new, icx_new = ', ic_new, nc_new, icx_new
!
DO ipc_old = 1, npc
!
CALL descla_local_dims( ic_old, nc_old, na, desca( la_nx_ ), npc, ipc_old-1 )
!
icx_old = ic_old + nc_old - 1
!
! write( 3000 + myid, * ) 'ic_old, nc_old, icx_old = ', ic_old, nc_old, icx_old
!
IF( ic_old >= ic_new .AND. ic_old <= icx_new ) THEN
!
ncsnd( ipc_old ) = MIN( nc_old, icx_new - ic_old + 1 )
icb = 1
ice = ncsnd( ipc_old )
icb_new( ipc_old ) = ic_old - ic_new + 1
ice_new( ipc_old ) = icb_new( ipc_old ) + ncsnd( ipc_old ) - 1
!
ELSE IF( ic_new >= ic_old .AND. ic_new <= icx_old ) THEN
!
ncsnd( ipc_old ) = icx_old - ic_new + 1
icb = ic_new - ic_old + 1
ice = nc_old
icb_new( ipc_old ) = 1
ice_new( ipc_old ) = ncsnd( ipc_old )
!
ELSE
ncsnd( ipc_old ) = 0
icb = 0
ice = 0
icb_new( ipc_old ) = 0
ice_new( ipc_old ) = 0
END IF
!
! write( 3000 + myid, * ) 'ipc_old, ncsnd = ', ipc_old, ncsnd( ipc_old )
! write( 3000 + myid, * ) 'ipc_old, icb, ice = ', ipc_old, icb, ice
! write( 3000 + myid, * ) 'ipc_old, icb_new, ice_new = ', ipc_old, icb_new( ipc_old ), ice_new( ipc_old )
IF( ( mycol == ipc_old - 1 ) .AND. ( ncsnd( ipc_old ) > 0 ) ) THEN
IF( mycol /= ipc - 1 ) THEN
ib = 0
DO j = icb, ice
DO i = 1, descb( nlax_ )
ib = ib + 1
buf( ib ) = ab( i, j )
END DO
END DO
CALL mpi_isend( buf, ib, MPI_DOUBLE_PRECISION, ipc-1, ipc, row_comm, sreq, ierr )
ELSE
ib = icb
DO j = icb_new( ipc_old ), ice_new( ipc_old )
DO i = 1, descb( nlax_ )
b( i, j ) = ab( i, ib )
END DO
ib = ib + 1
END DO
END IF
END IF
IF( ncsnd( ipc_old ) /= ice-icb+1 ) &
CALL errore( " dsqmred ", " something wrong with col 1 ", ncsnd( ipc_old ) )
IF( ncsnd( ipc_old ) /= ice_new( ipc_old ) - icb_new( ipc_old ) + 1 ) &
CALL errore( " dsqmred ", " something wrong with col 2 ", ncsnd( ipc_old ) )
!
ncsnd( ipc_old ) = ncsnd( ipc_old ) * descb( nlax_ )
!
END DO
!
IF( mycol == ipc - 1 ) THEN
DO ipc_old = 1, npc
IF( ncsnd( ipc_old ) > 0 ) THEN
IF( mycol /= ipc_old - 1 ) THEN
ib = icb_new( ipc_old )
CALL mpi_recv( b( 1, ib ), ncsnd(ipc_old), MPI_DOUBLE_PRECISION, ipc_old-1, ipc, row_comm, istatus, ierr )
CALL MPI_GET_COUNT( istatus, MPI_DOUBLE_PRECISION, ib, ierr )
IF( ib /= ncsnd(ipc_old) ) &
CALL errore( " dsqmred ", " something wrong with col 3 ", ib )
END IF
END IF
END DO
ELSE
DO ipc_old = 1, npc
IF( mycol == ipc_old - 1 .AND. ncsnd( ipc_old ) > 0 ) THEN
CALL MPI_Wait( sreq, istatus, ierr )
END IF
END DO
END IF
!
END DO
DEALLOCATE( ab )
DEALLOCATE( buf )
CALL mpi_comm_free( col_comm, ierr )
CALL mpi_comm_free( row_comm, ierr )
#if defined __PIPPO
! this is for debugging: it checks, through the global matrix, whether
! the two matrices (before and after the redistribution) coincide.
ALLOCATE( tst1( nb, nb ) )
ALLOCATE( tst2( nb, nb ) )
ALLOCATE( ab( nb, nb ) )
ab = 0.0d0
do j = 1, desca( nlac_ )
do i = 1, desca( nlar_ )
ab( i + desca( ilar_ ) - 1, j + desca( ilac_ ) - 1 ) = a( i , j )
end do
end do
CALL MPI_REDUCE( ab, tst1, nb*nb, MPI_DOUBLE_PRECISION, MPI_SUM, 0, comm, ierr )
ab = 0.0d0
do j = 1, descb( nlac_ )
do i = 1, descb( nlar_ )
ab( i + descb( ilar_ ) - 1, j + descb( ilac_ ) - 1 ) = b( i , j )
end do
end do
CALL MPI_REDUCE( ab, tst2, nb*nb, MPI_DOUBLE_PRECISION, MPI_SUM, 0, comm, ierr )
IF( myid == 0 ) THEN
write( 1000, * ) na, nb, SUM( ABS( tst2 - tst1 ) )
END IF
DEALLOCATE( ab )
DEALLOCATE( tst2 )
DEALLOCATE( tst1 )
#endif
#endif
RETURN
END SUBROUTINE dsqmred
! ---------------------------------------------------------------------------------
SUBROUTINE matscal_drv( m, n, beta, c, ldc, nb, dims, coor, comm )
!
implicit none
!
INTEGER, INTENT(IN) :: m, n
REAL(DP), INTENT(IN) :: beta
INTEGER, INTENT(IN) :: ldc
REAL(DP) :: c(ldc,*)
INTEGER, INTENT(IN) :: nb, coor(2), dims(2), comm
!
INTEGER :: i, j, nr, nc, ierr
!
INTEGER :: numroc
EXTERNAL :: numroc
nr = NUMROC( m, nb, coor(1), 0, dims(1) ) ! local rows of C
nc = NUMROC( n, nb, coor(2), 0, dims(2) ) ! local columns of C
IF( beta == 0.0_DP ) THEN
do j = 1, nc
do i = 1, nr
c(i,j) = 0.0_DP
end do
end do
ELSE
do j = 1, nc
do i = 1, nr
c(i,j) = beta * c(i,j)
end do
end do
END IF
RETURN
END SUBROUTINE
! ---------------------------------------------------------------------------------
!==----------------------------------------------==!
!
! Copyright (C) 2005 Carlo Cavazzoni
! This file is distributed under the terms of the
! GNU General Public License. See the file `License'
! in the root directory of the present distribution,
! or http://www.gnu.org/copyleft/gpl.txt .
!
SUBROUTINE rep_matmul_drv( TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC, comm )
!
! Parallel matrix multiplication with replicated matrix
!
implicit none
!
CHARACTER(LEN=1), INTENT(IN) :: transa, transb
INTEGER, INTENT(IN) :: m, n, k
REAL(DP), INTENT(IN) :: alpha, beta
INTEGER, INTENT(IN) :: lda, ldb, ldc
REAL(DP) :: a(lda,*), b(ldb,*), c(ldc,*)
INTEGER, INTENT(IN) :: comm
!
! DGEMM PERFORMS ONE OF THE MATRIX-MATRIX OPERATIONS
!
! C := ALPHA*OP( A )*OP( B ) + BETA*C,
!
! WHERE OP( X ) IS ONE OF
!
! OP( X ) = X OR OP( X ) = X',
!
! ALPHA AND BETA ARE SCALARS, AND A, B AND C ARE MATRICES, WITH OP( A )
! AN M BY K MATRIX, OP( B ) A K BY N MATRIX AND C AN M BY N MATRIX.
!
!
!
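! The work is split by blocks of rows of C: with q = m / nproc and
! r = MOD( m, nproc ), the first r processes own q + 1 rows and the others
! own q. As a purely illustrative example, m = 10 on nproc = 4 gives local
! row counts 3, 3, 2, 2 starting at global rows 1, 4, 7, 9. Each process
! computes its rows of C with a local DGEMM and the full C is then
! replicated on all processes through a loop of MPI_BCAST calls.
!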
#if defined __MPI
!
INTEGER :: ME, I, II, J, JJ, IP, SOUR, DEST, INFO, IERR, ioff, ldx
INTEGER :: NB, IB_S, NB_SOUR, IB_SOUR, IBUF
INTEGER :: nproc, mpime, q, r
REAL(8), ALLOCATABLE :: auxa( : )
REAL(8), ALLOCATABLE :: auxc( : )
!
! ... BODY
!
CALL MPI_COMM_SIZE(comm, NPROC, IERR)
CALL MPI_COMM_RANK(comm, MPIME, IERR)
IF ( NPROC == 1 ) THEN
! if there is only one proc there is no need to use the parallel algorithm
CALL DGEMM(TRANSA, TRANSB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc)
RETURN
END IF
ME = MPIME + 1
Q = INT( m / NPROC )
R = MOD( m , NPROC )
! ... Find out the number of elements in the local block
! along "M", the first dimension of matrix A
NB = Q
IF( ME <= R ) NB = NB + 1
! ... Find out the global index of the local first row
IF( ME <= R ) THEN
ib_s = (Q+1)*(ME-1) + 1
ELSE
ib_s = Q*(ME-1) + R + 1
END IF
ldx = m / nproc + 1
ALLOCATE( auxa( MAX( n, m ) * ldx ) )
ALLOCATE( auxc( MAX( n, m ) * ldx ) )
IF( TRANSA == 'N' .OR. TRANSA == 'n' ) THEN
ibuf = 0
ioff = ib_s - 1
DO J = 1, k
DO I = 1, NB
auxa( ibuf + I ) = A( I + ioff, J )
END DO
ibuf = ibuf + ldx
END DO
ELSE
ibuf = 0
ioff = ib_s - 1
DO J = 1, k
DO I = 1, NB
auxa( ibuf + I ) = A( J, I + ioff )
END DO
ibuf = ibuf + ldx
END DO
!ioff = ib_s - 1
!call mytranspose( A( 1, ioff + 1 ), lda, auxa(1), ldx, m, nb)
END IF
IF( beta /= 0.0_DP ) THEN
ibuf = 0
ioff = ib_s - 1
DO J = 1, n
DO I = 1, NB
auxc( ibuf + I ) = C( I + ioff, J )
END DO
ibuf = ibuf + ldx
END DO
END IF
CALL DGEMM( 'N', transb, nb, n, k, alpha, auxa(1), ldx, B, ldb, beta, auxc(1), ldx )
! ... Here processors exchange blocks
DO IP = 0, NPROC-1
! ... Find out the number of elements in the block of processor SOUR
NB_SOUR = q
IF( (IP+1) .LE. r ) NB_SOUR = NB_SOUR+1
! ... Find out the global index of the first row owned by SOUR
IF( (IP+1) .LE. r ) THEN
ib_sour = (Q+1)*IP + 1
ELSE
ib_sour = Q*IP + R + 1
END IF
IF( mpime == ip ) auxa(1:n*ldx) = auxc(1:n*ldx)
CALL MPI_BCAST( auxa(1), ldx*n, mpi_double_precision, ip, comm, IERR)
IBUF = 0
ioff = IB_SOUR - 1
DO J = 1, N
DO I = 1, NB_SOUR
C( I + ioff, J ) = AUXA( IBUF + I )
END DO
IBUF = IBUF + ldx
END DO
END DO
DEALLOCATE( auxa, auxc )
#else
! if we are not compiling with __MPI this is equivalent to a BLAS call
CALL DGEMM(TRANSA, TRANSB, m, N, k, alpha, A, lda, B, ldb, beta, C, ldc)
#endif
RETURN
END SUBROUTINE rep_matmul_drv
SUBROUTINE zrep_matmul_drv( TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC, comm )
!
! Parallel matrix multiplication with replicated matrix
!
implicit none
!
CHARACTER(LEN=1), INTENT(IN) :: transa, transb
INTEGER, INTENT(IN) :: m, n, k
COMPLEX(DP), INTENT(IN) :: alpha, beta
INTEGER, INTENT(IN) :: lda, ldb, ldc
COMPLEX(DP) :: a(lda,*), b(ldb,*), c(ldc,*)
INTEGER, INTENT(IN) :: comm
!
! ZGEMM PERFORMS ONE OF THE MATRIX-MATRIX OPERATIONS
!
! C := ALPHA*OP( A )*OP( B ) + BETA*C,
!
! WHERE OP( X ) IS ONE OF
!
! OP( X ) = X OR OP( X ) = X',
!
! ALPHA AND BETA ARE SCALARS, AND A, B AND C ARE MATRICES, WITH OP( A )
! AN M BY K MATRIX, OP( B ) A K BY N MATRIX AND C AN M BY N MATRIX.
!
!
!
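! Same row-block decomposition as rep_matmul_drv; note that for
! TRANSA = 'C' the packing loop below stores the conjugated rows of A in
! the local buffer, so that the local multiply can always be issued as
! ZGEMM( 'N', transb, ... ).
!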
#if defined __MPI
!
INTEGER :: ME, I, II, J, JJ, IP, SOUR, DEST, INFO, IERR, ioff, ldx
INTEGER :: NB, IB_S, NB_SOUR, IB_SOUR, IBUF
INTEGER :: nproc, mpime, q, r
COMPLEX(DP), ALLOCATABLE :: auxa( : )
COMPLEX(DP), ALLOCATABLE :: auxc( : )
!
! ... BODY
!
CALL MPI_COMM_SIZE(comm, NPROC, IERR)
CALL MPI_COMM_RANK(comm, MPIME, IERR)
IF ( NPROC == 1 ) THEN
! if there is only one proc there is no need to use the parallel algorithm
CALL ZGEMM(TRANSA, TRANSB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc)
RETURN
END IF
ME = MPIME + 1
Q = INT( m / NPROC )
R = MOD( m , NPROC )
! ... Find out the number of elements in the local block
! along "M", the first dimension of matrix A
NB = Q
IF( ME <= R ) NB = NB + 1
! ... Find out the global index of the local first row
IF( ME <= R ) THEN
ib_s = (Q+1)*(ME-1) + 1
ELSE
ib_s = Q*(ME-1) + R + 1
END IF
ldx = m / nproc + 1
ALLOCATE( auxa( MAX( n, m ) * ldx ) )
ALLOCATE( auxc( MAX( n, m ) * ldx ) )
IF( TRANSA == 'N' .OR. TRANSA == 'n' ) THEN
ibuf = 0
ioff = ib_s - 1
DO J = 1, k
DO I = 1, NB
auxa( ibuf + I ) = A( I + ioff, J )
END DO
ibuf = ibuf + ldx
END DO
ELSE
ibuf = 0
ioff = ib_s - 1
DO J = 1, k
DO I = 1, NB
auxa( ibuf + I ) = CONJG( A( J, I + ioff ) )
END DO
ibuf = ibuf + ldx
END DO
!ioff = ib_s - 1
!call mytranspose( A( 1, ioff + 1 ), lda, auxa(1), ldx, m, nb)
END IF
IF( beta /= 0.0_DP ) THEN
ibuf = 0
ioff = ib_s - 1
DO J = 1, n
DO I = 1, NB
auxc( ibuf + I ) = C( I + ioff, J )
END DO
ibuf = ibuf + ldx
END DO
END IF
CALL ZGEMM( 'N', transb, nb, n, k, alpha, auxa(1), ldx, B, ldb, beta, auxc(1), ldx )
! ... Here processors exchange blocks
DO IP = 0, NPROC-1
! ... Find out the number of elements in the block of processor SOUR
NB_SOUR = q
IF( (IP+1) .LE. r ) NB_SOUR = NB_SOUR+1
! ... Find out the global index of the first row owned by SOUR
IF( (IP+1) .LE. r ) THEN
ib_sour = (Q+1)*IP + 1
ELSE
ib_sour = Q*IP + R + 1
END IF
IF( mpime == ip ) auxa(1:n*ldx) = auxc(1:n*ldx)
CALL MPI_BCAST( auxa(1), ldx*n, mpi_double_complex, ip, comm, IERR)
IBUF = 0
ioff = IB_SOUR - 1
DO J = 1, N
DO I = 1, NB_SOUR
C( I + ioff, J ) = AUXA( IBUF + I )
END DO
IBUF = IBUF + ldx
END DO
END DO
DEALLOCATE( auxa, auxc )
#else
! if we are not compiling with __MPI this is equivalent to a BLAS call
CALL ZGEMM(TRANSA, TRANSB, m, N, k, alpha, A, lda, B, ldb, beta, C, ldc)
#endif
RETURN
END SUBROUTINE zrep_matmul_drv
!==----------------------------------------------==!
END MODULE parallel_toolkit
!==----------------------------------------------==!
!
! ... some simple routines for parallel linear algebra (the matrices are
! ... always replicated on all the cpus)
!
! ... written by carlo sbraccia ( 2006 )
!
!----------------------------------------------------------------------------
SUBROUTINE para_dgemm( transa, transb, m, n, k, &
alpha, a, lda, b, ldb, beta, c, ldc, comm )
!----------------------------------------------------------------------------
!
! ... trivial parallelization (splitting matrix B by columns) of DGEMM
!
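! ... Each process handles ncol = n / nproc consecutive columns of C (and
! ... the corresponding columns or rows of B, depending on transb); the
! ... last process also takes the remainder. As an illustrative example,
! ... n = 10 on nproc = 4 gives the column ranges 1-2, 3-4, 5-6 and 7-10.
! ... After the local DGEMM the column blocks of C are replicated on all
! ... processes with mp_bcast.
!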
USE kinds, ONLY : DP
USE mp, ONLY : mp_bcast
USE parallel_include
USE parallel_toolkit
!
IMPLICIT NONE
!
CHARACTER(LEN=1), INTENT(IN) :: transa, transb
INTEGER, INTENT(IN) :: m, n, k
REAL(DP), INTENT(IN) :: alpha, beta
INTEGER, INTENT(IN) :: lda, ldb, ldc
REAL(DP), INTENT(INOUT) :: a(lda,*), b(ldb,*), c(ldc,*)
INTEGER, INTENT(IN) :: comm
!
INTEGER :: i, mpime, nproc, ierr
INTEGER :: ncol, i0, i1
INTEGER, ALLOCATABLE :: i0a(:), i1a(:)
!
! ... quick return if possible
!
IF ( m == 0 .OR. n == 0 .OR. &
( ( alpha == 0.0_DP .OR. k == 0 ) .AND. beta == 1.0_DP ) ) RETURN
!
#if defined (__XD1)
!
CALL rep_matmul_drv( transa, transb, m, n, k, &
alpha, a, lda, b, ldb, beta, c, ldc, comm )
RETURN
!
#endif
#if defined (__MPI)
!
CALL MPI_COMM_SIZE( comm, nproc, ierr )
CALL MPI_COMM_RANK( comm, mpime, ierr )
!
#else
!
nproc = 1
mpime = 0
!
#endif
!
ncol = n / nproc
!
ALLOCATE( i0a( 0:nproc-1 ), i1a( 0:nproc-1 ) )
!
DO i = 0, nproc -1
!
i0a(i) = i*ncol + 1
i1a(i) = ( i + 1 )*ncol
!
END DO
!
i1a(nproc-1) = n
!
i0 = i0a(mpime)
i1 = i1a(mpime)
!
IF ( transb == 'n' .OR. transb == 'N' ) THEN
!
CALL DGEMM( transa, transb, m, i1 - i0 + 1, k, &
alpha, a, lda, b(1,i0), ldb, beta, c(1,i0), ldc )
!
ELSE
!
CALL DGEMM( transa, transb, m, i1 - i0 + 1, k, &
alpha, a, lda, b(i0,1), ldb, beta, c(1,i0), ldc )
!
END IF
!
DO i = 0 , nproc - 1
!
CALL mp_bcast( c(1:ldc,i0a(i):i1a(i)), i, comm )
!
END DO
!
DEALLOCATE( i0a, i1a )
!
RETURN
!
END SUBROUTINE para_dgemm
!
!----------------------------------------------------------------------------
SUBROUTINE para_zgemm( transa, transb, m, n, k, &
alpha, a, lda, b, ldb, beta, c, ldc, comm )
!----------------------------------------------------------------------------
!
! ... trivial parallelization (splitting matrix B by columns) of ZGEMM
!
USE kinds, ONLY : DP
USE mp, ONLY : mp_bcast
USE parallel_include
USE parallel_toolkit
!
IMPLICIT NONE
!
CHARACTER(LEN=1), INTENT(IN) :: transa, transb
INTEGER, INTENT(IN) :: m, n, k
COMPLEX(DP), INTENT(IN) :: alpha, beta
INTEGER, INTENT(IN) :: lda, ldb, ldc
COMPLEX(DP), INTENT(INOUT) :: a(lda,*), b(ldb,*), c(ldc,*)
INTEGER, INTENT(IN) :: comm
!
INTEGER :: i, mpime, nproc, ierr
INTEGER :: ncol, i0, i1
INTEGER, ALLOCATABLE :: i0a(:), i1a(:)
!
COMPLEX(DP), PARAMETER :: ONE = (1.0_DP,0.0_DP), ZERO = ( 0.0_DP, 0.0_DP )
!
! ... quick return if possible
!
IF ( m == 0 .OR. n == 0 .OR. &
( ( alpha == 0.0_DP .OR. k == 0 ) .AND. beta == ONE ) ) RETURN
!
#if defined (__XD1)
!
CALL zrep_matmul_drv( transa, transb, m, n, k, &
alpha, a, lda, b, ldb, beta, c, ldc, comm )
RETURN
!
#endif
!
#if defined (__MPI)
!
CALL MPI_COMM_SIZE( comm, nproc, ierr )
CALL MPI_COMM_RANK( comm, mpime, ierr )
!
#else
!
nproc = 1
mpime = 0
!
#endif
!
ncol = n / nproc
!
ALLOCATE( i0a( 0:nproc-1 ), i1a( 0:nproc-1 ) )
!
DO i = 0, nproc -1
!
i0a(i) = i*ncol + 1
i1a(i) = ( i + 1 )*ncol
!
END DO
!
i1a(nproc-1) = n
!
i0 = i0a(mpime)
i1 = i1a(mpime)
!
IF ( transb == 'n' .OR. transb == 'N' ) THEN
!
CALL ZGEMM( transa, transb, m, i1 - i0 + 1, k, &
alpha, a, lda, b(1,i0), ldb, beta, c(1,i0), ldc )
!
ELSE
!
CALL ZGEMM( transa, transb, m, i1 - i0 + 1, k, &
alpha, a, lda, b(i0,1), ldb, beta, c(1,i0), ldc )
!
END IF
!
DO i = 0 , nproc - 1
!
CALL mp_bcast( c(1:ldc,i0a(i):i1a(i)), i, comm )
!
END DO
!
DEALLOCATE( i0a, i1a )
!
RETURN
!
END SUBROUTINE para_zgemm
!
!----------------------------------------------------------------------------
SUBROUTINE para_dgemv( trans, m, n, alpha, &
a, lda, x, incx, beta, y, incy, comm )
!----------------------------------------------------------------------------
!
! ... trivial parallelization (splitting matrix A by rows) of DGEMV
!
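! ... The matrix is partitioned in blocks of nrow = m / nproc consecutive
! ... rows (the last process also takes the remainder). The strided vector
! ... y is first staged into the contiguous work array ydum, each process
! ... updates its own slice with a local DGEMV, the slices are replicated
! ... with mp_bcast, and the result is copied back into y with the original
! ... stride incy.
!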
USE kinds, ONLY : DP
USE mp, ONLY : mp_bcast
USE parallel_include
!
IMPLICIT NONE
!
CHARACTER(LEN=1), INTENT(IN) :: trans
INTEGER, INTENT(IN) :: m, n
REAL(DP), INTENT(IN) :: alpha, beta
INTEGER, INTENT(IN) :: lda, incx, incy
REAL(DP), INTENT(INOUT) :: a(lda,*)
REAL(DP), INTENT(INOUT) :: x(*), y(*)
INTEGER, INTENT(IN) :: comm
!
INTEGER :: i, j, mpime, nproc, ierr
INTEGER :: nrow, i0, i1, dim, ydum_size
INTEGER, ALLOCATABLE :: i0a(:), i1a(:)
REAL(DP), ALLOCATABLE :: ydum(:)
!
! ... quick return if possible
!
IF ( m == 0 .OR. n == 0 .OR. &
( alpha == 0.0_DP .AND. beta == 1.0_DP ) ) RETURN
!
#if defined (__MPI)
!
CALL MPI_COMM_SIZE( comm, nproc, ierr )
CALL MPI_COMM_RANK( comm, mpime, ierr )
!
#else
!
nproc = 1
mpime = 0
!
#endif
!
nrow = m / nproc
!
ALLOCATE( i0a( 0:nproc-1 ), i1a( 0:nproc-1 ) )
!
DO i = 0, nproc -1
!
i0a(i) = i*nrow + 1
i1a(i) = ( i + 1 )*nrow
!
END DO
!
i1a(nproc-1) = m
!
i0 = i0a(mpime)
i1 = i1a(mpime)
!
IF ( trans == 'n' .OR. trans == 'N' ) THEN
!
dim = ( 1 + ( m - 1 )*ABS( incy ) )
!
ydum_size = m
!
ELSE
!
dim = ( 1 + ( n - 1 )*ABS( incy ) )
!
ydum_size = n
!
END IF
!
ALLOCATE( ydum( ydum_size ) )
!
i = 0
!
DO j = 1, dim, incy
!
i = i + 1
!
IF ( i < i0 .OR. i > i1 ) CYCLE
!
ydum(i) = y(j)
!
END DO
!
IF ( trans == 'n' .OR. trans == 'N' ) THEN
!
CALL DGEMV( trans, i1 - i0 + 1, n, &
alpha, a(i0,1), lda, x, incx, beta, ydum(i0), 1 )
!
ELSE
!
CALL DGEMV( trans, i1 - i0 + 1, n, &
alpha, a(1,i0), lda, x, incx, beta, ydum(i0), 1 )
!
END IF
!
DO i = 0 , nproc - 1
!
CALL mp_bcast( ydum(i0a(i):i1a(i)), i, comm )
!
END DO
!
i = 0
!
DO j = 1, dim, incy
!
i = i + 1
!
y(j) = ydum(i)
!
END DO
!
DEALLOCATE( ydum, i0a, i1a )
!
RETURN
!
END SUBROUTINE para_dgemv
!
!----------------------------------------------------------------------------
SUBROUTINE para_zgemv( trans, m, n, alpha, &
a, lda, x, incx, beta, y, incy, comm )
!----------------------------------------------------------------------------
!
! ... trivial parallelization (splitting matrix A by rows) of ZGEMV
!
USE kinds, ONLY : DP
USE mp, ONLY : mp_bcast
USE parallel_include
!
IMPLICIT NONE
!
CHARACTER(LEN=1), INTENT(IN) :: trans
INTEGER, INTENT(IN) :: m, n
COMPLEX(DP), INTENT(IN) :: alpha, beta
INTEGER, INTENT(IN) :: lda, incx, incy
COMPLEX(DP), INTENT(INOUT) :: a(lda,*)
COMPLEX(DP), INTENT(INOUT) :: x(*), y(*)
INTEGER, INTENT(IN) :: comm
!
INTEGER :: i, j, mpime, nproc, ierr
INTEGER :: nrow, i0, i1, dim, ydum_size
INTEGER, ALLOCATABLE :: i0a(:), i1a(:)
COMPLEX(DP), ALLOCATABLE :: ydum(:)
!
! ... quick return if possible
!
IF ( m == 0 .OR. n == 0 .OR. &
( alpha == 0.0_DP .AND. beta == 1.0_DP ) ) RETURN
!
#if defined (__MPI)
!
CALL MPI_COMM_SIZE( comm, nproc, ierr )
CALL MPI_COMM_RANK( comm, mpime, ierr )
!
#else
!
nproc = 1
mpime = 0
!
#endif
!
nrow = m / nproc
!
ALLOCATE( i0a( 0:nproc-1 ), i1a( 0:nproc-1 ) )
!
DO i = 0, nproc -1
!
i0a(i) = i*nrow + 1
i1a(i) = ( i + 1 )*nrow
!
END DO
!
i1a(nproc-1) = m
!
i0 = i0a(mpime)
i1 = i1a(mpime)
!
IF ( trans == 'n' .OR. trans == 'N' ) THEN
!
dim = ( 1 + ( m - 1 )*ABS( incy ) )
!
ydum_size = m
!
ELSE
!
dim = ( 1 + ( n - 1 )*ABS( incy ) )
!
ydum_size = n
!
END IF
!
ALLOCATE( ydum( ydum_size ) )
!
i = 0
!
DO j = 1, dim, incy
!
i = i + 1
!
IF ( i < i0 .OR. i > i1 ) CYCLE
!
ydum(i) = y(j)
!
END DO
!
IF ( trans == 'n' .OR. trans == 'N' ) THEN
!
CALL ZGEMV( trans, i1 - i0 + 1, n, &
alpha, a(i0,1), lda, x, incx, beta, ydum(i0), 1 )
!
ELSE
!
CALL ZGEMV( trans, i1 - i0 + 1, n, &
alpha, a(1,i0), lda, x, incx, beta, ydum(i0), 1 )
!
END IF
!
DO i = 0 , nproc - 1
!
CALL mp_bcast( ydum(i0a(i):i1a(i)), i, comm )
!
END DO
!
i = 0
!
DO j = 1, dim, incy
!
i = i + 1
!
y(j) = ydum(i)
!
END DO
!
DEALLOCATE( ydum, i0a, i1a )
!
RETURN
!
END SUBROUTINE para_zgemv
!
!----------------------------------------------------------------------------
SUBROUTINE para_dcholdc( n, a, lda, comm )
!----------------------------------------------------------------------------
!
! ... trivial parallelization (using a parallel version of DGEMV) of
! ... the Cholesky decomposition (equivalent to DPOTF2)
!
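! ... Column-by-column algorithm; at step i the updates are
! ...
! ...    a(i,i) <- SQRT( a(i,i) - SUM_{k<i} a(i,k)**2 )
! ...    a(j,i) <- ( a(j,i) - SUM_{k<i} a(j,k)*a(i,k) ) / a(i,i) ,  j > i
! ...
! ... where the sums over k are the DDOT and the para_dgemv call below;
! ... the final FORALL clears the strictly upper triangle.
!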
USE kinds, ONLY : DP
!
IMPLICIT NONE
!
INTEGER, INTENT(IN) :: n
INTEGER, INTENT(IN) :: lda
REAL(DP), INTENT(INOUT) :: a(lda,*)
INTEGER, INTENT(IN) :: comm
!
INTEGER :: i, j
REAL(DP) :: aii
REAL(DP), EXTERNAL :: DDOT
!
!
DO i = 1, n
!
aii = a(i,i) - DDOT( i-1, a(i,1), lda, a(i,1), lda )
!
IF ( aii < 0.0_DP ) &
CALL errore( 'para_dcholdc', 'a is not positive definite', i )
!
aii = SQRT( aii )
!
a(i,i) = aii
!
IF ( i < n ) THEN
!
CALL para_dgemv( 'N', n-i, i-1, -1.0_DP, a(i+1,1), &
lda, a(i,1), lda, 1.0_DP, a(i+1,i), 1, comm )
!
CALL DSCAL( n-i, 1.0_DP / aii, a(i+1,i), 1 )
!
END IF
!
END DO
!
FORALL( i = 1:n, j = 1:n, j > i ) a(i,j) = 0.0_DP
!
RETURN
!
END SUBROUTINE para_dcholdc
!
!----------------------------------------------------------------------------
SUBROUTINE para_zcholdc( n, a, lda, comm )
!----------------------------------------------------------------------------
!
! ... trivial parallelization (using a parallel version of ZGEMV) of
! ... the Cholesky decomposition (equivalent to ZPOTF2)
!
USE kinds, ONLY : DP
!
IMPLICIT NONE
!
INTEGER, INTENT(IN) :: n
INTEGER, INTENT(IN) :: lda
COMPLEX(DP), INTENT(INOUT) :: a(lda,*)
INTEGER, INTENT(IN) :: comm
!
INTEGER :: i, j
REAL(DP) :: aii
COMPLEX(DP), EXTERNAL :: ZDOTC
!
COMPLEX(DP), PARAMETER :: ONE = (1.0_DP,0.0_DP), ZERO = ( 0.0_DP, 0.0_DP )
!
!
DO i = 1, n
!
aii = REAL( a(i,i) ) - ZDOTC( i-1, a(i,1), lda, a(i,1), lda )
!
IF ( aii < 0.0_DP ) &
CALL errore( 'para_zcholdc', 'a is not positive definite', i )
!
aii = SQRT( aii )
!
a(i,i) = aii
!
IF ( i < n ) THEN
!
CALL ZLACGV( i-1, a(i,1), lda )
!
CALL para_zgemv( 'N', n-i, i-1, -ONE, a(i+1,1), &
lda, a(i,1), lda, ONE, a(i+1,i), 1, comm )
!
CALL ZLACGV( i-1, a(i,1), lda )
!
CALL ZDSCAL( n-i, 1.0_DP / aii, a(i+1,i), 1 )
!
END IF
!
END DO
!
FORALL( i = 1:n, j = 1:n, j > i ) a(i,j) = ZERO
!
RETURN
!
END SUBROUTINE para_zcholdc
!
!----------------------------------------------------------------------------
SUBROUTINE para_dtrtri( n, a, lda, comm )
!----------------------------------------------------------------------------
!
! ... parallel inversion of a lower triangular matrix done distributing
! ... by columns ( the number of columns assigned to each processor is
! ... chosen to optimize the load balance )
!
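! ... The split is non-uniform: the boundaries below follow from a simple
! ... cost model in which column j carries a cost proportional to its
! ... length ( n - j ) below the diagonal. Asking that the first x columns
! ... carry the fraction f = (i+1)/nproc of the total cost,
! ... n**2 - (n-x)**2 = f * n**2, gives x/n = 1 - SQRT( 1 - f ), which is
! ... the xfrac formula used below.
!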
USE kinds, ONLY : DP
USE mp, ONLY : mp_bcast
USE parallel_include
!
IMPLICIT NONE
!
INTEGER, INTENT(IN) :: n
INTEGER, INTENT(IN) :: lda
REAL(DP), INTENT(INOUT) :: a(lda,*)
INTEGER, INTENT(IN) :: comm
!
INTEGER :: i, j, k
INTEGER :: i0, i1, mpime, nproc, ierr
INTEGER, ALLOCATABLE :: i0a(:), i1a(:)
REAL(DP) :: an, xfrac
REAL(DP) :: sum
REAL(DP), ALLOCATABLE :: inva(:,:)
!
!
#if defined (__MPI)
!
CALL MPI_COMM_SIZE( comm, nproc, ierr )
CALL MPI_COMM_RANK( comm, mpime, ierr )
!
#else
!
nproc = 1
mpime = 0
!
#endif
!
ALLOCATE( i0a( 0:nproc-1 ), i1a( 0:nproc-1 ) )
!
an = 1.0_DP / DBLE( nproc )
!
i0a(0) = 1
!
DO i = 0, nproc - 2
!
xfrac = 1.0_DP - SQRT( 1.0_DP - DBLE( i+1 )*an )
!
i1a(i) = ANINT( xfrac*n )
i0a(i+1) = i1a(i) + 1
!
END DO
!
i1a(nproc-1) = n
!
i0 = i0a(mpime)
i1 = i1a(mpime)
!
ALLOCATE( inva( n, i0:i1 ) )
!
inva(:,:) = 0.0_DP
!
DO i = i0, i1
!
inva(i,i) = 1.0_DP / a(i,i)
!
DO j = i + 1, n
!
sum = 0.0_DP
!
DO k = i, j - 1
!
sum = sum + inva(k,i)*a(j,k)
!
END DO
!
inva(j,i) = - sum / a(j,j)
!
END DO
!
END DO
!
a(1:lda,1:n) = 0.0_DP
a(1:n,i0:i1) = inva(:,:)
!
DEALLOCATE( inva )
!
DO i = 0 , nproc - 1
!
#if defined __XD1
CALL BCAST_REAL( a(1,i0a(i)), i1a(i)-i0a(i)+1, i, comm, ierr )
#else
CALL mp_bcast( a(1:n,i0a(i):i1a(i)), i, comm )
#endif
!
END DO
!
DEALLOCATE( i0a, i1a )
!
RETURN
!
END SUBROUTINE para_dtrtri
!
!----------------------------------------------------------------------------
SUBROUTINE para_ztrtri( n, a, lda, comm )
!----------------------------------------------------------------------------
!
! ... parallel inversion of a lower triangular matrix done distributing
! ... by columns ( the number of columns assigned to each processor is
! ... chosen to optimize the load balance in the limit of large matrices )
!
USE kinds, ONLY : DP
USE mp, ONLY : mp_bcast
USE parallel_include
!
IMPLICIT NONE
!
INTEGER, INTENT(IN) :: n
INTEGER, INTENT(IN) :: lda
COMPLEX(DP), INTENT(INOUT) :: a(lda,*)
INTEGER, INTENT(IN) :: comm
!
INTEGER :: i, j, k
INTEGER :: i0, i1, mpime, nproc, ierr
INTEGER, ALLOCATABLE :: i0a(:), i1a(:)
REAL(DP) :: an, xfrac
COMPLEX(DP) :: sum
COMPLEX(DP), ALLOCATABLE :: inva(:,:)
!
COMPLEX(DP), PARAMETER :: ONE = (1.0_DP,0.0_DP), ZERO = ( 0.0_DP, 0.0_DP )
!
!
#if defined (__MPI)
!
CALL MPI_COMM_SIZE( comm, nproc, ierr )
CALL MPI_COMM_RANK( comm, mpime, ierr )
!
#else
!
nproc = 1
mpime = 0
!
#endif
!
ALLOCATE( i0a( 0:nproc-1 ), i1a( 0:nproc-1 ) )
!
an = 1.0_DP / DBLE( nproc )
!
i0a(0) = 1
!
DO i = 0, nproc - 2
!
xfrac = 1.0_DP - SQRT( 1.0_DP - DBLE( i+1 )*an )
!
i1a(i) = ANINT( xfrac*n )
i0a(i+1) = i1a(i) + 1
!
END DO
!
i1a(nproc-1) = n
!
i0 = i0a(mpime)
i1 = i1a(mpime)
!
ALLOCATE( inva( n, i0:i1 ) )
!
inva(:,:) = ZERO
!
DO i = i0, i1
!
inva(i,i) = ONE / a(i,i)
!
DO j = i + 1, n
!
sum = ZERO
!
DO k = i, j - 1
!
sum = sum + inva(k,i)*a(j,k)
!
END DO
!
inva(j,i) = - sum / a(j,j)
!
END DO
!
END DO
!
a(1:lda,1:n) = ZERO
a(1:n,i0:i1) = inva(:,:)
!
DEALLOCATE( inva )
!
DO i = 0 , nproc - 1
!
CALL mp_bcast( a(1:n,i0a(i):i1a(i)), i, comm )
!
END DO
!
DEALLOCATE( i0a, i1a )
!
RETURN
!
END SUBROUTINE para_ztrtri
!=----------------------------------------------------------------------------=!
!
!
! Cannon's algorithms for parallel matrix multiplication
! written by Carlo Cavazzoni
!
!
!
SUBROUTINE sqr_mm_cannon( transa, transb, n, alpha, a, lda, b, ldb, beta, c, ldc, desc )
!
! Parallel square matrix multiplication with Cannon's algorithm
!
USE kinds, ONLY : DP
USE descriptors, ONLY : ilar_ , nlar_ , ilac_ , nlac_ , nlax_ , la_npc_ , &
la_comm_ , lambda_node_ , la_npr_ , la_npc_ , la_myr_ , la_myc_
!
IMPLICIT NONE
!
CHARACTER(LEN=1), INTENT(IN) :: transa, transb
INTEGER, INTENT(IN) :: n
REAL(DP), INTENT(IN) :: alpha, beta
INTEGER, INTENT(IN) :: lda, ldb, ldc
REAL(DP) :: a(lda,*), b(ldb,*), c(ldc,*)
INTEGER, INTENT(IN) :: desc(*)
!
! performs one of the matrix-matrix operations
!
! C := ALPHA*OP( A )*OP( B ) + BETA*C,
!
! where op( x ) is one of
!
! OP( X ) = X OR OP( X ) = X',
!
! alpha and beta are scalars, and a, b and c are square matrices
!
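! Outline of the Cannon-style scheme used here: each process first skews
! its A block ( rowid + 1 ) positions to the west and its B block
! ( colid + 1 ) positions to the north ( a combined shift/exchange is used
! instead when the corresponding operand is transposed ); then, for
! np - 1 further steps, A is shifted one position east, B one position
! south, and a local GEMM on the nb x nb blocks is accumulated into C.
!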
#if defined (__MPI)
!
include 'mpif.h'
!
#endif
!
integer :: ierr
integer :: np
integer :: i, j, nr, nc, nb, iter, rowid, colid
logical :: ta, tb
INTEGER :: comm
!
!
real(DP), allocatable :: bblk(:,:), ablk(:,:)
!
#if defined (__MPI)
!
integer :: istatus( MPI_STATUS_SIZE )
!
#endif
!
IF( desc( lambda_node_ ) < 0 ) THEN
!
! processors not interested in this computation return quickly
!
RETURN
!
END IF
IF( desc( la_npr_ ) == 1 ) THEN
!
! quick return if only one processor is used
!
CALL dgemm( TRANSA, TRANSB, n, n, n, alpha, a, lda, b, ldb, beta, c, ldc)
!
RETURN
!
END IF
IF( desc( la_npr_ ) /= desc( la_npc_ ) ) &
CALL errore( ' sqr_mm_cannon ', ' works only with square processor mesh ', 1 )
IF( n < 1 ) &
CALL errore( ' sqr_mm_cannon ', ' n less or equal zero ', 1 )
IF( n < desc( la_npr_ ) ) &
CALL errore( ' sqr_mm_cannon ', ' n less than the mesh size ', 1 )
IF( n < desc( nlax_ ) ) &
CALL errore( ' sqr_mm_cannon ', ' n less than the block size ', 1 )
!
! Retrieve communicator and mesh geometry
!
np = desc( la_npr_ )
comm = desc( la_comm_ )
rowid = desc( la_myr_ )
colid = desc( la_myc_ )
!
! Retrieve the size of the local block
!
nr = desc( nlar_ )
nc = desc( nlac_ )
nb = desc( nlax_ )
!
#if defined (__MPI)
CALL MPI_BARRIER( comm, ierr )
#endif
!
allocate( ablk( nb, nb ) )
DO j = 1, nc
DO i = 1, nr
ablk( i, j ) = a( i, j )
END DO
END DO
!
! Clear memory outside the matrix block
!
DO j = nc+1, nb
DO i = 1, nb
ablk( i, j ) = 0.0_DP
END DO
END DO
DO j = 1, nb
DO i = nr+1, nb
ablk( i, j ) = 0.0_DP
END DO
END DO
!
!
allocate( bblk( nb, nb ) )
DO j = 1, nc
DO i = 1, nr
bblk( i, j ) = b( i, j )
END DO
END DO
!
! Clear memory outside the matrix block
!
DO j = nc+1, nb
DO i = 1, nb
bblk( i, j ) = 0.0_DP
END DO
END DO
DO j = 1, nb
DO i = nr+1, nb
bblk( i, j ) = 0.0_DP
END DO
END DO
!
!
ta = ( TRANSA == 'T' .OR. TRANSA == 't' )
tb = ( TRANSB == 'T' .OR. TRANSB == 't' )
!
! Shift A rowid+1 places to the west
!
IF( ta ) THEN
CALL shift_exch_block( ablk, 'W', 1 )
ELSE
CALL shift_block( ablk, 'W', rowid+1, 1 )
END IF
!
! Shift B colid+1 places to the north
!
IF( tb ) THEN
CALL shift_exch_block( bblk, 'N', np+1 )
ELSE
CALL shift_block( bblk, 'N', colid+1, np+1 )
END IF
!
! Accumulate on C
!
CALL dgemm( TRANSA, TRANSB, nr, nc, nb, alpha, ablk, nb, bblk, nb, beta, c, ldc)
!
DO iter = 2, np
!
! Shift A 1 places to the east
!
CALL shift_block( ablk, 'E', 1, iter )
!
! Shift B 1 places to the south
!
CALL shift_block( bblk, 'S', 1, np+iter )
!
! Accumulate on C
!
CALL dgemm( TRANSA, TRANSB, nr, nc, nb, alpha, ablk, nb, bblk, nb, 1.0_DP, c, ldc)
!
END DO
deallocate( ablk, bblk )
RETURN
CONTAINS
SUBROUTINE shift_block( blk, dir, ln, tag )
!
! Block shift
!
IMPLICIT NONE
REAL(DP) :: blk( :, : )
CHARACTER(LEN=1), INTENT(IN) :: dir ! shift direction
INTEGER, INTENT(IN) :: ln ! shift length
INTEGER, INTENT(IN) :: tag ! communication tag
!
INTEGER :: icdst, irdst, icsrc, irsrc, idest, isour
!
IF( dir == 'W' ) THEN
!
irdst = rowid
irsrc = rowid
icdst = MOD( colid - ln + np, np )
icsrc = MOD( colid + ln + np, np )
!
ELSE IF( dir == 'E' ) THEN
!
irdst = rowid
irsrc = rowid
icdst = MOD( colid + ln + np, np )
icsrc = MOD( colid - ln + np, np )
!
ELSE IF( dir == 'N' ) THEN
irdst = MOD( rowid - ln + np, np )
irsrc = MOD( rowid + ln + np, np )
icdst = colid
icsrc = colid
ELSE IF( dir == 'S' ) THEN
irdst = MOD( rowid + ln + np, np )
irsrc = MOD( rowid - ln + np, np )
icdst = colid
icsrc = colid
ELSE
CALL errore( ' sqr_mm_cannon ', ' unknown shift direction ', 1 )
END IF
!
CALL GRID2D_RANK( 'R', np, np, irdst, icdst, idest )
CALL GRID2D_RANK( 'R', np, np, irsrc, icsrc, isour )
!
#if defined (__MPI)
!
CALL MPI_SENDRECV_REPLACE(blk, nb*nb, MPI_DOUBLE_PRECISION, &
idest, tag, isour, tag, comm, istatus, ierr)
!
#endif
RETURN
END SUBROUTINE shift_block
SUBROUTINE shift_exch_block( blk, dir, tag )
!
! Combined block shift and exchange
! only used for the first step
!
IMPLICIT NONE
REAL(DP) :: blk( :, : )
CHARACTER(LEN=1), INTENT(IN) :: dir
INTEGER, INTENT(IN) :: tag
!
INTEGER :: icdst, irdst, icsrc, irsrc, idest, isour
INTEGER :: icol, irow
!
IF( dir == 'W' ) THEN
!
icol = rowid
irow = colid
!
irdst = irow
icdst = MOD( icol - irow-1 + np, np )
!
irow = rowid
icol = MOD( colid + rowid+1 + np, np )
!
irsrc = icol
icsrc = irow
!
ELSE IF( dir == 'N' ) THEN
!
icol = rowid
irow = colid
!
icdst = icol
irdst = MOD( irow - icol-1 + np, np )
!
irow = MOD( rowid + colid+1 + np, np )
icol = colid
!
irsrc = icol
icsrc = irow
ELSE
CALL errore( ' sqr_mm_cannon ', ' unknown shift_exch direction ', 1 )
END IF
!
CALL GRID2D_RANK( 'R', np, np, irdst, icdst, idest )
CALL GRID2D_RANK( 'R', np, np, irsrc, icsrc, isour )
!
#if defined (__MPI)
!
CALL MPI_SENDRECV_REPLACE(blk, nb*nb, MPI_DOUBLE_PRECISION, &
idest, tag, isour, tag, comm, istatus, ierr)
!
#endif
RETURN
END SUBROUTINE shift_exch_block
END SUBROUTINE sqr_mm_cannon
!=----------------------------------------------------------------------------=!
SUBROUTINE sqr_zmm_cannon( transa, transb, n, alpha, a, lda, b, ldb, beta, c, ldc, desc )
!
! Parallel square matrix multiplication with Cannon's algorithm
!
USE kinds, ONLY : DP
USE descriptors, ONLY : ilar_ , nlar_ , ilac_ , nlac_ , nlax_ , la_npc_ , &
la_comm_ , lambda_node_ , la_npr_ , la_npc_ , la_myr_ , la_myc_
!
IMPLICIT NONE
!
CHARACTER(LEN=1), INTENT(IN) :: transa, transb
INTEGER, INTENT(IN) :: n
COMPLEX(DP), INTENT(IN) :: alpha, beta
INTEGER, INTENT(IN) :: lda, ldb, ldc
COMPLEX(DP) :: a(lda,*), b(ldb,*), c(ldc,*)
INTEGER, INTENT(IN) :: desc(*)
!
! performs one of the matrix-matrix operations
!
! C := ALPHA*OP( A )*OP( B ) + BETA*C,
!
! where op( x ) is one of
!
! OP( X ) = X OR OP( X ) = X',
!
! alpha and beta are scalars, and a, b and c are square matrices
!
#if defined (__MPI)
!
include 'mpif.h'
!
#endif
!
INTEGER :: ierr
INTEGER :: np
INTEGER :: i, j, nr, nc, nb, iter, rowid, colid
LOGICAL :: ta, tb
INTEGER :: comm
!
!
COMPLEX(DP), ALLOCATABLE :: bblk(:,:), ablk(:,:)
COMPLEX(DP) :: zone = ( 1.0_DP, 0.0_DP )
COMPLEX(DP) :: zzero = ( 0.0_DP, 0.0_DP )
!
#if defined (__MPI)
!
integer :: istatus( MPI_STATUS_SIZE )
!
#endif
!
IF( desc( lambda_node_ ) < 0 ) THEN
!
! processors not interested in this computation return quickly
!
RETURN
!
END IF
IF( desc( la_npr_ ) == 1 ) THEN
!
! quick return if only one processor is used
!
CALL zgemm( TRANSA, TRANSB, n, n, n, alpha, a, lda, b, ldb, beta, c, ldc)
!
RETURN
!
END IF
IF( desc( la_npr_ ) /= desc( la_npc_ ) ) &
CALL errore( ' sqr_zmm_cannon ', ' works only with square processor mesh ', 1 )
IF( n < 1 ) &
CALL errore( ' sqr_zmm_cannon ', ' n less or equal zero ', 1 )
IF( n < desc( la_npr_ ) ) &
CALL errore( ' sqr_zmm_cannon ', ' n less than the mesh size ', 1 )
IF( n < desc( nlax_ ) ) &
CALL errore( ' sqr_zmm_cannon ', ' n less than the block size ', 1 )
!
! Retrieve communicator and mesh geometry
!
np = desc( la_npr_ )
comm = desc( la_comm_ )
rowid = desc( la_myr_ )
colid = desc( la_myc_ )
!
! Retrieve the size of the local block
!
nr = desc( nlar_ )
nc = desc( nlac_ )
nb = desc( nlax_ )
!
#if defined (__MPI)
CALL MPI_BARRIER( comm, ierr )
#endif
!
allocate( ablk( nb, nb ) )
DO j = 1, nc
DO i = 1, nr
ablk( i, j ) = a( i, j )
END DO
END DO
!
! Clear memory outside the matrix block
!
DO j = nc+1, nb
DO i = 1, nb
ablk( i, j ) = zzero
END DO
END DO
DO j = 1, nb
DO i = nr+1, nb
ablk( i, j ) = zzero
END DO
END DO
!
!
allocate( bblk( nb, nb ) )
DO j = 1, nc
DO i = 1, nr
bblk( i, j ) = b( i, j )
END DO
END DO
!
! Clear memory outside the matrix block
!
DO j = nc+1, nb
DO i = 1, nb
bblk( i, j ) = zzero
END DO
END DO
DO j = 1, nb
DO i = nr+1, nb
bblk( i, j ) = zzero
END DO
END DO
!
!
ta = ( TRANSA == 'C' .OR. TRANSA == 'c' )
tb = ( TRANSB == 'C' .OR. TRANSB == 'c' )
!
! Shift A rowid+1 places to the west
!
IF( ta ) THEN
CALL shift_exch_block( ablk, 'W', 1 )
ELSE
CALL shift_block( ablk, 'W', rowid+1, 1 )
END IF
!
! Shift B colid+1 places to the north
!
IF( tb ) THEN
CALL shift_exch_block( bblk, 'N', np+1 )
ELSE
CALL shift_block( bblk, 'N', colid+1, np+1 )
END IF
!
! Accumulate on C
!
CALL zgemm( TRANSA, TRANSB, nr, nc, nb, alpha, ablk, nb, bblk, nb, beta, c, ldc)
!
DO iter = 2, np
!
! Shift A 1 places to the east
!
CALL shift_block( ablk, 'E', 1, iter )
!
! Shift B 1 places to the south
!
CALL shift_block( bblk, 'S', 1, np+iter )
!
! Accumulate on C
!
CALL zgemm( TRANSA, TRANSB, nr, nc, nb, alpha, ablk, nb, bblk, nb, zone, c, ldc)
!
END DO
deallocate( ablk, bblk )
RETURN
CONTAINS
SUBROUTINE shift_block( blk, dir, ln, tag )
!
! Block shift
!
IMPLICIT NONE
COMPLEX(DP) :: blk( :, : )
CHARACTER(LEN=1), INTENT(IN) :: dir ! shift direction
INTEGER, INTENT(IN) :: ln ! shift length
INTEGER, INTENT(IN) :: tag ! communication tag
!
INTEGER :: icdst, irdst, icsrc, irsrc, idest, isour
!
IF( dir == 'W' ) THEN
!
irdst = rowid
irsrc = rowid
icdst = MOD( colid - ln + np, np )
icsrc = MOD( colid + ln + np, np )
!
ELSE IF( dir == 'E' ) THEN
!
irdst = rowid
irsrc = rowid
icdst = MOD( colid + ln + np, np )
icsrc = MOD( colid - ln + np, np )
!
ELSE IF( dir == 'N' ) THEN
irdst = MOD( rowid - ln + np, np )
irsrc = MOD( rowid + ln + np, np )
icdst = colid
icsrc = colid
ELSE IF( dir == 'S' ) THEN
irdst = MOD( rowid + ln + np, np )
irsrc = MOD( rowid - ln + np, np )
icdst = colid
icsrc = colid
ELSE
CALL errore( ' sqr_zmm_cannon ', ' unknown shift direction ', 1 )
END IF
!
CALL GRID2D_RANK( 'R', np, np, irdst, icdst, idest )
CALL GRID2D_RANK( 'R', np, np, irsrc, icsrc, isour )
!
#if defined (__MPI)
!
CALL MPI_SENDRECV_REPLACE(blk, nb*nb, MPI_DOUBLE_COMPLEX, &
idest, tag, isour, tag, comm, istatus, ierr)
!
#endif
RETURN
END SUBROUTINE shift_block
!
SUBROUTINE shift_exch_block( blk, dir, tag )
!
! Combined block shift and exchange
! only used for the first step
!
IMPLICIT NONE
COMPLEX(DP) :: blk( :, : )
CHARACTER(LEN=1), INTENT(IN) :: dir
INTEGER, INTENT(IN) :: tag
!
INTEGER :: icdst, irdst, icsrc, irsrc, idest, isour
INTEGER :: icol, irow
!
IF( dir == 'W' ) THEN
!
icol = rowid
irow = colid
!
irdst = irow
icdst = MOD( icol - irow-1 + np, np )
!
irow = rowid
icol = MOD( colid + rowid+1 + np, np )
!
irsrc = icol
icsrc = irow
!
ELSE IF( dir == 'N' ) THEN
!
icol = rowid
irow = colid
!
icdst = icol
irdst = MOD( irow - icol-1 + np, np )
!
irow = MOD( rowid + colid+1 + np, np )
icol = colid
!
irsrc = icol
icsrc = irow
ELSE
CALL errore( ' sqr_zmm_cannon ', ' unknown shift_exch direction ', 1 )
END IF
!
CALL GRID2D_RANK( 'R', np, np, irdst, icdst, idest )
CALL GRID2D_RANK( 'R', np, np, irsrc, icsrc, isour )
!
#if defined (__MPI)
!
CALL MPI_SENDRECV_REPLACE(blk, nb*nb, MPI_DOUBLE_COMPLEX, &
idest, tag, isour, tag, comm, istatus, ierr)
!
#endif
RETURN
END SUBROUTINE shift_exch_block
END SUBROUTINE sqr_zmm_cannon
!
!
!
!
SUBROUTINE sqr_tr_cannon( n, a, lda, b, ldb, desc )
!
! Parallel square matrix transposition with Cannon's algorithm
!
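! Each process exchanges its ( zero-padded ) block with the process sitting
! at the transposed grid coordinates via MPI_SENDRECV_REPLACE, then writes
! the transpose of the received block into its local part of b.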
USE kinds, ONLY : DP
USE descriptors, ONLY : ilar_ , nlar_ , ilac_ , nlac_ , nlax_ , la_npc_ , &
la_comm_ , lambda_node_ , la_npr_ , la_myr_ , la_myc_
!
IMPLICIT NONE
!
INTEGER, INTENT(IN) :: n
INTEGER, INTENT(IN) :: lda, ldb
REAL(DP) :: a(lda,*), b(ldb,*)
INTEGER, INTENT(IN) :: desc(*)
!
#if defined (__MPI)
!
INCLUDE 'mpif.h'
!
#endif
!
INTEGER :: ierr
INTEGER :: np, rowid, colid
INTEGER :: i, j, nr, nc, nb
INTEGER :: comm
!
REAL(DP), ALLOCATABLE :: ablk(:,:)
!
#if defined (__MPI)
!
INTEGER :: istatus( MPI_STATUS_SIZE )
!
#endif
!
IF( desc( lambda_node_ ) < 0 ) THEN
RETURN
END IF
IF( desc( la_npr_ ) == 1 ) THEN
CALL mytranspose( a, lda, b, ldb, n, n )
RETURN
END IF
IF( desc( la_npr_ ) /= desc( la_npc_ ) ) &
CALL errore( ' sqr_tr_cannon ', ' works only with square processor mesh ', 1 )
IF( n < 1 ) &
CALL errore( ' sqr_tr_cannon ', ' n less or equal zero ', 1 )
IF( n < desc( la_npr_ ) ) &
CALL errore( ' sqr_tr_cannon ', ' n less than the mesh size ', 1 )
IF( n < desc( nlax_ ) ) &
CALL errore( ' sqr_tr_cannon ', ' n less than the block size ', 1 )
comm = desc( la_comm_ )
rowid = desc( la_myr_ )
colid = desc( la_myc_ )
np = desc( la_npr_ )
!
! Compute the size of the local block
!
nr = desc( nlar_ )
nc = desc( nlac_ )
nb = desc( nlax_ )
!
allocate( ablk( nb, nb ) )
DO j = 1, nc
DO i = 1, nr
ablk( i, j ) = a( i, j )
END DO
END DO
DO j = nc+1, nb
DO i = 1, nb
ablk( i, j ) = 0.0_DP
END DO
END DO
DO j = 1, nb
DO i = nr+1, nb
ablk( i, j ) = 0.0_DP
END DO
END DO
!
CALL exchange_block( ablk )
!
#if defined (__MPI)
CALL MPI_BARRIER( comm, ierr )
#endif
!
DO j = 1, nr
DO i = 1, nc
b( j, i ) = ablk( i, j )
END DO
END DO
!
deallocate( ablk )
RETURN
CONTAINS
SUBROUTINE exchange_block( blk )
!
! Block exchange ( transpose )
!
IMPLICIT NONE
REAL(DP) :: blk( :, : )
!
INTEGER :: icdst, irdst, icsrc, irsrc, idest, isour
!
irdst = colid
icdst = rowid
irsrc = colid
icsrc = rowid
!
CALL GRID2D_RANK( 'R', np, np, irdst, icdst, idest )
CALL GRID2D_RANK( 'R', np, np, irsrc, icsrc, isour )
!
#if defined (__MPI)
!
CALL MPI_SENDRECV_REPLACE(blk, nb*nb, MPI_DOUBLE_PRECISION, &
idest, np+np+1, isour, np+np+1, comm, istatus, ierr)
!
#endif
RETURN
END SUBROUTINE
END SUBROUTINE
!
!
!
SUBROUTINE cyc2blk_redist( n, a, lda, b, ldb, desc )
!
! Parallel square matrix redistribution.
! A (input) is cyclically distributed by rows across processors
! B (output) is distributed by blocks across the 2D processor grid
!
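! In the cyclic layout, global row ii lives on process MOD( ii - 1, nproc )
! at local row ( ii - 1 ) / nproc + 1; e.g. with nproc = 4, process 0 owns
! global rows 1, 5, 9, ... For every destination process ip, each process
! packs the rows it owns that fall inside ip's block, an mpi_gather rooted
! on ip delivers them, and ip unpacks the pieces into its block b.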
USE kinds, ONLY : DP
USE descriptors, ONLY : ilar_ , nlar_ , ilac_ , nlac_ , nlax_ , lambda_node_ , la_npr_ , &
descla_siz_ , la_npc_ , la_n_ , la_me_ , la_comm_
!
IMPLICIT NONE
!
INTEGER, INTENT(IN) :: n
INTEGER, INTENT(IN) :: lda, ldb
REAL(DP) :: a(lda,*), b(ldb,*)
INTEGER :: desc(*)
!
#if defined (__MPI)
!
include 'mpif.h'
!
#endif
!
integer :: ierr, itag
integer :: np, ip, me, nproc, comm_a
integer :: ip_ir, ip_ic, ip_nr, ip_nc, il, nbuf, ip_irl
integer :: i, ii, j, jj, nr, nc, nb, nrl, irl, ir, ic
!
real(DP), allocatable :: rcvbuf(:,:,:)
real(DP), allocatable :: sndbuf(:,:)
integer, allocatable :: ip_desc(:,:)
!
character(len=256) :: msg
!
#if defined (__MPI)
IF( desc( lambda_node_ ) < 0 ) THEN
RETURN
END IF
np = desc( la_npr_ ) ! dimension of the processor mesh
nb = desc( nlax_ ) ! leading dimension of the local matrix block
me = desc( la_me_ ) ! my processor id (starting from 0)
comm_a = desc( la_comm_ )
nproc = desc( la_npr_ ) * desc( la_npc_ )
IF( np /= desc( la_npc_ ) ) &
CALL errore( ' cyc2blk_redist ', ' works only with square processor mesh ', 1 )
IF( n < 1 ) &
CALL errore( ' cyc2blk_redist ', ' n less or equal zero ', 1 )
IF( desc( la_n_ ) < nproc ) &
CALL errore( ' cyc2blk_redist ', ' n less than the number of procs ', 1 )
ALLOCATE( ip_desc( descla_siz_ , nproc ) )
CALL mpi_allgather( desc, descla_siz_ , mpi_integer, ip_desc, descla_siz_ , mpi_integer, comm_a, ierr )
!
nbuf = (nb/nproc+2) * nb
!
ALLOCATE( sndbuf( nb/nproc+2, nb ) )
ALLOCATE( rcvbuf( nb/nproc+2, nb, nproc ) )
DO ip = 0, nproc - 1
!
IF( ip_desc( lambda_node_ , ip + 1 ) > 0 ) THEN
ip_nr = ip_desc( nlar_ , ip + 1)
ip_nc = ip_desc( nlac_ , ip + 1)
ip_ir = ip_desc( ilar_ , ip + 1)
ip_ic = ip_desc( ilac_ , ip + 1)
!
DO j = 1, ip_nc
jj = j + ip_ic - 1
il = 1
DO i = 1, ip_nr
ii = i + ip_ir - 1
IF( MOD( ii - 1, nproc ) == me ) THEN
sndbuf( il, j ) = a( ( ii - 1 )/nproc + 1, jj )
il = il + 1
END IF
END DO
END DO
END IF
CALL mpi_gather( sndbuf, nbuf, mpi_double_precision, &
rcvbuf, nbuf, mpi_double_precision, ip, comm_a, ierr )
END DO
!
nr = desc( nlar_ )
nc = desc( nlac_ )
ir = desc( ilar_ )
ic = desc( ilac_ )
!
DO ip = 0, nproc - 1
DO j = 1, nc
il = 1
DO i = 1, nr
ii = i + ir - 1
IF( MOD( ii - 1, nproc ) == ip ) THEN
b( i, j ) = rcvbuf( il, j, ip+1 )
il = il + 1
END IF
END DO
END DO
END DO
!
!
DEALLOCATE( ip_desc )
DEALLOCATE( rcvbuf )
DEALLOCATE( sndbuf )
#else
b( 1:n, 1:n ) = a( 1:n, 1:n )
#endif
RETURN
END SUBROUTINE cyc2blk_redist
SUBROUTINE cyc2blk_zredist( n, a, lda, b, ldb, desc )
!
! Parallel square matrix redistribution.
! A (input) is cyclically distributed by rows across processors
! B (output) is distributed by blocks across the 2D processor grid
!
USE kinds, ONLY : DP
USE descriptors, ONLY : ilar_ , nlar_ , ilac_ , nlac_ , nlax_ , lambda_node_ , la_npr_ , &
descla_siz_ , la_npc_ , la_n_ , la_me_ , la_comm_
!
IMPLICIT NONE
!
INTEGER, INTENT(IN) :: n
INTEGER, INTENT(IN) :: lda, ldb
COMPLEX(DP) :: a(lda,*), b(ldb,*)
INTEGER :: desc(*)
!
#if defined (__MPI)
!
include 'mpif.h'
!
#endif
!
integer :: ierr, itag
integer :: np, ip, me, nproc, comm_a
integer :: ip_ir, ip_ic, ip_nr, ip_nc, il, nbuf, ip_irl
integer :: i, ii, j, jj, nr, nc, nb, nrl, irl, ir, ic
!
COMPLEX(DP), allocatable :: rcvbuf(:,:,:)
COMPLEX(DP), allocatable :: sndbuf(:,:)
integer, allocatable :: ip_desc(:,:)
!
character(len=256) :: msg
!
#if defined (__MPI)
IF( desc( lambda_node_ ) < 0 ) THEN
RETURN
END IF
np = desc( la_npr_ ) ! dimension of the processor mesh
nb = desc( nlax_ ) ! leading dimension of the local matrix block
me = desc( la_me_ ) ! my processor id (starting from 0)
comm_a = desc( la_comm_ )
nproc = desc( la_npr_ ) * desc( la_npc_ )
IF( np /= desc( la_npc_ ) ) &
CALL errore( ' cyc2blk_zredist ', ' works only with square processor mesh ', 1 )
IF( n < 1 ) &
CALL errore( ' cyc2blk_zredist ', ' n less or equal zero ', 1 )
IF( desc( la_n_ ) < nproc ) &
CALL errore( ' cyc2blk_zredist ', ' n less than the number of procs ', 1 )
ALLOCATE( ip_desc( descla_siz_ , nproc ) )
CALL mpi_allgather( desc, descla_siz_ , mpi_integer, ip_desc, descla_siz_ , mpi_integer, comm_a, ierr )
!
nbuf = (nb/nproc+2) * nb
!
ALLOCATE( sndbuf( nb/nproc+2, nb ) )
ALLOCATE( rcvbuf( nb/nproc+2, nb, nproc ) )
DO ip = 0, nproc - 1
!
IF( ip_desc( lambda_node_ , ip + 1 ) > 0 ) THEN
ip_nr = ip_desc( nlar_ , ip + 1)
ip_nc = ip_desc( nlac_ , ip + 1)
ip_ir = ip_desc( ilar_ , ip + 1)
ip_ic = ip_desc( ilac_ , ip + 1)
!
DO j = 1, ip_nc
jj = j + ip_ic - 1
il = 1
DO i = 1, ip_nr
ii = i + ip_ir - 1
IF( MOD( ii - 1, nproc ) == me ) THEN
sndbuf( il, j ) = a( ( ii - 1 )/nproc + 1, jj )
il = il + 1
END IF
END DO
END DO
END IF
CALL mpi_gather( sndbuf, nbuf, mpi_double_complex, &
rcvbuf, nbuf, mpi_double_complex, ip, comm_a, ierr )
END DO
!
nr = desc( nlar_ )
nc = desc( nlac_ )
ir = desc( ilar_ )
ic = desc( ilac_ )
!
DO ip = 0, nproc - 1
DO j = 1, nc
il = 1
DO i = 1, nr
ii = i + ir - 1
IF( MOD( ii - 1, nproc ) == ip ) THEN
b( i, j ) = rcvbuf( il, j, ip+1 )
il = il + 1
END IF
END DO
END DO
END DO
!
!
DEALLOCATE( ip_desc )
DEALLOCATE( rcvbuf )
DEALLOCATE( sndbuf )
#else
b( 1:n, 1:n ) = a( 1:n, 1:n )
#endif
RETURN
END SUBROUTINE cyc2blk_zredist
SUBROUTINE blk2cyc_redist( n, a, lda, b, ldb, desc )
!
! Parallel square matrix redistribution.
! A (output) is cyclically distributed by rows across processors
! B (input) is distributed by blocks across the 2D processor grid
!
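! Inverse of cyc2blk_redist: each process scatters the rows of its block b
! back to their cyclic owners, again through one mpi_gather per destination
! process, and unpacks what it receives into the row-cyclic matrix a.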
USE kinds, ONLY : DP
USE descriptors, ONLY : ilar_ , nlar_ , ilac_ , nlac_ , nlax_ , lambda_node_ , la_npr_ , &
descla_siz_ , la_npc_ , la_n_ , la_me_ , la_comm_
!
IMPLICIT NONE
!
INTEGER, INTENT(IN) :: n
INTEGER, INTENT(IN) :: lda, ldb
REAL(DP) :: a(lda,*), b(ldb,*)
INTEGER :: desc(*)
!
#if defined (__MPI)
!
include 'mpif.h'
!
#endif
!
integer :: ierr, itag
integer :: np, ip, me, comm_a, nproc
integer :: ip_ir, ip_ic, ip_nr, ip_nc, il, nbuf, ip_irl
integer :: i, ii, j, jj, nr, nc, nb, nrl, irl, ir, ic
!
real(DP), allocatable :: rcvbuf(:,:,:)
real(DP), allocatable :: sndbuf(:,:)
integer, allocatable :: ip_desc(:,:)
!
character(len=256) :: msg
!
#if defined (__MPI)
IF( desc( lambda_node_ ) < 0 ) THEN
RETURN
END IF
np = desc( la_npr_ ) ! dimension of the processor mesh
nb = desc( nlax_ ) ! leading dimension of the local matrix block
me = desc( la_me_ ) ! my processor id (starting from 0)
comm_a = desc( la_comm_ )
nproc = desc( la_npr_ ) * desc( la_npc_ )
IF( np /= desc( la_npc_ ) ) &
CALL errore( ' blk2cyc_redist ', ' works only with square processor mesh ', 1 )
IF( n < 1 ) &
CALL errore( ' blk2cyc_redist ', ' n less or equal zero ', 1 )
IF( desc( la_n_ ) < nproc ) &
CALL errore( ' blk2cyc_redist ', ' n less than the number of procs ', 1 )
ALLOCATE( ip_desc( descla_siz_ , nproc ) )
CALL mpi_allgather( desc, descla_siz_ , mpi_integer, ip_desc, descla_siz_ , mpi_integer, comm_a, ierr )
!
nbuf = (nb/nproc+2) * nb
!
ALLOCATE( sndbuf( nb/nproc+2, nb ) )
ALLOCATE( rcvbuf( nb/nproc+2, nb, nproc ) )
!
nr = desc( nlar_ )
nc = desc( nlac_ )
ir = desc( ilar_ )
ic = desc( ilac_ )
!
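   ! Pack, for each destination proc "ip", the rows of my local block whose
   ! global index belongs to ip in the row-cyclic distribution, and gather
   ! them on ip; the loop further below unpacks the received blocks into a.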
DO ip = 0, nproc - 1
DO j = 1, nc
il = 1
DO i = 1, nr
ii = i + ir - 1
IF( MOD( ii - 1, nproc ) == ip ) THEN
sndbuf( il, j ) = b( i, j )
il = il + 1
END IF
END DO
END DO
CALL mpi_gather( sndbuf, nbuf, mpi_double_precision, &
rcvbuf, nbuf, mpi_double_precision, ip, comm_a, ierr )
END DO
!
DO ip = 0, nproc - 1
!
IF( ip_desc( lambda_node_ , ip + 1 ) > 0 ) THEN
ip_nr = ip_desc( nlar_ , ip + 1)
ip_nc = ip_desc( nlac_ , ip + 1)
ip_ir = ip_desc( ilar_ , ip + 1)
ip_ic = ip_desc( ilac_ , ip + 1)
!
DO j = 1, ip_nc
jj = j + ip_ic - 1
il = 1
DO i = 1, ip_nr
ii = i + ip_ir - 1
IF( MOD( ii - 1, nproc ) == me ) THEN
a( ( ii - 1 )/nproc + 1, jj ) = rcvbuf( il, j, ip+1 )
il = il + 1
END IF
END DO
END DO
END IF
END DO
!
DEALLOCATE( ip_desc )
DEALLOCATE( rcvbuf )
DEALLOCATE( sndbuf )
#else
a( 1:n, 1:n ) = b( 1:n, 1:n )
#endif
RETURN
END SUBROUTINE blk2cyc_redist
SUBROUTINE blk2cyc_zredist( n, a, lda, b, ldb, desc )
!
  !  Parallel square matrix redistribution.
  !  A (output) is cyclically distributed by rows across the processors
  !  B (input) is distributed by blocks across the 2D processor grid
!
USE kinds, ONLY : DP
USE descriptors, ONLY : ilar_ , nlar_ , ilac_ , nlac_ , nlax_ , lambda_node_ , la_npr_ , &
descla_siz_ , la_npc_ , la_n_ , la_me_ , la_comm_
!
IMPLICIT NONE
!
INTEGER, INTENT(IN) :: n
INTEGER, INTENT(IN) :: lda, ldb
COMPLEX(DP) :: a(lda,*), b(ldb,*)
INTEGER :: desc(*)
!
#if defined (__MPI)
!
include 'mpif.h'
!
#endif
!
integer :: ierr, itag
integer :: np, ip, me, comm_a, nproc
integer :: ip_ir, ip_ic, ip_nr, ip_nc, il, nbuf, ip_irl
integer :: i, ii, j, jj, nr, nc, nb, nrl, irl, ir, ic
!
COMPLEX(DP), allocatable :: rcvbuf(:,:,:)
COMPLEX(DP), allocatable :: sndbuf(:,:)
integer, allocatable :: ip_desc(:,:)
!
character(len=256) :: msg
!
#if defined (__MPI)
IF( desc( lambda_node_ ) < 0 ) THEN
RETURN
END IF
np = desc( la_npr_ ) ! dimension of the processor mesh
nb = desc( nlax_ ) ! leading dimension of the local matrix block
me = desc( la_me_ ) ! my processor id (starting from 0)
comm_a = desc( la_comm_ )
nproc = desc( la_npr_ ) * desc( la_npc_ )
IF( np /= desc( la_npc_ ) ) &
CALL errore( ' blk2cyc_zredist ', ' works only with square processor mesh ', 1 )
   IF( n < 1 ) &
      CALL errore( ' blk2cyc_zredist ', ' n is less than or equal to zero ', 1 )
   IF( desc( la_n_ ) < nproc ) &
      CALL errore( ' blk2cyc_zredist ', ' n is less than the number of procs ', 1 )
ALLOCATE( ip_desc( descla_siz_ , nproc ) )
CALL mpi_allgather( desc, descla_siz_ , mpi_integer, ip_desc, descla_siz_ , mpi_integer, comm_a, ierr )
!
nbuf = (nb/nproc+2) * nb
!
ALLOCATE( sndbuf( nb/nproc+2, nb ) )
ALLOCATE( rcvbuf( nb/nproc+2, nb, nproc ) )
!
nr = desc( nlar_ )
nc = desc( nlac_ )
ir = desc( ilar_ )
ic = desc( ilac_ )
!
DO ip = 0, nproc - 1
DO j = 1, nc
il = 1
DO i = 1, nr
ii = i + ir - 1
IF( MOD( ii - 1, nproc ) == ip ) THEN
sndbuf( il, j ) = b( i, j )
il = il + 1
END IF
END DO
END DO
CALL mpi_gather( sndbuf, nbuf, mpi_double_complex, &
rcvbuf, nbuf, mpi_double_complex, ip, comm_a, ierr )
END DO
!
DO ip = 0, nproc - 1
!
IF( ip_desc( lambda_node_ , ip + 1 ) > 0 ) THEN
ip_nr = ip_desc( nlar_ , ip + 1)
ip_nc = ip_desc( nlac_ , ip + 1)
ip_ir = ip_desc( ilar_ , ip + 1)
ip_ic = ip_desc( ilac_ , ip + 1)
!
DO j = 1, ip_nc
jj = j + ip_ic - 1
il = 1
DO i = 1, ip_nr
ii = i + ip_ir - 1
IF( MOD( ii - 1, nproc ) == me ) THEN
a( ( ii - 1 )/nproc + 1, jj ) = rcvbuf( il, j, ip+1 )
il = il + 1
END IF
END DO
END DO
END IF
END DO
!
DEALLOCATE( ip_desc )
DEALLOCATE( rcvbuf )
DEALLOCATE( sndbuf )
#else
a( 1:n, 1:n ) = b( 1:n, 1:n )
#endif
RETURN
END SUBROUTINE blk2cyc_zredist
!
!
!
! Double Complex and Double Precision Cholesky Factorization of
!  a Hermitian/symmetric block-distributed matrix
!
!
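!
!  Outline of the algorithm implemented below (blocked Cholesky on a square
!  np x np processor grid, one block column per step): at step jb the diagonal
!  block is updated with the contributions of the blocks to its left
!  (ZHERK/DSYRK followed by a reduce along the row), factorized locally
!  (ZPOTF2/DPOTRF), then the blocks below the diagonal are updated
!  (ZGEMM/DGEMM plus a reduce) and solved with ZTRSM/DTRSM against the
!  broadcast diagonal factor.
!
!  A minimal usage sketch (illustrative only; "sll" is assumed to hold the
!  local nlax_ x nlax_ block of an n x n Hermitian positive-definite matrix
!  distributed as described by "desc"):
!
!     CALL pzpotf( sll, desc( nlax_ ), n, desc )
!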
SUBROUTINE pzpotf( sll, ldx, n, desc )
!
use descriptors, ONLY: descla_local_dims, descla_siz_ , la_myr_ , la_myc_ , la_me_ , nlax_ ,&
nlar_ , nlac_ , ilar_ , ilac_ , la_comm_ , la_nx_ , la_npr_ , la_npc_
use parallel_include
use kinds
!
implicit none
!
integer :: n, ldx
integer :: desc( descla_siz_ )
real(DP) :: one, zero
complex(DP) :: sll( ldx, ldx ), cone, czero
integer :: myrow, mycol, ierr
integer :: jb, info, ib, kb
integer :: jnr, jir, jic, jnc
integer :: inr, iir, iic, inc
integer :: knr, kir, kic, knc
integer :: nr, nc
integer :: rcomm, ccomm, color, key, myid, np
complex(DP), allocatable :: ssnd( :, : ), srcv( :, : )
one = 1.0_DP
cone = 1.0_DP
zero = 0.0_DP
czero = 0.0_DP
#if defined __MPI
myrow = desc( la_myr_ )
mycol = desc( la_myc_ )
myid = desc( la_me_ )
np = desc( la_npr_ )
IF( desc( la_npr_ ) /= desc( la_npc_ ) ) THEN
        CALL errore( ' pzpotf ', ' only square grids are allowed ', 1 )
END IF
IF( ldx /= desc( nlax_ ) ) THEN
CALL errore( ' pzpotf ', ' wrong leading dimension ldx ', ldx )
END IF
nr = desc( nlar_ )
nc = desc( nlac_ )
ALLOCATE( ssnd( ldx, ldx ) )
ALLOCATE( srcv( ldx, ldx ) )
DO jb = 1, np
!
! Update and factorize the current diagonal block and test
! for non-positive-definiteness.
!
CALL descla_local_dims( jir, jnr, n, desc( la_nx_ ), np, jb-1 )
!
! since we loop on diagonal blocks/procs we have jnc == jnr
!
jnc = jnr
!
      ! prepare row and column communicators
IF( ( myrow >= ( jb-1 ) ) .AND. ( mycol <= ( jb-1 ) ) ) THEN
color = mycol
key = myrow
ELSE
color = np
key = myid
END IF
!
CALL mpi_comm_split( desc( la_comm_ ) , color, key, ccomm, ierr )
!
IF( myrow >= jb-1 .and. mycol <= jb-1 ) THEN
color = myrow
key = mycol
ELSE
color = np
key = myid
END IF
!
CALL mpi_comm_split( desc( la_comm_ ), color, key, rcomm, ierr )
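      !
      ! ccomm groups, within each column mycol <= jb-1, the procs with
      ! myrow >= jb-1 (used below to broadcast blocks down the column, with
      ! row jb-1 as root); rcomm groups, within each row myrow >= jb-1, the
      ! procs with mycol <= jb-1 (used below to reduce the partial updates
      ! onto the diagonal column jb-1).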
!
      ! here every process can work independently; afterwards we need a reduce.
!
IF( jb > 1 ) THEN
!
DO ib = 1, jb - 1
IF( ( myrow == ( jb - 1 ) ) .AND. ( mycol == ( ib - 1 ) ) ) THEN
!
               ! remember: matrix ssnd is nr*nr, and procs on the diagonal have nr == nc
!
CALL ZHERK( 'L', 'N', nr, nc, -ONE, sll, ldx, zero, ssnd, ldx )
!
END IF
END DO
IF( ( myrow == ( jb - 1 ) ) .AND. ( mycol == ( jb - 1 ) ) ) THEN
ssnd = sll
END IF
!
IF( ( myrow == ( jb - 1 ) ) .AND. ( mycol <= ( jb - 1 ) ) ) THEN
!
! accumulate on the diagonal block/proc
!
CALL MPI_REDUCE( ssnd, sll, ldx*ldx, MPI_DOUBLE_COMPLEX, MPI_SUM, jb-1, rcomm, ierr )
!
END IF
!
END IF
!
      ! Only proc ( jb-1, jb-1 ) performs this step
!
info = 0
!
IF( ( myrow == ( jb - 1 ) ) .AND. ( mycol == ( jb - 1 ) ) ) THEN
CALL ZPOTF2( 'L', jnr, sll, ldx, INFO )
IF( info /= 0 ) &
            CALL errore( " pzpotf ", " problems computing the Cholesky decomposition ", ABS( info ) )
END IF
!
IF( ( jb > 1 ) .AND. ( jb < np ) ) THEN
!
! Compute the current block column.
!
         ! processors ( 1 : jb - 1, jb ) should bcast their blocks
         ! along the column to processors ( 1 : jb - 1, jb + 1 : nb )
!
IF( ( myrow == ( jb - 1 ) ) .AND. ( mycol < ( jb - 1 ) ) ) THEN
CALL mpi_bcast( sll, ldx*ldx, MPI_DOUBLE_COMPLEX, 0, ccomm, ierr )
ELSE IF( ( myrow > ( jb - 1 ) ) .AND. ( mycol < ( jb - 1 ) ) ) THEN
CALL mpi_bcast( srcv, ldx*ldx, MPI_DOUBLE_COMPLEX, 0, ccomm, ierr )
END IF
!
DO ib = jb + 1, np
CALL descla_local_dims( iir, inr, n, desc( la_nx_ ), np, ib-1 )
DO kb = 1, jb - 1
CALL descla_local_dims( kic, knc, n, desc( la_nx_ ), np, kb-1 )
IF( ( myrow == ( ib - 1 ) ) .AND. ( mycol == ( kb - 1 ) ) ) THEN
CALL ZGEMM( 'N', 'C', inr, jnr, knc, -CONE, sll, ldx, srcv, ldx, czero, ssnd, ldx )
END IF
END DO
IF( ( myrow == ( ib - 1 ) ) .AND. ( mycol == ( jb - 1 ) ) ) THEN
ssnd = sll
END IF
END DO
!
         ! processors ( jb, jb + 1 : nb ) should collect blocks along the row,
! from processors ( 1 : jb - 1, jb + 1 : nb )
!
DO kb = jb + 1, np
IF( ( myrow == ( kb - 1 ) ) .AND. ( mycol <= ( jb - 1 ) ) ) THEN
IF( ( jb == 1 ) ) THEN
IF( mycol == ( jb - 1 ) ) THEN
sll = ssnd
END IF
ELSE
CALL MPI_REDUCE( ssnd, sll, ldx*ldx, MPI_DOUBLE_COMPLEX, MPI_SUM, jb-1, rcomm, ierr )
END IF
END IF
END DO
!
END IF
!
IF( jb < np ) THEN
!
         ! processor "jb,jb" should broadcast its block to procs ( jb+1 : nb, jb )
!
IF( ( myrow == ( jb - 1 ) ) .AND. ( mycol == ( jb - 1 ) ) ) THEN
CALL mpi_bcast( sll, ldx*ldx, MPI_DOUBLE_COMPLEX, 0, ccomm, ierr )
ELSE IF( ( myrow > ( jb - 1 ) ) .AND. ( mycol == ( jb - 1 ) ) ) THEN
CALL mpi_bcast( srcv, ldx*ldx, MPI_DOUBLE_COMPLEX, 0, ccomm, ierr )
END IF
!
DO ib = jb + 1, np
IF( ( myrow == ( ib - 1 ) ) .AND. ( mycol == ( jb - 1 ) ) ) THEN
CALL ZTRSM( 'R', 'L', 'C', 'N', nr, nc, CONE, srcv, ldx, sll, ldx )
END IF
END DO
!
END IF
!
CALL mpi_comm_free( rcomm, ierr )
CALL mpi_comm_free( ccomm, ierr )
!
END DO
DEALLOCATE( srcv, ssnd )
#else
CALL ZPOTRF( 'L', n, sll, ldx, info )
IF( info /= 0 ) &
        CALL errore( " pzpotf ", " problems computing the Cholesky decomposition ", ABS( info ) )
#endif
return
END SUBROUTINE pzpotf
! now the Double Precision subroutine
SUBROUTINE pdpotf( sll, ldx, n, desc )
!
use descriptors, ONLY: descla_local_dims, descla_siz_ , la_myr_ , la_myc_ , la_me_ , nlax_ , &
nlar_ , nlac_ , ilar_ , ilac_ , la_comm_ , la_nx_ , la_npr_ , la_npc_
use parallel_include
use kinds
!
implicit none
!
integer :: n, ldx
integer :: desc( descla_siz_ )
REAL(DP) :: one, zero
REAL(DP) :: sll( ldx, ldx )
integer :: myrow, mycol, ierr
integer :: jb, info, ib, kb
integer :: jnr, jir, jic, jnc
integer :: inr, iir, iic, inc
integer :: knr, kir, kic, knc
integer :: nr, nc
integer :: rcomm, ccomm, color, key, myid, np
REAL(DP), ALLOCATABLE :: ssnd( :, : ), srcv( :, : )
one = 1.0_DP
zero = 0.0_DP
#if defined __MPI
myrow = desc( la_myr_ )
mycol = desc( la_myc_ )
myid = desc( la_me_ )
np = desc( la_npr_ )
IF( desc( la_npr_ ) /= desc( la_npc_ ) ) THEN
        CALL errore( ' pdpotf ', ' only square grids are allowed ', 1 )
END IF
IF( ldx /= desc( nlax_ ) ) THEN
CALL errore( ' pdpotf ', ' wrong leading dimension ldx ', ldx )
END IF
nr = desc( nlar_ )
nc = desc( nlac_ )
ALLOCATE( ssnd( ldx, ldx ) )
ALLOCATE( srcv( ldx, ldx ) )
DO jb = 1, np
!
! Update and factorize the current diagonal block and test
! for non-positive-definiteness.
!
CALL descla_local_dims( jir, jnr, n, desc( la_nx_ ), np, jb-1 )
!
! since we loop on diagonal blocks/procs we have jnc == jnr
!
jnc = jnr
!
      ! prepare row and column communicators
IF( ( myrow >= ( jb-1 ) ) .AND. ( mycol <= ( jb-1 ) ) ) THEN
color = mycol
key = myrow
ELSE
color = np
key = myid
END IF
!
CALL mpi_comm_split( desc( la_comm_ ) , color, key, ccomm, ierr )
!
IF( myrow >= jb-1 .and. mycol <= jb-1 ) THEN
color = myrow
key = mycol
ELSE
color = np
key = myid
END IF
!
CALL mpi_comm_split( desc( la_comm_ ), color, key, rcomm, ierr )
!
      ! here every process can work independently; afterwards we need a reduce.
!
IF( jb > 1 ) THEN
!
DO ib = 1, jb - 1
IF( ( myrow == ( jb - 1 ) ) .AND. ( mycol == ( ib - 1 ) ) ) THEN
!
               ! remember: matrix ssnd is nr*nr, and procs on the diagonal have nr == nc
!
CALL DSYRK( 'L', 'N', nr, nc, -ONE, sll, ldx, zero, ssnd, ldx )
!
END IF
END DO
IF( ( myrow == ( jb - 1 ) ) .AND. ( mycol == ( jb - 1 ) ) ) THEN
ssnd = sll
END IF
!
IF( ( myrow == ( jb - 1 ) ) .AND. ( mycol <= ( jb - 1 ) ) ) THEN
!
! accumulate on the diagonal block/proc
!
CALL MPI_REDUCE( ssnd, sll, ldx*ldx, MPI_DOUBLE_PRECISION, MPI_SUM, jb-1, rcomm, ierr )
!
END IF
!
END IF
!
      ! Only proc ( jb-1, jb-1 ) performs this step
!
info = 0
!
IF( ( myrow == ( jb - 1 ) ) .AND. ( mycol == ( jb - 1 ) ) ) THEN
CALL DPOTRF( 'L', jnr, sll, ldx, INFO )
IF( info /= 0 ) &
            CALL errore( " pdpotf ", " problems computing the Cholesky decomposition ", ABS( info ) )
END IF
!
IF( ( jb > 1 ) .AND. ( jb < np ) ) THEN
!
! Compute the current block column.
!
         ! processors ( 1 : jb - 1, jb ) should bcast their blocks
         ! along the column to processors ( 1 : jb - 1, jb + 1 : nb )
!
IF( ( myrow == ( jb - 1 ) ) .AND. ( mycol < ( jb - 1 ) ) ) THEN
CALL mpi_bcast( sll, ldx*ldx, MPI_DOUBLE_PRECISION, 0, ccomm, ierr )
ELSE IF( ( myrow > ( jb - 1 ) ) .AND. ( mycol < ( jb - 1 ) ) ) THEN
CALL mpi_bcast( srcv, ldx*ldx, MPI_DOUBLE_PRECISION, 0, ccomm, ierr )
END IF
!
DO ib = jb + 1, np
CALL descla_local_dims( iir, inr, n, desc( la_nx_ ), np, ib-1 )
DO kb = 1, jb - 1
CALL descla_local_dims( kic, knc, n, desc( la_nx_ ), np, kb-1 )
IF( ( myrow == ( ib - 1 ) ) .AND. ( mycol == ( kb - 1 ) ) ) THEN
CALL DGEMM( 'N', 'T', inr, jnr, knc, -ONE, sll, ldx, srcv, ldx, zero, ssnd, ldx )
END IF
END DO
IF( ( myrow == ( ib - 1 ) ) .AND. ( mycol == ( jb - 1 ) ) ) THEN
ssnd = sll
END IF
END DO
!
         ! processors ( jb, jb + 1 : nb ) should collect blocks along the row,
! from processors ( 1 : jb - 1, jb + 1 : nb )
!
DO kb = jb + 1, np
IF( ( myrow == ( kb - 1 ) ) .AND. ( mycol <= ( jb - 1 ) ) ) THEN
IF( ( jb == 1 ) ) THEN
IF( mycol == ( jb - 1 ) ) THEN
sll = ssnd
END IF
ELSE
CALL MPI_REDUCE( ssnd, sll, ldx*ldx, MPI_DOUBLE_PRECISION, MPI_SUM, jb-1, rcomm, ierr )
END IF
END IF
END DO
!
END IF
!
IF( jb < np ) THEN
!
         ! processor "jb,jb" should broadcast its block to procs ( jb+1 : nb, jb )
!
IF( ( myrow == ( jb - 1 ) ) .AND. ( mycol == ( jb - 1 ) ) ) THEN
CALL mpi_bcast( sll, ldx*ldx, MPI_DOUBLE_PRECISION, 0, ccomm, ierr )
ELSE IF( ( myrow > ( jb - 1 ) ) .AND. ( mycol == ( jb - 1 ) ) ) THEN
CALL mpi_bcast( srcv, ldx*ldx, MPI_DOUBLE_PRECISION, 0, ccomm, ierr )
END IF
!
DO ib = jb + 1, np
IF( ( myrow == ( ib - 1 ) ) .AND. ( mycol == ( jb - 1 ) ) ) THEN
CALL DTRSM( 'R', 'L', 'T', 'N', nr, nc, ONE, srcv, ldx, sll, ldx )
END IF
END DO
!
END IF
!
CALL mpi_comm_free( rcomm, ierr )
CALL mpi_comm_free( ccomm, ierr )
!
END DO
DEALLOCATE( srcv, ssnd )
#else
CALL DPOTRF( 'L', n, sll, ldx, info )
IF( info /= 0 ) &
        CALL errore( " pdpotf ", " problems computing the Cholesky decomposition ", ABS( info ) )
#endif
return
END SUBROUTINE pdpotf
!
!
!
!
SUBROUTINE pztrtri ( sll, ldx, n, desc )
! pztrtri computes the parallel inversion of a lower triangular matrix
   ! distributed among the processes using a 2-D block partitioning.
   ! The algorithm is based on the scheme below and is applied
   ! recursively to each column block C2 below the diagonal.
!
! |-------|-------| |--------------------|--------------------|
! | A1 | 0 | | C1 = trtri(A1) | 0 |
! A = |-------|-------| C = |--------------------|--------------------|
! | A2 | A3 | | C2 = -C3 * A2 * C1 | C3 = trtri(A3) |
! |-------|-------| |--------------------|--------------------|
!
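   ! The expression for C2 follows from A * C = I: the (2,1) block gives
   ! A2 * C1 + A3 * C2 = 0, hence C2 = -inv(A3) * A2 * C1 = -C3 * A2 * C1.
   !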
   ! The recursive multiplication step (C2 = -C3 * A2 * C1) is based on Cannon's algorithm
   ! for parallel matrix multiplication and is done with BLAS (zgemm)
!
!
! Arguments
! ============
!
! sll = local block of data
! ldx = leading dimension of one block
   !  n    = size of the global array distributed among the blocks
! desc = descriptor of the matrix distribution
!
!
!
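   !  A minimal usage sketch (illustrative only; "sll" is assumed to hold the
   !  local nlax_ x nlax_ block of a lower triangular n x n matrix already
   !  distributed as described by "desc"):
   !
   !     CALL pztrtri( sll, desc( nlax_ ), n, desc )
   !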
USE kinds
USE parallel_include
USE descriptors, ONLY: descla_local_dims, descla_siz_ , la_myr_ , la_myc_ , la_me_ , nlax_ , &
nlar_ , nlac_ , ilar_ , ilac_ , la_comm_ , la_nx_ , la_npr_ , la_npc_
IMPLICIT NONE
INTEGER, INTENT( IN ) :: n, ldx
INTEGER, INTENT( IN ) :: desc( descla_siz_ )
COMPLEX(DP), INTENT( INOUT ) :: sll( ldx, ldx )
COMPLEX(DP), PARAMETER :: ONE = (1.0_DP, 0.0_DP)
COMPLEX(DP), PARAMETER :: ZERO = (0.0_DP, 0.0_DP)
#if defined __MPI
INTEGER :: status(MPI_STATUS_SIZE)
#endif
INTEGER :: req(2), ierr, col_comm
INTEGER :: send, recv, group_rank, group_size
INTEGER :: myrow, mycol, np, myid, comm
! counters
INTEGER :: k, i, j, count, step_count, shiftcount, cicle
   INTEGER :: C3dim ! Dimension of the submatrix C3
INTEGER :: nc, nr ! Local dimension of block
INTEGER :: info, sup_recv
INTEGER :: idrowref, idcolref, idref, idrecv
   ! B and BUF_RECV are used to overlap the computation of the matrix multiplication with the shift of the blocks
COMPLEX(DP), ALLOCATABLE, DIMENSION( :, : ) :: B, C, BUF_RECV
COMPLEX(DP) :: first
myrow = desc( la_myr_ )
mycol = desc( la_myc_ )
myid = desc( la_me_ )
np = desc( la_npr_ )
comm = desc( la_comm_ )
IF( desc( la_npr_ ) /= desc( la_npc_ ) ) THEN
      CALL errore( ' pztrtri ', ' only square grids are allowed ', 1 )
END IF
IF( ldx /= desc( nlax_ ) ) THEN
CALL errore( ' pztrtri ', ' wrong leading dimension ldx ', ldx )
END IF
nr = desc( nlar_ )
nc = desc( nlac_ )
! clear elements outside local meaningful block nr*nc
DO j = nc+1, ldx
DO i = 1, ldx
sll( i, j ) = zero
END DO
END DO
DO j = 1, ldx
DO i = nr+1, ldx
sll( i, j ) = zero
END DO
END DO
#if defined __MPI
ALLOCATE( B( ldx, ldx ) )
ALLOCATE( C( ldx, ldx ) )
ALLOCATE( BUF_RECV ( ldx, ldx ) )
IF( np == 2 ) THEN
!
! special case with 4 proc, 2x2 grid
!
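      ! procs (0,0) and (1,1) invert their diagonal blocks and send them to
      ! proc (1,0), which computes C2 = -C3 * A2 * C1 directly; proc (0,1)
      ! simply zeroes its block (it lies in the upper triangle).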
IF( myrow == mycol ) THEN
CALL compute_ztrtri()
END IF
!
CALL GRID2D_RANK( 'R', np, np, 1, 0, idref )
!
IF( myrow == 0 .AND. mycol == 0 ) THEN
CALL MPI_Send(sll, ldx*ldx, MPI_DOUBLE_COMPLEX, idref, 0, comm, ierr)
END IF
!
IF( myrow == 1 .AND. mycol == 1 ) THEN
CALL MPI_Send(sll, ldx*ldx, MPI_DOUBLE_COMPLEX, idref, 1, comm, ierr)
END IF
!
IF( myrow == 1 .AND. mycol == 0 ) THEN
!
CALL GRID2D_RANK( 'R', np, np, 0, 0, i )
CALL GRID2D_RANK( 'R', np, np, 1, 1, j )
!
CALL MPI_Irecv( B, ldx*ldx, MPI_DOUBLE_COMPLEX, i, 0, comm, req(1), ierr)
CALL MPI_Irecv( C, ldx*ldx, MPI_DOUBLE_COMPLEX, j, 1, comm, req(2), ierr)
!
CALL MPI_Wait(req(1), status, ierr)
!
CALL zgemm('N', 'N', ldx, ldx, ldx, ONE, sll, ldx, b, ldx, ZERO, buf_recv, ldx)
!
CALL MPI_Wait(req(2), status, ierr)
!
CALL zgemm('N', 'N', ldx, ldx, ldx, -ONE, c, ldx, buf_recv, ldx, ZERO, sll, ldx)
!
END IF
!
IF( myrow == 0 .AND. mycol == 1 ) THEN
!
sll = zero
!
END IF
!
DEALLOCATE( b, c, buf_recv )
!
RETURN
!
END IF
IF( myrow >= mycol ) THEN
!
      ! only procs on the lower triangle participate
!
CALL MPI_Comm_split( comm, mycol, myrow, col_comm, ierr )
CALL MPI_Comm_size( col_comm, group_size, ierr )
CALL MPI_Comm_rank( col_comm, group_rank, ierr )
!
ELSE
!
      ! the other procs do not take part in the computation
!
CALL MPI_Comm_split( comm, MPI_UNDEFINED, MPI_UNDEFINED, col_comm, ierr )
!
sll = zero
!
END IF
!
   ! Compute the inverse of the lower triangular blocks
   ! along the diagonal of the global array with LAPACK (ztrtri)
!
IF( mycol == myrow ) THEN
!
CALL compute_ztrtri()
!
ELSE IF( myrow > mycol ) THEN
!
buf_recv = sll
!
END IF
IF( myrow >= mycol ) THEN
!
! Broadcast the diagonal blocks to the processors under the diagonal
!
CALL MPI_Bcast( sll, ldx*ldx, MPI_DOUBLE_COMPLEX, 0, col_comm, ierr )
!
END IF
   ! Compute A2 * C1 and start Cannon's algorithm by shifting the column blocks one place to the North
!
IF( myrow > mycol ) THEN
!
CALL zgemm( 'N', 'N', ldx, ldx, ldx, ONE, buf_recv, ldx, sll, ldx, ZERO, c, ldx )
!
send = shift( 1, group_rank, 1, ( group_size - 1 ), 'N' )
recv = shift( 1, group_rank, 1, ( group_size - 1 ), 'S' )
!
CALL MPI_Sendrecv( c, ldx*ldx, MPI_DOUBLE_COMPLEX, send, 0, buf_recv, &
ldx*ldx, MPI_DOUBLE_COMPLEX, recv, 0, col_comm, status, ierr )
!
END IF
   ! Execute Cannon's algorithm to recursively compute the product C2 = -C3 * A2 * C1
!
DO count = ( np - 2 ), 0, -1
C3dim = (np-1) - count ! Dimension of the submatrix C3
first = ZERO
cicle = 0
IF( ( myrow > count ) .AND. ( mycol >= count ) ) THEN
idcolref = count + 1
idrowref = myrow
CALL GRID2D_RANK( 'R', np, np, idrowref, idcolref, idref )
idrecv = idref - 1
! Compute C2 = -C3 * A2 * C1
DO shiftcount = count, np-2
IF(mycol>count)THEN
               ! Execute the virtual shift of the matrix C3 along the row in order to know which processor
               ! has to send its block to C2
IF( cicle == 0)THEN
                  ! virtual shift of block (i,j) of submatrix C3 by (myrow-count) places to the West
send = shift(idref, myid, myrow-count, C3dim, 'W')
ELSE
                  ! virtual shift of block (i,j) of submatrix C3 by one place to the East
send = shift(idref, send, 1, C3dim, 'E')
END IF
IF(send==idref)THEN
CALL MPI_Send(sll, ldx*ldx, MPI_DOUBLE_COMPLEX, idrecv, myid, comm, ierr)
END IF
ELSE
IF( cicle == 0)THEN
                  ! virtual shift of block (i,j) of submatrix C3 by (myrow-count) places to the East
sup_recv = shift(idref, myid+1, myrow-count, C3dim, 'E')
ELSE
                  ! virtual shift of block (i,j) of submatrix C3 by one place to the West
sup_recv = shift(idref, sup_recv, 1, C3dim, 'W')
END IF
CALL MPI_Recv(C, ldx*ldx, MPI_DOUBLE_COMPLEX, sup_recv, sup_recv, comm, status, ierr)
send = shift(1, group_rank, 1, (group_size-1), 'S')
recv = shift(1, group_rank, 1, (group_size-1), 'N')
               ! with non-blocking communication the computation and the shift of the column blocks are overlapped
IF( MOD( cicle, 2 ) == 0 ) THEN
CALL MPI_Isend(BUF_RECV, ldx*ldx, MPI_DOUBLE_COMPLEX, send, group_rank+cicle, col_comm, req(1), ierr)
CALL MPI_Irecv(B, ldx*ldx, MPI_DOUBLE_COMPLEX, recv, recv+cicle, col_comm, req(2), ierr)
CALL zgemm('N', 'N', ldx, ldx, ldx, -ONE, C, ldx, BUF_RECV, ldx, first, sll, ldx)
ELSE
CALL MPI_Isend(B, ldx*ldx, MPI_DOUBLE_COMPLEX, send, group_rank+cicle, col_comm, req(1), ierr)
CALL MPI_Irecv(BUF_RECV, ldx*ldx, MPI_DOUBLE_COMPLEX, recv, recv+cicle, col_comm, req(2), ierr)
CALL zgemm('N', 'N', ldx, ldx, ldx, -ONE, C, ldx, B, ldx, ONE, sll, ldx)
END IF
CALL MPI_Wait(req(1), status, ierr)
CALL MPI_Wait(req(2), status, ierr)
END IF
cicle = cicle + 1
first = ONE
END DO
END IF
END DO
IF( myrow >= mycol ) THEN
CALL mpi_comm_free( col_comm, ierr )
END IF
DEALLOCATE(B)
DEALLOCATE(C)
DEALLOCATE(BUF_RECV)
#else
CALL compute_ztrtri()
#endif
CONTAINS
SUBROUTINE compute_ztrtri()
!
      ! clear the upper triangle (excluding diagonal terms) and invert the local block in place
!
DO j = 1, ldx
DO i = 1, j-1
sll ( i, j ) = zero
END DO
END DO
!
CALL ztrtri( 'L', 'N', nr, sll, ldx, info )
!
IF( info /= 0 ) THEN
CALL errore( ' pztrtri ', ' problem in the local inversion ', info )
END IF
!
END SUBROUTINE compute_ztrtri
INTEGER FUNCTION shift ( idref, id, pos, size, dir )
IMPLICIT NONE
INTEGER :: idref, id, pos, size
CHARACTER ( LEN = 1 ) :: dir
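      !
      ! Cyclic shift of rank "id" by "pos" positions within the ring of "size"
      ! ranks starting at "idref": 'E'/'S' move forward, 'W'/'N' move
      ! backward.  For example, with idref = 0 and size = 4:
      !    shift( 0, 1, 1, 4, 'E' ) = 2    and    shift( 0, 0, 1, 4, 'W' ) = 3
      !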
IF( ( dir == 'E' ) .OR. ( dir == 'S' ) ) THEN
shift = idref + MOD ( ( id - idref ) + pos, size )
ELSE IF( ( dir == 'W' ) .OR. ( dir == 'N' ) ) THEN
shift = idref + MOD ( ( id - idref ) - pos + size, size )
ELSE
shift = -1
END IF
RETURN
END FUNCTION shift
END SUBROUTINE pztrtri
! now the Double Precision subroutine
SUBROUTINE pdtrtri ( sll, ldx, n, desc )
   ! pdtrtri computes the parallel inversion of a lower triangular matrix
   ! distributed among the processes using a 2-D block partitioning.
   ! The algorithm is based on the scheme below and is applied
   ! recursively to each column block C2 below the diagonal.
!
! |-------|-------| |--------------------|--------------------|
! | A1 | 0 | | C1 = trtri(A1) | 0 |
! A = |-------|-------| C = |--------------------|--------------------|
! | A2 | A3 | | C2 = -C3 * A2 * C1 | C3 = trtri(A3) |
! |-------|-------| |--------------------|--------------------|
!
   ! The recursive multiplication step (C2 = -C3 * A2 * C1) is based on Cannon's algorithm
   ! for parallel matrix multiplication and is done with BLAS (dgemm)
!
!
! Arguments
! ============
!
! sll = local block of data
! ldx = leading dimension of one block
   !  n    = size of the global array distributed among the blocks
! desc = descriptor of the matrix distribution
!
!
!
USE kinds
USE parallel_include
USE descriptors, ONLY: descla_local_dims, descla_siz_ , la_myr_ , la_myc_ , la_me_ , nlax_ , &
nlar_ , nlac_ , ilar_ , ilac_ , la_comm_ , la_nx_ , la_npr_ , la_npc_
IMPLICIT NONE
INTEGER, INTENT( IN ) :: n, ldx
INTEGER, INTENT( IN ) :: desc( descla_siz_ )
REAL(DP), INTENT( INOUT ) :: sll( ldx, ldx )
REAL(DP), PARAMETER :: ONE = 1.0_DP
REAL(DP), PARAMETER :: ZERO = 0.0_DP
#if defined __MPI
INTEGER :: status(MPI_STATUS_SIZE)
#endif
INTEGER :: req(2), ierr, col_comm
INTEGER :: send, recv, group_rank, group_size
INTEGER :: myrow, mycol, np, myid, comm
! counters
INTEGER :: k, i, j, count, step_count, shiftcount, cicle
   INTEGER :: C3dim ! Dimension of the submatrix C3
INTEGER :: nc, nr ! Local dimension of block
INTEGER :: info, sup_recv
INTEGER :: idrowref, idcolref, idref, idrecv
   ! B and BUF_RECV are used to overlap the computation of the matrix multiplication with the shift of the blocks
REAL(DP), ALLOCATABLE, DIMENSION( :, : ) :: B, C, BUF_RECV
REAL(DP) :: first
myrow = desc( la_myr_ )
mycol = desc( la_myc_ )
myid = desc( la_me_ )
np = desc( la_npr_ )
comm = desc( la_comm_ )
IF( desc( la_npr_ ) /= desc( la_npc_ ) ) THEN
      CALL errore( ' pdtrtri ', ' only square grids are allowed ', 1 )
END IF
IF( ldx /= desc( nlax_ ) ) THEN
CALL errore( ' pdtrtri ', ' wrong leading dimension ldx ', ldx )
END IF
nr = desc( nlar_ )
nc = desc( nlac_ )
! clear elements outside local meaningful block nr*nc
DO j = nc+1, ldx
DO i = 1, ldx
sll( i, j ) = zero
END DO
END DO
DO j = 1, ldx
DO i = nr+1, ldx
sll( i, j ) = zero
END DO
END DO
#if defined __MPI
ALLOCATE( B( ldx, ldx ) )
ALLOCATE( C( ldx, ldx ) )
ALLOCATE( BUF_RECV ( ldx, ldx ) )
IF( np == 2 ) THEN
!
! special case with 4 proc, 2x2 grid
!
IF( myrow == mycol ) THEN
CALL compute_dtrtri()
END IF
!
CALL GRID2D_RANK( 'R', np, np, 1, 0, idref )
!
IF( myrow == 0 .AND. mycol == 0 ) THEN
CALL MPI_Send(sll, ldx*ldx, MPI_DOUBLE_PRECISION, idref, 0, comm, ierr)
END IF
!
IF( myrow == 1 .AND. mycol == 1 ) THEN
CALL MPI_Send(sll, ldx*ldx, MPI_DOUBLE_PRECISION, idref, 1, comm, ierr)
END IF
!
IF( myrow == 1 .AND. mycol == 0 ) THEN
!
CALL GRID2D_RANK( 'R', np, np, 0, 0, i )
CALL GRID2D_RANK( 'R', np, np, 1, 1, j )
!
CALL MPI_Irecv( B, ldx*ldx, MPI_DOUBLE_PRECISION, i, 0, comm, req(1), ierr)
CALL MPI_Irecv( C, ldx*ldx, MPI_DOUBLE_PRECISION, j, 1, comm, req(2), ierr)
!
CALL MPI_Wait(req(1), status, ierr)
!
CALL dgemm('N', 'N', ldx, ldx, ldx, ONE, sll, ldx, b, ldx, ZERO, buf_recv, ldx)
!
CALL MPI_Wait(req(2), status, ierr)
!
CALL dgemm('N', 'N', ldx, ldx, ldx, -ONE, c, ldx, buf_recv, ldx, ZERO, sll, ldx)
!
END IF
!
IF( myrow == 0 .AND. mycol == 1 ) THEN
!
sll = zero
!
END IF
!
DEALLOCATE( b, c, buf_recv )
!
RETURN
!
END IF
IF( myrow >= mycol ) THEN
!
      ! only procs on the lower triangle participate
!
CALL MPI_Comm_split( comm, mycol, myrow, col_comm, ierr )
CALL MPI_Comm_size( col_comm, group_size, ierr )
CALL MPI_Comm_rank( col_comm, group_rank, ierr )
!
ELSE
!
      ! the other procs do not take part in the computation
!
CALL MPI_Comm_split( comm, MPI_UNDEFINED, MPI_UNDEFINED, col_comm, ierr )
!
sll = zero
!
END IF
!
   ! Compute the inverse of the lower triangular blocks
   ! along the diagonal of the global array with LAPACK (dtrtri)
!
IF( mycol == myrow ) THEN
!
CALL compute_dtrtri()
!
ELSE IF( myrow > mycol ) THEN
!
buf_recv = sll
!
END IF
IF( myrow >= mycol ) THEN
!
! Broadcast the diagonal blocks to the processors under the diagonal
!
CALL MPI_Bcast( sll, ldx*ldx, MPI_DOUBLE_PRECISION, 0, col_comm, ierr )
!
END IF
   ! Compute A2 * C1 and start Cannon's algorithm by shifting the column blocks one place to the North
!
IF( myrow > mycol ) THEN
!
CALL dgemm( 'N', 'N', ldx, ldx, ldx, ONE, buf_recv, ldx, sll, ldx, ZERO, c, ldx )
!
send = shift( 1, group_rank, 1, ( group_size - 1 ), 'N' )
recv = shift( 1, group_rank, 1, ( group_size - 1 ), 'S' )
!
CALL MPI_Sendrecv( c, ldx*ldx, MPI_DOUBLE_PRECISION, send, 0, buf_recv, &
ldx*ldx, MPI_DOUBLE_PRECISION, recv, 0, col_comm, status, ierr )
!
END IF
   ! Execute Cannon's algorithm to recursively compute the product C2 = -C3 * A2 * C1
!
DO count = ( np - 2 ), 0, -1
C3dim = (np-1) - count ! Dimension of the submatrix C3
first = ZERO
cicle = 0
IF( ( myrow > count ) .AND. ( mycol >= count ) ) THEN
idcolref = count + 1
idrowref = myrow
CALL GRID2D_RANK( 'R', np, np, idrowref, idcolref, idref )
idrecv = idref - 1
! Compute C2 = -C3 * A2 * C1
DO shiftcount = count, np-2
IF(mycol>count)THEN
               ! Execute the virtual shift of the matrix C3 along the row in order to know which processor
               ! has to send its block to C2
IF( cicle == 0)THEN
                  ! virtual shift of block (i,j) of submatrix C3 by (myrow-count) places to the West
send = shift(idref, myid, myrow-count, C3dim, 'W')
ELSE
                  ! virtual shift of block (i,j) of submatrix C3 by one place to the East
send = shift(idref, send, 1, C3dim, 'E')
END IF
IF(send==idref)THEN
CALL MPI_Send(sll, ldx*ldx, MPI_DOUBLE_PRECISION, idrecv, myid, comm, ierr)
END IF
ELSE
IF( cicle == 0)THEN
                  ! virtual shift of block (i,j) of submatrix C3 by (myrow-count) places to the East
sup_recv = shift(idref, myid+1, myrow-count, C3dim, 'E')
ELSE
                  ! virtual shift of block (i,j) of submatrix C3 by one place to the West
sup_recv = shift(idref, sup_recv, 1, C3dim, 'W')
END IF
CALL MPI_Recv(C, ldx*ldx, MPI_DOUBLE_PRECISION, sup_recv, sup_recv, comm, status, ierr)
send = shift(1, group_rank, 1, (group_size-1), 'S')
recv = shift(1, group_rank, 1, (group_size-1), 'N')
               ! with non-blocking communication the computation and the shift of the column blocks are overlapped
IF( MOD( cicle, 2 ) == 0 ) THEN
CALL MPI_Isend(BUF_RECV, ldx*ldx, MPI_DOUBLE_PRECISION, send, group_rank+cicle, col_comm, req(1), ierr)
CALL MPI_Irecv(B, ldx*ldx, MPI_DOUBLE_PRECISION, recv, recv+cicle, col_comm, req(2), ierr)
CALL dgemm('N', 'N', ldx, ldx, ldx, -ONE, C, ldx, BUF_RECV, ldx, first, sll, ldx)
ELSE
CALL MPI_Isend(B, ldx*ldx, MPI_DOUBLE_PRECISION, send, group_rank+cicle, col_comm, req(1), ierr)
CALL MPI_Irecv(BUF_RECV, ldx*ldx, MPI_DOUBLE_PRECISION, recv, recv+cicle, col_comm, req(2), ierr)
CALL dgemm('N', 'N', ldx, ldx, ldx, -ONE, C, ldx, B, ldx, ONE, sll, ldx)
END IF
CALL MPI_Wait(req(1), status, ierr)
CALL MPI_Wait(req(2), status, ierr)
END IF
cicle = cicle + 1
first = ONE
END DO
END IF
END DO
IF( myrow >= mycol ) THEN
CALL mpi_comm_free( col_comm, ierr )
END IF
DEALLOCATE(B)
DEALLOCATE(C)
DEALLOCATE(BUF_RECV)
#else
CALL compute_dtrtri()
#endif
CONTAINS
SUBROUTINE compute_dtrtri()
!
      ! clear the upper triangle (excluding diagonal terms) and invert the local block in place
!
DO j = 1, ldx
DO i = 1, j-1
sll ( i, j ) = zero
END DO
END DO
!
CALL dtrtri( 'L', 'N', nr, sll, ldx, info )
!
IF( info /= 0 ) THEN
CALL errore( ' pdtrtri ', ' problem in the local inversion ', info )
END IF
!
END SUBROUTINE compute_dtrtri
INTEGER FUNCTION shift ( idref, id, pos, size, dir )
IMPLICIT NONE
INTEGER :: idref, id, pos, size
CHARACTER ( LEN = 1 ) :: dir
IF( ( dir == 'E' ) .OR. ( dir == 'S' ) ) THEN
shift = idref + MOD ( ( id - idref ) + pos, size )
ELSE IF( ( dir == 'W' ) .OR. ( dir == 'N' ) ) THEN
shift = idref + MOD ( ( id - idref ) - pos + size, size )
ELSE
shift = -1
END IF
RETURN
END FUNCTION shift
END SUBROUTINE pdtrtri