quantum-espresso/LAXlib/ptoolkit.f90


!
! Copyright (C) 2001-2006 Quantum ESPRESSO group
! This file is distributed under the terms of the
! GNU General Public License. See the file `License'
! in the root directory of the present distribution,
! or http://www.gnu.org/copyleft/gpl.txt .
!
! ---------------------------------------------------------------------------------
MODULE laxlib_ptoolkit
IMPLICIT NONE
SAVE
CONTAINS
SUBROUTINE laxlib_dsqmred_x_x( na, a, lda, desca, nb, b, ldb, descb )
!
!! Double precision SQuare Matrix REDistribution
!!
!! Copy a global "na * na" matrix locally stored in "a",
!! and distributed as described by "desca", into a larger
!! global "nb * nb" matrix stored in "b" and distributed
!! as described in "descb".
!!
!
! If you want to read this, be prepared for a headache!
! Written struggling by Carlo Cavazzoni.
!
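! A usage sketch (assuming both descriptors have already been built for
! the same processor grid, with lda = desca%nrcx and ldb = descb%nrcx as
! the checks below require):
!
!    CALL laxlib_dsqmred_x_x( na, a, lda, desca, nb, b, ldb, descb )
!
! The copy proceeds in two phases: rows are exchanged along the process
! columns first, then columns along the process rows.
!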
USE laxlib_descriptor
USE laxlib_parallel_include
!
IMPLICIT NONE
include 'laxlib_kinds.fh'
!
INTEGER, INTENT(IN) :: na
!! global dimension of matrix a
INTEGER, INTENT(IN) :: lda
!! leading dimension of matrix a
REAL(DP) :: a(lda,lda)
!! matrix to be redistributed into b
TYPE(la_descriptor), INTENT(IN) :: desca
!! laxlib descriptor of matrix a
INTEGER, INTENT(IN) :: nb
!! global dimension of matrix b
INTEGER, INTENT(IN) :: ldb
!! leading dimension of matrix b
REAL(DP) :: b(ldb,ldb)
!! redistributed matrix
TYPE(la_descriptor), INTENT(IN) :: descb
!! laxlib descriptor of matrix b
INTEGER :: ipc, ipr, npc, npr
INTEGER :: ipr_old, ir_old, nr_old, irx_old
INTEGER :: ipc_old, ic_old, nc_old, icx_old
INTEGER :: myrow, mycol, ierr, rank
INTEGER :: col_comm, row_comm, comm, sreq
INTEGER :: nr_new, ir_new, irx_new, ir, nr, nrtot, irb, ire
INTEGER :: nc_new, ic_new, icx_new, ic, nc, nctot, icb, ice
INTEGER :: ib, i, j, myid
INTEGER :: nrsnd( desca%npr )
INTEGER :: ncsnd( desca%npr )
INTEGER :: displ( desca%npr )
INTEGER :: irb_new( desca%npr )
INTEGER :: ire_new( desca%npr )
INTEGER :: icb_new( desca%npr )
INTEGER :: ice_new( desca%npr )
REAL(DP), ALLOCATABLE :: buf(:)
REAL(DP), ALLOCATABLE :: ab(:,:)
REAL(DP), ALLOCATABLE :: tst1(:,:)
REAL(DP), ALLOCATABLE :: tst2(:,:)
#if defined __MPI
INTEGER :: istatus( MPI_STATUS_SIZE )
#endif
IF( desca%active_node <= 0 ) THEN
RETURN
END IF
! preliminary consistency checks
IF( nb < na ) &
CALL lax_error__( " dsqmred ", " nb < na, this sub. works only with nb >= na ", nb )
IF( nb /= descb%n ) &
CALL lax_error__( " dsqmred ", " wrong global dim nb ", nb )
IF( na /= desca%n ) &
CALL lax_error__( " dsqmred ", " wrong global dim na ", na )
IF( ldb /= descb%nrcx ) &
CALL lax_error__( " dsqmred ", " wrong leading dim ldb ", ldb )
IF( lda /= desca%nrcx ) &
CALL lax_error__( " dsqmred ", " wrong leading dim lda ", lda )
npr = desca%npr
myrow = desca%myr
npc = desca%npc
mycol = desca%myc
comm = desca%comm
#if defined __MPI
! split communicator into row and col communicators
CALL MPI_Comm_rank( comm, myid, ierr )
IF( ierr /= 0 ) &
CALL lax_error__( " dsqmred ", " in MPI_Comm_rank 1 ", ABS( ierr ) )
CALL MPI_Comm_split( comm, mycol, myrow, col_comm, ierr )
IF( ierr /= 0 ) &
CALL lax_error__( " dsqmred ", " in MPI_Comm_split 1 ", ABS( ierr ) )
CALL MPI_Comm_split( comm, myrow, mycol, row_comm, ierr )
IF( ierr /= 0 ) &
CALL lax_error__( " dsqmred ", " in MPI_Comm_split 2 ", ABS( ierr ) )
CALL MPI_Comm_rank( col_comm, rank, ierr )
IF( ierr /= 0 ) &
CALL lax_error__( " dsqmred ", " in MPI_Comm_rank 2 ", ABS( ierr ) )
IF( rank /= myrow ) &
CALL lax_error__( " dsqmred ", " building col_comm ", rank )
CALL MPI_Comm_rank( row_comm, rank, ierr )
IF( ierr /= 0 ) &
CALL lax_error__( " dsqmred ", " in MPI_Comm_rank 3 ", ABS( ierr ) )
IF( rank /= mycol ) &
CALL lax_error__( " dsqmred ", " building row_comm ", rank )
ALLOCATE( buf( descb%nrcx * descb%nrcx ) )
ALLOCATE( ab( descb%nrcx, desca%nrcx ) )
! write( 3000 + myid, * ) 'na, nb = ', na, nb
DO j = 1, descb%nc
DO i = 1, descb%nr
b( i, j ) = 0.0d0
END DO
END DO
ab = 0.0d0
! first redistribute rows, column groups work in parallel
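! Worked example of the overlap logic below: suppose na = 5, nb = 8 and
! npr = 2, and descla_local_dims splits the old rows as [1,3],[4,5] and
! the new rows as [1,4],[5,8] (the exact split is up to descla_local_dims).
! Old block [4,5] then overlaps new block [1,4] in row 4 only (first branch),
! while its remaining row 5 overlaps new block [5,8] (second branch), so
! one local row is sent to each of the two destination process rows.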
DO ipr = 1, npr
!
CALL descla_local_dims( ir_new, nr_new, nb, descb%nx, npr, ipr-1 )
!
irx_new = ir_new + nr_new - 1
! write( 3000 + myid, * ) 'ir_new, nr_new, irx_new = ', ir_new, nr_new, irx_new
!
DO ipr_old = 1, npr
!
CALL descla_local_dims( ir_old, nr_old, na, desca%nx, npr, ipr_old-1 )
!
irx_old = ir_old + nr_old - 1
!
! write( 3000 + myid, * ) 'ir_old, nr_old, irx_old = ', ir_old, nr_old, irx_old
!
IF( ir_old >= ir_new .AND. ir_old <= irx_new ) THEN
!
nrsnd( ipr_old ) = MIN( nr_old, irx_new - ir_old + 1 )
irb = 1
ire = nrsnd( ipr_old )
irb_new( ipr_old ) = ir_old - ir_new + 1
ire_new( ipr_old ) = irb_new( ipr_old ) + nrsnd( ipr_old ) - 1
!
ELSE IF( ir_new >= ir_old .AND. ir_new <= irx_old ) THEN
!
nrsnd( ipr_old ) = irx_old - ir_new + 1
irb = ir_new - ir_old + 1
ire = nr_old
irb_new( ipr_old ) = 1
ire_new( ipr_old ) = nrsnd( ipr_old )
!
ELSE
! no overlap between old and new row blocks: use empty ranges,
! so that the consistency checks below see zero elements
nrsnd( ipr_old ) = 0
irb = 0
ire = -1
irb_new( ipr_old ) = 0
ire_new( ipr_old ) = -1
END IF
!
! write( 3000 + myid, * ) 'ipr_old, nrsnd = ', ipr_old, nrsnd( ipr_old )
! write( 3000 + myid, * ) 'ipr_old, irb, ire = ', ipr_old, irb, ire
! write( 3000 + myid, * ) 'ipr_old, irb_new, ire_new = ', ipr_old, irb_new( ipr_old ), ire_new( ipr_old )
!
IF( ( myrow == ipr_old - 1 ) .AND. ( nrsnd( ipr_old ) > 0 ) ) THEN
IF( myrow /= ipr - 1 ) THEN
ib = 0
DO j = 1, desca%nc
DO i = irb, ire
ib = ib + 1
buf( ib ) = a( i, j )
END DO
END DO
CALL mpi_isend( buf, ib, MPI_DOUBLE_PRECISION, ipr-1, ipr, col_comm, sreq, ierr )
IF( ierr /= 0 ) &
CALL lax_error__( " dsqmred ", " in mpi_isend ", ABS( ierr ) )
ELSE
DO j = 1, desca%nc
ib = irb
DO i = irb_new( ipr_old ), ire_new( ipr_old )
ab( i, j ) = a( ib, j )
ib = ib + 1
END DO
END DO
END IF
END IF
!
IF( nrsnd( ipr_old ) /= ire - irb + 1 ) &
CALL lax_error__( " dsqmred ", " something wrong with row 1 ", nrsnd( ipr_old ) )
IF( nrsnd( ipr_old ) /= ire_new( ipr_old ) - irb_new( ipr_old ) + 1 ) &
CALL lax_error__( " dsqmred ", " something wrong with row 2 ", nrsnd( ipr_old ) )
!
nrsnd( ipr_old ) = nrsnd( ipr_old ) * desca%nc
!
END DO
!
IF( myrow == ipr - 1 ) THEN
DO ipr_old = 1, npr
IF( nrsnd( ipr_old ) > 0 ) THEN
IF( myrow /= ipr_old - 1 ) THEN
CALL mpi_recv( buf, nrsnd(ipr_old), MPI_DOUBLE_PRECISION, ipr_old-1, ipr, col_comm, istatus, ierr )
IF( ierr /= 0 ) &
CALL lax_error__( " dsqmred ", " in mpi_recv ", ABS( ierr ) )
CALL mpi_get_count( istatus, MPI_DOUBLE_PRECISION, ib, ierr)
IF( ierr /= 0 ) &
CALL lax_error__( " dsqmred ", " in mpi_get_count ", ABS( ierr ) )
IF( ib /= nrsnd(ipr_old) ) &
CALL lax_error__( " dsqmred ", " something wrong with row 3 ", ib )
ib = 0
DO j = 1, desca%nc
DO i = irb_new( ipr_old ), ire_new( ipr_old )
ib = ib + 1
ab( i, j ) = buf( ib )
END DO
END DO
END IF
END IF
END DO
ELSE
DO ipr_old = 1, npr
IF( myrow == ipr_old - 1 .AND. nrsnd( ipr_old ) > 0 ) THEN
CALL MPI_Wait( sreq, istatus, ierr )
IF( ierr /= 0 ) &
CALL lax_error__( " dsqmred ", " in MPI_Wait ", ABS( ierr ) )
END IF
END DO
END IF
!
END DO
! then redistribute cols, row groups work in parallel
DO ipc = 1, npc
!
CALL descla_local_dims( ic_new, nc_new, nb, descb%nx, npc, ipc-1 )
!
icx_new = ic_new + nc_new - 1
!
! write( 3000 + myid, * ) 'ic_new, nc_new, icx_new = ', ic_new, nc_new, icx_new
!
DO ipc_old = 1, npc
!
CALL descla_local_dims( ic_old, nc_old, na, desca%nx, npc, ipc_old-1 )
!
icx_old = ic_old + nc_old - 1
!
! write( 3000 + myid, * ) 'ic_old, nc_old, icx_old = ', ic_old, nc_old, icx_old
!
IF( ic_old >= ic_new .AND. ic_old <= icx_new ) THEN
!
ncsnd( ipc_old ) = MIN( nc_old, icx_new - ic_old + 1 )
icb = 1
ice = ncsnd( ipc_old )
icb_new( ipc_old ) = ic_old - ic_new + 1
ice_new( ipc_old ) = icb_new( ipc_old ) + ncsnd( ipc_old ) - 1
!
ELSE IF( ic_new >= ic_old .AND. ic_new <= icx_old ) THEN
!
ncsnd( ipc_old ) = icx_old - ic_new + 1
icb = ic_new - ic_old + 1
ice = nc_old
icb_new( ipc_old ) = 1
ice_new( ipc_old ) = ncsnd( ipc_old )
!
ELSE
! no overlap between old and new column blocks: empty ranges
ncsnd( ipc_old ) = 0
icb = 0
ice = -1
icb_new( ipc_old ) = 0
ice_new( ipc_old ) = -1
END IF
!
! write( 3000 + myid, * ) 'ipc_old, ncsnd = ', ipc_old, ncsnd( ipc_old )
! write( 3000 + myid, * ) 'ipc_old, icb, ice = ', ipc_old, icb, ice
! write( 3000 + myid, * ) 'ipc_old, icb_new, ice_new = ', ipc_old, icb_new( ipc_old ), ice_new( ipc_old )
IF( ( mycol == ipc_old - 1 ) .AND. ( ncsnd( ipc_old ) > 0 ) ) THEN
IF( mycol /= ipc - 1 ) THEN
ib = 0
DO j = icb, ice
DO i = 1, descb%nrcx
ib = ib + 1
buf( ib ) = ab( i, j )
END DO
END DO
CALL mpi_isend( buf, ib, MPI_DOUBLE_PRECISION, ipc-1, ipc, row_comm, sreq, ierr )
IF( ierr /= 0 ) &
CALL lax_error__( " dsqmred ", " in mpi_isend 2 ", ABS( ierr ) )
ELSE
ib = icb
DO j = icb_new( ipc_old ), ice_new( ipc_old )
DO i = 1, descb%nrcx
b( i, j ) = ab( i, ib )
END DO
ib = ib + 1
END DO
END IF
END IF
IF( ncsnd( ipc_old ) /= ice-icb+1 ) &
CALL lax_error__( " dsqmred ", " something wrong with col 1 ", ncsnd( ipc_old ) )
IF( ncsnd( ipc_old ) /= ice_new( ipc_old ) - icb_new( ipc_old ) + 1 ) &
CALL lax_error__( " dsqmred ", " something wrong with col 2 ", ncsnd( ipc_old ) )
!
ncsnd( ipc_old ) = ncsnd( ipc_old ) * descb%nrcx
!
END DO
!
IF( mycol == ipc - 1 ) THEN
DO ipc_old = 1, npc
IF( ncsnd( ipc_old ) > 0 ) THEN
IF( mycol /= ipc_old - 1 ) THEN
ib = icb_new( ipc_old )
CALL mpi_recv( b( 1, ib ), ncsnd(ipc_old), MPI_DOUBLE_PRECISION, ipc_old-1, ipc, row_comm, istatus, ierr )
IF( ierr /= 0 ) &
CALL lax_error__( " dsqmred ", " in mpi_recv 2 ", ABS( ierr ) )
CALL MPI_GET_COUNT( istatus, MPI_DOUBLE_PRECISION, ib, ierr )
IF( ierr /= 0 ) &
CALL lax_error__( " dsqmred ", " in MPI_GET_COUNT 2 ", ABS( ierr ) )
IF( ib /= ncsnd(ipc_old) ) &
CALL lax_error__( " dsqmred ", " something wrong with col 3 ", ib )
END IF
END IF
END DO
ELSE
DO ipc_old = 1, npc
IF( mycol == ipc_old - 1 .AND. ncsnd( ipc_old ) > 0 ) THEN
CALL MPI_Wait( sreq, istatus, ierr )
IF( ierr /= 0 ) &
CALL lax_error__( " dsqmred ", " in MPI_Wait 2 ", ABS( ierr ) )
END IF
END DO
END IF
!
END DO
DEALLOCATE( ab )
DEALLOCATE( buf )
CALL mpi_comm_free( col_comm, ierr )
IF( ierr /= 0 ) &
CALL lax_error__( " dsqmred ", " in mpi_comm_free 1 ", ABS( ierr ) )
CALL mpi_comm_free( row_comm, ierr )
IF( ierr /= 0 ) &
CALL lax_error__( " dsqmred ", " in mpi_comm_free 2 ", ABS( ierr ) )
#if defined __PIPPO
! this is for debugging: collects the global matrices and tests whether
! the two matrices (before and after the redistribution) coincide.
ALLOCATE( tst1( nb, nb ) )
ALLOCATE( tst2( nb, nb ) )
ALLOCATE( ab( nb, nb ) )
ab = 0.0d0
do j = 1, desca%nc
do i = 1, desca%nr
ab( i + desca%ir - 1, j + desca%ic - 1 ) = a( i , j )
end do
end do
CALL MPI_REDUCE( ab, tst1, nb*nb, MPI_DOUBLE_PRECISION, MPI_SUM, 0, comm, ierr )
ab = 0.0d0
do j = 1, descb%nc
do i = 1, descb%nr
ab( i + descb%ir - 1, j + descb%ic - 1 ) = b( i , j )
end do
end do
CALL MPI_REDUCE( ab, tst2, nb*nb, MPI_DOUBLE_PRECISION, MPI_SUM, 0, comm, ierr )
IF( myid == 0 ) THEN
write( 1000, * ) na, nb, SUM( ABS( tst2 - tst1 ) )
END IF
DEALLOCATE( ab )
DEALLOCATE( tst2 )
DEALLOCATE( tst1 )
#endif
#endif
RETURN
END SUBROUTINE laxlib_dsqmred_x_x
SUBROUTINE laxlib_zsqmred_x_x( na, a, lda, desca, nb, b, ldb, descb )
!
!! double complex (Z) SQuare Matrix REDistribution
!!
!! Copy a global "na * na" matrix locally stored in "a",
!! and distributed as described by "desca", into a larger
!! global "nb * nb" matrix stored in "b" and distributed
!! as described in "descb".
!!
!
! If you want to read this, be prepared for a headache!
! Written struggling by Carlo Cavazzoni.
!
USE laxlib_descriptor
USE laxlib_parallel_include
!
IMPLICIT NONE
include 'laxlib_kinds.fh'
!
INTEGER, INTENT(IN) :: na
!! global dimension of matrix a
INTEGER, INTENT(IN) :: lda
!! leading dimension of matrix a
COMPLEX(DP) :: a(lda,lda)
!! matrix to be redistributed into b
TYPE(la_descriptor), INTENT(IN) :: desca
!! laxlib descriptor of matrix a
INTEGER, INTENT(IN) :: nb
!! global dimension of matrix b
INTEGER, INTENT(IN) :: ldb
!! leading dimension of matrix b
COMPLEX(DP) :: b(ldb,ldb)
!! redistributed matrix
TYPE(la_descriptor), INTENT(IN) :: descb
!! laxlib descriptor of matrix b
INTEGER :: ipc, ipr, npc, npr
INTEGER :: ipr_old, ir_old, nr_old, irx_old
INTEGER :: ipc_old, ic_old, nc_old, icx_old
INTEGER :: myrow, mycol, ierr, rank
INTEGER :: col_comm, row_comm, comm, sreq
INTEGER :: nr_new, ir_new, irx_new, ir, nr, nrtot, irb, ire
INTEGER :: nc_new, ic_new, icx_new, ic, nc, nctot, icb, ice
INTEGER :: ib, i, j, myid
INTEGER :: nrsnd( desca%npr )
INTEGER :: ncsnd( desca%npr )
INTEGER :: displ( desca%npr )
INTEGER :: irb_new( desca%npr )
INTEGER :: ire_new( desca%npr )
INTEGER :: icb_new( desca%npr )
INTEGER :: ice_new( desca%npr )
COMPLEX(DP), ALLOCATABLE :: buf(:)
COMPLEX(DP), ALLOCATABLE :: ab(:,:)
COMPLEX(DP), ALLOCATABLE :: tst1(:,:)
COMPLEX(DP), ALLOCATABLE :: tst2(:,:)
#if defined __MPI
INTEGER :: istatus( MPI_STATUS_SIZE )
#endif
IF( desca%active_node <= 0 ) THEN
RETURN
END IF
! preliminary consistency checks
IF( nb < na ) &
CALL lax_error__( " zsqmred ", " nb < na, this sub. works only with nb >= na ", nb )
IF( nb /= descb%n ) &
CALL lax_error__( " zsqmred ", " wrong global dim nb ", nb )
IF( na /= desca%n ) &
CALL lax_error__( " zsqmred ", " wrong global dim na ", na )
IF( ldb /= descb%nrcx ) &
CALL lax_error__( " zsqmred ", " wrong leading dim ldb ", ldb )
IF( lda /= desca%nrcx ) &
CALL lax_error__( " zsqmred ", " wrong leading dim lda ", lda )
npr = desca%npr
myrow = desca%myr
npc = desca%npc
mycol = desca%myc
comm = desca%comm
#if defined __MPI
! split communicator into row and col communicators
CALL MPI_Comm_rank( comm, myid, ierr )
IF( ierr /= 0 ) &
CALL lax_error__( " zsqmred ", " in MPI_Comm_rank 1 ", ABS( ierr ) )
CALL MPI_Comm_split( comm, mycol, myrow, col_comm, ierr )
IF( ierr /= 0 ) &
CALL lax_error__( " zsqmred ", " in MPI_Comm_split 1 ", ABS( ierr ) )
CALL MPI_Comm_split( comm, myrow, mycol, row_comm, ierr )
IF( ierr /= 0 ) &
CALL lax_error__( " zsqmred ", " in MPI_Comm_split 2 ", ABS( ierr ) )
CALL MPI_Comm_rank( col_comm, rank, ierr )
IF( ierr /= 0 ) &
CALL lax_error__( " zsqmred ", " in MPI_Comm_rank 2 ", ABS( ierr ) )
IF( rank /= myrow ) &
CALL lax_error__( " zsqmred ", " building col_comm ", rank )
CALL MPI_Comm_rank( row_comm, rank, ierr )
IF( ierr /= 0 ) &
CALL lax_error__( " zsqmred ", " in MPI_Comm_rank 3 ", ABS( ierr ) )
IF( rank /= mycol ) &
CALL lax_error__( " zsqmred ", " building row_comm ", rank )
ALLOCATE( buf( descb%nrcx * descb%nrcx ) )
ALLOCATE( ab( descb%nrcx, desca%nrcx ) )
DO j = 1, descb%nc
DO i = 1, descb%nr
b( i, j ) = ( 0_DP , 0_DP )
END DO
END DO
ab = ( 0_DP , 0_DP )
! first redistribute rows, column groups work in parallel
DO ipr = 1, npr
!
CALL descla_local_dims( ir_new, nr_new, nb, descb%nx, npr, ipr-1 )
!
irx_new = ir_new + nr_new - 1
!
DO ipr_old = 1, npr
!
CALL descla_local_dims( ir_old, nr_old, na, desca%nx, npr, ipr_old-1 )
!
irx_old = ir_old + nr_old - 1
!
IF( ir_old >= ir_new .AND. ir_old <= irx_new ) THEN
!
nrsnd( ipr_old ) = MIN( nr_old, irx_new - ir_old + 1 )
irb = 1
ire = nrsnd( ipr_old )
irb_new( ipr_old ) = ir_old - ir_new + 1
ire_new( ipr_old ) = irb_new( ipr_old ) + nrsnd( ipr_old ) - 1
!
ELSE IF( ir_new >= ir_old .AND. ir_new <= irx_old ) THEN
!
nrsnd( ipr_old ) = irx_old - ir_new + 1
irb = ir_new - ir_old + 1
ire = nr_old
irb_new( ipr_old ) = 1
ire_new( ipr_old ) = nrsnd( ipr_old )
!
ELSE
nrsnd( ipr_old ) = 0
irb = 0
ire = -1
irb_new( ipr_old ) = 0
ire_new( ipr_old ) = -1
END IF
!
IF( ( myrow == ipr_old - 1 ) .AND. ( nrsnd( ipr_old ) > 0 ) ) THEN
IF( myrow /= ipr - 1 ) THEN
ib = 0
DO j = 1, desca%nc
DO i = irb, ire
ib = ib + 1
buf( ib ) = a( i, j )
END DO
END DO
CALL mpi_isend( buf, ib, MPI_DOUBLE_COMPLEX, ipr-1, ipr, col_comm, sreq, ierr )
IF( ierr /= 0 ) &
CALL lax_error__( " zsqmred ", " in mpi_isend 1 ", ABS( ierr ) )
ELSE
DO j = 1, desca%nc
ib = irb
DO i = irb_new( ipr_old ), ire_new( ipr_old )
ab( i, j ) = a( ib, j )
ib = ib + 1
END DO
END DO
END IF
END IF
!
IF( nrsnd( ipr_old ) /= ire - irb + 1 ) &
CALL lax_error__( " zsqmred ", " something wrong with row 1 ", nrsnd( ipr_old ) )
IF( nrsnd( ipr_old ) /= ire_new( ipr_old ) - irb_new( ipr_old ) + 1 ) &
CALL lax_error__( " zsqmred ", " something wrong with row 2 ", nrsnd( ipr_old ) )
!
nrsnd( ipr_old ) = nrsnd( ipr_old ) * desca%nc
!
END DO
!
IF( myrow == ipr - 1 ) THEN
DO ipr_old = 1, npr
IF( nrsnd( ipr_old ) > 0 ) THEN
IF( myrow /= ipr_old - 1 ) THEN
CALL mpi_recv( buf, nrsnd(ipr_old), MPI_DOUBLE_COMPLEX, ipr_old-1, ipr, col_comm, istatus, ierr )
IF( ierr /= 0 ) &
CALL lax_error__( " zsqmred ", " in mpi_recv 1 ", ABS( ierr ) )
CALL MPI_GET_COUNT( istatus, MPI_DOUBLE_COMPLEX, ib, ierr)
IF( ierr /= 0 ) &
CALL lax_error__( " zsqmred ", " in MPI_GET_COUNT 1 ", ABS( ierr ) )
IF( ib /= nrsnd(ipr_old) ) &
CALL lax_error__( " zsqmred ", " something wrong with row 3 ", ib )
ib = 0
DO j = 1, desca%nc
DO i = irb_new( ipr_old ), ire_new( ipr_old )
ib = ib + 1
ab( i, j ) = buf( ib )
END DO
END DO
END IF
END IF
END DO
ELSE
DO ipr_old = 1, npr
IF( myrow == ipr_old - 1 .AND. nrsnd( ipr_old ) > 0 ) THEN
CALL MPI_Wait( sreq, istatus, ierr )
IF( ierr /= 0 ) &
CALL lax_error__( " zsqmred ", " in MPI_Wait 1 ", ABS( ierr ) )
END IF
END DO
END IF
!
END DO
! then redistribute cols, row groups work in parallel
DO ipc = 1, npc
!
CALL descla_local_dims( ic_new, nc_new, nb, descb%nx, npc, ipc-1 )
!
icx_new = ic_new + nc_new - 1
!
DO ipc_old = 1, npc
!
CALL descla_local_dims( ic_old, nc_old, na, desca%nx, npc, ipc_old-1 )
!
icx_old = ic_old + nc_old - 1
!
IF( ic_old >= ic_new .AND. ic_old <= icx_new ) THEN
!
ncsnd( ipc_old ) = MIN( nc_old, icx_new - ic_old + 1 )
icb = 1
ice = ncsnd( ipc_old )
icb_new( ipc_old ) = ic_old - ic_new + 1
ice_new( ipc_old ) = icb_new( ipc_old ) + ncsnd( ipc_old ) - 1
!
ELSE IF( ic_new >= ic_old .AND. ic_new <= icx_old ) THEN
!
ncsnd( ipc_old ) = icx_old - ic_new + 1
icb = ic_new - ic_old + 1
ice = nc_old
icb_new( ipc_old ) = 1
ice_new( ipc_old ) = ncsnd( ipc_old )
!
ELSE
ncsnd( ipc_old ) = 0
icb = 0
ice = -1
icb_new( ipc_old ) = 0
ice_new( ipc_old ) = -1
END IF
!
IF( ( mycol == ipc_old - 1 ) .AND. ( ncsnd( ipc_old ) > 0 ) ) THEN
IF( mycol /= ipc - 1 ) THEN
ib = 0
DO j = icb, ice
DO i = 1, descb%nrcx
ib = ib + 1
buf( ib ) = ab( i, j )
END DO
END DO
CALL mpi_isend( buf, ib, MPI_DOUBLE_COMPLEX, ipc-1, ipc, row_comm, sreq, ierr )
IF( ierr /= 0 ) &
CALL lax_error__( " zsqmred ", " in mpi_isend 2 ", ABS( ierr ) )
ELSE
ib = icb
DO j = icb_new( ipc_old ), ice_new( ipc_old )
DO i = 1, descb%nrcx
b( i, j ) = ab( i, ib )
END DO
ib = ib + 1
END DO
END IF
END IF
IF( ncsnd( ipc_old ) /= ice-icb+1 ) &
CALL lax_error__( " zsqmred ", " something wrong with col 1 ", ncsnd( ipc_old ) )
IF( ncsnd( ipc_old ) /= ice_new( ipc_old ) - icb_new( ipc_old ) + 1 ) &
CALL lax_error__( " zsqmred ", " something wrong with col 2 ", ncsnd( ipc_old ) )
!
ncsnd( ipc_old ) = ncsnd( ipc_old ) * descb%nrcx
!
END DO
!
IF( mycol == ipc - 1 ) THEN
DO ipc_old = 1, npc
IF( ncsnd( ipc_old ) > 0 ) THEN
IF( mycol /= ipc_old - 1 ) THEN
ib = icb_new( ipc_old )
CALL mpi_recv( b( 1, ib ), ncsnd(ipc_old), MPI_DOUBLE_COMPLEX, ipc_old-1, ipc, row_comm, istatus, ierr )
IF( ierr /= 0 ) &
CALL lax_error__( " zsqmred ", " in mpi_recv 2 ", ABS( ierr ) )
CALL MPI_GET_COUNT( istatus, MPI_DOUBLE_COMPLEX, ib, ierr )
IF( ierr /= 0 ) &
CALL lax_error__( " zsqmred ", " in MPI_GET_COUNT 2 ", ABS( ierr ) )
IF( ib /= ncsnd(ipc_old) ) &
CALL lax_error__( " zsqmred ", " something wrong with col 3 ", ib )
END IF
END IF
END DO
ELSE
DO ipc_old = 1, npc
IF( mycol == ipc_old - 1 .AND. ncsnd( ipc_old ) > 0 ) THEN
CALL MPI_Wait( sreq, istatus, ierr )
IF( ierr /= 0 ) &
CALL lax_error__( " zsqmred ", " in MPI_Wait 2 ", ABS( ierr ) )
END IF
END DO
END IF
!
END DO
DEALLOCATE( ab )
DEALLOCATE( buf )
CALL mpi_comm_free( col_comm, ierr )
IF( ierr /= 0 ) &
CALL lax_error__( " zsqmred ", " in mpi_comm_free 1 ", ABS( ierr ) )
CALL mpi_comm_free( row_comm, ierr )
IF( ierr /= 0 ) &
CALL lax_error__( " zsqmred ", " in mpi_comm_free 2 ", ABS( ierr ) )
#if defined __PIPPO
! this is for debugging: collects the global matrices and tests whether
! the two matrices (before and after the redistribution) coincide.
ALLOCATE( tst1( nb, nb ) )
ALLOCATE( tst2( nb, nb ) )
ALLOCATE( ab( nb, nb ) )
ab = 0.0d0
do j = 1, desca%nc
do i = 1, desca%nr
ab( i + desca%ir - 1, j + desca%ic - 1 ) = a( i , j )
end do
end do
CALL MPI_REDUCE( ab, tst1, nb*nb, MPI_DOUBLE_COMPLEX, MPI_SUM, 0, comm, ierr )
ab = 0.0d0
do j = 1, descb%nc
do i = 1, descb%nr
ab( i + descb%ir - 1, j + descb%ic - 1 ) = b( i , j )
end do
end do
CALL MPI_REDUCE( ab, tst2, nb*nb, MPI_DOUBLE_COMPLEX, MPI_SUM, 0, comm, ierr )
IF( myid == 0 ) THEN
write( 4000, * ) na, nb, SUM( ABS( tst2 - tst1 ) )
END IF
DEALLOCATE( ab )
DEALLOCATE( tst2 )
DEALLOCATE( tst1 )
#endif
#endif
RETURN
END SUBROUTINE laxlib_zsqmred_x_x
END MODULE laxlib_ptoolkit
! ---------------------------------------------------------------------------------
SUBROUTINE laxlib_dsqmdst_x( n, ar, ldar, a, lda, desc )
!
!! Double precision SQuare Matrix DiSTribution
!! This subroutine takes a replicated square matrix "ar" and distributes it
!! across processors as described by descriptor "desc"
!
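! A usage sketch (assuming "desc" was built for an n x n matrix and the
! local array "a" is at least desc%nrcx x desc%nrcx on every process):
!
!    CALL laxlib_dsqmdst_x( n, ar, SIZE( ar, 1 ), a, SIZE( a, 1 ), desc )
!
! Each process copies its own block ar( ir:ir+nr-1, ic:ic+nc-1 ) into "a"
! and zero-pads the local borders up to nrcx.
!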
USE laxlib_descriptor
!
implicit none
include 'laxlib_kinds.fh'
!
INTEGER, INTENT(IN) :: n
!! global dimension
INTEGER, INTENT(IN) :: ldar
!! leading dimension of matrix ar
REAL(DP) :: ar(ldar,*)
!! matrix to be split, replicated on all processors
INTEGER, INTENT(IN) :: lda
!! leading dimension of matrix a
REAL(DP) :: a(lda,*)
!! distributed matrix a
TYPE(la_descriptor), INTENT(IN) :: desc
!! laxlib descriptor for matrix a
!!
!
REAL(DP), PARAMETER :: zero = 0_DP
!
INTEGER :: i, j, nr, nc, ic, ir, nx
!
IF( desc%active_node <= 0 ) THEN
RETURN
END IF
nx = desc%nrcx
ir = desc%ir
ic = desc%ic
nr = desc%nr
nc = desc%nc
IF( lda < nx ) &
CALL lax_error__( " dsqmdst ", " inconsistent dimension lda ", lda )
IF( n /= desc%n ) &
CALL lax_error__( " dsqmdst ", " inconsistent dimension n ", n )
DO j = 1, nc
DO i = 1, nr
a( i, j ) = ar( i + ir - 1, j + ic - 1 )
END DO
DO i = nr+1, nx
a( i, j ) = zero
END DO
END DO
DO j = nc + 1, nx
DO i = 1, nx
a( i, j ) = zero
END DO
END DO
RETURN
END SUBROUTINE laxlib_dsqmdst_x
SUBROUTINE laxlib_zsqmdst_x( n, ar, ldar, a, lda, desc )
!
!! double complex (Z) SQuare Matrix DiSTribution
!! This subroutine takes a replicated square matrix "ar" and distributes it
!! across processors as described by descriptor "desc"
!
USE laxlib_descriptor
!
implicit none
include 'laxlib_kinds.fh'
!
INTEGER, INTENT(IN) :: n
!! global dimension
INTEGER, INTENT(IN) :: ldar
!! leading dimension of matrix ar
COMPLEX(DP) :: ar(ldar,*)
!! matrix to be split, replicated on all processors
INTEGER, INTENT(IN) :: lda
!! leading dimension of matrix a
COMPLEX(DP) :: a(lda,*)
!! distributed matrix a
TYPE(la_descriptor), INTENT(IN) :: desc
!! laxlib descriptor for matrix a
!!
!
COMPLEX(DP), PARAMETER :: zero = ( 0_DP , 0_DP )
!
INTEGER :: i, j, nr, nc, ic, ir, nx
!
IF( desc%active_node <= 0 ) THEN
RETURN
END IF
nx = desc%nrcx
ir = desc%ir
ic = desc%ic
nr = desc%nr
nc = desc%nc
IF( lda < nx ) &
CALL lax_error__( " zsqmdst ", " inconsistent dimension lda ", lda )
IF( n /= desc%n ) &
CALL lax_error__( " zsqmdst ", " inconsistent dimension n ", n )
DO j = 1, nc
DO i = 1, nr
a( i, j ) = ar( i + ir - 1, j + ic - 1 )
END DO
DO i = nr+1, nx
a( i, j ) = zero
END DO
END DO
DO j = nc + 1, nx
DO i = 1, nx
a( i, j ) = zero
END DO
END DO
RETURN
END SUBROUTINE laxlib_zsqmdst_x
! ---------------------------------------------------------------------------------
SUBROUTINE laxlib_dsqmcll_x( n, a, lda, ar, ldar, desc, comm )
!
!! Double precision SQuare Matrix CoLLect
!! This sub. takes a distributed square matrix "a" and collects
!! the blocks assigned to the processors into a replicated matrix "ar";
!! the matrix is distributed as described by descriptor desc
!!
!
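! A usage sketch (assuming lda equals desc%nrcx, ldar >= n, and "comm"
! spans all processes that need the replicated result):
!
!    CALL laxlib_dsqmcll_x( n, a, lda, ar, ldar, desc, comm )
!
! The local blocks are gathered on the grid root (myr == myc == 0),
! reassembled into "ar", and finally broadcast over "comm".
!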
USE laxlib_descriptor
USE laxlib_parallel_include
!
implicit none
include 'laxlib_kinds.fh'
!
INTEGER, INTENT(IN) :: n
!! global dimension
INTEGER, INTENT(IN) :: ldar
!! leading dimension of matrix ar
REAL(DP) :: ar(ldar,*)
!! merged matrix, replicated on all processors
INTEGER, INTENT(IN) :: lda
!! leading dimension of matrix a
REAL(DP) :: a(lda,*)
!! distributed matrix a
TYPE(la_descriptor), INTENT(IN) :: desc
!! laxlib descriptor for matrix a
INTEGER, INTENT(IN) :: comm
!! mpi communicator
!
INTEGER :: i, j
#if defined __MPI
!
INTEGER :: np, nx, ipc, ipr, npr, npc, noff
INTEGER :: ierr, ir, ic, nr, nc
REAL(DP), ALLOCATABLE :: buf(:,:)
!
IF( desc%active_node > 0 ) THEN
!
np = desc%npr * desc%npc
nx = desc%nrcx
npr = desc%npr
npc = desc%npc
!
IF( desc%myr == 0 .AND. desc%myc == 0 ) THEN
ALLOCATE( buf( nx, nx * np ) )
ELSE
ALLOCATE( buf( 1, 1 ) )
END IF
!
IF( lda /= nx ) &
CALL lax_error__( " dsqmcll ", " inconsistent dimension lda ", lda )
!
IF( desc%n /= n ) &
CALL lax_error__( " dsqmcll ", " inconsistent dimension n ", n )
!
CALL mpi_gather( a, nx*nx, mpi_double_precision, &
buf, nx*nx, mpi_double_precision, 0, desc%comm , ierr )
!
IF( ierr /= 0 ) &
CALL lax_error__( " dsqmcll ", " in gather ", ABS( ierr ) )
!
IF( desc%myr == 0 .AND. desc%myc == 0 ) THEN
DO ipc = 1, npc
CALL descla_local_dims( ic, nc, n, desc%nx, npc, ipc-1 )
DO ipr = 1, npr
CALL descla_local_dims( ir, nr, n, desc%nx, npr, ipr-1 )
noff = ( ipc - 1 + npc * ( ipr - 1 ) ) * nx
DO j = 1, nc
DO i = 1, nr
ar( i + ir - 1, j + ic - 1 ) = buf( i, j + noff )
END DO
END DO
END DO
END DO
END IF
!
DEALLOCATE( buf )
!
END IF
!
CALL mpi_bcast( ar, ldar * n, mpi_double_precision, 0, comm, ierr )
!
IF( ierr /= 0 ) &
CALL lax_error__( " dsqmcll ", " in bcast ", ABS( ierr ) )
#else
DO j = 1, n
DO i = 1, n
ar( i, j ) = a( i, j )
END DO
END DO
#endif
RETURN
END SUBROUTINE laxlib_dsqmcll_x
SUBROUTINE laxlib_zsqmcll_x( n, a, lda, ar, ldar, desc, comm )
!
!! double complex (Z) SQuare Matrix CoLLect
!! This sub. takes a distributed square matrix "a" and collects
!! the blocks assigned to the processors into a replicated matrix "ar";
!! the matrix is distributed as described by descriptor desc
!
USE laxlib_descriptor
USE laxlib_parallel_include
!
implicit none
include 'laxlib_kinds.fh'
!
INTEGER, INTENT(IN) :: n
!! global dimension
INTEGER, INTENT(IN) :: ldar
!! leading dimension of matrix ar
COMPLEX(DP) :: ar(ldar,*)
!! merged matrix, replicated on all processors
INTEGER, INTENT(IN) :: lda
!! leading dimension of matrix a
COMPLEX(DP) :: a(lda,*)
!! distributed matrix a
TYPE(la_descriptor), INTENT(IN) :: desc
!! laxlib descriptor for matrix a
INTEGER, INTENT(IN) :: comm
!! mpi communicator
!
INTEGER :: i, j
#if defined __MPI
!
INTEGER :: np, nx, ipc, ipr, npr, npc, noff
INTEGER :: ierr, ir, ic, nr, nc
COMPLEX(DP), ALLOCATABLE :: buf(:,:)
!
IF( desc%active_node > 0 ) THEN
!
np = desc%npr * desc%npc
nx = desc%nrcx
npr = desc%npr
npc = desc%npc
!
IF( desc%myr == 0 .AND. desc%myc == 0 ) THEN
ALLOCATE( buf( nx, nx * np ) )
ELSE
ALLOCATE( buf( 1, 1 ) )
END IF
!
IF( lda /= nx ) &
CALL lax_error__( " zsqmcll ", " inconsistent dimension lda ", lda )
!
IF( desc%n /= n ) &
CALL lax_error__( " zsqmcll ", " inconsistent dimension n ", n )
!
CALL mpi_gather( a, nx*nx, mpi_double_complex, &
buf, nx*nx, mpi_double_complex, 0, desc%comm , ierr )
!
IF( ierr /= 0 ) &
CALL lax_error__( " zsqmcll ", " in gather ", ABS( ierr ) )
!
IF( desc%myr == 0 .AND. desc%myc == 0 ) THEN
DO ipc = 1, npc
CALL descla_local_dims( ic, nc, n, desc%nx, npc, ipc-1 )
DO ipr = 1, npr
CALL descla_local_dims( ir, nr, n, desc%nx, npr, ipr-1 )
noff = ( ipc - 1 + npc * ( ipr - 1 ) ) * nx
DO j = 1, nc
DO i = 1, nr
ar( i + ir - 1, j + ic - 1 ) = buf( i, j + noff )
END DO
END DO
END DO
END DO
END IF
!
DEALLOCATE( buf )
!
END IF
!
CALL mpi_bcast( ar, ldar * n, mpi_double_complex, 0, comm, ierr )
!
IF( ierr /= 0 ) &
CALL lax_error__( " zsqmcll ", " in bcast ", ABS( ierr ) )
#else
DO j = 1, n
DO i = 1, n
ar( i, j ) = a( i, j )
END DO
END DO
#endif
RETURN
END SUBROUTINE laxlib_zsqmcll_x
! ---------------------------------------------------------------------------------
SUBROUTINE laxlib_dsqmwpb_x( n, a, lda, desc )
!
!! Double precision SQuare Matrix WiPe Border subroutine:
!! initializes to zero the border of the distributed matrix blocks
!
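! Here the "border" is the zero padding of the local block: rows
! desc%nr+1 .. desc%nrcx and columns desc%nc+1 .. desc%nrcx, i.e. the part
! of the nrcx x nrcx local buffer not covered by actual matrix elements.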
USE laxlib_descriptor
!
IMPLICIT NONE
include 'laxlib_kinds.fh'
!
INTEGER, INTENT(IN) :: n
!! global dimension of matrix a
INTEGER, INTENT(IN) :: lda
!! leading dimension of matrix a
REAL(DP) :: a(lda,*)
!! distributed matrix a
TYPE(la_descriptor), INTENT(IN) :: desc
!! laxlib descriptor
!
INTEGER :: i, j
!
DO j = 1, desc%nc
DO i = desc%nr + 1, desc%nrcx
a( i, j ) = 0_DP
END DO
END DO
DO j = desc%nc + 1, desc%nrcx
DO i = 1, desc%nrcx
a( i, j ) = 0_DP
END DO
END DO
!
RETURN
END SUBROUTINE laxlib_dsqmwpb_x
! ---------------------------------------------------------------------------------
SUBROUTINE laxlib_dsqmsym_x( n, a, lda, idesc )
!
!! Double precision SQuare Matrix SYMmetrization
!!
!
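! Communication pattern, in brief: each super-diagonal block (myc > myr)
! is sent to the process holding the transposed position on the grid; the
! receiving sub-diagonal process transposes it locally, while diagonal
! processes symmetrize their own block in place. A minimal call sketch
! (lda must equal the nrcx stored in the descriptor):
!
!    CALL laxlib_dsqmsym_x( n, a, lda, idesc )
!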
USE laxlib_descriptor
USE laxlib_parallel_include
!
IMPLICIT NONE
include 'laxlib_kinds.fh'
include 'laxlib_param.fh'
!
INTEGER, INTENT(IN) :: n
!! global dimension of matrix a
INTEGER, INTENT(IN) :: lda
!! leading dimension of matrix a
REAL(DP) :: a(lda,*)
!! distributed matrix a
INTEGER, INTENT(IN) :: idesc(LAX_DESC_SIZE)
!! laxlib descriptor
!
TYPE(la_descriptor) :: desc
#if defined __MPI
INTEGER :: istatus( MPI_STATUS_SIZE )
#endif
INTEGER :: i, j
INTEGER :: comm
INTEGER :: nr, nc, dest, sreq, ierr, sour
REAL(DP) :: atmp
#if defined __MPI
CALL laxlib_intarray_to_desc(desc,idesc)
IF( desc%active_node <= 0 ) THEN
RETURN
END IF
IF( n /= desc%n ) &
CALL lax_error__( " dsqmsym ", " wrong global dim n ", n )
IF( lda /= desc%nrcx ) &
CALL lax_error__( " dsqmsym ", " wrong leading dim lda ", lda )
comm = desc%comm
nr = desc%nr
nc = desc%nc
IF( desc%myc == desc%myr ) THEN
!
! diagonal block, procs work locally
!
DO j = 1, nc
DO i = j + 1, nr
a(i,j) = a(j,i)
END DO
END DO
!
ELSE IF( desc%myc > desc%myr ) THEN
!
! super diagonal block, procs send the block to sub diag.
!
CALL GRID2D_RANK( 'R', desc%npr, desc%npc, &
desc%myc, desc%myr, dest )
CALL mpi_isend( a, lda*lda, MPI_DOUBLE_PRECISION, dest, 1, comm, sreq, ierr )
!
IF( ierr /= 0 ) &
CALL lax_error__( " dsqmsym ", " in isend ", ABS( ierr ) )
!
ELSE IF( desc%myc < desc%myr ) THEN
!
! sub diagonal block, procs receive the block from super diag,
! then transpose locally
!
CALL GRID2D_RANK( 'R', desc%npr, desc%npc, &
desc%myc, desc%myr, sour )
CALL mpi_recv( a, lda*lda, MPI_DOUBLE_PRECISION, sour, 1, comm, istatus, ierr )
!
IF( ierr /= 0 ) &
CALL lax_error__( " dsqmsym ", " in recv ", ABS( ierr ) )
!
DO j = 1, lda
DO i = j + 1, lda
atmp = a(i,j)
a(i,j) = a(j,i)
a(j,i) = atmp
END DO
END DO
!
END IF
IF( desc%myc > desc%myr ) THEN
!
CALL MPI_Wait( sreq, istatus, ierr )
!
IF( ierr /= 0 ) &
CALL lax_error__( " dsqmsym ", " in wait ", ABS( ierr ) )
!
END IF
#else
DO j = 1, n
!
DO i = j + 1, n
!
a(i,j) = a(j,i)
!
END DO
!
END DO
#endif
RETURN
END SUBROUTINE laxlib_dsqmsym_x
! ---------------------------------------------------------------------------------
#if defined (__CUDA)
SUBROUTINE laxlib_dsqmsym_gpu_x( n, a, lda, idesc )
!
!! Double precision SQuare Matrix SYMmetrization
!! GPU version
!!
!
USE laxlib_descriptor
USE laxlib_parallel_include
USE cudafor
!
IMPLICIT NONE
include 'laxlib_kinds.fh'
include 'laxlib_param.fh'
!
INTEGER, INTENT(IN) :: n
!! global dimension of matrix a
INTEGER, INTENT(IN) :: lda
!! leading dimension of matrix a
REAL(DP), INTENT(INOUT), DEVICE :: a(:,:)
!! distributed matrix a
INTEGER, INTENT(IN) :: idesc(LAX_DESC_SIZE)
!! laxlib descriptor
!
TYPE(la_descriptor) :: desc
#if defined __MPI
INTEGER :: istatus( MPI_STATUS_SIZE )
#if ! defined(__GPU_MPI)
REAL(DP), ALLOCATABLE :: a_h(:,:)
#endif
#endif
INTEGER :: i, j
INTEGER :: comm
INTEGER :: nr, nc, dest, sreq, ierr, sour
REAL(DP) :: atmp
#if defined __MPI
CALL laxlib_intarray_to_desc(desc,idesc)
IF( desc%active_node <= 0 ) THEN
RETURN
END IF
IF( n /= desc%n ) &
CALL lax_error__( " dsqmsym ", " wrong global dim n ", n )
IF( lda /= desc%nrcx ) &
CALL lax_error__( " dsqmsym ", " wrong leading dim lda ", lda )
comm = desc%comm
nr = desc%nr
nc = desc%nc
IF( desc%myc == desc%myr ) THEN
!
! diagonal block, procs work locally
!
!$cuf kernel do(1) <<<*,*>>>
DO j = 1, nc
DO i = j + 1, nr
a(i,j) = a(j,i)
END DO
END DO
!
ELSE IF( desc%myc > desc%myr ) THEN
!
! super diagonal block, procs send the block to sub diag.
!
CALL GRID2D_RANK( 'R', desc%npr, desc%npc, &
desc%myc, desc%myr, dest )
#if defined(__GPU_MPI)
ierr = cudaDeviceSynchronize()
CALL mpi_isend( a, SIZE(a), MPI_DOUBLE_PRECISION, dest, 1, comm, sreq, ierr )
#else
ALLOCATE(a_h, SOURCE = a )
CALL mpi_isend( a_h, SIZE(a_h), MPI_DOUBLE_PRECISION, dest, 1, comm, sreq, ierr )
#endif
!
IF( ierr /= 0 ) &
CALL lax_error__( " dsqmsym ", " in isend ", ABS( ierr ) )
!
ELSE IF( desc%myc < desc%myr ) THEN
!
! sub diagonal block, procs receive the block from super diag,
! then transpose locally
!
CALL GRID2D_RANK( 'R', desc%npr, desc%npc, &
desc%myc, desc%myr, sour )
#if defined(__GPU_MPI)
ierr = cudaDeviceSynchronize()
CALL mpi_recv( a, SIZE(a), MPI_DOUBLE_PRECISION, sour, 1, comm, istatus, ierr )
!$cuf kernel do(1) <<<*,*>>>
DO j = 1, lda
DO i = j + 1, lda
atmp = a(i,j)
a(i,j) = a(j,i)
a(j,i) = atmp
END DO
END DO
#else
ALLOCATE(a_h(SIZE(a,1),SIZE(a,2)))
CALL mpi_recv( a_h, SIZE(a_h), MPI_DOUBLE_PRECISION, sour, 1, comm, istatus, ierr )
DO j = 1, lda
DO i = j + 1, lda
atmp = a_h(i,j)
a_h(i,j) = a_h(j,i)
a_h(j,i) = atmp
END DO
END DO
a = a_h
DEALLOCATE(a_h)
#endif
!
IF( ierr /= 0 ) &
CALL lax_error__( " dsqmsym ", " in recv ", ABS( ierr ) )
!
END IF
IF( desc%myc > desc%myr ) THEN
!
CALL MPI_Wait( sreq, istatus, ierr )
!
#if defined(__GPU_MPI)
#else
DEALLOCATE(a_h)
#endif
IF( ierr /= 0 ) &
CALL lax_error__( " dsqmsym ", " in wait ", ABS( ierr ) )
!
END IF
#else
!$cuf kernel do(1) <<<*,*>>>
DO j = 1, n
DO i = j + 1, n
a(i,j) = a(j,i)
END DO
END DO
#endif
RETURN
END SUBROUTINE laxlib_dsqmsym_gpu_x
#endif
! ---------------------------------------------------------------------------------
SUBROUTINE laxlib_zsqmher_x( n, a, lda, idesc )
!
!! double complex (Z) SQuare Matrix HERmitianize
!!
!
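! Same communication pattern as laxlib_dsqmsym_x above, except that the
! exchanged blocks are conjugated and the diagonal entries are forced to
! be real, so that on exit a(i,j) = CONJG( a(j,i) ) globally.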
USE laxlib_descriptor
USE laxlib_parallel_include
!
IMPLICIT NONE
include 'laxlib_kinds.fh'
include 'laxlib_param.fh'
!
INTEGER, INTENT(IN) :: n
!! global dimension of matrix a
INTEGER, INTENT(IN) :: lda
!! leading dimension of matrix a
COMPLEX(DP) :: a(lda,lda)
!! distributed matrix a
INTEGER, INTENT(IN) :: idesc(LAX_DESC_SIZE)
!! laxlib descriptor
!
TYPE(la_descriptor) :: desc
#if defined __MPI
INTEGER :: istatus( MPI_STATUS_SIZE )
#endif
INTEGER :: i, j
INTEGER :: comm, myid
INTEGER :: nr, nc, dest, sreq, ierr, sour
COMPLEX(DP) :: atmp
COMPLEX(DP), ALLOCATABLE :: tst1(:,:)
COMPLEX(DP), ALLOCATABLE :: tst2(:,:)
#if defined __MPI
CALL laxlib_intarray_to_desc(desc,idesc)
IF( desc%active_node <= 0 ) THEN
RETURN
END IF
IF( n /= desc%n ) &
CALL lax_error__( " zsqmher ", " wrong global dim n ", n )
IF( lda /= desc%nrcx ) &
CALL lax_error__( " zsqmher ", " wrong leading dim lda ", lda )
comm = desc%comm
nr = desc%nr
nc = desc%nc
IF( desc%myc == desc%myr ) THEN
!
! diagonal block, procs work locally
!
DO j = 1, nc
a(j,j) = CMPLX( DBLE( a(j,j) ), 0_DP, KIND=DP )
DO i = j + 1, nr
a(i,j) = CONJG( a(j,i) )
END DO
END DO
!
ELSE IF( desc%myc > desc%myr ) THEN
!
! super diagonal block, procs send the block to sub diag.
!
CALL GRID2D_RANK( 'R', desc%npr, desc%npc, &
desc%myc, desc%myr, dest )
CALL mpi_isend( a, lda*lda, MPI_DOUBLE_COMPLEX, dest, 1, comm, sreq, ierr )
!
IF( ierr /= 0 ) &
CALL lax_error__( " zsqmher ", " in mpi_isend ", ABS( ierr ) )
!
ELSE IF( desc%myc < desc%myr ) THEN
!
! sub diagonal block, procs receive the block from super diag,
! then transpose locally
!
CALL GRID2D_RANK( 'R', desc%npr, desc%npc, &
desc%myc, desc%myr, sour )
CALL mpi_recv( a, lda*lda, MPI_DOUBLE_COMPLEX, sour, 1, comm, istatus, ierr )
!
IF( ierr /= 0 ) &
CALL lax_error__( " zsqmher ", " in mpi_recv ", ABS( ierr ) )
!
DO j = 1, lda
DO i = j + 1, lda
atmp = a(i,j)
a(i,j) = a(j,i)
a(j,i) = atmp
END DO
END DO
DO j = 1, nc
DO i = 1, nr
a(i,j) = CONJG( a(i,j) )
END DO
END DO
!
END IF
IF( desc%myc > desc%myr ) THEN
!
CALL MPI_Wait( sreq, istatus, ierr )
!
IF( ierr /= 0 ) &
CALL lax_error__( " zsqmher ", " in MPI_Wait ", ABS( ierr ) )
!
END IF
#if defined __PIPPO
CALL MPI_Comm_rank( comm, myid, ierr )
ALLOCATE( tst1( n, n ) )
ALLOCATE( tst2( n, n ) )
tst1 = 0.0d0
tst2 = 0.0d0
do j = 1, desc%nc
do i = 1, desc%nr
tst1( i + desc%ir - 1, j + desc%ic - 1 ) = a( i , j )
end do
end do
CALL MPI_REDUCE( tst1, tst2, n*n, MPI_DOUBLE_COMPLEX, MPI_SUM, 0, comm, ierr )
IF( myid == 0 ) THEN
DO j = 1, n
!
IF( tst2(j,j) /= CMPLX( DBLE( tst2(j,j) ), 0_DP, KIND=DP ) ) &
WRITE( 4000, * ) j, tst2(j,j)
!
DO i = j + 1, n
!
IF( tst2(i,j) /= CONJG( tst2(j,i) ) ) WRITE( 4000, * ) i,j, tst2(i,j)
!
END DO
!
END DO
END IF
DEALLOCATE( tst1 )
DEALLOCATE( tst2 )
#endif
#else
DO j = 1, n
!
a(j,j) = CMPLX( DBLE( a(j,j) ), 0_DP, KIND=DP )
!
DO i = j + 1, n
!
a(i,j) = CONJG( a(j,i) )
!
END DO
!
END DO
#endif
RETURN
END SUBROUTINE laxlib_zsqmher_x
! ---------------------------------------------------------------------------------
SUBROUTINE laxlib_dsqmred_x( na, a, lda, idesca, nb, b, ldb, idescb )
!
!! Double precision SQuare Matrix REDistribution
!!
!! Copy a global "na * na" matrix locally stored in "a",
!! and distributed as described by integer "idesca", into a larger
!! global "nb * nb" matrix stored in "b" and distributed
!! as described in integer "idescb".
!
!
USE laxlib_descriptor
USE laxlib_ptoolkit
!
IMPLICIT NONE
!
include 'laxlib_kinds.fh'
include 'laxlib_param.fh'
!
INTEGER, INTENT(IN) :: na
!! global dimension of matrix a
INTEGER, INTENT(IN) :: lda
!! leading dimension of matrix a
REAL(DP) :: a(lda,lda)
!! matrix to be redistributed into b
INTEGER, INTENT(IN) :: idesca(LAX_DESC_SIZE)
!! integer laxlib descriptor matrix a
INTEGER, INTENT(IN) :: nb
!! global dimension of matrix b
INTEGER, INTENT(IN) :: ldb
!! leading dimension of matrix b
REAL(DP) :: b(ldb,ldb)
!! redistributed matrix
INTEGER, INTENT(IN) :: idescb(LAX_DESC_SIZE)
!! integer laxlib descriptor matrix b
!
TYPE(la_descriptor) :: desca
TYPE(la_descriptor) :: descb
!
CALL laxlib_intarray_to_desc(desca,idesca)
CALL laxlib_intarray_to_desc(descb,idescb)
CALL laxlib_dsqmred_x_x( na, a, lda, desca, nb, b, ldb, descb )
END SUBROUTINE
SUBROUTINE laxlib_zsqmred_x( na, a, lda, idesca, nb, b, ldb, idescb )
!
!! double complex (Z) SQuare Matrix REDistribution
!!
!! Copy a global "na * na" matrix locally stored in "a",
!! and distributed as described by integer "idesca", into a larger
!! global "nb * nb" matrix stored in "b" and distributed
!! as described in integer "idescb".
!!
USE laxlib_descriptor
USE laxlib_ptoolkit
!
IMPLICIT NONE
!
include 'laxlib_param.fh'
include 'laxlib_kinds.fh'
!
INTEGER, INTENT(IN) :: na
!! global dimension of matrix a
INTEGER, INTENT(IN) :: lda
!! leading dimension of matrix a
COMPLEX(DP) :: a(lda,lda)
!! matrix to be redistributed into b
INTEGER, INTENT(IN) :: idesca(LAX_DESC_SIZE)
!! integer laxlib descriptor matrix a
INTEGER, INTENT(IN) :: nb
!! global dimension of matrix b
INTEGER, INTENT(IN) :: ldb
!! leading dimension of matrix b
COMPLEX(DP) :: b(ldb,ldb)
!! redistributed matrix
INTEGER, INTENT(IN) :: idescb(LAX_DESC_SIZE)
!! integer laxlib descriptor matrix b
!
TYPE(la_descriptor) :: desca
TYPE(la_descriptor) :: descb
!
CALL laxlib_intarray_to_desc(desca,idesca)
CALL laxlib_intarray_to_desc(descb,idescb)
CALL laxlib_zsqmred_x_x( na, a, lda, desca, nb, b, ldb, descb )
END SUBROUTINE
! ---------------------------------------------------------------------------------
SUBROUTINE rep_matmul_drv( TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC, comm )
!
!! Parallel matrix multiplication with replicated matrix
!!
!! DGEMM PERFORMS ONE OF THE MATRIX-MATRIX OPERATIONS
!!
!! C := ALPHA*OP( A )*OP( B ) + BETA*C,
!!
!! WHERE OP( X ) IS ONE OF
!!
!! OP( X ) = X OR OP( X ) = X',
!!
!! ALPHA AND BETA ARE SCALARS, AND A, B AND C ARE MATRICES, WITH OP( A )
!! AN M BY K MATRIX, OP( B ) A K BY N MATRIX AND C AN M BY N MATRIX.
!
!
! written by Carlo Cavazzoni
!
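! A usage sketch (A, B and C replicated on every process of "comm"; the
! row-block distribution of the product is handled internally, so the
! call looks like a plain DGEMM plus a communicator):
!
!    CALL rep_matmul_drv( 'N', 'N', m, n, k, 1.0_DP, a, SIZE( a, 1 ), &
!                         b, SIZE( b, 1 ), 0.0_DP, c, SIZE( c, 1 ), comm )
!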
USE laxlib_parallel_include
implicit none
include 'laxlib_kinds.fh'
!
CHARACTER(LEN=1), INTENT(IN) :: transa
!! specifies the form of op( A ) to be used in the matrix multiplication as follows:
!! 'N' or 'n', op( A ) = A.
!! 'T' or 't', op( A ) = A**T.
!! 'C' or 'c', op( A ) = A**T.
CHARACTER(LEN=1), INTENT(IN) :: transb
!! specifies the form of op( B ) to be used in the matrix multiplication as follows:
!! 'N' or 'n', op( B ) = B.
!! 'T' or 't', op( B ) = B**T.
!! 'C' or 'c', op( B ) = B**T.
INTEGER, INTENT(IN) :: m
!! number of rows of the matrix A and C
INTEGER, INTENT(IN) :: n
!! number of columns of the matrix B and C
INTEGER, INTENT(IN) :: k
!! number of columns of A and rows of B
REAL(DP), INTENT(IN) :: alpha
!! scalar alpha
REAL(DP), INTENT(IN) :: beta
!! scalar beta
INTEGER, INTENT(IN) :: lda
!! leading dimension of A
INTEGER, INTENT(IN) :: ldb
!! leading dimension of B
INTEGER, INTENT(IN) :: ldc
!! leading dimension of C
REAL(DP) :: a(lda,*)
!! matrix A
REAL(DP) :: b(ldb,*)
!! matrix B
REAL(DP) :: c(ldc,*)
!! matrix C
INTEGER, INTENT(IN) :: comm
!! mpi communicator
!
#if defined __MPI
!
INTEGER :: ME, I, II, J, JJ, IP, SOUR, DEST, INFO, IERR, ioff, ldx
INTEGER :: NB, IB_S, NB_SOUR, IB_SOUR, IBUF
INTEGER :: nproc, mpime, q, r
REAL(DP), ALLOCATABLE :: auxa( : )
REAL(DP), ALLOCATABLE :: auxc( : )
!
! ... BODY
!
CALL MPI_COMM_SIZE(comm, NPROC, IERR)
CALL MPI_COMM_RANK(comm, MPIME, IERR)
IF ( NPROC == 1 ) THEN
! if there is only one proc there is no need for the parallel algorithm
CALL dgemm(TRANSA, TRANSB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc)
RETURN
END IF
ME = MPIME + 1
Q = INT( m / NPROC )
R = MOD( m , NPROC )
! ... Find out the number of elements in the local block
! along "M", the first dimension of matrix A
NB = Q
IF( ME <= R ) NB = NB + 1
! ... Find out the global index of the local first row
IF( ME <= R ) THEN
ib_s = (Q+1)*(ME-1) + 1
ELSE
ib_s = Q*(ME-1) + R + 1
END IF
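! Example of the block distribution above: m = 10, nproc = 4 gives Q = 2
! and R = 2, so processes 1 and 2 own NB = 3 rows starting at rows 1 and 4,
! while processes 3 and 4 own NB = 2 rows starting at rows 7 and 9.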
ldx = m / nproc + 1
ALLOCATE( auxa( MAX( n, k ) * ldx ) )
ALLOCATE( auxc( MAX( n, m ) * ldx ) )
IF( TRANSA == 'N' .OR. TRANSA == 'n' ) THEN
ibuf = 0
ioff = ib_s - 1
DO J = 1, k
DO I = 1, NB
auxa( ibuf + I ) = A( I + ioff, J )
END DO
ibuf = ibuf + ldx
END DO
ELSE
ibuf = 0
ioff = ib_s - 1
DO J = 1, k
DO I = 1, NB
auxa( ibuf + I ) = A( J, I + ioff )
END DO
ibuf = ibuf + ldx
END DO
!ioff = ib_s - 1
!call mytranspose( A( 1, ioff + 1 ), lda, auxa(1), ldx, m, nb)
END IF
IF( beta /= 0.0_DP ) THEN
ibuf = 0
ioff = ib_s - 1
DO J = 1, n
DO I = 1, NB
auxc( ibuf + I ) = C( I + ioff, J )
END DO
ibuf = ibuf + ldx
END DO
END IF
CALL dgemm( 'N', transb, nb, n, k, alpha, auxa(1), ldx, B, ldb, beta, auxc(1), ldx )
! ... Here processors exchange blocks
DO IP = 0, NPROC-1
! ... Find out the number of elements in the block of processor SOUR
NB_SOUR = q
IF( (IP+1) .LE. r ) NB_SOUR = NB_SOUR+1
! ... Find out the global index of the first row owned by SOUR
IF( (IP+1) .LE. r ) THEN
ib_sour = (Q+1)*IP + 1
ELSE
ib_sour = Q*IP + R + 1
END IF
IF( mpime == ip ) auxa(1:n*ldx) = auxc(1:n*ldx)
CALL MPI_BCAST( auxa(1), ldx*n, mpi_double_precision, ip, comm, IERR)
IF( ierr /= 0 ) &
CALL lax_error__( " rep_matmul_drv ", " in MPI_BCAST ", ABS( ierr ) )
IBUF = 0
ioff = IB_SOUR - 1
DO J = 1, N
DO I = 1, NB_SOUR
C( I + ioff, J ) = AUXA( IBUF + I )
END DO
IBUF = IBUF + ldx
END DO
END DO
DEALLOCATE( auxa, auxc )
#else
! if we are not compiling with __MPI this is equivalent to a blas call
CALL dgemm(TRANSA, TRANSB, m, N, k, alpha, A, lda, B, ldb, beta, C, ldc)
#endif
RETURN
END SUBROUTINE rep_matmul_drv
SUBROUTINE zrep_matmul_drv( TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC, comm )
!
!! Parallel matrix multiplication with replicated matrix
!!
!! ZGEMM PERFORMS ONE OF THE MATRIX-MATRIX OPERATIONS
!!
!! C := ALPHA*OP( A )*OP( B ) + BETA*C,
!!
!! WHERE OP( X ) IS ONE OF
!!
!! OP( X ) = X OR OP( X ) = X',
!!
!! ALPHA AND BETA ARE SCALARS, AND A, B AND C ARE MATRICES, WITH OP( A )
!! AN M BY K MATRIX, OP( B ) A K BY N MATRIX AND C AN M BY N MATRIX.
!!
! written by Carlo Cavazzoni
!
USE laxlib_parallel_include
implicit none
include 'laxlib_kinds.fh'
!
CHARACTER(LEN=1), INTENT(IN) :: transa
!! specifies the form of op( A ) to be used in the matrix multiplication as follows:
!! 'N' or 'n', op( A ) = A.
!! 'T' or 't', op( A ) = A**T.
!! 'C' or 'c', op( A ) = A**H.
CHARACTER(LEN=1), INTENT(IN) :: transb
!! specifies the form of op( B ) to be used in the matrix multiplication as follows:
!! 'N' or 'n', op( B ) = B.
!! 'T' or 't', op( B ) = B**T.
!! 'C' or 'c', op( B ) = B**H.
INTEGER, INTENT(IN) :: m
!! number of rows of the matrix A and C
INTEGER, INTENT(IN) :: n
!! number of columns of the matrix B and C
INTEGER, INTENT(IN) :: k
!! number of columns of A and rows of B
COMPLEX(DP), INTENT(IN) :: alpha
!! scalar alpha
COMPLEX(DP), INTENT(IN) :: beta
!! scalar beta
INTEGER, INTENT(IN) :: lda
!! leading dimension of A
INTEGER, INTENT(IN) :: ldb
!! leading dimension of B
INTEGER, INTENT(IN) :: ldc
!! leading dimension of C
COMPLEX(DP) :: a(lda,*)
!! matrix A
COMPLEX(DP) :: b(ldb,*)
!! matrix B
COMPLEX(DP) :: c(ldc,*)
!! matrix C
INTEGER, INTENT(IN) :: comm
!! mpi communicator
!
#if defined __MPI
!
INTEGER :: ME, I, II, J, JJ, IP, SOUR, DEST, INFO, IERR, ioff, ldx
INTEGER :: NB, IB_S, NB_SOUR, IB_SOUR, IBUF
INTEGER :: nproc, mpime, q, r
COMPLEX(DP), ALLOCATABLE :: auxa( : )
COMPLEX(DP), ALLOCATABLE :: auxc( : )
!
! ... BODY
!
CALL MPI_COMM_SIZE(comm, NPROC, IERR)
CALL MPI_COMM_RANK(comm, MPIME, IERR)
IF ( NPROC == 1 ) THEN
! if there is only one proc there is no need for the parallel algorithm
CALL zgemm(TRANSA, TRANSB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc)
RETURN
END IF
ME = MPIME + 1
Q = INT( m / NPROC )
R = MOD( m , NPROC )
! ... Find out the number of elements in the local block
! along "M", the first dimension of matrix A
NB = Q
IF( ME <= R ) NB = NB + 1
! ... Find out the global index of the local first row
IF( ME <= R ) THEN
ib_s = (Q+1)*(ME-1) + 1
ELSE
ib_s = Q*(ME-1) + R + 1
END IF
ldx = m / nproc + 1
ALLOCATE( auxa( MAX( n, k ) * ldx ) )
ALLOCATE( auxc( MAX( n, m ) * ldx ) )
IF( TRANSA == 'N' .OR. TRANSA == 'n' ) THEN
ibuf = 0
ioff = ib_s - 1
DO J = 1, k
DO I = 1, NB
auxa( ibuf + I ) = A( I + ioff, J )
END DO
ibuf = ibuf + ldx
END DO
ELSE
ibuf = 0
ioff = ib_s - 1
DO J = 1, k
DO I = 1, NB
auxa( ibuf + I ) = CONJG( A( J, I + ioff ) )
END DO
ibuf = ibuf + ldx
END DO
!ioff = ib_s - 1
!call mytranspose( A( 1, ioff + 1 ), lda, auxa(1), ldx, m, nb)
END IF
IF( beta /= 0.0_DP ) THEN
ibuf = 0
ioff = ib_s - 1
DO J = 1, n
DO I = 1, NB
auxc( ibuf + I ) = C( I + ioff, J )
END DO
ibuf = ibuf + ldx
END DO
END IF
CALL zgemm( 'N', transb, nb, n, k, alpha, auxa(1), ldx, B, ldb, beta, auxc(1), ldx )
! ... Here processors exchange blocks
DO IP = 0, NPROC-1
! ... Find out the number of elements in the block of processor SOUR
NB_SOUR = q
IF( (IP+1) .LE. r ) NB_SOUR = NB_SOUR+1
! ... Find out the global index of the first row owned by SOUR
IF( (IP+1) .LE. r ) THEN
ib_sour = (Q+1)*IP + 1
ELSE
ib_sour = Q*IP + R + 1
END IF
IF( mpime == ip ) auxa(1:n*ldx) = auxc(1:n*ldx)
CALL MPI_BCAST( auxa(1), ldx*n, mpi_double_complex, ip, comm, IERR)
IF( ierr /= 0 ) &
CALL lax_error__( " zrep_matmul_drv ", " in MPI_BCAST ", ABS( ierr ) )
IBUF = 0
ioff = IB_SOUR - 1
DO J = 1, N
DO I = 1, NB_SOUR
C( I + ioff, J ) = AUXA( IBUF + I )
END DO
IBUF = IBUF + ldx
END DO
END DO
DEALLOCATE( auxa, auxc )
#else
! if we are not compiling with __MPI this is equivalent to a blas call
CALL zgemm(TRANSA, TRANSB, m, N, k, alpha, A, lda, B, ldb, beta, C, ldc)
#endif
RETURN
END SUBROUTINE zrep_matmul_drv
!
!
!=----------------------------------------------------------------------------=!
!
!
! Cannon's algorithm for parallel matrix multiplication
! written by Carlo Cavazzoni
!
!
!
SUBROUTINE sqr_dmm_cannon_x( transa, transb, n, alpha, a, lda, b, ldb, beta, c, ldc, idesc )
!
!!
!! Double precision parallel square matrix multiplication with Cannon's algorithm
!! performs one of the matrix-matrix operations
!!
!! C := ALPHA*OP( A )*OP( B ) + BETA*C,
!!
!! where op( x ) is one of
!!
!! OP( X ) = X OR OP( X ) = X',
!!
!! alpha and beta are scalars, and a, b and c are square matrices
!!
!
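! Cannon's algorithm in brief: the A blocks are first skewed westwards by
! (process row index + 1) and the B blocks northwards by (process column
! index + 1); then, for each of the remaining np-1 steps, every A block
! moves one place east and every B block one place south, with a local
! GEMM accumulated into C after every shift. A call sketch on the square
! grid described by idesc (local arrays sized to the descriptor's nrcx):
!
!    CALL sqr_dmm_cannon_x( 'N', 'N', n, 1.0_DP, a, SIZE( a, 1 ), &
!                           b, SIZE( b, 1 ), 0.0_DP, c, SIZE( c, 1 ), idesc )
!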
USE laxlib_descriptor
USE laxlib_parallel_include
!
IMPLICIT NONE
include 'laxlib_kinds.fh'
include 'laxlib_param.fh'
!
CHARACTER(LEN=1), INTENT(IN) :: transa
!! specifies the form of op( A ) to be used in the matrix multiplication as follows:
!! 'N' or 'n', op( A ) = A.
!! 'T' or 't', op( A ) = A**T.
!! 'C' or 'c', op( A ) = A**T.
CHARACTER(LEN=1), INTENT(IN) :: transb
!! specifies the form of op( B ) to be used in the matrix multiplication as follows:
!! 'N' or 'n', op( B ) = B.
!! 'T' or 't', op( B ) = B**T.
!! 'C' or 'c', op( B ) = B**T.
INTEGER, INTENT(IN) :: n
!! global dimension
REAL(DP), INTENT(IN) :: alpha
!! scalar alpha
REAL(DP), INTENT(IN) :: beta
!! scalar beta
INTEGER, INTENT(IN) :: lda
!! leading dimension of A
INTEGER, INTENT(IN) :: ldb
!! leading dimension of B
INTEGER, INTENT(IN) :: ldc
!! leading dimension of C
REAL(DP) :: a(lda,*)
!! matrix A
REAL(DP) :: b(ldb,*)
!! matrix B
REAL(DP) :: c(ldc,*)
!! matrix C
INTEGER, INTENT(IN) :: idesc(LAX_DESC_SIZE)
!! integer laxlib descriptor
!
TYPE(la_descriptor) :: desc
!
integer :: ierr
integer :: np
integer :: i, j, nr, nc, nb, iter, rowid, colid
logical :: ta, tb
INTEGER :: comm
!
!
real(DP), allocatable :: bblk(:,:), ablk(:,:)
!
#if defined (__MPI)
!
integer :: istatus( MPI_STATUS_SIZE )
!
#endif
!
CALL laxlib_intarray_to_desc(desc,idesc)
!
IF( desc%active_node < 0 ) THEN
!
! processors not interested in this computation return quickly
!
RETURN
!
END IF
IF( n < 1 ) THEN
RETURN
END IF
IF( desc%npr == 1 ) THEN
!
! quick return if only one processor is used
!
CALL dgemm( TRANSA, TRANSB, n, n, n, alpha, a, lda, b, ldb, beta, c, ldc)
!
RETURN
!
END IF
IF( desc%npr /= desc%npc ) &
CALL lax_error__( ' sqr_mm_cannon ', ' works only with square processor mesh ', 1 )
!
! Retrieve communicator and mesh geometry
!
np = desc%npr
comm = desc%comm
rowid = desc%myr
colid = desc%myc
!
! Retrieve the size of the local block
!
nr = desc%nr
nc = desc%nc
nb = desc%nrcx
!
#if defined (__MPI)
CALL MPI_BARRIER( comm, ierr )
IF( ierr /= 0 ) &
CALL lax_error__( " sqr_mm_cannon ", " in MPI_BARRIER ", ABS( ierr ) )
#endif
!
allocate( ablk( nb, nb ) )
DO j = 1, nc
DO i = 1, nr
ablk( i, j ) = a( i, j )
END DO
END DO
!
! Clear memory outside the matrix block
!
DO j = nc+1, nb
DO i = 1, nb
ablk( i, j ) = 0.0_DP
END DO
END DO
DO j = 1, nb
DO i = nr+1, nb
ablk( i, j ) = 0.0_DP
END DO
END DO
!
!
allocate( bblk( nb, nb ) )
DO j = 1, nc
DO i = 1, nr
bblk( i, j ) = b( i, j )
END DO
END DO
!
! Clear memory outside the matrix block
!
DO j = nc+1, nb
DO i = 1, nb
bblk( i, j ) = 0.0_DP
END DO
END DO
DO j = 1, nb
DO i = nr+1, nb
bblk( i, j ) = 0.0_DP
END DO
END DO
!
!
ta = ( TRANSA == 'T' .OR. TRANSA == 't' )
tb = ( TRANSB == 'T' .OR. TRANSB == 't' )
!
! Shift A rowid+1 places to the west
!
IF( ta ) THEN
CALL shift_exch_block( ablk, 'W', 1 )
ELSE
CALL shift_block( ablk, 'W', rowid+1, 1 )
END IF
!
! Shift B colid+1 places to the north
!
IF( tb ) THEN
CALL shift_exch_block( bblk, 'N', np+1 )
ELSE
CALL shift_block( bblk, 'N', colid+1, np+1 )
END IF
!
! Accumulate on C
!
CALL dgemm( TRANSA, TRANSB, nr, nc, nb, alpha, ablk, nb, bblk, nb, beta, c, ldc)
!
DO iter = 2, np
!
! Shift A one place to the east
!
CALL shift_block( ablk, 'E', 1, iter )
!
! Shift B one place to the south
!
CALL shift_block( bblk, 'S', 1, np+iter )
!
! Accumulate on C
!
CALL dgemm( TRANSA, TRANSB, nr, nc, nb, alpha, ablk, nb, bblk, nb, 1.0_DP, c, ldc)
!
END DO
deallocate( ablk, bblk )
RETURN
CONTAINS
SUBROUTINE shift_block( blk, dir, ln, tag )
!
! Block shift
!
IMPLICIT NONE
REAL(DP) :: blk( :, : )
CHARACTER(LEN=1), INTENT(IN) :: dir ! shift direction
INTEGER, INTENT(IN) :: ln ! shift length
INTEGER, INTENT(IN) :: tag ! communication tag
!
INTEGER :: icdst, irdst, icsrc, irsrc, idest, isour
!
IF( dir == 'W' ) THEN
!
irdst = rowid
irsrc = rowid
icdst = MOD( colid - ln + np, np )
icsrc = MOD( colid + ln + np, np )
!
ELSE IF( dir == 'E' ) THEN
!
irdst = rowid
irsrc = rowid
icdst = MOD( colid + ln + np, np )
icsrc = MOD( colid - ln + np, np )
!
ELSE IF( dir == 'N' ) THEN
irdst = MOD( rowid - ln + np, np )
irsrc = MOD( rowid + ln + np, np )
icdst = colid
icsrc = colid
ELSE IF( dir == 'S' ) THEN
irdst = MOD( rowid + ln + np, np )
irsrc = MOD( rowid - ln + np, np )
icdst = colid
icsrc = colid
ELSE
CALL lax_error__( ' sqr_mm_cannon ', ' unknown shift direction ', 1 )
END IF
!
CALL GRID2D_RANK( 'R', np, np, irdst, icdst, idest )
CALL GRID2D_RANK( 'R', np, np, irsrc, icsrc, isour )
!
#if defined (__MPI)
!
CALL MPI_SENDRECV_REPLACE(blk, nb*nb, MPI_DOUBLE_PRECISION, &
idest, tag, isour, tag, comm, istatus, ierr)
IF( ierr /= 0 ) &
CALL lax_error__( " sqr_mm_cannon ", " in MPI_SENDRECV_REPLACE ", ABS( ierr ) )
!
#endif
RETURN
END SUBROUTINE shift_block
SUBROUTINE shift_exch_block( blk, dir, tag )
!
! Combined block shift and exchange
! only used for the first step
!
IMPLICIT NONE
REAL(DP) :: blk( :, : )
CHARACTER(LEN=1), INTENT(IN) :: dir
INTEGER, INTENT(IN) :: tag
!
INTEGER :: icdst, irdst, icsrc, irsrc, idest, isour
INTEGER :: icol, irow
!
IF( dir == 'W' ) THEN
!
icol = rowid
irow = colid
!
irdst = irow
icdst = MOD( icol - irow-1 + np, np )
!
irow = rowid
icol = MOD( colid + rowid+1 + np, np )
!
irsrc = icol
icsrc = irow
!
ELSE IF( dir == 'N' ) THEN
!
icol = rowid
irow = colid
!
icdst = icol
irdst = MOD( irow - icol-1 + np, np )
!
irow = MOD( rowid + colid+1 + np, np )
icol = colid
!
irsrc = icol
icsrc = irow
ELSE
CALL lax_error__( ' sqr_mm_cannon ', ' unknown shift_exch direction ', 1 )
END IF
!
CALL GRID2D_RANK( 'R', np, np, irdst, icdst, idest )
CALL GRID2D_RANK( 'R', np, np, irsrc, icsrc, isour )
!
#if defined (__MPI)
!
CALL MPI_SENDRECV_REPLACE(blk, nb*nb, MPI_DOUBLE_PRECISION, &
idest, tag, isour, tag, comm, istatus, ierr)
IF( ierr /= 0 ) &
CALL lax_error__( " sqr_mm_cannon ", " in MPI_SENDRECV_REPLACE 2 ", ABS( ierr ) )
!
#endif
RETURN
END SUBROUTINE shift_exch_block
END SUBROUTINE sqr_dmm_cannon_x
!=----------------------------------------------------------------------------=!
#if defined (__CUDA)
SUBROUTINE sqr_dmm_cannon_gpu_x( transa, transb, n, alpha, a, lda, b, ldb, beta, c, ldc, idesc )
!
!!
!! Double precision parallel square matrix multiplication with Cannon's algorithm
!! performs one of the matrix-matrix operations
!!
!! GPU version
!!
!! C := ALPHA*OP( A )*OP( B ) + BETA*C,
!!
!! where op( x ) is one of
!!
!! OP( X ) = X OR OP( X ) = X',
!!
!! alpha and beta are scalars, and a, b and c are square matrices
!!
!
USE laxlib_descriptor
USE laxlib_parallel_include
USE cudafor
USE cublas
!
IMPLICIT NONE
include 'laxlib_kinds.fh'
include 'laxlib_param.fh'
!
CHARACTER(LEN=1), INTENT(IN) :: transa
!! specifies the form of op( A ) to be used in the matrix multiplication as follows:
!! 'N' or 'n', op( A ) = A.
!! 'T' or 't', op( A ) = A**T.
!! 'C' or 'c', op( A ) = A**T.
CHARACTER(LEN=1), INTENT(IN) :: transb
!! specifies the form of op( B ) to be used in the matrix multiplication as follows:
!! 'N' or 'n', op( B ) = B.
!! 'T' or 't', op( B ) = B**T.
!! 'C' or 'c', op( B ) = B**T.
INTEGER, INTENT(IN) :: n
!! global dimension
REAL(DP), INTENT(IN) :: alpha
!! scalar alpha
REAL(DP), INTENT(IN) :: beta
!! scalar beta
INTEGER, INTENT(IN) :: lda
!! leading dimension of A
INTEGER, INTENT(IN) :: ldb
!! leading dimension of B
INTEGER, INTENT(IN) :: ldc
!! leading dimension of C
REAL(DP), DEVICE :: a(lda,*)
!! matrix A
REAL(DP), DEVICE :: b(ldb,*)
!! matrix B
REAL(DP), DEVICE :: c(ldc,*)
!! matrix C
INTEGER, INTENT(IN) :: idesc(LAX_DESC_SIZE)
!! integer laxlib descriptor
!
TYPE(la_descriptor) :: desc
!
integer :: ierr
integer :: np
integer :: i, j, nr, nc, nb, iter, rowid, colid
logical :: ta, tb
INTEGER :: comm
!
!
real(DP), DEVICE, allocatable :: bblk(:,:), ablk(:,:)
!
#if defined (__MPI)
!
integer :: istatus( MPI_STATUS_SIZE )
!
#endif
!
ierr = cudaDeviceSynchronize()
!
CALL laxlib_intarray_to_desc(desc,idesc)
!
IF( desc%active_node < 0 ) THEN
!
! processors not interested in this computation return quickly
!
RETURN
!
END IF
IF( n < 1 ) THEN
RETURN
END IF
IF( desc%npr == 1 ) THEN
!
! quick return if only one processor is used
!
CALL cublasdgemm( TRANSA, TRANSB, n, n, n, alpha, a, lda, b, ldb, beta, c, ldc)
!
RETURN
!
END IF
IF( desc%npr /= desc%npc ) &
CALL lax_error__( ' sqr_mm_cannon ', ' works only with square processor mesh ', 1 )
!
! Retrieve communicator and mesh geometry
!
np = desc%npr
comm = desc%comm
rowid = desc%myr
colid = desc%myc
!
! Retrieve the size of the local block
!
nr = desc%nr
nc = desc%nc
nb = desc%nrcx
!
#if defined (__MPI)
CALL MPI_BARRIER( comm, ierr )
IF( ierr /= 0 ) &
CALL lax_error__( " sqr_mm_cannon ", " in MPI_BARRIER ", ABS( ierr ) )
#endif
!
allocate( ablk( nb, nb ) )
!$cuf kernel do(2) <<<*,*>>>
DO j = 1, nc
DO i = 1, nr
ablk( i, j ) = a( i, j )
END DO
END DO
!
! Clear memory outside the matrix block
!
!$cuf kernel do(2) <<<*,*>>>
DO j = nc+1, nb
DO i = 1, nb
ablk( i, j ) = 0.0_DP
END DO
END DO
!$cuf kernel do(2) <<<*,*>>>
DO j = 1, nb
DO i = nr+1, nb
ablk( i, j ) = 0.0_DP
END DO
END DO
!
!
allocate( bblk( nb, nb ) )
!$cuf kernel do(2) <<<*,*>>>
DO j = 1, nc
DO i = 1, nr
bblk( i, j ) = b( i, j )
END DO
END DO
!
! Clear memory outside the matrix block
!
!$cuf kernel do(2) <<<*,*>>>
DO j = nc+1, nb
DO i = 1, nb
bblk( i, j ) = 0.0_DP
END DO
END DO
!$cuf kernel do(2) <<<*,*>>>
DO j = 1, nb
DO i = nr+1, nb
bblk( i, j ) = 0.0_DP
END DO
END DO
!
ierr = cudaDeviceSynchronize()
!
ta = ( TRANSA == 'T' .OR. TRANSA == 't' )
tb = ( TRANSB == 'T' .OR. TRANSB == 't' )
!
! Shift A rowid+1 places to the west
!
IF( ta ) THEN
CALL shift_exch_block( ablk, 'W', 1 )
ELSE
CALL shift_block( ablk, 'W', rowid+1, 1 )
END IF
!
! Shift B colid+1 places to the north
!
IF( tb ) THEN
CALL shift_exch_block( bblk, 'N', np+1 )
ELSE
CALL shift_block( bblk, 'N', colid+1, np+1 )
END IF
!
! Accumulate on C
!
CALL cublasdgemm( TRANSA, TRANSB, nr, nc, nb, alpha, ablk, nb, bblk, nb, beta, c, ldc)
!
DO iter = 2, np
!
ierr = cudaDeviceSynchronize()
!
! Shift A one place to the east
!
CALL shift_block( ablk, 'E', 1, iter )
!
! Shift B one place to the south
!
CALL shift_block( bblk, 'S', 1, np+iter )
!
! Accumulate on C
!
CALL cublasdgemm( TRANSA, TRANSB, nr, nc, nb, alpha, ablk, nb, bblk, nb, 1.0_DP, c, ldc)
!
END DO
ierr = cudaDeviceSynchronize()
deallocate( ablk, bblk )
RETURN
CONTAINS
SUBROUTINE shift_block( blk, dir, ln, tag )
!
! Block shift
!
IMPLICIT NONE
REAL(DP), DEVICE :: blk( :, : )
CHARACTER(LEN=1), INTENT(IN) :: dir ! shift direction
INTEGER, INTENT(IN) :: ln ! shift length
INTEGER, INTENT(IN) :: tag ! communication tag
!
INTEGER :: icdst, irdst, icsrc, irsrc, idest, isour
#if ! defined(__GPU_MPI)
REAL(DP), ALLOCATABLE :: blk_h( :, : )
ALLOCATE( blk_h, SOURCE = blk )
ierr = cudaDeviceSynchronize()
#endif
!
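! Source/destination coordinates treat the process grid as an np x np
! torus: a shift of ln places sends the block to MOD( coord -/+ ln + np, np )
! and receives from the opposite neighbour; the "+ np" keeps the MOD
! argument non-negative. E.g. with np = 4, colid = 0, ln = 1, a westward
! shift sends to column MOD( 0 - 1 + 4, 4 ) = 3 and receives from column
! MOD( 0 + 1 + 4, 4 ) = 1.
!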
IF( dir == 'W' ) THEN
!
irdst = rowid
irsrc = rowid
icdst = MOD( colid - ln + np, np )
icsrc = MOD( colid + ln + np, np )
!
ELSE IF( dir == 'E' ) THEN
!
irdst = rowid
irsrc = rowid
icdst = MOD( colid + ln + np, np )
icsrc = MOD( colid - ln + np, np )
!
ELSE IF( dir == 'N' ) THEN
irdst = MOD( rowid - ln + np, np )
irsrc = MOD( rowid + ln + np, np )
icdst = colid
icsrc = colid
ELSE IF( dir == 'S' ) THEN
irdst = MOD( rowid + ln + np, np )
irsrc = MOD( rowid - ln + np, np )
icdst = colid
icsrc = colid
ELSE
CALL lax_error__( ' sqr_mm_cannon_gpu ', ' unknown shift direction ', 1 )
END IF
!
CALL GRID2D_RANK( 'R', np, np, irdst, icdst, idest )
CALL GRID2D_RANK( 'R', np, np, irsrc, icsrc, isour )
!
#if defined (__MPI)
!
#if defined(__GPU_MPI)
ierr = cudaDeviceSynchronize()
CALL MPI_SENDRECV_REPLACE(blk, SIZE(blk), MPI_DOUBLE_PRECISION, &
idest, tag, isour, tag, comm, istatus, ierr)
IF( ierr /= 0 ) &
CALL lax_error__( " sqr_mm_cannon_gpu ", " in MPI_SENDRECV_REPLACE ", ABS( ierr ) )
#else
CALL MPI_SENDRECV_REPLACE(blk_h, SIZE(blk_h), MPI_DOUBLE_PRECISION, &
idest, tag, isour, tag, comm, istatus, ierr)
IF( ierr /= 0 ) &
CALL lax_error__( " sqr_mm_cannon_gpu ", " in MPI_SENDRECV_REPLACE ", ABS( ierr ) )
blk = blk_h
ierr = cudaDeviceSynchronize()
DEALLOCATE( blk_h )
#endif
!
#endif
RETURN
END SUBROUTINE shift_block
SUBROUTINE shift_exch_block( blk, dir, tag )
!
! Combined block shift and exchange
! only used for the first step
!
IMPLICIT NONE
REAL(DP), DEVICE :: blk( :, : )
CHARACTER(LEN=1), INTENT(IN) :: dir
INTEGER, INTENT(IN) :: tag
!
INTEGER :: icdst, irdst, icsrc, irsrc, idest, isour
INTEGER :: icol, irow
#if ! defined(__GPU_MPI)
REAL(DP), ALLOCATABLE :: blk_h( :, : )
ALLOCATE( blk_h, SOURCE = blk )
ierr = cudaDeviceSynchronize()
#endif
!
IF( dir == 'W' ) THEN
!
icol = rowid
irow = colid
!
irdst = irow
icdst = MOD( icol - irow-1 + np, np )
!
irow = rowid
icol = MOD( colid + rowid+1 + np, np )
!
irsrc = icol
icsrc = irow
!
ELSE IF( dir == 'N' ) THEN
!
icol = rowid
irow = colid
!
icdst = icol
irdst = MOD( irow - icol-1 + np, np )
!
irow = MOD( rowid + colid+1 + np, np )
icol = colid
!
irsrc = icol
icsrc = irow
ELSE
CALL lax_error__( ' sqr_mm_cannon_gpu ', ' unknown shift_exch direction ', 1 )
END IF
!
CALL GRID2D_RANK( 'R', np, np, irdst, icdst, idest )
CALL GRID2D_RANK( 'R', np, np, irsrc, icsrc, isour )
!
#if defined (__MPI)
!
#if defined(__GPU_MPI)
ierr = cudaDeviceSynchronize()
CALL MPI_SENDRECV_REPLACE(blk, SIZE(blk), MPI_DOUBLE_PRECISION, &
idest, tag, isour, tag, comm, istatus, ierr)
IF( ierr /= 0 ) &
CALL lax_error__( " sqr_mm_cannon_gpu ", " in MPI_SENDRECV_REPLACE 2 ", ABS( ierr ) )
#else
CALL MPI_SENDRECV_REPLACE(blk_h, SIZE(blk_h), MPI_DOUBLE_PRECISION, &
idest, tag, isour, tag, comm, istatus, ierr)
IF( ierr /= 0 ) &
CALL lax_error__( " sqr_mm_cannon_gpu ", " in MPI_SENDRECV_REPLACE 2 ", ABS( ierr ) )
blk = blk_h
ierr = cudaDeviceSynchronize()
DEALLOCATE( blk_h )
#endif
!
#endif
RETURN
END SUBROUTINE shift_exch_block
END SUBROUTINE sqr_dmm_cannon_gpu_x
#endif
!=----------------------------------------------------------------------------=!
SUBROUTINE sqr_smm_cannon_x( transa, transb, n, alpha, a, lda, b, ldb, beta, c, ldc, idesc )
!
!! Single precision parallel square matrix multiplication with Cannon's algorithm
!! performs one of the matrix-matrix operations
!!
!! C := ALPHA*OP( A )*OP( B ) + BETA*C,
!!
!! where op( x ) is one of
!!
!! OP( X ) = X OR OP( X ) = X',
!!
!! alpha and beta are scalars, and a, b and c are square matrices
!
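! The routine below follows the two phases of Cannon's algorithm:
! (1) alignment, where process row "rowid" shifts its A block rowid+1
!     places west and process column "colid" shifts its B block colid+1
!     places north, so that matching inner blocks meet on each process;
! (2) np-1 multiply-and-shift steps, each accumulating a local GEMM into
!     C and then moving A one place east and B one place south.
! In total each process performs np local GEMMs, one per block of the
! inner dimension.
!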
USE laxlib_descriptor
USE laxlib_parallel_include
!
IMPLICIT NONE
include 'laxlib_kinds.fh'
include 'laxlib_param.fh'
!
CHARACTER(LEN=1), INTENT(IN) :: transa
!! specifies the form of op( A ) to be used in the matrix multiplication as follows:
!! 'N' or 'n', op( A ) = A.
!! 'T' or 't', op( A ) = A**T.
!! 'C' or 'c', op( A ) = A**T.
CHARACTER(LEN=1), INTENT(IN) :: transb
!! specifies the form of op( B ) to be used in the matrix multiplication as follows:
!! 'N' or 'n', op( B ) = B.
!! 'T' or 't', op( B ) = B**T.
!! 'C' or 'c', op( B ) = B**T.
INTEGER, INTENT(IN) :: n
!! global dimension
REAL(SP), INTENT(IN) :: alpha
!! scalar alpha
REAL(SP), INTENT(IN) :: beta
!! scalar beta
INTEGER, INTENT(IN) :: lda
!! leading dimension of A
INTEGER, INTENT(IN) :: ldb
!! leading dimension of B
INTEGER, INTENT(IN) :: ldc
!! leading dimension of C
REAL(SP) :: a(lda,*)
!! matrix A
REAL(SP) :: b(ldb,*)
!! matrix B
REAL(SP) :: c(ldc,*)
!! matrix C
INTEGER, INTENT(IN) :: idesc(LAX_DESC_SIZE)
!! integer laxlib descriptor
!
TYPE(la_descriptor) :: desc
!
integer :: ierr
integer :: np
integer :: i, j, nr, nc, nb, iter, rowid, colid
logical :: ta, tb
INTEGER :: comm
!
!
real(SP), allocatable :: bblk(:,:), ablk(:,:)
!
#if defined (__MPI)
!
integer :: istatus( MPI_STATUS_SIZE )
!
#endif
!
CALL laxlib_intarray_to_desc(desc,idesc)
!
IF( desc%active_node < 0 ) THEN
!
! processors not interested in this computation return quickly
!
RETURN
!
END IF
IF( n < 1 ) THEN
RETURN
END IF
IF( desc%npr == 1 ) THEN
!
! quick return if only one processor is used
!
CALL sgemm( TRANSA, TRANSB, n, n, n, alpha, a, lda, b, ldb, beta, c, ldc)
!
RETURN
!
END IF
IF( desc%npr /= desc%npc ) &
CALL lax_error__( ' sqr_smm_cannon ', ' works only with square processor mesh ', 1 )
!
! Retrieve communicator and mesh geometry
!
np = desc%npr
comm = desc%comm
rowid = desc%myr
colid = desc%myc
!
! Retrieve the size of the local block
!
nr = desc%nr
nc = desc%nc
nb = desc%nrcx
!
#if defined (__MPI)
CALL MPI_BARRIER( comm, ierr )
IF( ierr /= 0 ) &
CALL lax_error__( " sqr_smm_cannon ", " in MPI_BARRIER ", ABS( ierr ) )
#endif
!
allocate( ablk( nb, nb ) )
DO j = 1, nc
DO i = 1, nr
ablk( i, j ) = a( i, j )
END DO
END DO
!
! Clear memory outside the matrix block
!
DO j = nc+1, nb
DO i = 1, nb
ablk( i, j ) = 0.0_SP
END DO
END DO
DO j = 1, nb
DO i = nr+1, nb
ablk( i, j ) = 0.0_SP
END DO
END DO
!
!
allocate( bblk( nb, nb ) )
DO j = 1, nc
DO i = 1, nr
bblk( i, j ) = b( i, j )
END DO
END DO
!
! Clear memory outside the matrix block
!
DO j = nc+1, nb
DO i = 1, nb
bblk( i, j ) = 0.0_SP
END DO
END DO
DO j = 1, nb
DO i = nr+1, nb
bblk( i, j ) = 0.0_SP
END DO
END DO
!
!
ta = ( TRANSA == 'T' .OR. TRANSA == 't' )
tb = ( TRANSB == 'T' .OR. TRANSB == 't' )
!
! Shift A rowid+1 places to the west
!
IF( ta ) THEN
CALL shift_exch_block( ablk, 'W', 1 )
ELSE
CALL shift_block( ablk, 'W', rowid+1, 1 )
END IF
!
! Shift B colid+1 places to the north
!
IF( tb ) THEN
CALL shift_exch_block( bblk, 'N', np+1 )
ELSE
CALL shift_block( bblk, 'N', colid+1, np+1 )
END IF
!
! Accumulate on C
!
CALL sgemm( TRANSA, TRANSB, nr, nc, nb, alpha, ablk, nb, bblk, nb, beta, c, ldc)
!
DO iter = 2, np
!
! Shift A one place to the east
!
CALL shift_block( ablk, 'E', 1, iter )
!
! Shift B one place to the south
!
CALL shift_block( bblk, 'S', 1, np+iter )
!
! Accumulate on C
!
CALL sgemm( TRANSA, TRANSB, nr, nc, nb, alpha, ablk, nb, bblk, nb, 1.0_SP, c, ldc)
!
END DO
deallocate( ablk, bblk )
RETURN
CONTAINS
SUBROUTINE shift_block( blk, dir, ln, tag )
!
! Block shift
!
IMPLICIT NONE
REAL(SP) :: blk( :, : )
CHARACTER(LEN=1), INTENT(IN) :: dir ! shift direction
INTEGER, INTENT(IN) :: ln ! shift length
INTEGER, INTENT(IN) :: tag ! communication tag
!
INTEGER :: icdst, irdst, icsrc, irsrc, idest, isour
!
IF( dir == 'W' ) THEN
!
irdst = rowid
irsrc = rowid
icdst = MOD( colid - ln + np, np )
icsrc = MOD( colid + ln + np, np )
!
ELSE IF( dir == 'E' ) THEN
!
irdst = rowid
irsrc = rowid
icdst = MOD( colid + ln + np, np )
icsrc = MOD( colid - ln + np, np )
!
ELSE IF( dir == 'N' ) THEN
irdst = MOD( rowid - ln + np, np )
irsrc = MOD( rowid + ln + np, np )
icdst = colid
icsrc = colid
ELSE IF( dir == 'S' ) THEN
irdst = MOD( rowid + ln + np, np )
irsrc = MOD( rowid - ln + np, np )
icdst = colid
icsrc = colid
ELSE
CALL lax_error__( ' sqr_smm_cannon ', ' unknown shift direction ', 1 )
END IF
!
CALL GRID2D_RANK( 'R', np, np, irdst, icdst, idest )
CALL GRID2D_RANK( 'R', np, np, irsrc, icsrc, isour )
!
#if defined (__MPI)
!
CALL MPI_SENDRECV_REPLACE(blk, nb*nb, MPI_REAL, &
idest, tag, isour, tag, comm, istatus, ierr)
IF( ierr /= 0 ) &
CALL lax_error__( " sqr_smm_cannon ", " in MPI_SENDRECV_REPLACE ", ABS( ierr ) )
!
#endif
RETURN
END SUBROUTINE shift_block
SUBROUTINE shift_exch_block( blk, dir, tag )
!
! Combined block shift and exchange
! only used for the first step
!
IMPLICIT NONE
REAL(SP) :: blk( :, : )
CHARACTER(LEN=1), INTENT(IN) :: dir
INTEGER, INTENT(IN) :: tag
!
INTEGER :: icdst, irdst, icsrc, irsrc, idest, isour
INTEGER :: icol, irow
!
IF( dir == 'W' ) THEN
!
icol = rowid
irow = colid
!
irdst = irow
icdst = MOD( icol - irow-1 + np, np )
!
irow = rowid
icol = MOD( colid + rowid+1 + np, np )
!
irsrc = icol
icsrc = irow
!
ELSE IF( dir == 'N' ) THEN
!
icol = rowid
irow = colid
!
icdst = icol
irdst = MOD( irow - icol-1 + np, np )
!
irow = MOD( rowid + colid+1 + np, np )
icol = colid
!
irsrc = icol
icsrc = irow
ELSE
CALL lax_error__( ' sqr_smm_cannon ', ' unknown shift_exch direction ', 1 )
END IF
!
CALL GRID2D_RANK( 'R', np, np, irdst, icdst, idest )
CALL GRID2D_RANK( 'R', np, np, irsrc, icsrc, isour )
!
#if defined (__MPI)
!
CALL MPI_SENDRECV_REPLACE(blk, nb*nb, MPI_REAL, &
idest, tag, isour, tag, comm, istatus, ierr)
IF( ierr /= 0 ) &
CALL lax_error__( " sqr_smm_cannon ", " in MPI_SENDRECV_REPLACE 2 ", ABS( ierr ) )
!
#endif
RETURN
END SUBROUTINE shift_exch_block
END SUBROUTINE sqr_smm_cannon_x
!=----------------------------------------------------------------------------=!
SUBROUTINE sqr_zmm_cannon_x( transa, transb, n, alpha, a, lda, b, ldb, beta, c, ldc, idesc )
!
!! Double precision complex (Z) parallel square matrix multiplication with Cannon's algorithm
!! performs one of the matrix-matrix operations
!!
!! C := ALPHA*OP( A )*OP( B ) + BETA*C,
!!
!! where op( x ) is one of
!!
!! OP( X ) = X OR OP( X ) = X',
!!
!! alpha and beta are scalars, and a, b and c are square matrices
!
USE laxlib_descriptor
!
USE laxlib_parallel_include
IMPLICIT NONE
include 'laxlib_kinds.fh'
include 'laxlib_param.fh'
!
CHARACTER(LEN=1), INTENT(IN) :: transa
!! specifies the form of op( A ) to be used in the matrix multiplication as follows:
!! 'N' or 'n', op( A ) = A.
!! 'T' or 't', op( A ) = A**T.
!! 'C' or 'c', op( A ) = A**T.
CHARACTER(LEN=1), INTENT(IN) :: transb
!! specifies the form of op( B ) to be used in the matrix multiplication as follows:
!! 'N' or 'n', op( B ) = B.
!! 'T' or 't', op( B ) = B**T.
!! 'C' or 'c', op( B ) = B**T.
INTEGER, INTENT(IN) :: n
!! global dimension
COMPLEX(DP), INTENT(IN) :: alpha
!! scalar alpha
COMPLEX(DP), INTENT(IN) :: beta
!! scalar beta
INTEGER, INTENT(IN) :: lda
!! leading dimension of A
INTEGER, INTENT(IN) :: ldb
!! leading dimension of B
INTEGER, INTENT(IN) :: ldc
!! leading dimension of C
COMPLEX(DP) :: a(lda,*)
!! matrix A
COMPLEX(DP) :: b(ldb,*)
!! matrix B
COMPLEX(DP) :: c(ldc,*)
!! matrix C
INTEGER, INTENT(IN) :: idesc(LAX_DESC_SIZE)
!! integer laxlib descriptor
!
TYPE(la_descriptor) :: desc
!
INTEGER :: ierr
INTEGER :: np
INTEGER :: i, j, nr, nc, nb, iter, rowid, colid
LOGICAL :: ta, tb
INTEGER :: comm
!
!
COMPLEX(DP), ALLOCATABLE :: bblk(:,:), ablk(:,:)
COMPLEX(DP) :: zone = ( 1.0_DP, 0.0_DP )
COMPLEX(DP) :: zzero = ( 0.0_DP, 0.0_DP )
!
#if defined (__MPI)
!
integer :: istatus( MPI_STATUS_SIZE )
!
#endif
!
CALL laxlib_intarray_to_desc(desc,idesc)
!
IF( desc%active_node < 0 ) THEN
!
! processors not interested in this computation return quickly
!
RETURN
!
END IF
IF( n < 1 ) THEN
RETURN
END IF
IF( desc%npr == 1 ) THEN
!
! quick return if only one processor is used
!
CALL zgemm( TRANSA, TRANSB, n, n, n, alpha, a, lda, b, ldb, beta, c, ldc)
!
RETURN
!
END IF
IF( desc%npr /= desc%npc ) &
CALL lax_error__( ' sqr_zmm_cannon ', ' works only with square processor mesh ', 1 )
!
! Retrieve communicator and mesh geometry
!
np = desc%npr
comm = desc%comm
rowid = desc%myr
colid = desc%myc
!
! Retrieve the size of the local block
!
nr = desc%nr
nc = desc%nc
nb = desc%nrcx
!
#if defined (__MPI)
CALL MPI_BARRIER( comm, ierr )
IF( ierr /= 0 ) &
CALL lax_error__( " sqr_zmm_cannon ", " in MPI_BARRIER ", ABS( ierr ) )
#endif
!
allocate( ablk( nb, nb ) )
DO j = 1, nc
DO i = 1, nr
ablk( i, j ) = a( i, j )
END DO
END DO
!
! Clear memory outside the matrix block
!
DO j = nc+1, nb
DO i = 1, nb
ablk( i, j ) = zzero
END DO
END DO
DO j = 1, nb
DO i = nr+1, nb
ablk( i, j ) = zzero
END DO
END DO
!
!
allocate( bblk( nb, nb ) )
DO j = 1, nc
DO i = 1, nr
bblk( i, j ) = b( i, j )
END DO
END DO
!
! Clear memory outside the matrix block
!
DO j = nc+1, nb
DO i = 1, nb
bblk( i, j ) = zzero
END DO
END DO
DO j = 1, nb
DO i = nr+1, nb
bblk( i, j ) = zzero
END DO
END DO
!
!
ta = ( TRANSA == 'C' .OR. TRANSA == 'c' )
tb = ( TRANSB == 'C' .OR. TRANSB == 'c' )
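!
! NOTE: for the complex routine only 'C' (conjugate transpose) selects
! the transposed data path; in that case the first communication is done
! by shift_exch_block, which folds the block-layout transposition and
! the initial Cannon alignment into a single send-receive.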
!
! Shift A rowid+1 places to the west
!
IF( ta ) THEN
CALL shift_exch_block( ablk, 'W', 1 )
ELSE
CALL shift_block( ablk, 'W', rowid+1, 1 )
END IF
!
! Shift B colid+1 places to the north
!
IF( tb ) THEN
CALL shift_exch_block( bblk, 'N', np+1 )
ELSE
CALL shift_block( bblk, 'N', colid+1, np+1 )
END IF
!
! Accumulate on C
!
CALL zgemm( TRANSA, TRANSB, nr, nc, nb, alpha, ablk, nb, bblk, nb, beta, c, ldc)
!
DO iter = 2, np
!
! Shift A one place to the east
!
CALL shift_block( ablk, 'E', 1, iter )
!
! Shift B one place to the south
!
CALL shift_block( bblk, 'S', 1, np+iter )
!
! Accumulate on C
!
CALL zgemm( TRANSA, TRANSB, nr, nc, nb, alpha, ablk, nb, bblk, nb, zone, c, ldc)
!
END DO
deallocate( ablk, bblk )
RETURN
CONTAINS
SUBROUTINE shift_block( blk, dir, ln, tag )
!
! Block shift
!
IMPLICIT NONE
COMPLEX(DP) :: blk( :, : )
CHARACTER(LEN=1), INTENT(IN) :: dir ! shift direction
INTEGER, INTENT(IN) :: ln ! shift length
INTEGER, INTENT(IN) :: tag ! communication tag
!
INTEGER :: icdst, irdst, icsrc, irsrc, idest, isour
!
IF( dir == 'W' ) THEN
!
irdst = rowid
irsrc = rowid
icdst = MOD( colid - ln + np, np )
icsrc = MOD( colid + ln + np, np )
!
ELSE IF( dir == 'E' ) THEN
!
irdst = rowid
irsrc = rowid
icdst = MOD( colid + ln + np, np )
icsrc = MOD( colid - ln + np, np )
!
ELSE IF( dir == 'N' ) THEN
irdst = MOD( rowid - ln + np, np )
irsrc = MOD( rowid + ln + np, np )
icdst = colid
icsrc = colid
ELSE IF( dir == 'S' ) THEN
irdst = MOD( rowid + ln + np, np )
irsrc = MOD( rowid - ln + np, np )
icdst = colid
icsrc = colid
ELSE
CALL lax_error__( ' sqr_zmm_cannon ', ' unknown shift direction ', 1 )
END IF
!
CALL GRID2D_RANK( 'R', np, np, irdst, icdst, idest )
CALL GRID2D_RANK( 'R', np, np, irsrc, icsrc, isour )
!
#if defined (__MPI)
!
CALL MPI_SENDRECV_REPLACE(blk, nb*nb, MPI_DOUBLE_COMPLEX, &
idest, tag, isour, tag, comm, istatus, ierr)
IF( ierr /= 0 ) &
CALL lax_error__( " sqr_zmm_cannon ", " in MPI_SENDRECV_REPLACE 1 ", ABS( ierr ) )
!
#endif
RETURN
END SUBROUTINE shift_block
!
SUBROUTINE shift_exch_block( blk, dir, tag )
!
! Combined block shift and exchange
! only used for the first step
!
IMPLICIT NONE
COMPLEX(DP) :: blk( :, : )
CHARACTER(LEN=1), INTENT(IN) :: dir
INTEGER, INTENT(IN) :: tag
!
INTEGER :: icdst, irdst, icsrc, irsrc, idest, isour
INTEGER :: icol, irow
!
IF( dir == 'W' ) THEN
!
icol = rowid
irow = colid
!
irdst = irow
icdst = MOD( icol - irow-1 + np, np )
!
irow = rowid
icol = MOD( colid + rowid+1 + np, np )
!
irsrc = icol
icsrc = irow
!
ELSE IF( dir == 'N' ) THEN
!
icol = rowid
irow = colid
!
icdst = icol
irdst = MOD( irow - icol-1 + np, np )
!
irow = MOD( rowid + colid+1 + np, np )
icol = colid
!
irsrc = icol
icsrc = irow
ELSE
CALL lax_error__( ' sqr_zmm_cannon ', ' unknown shift_exch direction ', 1 )
END IF
!
CALL GRID2D_RANK( 'R', np, np, irdst, icdst, idest )
CALL GRID2D_RANK( 'R', np, np, irsrc, icsrc, isour )
!
#if defined (__MPI)
!
CALL MPI_SENDRECV_REPLACE(blk, nb*nb, MPI_DOUBLE_COMPLEX, &
idest, tag, isour, tag, comm, istatus, ierr)
IF( ierr /= 0 ) &
CALL lax_error__( " sqr_zmm_cannon ", " in MPI_SENDRECV_REPLACE 2 ", ABS( ierr ) )
!
#endif
RETURN
END SUBROUTINE shift_exch_block
END SUBROUTINE sqr_zmm_cannon_x
!
!
!
!
SUBROUTINE sqr_tr_cannon_x( n, a, lda, b, ldb, idesc )
!
!! Parallel square matrix transposition with Cannon's algorithm
!!
!
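! The parallel transpose needs a single communication: the local block
! of process (myr,myc) is exchanged with the block of the mirror process
! (myc,myr) across the grid diagonal, and the element-wise transpose is
! then performed locally while copying into b.
!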
USE laxlib_parallel_include
IMPLICIT NONE
!
include 'laxlib_param.fh'
include 'laxlib_kinds.fh'
!
INTEGER, INTENT(IN) :: n
!! global dimension
INTEGER, INTENT(IN) :: lda
!! leading dimension of A
INTEGER, INTENT(IN) :: ldb
!! leading dimension of B
REAL(DP) :: a(lda,*)
!! matrix A
REAL(DP) :: b(ldb,*)
!! matrix B
INTEGER, INTENT(IN) :: idesc(LAX_DESC_SIZE)
!! integer laxlib descriptor
!
INTEGER :: ierr
INTEGER :: np, rowid, colid
INTEGER :: i, j, nr, nc, nb
INTEGER :: comm
!
REAL(DP), ALLOCATABLE :: ablk(:,:)
!
#if defined (__MPI)
!
INTEGER :: istatus( MPI_STATUS_SIZE )
!
#endif
!
IF( idesc(LAX_DESC_ACTIVE_NODE) < 0 ) THEN
RETURN
END IF
IF( n < 1 ) THEN
RETURN
END IF
IF( idesc(LAX_DESC_NPR) == 1 ) THEN
CALL mytranspose( a, lda, b, ldb, n, n )
RETURN
END IF
IF( idesc(LAX_DESC_NPR) /= idesc(LAX_DESC_NPC) ) &
CALL lax_error__( ' sqr_tr_cannon ', ' works only with square processor mesh ', 1 )
IF( n /= idesc(LAX_DESC_N) ) &
CALL lax_error__( ' sqr_tr_cannon ', ' inconsistent size n ', 1 )
IF( lda /= idesc(LAX_DESC_NRCX) ) &
CALL lax_error__( ' sqr_tr_cannon ', ' inconsistent size lda ', 1 )
IF( ldb /= idesc(LAX_DESC_NRCX) ) &
CALL lax_error__( ' sqr_tr_cannon ', ' inconsistent size ldb ', 1 )
comm = idesc(LAX_DESC_COMM)
rowid = idesc(LAX_DESC_MYR)
colid = idesc(LAX_DESC_MYC)
np = idesc(LAX_DESC_NPR)
!
! Compute the size of the local block
!
nr = idesc(LAX_DESC_NR)
nc = idesc(LAX_DESC_NC)
nb = idesc(LAX_DESC_NRCX)
!
allocate( ablk( nb, nb ) )
DO j = 1, nc
DO i = 1, nr
ablk( i, j ) = a( i, j )
END DO
END DO
DO j = nc+1, nb
DO i = 1, nb
ablk( i, j ) = 0.0_DP
END DO
END DO
DO j = 1, nb
DO i = nr+1, nb
ablk( i, j ) = 0.0_DP
END DO
END DO
!
CALL exchange_block( ablk )
!
#if defined (__MPI)
CALL MPI_BARRIER( comm, ierr )
IF( ierr /= 0 ) &
CALL lax_error__( " sqr_tr_cannon ", " in MPI_BARRIER ", ABS( ierr ) )
#endif
!
DO j = 1, nr
DO i = 1, nc
b( j, i ) = ablk( i, j )
END DO
END DO
!
deallocate( ablk )
RETURN
CONTAINS
SUBROUTINE exchange_block( blk )
!
! Block exchange ( transpose )
!
IMPLICIT NONE
REAL(DP) :: blk( :, : )
!
INTEGER :: icdst, irdst, icsrc, irsrc, idest, isour
!
irdst = colid
icdst = rowid
irsrc = colid
icsrc = rowid
!
CALL GRID2D_RANK( 'R', np, np, irdst, icdst, idest )
CALL GRID2D_RANK( 'R', np, np, irsrc, icsrc, isour )
!
#if defined (__MPI)
!
CALL MPI_SENDRECV_REPLACE(blk, nb*nb, MPI_DOUBLE_PRECISION, &
idest, np+np+1, isour, np+np+1, comm, istatus, ierr)
IF( ierr /= 0 ) &
CALL lax_error__( " sqr_tr_cannon ", " in MPI_SENDRECV_REPLACE ", ABS( ierr ) )
!
#endif
RETURN
END SUBROUTINE exchange_block
END SUBROUTINE sqr_tr_cannon_x
!
#if defined (__CUDA)
SUBROUTINE sqr_tr_cannon_gpu_x( n, a, lda, b, ldb, idesc )
!
!! Parallel square matrix transposition with Cannon's algorithm
!! GPU version
!!
!
USE laxlib_parallel_include
USE cudafor
USE cublas
!
IMPLICIT NONE
!
include 'laxlib_param.fh'
include 'laxlib_kinds.fh'
!
INTEGER, INTENT(IN) :: n
!! global dimension
INTEGER, INTENT(IN) :: lda
!! leading dimension of A
INTEGER, INTENT(IN) :: ldb
!! leading dimension of B
REAL(DP), INTENT(IN), DEVICE :: a(:,:)
!! matrix A
REAL(DP), INTENT(OUT), DEVICE :: b(:,:)
!! matrix B
INTEGER, INTENT(IN) :: idesc(LAX_DESC_SIZE)
!! integer laxlib descriptor
!
INTEGER :: ierr
INTEGER :: np, rowid, colid
INTEGER :: i, j, nr, nc, nb, ldx
INTEGER :: comm
!
REAL(DP) :: tmp
#if defined (__GPU_MPI)
REAL(DP), ALLOCATABLE, DEVICE :: ablk(:,:)
#else
REAL(DP), ALLOCATABLE :: ablk(:,:)
#endif
!
#if defined (__MPI)
!
INTEGER :: istatus( MPI_STATUS_SIZE )
!
#endif
!
IF( n < 2 ) THEN
GOTO 1000
END IF
!
IF( idesc(LAX_DESC_ACTIVE_NODE) < 0 ) THEN
GOTO 1000
END IF
!
IF( idesc(LAX_DESC_NPR) == 1 .AND. idesc(LAX_DESC_NPC) == 1 ) THEN
!$cuf kernel do(2) <<<*,*>>>
DO j = 1, n
DO i = 1, n
b( j, i ) = a( i, j )
END DO
END DO
GOTO 1000
END IF
IF( idesc(LAX_DESC_NPR) /= idesc(LAX_DESC_NPC) ) &
CALL lax_error__( ' sqr_tr_cannon_gpu ', ' works only with square processor mesh ', 1 )
IF( n /= idesc(LAX_DESC_N) ) &
CALL lax_error__( ' sqr_tr_cannon_gpu ', ' inconsistent size n ', 1 )
IF( lda /= idesc(LAX_DESC_NRCX) .OR. lda /= SIZE(a,1) ) &
CALL lax_error__( ' sqr_tr_cannon_gpu ', ' inconsistent size lda ', 1 )
IF( ldb /= idesc(LAX_DESC_NRCX) .OR. ldb /= SIZE(b,1) ) &
CALL lax_error__( ' sqr_tr_cannon_gpu ', ' inconsistent size ldb ', 1 )
comm = idesc(LAX_DESC_COMM)
rowid = idesc(LAX_DESC_MYR)
colid = idesc(LAX_DESC_MYC)
np = idesc(LAX_DESC_NPR)
!
! Compute the size of the local block
!
nr = idesc(LAX_DESC_NR)
nc = idesc(LAX_DESC_NC)
nb = idesc(LAX_DESC_NRCX)
ldx = MAX(lda,ldb)
!
ALLOCATE( ablk(nb,nb) )
ablk(1:nr,1:nc) = a(1:nr,1:nc)
DO j = nc+1, nb
DO i = 1, nb
ablk( i, j ) = 0.0_DP
END DO
END DO
DO j = 1, nb
DO i = nr+1, nb
ablk( i, j ) = 0.0_DP
END DO
END DO
!
CALL exchange_block( ablk )
!
#if defined (__MPI)
CALL MPI_BARRIER( comm, ierr )
IF( ierr /= 0 ) &
CALL lax_error__( " sqr_tr_cannon_gpu ", " in MPI_BARRIER ", ABS( ierr ) )
#endif
!
#if defined (__GPU_MPI)
!$cuf kernel do(2) <<<*,*>>>
DO j = 1, nr
DO i = 1, nc
b( j, i ) = ablk( i, j )
END DO
END DO
ierr = cudaDeviceSynchronize()
#else
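! Host path: transpose ablk in place over the full square block, so that
! the plain contiguous copy below already stores the transposed corner
! (b(p,q) = old ablk(q,p)) without a strided device-side loop.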
DO j = 1, ldx
DO i = j + 1, ldx
tmp = ablk(i,j)
ablk(i,j) = ablk(j,i)
ablk(j,i) = tmp
END DO
END DO
b(1:nr,1:nc) = ablk(1:nr,1:nc)
#endif
!
!
deallocate( ablk )
1000 CONTINUE
ierr = cudaDeviceSynchronize()
RETURN
CONTAINS
SUBROUTINE exchange_block( blk )
!
! Block exchange ( transpose )
!
IMPLICIT NONE
#if defined (__GPU_MPI)
REAL(DP), DEVICE :: blk( :, : )
#else
REAL(DP) :: blk( :, : )
#endif
!
INTEGER :: icdst, irdst, icsrc, irsrc, idest, isour
!
irdst = colid
icdst = rowid
irsrc = colid
icsrc = rowid
!
CALL GRID2D_RANK( 'R', np, np, irdst, icdst, idest )
CALL GRID2D_RANK( 'R', np, np, irsrc, icsrc, isour )
!
#if defined (__MPI)
!
#if defined (__GPU_MPI)
ierr = cudaDeviceSynchronize()
#endif
CALL MPI_SENDRECV_REPLACE(blk, SIZE(blk), MPI_DOUBLE_PRECISION, &
idest, np+np+1, isour, np+np+1, comm, istatus, ierr)
IF( ierr /= 0 ) &
CALL lax_error__( " sqr_tr_cannon_gpu ", " in MPI_SENDRECV_REPLACE ", ABS( ierr ) )
!
#endif
RETURN
END SUBROUTINE exchange_block
END SUBROUTINE sqr_tr_cannon_gpu_x
#endif
!
SUBROUTINE sqr_tr_cannon_sp_x( n, a, lda, b, ldb, idesc )
!
!! Parallel square matrix transposition with Cannon's algorithm
!! single precision version
!
USE laxlib_parallel_include
IMPLICIT NONE
!
include 'laxlib_param.fh'
include 'laxlib_kinds.fh'
!
INTEGER, INTENT(IN) :: n
!! global dimension
INTEGER, INTENT(IN) :: lda
!! leading dimension of A
INTEGER, INTENT(IN) :: ldb
!! leading dimension of B
REAL(SP) :: a(lda,*)
!! matrix A
REAL(SP) :: b(ldb,*)
!! matrix B
INTEGER, INTENT(IN) :: idesc(LAX_DESC_SIZE)
!! integer laxlib descriptor
!
INTEGER :: ierr
INTEGER :: np, rowid, colid
INTEGER :: i, j, nr, nc, nb
INTEGER :: comm
!
REAL(SP), ALLOCATABLE :: ablk(:,:)
!
#if defined (__MPI)
!
INTEGER :: istatus( MPI_STATUS_SIZE )
!
#endif
!
IF( idesc(LAX_DESC_ACTIVE_NODE) < 0 ) THEN
RETURN
END IF
IF( n < 1 ) THEN
RETURN
END IF
IF( idesc(LAX_DESC_NPR) == 1 ) THEN
CALL mytranspose_sp( a, lda, b, ldb, n, n )
RETURN
END IF
IF( idesc(LAX_DESC_NPR) /= idesc(LAX_DESC_NPC) ) &
CALL lax_error__( ' sqr_tr_cannon_sp ', ' works only with square processor mesh ', 1 )
IF( n /= idesc(LAX_DESC_N) ) &
CALL lax_error__( ' sqr_tr_cannon_sp ', ' inconsistent size n ', 1 )
IF( lda /= idesc(LAX_DESC_NRCX) ) &
CALL lax_error__( ' sqr_tr_cannon_sp ', ' inconsistent size lda ', 1 )
IF( ldb /= idesc(LAX_DESC_NRCX) ) &
CALL lax_error__( ' sqr_tr_cannon_sp ', ' inconsistent size ldb ', 1 )
comm = idesc(LAX_DESC_COMM)
rowid = idesc(LAX_DESC_MYR)
colid = idesc(LAX_DESC_MYC)
np = idesc(LAX_DESC_NPR)
!
! Compute the size of the local block
!
nr = idesc(LAX_DESC_NR)
nc = idesc(LAX_DESC_NC)
nb = idesc(LAX_DESC_NRCX)
!
allocate( ablk( nb, nb ) )
DO j = 1, nc
DO i = 1, nr
ablk( i, j ) = a( i, j )
END DO
END DO
DO j = nc+1, nb
DO i = 1, nb
ablk( i, j ) = 0.0_SP
END DO
END DO
DO j = 1, nb
DO i = nr+1, nb
ablk( i, j ) = 0.0_SP
END DO
END DO
!
CALL exchange_block( ablk )
!
#if defined (__MPI)
CALL MPI_BARRIER( comm, ierr )
IF( ierr /= 0 ) &
CALL lax_error__( " sqr_tr_cannon_sp ", " in MPI_BARRIER ", ABS( ierr ) )
#endif
!
DO j = 1, nr
DO i = 1, nc
b( j, i ) = ablk( i, j )
END DO
END DO
!
deallocate( ablk )
RETURN
CONTAINS
SUBROUTINE exchange_block( blk )
!
! Block exchange ( transpose )
!
IMPLICIT NONE
REAL(SP) :: blk( :, : )
!
INTEGER :: icdst, irdst, icsrc, irsrc, idest, isour
!
irdst = colid
icdst = rowid
irsrc = colid
icsrc = rowid
!
CALL GRID2D_RANK( 'R', np, np, irdst, icdst, idest )
CALL GRID2D_RANK( 'R', np, np, irsrc, icsrc, isour )
!
#if defined (__MPI)
!
CALL MPI_SENDRECV_REPLACE(blk, nb*nb, MPI_REAL, &
idest, np+np+1, isour, np+np+1, comm, istatus, ierr)
IF( ierr /= 0 ) &
CALL lax_error__( " sqr_tr_cannon_sp ", " in MPI_SENDRECV_REPLACE ", ABS( ierr ) )
!
#endif
RETURN
END SUBROUTINE exchange_block
END SUBROUTINE sqr_tr_cannon_sp_x
SUBROUTINE redist_row2col_x( n, a, b, ldx, nx, idesc )
!
!! redistribute a, an array whose second dimension is distributed over the processor rows,
!! to obtain b, whose second dimension is distributed over the processor columns
!
!
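! On a square grid this redistribution is a single pairwise exchange
! across the grid diagonal: process (myr,myc) sends its whole local
! array to process (myc,myr) and receives the matching array back from
! the same process; no local reshuffling is needed.
!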
USE laxlib_parallel_include
IMPLICIT NONE
!
include 'laxlib_kinds.fh'
include 'laxlib_param.fh'
!
INTEGER, INTENT(IN) :: n
!! global dimension
INTEGER, INTENT(IN) :: ldx
!! local rows
INTEGER, INTENT(IN) :: nx
!! local columns
REAL(DP) :: a(ldx,nx)
!! matrix A
REAL(DP) :: b(ldx,nx)
!! matrix B
INTEGER, INTENT(IN) :: idesc(LAX_DESC_SIZE)
!! integer laxlib descriptor
!
INTEGER :: ierr
INTEGER :: np, rowid, colid
INTEGER :: comm
INTEGER :: icdst, irdst, icsrc, irsrc, idest, isour
!
#if defined (__MPI)
!
INTEGER :: istatus( MPI_STATUS_SIZE )
!
#endif
!
IF( idesc(LAX_DESC_ACTIVE_NODE) < 0 ) THEN
RETURN
END IF
IF( n < 1 ) THEN
RETURN
END IF
IF( idesc(LAX_DESC_NPR) == 1 ) THEN
b = a
RETURN
END IF
IF( idesc(LAX_DESC_NPR) /= idesc(LAX_DESC_NPC) ) &
CALL lax_error__( ' redist_row2col ', ' works only with square processor mesh ', 1 )
IF( n /= idesc(LAX_DESC_N) ) &
CALL lax_error__( ' redist_row2col ', ' inconsistent size n ', 1 )
IF( nx /= idesc(LAX_DESC_NRCX) ) &
CALL lax_error__( ' redist_row2col ', ' inconsistent size nx ', 1 )
comm = idesc(LAX_DESC_COMM)
rowid = idesc(LAX_DESC_MYR)
colid = idesc(LAX_DESC_MYC)
np = idesc(LAX_DESC_NPR)
!
irdst = colid
icdst = rowid
irsrc = colid
icsrc = rowid
!
CALL GRID2D_RANK( 'R', np, np, irdst, icdst, idest )
CALL GRID2D_RANK( 'R', np, np, irsrc, icsrc, isour )
!
#if defined (__MPI)
!
CALL MPI_BARRIER( comm, ierr )
IF( ierr /= 0 ) &
CALL lax_error__( " redist_row2col ", " in MPI_BARRIER ", ABS( ierr ) )
!
CALL MPI_SENDRECV(a, ldx*nx, MPI_DOUBLE_PRECISION, idest, np+np+1, &
b, ldx*nx, MPI_DOUBLE_PRECISION, isour, np+np+1, comm, istatus, ierr)
IF( ierr /= 0 ) &
CALL lax_error__( " redist_row2col ", " in MPI_SENDRECV ", ABS( ierr ) )
!
#else
b = a
#endif
!
RETURN
END SUBROUTINE redist_row2col_x
#if defined (__CUDA)
SUBROUTINE redist_row2col_gpu_x( n, a, b, ldx, nx, idesc )
!
!! redistribute a, an array whose second dimension is distributed over the processor rows,
!! to obtain b, whose second dimension is distributed over the processor columns
!! GPU version
!
USE cudafor
USE laxlib_parallel_include
IMPLICIT NONE
!
include 'laxlib_kinds.fh'
include 'laxlib_param.fh'
!
INTEGER, INTENT(IN) :: n
!! global dimension
INTEGER, INTENT(IN) :: ldx
!! local rows
INTEGER, INTENT(IN) :: nx
!! local columns
REAL(DP), DEVICE :: a(:,:)
!! matrix A
REAL(DP), DEVICE :: b(:,:)
!! matrix B
INTEGER, INTENT(IN) :: idesc(LAX_DESC_SIZE)
!! integer laxlib descriptor
!
REAL(DP), ALLOCATABLE :: a_h(:,:), b_h(:,:)
INTEGER :: ierr
INTEGER :: np, rowid, colid
INTEGER :: comm
INTEGER :: icdst, irdst, icsrc, irsrc, idest, isour
!
#if defined (__MPI)
!
INTEGER :: istatus( MPI_STATUS_SIZE )
!
#endif
!
IF( idesc(LAX_DESC_ACTIVE_NODE) < 0 ) THEN
RETURN
END IF
IF( n < 1 ) THEN
RETURN
END IF
IF( idesc(LAX_DESC_NPR) == 1 ) THEN
b = a
RETURN
END IF
IF( idesc(LAX_DESC_NPR) /= idesc(LAX_DESC_NPC) ) &
CALL lax_error__( ' redist_row2col_gpu ', ' works only with square processor mesh ', 1 )
IF( n /= idesc(LAX_DESC_N) ) &
CALL lax_error__( ' redist_row2col_gpu ', ' inconsistent size n ', 1 )
IF( nx /= idesc(LAX_DESC_NRCX) ) &
CALL lax_error__( ' redist_row2col_gpu ', ' inconsistent size nx ', 1 )
comm = idesc(LAX_DESC_COMM)
rowid = idesc(LAX_DESC_MYR)
colid = idesc(LAX_DESC_MYC)
np = idesc(LAX_DESC_NPR)
!
irdst = colid
icdst = rowid
irsrc = colid
icsrc = rowid
!
CALL GRID2D_RANK( 'R', np, np, irdst, icdst, idest )
CALL GRID2D_RANK( 'R', np, np, irsrc, icsrc, isour )
!
#if defined (__MPI)
!
CALL MPI_BARRIER( comm, ierr )
IF( ierr /= 0 ) &
CALL lax_error__( " redist_row2col_gpu ", " in MPI_BARRIER ", ABS( ierr ) )
!
#if defined(__GPU_MPI)
!
ierr = cudaDeviceSynchronize()
CALL MPI_SENDRECV(a, ldx*nx, MPI_DOUBLE_PRECISION, idest, np+np+1, &
b, ldx*nx, MPI_DOUBLE_PRECISION, isour, np+np+1, comm, istatus, ierr)
IF( ierr /= 0 ) &
CALL lax_error__( " redist_row2col_gpu ", " in MPI_SENDRECV ", ABS( ierr ) )
#else
ALLOCATE( a_h, SOURCE=a, STAT=ierr )
IF( ierr /= 0 ) &
CALL lax_error__( " redist_row2col_gpu ", " allocating a_h ", ABS( ierr ) )
ALLOCATE( b_h, MOLD=b, STAT=ierr )
IF( ierr /= 0 ) &
CALL lax_error__( " redist_row2col_gpu ", " allocating b_h ", ABS( ierr ) )
!
CALL MPI_SENDRECV(a_h, ldx*nx, MPI_DOUBLE_PRECISION, idest, np+np+1, &
b_h, ldx*nx, MPI_DOUBLE_PRECISION, isour, np+np+1, comm, istatus, ierr)
IF( ierr /= 0 ) &
CALL lax_error__( " redist_row2col_gpu ", " in MPI_SENDRECV ", ABS( ierr ) )
!
b = b_h
!
DEALLOCATE( a_h, b_h )
#endif
!
#else
b = a
#endif
!
RETURN
END SUBROUTINE redist_row2col_gpu_x
#endif
!
!
!
SUBROUTINE cyc2blk_redist_x( n, a, lda, nca, b, ldb, ncb, idesc )
!
!! Parallel square matrix redistribution. Double precision
!! A (input) is cyclically distributed by rows across processors
!! B (output) is distributed by block across the 2D processor grid
!!
!
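! In the row-cyclic layout global row ii belongs to process
! MOD( ii-1, nproc ) and is stored there as local row ( ii-1 )/nproc + 1;
! e.g. with nproc = 4, process 0 holds global rows 1, 5, 9, ... as local
! rows 1, 2, 3, ... The pack/unpack loops below use exactly these two
! index expressions.
!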
USE laxlib_descriptor
USE laxlib_parallel_include
!
IMPLICIT NONE
!
include 'laxlib_kinds.fh'
include 'laxlib_param.fh'
!
INTEGER, INTENT(IN) :: n
!! global dimension
INTEGER, INTENT(IN) :: lda
!! local rows of A
INTEGER, INTENT(IN) :: nca
!! local columns of A
INTEGER, INTENT(IN) :: ldb
!! local rows of B
INTEGER, INTENT(IN) :: ncb
!! local columns of B
REAL(DP) :: a(lda,nca)
!! matrix A
REAL(DP) :: b(ldb,ncb)
!! matrix B
INTEGER, INTENT(IN) :: idesc(LAX_DESC_SIZE)
!! integer laxlib descriptor
!
TYPE(la_descriptor) :: desc
integer :: ierr, itag
integer :: np, ip, me, nproc, comm_a
integer :: ip_ir, ip_ic, ip_nr, ip_nc, il, nbuf, ip_irl
integer :: i, ii, j, jj, nr, nc, nb, nrl, irl, ir, ic
INTEGER :: me_ortho(2), np_ortho(2)
!
real(DP), allocatable :: rcvbuf(:,:,:)
real(DP), allocatable :: sndbuf(:,:)
TYPE(la_descriptor) :: ip_desc
!
character(len=256) :: msg
!
#if defined (__MPI)
CALL laxlib_intarray_to_desc(desc,idesc)
IF( desc%active_node < 0 ) THEN
RETURN
END IF
np = desc%npr ! dimension of the processor mesh
nb = desc%nrcx ! leading dimension of the local matrix block
me = desc%mype ! my processor id (starting from 0)
comm_a = desc%comm
nproc = desc%npr * desc%npc
IF( np /= desc%npc ) &
CALL lax_error__( ' cyc2blk_redist ', ' works only with square processor mesh ', 1 )
IF( n < 1 ) &
CALL lax_error__( ' cyc2blk_redist ', ' incorrect first argument (n <= 0)', 1 )
IF( desc%n < nproc ) &
CALL lax_error__( ' cyc2blk_redist ', ' number of bands < number of procs ', 1 )
nbuf = (nb/nproc+2) * nb
!
ALLOCATE( sndbuf( nb/nproc+2, nb ) )
ALLOCATE( rcvbuf( nb/nproc+2, nb, nproc ) )
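!
! Buffer sizing: of the nb consecutive global rows spanned by one block,
! at most nb/nproc + 1 can belong to any single process in the cyclic
! layout, so nb/nproc + 2 rows per column are a safe bound (verified at
! run time by check_sndbuf_index / check_rcvbuf_index below).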
!
! loop over all processors
!
DO ip = 0, nproc - 1
!
! 2D proc ortho grid sizes
!
np_ortho(1) = desc%npr
np_ortho(2) = desc%npc
!
! compute other processor coordinates me_ortho
!
CALL GRID2D_COORDS( 'R', ip, np_ortho(1), np_ortho(2), me_ortho(1), me_ortho(2) )
!
! initialize other processor descriptor
!
CALL descla_init( ip_desc, desc%n, desc%nx, np_ortho, me_ortho, desc%comm, desc%cntx, 1 )
IF( ip_desc%nrcx /= nb ) &
CALL lax_error__( ' cyc2blk_redist ', ' inconsistent block dim nb ', 1 )
!
IF( ip_desc%active_node > 0 ) THEN
ip_nr = ip_desc%nr
ip_nc = ip_desc%nc
ip_ir = ip_desc%ir
ip_ic = ip_desc%ic
!
DO j = 1, ip_nc
jj = j + ip_ic - 1
il = 1
DO i = 1, ip_nr
ii = i + ip_ir - 1
IF( MOD( ii - 1, nproc ) == me ) THEN
CALL check_sndbuf_index()
sndbuf( il, j ) = a( ( ii - 1 )/nproc + 1, jj )
il = il + 1
END IF
END DO
END DO
END IF
CALL mpi_barrier( comm_a, ierr )
CALL mpi_gather( sndbuf, nbuf, mpi_double_precision, &
rcvbuf, nbuf, mpi_double_precision, ip, comm_a, ierr )
IF( ierr /= 0 ) &
CALL lax_error__( " cyc2blk_redist ", " in mpi_gather ", ABS( ierr ) )
END DO
!
nr = desc%nr
nc = desc%nc
ir = desc%ir
ic = desc%ic
!
DO ip = 0, nproc - 1
DO j = 1, nc
il = 1
DO i = 1, nr
ii = i + ir - 1
IF( MOD( ii - 1, nproc ) == ip ) THEN
CALL check_rcvbuf_index()
b( i, j ) = rcvbuf( il, j, ip+1 )
il = il + 1
END IF
END DO
END DO
END DO
!
DEALLOCATE( rcvbuf )
DEALLOCATE( sndbuf )
#else
b( 1:n, 1:n ) = a( 1:n, 1:n )
#endif
RETURN
CONTAINS
SUBROUTINE check_sndbuf_index()
CHARACTER(LEN=38), SAVE :: msg = ' check_sndbuf_index in cyc2blk_redist '
IF( j > SIZE(sndbuf,2) ) CALL lax_error__( msg, ' j > SIZE(sndbuf,2) ', ip+1 )
IF( il > SIZE(sndbuf,1) ) CALL lax_error__( msg, ' il > SIZE(sndbuf,1) ', ip+1 )
IF( ( ii - 1 )/nproc + 1 < 1 ) CALL lax_error__( msg, ' ( ii - 1 )/nproc + 1 < 1 ', ip+1 )
IF( ( ii - 1 )/nproc + 1 > lda ) CALL lax_error__( msg, ' ( ii - 1 )/nproc + 1 > lda ', ip+1 )
IF( jj < 1 ) CALL lax_error__( msg, ' jj < 1 ', ip+1 )
IF( jj > n ) CALL lax_error__( msg, ' jj > n ', ip+1 )
RETURN
END SUBROUTINE check_sndbuf_index
SUBROUTINE check_rcvbuf_index()
CHARACTER(LEN=38), SAVE :: msg = ' check_rcvbuf_index in cyc2blk_redist '
IF( i > ldb ) CALL lax_error__( msg, ' i > ldb ', ip+1 )
IF( j > ldb ) CALL lax_error__( msg, ' j > ldb ', ip+1 )
IF( j > nb ) CALL lax_error__( msg, ' j > nb ', ip+1 )
IF( il > SIZE( rcvbuf, 1 ) ) CALL lax_error__( msg, ' il too large ', ip+1 )
RETURN
END SUBROUTINE check_rcvbuf_index
END SUBROUTINE cyc2blk_redist_x
SUBROUTINE cyc2blk_zredist_x( n, a, lda, nca, b, ldb, ncb, idesc )
!
!! Parallel square matrix redistribution. Double precision complex (Z)
!! A (input) is cyclically distributed by rows across processors
!! B (output) is distributed by block across the 2D processor grid
!
USE laxlib_descriptor
USE laxlib_parallel_include
!
IMPLICIT NONE
!
include 'laxlib_kinds.fh'
include 'laxlib_param.fh'
!
INTEGER, INTENT(IN) :: n
!! global dimension
INTEGER, INTENT(IN) :: lda
!! local rows of A
INTEGER, INTENT(IN) :: nca
!! local columns of A
INTEGER, INTENT(IN) :: ldb
!! local rows of B
INTEGER, INTENT(IN) :: ncb
!! local columns of B
COMPLEX(DP) :: a(lda,nca)
!! matrix A
COMPLEX(DP) :: b(ldb,ncb)
!! matrix B
INTEGER, INTENT(IN) :: idesc(LAX_DESC_SIZE)
!! integer laxlib descriptor
!
TYPE(la_descriptor) :: desc
!
integer :: ierr, itag
integer :: np, ip, me, nproc, comm_a
integer :: ip_ir, ip_ic, ip_nr, ip_nc, il, nbuf, ip_irl
integer :: i, ii, j, jj, nr, nc, nb, nrl, irl, ir, ic
INTEGER :: me_ortho(2), np_ortho(2)
!
COMPLEX(DP), allocatable :: rcvbuf(:,:,:)
COMPLEX(DP), allocatable :: sndbuf(:,:)
TYPE(la_descriptor) :: ip_desc
!
character(len=256) :: msg
!
#if defined (__MPI)
CALL laxlib_intarray_to_desc(desc,idesc)
IF( desc%active_node < 0 ) THEN
RETURN
END IF
np = desc%npr ! dimension of the processor mesh
nb = desc%nrcx ! leading dimension of the local matrix block
me = desc%mype ! my processor id (starting from 0)
comm_a = desc%comm
nproc = desc%npr * desc%npc
IF( np /= desc%npc ) &
CALL lax_error__( ' cyc2blk_zredist ', ' works only with square processor mesh ', 1 )
IF( n < 1 ) &
CALL lax_error__( ' cyc2blk_zredist ', ' n less than or equal to zero ', 1 )
IF( desc%n < nproc ) &
CALL lax_error__( ' cyc2blk_zredist ', ' n less than the number of procs ', 1 )
!
nbuf = (nb/nproc+2) * nb
!
ALLOCATE( sndbuf( nb/nproc+2, nb ) )
ALLOCATE( rcvbuf( nb/nproc+2, nb, nproc ) )
DO ip = 0, nproc - 1
!
! 2D proc ortho grid sizes
!
np_ortho(1) = desc%npr
np_ortho(2) = desc%npc
!
! compute other processor coordinates me_ortho
!
CALL GRID2D_COORDS( 'R', ip, np_ortho(1), np_ortho(2), me_ortho(1), me_ortho(2) )
!
! initialize other processor descriptor
!
CALL descla_init( ip_desc, desc%n, desc%nx, np_ortho, me_ortho, desc%comm, desc%cntx, 1 )
ip_nr = ip_desc%nr
ip_nc = ip_desc%nc
ip_ir = ip_desc%ir
ip_ic = ip_desc%ic
!
DO j = 1, ip_nc
jj = j + ip_ic - 1
il = 1
DO i = 1, ip_nr
ii = i + ip_ir - 1
IF( MOD( ii - 1, nproc ) == me ) THEN
CALL check_sndbuf_index()
sndbuf( il, j ) = a( ( ii - 1 )/nproc + 1, jj )
il = il + 1
END IF
END DO
END DO
CALL mpi_barrier( comm_a, ierr )
CALL mpi_gather( sndbuf, nbuf, mpi_double_complex, &
rcvbuf, nbuf, mpi_double_complex, ip, comm_a, ierr )
IF( ierr /= 0 ) &
CALL lax_error__( " cyc2blk_zredist ", " in mpi_gather ", ABS( ierr ) )
END DO
!
nr = desc%nr
nc = desc%nc
ir = desc%ir
ic = desc%ic
!
DO ip = 0, nproc - 1
DO j = 1, nc
il = 1
DO i = 1, nr
ii = i + ir - 1
IF( MOD( ii - 1, nproc ) == ip ) THEN
CALL check_rcvbuf_index()
b( i, j ) = rcvbuf( il, j, ip+1 )
il = il + 1
END IF
END DO
END DO
END DO
!
!
DEALLOCATE( rcvbuf )
DEALLOCATE( sndbuf )
#else
b( 1:n, 1:n ) = a( 1:n, 1:n )
#endif
RETURN
CONTAINS
SUBROUTINE check_sndbuf_index()
CHARACTER(LEN=40), SAVE :: msg = ' check_sndbuf_index in cyc2blk_zredist '
IF( j > SIZE(sndbuf,2) ) CALL lax_error__( msg, ' j > SIZE(sndbuf,2) ', ip+1 )
IF( il > SIZE(sndbuf,1) ) CALL lax_error__( msg, ' il > SIZE(sndbuf,1) ', ip+1 )
IF( ( ii - 1 )/nproc + 1 < 1 ) CALL lax_error__( msg, ' ( ii - 1 )/nproc + 1 < 1 ', ip+1 )
IF( ( ii - 1 )/nproc + 1 > SIZE(a,1) ) CALL lax_error__( msg, ' ( ii - 1 )/nproc + 1 > SIZE(a,1) ', ip+1 )
IF( jj < 1 ) CALL lax_error__( msg, ' jj < 1 ', ip+1 )
IF( jj > n ) CALL lax_error__( msg, ' jj > n ', ip+1 )
RETURN
END SUBROUTINE check_sndbuf_index
SUBROUTINE check_rcvbuf_index()
CHARACTER(LEN=40), SAVE :: msg = ' check_rcvbuf_index in cyc2blk_zredist '
IF( i > ldb ) CALL lax_error__( msg, ' i > ldb ', ip+1 )
IF( j > ldb ) CALL lax_error__( msg, ' j > ldb ', ip+1 )
IF( j > nb ) CALL lax_error__( msg, ' j > nb ', ip+1 )
IF( il > SIZE( rcvbuf, 1 ) ) CALL lax_error__( msg, ' il too large ', ip+1 )
RETURN
END SUBROUTINE check_rcvbuf_index
END SUBROUTINE cyc2blk_zredist_x
SUBROUTINE blk2cyc_redist_x( n, a, lda, nca, b, ldb, ncb, idesc )
!
!! Parallel square matrix redistribution. Double precision.
!! A (output) is cyclically distributed by rows across processors
!! B (input) is distributed by block across the 2D processor grid
!
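! This is the inverse of cyc2blk_redist_x: the pack and unpack phases
! are swapped, with each process first packing the rows of its block
! destined to process ip and, after the gather, unpacking into the
! cyclic layout using the same MOD( ii-1, nproc ) / ( ii-1 )/nproc + 1
! index mapping.
!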
USE laxlib_descriptor
USE laxlib_parallel_include
!
IMPLICIT NONE
!
include 'laxlib_kinds.fh'
include 'laxlib_param.fh'
!
INTEGER, INTENT(IN) :: n
!! global dimension
INTEGER, INTENT(IN) :: lda
!! local rows of A
INTEGER, INTENT(IN) :: nca
!! local columns of A
INTEGER, INTENT(IN) :: ldb
!! local rows of B
INTEGER, INTENT(IN) :: ncb
!! local columns of B
REAL(DP) :: a(lda,nca)
!! matrix A
REAL(DP) :: b(ldb,ncb)
!! matrix B
INTEGER, INTENT(IN) :: idesc(LAX_DESC_SIZE)
!! integer laxlib descriptor
!
TYPE(la_descriptor) :: desc
integer :: ierr, itag
integer :: np, ip, me, comm_a, nproc
integer :: ip_ir, ip_ic, ip_nr, ip_nc, il, nbuf, ip_irl
integer :: i, ii, j, jj, nr, nc, nb, nrl, irl, ir, ic
INTEGER :: me_ortho(2), np_ortho(2)
!
REAL(DP), allocatable :: rcvbuf(:,:,:)
REAL(DP), allocatable :: sndbuf(:,:)
TYPE(la_descriptor) :: ip_desc
!
character(len=256) :: msg
!
#if defined (__MPI)
CALL laxlib_intarray_to_desc(desc,idesc)
IF( desc%active_node < 0 ) THEN
RETURN
END IF
np = desc%npr ! dimension of the processor mesh
nb = desc%nrcx ! leading dimension of the local matrix block
me = desc%mype ! my processor id (starting from 0)
comm_a = desc%comm
nproc = desc%npr * desc%npc
IF( np /= desc%npc ) &
CALL lax_error__( ' blk2cyc_redist ', ' works only with square processor mesh ', 1 )
IF( n < 1 ) &
CALL lax_error__( ' blk2cyc_redist ', ' n less than or equal to zero ', 1 )
IF( desc%n < nproc ) &
CALL lax_error__( ' blk2cyc_redist ', ' n less than the number of procs ', 1 )
!
nbuf = (nb/nproc+2) * nb
!
ALLOCATE( sndbuf( nb/nproc+2, nb ) )
ALLOCATE( rcvbuf( nb/nproc+2, nb, nproc ) )
!
nr = desc%nr
nc = desc%nc
ir = desc%ir
ic = desc%ic
!
DO ip = 0, nproc - 1
DO j = 1, nc
il = 1
DO i = 1, nr
ii = i + ir - 1
IF( MOD( ii - 1, nproc ) == ip ) THEN
sndbuf( il, j ) = b( i, j )
il = il + 1
END IF
END DO
END DO
CALL mpi_barrier( comm_a, ierr )
CALL mpi_gather( sndbuf, nbuf, mpi_double_precision, &
rcvbuf, nbuf, mpi_double_precision, ip, comm_a, ierr )
IF( ierr /= 0 ) &
CALL lax_error__( " blk2cyc_redist ", " in mpi_gather ", ABS( ierr ) )
END DO
!
DO ip = 0, nproc - 1
!
! 2D proc ortho grid sizes
!
np_ortho(1) = desc%npr
np_ortho(2) = desc%npc
!
! compute other processor coordinates me_ortho
!
CALL GRID2D_COORDS( 'R', ip, np_ortho(1), np_ortho(2), me_ortho(1), me_ortho(2) )
!
! initialize other processor descriptor
!
CALL descla_init( ip_desc, desc%n, desc%nx, np_ortho, me_ortho, desc%comm, desc%cntx, 1 )
!
ip_nr = ip_desc%nr
ip_nc = ip_desc%nc
ip_ir = ip_desc%ir
ip_ic = ip_desc%ic
!
DO j = 1, ip_nc
jj = j + ip_ic - 1
il = 1
DO i = 1, ip_nr
ii = i + ip_ir - 1
IF( MOD( ii - 1, nproc ) == me ) THEN
a( ( ii - 1 )/nproc + 1, jj ) = rcvbuf( il, j, ip+1 )
il = il + 1
END IF
END DO
END DO
END DO
!
DEALLOCATE( rcvbuf )
DEALLOCATE( sndbuf )
#else
a( 1:n, 1:n ) = b( 1:n, 1:n )
#endif
RETURN
END SUBROUTINE blk2cyc_redist_x
SUBROUTINE blk2cyc_zredist_x( n, a, lda, nca, b, ldb, ncb, idesc )
!
!! Parallel square matrix redistribution. Double precision complex (Z)
!! A (output) is cyclically distributed by rows across processors
!! B (input) is distributed by block across the 2D processor grid
!
USE laxlib_descriptor
USE laxlib_parallel_include
!
IMPLICIT NONE
!
include 'laxlib_kinds.fh'
include 'laxlib_param.fh'
!
INTEGER, INTENT(IN) :: n
!! global dimension
INTEGER, INTENT(IN) :: lda
!! local rows of A
INTEGER, INTENT(IN) :: nca
!! local columns of A
INTEGER, INTENT(IN) :: ldb
!! local rows of B
INTEGER, INTENT(IN) :: ncb
!! local columns of B
COMPLEX(DP) :: a(lda,nca)
!! matrix A
COMPLEX(DP) :: b(ldb,ncb)
!! matrix B
INTEGER, INTENT(IN) :: idesc(LAX_DESC_SIZE)
!! integer laxlib descriptor
!
TYPE(la_descriptor) :: desc
!
integer :: ierr, itag
integer :: np, ip, me, comm_a, nproc
integer :: ip_ir, ip_ic, ip_nr, ip_nc, il, nbuf, ip_irl
integer :: i, ii, j, jj, nr, nc, nb, nrl, irl, ir, ic
INTEGER :: me_ortho(2), np_ortho(2)
!
COMPLEX(DP), allocatable :: rcvbuf(:,:,:)
COMPLEX(DP), allocatable :: sndbuf(:,:)
TYPE(la_descriptor) :: ip_desc
!
character(len=256) :: msg
!
#if defined (__MPI)
CALL laxlib_intarray_to_desc(desc,idesc)
IF( desc%active_node < 0 ) THEN
RETURN
END IF
np = desc%npr ! dimension of the processor mesh
nb = desc%nrcx ! leading dimension of the local matrix block
me = desc%mype ! my processor id (starting from 0)
comm_a = desc%comm
nproc = desc%npr * desc%npc
IF( np /= desc%npc ) &
CALL lax_error__( ' blk2cyc_zredist ', ' works only with square processor mesh ', 1 )
IF( n < 1 ) &
CALL lax_error__( ' blk2cyc_zredist ', ' n less than or equal to zero ', 1 )
IF( desc%n < nproc ) &
CALL lax_error__( ' blk2cyc_zredist ', ' n less than the number of procs ', 1 )
!
nbuf = (nb/nproc+2) * nb
!
ALLOCATE( sndbuf( nb/nproc+2, nb ) )
ALLOCATE( rcvbuf( nb/nproc+2, nb, nproc ) )
!
nr = desc%nr
nc = desc%nc
ir = desc%ir
ic = desc%ic
!
DO ip = 0, nproc - 1
DO j = 1, nc
il = 1
DO i = 1, nr
ii = i + ir - 1
IF( MOD( ii - 1, nproc ) == ip ) THEN
sndbuf( il, j ) = b( i, j )
il = il + 1
END IF
END DO
END DO
CALL mpi_barrier( comm_a, ierr )
CALL mpi_gather( sndbuf, nbuf, mpi_double_complex, &
rcvbuf, nbuf, mpi_double_complex, ip, comm_a, ierr )
IF( ierr /= 0 ) &
CALL lax_error__( " blk2cyc_zredist ", " in mpi_gather ", ABS( ierr ) )
END DO
!
DO ip = 0, nproc - 1
!
! 2D proc ortho grid sizes
!
np_ortho(1) = desc%npr
np_ortho(2) = desc%npc
!
! compute other processor coordinates me_ortho
!
CALL GRID2D_COORDS( 'R', ip, np_ortho(1), np_ortho(2), me_ortho(1), me_ortho(2) )
!
! initialize other processor descriptor
!
CALL descla_init( ip_desc, desc%n, desc%nx, np_ortho, me_ortho, desc%comm, desc%cntx, 1 )
!
ip_nr = ip_desc%nr
ip_nc = ip_desc%nc
ip_ir = ip_desc%ir
ip_ic = ip_desc%ic
!
DO j = 1, ip_nc
jj = j + ip_ic - 1
il = 1
DO i = 1, ip_nr
ii = i + ip_ir - 1
IF( MOD( ii - 1, nproc ) == me ) THEN
a( ( ii - 1 )/nproc + 1, jj ) = rcvbuf( il, j, ip+1 )
il = il + 1
END IF
END DO
END DO
END DO
!
DEALLOCATE( rcvbuf )
DEALLOCATE( sndbuf )
#else
a( 1:n, 1:n ) = b( 1:n, 1:n )
#endif
RETURN
END SUBROUTINE blk2cyc_zredist_x
!
!
!
! Double Complex and Double Precision Cholesky Factorization of
! a Hermitian/symmetric block distributed matrix
! written by Carlo Cavazzoni
!
!
SUBROUTINE laxlib_pzpotrf_x( sll, ldx, n, idesc )
!
!! Double precision Complex (Z) Cholesky Factorization of
!! a Hermitian/symmetric block distributed matrix
!
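! Outline of the blocked, right-looking factorization implemented below,
! one pass per diagonal block jb = 1, ..., np (process coordinates are
! 0-based, hence the jb-1 tests):
!   1. the processes of block row jb accumulate the trailing update of
!      the diagonal block with ZHERK and reduce it onto the diagonal
!      process;
!   2. the diagonal process factorizes its block with the serial ZPOTF2;
!   3. the blocks of block row jb to the left of the diagonal are
!      broadcast down their columns; the processes below form the panel
!      update with ZGEMM, which is then reduced along rows;
!   4. the factorized diagonal block is broadcast down its column and
!      the panel is completed with ZTRSM.
!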
USE laxlib_descriptor
USE laxlib_parallel_include
!
implicit none
!
include 'laxlib_param.fh'
include 'laxlib_kinds.fh'
!
integer :: n
!! global dimension
integer :: ldx
!! leading dimension of sll
integer, INTENT(IN) :: idesc(LAX_DESC_SIZE)
!! integer laxlib descriptor
complex(DP) :: sll( ldx, ldx )
!! matrix sll
!
real(DP) :: one, zero
complex(DP) :: cone, czero
integer :: myrow, mycol, ierr
integer :: jb, info, ib, kb
integer :: jnr, jir, jic, jnc
integer :: inr, iir, iic, inc
integer :: knr, kir, kic, knc
integer :: nr, nc
integer :: rcomm, ccomm, color, key, myid, np
complex(DP), allocatable :: ssnd( :, : ), srcv( :, : )
TYPE(la_descriptor) :: desc
one = 1.0_DP
cone = 1.0_DP
zero = 0.0_DP
czero = 0.0_DP
#if defined __MPI
CALL laxlib_intarray_to_desc(desc,idesc)
myrow = desc%myr
mycol = desc%myc
myid = desc%mype
np = desc%npr
IF( desc%npr /= desc%npc ) THEN
CALL lax_error__( ' pzpotrf ', ' only square grids are allowed ', 1 )
END IF
IF( ldx /= desc%nrcx ) THEN
CALL lax_error__( ' pzpotrf ', ' wrong leading dimension ldx ', ldx )
END IF
nr = desc%nr
nc = desc%nc
ALLOCATE( ssnd( ldx, ldx ) )
ALLOCATE( srcv( ldx, ldx ) )
DO jb = 1, np
!
! Update and factorize the current diagonal block and test
! for non-positive-definiteness.
!
CALL descla_local_dims( jir, jnr, n, desc%nx, np, jb-1 )
!
! since we loop on diagonal blocks/procs we have jnc == jnr
!
jnc = jnr
!
! prepare row and column communicators
IF( ( myrow >= ( jb-1 ) ) .AND. ( mycol <= ( jb-1 ) ) ) THEN
color = mycol
key = myrow
ELSE
color = np
key = myid
END IF
!
CALL mpi_comm_split( desc%comm , color, key, ccomm, ierr )
IF( ierr /= 0 ) &
CALL lax_error__( " pzpotrf ", " in mpi_comm_split 1 ", ABS( ierr ) )
!
IF( myrow >= jb-1 .and. mycol <= jb-1 ) THEN
color = myrow
key = mycol
ELSE
color = np
key = myid
END IF
!
CALL mpi_comm_split( desc%comm, color, key, rcomm, ierr )
IF( ierr /= 0 ) &
CALL lax_error__( " pzpotrf ", " in mpi_comm_split 2 ", ABS( ierr ) )
!
! here every process can work independently; a reduce is needed afterwards.
!
IF( jb > 1 ) THEN
!
DO ib = 1, jb - 1
IF( ( myrow == ( jb - 1 ) ) .AND. ( mycol == ( ib - 1 ) ) ) THEN
!
! clear the whole of ssnd, since ZHERK will only set its lower triangle
!
ssnd = 0.0_DP
!
! remember: matrix ssnd is nr*nr, and procs on the diagonal have nr == nc
!
CALL ZHERK( 'L', 'N', nr, nc, -ONE, sll, ldx, zero, ssnd, ldx )
!
END IF
END DO
IF( ( myrow == ( jb - 1 ) ) .AND. ( mycol == ( jb - 1 ) ) ) THEN
ssnd = sll
END IF
!
IF( ( myrow == ( jb - 1 ) ) .AND. ( mycol <= ( jb - 1 ) ) ) THEN
!
! accumulate on the diagonal block/proc
!
CALL mpi_barrier( rcomm, ierr )
CALL MPI_REDUCE( ssnd, sll, ldx*ldx, MPI_DOUBLE_COMPLEX, MPI_SUM, jb-1, rcomm, ierr )
IF( ierr /= 0 ) &
CALL lax_error__( " pzpotrf ", " in MPI_REDUCE 1 ", ABS( ierr ) )
!
END IF
!
END IF
!
! Only proc ( jb-1, jb-1 ) performs this step
!
info = 0
!
IF( ( myrow == ( jb - 1 ) ) .AND. ( mycol == ( jb - 1 ) ) ) THEN
CALL ZPOTF2( 'L', jnr, sll, ldx, INFO )
IF( info /= 0 ) &
CALL lax_error__( " pzpotrf ", " problems computing cholesky decomposition ", ABS( info ) )
END IF
!
IF( ( jb > 1 ) .AND. ( jb < np ) ) THEN
!
! Compute the current block column.
!
! processors ( 1 : jb - 1, jb ) should bcast their blocks
! along the column to processors ( 1 : jb - 1, jb + 1 : nb )
!
IF( ( myrow == ( jb - 1 ) ) .AND. ( mycol < ( jb - 1 ) ) ) THEN
CALL mpi_barrier( ccomm, ierr )
CALL mpi_bcast( sll, ldx*ldx, MPI_DOUBLE_COMPLEX, 0, ccomm, ierr )
IF( ierr /= 0 ) &
CALL lax_error__( " pzpotrf ", " in mpi_bcast 1 ", ABS( ierr ) )
ELSE IF( ( myrow > ( jb - 1 ) ) .AND. ( mycol < ( jb - 1 ) ) ) THEN
CALL mpi_barrier( ccomm, ierr )
CALL mpi_bcast( srcv, ldx*ldx, MPI_DOUBLE_COMPLEX, 0, ccomm, ierr )
IF( ierr /= 0 ) &
CALL lax_error__( " pzpotrf ", " in mpi_bcast 2 ", ABS( ierr ) )
END IF
!
DO ib = jb + 1, np
CALL descla_local_dims( iir, inr, n, desc%nx, np, ib-1 )
DO kb = 1, jb - 1
CALL descla_local_dims( kic, knc, n, desc%nx, np, kb-1 )
IF( ( myrow == ( ib - 1 ) ) .AND. ( mycol == ( kb - 1 ) ) ) THEN
CALL zgemm( 'N', 'C', inr, jnr, knc, -CONE, sll, ldx, srcv, ldx, czero, ssnd, ldx )
END IF
END DO
IF( ( myrow == ( ib - 1 ) ) .AND. ( mycol == ( jb - 1 ) ) ) THEN
ssnd = sll
END IF
END DO
!
! processors ( jb, jb + 1 : nb ) should collect blocks along the row,
! from processors ( 1 : jb - 1, jb + 1 : nb )
!
DO kb = jb + 1, np
IF( ( myrow == ( kb - 1 ) ) .AND. ( mycol <= ( jb - 1 ) ) ) THEN
IF( ( jb == 1 ) ) THEN
IF( mycol == ( jb - 1 ) ) THEN
sll = ssnd
END IF
ELSE
CALL mpi_barrier( rcomm, ierr )
CALL MPI_REDUCE( ssnd, sll, ldx*ldx, MPI_DOUBLE_COMPLEX, MPI_SUM, jb-1, rcomm, ierr )
IF( ierr /= 0 ) &
CALL lax_error__( " pzpotrf ", " in mpi_reduce 2 ", ABS( ierr ) )
END IF
END IF
END DO
!
END IF
!
IF( jb < np ) THEN
!
! processor "jb,jb" should broadcast its block to procs ( jb+1 : nb, jb )
!
IF( ( myrow == ( jb - 1 ) ) .AND. ( mycol == ( jb - 1 ) ) ) THEN
CALL mpi_barrier( ccomm, ierr )
CALL mpi_bcast( sll, ldx*ldx, MPI_DOUBLE_COMPLEX, 0, ccomm, ierr )
IF( ierr /= 0 ) &
CALL lax_error__( " pzpotrf ", " in mpi_bcast 3 ", ABS( ierr ) )
ELSE IF( ( myrow > ( jb - 1 ) ) .AND. ( mycol == ( jb - 1 ) ) ) THEN
CALL mpi_barrier( ccomm, ierr )
CALL mpi_bcast( srcv, ldx*ldx, MPI_DOUBLE_COMPLEX, 0, ccomm, ierr )
IF( ierr /= 0 ) &
CALL lax_error__( " pzpotrf ", " in mpi_bcast 4 ", ABS( ierr ) )
END IF
!
DO ib = jb + 1, np
IF( ( myrow == ( ib - 1 ) ) .AND. ( mycol == ( jb - 1 ) ) ) THEN
CALL ZTRSM( 'R', 'L', 'C', 'N', nr, nc, CONE, srcv, ldx, sll, ldx )
END IF
END DO
!
END IF
!
CALL mpi_comm_free( rcomm, ierr )
IF( ierr /= 0 ) &
CALL lax_error__( " pzpotrf ", " in mpi_comm_free 1 ", ABS( ierr ) )
!
CALL mpi_comm_free( ccomm, ierr )
IF( ierr /= 0 ) &
CALL lax_error__( " pzpotrf ", " in mpi_comm_free 2 ", ABS( ierr ) )
!
END DO
DEALLOCATE( srcv, ssnd )
#else
CALL ZPOTRF( 'L', n, sll, ldx, info )
IF( info /= 0 ) &
CALL lax_error__( " pzpotrf ", " problems computing cholesky decomposition ", ABS( info ) )
#endif
return
END SUBROUTINE laxlib_pzpotrf_x
! now the Double Precision subroutine
SUBROUTINE laxlib_pdpotrf_x( sll, ldx, n, idesc )
!
!! Double precision Cholesky Factorization of
!! a Hermitian/symmetric block distributed matrix
!
USE laxlib_descriptor
USE laxlib_parallel_include
!
implicit none
!
include 'laxlib_param.fh'
include 'laxlib_kinds.fh'
!
integer :: n
!! global dimension
integer :: ldx
!! leading dimension of sll
INTEGER, INTENT(IN) :: idesc(LAX_DESC_SIZE)
!! integer laxlib descriptor
REAL(DP) :: sll( ldx, ldx )
!! matrix sll
REAL(DP) :: one, zero
integer :: myrow, mycol, ierr
integer :: jb, info, ib, kb
integer :: jnr, jir, jic, jnc
integer :: inr, iir, iic, inc
integer :: knr, kir, kic, knc
integer :: nr, nc
integer :: rcomm, ccomm, color, key, myid, np
REAL(DP), ALLOCATABLE :: ssnd( :, : ), srcv( :, : )
TYPE(la_descriptor) :: desc
one = 1.0_DP
zero = 0.0_DP
#if defined __MPI
CALL laxlib_intarray_to_desc(desc,idesc)
myrow = desc%myr
mycol = desc%myc
myid = desc%mype
np = desc%npr
IF( desc%npr /= desc%npc ) THEN
CALL lax_error__( ' pdpotrf ', ' only square grids are allowed ', 1 )
END IF
IF( ldx /= desc%nrcx ) THEN
CALL lax_error__( ' pdpotrf ', ' wrong leading dimension ldx ', ldx )
END IF
nr = desc%nr
nc = desc%nc
ALLOCATE( ssnd( ldx, ldx ) )
ALLOCATE( srcv( ldx, ldx ) )
DO jb = 1, np
!
! Update and factorize the current diagonal block and test
! for non-positive-definiteness.
!
CALL descla_local_dims( jir, jnr, n, desc%nx, np, jb-1 )
!
! since we loop on diagonal blocks/procs we have jnc == jnr
!
jnc = jnr
!
! prepare row and column communicators
IF( ( myrow >= ( jb-1 ) ) .AND. ( mycol <= ( jb-1 ) ) ) THEN
color = mycol
key = myrow
ELSE
color = np
key = myid
END IF
!
CALL mpi_comm_split( desc%comm , color, key, ccomm, ierr )
IF( ierr /= 0 ) &
CALL lax_error__( " pdpotrf ", " in mpi_comm_split 1 ", ABS( ierr ) )
!
IF( myrow >= jb-1 .and. mycol <= jb-1 ) THEN
color = myrow
key = mycol
ELSE
color = np
key = myid
END IF
!
CALL mpi_comm_split( desc%comm, color, key, rcomm, ierr )
IF( ierr /= 0 ) &
CALL lax_error__( " pdpotrf ", " in mpi_comm_split 2 ", ABS( ierr ) )
!
! here every process can work independently; a reduce is needed afterwards.
!
IF( jb > 1 ) THEN
!
DO ib = 1, jb - 1
IF( ( myrow == ( jb - 1 ) ) .AND. ( mycol == ( ib - 1 ) ) ) THEN
!
! clear the whole of ssnd, since DSYRK will only set its lower triangle
!
ssnd = 0.0_DP
!
! remember: matrix ssnd is nr*nr, and procs on the diagonal have nr == nc
!
CALL DSYRK( 'L', 'N', nr, nc, -ONE, sll, ldx, zero, ssnd, ldx )
!
END IF
END DO
IF( ( myrow == ( jb - 1 ) ) .AND. ( mycol == ( jb - 1 ) ) ) THEN
ssnd = sll
END IF
!
IF( ( myrow == ( jb - 1 ) ) .AND. ( mycol <= ( jb - 1 ) ) ) THEN
!
! accumulate on the diagonal block/proc
!
CALL MPI_REDUCE( ssnd, sll, ldx*ldx, MPI_DOUBLE_PRECISION, MPI_SUM, jb-1, rcomm, ierr )
IF( ierr /= 0 ) &
CALL lax_error__( " pdpotrf ", " in MPI_REDUCE 1 ", ABS( ierr ) )
!
END IF
!
END IF
!
! Only proc ( jb-1, jb-1 ) performs this step
!
info = 0
!
IF( ( myrow == ( jb - 1 ) ) .AND. ( mycol == ( jb - 1 ) ) ) THEN
CALL DPOTRF( 'L', jnr, sll, ldx, INFO )
IF( info /= 0 ) &
CALL lax_error__( " pdpotrf ", " problems computing cholesky decomposition ", ABS( info ) )
END IF
!
IF( ( jb > 1 ) .AND. ( jb < np ) ) THEN
!
! Compute the current block column.
!
! processors ( 1 : jb - 1, jb ) should bcast their blocks
! along the column to processors ( 1 : jb - 1, jb + 1 : nb )
!
IF( ( myrow == ( jb - 1 ) ) .AND. ( mycol < ( jb - 1 ) ) ) THEN
CALL mpi_barrier( ccomm, ierr )
CALL mpi_bcast( sll, ldx*ldx, MPI_DOUBLE_PRECISION, 0, ccomm, ierr )
IF( ierr /= 0 ) &
CALL lax_error__( " pdpotrf ", " in mpi_bcast 1 ", ABS( ierr ) )
ELSE IF( ( myrow > ( jb - 1 ) ) .AND. ( mycol < ( jb - 1 ) ) ) THEN
CALL mpi_barrier( ccomm, ierr )
CALL mpi_bcast( srcv, ldx*ldx, MPI_DOUBLE_PRECISION, 0, ccomm, ierr )
IF( ierr /= 0 ) &
CALL lax_error__( " pdpotrf ", " in mpi_bcast 2 ", ABS( ierr ) )
END IF
!
DO ib = jb + 1, np
CALL descla_local_dims( iir, inr, n, desc%nx, np, ib-1 )
DO kb = 1, jb - 1
CALL descla_local_dims( kic, knc, n, desc%nx, np, kb-1 )
IF( ( myrow == ( ib - 1 ) ) .AND. ( mycol == ( kb - 1 ) ) ) THEN
CALL dgemm( 'N', 'T', inr, jnr, knc, -ONE, sll, ldx, srcv, ldx, zero, ssnd, ldx )
END IF
END DO
IF( ( myrow == ( ib - 1 ) ) .AND. ( mycol == ( jb - 1 ) ) ) THEN
ssnd = sll
END IF
END DO
!
! processors ( jb, jb + 1 : nb ) should collect blocks along the row,
! from processors ( 1 : jb - 1, jb + 1 : nb )
!
DO kb = jb + 1, np
IF( ( myrow == ( kb - 1 ) ) .AND. ( mycol <= ( jb - 1 ) ) ) THEN
IF( ( jb == 1 ) ) THEN
IF( mycol == ( jb - 1 ) ) THEN
sll = ssnd
END IF
ELSE
CALL MPI_REDUCE( ssnd, sll, ldx*ldx, MPI_DOUBLE_PRECISION, MPI_SUM, jb-1, rcomm, ierr )
IF( ierr /= 0 ) &
CALL lax_error__( " pdpotrf ", " in mpi_reduce 2 ", ABS( ierr ) )
END IF
END IF
END DO
!
END IF
!
IF( jb < np ) THEN
!
         ! processor ( jb, jb ) should broadcast its block to procs ( jb+1 : nb, jb )
!
IF( ( myrow == ( jb - 1 ) ) .AND. ( mycol == ( jb - 1 ) ) ) THEN
CALL mpi_barrier( ccomm, ierr )
CALL mpi_bcast( sll, ldx*ldx, MPI_DOUBLE_PRECISION, 0, ccomm, ierr )
IF( ierr /= 0 ) &
CALL lax_error__( " pdpotrf ", " in mpi_bcast 3 ", ABS( ierr ) )
ELSE IF( ( myrow > ( jb - 1 ) ) .AND. ( mycol == ( jb - 1 ) ) ) THEN
CALL mpi_barrier( ccomm, ierr )
CALL mpi_bcast( srcv, ldx*ldx, MPI_DOUBLE_PRECISION, 0, ccomm, ierr )
IF( ierr /= 0 ) &
CALL lax_error__( " pdpotrf ", " in mpi_bcast 4 ", ABS( ierr ) )
END IF
!
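         ! each proc in the panel solves X * L(jb,jb)^T = A(ib,jb) in place with
         ! DTRSM, i.e. L(ib,jb) = A(ib,jb) * L(jb,jb)^-T, where srcv holds the
         ! diagonal block just broadcast along the column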
DO ib = jb + 1, np
IF( ( myrow == ( ib - 1 ) ) .AND. ( mycol == ( jb - 1 ) ) ) THEN
CALL DTRSM( 'R', 'L', 'T', 'N', nr, nc, ONE, srcv, ldx, sll, ldx )
END IF
END DO
!
END IF
!
CALL mpi_comm_free( rcomm, ierr )
IF( ierr /= 0 ) &
CALL lax_error__( " pdpotrf ", " in mpi_comm_free 1 ", ABS( ierr ) )
CALL mpi_comm_free( ccomm, ierr )
IF( ierr /= 0 ) &
CALL lax_error__( " pdpotrf ", " in mpi_comm_free 2 ", ABS( ierr ) )
!
END DO
DEALLOCATE( srcv, ssnd )
#else
CALL DPOTRF( 'L', n, sll, ldx, info )
IF( info /= 0 ) &
      CALL lax_error__( " pdpotrf ", " problems computing cholesky decomposition ", ABS( info ) )
#endif
return
END SUBROUTINE laxlib_pdpotrf_x
!
!
!
!
SUBROUTINE laxlib_pztrtri_x ( sll, ldx, n, idesc )
!
!! pztrtri computes the parallel inversion of a lower triangular matrix
   !! distributed among the processes using a 2-D block partitioning.
   !! The algorithm is based on the scheme below and applies it
   !! recursively to each column block C2 under the diagonal.
!!
!! |-------|-------| |--------------------|--------------------|
!! | A1 | 0 | | C1 = trtri(A1) | 0 |
!! A = |-------|-------| C = |--------------------|--------------------|
!! | A2 | A3 | | C2 = -C3 * A2 * C1 | C3 = trtri(A3) |
!! |-------|-------| |--------------------|--------------------|
!!
   !! The recursive multiplication step (C2 = -C3 * A2 * C1) is based on Cannon's algorithm
   !! for parallel matrix multiplication and is done with BLAS (zgemm)
!!
!!
!
! Arguments
! ============
!
! sll = local block of data
! ldx = leading dimension of one block
   ! n = size of the global array distributed among the blocks
   ! idesc = descriptor of the matrix distribution
!
!
! written by Ivan Girotto
!
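   ! A minimal usage sketch (hypothetical caller): given a valid laxlib
   ! descriptor "idesc" for a square processor grid and the local block
   ! a( ldx, ldx ) of a distributed lower triangular matrix, the in-place
   ! inversion reads
   !
   !     CALL laxlib_pztrtri_x( a, ldx, n, idesc )
   !
   ! on exit each proc holds its block of the inverse (blocks above the
   ! diagonal are set to zero).
   !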
USE laxlib_descriptor
USE laxlib_parallel_include
IMPLICIT NONE
include 'laxlib_kinds.fh'
include 'laxlib_param.fh'
INTEGER, INTENT(IN) :: n
!! global dimension
INTEGER, INTENT(IN) :: ldx
!! leading dimension of sll
INTEGER, INTENT(IN) :: idesc(LAX_DESC_SIZE)
!! integer laxlib descriptor
COMPLEX(DP), INTENT( INOUT ) :: sll( ldx, ldx )
!! matrix sll
COMPLEX(DP), PARAMETER :: ONE = (1.0_DP, 0.0_DP)
COMPLEX(DP), PARAMETER :: ZERO = (0.0_DP, 0.0_DP)
#if defined __MPI
INTEGER :: status(MPI_STATUS_SIZE)
#endif
INTEGER :: req(2), ierr, col_comm
INTEGER :: send, recv, group_rank, group_size
INTEGER :: myrow, mycol, np, myid, comm
! counters
INTEGER :: k, i, j, count, step_count, shiftcount, cicle
   INTEGER :: C3dim ! Dimension (in blocks) of submatrix C3
INTEGER :: nc, nr ! Local dimension of block
INTEGER :: info, sup_recv
INTEGER :: idrowref, idcolref, idref, idrecv
   ! B and BUF_RECV are used to overlap the computation of the matrix multiplication with the shift of the blocks
COMPLEX(DP), ALLOCATABLE, DIMENSION( :, : ) :: B, C, BUF_RECV
COMPLEX(DP) :: first
TYPE(la_descriptor) :: desc
CALL laxlib_intarray_to_desc(desc,idesc)
myrow = desc%myr
mycol = desc%myc
myid = desc%mype
np = desc%npr
comm = desc%comm
IF( desc%npr /= desc%npc ) THEN
      CALL lax_error__( ' pztrtri ', ' only square grids are allowed ', 1 )
END IF
IF( ldx /= desc%nrcx ) THEN
CALL lax_error__( ' pztrtri ', ' wrong leading dimension ldx ', ldx )
END IF
nr = desc%nr
nc = desc%nc
! clear elements outside local meaningful block nr*nc
DO j = nc+1, ldx
DO i = 1, ldx
sll( i, j ) = zero
END DO
END DO
DO j = 1, ldx
DO i = nr+1, ldx
sll( i, j ) = zero
END DO
END DO
#if defined __MPI
ALLOCATE( B( ldx, ldx ) )
ALLOCATE( C( ldx, ldx ) )
ALLOCATE( BUF_RECV ( ldx, ldx ) )
IF( np == 2 ) THEN
!
      ! special case with 4 procs, 2x2 grid
!
IF( myrow == mycol ) THEN
CALL compute_ztrtri()
END IF
!
CALL GRID2D_RANK( 'R', np, np, 1, 0, idref )
!
IF( myrow == 0 .AND. mycol == 0 ) THEN
CALL MPI_Send(sll, ldx*ldx, MPI_DOUBLE_COMPLEX, idref, 0, comm, ierr)
IF( ierr /= 0 ) &
CALL lax_error__( " pztrtri ", " in mpi_send 1 ", ABS( ierr ) )
END IF
!
IF( myrow == 1 .AND. mycol == 1 ) THEN
CALL MPI_Send(sll, ldx*ldx, MPI_DOUBLE_COMPLEX, idref, 1, comm, ierr)
IF( ierr /= 0 ) &
CALL lax_error__( " pztrtri ", " in mpi_send 2 ", ABS( ierr ) )
END IF
!
IF( myrow == 1 .AND. mycol == 0 ) THEN
!
CALL GRID2D_RANK( 'R', np, np, 0, 0, i )
CALL GRID2D_RANK( 'R', np, np, 1, 1, j )
!
CALL MPI_Irecv( B, ldx*ldx, MPI_DOUBLE_COMPLEX, i, 0, comm, req(1), ierr)
IF( ierr /= 0 ) &
CALL lax_error__( " pztrtri ", " in mpi_irecv 3 ", ABS( ierr ) )
!
CALL MPI_Irecv( C, ldx*ldx, MPI_DOUBLE_COMPLEX, j, 1, comm, req(2), ierr)
IF( ierr /= 0 ) &
CALL lax_error__( " pztrtri ", " in mpi_irecv 4 ", ABS( ierr ) )
!
CALL MPI_Wait(req(1), status, ierr)
IF( ierr /= 0 ) &
CALL lax_error__( " pztrtri ", " in MPI_Wait 5 ", ABS( ierr ) )
!
CALL zgemm('N', 'N', ldx, ldx, ldx, ONE, sll, ldx, b, ldx, ZERO, buf_recv, ldx)
!
CALL MPI_Wait(req(2), status, ierr)
IF( ierr /= 0 ) &
CALL lax_error__( " pztrtri ", " in MPI_Wait 6 ", ABS( ierr ) )
!
CALL zgemm('N', 'N', ldx, ldx, ldx, -ONE, c, ldx, buf_recv, ldx, ZERO, sll, ldx)
!
END IF
!
IF( myrow == 0 .AND. mycol == 1 ) THEN
!
sll = zero
!
END IF
!
DEALLOCATE( b, c, buf_recv )
!
RETURN
!
END IF
IF( myrow >= mycol ) THEN
!
      ! only procs on the lower triangle participate
!
CALL MPI_Comm_split( comm, mycol, myrow, col_comm, ierr )
IF( ierr /= 0 ) &
CALL lax_error__( " pztrtri ", " in MPI_Comm_split 9 ", ABS( ierr ) )
CALL MPI_Comm_size( col_comm, group_size, ierr )
IF( ierr /= 0 ) &
CALL lax_error__( " pztrtri ", " in MPI_Comm_size 10 ", ABS( ierr ) )
!
CALL MPI_Comm_rank( col_comm, group_rank, ierr )
IF( ierr /= 0 ) &
CALL lax_error__( " pztrtri ", " in MPI_Comm_rank 11 ", ABS( ierr ) )
!
ELSE
!
      ! the other procs just stand by and watch (no part in the inversion)
!
CALL MPI_Comm_split( comm, MPI_UNDEFINED, MPI_UNDEFINED, col_comm, ierr )
IF( ierr /= 0 ) &
CALL lax_error__( " pztrtri ", " in MPI_Comm_split 12 ", ABS( ierr ) )
!
sll = zero
!
END IF
!
   ! Compute the inverse of the lower triangular blocks
   ! along the diagonal of the global array with LAPACK (ztrtri)
!
IF( mycol == myrow ) THEN
!
CALL compute_ztrtri()
!
ELSE IF( myrow > mycol ) THEN
!
buf_recv = sll
!
END IF
IF( myrow >= mycol ) THEN
!
! Broadcast the diagonal blocks to the processors under the diagonal
!
CALL MPI_Bcast( sll, ldx*ldx, MPI_DOUBLE_COMPLEX, 0, col_comm, ierr )
IF( ierr /= 0 ) &
CALL lax_error__( " pztrtri ", " in MPI_Bcast 13 ", ABS( ierr ) )
!
END IF
   ! Compute A2 * C1 and start Cannon's algorithm, shifting the column blocks one place to the North
!
IF( myrow > mycol ) THEN
!
CALL zgemm( 'N', 'N', ldx, ldx, ldx, ONE, buf_recv, ldx, sll, ldx, ZERO, c, ldx )
!
send = shift( 1, group_rank, 1, ( group_size - 1 ), 'N' )
recv = shift( 1, group_rank, 1, ( group_size - 1 ), 'S' )
!
CALL MPI_Sendrecv( c, ldx*ldx, MPI_DOUBLE_COMPLEX, send, 0, buf_recv, &
ldx*ldx, MPI_DOUBLE_COMPLEX, recv, 0, col_comm, status, ierr )
IF( ierr /= 0 ) &
CALL lax_error__( " pztrtri ", " in MPI_Sendrecv 14 ", ABS( ierr ) )
!
END IF
   ! Execute Cannon's algorithm to recursively compute the product C2 = -C3 * A2 * C1
!
DO count = ( np - 2 ), 0, -1
C3dim = (np-1) - count ! Dimension of the submatrix C3
first = ZERO
cicle = 0
IF( ( myrow > count ) .AND. ( mycol >= count ) ) THEN
idcolref = count + 1
idrowref = myrow
CALL GRID2D_RANK( 'R', np, np, idrowref, idcolref, idref )
idrecv = idref - 1
! Compute C2 = -C3 * A2 * C1
DO shiftcount = count, np-2
IF(mycol>count)THEN
            ! Execute the virtual shift of the matrix C3 along the row in order to know which
            ! processor has to send its block to C2
IF( cicle == 0)THEN
               ! virtual shift of the block (i,j) of the submatrix C3, (myrow-count) places to the West
send = shift(idref, myid, myrow-count, C3dim, 'W')
ELSE
               ! virtual shift of the block (i,j) of the submatrix C3, one place to the East
send = shift(idref, send, 1, C3dim, 'E')
END IF
IF(send==idref)THEN
CALL MPI_Send(sll, ldx*ldx, MPI_DOUBLE_COMPLEX, idrecv, myid, comm, ierr)
IF( ierr /= 0 ) &
CALL lax_error__( " pztrtri ", " in MPI_Send 15 ", ABS( ierr ) )
END IF
ELSE
IF( cicle == 0)THEN
               ! virtual shift of the block (i,j) of the submatrix C3, (myrow-count) places to the East
sup_recv = shift(idref, myid+1, myrow-count, C3dim, 'E')
ELSE
               ! virtual shift of the block (i,j) of the submatrix C3, one place to the West
sup_recv = shift(idref, sup_recv, 1, C3dim, 'W')
END IF
CALL MPI_Recv(C, ldx*ldx, MPI_DOUBLE_COMPLEX, sup_recv, sup_recv, comm, status, ierr)
IF( ierr /= 0 ) &
CALL lax_error__( " pztrtri ", " in MPI_Recv 16 ", ABS( ierr ) )
send = shift(1, group_rank, 1, (group_size-1), 'S')
recv = shift(1, group_rank, 1, (group_size-1), 'N')
            ! with the non-blocking communication the computation and the shift of the column block are overlapped
!
IF( MOD( cicle, 2 ) == 0 ) THEN
CALL MPI_Isend(BUF_RECV, ldx*ldx, MPI_DOUBLE_COMPLEX, send, group_rank+cicle, col_comm, req(1), ierr)
IF( ierr /= 0 ) &
CALL lax_error__( " pztrtri ", " in MPI_Isend 17 ", ABS( ierr ) )
CALL MPI_Irecv(B, ldx*ldx, MPI_DOUBLE_COMPLEX, recv, recv+cicle, col_comm, req(2), ierr)
IF( ierr /= 0 ) &
CALL lax_error__( " pztrtri ", " in MPI_Irecv 18 ", ABS( ierr ) )
CALL zgemm('N', 'N', ldx, ldx, ldx, -ONE, C, ldx, BUF_RECV, ldx, first, sll, ldx)
ELSE
CALL MPI_Isend(B, ldx*ldx, MPI_DOUBLE_COMPLEX, send, group_rank+cicle, col_comm, req(1), ierr)
IF( ierr /= 0 ) &
CALL lax_error__( " pztrtri ", " in MPI_Isend 19 ", ABS( ierr ) )
CALL MPI_Irecv(BUF_RECV, ldx*ldx, MPI_DOUBLE_COMPLEX, recv, recv+cicle, col_comm, req(2), ierr)
IF( ierr /= 0 ) &
CALL lax_error__( " pztrtri ", " in MPI_Irecv 20 ", ABS( ierr ) )
CALL zgemm('N', 'N', ldx, ldx, ldx, -ONE, C, ldx, B, ldx, ONE, sll, ldx)
END IF
!
CALL MPI_Wait(req(1), status, ierr)
IF( ierr /= 0 ) &
CALL lax_error__( " pztrtri ", " in MPI_Wait 21 ", ABS( ierr ) )
!
CALL MPI_Wait(req(2), status, ierr)
IF( ierr /= 0 ) &
CALL lax_error__( " pztrtri ", " in MPI_Wait 22 ", ABS( ierr ) )
!
END IF
cicle = cicle + 1
first = ONE
END DO
END IF
END DO
! split communicator is present and must be freed on all processors
CALL mpi_comm_free( col_comm, ierr )
IF( ierr /= 0 ) &
CALL lax_error__( " pztrtri ", " in mpi_comm_free 25 ", ABS( ierr ) )
DEALLOCATE(B)
DEALLOCATE(C)
DEALLOCATE(BUF_RECV)
#else
CALL compute_ztrtri()
#endif
CONTAINS
SUBROUTINE compute_ztrtri()
!
      ! clear the upper triangle (excluding diagonal terms) and invert the local lower triangular block
!
DO j = 1, ldx
DO i = 1, j-1
sll ( i, j ) = zero
END DO
END DO
!
CALL ztrtri( 'L', 'N', nr, sll, ldx, info )
!
IF( info /= 0 ) THEN
CALL lax_error__( ' pztrtri ', ' problem in the local inversion ', info )
END IF
!
END SUBROUTINE compute_ztrtri
INTEGER FUNCTION shift ( idref, id, pos, size, dir )
IMPLICIT NONE
INTEGER :: idref, id, pos, size
CHARACTER ( LEN = 1 ) :: dir
IF( ( dir == 'E' ) .OR. ( dir == 'S' ) ) THEN
shift = idref + MOD ( ( id - idref ) + pos, size )
ELSE IF( ( dir == 'W' ) .OR. ( dir == 'N' ) ) THEN
shift = idref + MOD ( ( id - idref ) - pos + size, size )
ELSE
shift = -1
END IF
RETURN
END FUNCTION shift
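   !
   ! Worked example of the ring arithmetic above: with idref = 4 and
   ! size = 3 (so ranks 4, 5, 6 form the ring),
   !     shift( 4, 5, 1, 3, 'E' ) = 4 + MOD( (5-4) + 1, 3 )     = 6
   !     shift( 4, 5, 1, 3, 'W' ) = 4 + MOD( (5-4) - 1 + 3, 3 ) = 4
   ! i.e. 'E'/'S' move pos places forward and 'W'/'N' pos places backward,
   ! always wrapping within the ring that starts at idref.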
END SUBROUTINE laxlib_pztrtri_x
! now the Double Precision subroutine
SUBROUTINE laxlib_pdtrtri_x ( sll, ldx, n, idesc )
!
!!
!! pdtrtri computes the parallel inversion of a lower triangular matrix
   !! distributed among the processes using a 2-D block partitioning.
   !! The algorithm is based on the scheme below and applies it
   !! recursively to each column block C2 under the diagonal.
!!
!! |-------|-------| |--------------------|--------------------|
!! | A1 | 0 | | C1 = trtri(A1) | 0 |
!! A = |-------|-------| C = |--------------------|--------------------|
!! | A2 | A3 | | C2 = -C3 * A2 * C1 | C3 = trtri(A3) |
!! |-------|-------| |--------------------|--------------------|
!!
   !! The recursive multiplication step (C2 = -C3 * A2 * C1) is based on Cannon's algorithm
   !! for parallel matrix multiplication and is done with BLAS (dgemm)
!!
!
! Arguments
! ============
!
! sll = local block of data
! ldx = leading dimension of one block
   ! n = size of the global array distributed among the blocks
! idesc = descriptor of the matrix distribution
!
!
! written by Ivan Girotto
!
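   ! A typical use (hedged sketch, assuming laxlib_pdpotrf_x above shares
   ! this routine's ( sll, ldx, n, idesc ) interface): starting from the
   ! local block s( ldx, ldx ) of a distributed symmetric positive definite
   ! matrix S, the pair of calls
   !
   !     CALL laxlib_pdpotrf_x( s, ldx, n, idesc )   ! S -> L, with S = L * L^T
   !     CALL laxlib_pdtrtri_x( s, ldx, n, idesc )   ! L -> L^-1
   !
   ! leaves each proc with its block of L^-1.
   !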
USE laxlib_descriptor
USE laxlib_parallel_include
IMPLICIT NONE
include 'laxlib_kinds.fh'
include 'laxlib_param.fh'
INTEGER, INTENT(IN) :: n
!! global dimension
INTEGER, INTENT(IN) :: ldx
!! leading dimension of sll
INTEGER, INTENT(IN) :: idesc(LAX_DESC_SIZE)
!! integer laxlib descriptor
REAL(DP), INTENT( INOUT ) :: sll( ldx, ldx )
!! matrix sll
REAL(DP), PARAMETER :: ONE = 1.0_DP
REAL(DP), PARAMETER :: ZERO = 0.0_DP
#if defined __MPI
INTEGER :: status(MPI_STATUS_SIZE)
#endif
INTEGER :: req(2), ierr, col_comm
INTEGER :: send, recv, group_rank, group_size
INTEGER :: myrow, mycol, np, myid, comm
! counters
INTEGER :: k, i, j, count, step_count, shiftcount, cicle
   INTEGER :: C3dim ! Dimension (in blocks) of submatrix C3
INTEGER :: nc, nr ! Local dimension of block
INTEGER :: info, sup_recv
INTEGER :: idrowref, idcolref, idref, idrecv
   ! B and BUF_RECV are used to overlap the computation of the matrix multiplication with the shift of the blocks
REAL(DP), ALLOCATABLE, DIMENSION( :, : ) :: B, C, BUF_RECV
REAL(DP) :: first
TYPE(la_descriptor) :: desc
CALL laxlib_intarray_to_desc(desc,idesc)
myrow = desc%myr
mycol = desc%myc
myid = desc%mype
np = desc%npr
comm = desc%comm
IF( desc%npr /= desc%npc ) THEN
      CALL lax_error__( ' pdtrtri ', ' only square grids are allowed ', 1 )
END IF
IF( ldx /= desc%nrcx ) THEN
CALL lax_error__( ' pdtrtri ', ' wrong leading dimension ldx ', ldx )
END IF
nr = desc%nr
nc = desc%nc
! clear elements outside local meaningful block nr*nc
DO j = nc+1, ldx
DO i = 1, ldx
sll( i, j ) = zero
END DO
END DO
DO j = 1, ldx
DO i = nr+1, ldx
sll( i, j ) = zero
END DO
END DO
#if defined __MPI
ALLOCATE( B( ldx, ldx ) )
ALLOCATE( C( ldx, ldx ) )
ALLOCATE( BUF_RECV ( ldx, ldx ) )
IF( np == 2 ) THEN
!
      ! special case with 4 procs, 2x2 grid
!
IF( myrow == mycol ) THEN
CALL compute_dtrtri()
END IF
!
CALL GRID2D_RANK( 'R', np, np, 1, 0, idref )
!
IF( myrow == 0 .AND. mycol == 0 ) THEN
CALL MPI_Send(sll, ldx*ldx, MPI_DOUBLE_PRECISION, idref, 0, comm, ierr)
IF( ierr /= 0 ) &
CALL lax_error__( " pdtrtri ", " in MPI_Send 1 ", ABS( ierr ) )
END IF
!
IF( myrow == 1 .AND. mycol == 1 ) THEN
CALL MPI_Send(sll, ldx*ldx, MPI_DOUBLE_PRECISION, idref, 1, comm, ierr)
IF( ierr /= 0 ) &
CALL lax_error__( " pdtrtri ", " in MPI_Send 2 ", ABS( ierr ) )
END IF
!
IF( myrow == 1 .AND. mycol == 0 ) THEN
!
CALL GRID2D_RANK( 'R', np, np, 0, 0, i )
CALL GRID2D_RANK( 'R', np, np, 1, 1, j )
!
CALL MPI_Irecv( B, ldx*ldx, MPI_DOUBLE_PRECISION, i, 0, comm, req(1), ierr)
IF( ierr /= 0 ) &
CALL lax_error__( " pdtrtri ", " in MPI_Irecv 3 ", ABS( ierr ) )
!
CALL MPI_Irecv( C, ldx*ldx, MPI_DOUBLE_PRECISION, j, 1, comm, req(2), ierr)
IF( ierr /= 0 ) &
CALL lax_error__( " pdtrtri ", " in MPI_Irecv 4 ", ABS( ierr ) )
!
CALL MPI_Wait(req(1), status, ierr)
IF( ierr /= 0 ) &
CALL lax_error__( " pdtrtri ", " in MPI_Wait 5 ", ABS( ierr ) )
!
CALL dgemm('N', 'N', ldx, ldx, ldx, ONE, sll, ldx, b, ldx, ZERO, buf_recv, ldx)
!
CALL MPI_Wait(req(2), status, ierr)
IF( ierr /= 0 ) &
CALL lax_error__( " pdtrtri ", " in MPI_Wait 6 ", ABS( ierr ) )
!
CALL dgemm('N', 'N', ldx, ldx, ldx, -ONE, c, ldx, buf_recv, ldx, ZERO, sll, ldx)
!
END IF
!
IF( myrow == 0 .AND. mycol == 1 ) THEN
!
sll = zero
!
END IF
!
DEALLOCATE( b, c, buf_recv )
!
RETURN
!
END IF
IF( myrow >= mycol ) THEN
!
      ! only procs on the lower triangle participate
!
CALL MPI_Comm_split( comm, mycol, myrow, col_comm, ierr )
IF( ierr /= 0 ) &
CALL lax_error__( " pdtrtri ", " in MPI_Comm_split 9 ", ABS( ierr ) )
CALL MPI_Comm_size( col_comm, group_size, ierr )
IF( ierr /= 0 ) &
CALL lax_error__( " pdtrtri ", " in MPI_Comm_size 10 ", ABS( ierr ) )
CALL MPI_Comm_rank( col_comm, group_rank, ierr )
IF( ierr /= 0 ) &
CALL lax_error__( " pdtrtri ", " in MPI_Comm_rank 11 ", ABS( ierr ) )
!
ELSE
!
      ! the other procs just stand by and watch (no part in the inversion)
!
CALL MPI_Comm_split( comm, MPI_UNDEFINED, MPI_UNDEFINED, col_comm, ierr )
IF( ierr /= 0 ) &
CALL lax_error__( " pdtrtri ", " in MPI_Comm_split 12 ", ABS( ierr ) )
!
sll = zero
!
END IF
!
   ! Compute the inverse of the lower triangular blocks
   ! along the diagonal of the global array with LAPACK (dtrtri)
!
IF( mycol == myrow ) THEN
!
CALL compute_dtrtri()
!
ELSE IF( myrow > mycol ) THEN
!
buf_recv = sll
!
END IF
IF( myrow >= mycol ) THEN
!
! Broadcast the diagonal blocks to the processors under the diagonal
!
CALL MPI_Bcast( sll, ldx*ldx, MPI_DOUBLE_PRECISION, 0, col_comm, ierr )
IF( ierr /= 0 ) &
CALL lax_error__( " pdtrtri ", " in MPI_Bcast 13 ", ABS( ierr ) )
!
END IF
   ! Compute A2 * C1 and start Cannon's algorithm, shifting the column blocks one place to the North
!
IF( myrow > mycol ) THEN
!
CALL dgemm( 'N', 'N', ldx, ldx, ldx, ONE, buf_recv, ldx, sll, ldx, ZERO, c, ldx )
!
send = shift( 1, group_rank, 1, ( group_size - 1 ), 'N' )
recv = shift( 1, group_rank, 1, ( group_size - 1 ), 'S' )
!
CALL MPI_Sendrecv( c, ldx*ldx, MPI_DOUBLE_PRECISION, send, 0, buf_recv, &
ldx*ldx, MPI_DOUBLE_PRECISION, recv, 0, col_comm, status, ierr )
IF( ierr /= 0 ) &
CALL lax_error__( " pdtrtri ", " in MPI_Sendrecv 14 ", ABS( ierr ) )
!
END IF
   ! Execute Cannon's algorithm to recursively compute the product C2 = -C3 * A2 * C1
!
DO count = ( np - 2 ), 0, -1
C3dim = (np-1) - count ! Dimension of the submatrix C3
first = ZERO
cicle = 0
IF( ( myrow > count ) .AND. ( mycol >= count ) ) THEN
idcolref = count + 1
idrowref = myrow
CALL GRID2D_RANK( 'R', np, np, idrowref, idcolref, idref )
idrecv = idref - 1
! Compute C2 = -C3 * A2 * C1
DO shiftcount = count, np-2
IF(mycol>count)THEN
            ! Execute the virtual shift of the matrix C3 along the row in order to know which
            ! processor has to send its block to C2
IF( cicle == 0)THEN
               ! virtual shift of the block (i,j) of the submatrix C3, (myrow-count) places to the West
send = shift(idref, myid, myrow-count, C3dim, 'W')
ELSE
               ! virtual shift of the block (i,j) of the submatrix C3, one place to the East
send = shift(idref, send, 1, C3dim, 'E')
END IF
IF(send==idref)THEN
CALL MPI_Send(sll, ldx*ldx, MPI_DOUBLE_PRECISION, idrecv, myid, comm, ierr)
IF( ierr /= 0 ) &
CALL lax_error__( " pdtrtri ", " in MPI_Send 15 ", ABS( ierr ) )
END IF
ELSE
IF( cicle == 0)THEN
               ! virtual shift of the block (i,j) of the submatrix C3, (myrow-count) places to the East
sup_recv = shift(idref, myid+1, myrow-count, C3dim, 'E')
ELSE
               ! virtual shift of the block (i,j) of the submatrix C3, one place to the West
sup_recv = shift(idref, sup_recv, 1, C3dim, 'W')
END IF
CALL MPI_Recv(C, ldx*ldx, MPI_DOUBLE_PRECISION, sup_recv, sup_recv, comm, status, ierr)
IF( ierr /= 0 ) &
CALL lax_error__( " pdtrtri ", " in MPI_Recv 16 ", ABS( ierr ) )
send = shift(1, group_rank, 1, (group_size-1), 'S')
recv = shift(1, group_rank, 1, (group_size-1), 'N')
            ! with the non-blocking communication the computation and the shift of the column block are overlapped
IF( MOD( cicle, 2 ) == 0 ) THEN
!
CALL MPI_Isend(BUF_RECV, ldx*ldx, MPI_DOUBLE_PRECISION, send, group_rank+cicle, col_comm, req(1), ierr)
IF( ierr /= 0 ) &
CALL lax_error__( " pdtrtri ", " in MPI_Isend 17 ", ABS( ierr ) )
CALL MPI_Irecv(B, ldx*ldx, MPI_DOUBLE_PRECISION, recv, recv+cicle, col_comm, req(2), ierr)
IF( ierr /= 0 ) &
CALL lax_error__( " pdtrtri ", " in MPI_Irecv 18 ", ABS( ierr ) )
!
CALL dgemm('N', 'N', ldx, ldx, ldx, -ONE, C, ldx, BUF_RECV, ldx, first, sll, ldx)
!
ELSE
!
CALL MPI_Isend(B, ldx*ldx, MPI_DOUBLE_PRECISION, send, group_rank+cicle, col_comm, req(1), ierr)
IF( ierr /= 0 ) &
CALL lax_error__( " pdtrtri ", " in MPI_Isend 19 ", ABS( ierr ) )
CALL MPI_Irecv(BUF_RECV, ldx*ldx, MPI_DOUBLE_PRECISION, recv, recv+cicle, col_comm, req(2), ierr)
IF( ierr /= 0 ) &
CALL lax_error__( " pdtrtri ", " in MPI_Irecv 20 ", ABS( ierr ) )
!
CALL dgemm('N', 'N', ldx, ldx, ldx, -ONE, C, ldx, B, ldx, ONE, sll, ldx)
!
END IF
!
CALL MPI_Wait(req(1), status, ierr)
IF( ierr /= 0 ) &
CALL lax_error__( " pdtrtri ", " in MPI_Wait 21 ", ABS( ierr ) )
CALL MPI_Wait(req(2), status, ierr)
IF( ierr /= 0 ) &
CALL lax_error__( " pdtrtri ", " in MPI_Wait 22 ", ABS( ierr ) )
!
END IF
cicle = cicle + 1
first = ONE
END DO
END IF
END DO
! split communicator is present and must be freed on all processors
CALL mpi_comm_free( col_comm, ierr )
IF( ierr /= 0 ) &
CALL lax_error__( " pdtrtri ", " in mpi_comm_free 25 ", ABS( ierr ) )
DEALLOCATE(B)
DEALLOCATE(C)
DEALLOCATE(BUF_RECV)
#else
CALL compute_dtrtri()
#endif
CONTAINS
SUBROUTINE compute_dtrtri()
!
      ! clear the upper triangle (excluding diagonal terms) and invert the local lower triangular block
!
DO j = 1, ldx
DO i = 1, j-1
sll ( i, j ) = zero
END DO
END DO
!
CALL dtrtri( 'L', 'N', nr, sll, ldx, info )
!
IF( info /= 0 ) THEN
CALL lax_error__( ' pdtrtri ', ' problem in the local inversion ', info )
END IF
!
END SUBROUTINE compute_dtrtri
INTEGER FUNCTION shift ( idref, id, pos, size, dir )
IMPLICIT NONE
INTEGER :: idref, id, pos, size
CHARACTER ( LEN = 1 ) :: dir
IF( ( dir == 'E' ) .OR. ( dir == 'S' ) ) THEN
shift = idref + MOD ( ( id - idref ) + pos, size )
ELSE IF( ( dir == 'W' ) .OR. ( dir == 'N' ) ) THEN
shift = idref + MOD ( ( id - idref ) - pos + size, size )
ELSE
shift = -1
END IF
RETURN
END FUNCTION shift
END SUBROUTINE laxlib_pdtrtri_x
SUBROUTINE laxlib_pdsyevd_x( tv, n, idesc, hh, ldh, e )
!
   !! Parallel version of the Householder tridiagonalization algorithm for symmetric matrices.
   !! Double precision (real) version.
!
IMPLICIT NONE
include 'laxlib_kinds.fh'
include 'laxlib_param.fh'
include 'laxlib_low.fh'
LOGICAL, INTENT(IN) :: tv
   !! if true compute both eigenvalues and eigenvectors, otherwise eigenvalues only
INTEGER, INTENT(IN) :: n
!! global dimension
INTEGER, INTENT(IN) :: ldh
!! leading dimension of hh
INTEGER, INTENT(IN) :: idesc(LAX_DESC_SIZE)
!! integer laxlib descriptor
REAL(DP) :: hh( ldh, ldh )
!! matrix to be diagonalized and output eigenvectors
REAL(DP) :: e( n )
!! eigenvalues
INTEGER :: nrlx, nrl, nproc
REAL(DP), ALLOCATABLE :: diag(:,:), vv(:,:)
CHARACTER :: jobv
nrl = idesc(LAX_DESC_NRL)
nrlx = idesc(LAX_DESC_NRLX)
nproc = idesc(LAX_DESC_NPC) * idesc(LAX_DESC_NPR)
ALLOCATE( diag( nrlx, n ) )
ALLOCATE( vv( nrlx, n ) )
jobv = 'N'
IF( tv ) jobv = 'V'
!
   ! Redistribute matrix "hh" into "diag":
   ! matrix "hh" is block distributed, matrix "diag" is cyclic distributed
CALL blk2cyc_redist( n, diag, nrlx, n, hh, ldh, ldh, idesc )
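   !
   ! In the row-cyclic layout used here (see also collect_zmat_x below):
   ! process "me" of the nproc procs owns global rows
   ! me+1, me+1+nproc, me+1+2*nproc, ..., i.e. global row g lives on
   ! process MOD( g-1, nproc ) at local row ( g-1 )/nproc + 1.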
!
CALL dspev_drv( jobv, diag, nrlx, e, vv, nrlx, nrl, n, &
nproc, idesc(LAX_DESC_MYPE), idesc(LAX_DESC_COMM) )
!
IF( tv ) CALL cyc2blk_redist( n, vv, nrlx, n, hh, ldh, ldh, idesc )
!
DEALLOCATE( vv )
DEALLOCATE( diag )
RETURN
END SUBROUTINE
SUBROUTINE laxlib_pzheevd_x( tv, n, idesc, hh, ldh, e )
!
   !! Parallel version of the Householder tridiagonalization algorithm for Hermitian matrices.
   !! Double precision complex (Z) version.
!
IMPLICIT NONE
include 'laxlib_kinds.fh'
include 'laxlib_param.fh'
include 'laxlib_low.fh'
LOGICAL, INTENT(IN) :: tv
   !! if true compute both eigenvalues and eigenvectors, otherwise eigenvalues only
INTEGER, INTENT(IN) :: n
   !! global dimension of the matrix
INTEGER, INTENT(IN) :: ldh
!! leading dimension of hh
INTEGER, INTENT(IN) :: idesc(LAX_DESC_SIZE)
!! integer laxlib descriptor
COMPLEX(DP) :: hh( ldh, ldh )
!! matrix to be diagonalized and output eigenvectors
REAL(DP) :: e( n )
!! eigenvalues
INTEGER :: nrlx, nrl
COMPLEX(DP), ALLOCATABLE :: diag(:,:), vv(:,:)
CHARACTER :: jobv
nrl = idesc(LAX_DESC_NRL)
nrlx = idesc(LAX_DESC_NRLX)
!
ALLOCATE( diag( nrlx, n ) )
ALLOCATE( vv( nrlx, n ) )
!
jobv = 'N'
IF( tv ) jobv = 'V'
CALL blk2cyc_redist( n, diag, nrlx, n, hh, ldh, ldh, idesc )
!
CALL zhpev_drv( jobv, diag, nrlx, e, vv, nrlx, nrl, n, &
idesc(LAX_DESC_NPC) * idesc(LAX_DESC_NPR), idesc(LAX_DESC_MYPE), idesc(LAX_DESC_COMM) )
!
if( tv ) CALL cyc2blk_redist( n, vv, nrlx, n, hh, ldh, ldh, idesc )
!
DEALLOCATE( vv )
DEALLOCATE( diag )
RETURN
END SUBROUTINE
SUBROUTINE sqr_dsetmat_x( what, n, alpha, a, lda, idesc )
!
!!
!! Set the double precision values of a square distributed matrix
!!
!
IMPLICIT NONE
!
include 'laxlib_kinds.fh'
include 'laxlib_param.fh'
!
CHARACTER(LEN=1), INTENT(IN) :: what
!! 'A' set all the values of "a" equal to alpha
!! 'U' set the values in the upper triangle of "a" equal to alpha
!! 'L' set the values in the lower triangle of "a" equal to alpha
!! 'D' set the values in the diagonal of "a" equal to alpha
INTEGER, INTENT(IN) :: n
!! global dimension of the matrix
REAL(DP), INTENT(IN) :: alpha
!! value to be assigned to elements of "a"
INTEGER, INTENT(IN) :: lda
   !! leading dimension of a
REAL(DP) :: a(lda,*)
!! matrix whose values have to be set
INTEGER, INTENT(IN) :: idesc(LAX_DESC_SIZE)
!! integer laxlib descriptor of matrix a
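   !
   ! Example (hedged sketch): a distributed identity matrix can be built by
   ! clearing everything and then setting the diagonal:
   !
   !     CALL sqr_dsetmat_x( 'A', n, 0.0_DP, a, lda, idesc )
   !     CALL sqr_dsetmat_x( 'D', n, 1.0_DP, a, lda, idesc )
   !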
INTEGER :: i, j
IF( idesc(LAX_DESC_ACTIVE_NODE) < 0 ) THEN
!
! processors not interested in this computation return quickly
!
RETURN
!
END IF
SELECT CASE( what )
CASE( 'U', 'u' )
IF( idesc(LAX_DESC_MYC) > idesc(LAX_DESC_MYR) ) THEN
DO j = 1, idesc(LAX_DESC_NC)
DO i = 1, idesc(LAX_DESC_NR)
a( i, j ) = alpha
END DO
END DO
ELSE IF( idesc(LAX_DESC_MYC) == idesc(LAX_DESC_MYR) ) THEN
DO j = 1, idesc(LAX_DESC_NC)
DO i = 1, j - 1
a( i, j ) = alpha
END DO
END DO
END IF
CASE( 'L', 'l' )
IF( idesc(LAX_DESC_MYC) < idesc(LAX_DESC_MYR) ) THEN
DO j = 1, idesc(LAX_DESC_NC)
DO i = 1, idesc(LAX_DESC_NR)
a( i, j ) = alpha
END DO
END DO
ELSE IF( idesc(LAX_DESC_MYC) == idesc(LAX_DESC_MYR) ) THEN
DO j = 1, idesc(LAX_DESC_NC)
DO i = j + 1, idesc(LAX_DESC_NR)
a( i, j ) = alpha
END DO
END DO
END IF
CASE( 'D', 'd' )
IF( idesc(LAX_DESC_MYC) == idesc(LAX_DESC_MYR) ) THEN
DO i = 1, idesc(LAX_DESC_NR)
a( i, i ) = alpha
END DO
END IF
CASE DEFAULT
DO j = 1, idesc(LAX_DESC_NC)
DO i = 1, idesc(LAX_DESC_NR)
a( i, j ) = alpha
END DO
END DO
END SELECT
!
RETURN
END SUBROUTINE sqr_dsetmat_x
SUBROUTINE sqr_zsetmat_x( what, n, alpha, a, lda, idesc )
!
!! Set the double precision complex(Z) values of a square distributed matrix
!
IMPLICIT NONE
!
include 'laxlib_kinds.fh'
include 'laxlib_param.fh'
!
CHARACTER(LEN=1), INTENT(IN) :: what
!! 'A' set all the values of "a" equal to alpha
!! 'U' set the values in the upper triangle of "a" equal to alpha
!! 'L' set the values in the lower triangle of "a" equal to alpha
!! 'D' set the values in the diagonal of "a" equal to alpha
!! 'H' clear the imaginary part of the diagonal of "a"
INTEGER, INTENT(IN) :: n
!! global dimension of the matrix
COMPLEX(DP), INTENT(IN) :: alpha
!! value to be assigned to elements of "a"
INTEGER, INTENT(IN) :: lda
!! leading dimension of a
COMPLEX(DP) :: a(lda,*)
!! matrix whose values have to be set
INTEGER, INTENT(IN) :: idesc(LAX_DESC_SIZE)
!! integer laxlib descriptor of matrix a
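   !
   ! Example (hedged sketch): the 'H' option can be used to enforce a real
   ! diagonal on a distributed Hermitian matrix (alpha is ignored in this
   ! case but must still be passed):
   !
   !     CALL sqr_zsetmat_x( 'H', n, ( 0.0_DP, 0.0_DP ), a, lda, idesc )
   !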
INTEGER :: i, j
IF( idesc(LAX_DESC_ACTIVE_NODE) < 0 ) THEN
!
! processors not interested in this computation return quickly
!
RETURN
!
END IF
SELECT CASE( what )
CASE( 'U', 'u' )
IF( idesc(LAX_DESC_MYC) > idesc(LAX_DESC_MYR) ) THEN
DO j = 1, idesc(LAX_DESC_NC)
DO i = 1, idesc(LAX_DESC_NR)
a( i, j ) = alpha
END DO
END DO
ELSE IF( idesc(LAX_DESC_MYC) == idesc(LAX_DESC_MYR) ) THEN
DO j = 1, idesc(LAX_DESC_NC)
DO i = 1, j - 1
a( i, j ) = alpha
END DO
END DO
END IF
CASE( 'L', 'l' )
IF( idesc(LAX_DESC_MYC) < idesc(LAX_DESC_MYR) ) THEN
DO j = 1, idesc(LAX_DESC_NC)
DO i = 1, idesc(LAX_DESC_NR)
a( i, j ) = alpha
END DO
END DO
ELSE IF( idesc(LAX_DESC_MYC) == idesc(LAX_DESC_MYR) ) THEN
DO j = 1, idesc(LAX_DESC_NC)
DO i = j + 1, idesc(LAX_DESC_NR)
a( i, j ) = alpha
END DO
END DO
END IF
CASE( 'D', 'd' )
IF( idesc(LAX_DESC_MYC) == idesc(LAX_DESC_MYR) ) THEN
DO i = 1, idesc(LAX_DESC_NR)
a( i, i ) = alpha
END DO
END IF
CASE( 'H', 'h' )
IF( idesc(LAX_DESC_MYC) == idesc(LAX_DESC_MYR) ) THEN
DO i = 1, idesc(LAX_DESC_NR)
            a( i, i ) = CMPLX( DBLE( a(i,i) ), 0.0_DP, KIND=DP )
END DO
END IF
CASE DEFAULT
DO j = 1, idesc(LAX_DESC_NC)
DO i = 1, idesc(LAX_DESC_NR)
a( i, j ) = alpha
END DO
END DO
END SELECT
!
RETURN
END SUBROUTINE sqr_zsetmat_x
!------------------------------------------------------------------------
SUBROUTINE distribute_lambda_x( lambda_repl, lambda_dist, idesc )
!------------------------------------------------------------------------
IMPLICIT NONE
include 'laxlib_kinds.fh'
include 'laxlib_param.fh'
REAL(DP), INTENT(IN) :: lambda_repl(:,:)
REAL(DP), INTENT(OUT) :: lambda_dist(:,:)
INTEGER, INTENT(IN) :: idesc(LAX_DESC_SIZE)
INTEGER :: i, j, ic, ir
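   ! ir and ic are the global indices of the first row/column owned by this
   ! proc, so local element ( i, j ) mirrors global ( i + ir - 1, j + ic - 1 )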
IF( idesc(LAX_DESC_ACTIVE_NODE) > 0 ) THEN
ir = idesc(LAX_DESC_IR)
ic = idesc(LAX_DESC_IC)
DO j = 1, idesc(LAX_DESC_NC)
DO i = 1, idesc(LAX_DESC_NR)
lambda_dist( i, j ) = lambda_repl( i + ir - 1, j + ic - 1 )
END DO
END DO
END IF
RETURN
END SUBROUTINE distribute_lambda_x
!
!------------------------------------------------------------------------
SUBROUTINE collect_lambda_x( lambda_repl, lambda_dist, idesc )
!------------------------------------------------------------------------
USE laxlib_processors_grid, ONLY: ortho_parent_comm
USE laxlib_parallel_include
IMPLICIT NONE
include 'laxlib_kinds.fh'
include 'laxlib_param.fh'
REAL(DP), INTENT(OUT) :: lambda_repl(:,:)
REAL(DP), INTENT(IN) :: lambda_dist(:,:)
INTEGER, INTENT(IN) :: idesc(LAX_DESC_SIZE)
INTEGER :: i, j, ic, ir, ierr
lambda_repl = 0.0d0
IF( idesc(LAX_DESC_ACTIVE_NODE) > 0 ) THEN
ir = idesc(LAX_DESC_IR)
ic = idesc(LAX_DESC_IC)
DO j = 1, idesc(LAX_DESC_NC)
DO i = 1, idesc(LAX_DESC_NR)
lambda_repl( i + ir - 1, j + ic - 1 ) = lambda_dist( i, j )
END DO
END DO
END IF
#if defined __MPI
CALL MPI_ALLREDUCE( MPI_IN_PLACE, lambda_repl, SIZE(lambda_repl), MPI_DOUBLE_PRECISION, &
MPI_SUM, ortho_parent_comm, ierr )
#endif
RETURN
END SUBROUTINE collect_lambda_x
!------------------------------------------------------------------------
SUBROUTINE collect_zmat_x( zmat_repl, zmat_dist, idesc )
!------------------------------------------------------------------------
USE laxlib_processors_grid, ONLY: ortho_parent_comm
USE laxlib_parallel_include
IMPLICIT NONE
include 'laxlib_kinds.fh'
include 'laxlib_param.fh'
REAL(DP), INTENT(OUT) :: zmat_repl(:,:)
REAL(DP), INTENT(IN) :: zmat_dist(:,:)
INTEGER, INTENT(IN) :: idesc(LAX_DESC_SIZE)
INTEGER :: i, ii, j, me, np, nrl, ierr
zmat_repl = 0.0d0
me = idesc(LAX_DESC_MYPE)
np = idesc(LAX_DESC_NPC) * idesc(LAX_DESC_NPR)
nrl = idesc(LAX_DESC_NRL)
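   ! row-cyclic layout: local row i on process "me" corresponds to the
   ! global row me + 1 + ( i - 1 )*np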
IF( idesc(LAX_DESC_ACTIVE_NODE) > 0 ) THEN
DO j = 1, idesc(LAX_DESC_N)
ii = me + 1
DO i = 1, nrl
zmat_repl( ii, j ) = zmat_dist( i, j )
ii = ii + np
END DO
END DO
END IF
#if defined __MPI
CALL MPI_ALLREDUCE( MPI_IN_PLACE, zmat_repl, SIZE(zmat_repl), MPI_DOUBLE_PRECISION, &
MPI_SUM, ortho_parent_comm, ierr )
#endif
RETURN
END SUBROUTINE collect_zmat_x
!------------------------------------------------------------------------
SUBROUTINE setval_lambda_x( lambda_dist, i, j, val, idesc )
!------------------------------------------------------------------------
IMPLICIT NONE
include 'laxlib_kinds.fh'
include 'laxlib_param.fh'
REAL(DP), INTENT(OUT) :: lambda_dist(:,:)
INTEGER, INTENT(IN) :: i, j
REAL(DP), INTENT(IN) :: val
INTEGER, INTENT(IN) :: idesc(LAX_DESC_SIZE)
INTEGER :: ir, ic
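   ! the global element ( i, j ) belongs to this proc iff
   ! ir <= i <= ir + NR - 1 and ic <= j <= ic + NC - 1; if so it is
   ! stored locally at ( i - ir + 1, j - ic + 1 )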
IF( idesc(LAX_DESC_ACTIVE_NODE) > 0 ) THEN
ir = idesc(LAX_DESC_IR)
ic = idesc(LAX_DESC_IC)
IF( ( i >= ir ) .AND. ( i - ir + 1 <= idesc(LAX_DESC_NR) ) ) THEN
IF( ( j >= ic ) .AND. ( j - ic + 1 <= idesc(LAX_DESC_NC) ) ) THEN
lambda_dist( i - ir + 1, j - ic + 1 ) = val
END IF
END IF
END IF
RETURN
END SUBROUTINE setval_lambda_x
!------------------------------------------------------------------------
SUBROUTINE distribute_zmat_x( zmat_repl, zmat_dist, idesc )
!------------------------------------------------------------------------
IMPLICIT NONE
include 'laxlib_kinds.fh'
include 'laxlib_param.fh'
REAL(DP), INTENT(IN) :: zmat_repl(:,:)
REAL(DP), INTENT(OUT) :: zmat_dist(:,:)
INTEGER, INTENT(IN) :: idesc(LAX_DESC_SIZE)
INTEGER :: i, ii, j, me, np
me = idesc(LAX_DESC_MYPE)
np = idesc(LAX_DESC_NPC) * idesc(LAX_DESC_NPR)
IF( idesc(LAX_DESC_ACTIVE_NODE) > 0 ) THEN
DO j = 1, idesc(LAX_DESC_N)
ii = me + 1
DO i = 1, idesc(LAX_DESC_NRL)
zmat_dist( i, j ) = zmat_repl( ii, j )
ii = ii + np
END DO
END DO
END IF
RETURN
END SUBROUTINE distribute_zmat_x
!